Momentum Loom Nexus is a portfolio engine that watches a basket of currency pairs as a living network, then favors the pairs whose movement looks persistent while avoiding those trapped inside a crowded cluster. It runs as an object oriented strategy with a clear separation between memory handling, feature creation, network building, scoring, and adaptive control. Every update cycle begins by collecting a compact set of nine aspects for each pair. These aspects describe how the pair is moving right now, how it has moved over a longer lookback, how unstable it is, whether price is stretched away from a reference, whether movement is range like or directional, how active the tape feels, and whether recent direction repeats. The features are stored in a structure of arrays ring buffer so the system can stream new values efficiently and read recent history without copying large blocks.

The core engine builds a similarity map between all pairs by measuring how their feature histories co move. This is computationally heavy, so the strategy can optionally hand the pairwise correlation step to an OpenCL kernel. If the OpenCL runtime is missing, a device is unavailable, or the kernel fails, the strategy quietly falls back to the CPU path and continues with identical logic. The result of this step is a correlation matrix that represents how synchronized pairs are on average across all features.

Next, the strategy transforms similarity into distance and blends it with an exposure distance table that represents structural overlap between pairs based on shared currencies. This creates a single distance matrix that combines price behavior and fundamental pair overlap. The strategy then runs a shortest path pass across the distance network to measure how connected each pair is to the rest of the universe through direct and indirect links. From this it derives a compactness measure for each pair, which acts like a crowding detector. In parallel it extracts a momentum signal from the longer horizon return feature.

Scores are then produced by combining momentum, compactness, and a penalty based on how compact the surrounding neighborhood is. The effect is a momentum bias that is tempered when the environment is tightly coupled. Finally, a learning controller sits above the scoring layer. It forms a snapshot of the universe, clusters the current condition into regimes, and uses a simple reinforcement style rule to adjust how many pairs are selected and how strongly scores are scaled. The printed output is a rotating top list that reveals which pairs are favored, why they are favored, and whether acceleration is coming from GPU or CPU.

Code
// TGr06E_MomentumBias_v4.cpp - Zorro64 Strategy DLL
// Strategy E v4: Momentum-Biased with MX06 OOP + OpenCL + Learning Controller
// Notes:
// - Keeps full CPU fallback.
// - OpenCL is optional: if OpenCL.dll missing / no device / kernel build fails -> CPU path.
// - OpenCL accelerates the heavy correlation matrix step by offloading pairwise correlations.
// - Correlation is computed in float on GPU; results are stored back into fvar corrMatrix.

#define _CRT_SECURE_NO_WARNINGS
#include <zorro.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <stddef.h>

#define INF 1e30
#define EPS 1e-12
#define N_ASSETS 28
#define FEAT_N 9
#define FEAT_WINDOW 200
#define UPDATE_EVERY 5
#define TOP_K 5

#define ALPHA 0.1
#define BETA 0.2
#define GAMMA 3.5
#define LAMBDA_META 0.7

#define USE_ML 1
#define USE_UNSUP 1
#define USE_RL 1

#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef double fvar;
#endif

static const char* ASSET_NAMES[] = {
  "EURUSD","GBPUSD","USDCHF","USDJPY","AUDUSD","AUDCAD","AUDCHF","AUDJPY","AUDNZD",
  "CADJPY","CADCHF","EURAUD","EURCAD","EURCHF","EURGBP","EURJPY","EURNZD","GBPAUD",
  "GBPCAD","GBPCHF","GBPJPY","GBPNZD","NZDCAD","NZDCHF","NZDJPY","NZDUSD","USDCAD"
};
static const char* CURRENCIES[] = {"EUR","GBP","USD","CHF","JPY","AUD","CAD","NZD"};
#define N_CURRENCIES 8

// ---------------------------- Exposure Table ----------------------------

struct ExposureTable {
  int exposure[N_ASSETS][N_CURRENCIES];
  double exposureDist[N_ASSETS][N_ASSETS];

  void init() {
    for(int i=0;i<N_ASSETS;i++){
      for(int c=0;c<N_CURRENCIES;c++){
        exposure[i][c] = 0;
      }
    }
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        exposureDist[i][j] = 0.0;
      }
    }
  }

  inline double getDist(int i,int j) const { return exposureDist[i][j]; }
};

// ---------------------------- Slab Allocator ----------------------------

template<typename T>
class SlabAllocator {
public:
  T* data;
  int capacity;

  SlabAllocator() : data(NULL), capacity(0) {}
  ~SlabAllocator() { shutdown(); }

  void init(int size) {
    shutdown();
    capacity = size;
    data = (T*)malloc((size_t)capacity * sizeof(T));
    if(data) memset(data, 0, (size_t)capacity * sizeof(T));
  }

  void shutdown() {
    if(data) free(data);
    data = NULL;
    capacity = 0;
  }

  T& operator[](int i) { return data[i]; }
  const T& operator[](int i) const { return data[i]; }
};

// ---------------------------- Feature Buffer (SoA ring) ----------------------------

struct FeatureBufferSoA {
  SlabAllocator<fvar> buffer;
  int windowSize;
  int currentIndex;

  void init(int assets, int window) {
    windowSize = window;
    currentIndex = 0;
    buffer.init(FEAT_N * assets * window);
  }

  void shutdown() { buffer.shutdown(); }

  inline int offset(int feat,int asset,int t) const {
    return (feat * N_ASSETS + asset) * windowSize + t;
  }

  void push(int feat,int asset,fvar value) {
    buffer[offset(feat, asset, currentIndex)] = value;
    currentIndex = (currentIndex + 1) % windowSize;
  }

  // t=0 => most recent
  fvar get(int feat,int asset,int t) const {
    int idx = (currentIndex - 1 - t + windowSize) % windowSize;
    return buffer[offset(feat, asset, idx)];
  }
};

// ---------------------------- Minimal OpenCL (dynamic) ----------------------------

typedef struct _cl_platform_id*   cl_platform_id;
typedef struct _cl_device_id*     cl_device_id;
typedef struct _cl_context*       cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program*       cl_program;
typedef struct _cl_kernel*        cl_kernel;
typedef struct _cl_mem*           cl_mem;
typedef unsigned int              cl_uint;
typedef int                       cl_int;
typedef unsigned long long        cl_ulong;
typedef size_t                    cl_bool;

#define CL_SUCCESS 0
#define CL_DEVICE_TYPE_CPU (1ULL << 1)
#define CL_DEVICE_TYPE_GPU (1ULL << 2)
#define CL_MEM_READ_ONLY   (1ULL << 2)
#define CL_MEM_WRITE_ONLY  (1ULL << 1)
#define CL_MEM_READ_WRITE  (1ULL << 0)
#define CL_TRUE  1
#define CL_FALSE 0
#define CL_PROGRAM_BUILD_LOG 0x1183

class OpenCLBackend {
public:
  HMODULE hOpenCL;
  int ready;

  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kCorr;

  cl_mem bufFeat;
  cl_mem bufCorr;

  int featBytes;
  int corrBytes;

  cl_int (*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
  cl_int (*clGetDeviceIDs)(cl_platform_id, cl_ulong, cl_uint, cl_device_id*, cl_uint*);
  cl_context (*clCreateContext)(void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
  cl_command_queue (*clCreateCommandQueue)(cl_context, cl_device_id, cl_ulong, cl_int*);
  cl_program (*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
  cl_int (*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
  cl_int (*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
  cl_kernel (*clCreateKernel)(cl_program, const char*, cl_int*);
  cl_int (*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
  cl_mem (*clCreateBuffer)(cl_context, cl_ulong, size_t, void*, cl_int*);
  cl_int (*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
  cl_int (*clFinish)(cl_command_queue);
  cl_int (*clReleaseMemObject)(cl_mem);
  cl_int (*clReleaseKernel)(cl_kernel);
  cl_int (*clReleaseProgram)(cl_program);
  cl_int (*clReleaseCommandQueue)(cl_command_queue);
  cl_int (*clReleaseContext)(cl_context);

  OpenCLBackend()
  : hOpenCL(NULL), ready(0),
    platform(NULL), device(NULL), context(NULL), queue(NULL), program(NULL), kCorr(NULL),
    bufFeat(NULL), bufCorr(NULL),
    featBytes(0), corrBytes(0),
    clGetPlatformIDs(NULL), clGetDeviceIDs(NULL), clCreateContext(NULL), clCreateCommandQueue(NULL),
    clCreateProgramWithSource(NULL), clBuildProgram(NULL), clGetProgramBuildInfo(NULL),
    clCreateKernel(NULL), clSetKernelArg(NULL),
    clCreateBuffer(NULL), clEnqueueWriteBuffer(NULL), clEnqueueReadBuffer(NULL),
    clEnqueueNDRangeKernel(NULL), clFinish(NULL),
    clReleaseMemObject(NULL), clReleaseKernel(NULL), clReleaseProgram(NULL),
    clReleaseCommandQueue(NULL), clReleaseContext(NULL)
  {}

  int loadSymbol(void** fp, const char* name) {
    *fp = (void*)GetProcAddress(hOpenCL, name);
    return (*fp != NULL);
  }

  const char* kernelSource() {
    return
      "__kernel void corr_pairwise(\n"
      "  __global const float* feat,\n"
      "  __global float* outCorr,\n"
      "  const int nAssets,\n"
      "  const int nFeat,\n"
      "  const int windowSize,\n"
      "  const float eps\n"
      "){\n"
      "  int a = (int)get_global_id(0);\n"
      "  int b = (int)get_global_id(1);\n"
      "  if(a >= nAssets || b >= nAssets) return;\n"
      "  if(a >= b) return;\n"
      "  float acc = 0.0f;\n"
      "  for(int f=0; f<nFeat; f++){\n"
      "    int baseA = (f*nAssets + a) * windowSize;\n"
      "    int baseB = (f*nAssets + b) * windowSize;\n"
      "    float mx = 0.0f;\n"
      "    float my = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      mx += feat[baseA + t];\n"
      "      my += feat[baseB + t];\n"
      "    }\n"
      "    mx /= (float)windowSize;\n"
      "    my /= (float)windowSize;\n"
      "    float sxx = 0.0f;\n"
      "    float syy = 0.0f;\n"
      "    float sxy = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      float dx = feat[baseA + t] - mx;\n"
      "      float dy = feat[baseB + t] - my;\n"
      "      sxx += dx*dx;\n"
      "      syy += dy*dy;\n"
      "      sxy += dx*dy;\n"
      "    }\n"
      "    float den = sqrt(sxx*syy + eps);\n"
      "    float corr = (den > eps) ? (sxy/den) : 0.0f;\n"
      "    acc += corr;\n"
      "  }\n"
      "  outCorr[a*nAssets + b] = acc / (float)nFeat;\n"
      "}\n";
  }

  void printBuildLog() {
    if(!clGetProgramBuildInfo || !program || !device) return;
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if(logSize == 0) return;
    char* log = (char*)malloc(logSize + 1);
    if(!log) return;
    memset(log, 0, logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    printf("OpenCL build log:\n%s\n", log);
    free(log);
  }

  void init() {
    ready = 0;

    hOpenCL = LoadLibraryA("OpenCL.dll");
    if(!hOpenCL) {
      printf("OpenCL: CPU (OpenCL.dll missing)\n");
      return;
    }

    if(!loadSymbol((void**)&clGetPlatformIDs,       "clGetPlatformIDs")) return;
    if(!loadSymbol((void**)&clGetDeviceIDs,         "clGetDeviceIDs")) return;
    if(!loadSymbol((void**)&clCreateContext,        "clCreateContext")) return;
    if(!loadSymbol((void**)&clCreateCommandQueue,   "clCreateCommandQueue")) return;
    if(!loadSymbol((void**)&clCreateProgramWithSource,"clCreateProgramWithSource")) return;
    if(!loadSymbol((void**)&clBuildProgram,         "clBuildProgram")) return;
    if(!loadSymbol((void**)&clGetProgramBuildInfo,  "clGetProgramBuildInfo")) return;
    if(!loadSymbol((void**)&clCreateKernel,         "clCreateKernel")) return;
    if(!loadSymbol((void**)&clSetKernelArg,         "clSetKernelArg")) return;
    if(!loadSymbol((void**)&clCreateBuffer,         "clCreateBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueWriteBuffer,   "clEnqueueWriteBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueReadBuffer,    "clEnqueueReadBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueNDRangeKernel, "clEnqueueNDRangeKernel")) return;
    if(!loadSymbol((void**)&clFinish,               "clFinish")) return;
    if(!loadSymbol((void**)&clReleaseMemObject,     "clReleaseMemObject")) return;
    if(!loadSymbol((void**)&clReleaseKernel,        "clReleaseKernel")) return;
    if(!loadSymbol((void**)&clReleaseProgram,       "clReleaseProgram")) return;
    if(!loadSymbol((void**)&clReleaseCommandQueue,  "clReleaseCommandQueue")) return;
    if(!loadSymbol((void**)&clReleaseContext,       "clReleaseContext")) return;

    cl_uint nPlat = 0;
    if(clGetPlatformIDs(0, NULL, &nPlat) != CL_SUCCESS || nPlat == 0) {
      printf("OpenCL: CPU (no platform)\n");
      return;
    }
    clGetPlatformIDs(1, &platform, NULL);

    cl_uint nDev = 0;
    cl_int ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &nDev);
    if(ok != CL_SUCCESS || nDev == 0) {
      ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &nDev);
      if(ok != CL_SUCCESS || nDev == 0) {
        printf("OpenCL: CPU (no device)\n");
        return;
      }
    }

    cl_int err = 0;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS || !context) {
      printf("OpenCL: CPU (context fail)\n");
      return;
    }

    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS || !queue) {
      printf("OpenCL: CPU (queue fail)\n");
      return;
    }

    const char* src = kernelSource();
    program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    if(err != CL_SUCCESS || !program) {
      printf("OpenCL: CPU (program fail)\n");
      return;
    }

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if(err != CL_SUCCESS) {
      printf("OpenCL: CPU (build fail)\n");
      printBuildLog();
      return;
    }

    kCorr = clCreateKernel(program, "corr_pairwise", &err);
    if(err != CL_SUCCESS || !kCorr) {
      printf("OpenCL: CPU (kernel fail)\n");
      printBuildLog();
      return;
    }

    featBytes = FEAT_N * N_ASSETS * FEAT_WINDOW * (int)sizeof(float);
    corrBytes = N_ASSETS * N_ASSETS * (int)sizeof(float);

    bufFeat = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)featBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufFeat) {
      printf("OpenCL: CPU (bufFeat fail)\n");
      return;
    }

    bufCorr = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (size_t)corrBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufCorr) {
      printf("OpenCL: CPU (bufCorr fail)\n");
      return;
    }

    ready = 1;
    printf("OpenCL: READY (kernel+buffers)\n");
  }

  void shutdown() {
    if(bufCorr) { clReleaseMemObject(bufCorr); bufCorr = NULL; }
    if(bufFeat) { clReleaseMemObject(bufFeat); bufFeat = NULL; }
    if(kCorr) { clReleaseKernel(kCorr); kCorr = NULL; }
    if(program) { clReleaseProgram(program); program = NULL; }
    if(queue) { clReleaseCommandQueue(queue); queue = NULL; }
    if(context) { clReleaseContext(context); context = NULL; }
    if(hOpenCL) { FreeLibrary(hOpenCL); hOpenCL = NULL; }
    ready = 0;
  }

  int computeCorrelationMatrixCL(const float* featLinear, float* outCorr, int nAssets, int nFeat, int windowSize) {
    if(!ready) return 0;
    if(!featLinear || !outCorr) return 0;

    cl_int err = clEnqueueWriteBuffer(queue, bufFeat, CL_TRUE, 0, (size_t)featBytes, featLinear, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    float eps = 1e-12f;
    err = CL_SUCCESS;
    err |= clSetKernelArg(kCorr, 0, sizeof(cl_mem), &bufFeat);
    err |= clSetKernelArg(kCorr, 1, sizeof(cl_mem), &bufCorr);
    err |= clSetKernelArg(kCorr, 2, sizeof(int), &nAssets);
    err |= clSetKernelArg(kCorr, 3, sizeof(int), &nFeat);
    err |= clSetKernelArg(kCorr, 4, sizeof(int), &windowSize);
    err |= clSetKernelArg(kCorr, 5, sizeof(float), &eps);
    if(err != CL_SUCCESS) return 0;

    size_t global[2];
    global[0] = (size_t)nAssets;
    global[1] = (size_t)nAssets;

    err = clEnqueueNDRangeKernel(queue, kCorr, 2, NULL, global, NULL, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    err = clFinish(queue);
    if(err != CL_SUCCESS) return 0;

    err = clEnqueueReadBuffer(queue, bufCorr, CL_TRUE, 0, (size_t)corrBytes, outCorr, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    return 1;
  }
};

// ---------------------------- Learning Layer ----------------------------

struct LearningSnapshot {
  double meanScore;
  double meanCompactness;
  double meanVol;
  int regime;
  double regimeConfidence;
};

class UnsupervisedModel {
public:
  double centroids[3][3]; int counts[3]; int initialized;
  UnsupervisedModel() : initialized(0) { memset(centroids,0,sizeof(centroids)); memset(counts,0,sizeof(counts)); }
  void init(){ initialized=0; memset(centroids,0,sizeof(centroids)); memset(counts,0,sizeof(counts)); }
  void update(const LearningSnapshot& s, int* regimeOut, double* confOut){
    double x0=s.meanScore,x1=s.meanCompactness,x2=s.meanVol;
    if(!initialized){ for(int k=0;k<3;k++){ centroids[k][0]=x0+0.01*(k-1); centroids[k][1]=x1+0.01*(1-k); centroids[k][2]=x2+0.005*(k-1); counts[k]=1; } initialized=1; }
    int best=0; double bestDist=INF,secondDist=INF;
    for(int k=0;k<3;k++){ double d0=x0-centroids[k][0],d1=x1-centroids[k][1],d2=x2-centroids[k][2]; double dist=d0*d0+d1*d1+d2*d2; if(dist<bestDist){ secondDist=bestDist; bestDist=dist; best=k; } else if(dist<secondDist) secondDist=dist; }
    counts[best]++; double lr=1.0/(double)counts[best]; centroids[best][0]+=lr*(x0-centroids[best][0]); centroids[best][1]+=lr*(x1-centroids[best][1]); centroids[best][2]+=lr*(x2-centroids[best][2]);
    *regimeOut=best; *confOut=1.0/(1.0+sqrt(fabs(secondDist-bestDist)+EPS));
  }
};

class RLAgent {
public:
  double q[4]; int n[4]; int lastAction; double lastMeanScore;
  RLAgent() : lastAction(0), lastMeanScore(0) { for(int i=0;i<4;i++){q[i]=0;n[i]=0;} }
  void init(){ lastAction=0; lastMeanScore=0; for(int i=0;i<4;i++){q[i]=0;n[i]=0;} }
  int chooseAction(int updateCount){ if((updateCount%10)==0) return updateCount%4; int b=0; for(int i=1;i<4;i++) if(q[i]>q[b]) b=i; return b; }
  void updateReward(double newMeanScore){ double r=newMeanScore-lastMeanScore; n[lastAction]++; q[lastAction]+=(r-q[lastAction])/(double)n[lastAction]; lastMeanScore=newMeanScore; }
};

class StrategyController {
public:
  UnsupervisedModel unsup; RLAgent rl; int dynamicTopK; double scoreScale; int regime;
  StrategyController() : dynamicTopK(TOP_K), scoreScale(1.0), regime(0) {}
  void init(){ unsup.init(); rl.init(); dynamicTopK=TOP_K; scoreScale=1.0; regime=0; }
  void onUpdate(const LearningSnapshot& snap, fvar* scores, int nScores, int updateCount){
#if USE_ML
    double conf=0; unsup.update(snap,&regime,&conf); rl.updateReward(snap.meanScore); rl.lastAction=rl.chooseAction(updateCount);
    if(rl.lastAction==0){dynamicTopK=3;scoreScale=0.98;} else if(rl.lastAction==1){dynamicTopK=5;scoreScale=1.00;} else if(rl.lastAction==2){dynamicTopK=5;scoreScale=1.03;} else {dynamicTopK=3;scoreScale=1.01;}
    if(regime==2) scoreScale*=0.98; if(regime==0) scoreScale*=1.02;
    if(dynamicTopK<1) dynamicTopK=1; if(dynamicTopK>TOP_K) dynamicTopK=TOP_K;
    for(int i=0;i<nScores;i++){ double s=(double)scores[i]*scoreScale; if(s>1.0)s=1.0; if(s<0.0)s=0.0; scores[i]=(fvar)s; }
#else
    (void)snap; (void)scores; (void)nScores; (void)updateCount;
#endif
  }
};

// ---------------------------- Strategy ----------------------------

class MomentumBiasStrategy {
public:
  ExposureTable exposureTable;
  FeatureBufferSoA featSoA;
  OpenCLBackend openCL;

  SlabAllocator<fvar> corrMatrix;
  SlabAllocator<fvar> distMatrix;
  SlabAllocator<fvar> compactness;
  SlabAllocator<fvar> momentum;
  SlabAllocator<fvar> scores;

  SlabAllocator<float> featLinear;
  SlabAllocator<float> corrLinear;

  int barCount;
  int updateCount;
  StrategyController controller;

  MomentumBiasStrategy() : barCount(0), updateCount(0) {}

  void init() {
    printf("MomentumBias_v4: Initializing...\n");
    exposureTable.init();
    featSoA.init(N_ASSETS, FEAT_WINDOW);
    corrMatrix.init(N_ASSETS * N_ASSETS);
    distMatrix.init(N_ASSETS * N_ASSETS);
    compactness.init(N_ASSETS);
    momentum.init(N_ASSETS);
    scores.init(N_ASSETS);
    featLinear.init(FEAT_N * N_ASSETS * FEAT_WINDOW);
    corrLinear.init(N_ASSETS * N_ASSETS);
    openCL.init();
    printf("MomentumBias_v4: Ready (OpenCL=%d)\n", openCL.ready);
    controller.init();

    barCount = 0;
    updateCount = 0;
  }

  void shutdown() {
    printf("MomentumBias_v4: Shutting down...\n");

    openCL.shutdown();

    featSoA.shutdown();
    corrMatrix.shutdown();
    distMatrix.shutdown();
    compactness.shutdown();
    momentum.shutdown();
    scores.shutdown();

    featLinear.shutdown();
    corrLinear.shutdown();
  }

  void computeFeatures(int assetIdx) {
    asset((char*)ASSET_NAMES[assetIdx]);

    vars C = series(priceClose(0));
    vars V = series(Volatility(C, 20));

    if(Bar < 50) return;

    fvar r1 = (fvar)log(C[0] / C[1]);
    fvar rN = (fvar)log(C[0] / C[12]);
    fvar vol = (fvar)V[0];
    fvar zscore = (fvar)((C[0] - C[50]) / (V[0] * 20.0 + EPS));
    fvar rangeP = (fvar)((C[0] - C[50]) / (C[0] + EPS));
    fvar flow = (fvar)(r1 * vol);
    fvar regime = (fvar)((vol > 0.001) ? 1.0 : 0.0);
    fvar volOfVol = (fvar)(vol * vol);
    fvar persistence = (fvar)fabs(r1);

    featSoA.push(0, assetIdx, r1);
    featSoA.push(1, assetIdx, rN);
    featSoA.push(2, assetIdx, vol);
    featSoA.push(3, assetIdx, zscore);
    featSoA.push(4, assetIdx, rangeP);
    featSoA.push(5, assetIdx, flow);
    featSoA.push(6, assetIdx, regime);
    featSoA.push(7, assetIdx, volOfVol);
    featSoA.push(8, assetIdx, persistence);
  }

  void computeCorrelationMatrixCPU() {
    for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = 0;

    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int b=a+1; b<N_ASSETS; b++){
          fvar mx = 0, my = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            mx += featSoA.get(f,a,t);
            my += featSoA.get(f,b,t);
          }
          mx /= (fvar)FEAT_WINDOW;
          my /= (fvar)FEAT_WINDOW;

          fvar sxx = 0, syy = 0, sxy = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            fvar dx = featSoA.get(f,a,t) - mx;
            fvar dy = featSoA.get(f,b,t) - my;
            sxx += dx*dx;
            syy += dy*dy;
            sxy += dx*dy;
          }

          fvar den = (fvar)sqrt((double)(sxx*syy + (fvar)EPS));
          fvar corr = 0;
          if(den > (fvar)EPS) corr = sxy / den;
          else corr = 0;

          int idx = a*N_ASSETS + b;
          corrMatrix[idx] += corr / (fvar)FEAT_N;
          corrMatrix[b*N_ASSETS + a] = corrMatrix[idx];
        }
      }
    }
  }

  void buildFeatLinear() {
    int idx = 0;
    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int t=0; t<FEAT_WINDOW; t++){
          featLinear[idx] = (float)featSoA.get(f, a, t);
          idx++;
        }
      }
    }
  }

  void computeCorrelationMatrix() {
    if(openCL.ready) {
      buildFeatLinear();

      for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrLinear[i] = 0.0f;

      int ok = openCL.computeCorrelationMatrixCL(
        featLinear.data,
        corrLinear.data,
        N_ASSETS,
        FEAT_N,
        FEAT_WINDOW
      );

      if(ok) {
        for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = (fvar)0;

        for(int a=0; a<N_ASSETS; a++){
          corrMatrix[a*N_ASSETS + a] = (fvar)1.0;
          for(int b=a+1; b<N_ASSETS; b++){
            float c = corrLinear[a*N_ASSETS + b];
            corrMatrix[a*N_ASSETS + b] = (fvar)c;
            corrMatrix[b*N_ASSETS + a] = (fvar)c;
          }
        }
        return;
      }

      printf("OpenCL: runtime fail -> CPU fallback\n");
      openCL.ready = 0;
    }

    computeCorrelationMatrixCPU();
  }

  void computeDistanceMatrix() {
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        if(i == j) {
          distMatrix[i*N_ASSETS + j] = (fvar)0;
        } else {
          fvar corrDist = (fvar)1.0 - (fvar)fabs((double)corrMatrix[i*N_ASSETS + j]);
          fvar expDist  = (fvar)exposureTable.getDist(i, j);
          fvar blended = (fvar)LAMBDA_META * corrDist + (fvar)(1.0 - (double)LAMBDA_META) * expDist;
          distMatrix[i*N_ASSETS + j] = blended;
        }
      }
    }
  }

  void floydWarshall() {
    fvar d[28][28];

    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        d[i][j] = distMatrix[i*N_ASSETS + j];
        if(i == j) d[i][j] = (fvar)0;
        if(d[i][j] < (fvar)0) d[i][j] = (fvar)INF;
      }
    }

    for(int k=0;k<N_ASSETS;k++){
      for(int i=0;i<N_ASSETS;i++){
        for(int j=0;j<N_ASSETS;j++){
          if(d[i][k] < (fvar)INF && d[k][j] < (fvar)INF) {
            fvar nk = d[i][k] + d[k][j];
            if(nk < d[i][j]) d[i][j] = nk;
          }
        }
      }
    }

    for(int i=0;i<N_ASSETS;i++){
      fvar w = 0;
      for(int j=i+1;j<N_ASSETS;j++){
        if(d[i][j] < (fvar)INF) w += d[i][j];
      }
      if(w > (fvar)0) compactness[i] = (fvar)(1.0 / (1.0 + (double)w));
      else compactness[i] = (fvar)0;
      momentum[i] = featSoA.get(1, i, 0);
    }
  }

  void computeScores() {
    for(int i=0;i<N_ASSETS;i++){
      fvar coupling = 0;
      int count = 0;

      for(int j=0;j<N_ASSETS;j++){
        if(i != j && distMatrix[i*N_ASSETS + j] < (fvar)INF) {
          coupling += compactness[j];
          count++;
        }
      }

      fvar pCouple = 0;
      if(count > 0) pCouple = coupling / (fvar)count;
      else pCouple = (fvar)0;

      fvar rawScore = (fvar)GAMMA * momentum[i] + (fvar)ALPHA * compactness[i] - (fvar)BETA * pCouple;

      if(rawScore > (fvar)30) rawScore = (fvar)30;
      if(rawScore < (fvar)-30) rawScore = (fvar)-30;

      scores[i] = (fvar)(1.0 / (1.0 + exp(-(double)rawScore)));
    }
  }

  LearningSnapshot buildSnapshot() {
    LearningSnapshot s;
    s.meanScore = 0; s.meanCompactness = 0; s.meanVol = 0;
    for(int i=0;i<N_ASSETS;i++) {
      s.meanScore += (double)scores[i];
      s.meanCompactness += (double)compactness[i];
      s.meanVol += (double)featSoA.get(2, i, 0);
    }
    s.meanScore /= (double)N_ASSETS;
    s.meanCompactness /= (double)N_ASSETS;
    s.meanVol /= (double)N_ASSETS;
    s.regime = 0;
    s.regimeConfidence = 0;
    return s;
  }

  void onBar() {
    barCount++;

    for(int i=0;i<N_ASSETS;i++) computeFeatures(i);

    if(barCount % UPDATE_EVERY == 0) {
      updateCount++;

      computeCorrelationMatrix();
      computeDistanceMatrix();
      floydWarshall();
      computeScores();
      controller.onUpdate(buildSnapshot(), scores.data, N_ASSETS, updateCount);
      printTopK();
    }
  }

  void printTopK() {
    int indices[N_ASSETS];
    for(int i=0;i<N_ASSETS;i++) indices[i] = i;

    int topN = controller.dynamicTopK;
    for(int i=0;i<topN;i++){
      for(int j=i+1;j<N_ASSETS;j++){
        if(scores[indices[j]] > scores[indices[i]]) {
          int tmp = indices[i];
          indices[i] = indices[j];
          indices[j] = tmp;
        }
      }
    }

    if(updateCount % 10 == 0) {
      printf("===MomentumBias_v4 Top-K(update#%d,OpenCL=%d)===\n",
        updateCount, openCL.ready);

      for(int i=0;i<topN;i++){
        int idx = indices[i];
        printf(" %d.%s: score=%.4f, M=%.4f, C=%.4f\n", i+1, ASSET_NAMES[idx], (double)scores[idx], (double)momentum[idx], (double)compactness[idx]);
      }
    }
  }
};

// ---------------------------- Zorro DLL entry ----------------------------

static MomentumBiasStrategy* S = NULL;

DLLFUNC void run()
{
  if(is(INITRUN)) {
    BarPeriod = 60;
    LookBack = max(LookBack, FEAT_WINDOW + 50);

    asset((char*)ASSET_NAMES[0]);

    if(!S) {
      S = new MomentumBiasStrategy();
      S->init();
    }
  }

  if(is(EXITRUN)) {
    if(S) {
      S->shutdown();
      delete S;
      S = NULL;
    }
    return;
  }

  if(!S || Bar < LookBack)
    return;

  S->onBar();
}