Gamestudio Links
Zorro Links
Newest Posts
Zorro version 3.0 prerelease!
by TipmyPip. 02/24/26 08:49
ZorroGPT
by TipmyPip. 02/23/26 21:52
WFO Training with parallel cores Zorro64
by Martin_HH. 02/23/26 15:29
Camera always moves upwards?
by clonman. 02/21/26 09:29
Sam Foster Sound | Experienced Game Composer for Hire
by titanicpiano14. 02/19/26 13:22
AUM Magazine
Latest Screens
Dorifto samurai
Shadow 2
Rocker`s Revenge
Stug 3 Stormartillery
Who's Online Now
2 registered members (TipmyPip, Martin_HH), 5,303 guests, and 2 spiders.
Key: Admin, Global Mod, Mod
Newest Members
alx, ApprenticeInMuc, PatrickH90, USER0328, Sfrdragon
19199 Registered Users
Previous Thread
Next Thread
Print Thread
Rate Thread
Page 16 of 16 1 2 14 15 16
GraphPulse MomentumBias [Re: TipmyPip] #489219
Yesterday at 20:01
Yesterday at 20:01
Joined: Sep 2017
Posts: 223
TipmyPip Online OP
Member
TipmyPip  Online OP
Member

Joined: Sep 2017
Posts: 223
This strategy is a multi-asset FX “scanner” designed to rank a predefined universe of 28 currency pairs by the quality of their current momentum, while also accounting for how stable (or crowded) that momentum looks when viewed through a network lens. It runs as a Zorro64 C++ DLL and operates on H1 bars (BarPeriod = 60) with a rolling history requirement large enough to support its graph calculations (LookBack is expanded to cover the feature windows).

At the core is a two-layer architecture:

1) Per-pair “aspect graph” (micro layer):
For every currency pair, the strategy computes nine features each bar: short and medium log returns, a simple slope proxy (fast MA vs slow MA), realized volatility, an RSI-like compression, a range/mean-reversion pressure term, a “volatility of volatility” proxy, a flow proxy, and a simple Markov-style persistence measure (counting up closes). These features are stored in rolling buffers (window 200). Once enough data exists, the strategy builds a 9-node weighted graph where each node is a feature and each edge weight is a distance derived from the absolute correlation between two features (high correlation ? small distance). It then runs an all-pairs shortest path step to extract network statistics such as compactness (inverse of total path length), mean edge weight, edge variance (used as an entropy-like instability proxy), and a centrality score. Intuitively, a pair gets a higher “quality” score when its feature network is tightly connected and internally consistent.

2) Cross-pair “universe graph” (meta layer):
Next, it builds a 28-node graph across all pairs using correlation of their primary momentum feature, blended with a structural penalty based on shared currency exposures (e.g., EURUSD and GBPUSD are closer than EURUSD and AUDJPY). This produces a coupling penalty (how tightly the whole universe is behaving as one cluster), a simple cluster score, and a USD exposure gauge.

Signal logic:
Every UPDATE_EVERY bars (default 5), each pair receives a score driven by three forces: a regime compactness term, the pair’s own compactness, and a penalty for universe-level coupling. Crucially, this score is gated by momentum direction: if the pair’s latest momentum is not above the threshold (0), it is filtered out. The remaining candidates are passed through a sigmoid for normalization, and the strategy prints the Top-K (5) ranked pairs periodically for monitoring.

Overall, GraphPulse MomentumBias is best described as a momentum selector that prefers trends which are both positive and structurally “clean” (internally coherent) while avoiding periods when the broader FX complex is overly coupled.

Code
// TGr06E_MomentumBias_v2.cpp - Zorro64 Strategy DLL (C++)
// Strategy E v2: MomentumBias with Enhanced Graph Architecture

#include <zorro.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#define INF 1e30
#define EPS 1e-12

const char* ASSET_NAMES[] = {
  "EURUSD","GBPUSD","USDCHF","USDJPY","AUDUSD","AUDCAD","AUDCHF","AUDJPY","AUDNZD",
  "CADJPY","CADCHF","EURAUD","EURCAD","EURCHF","EURGBP","EURJPY","EURNZD","GBPAUD",
  "GBPCAD","GBPCHF","GBPJPY","GBPNZD","NZDCAD","NZDCHF","NZDJPY","NZDUSD","USDCAD"
};
#define N_ASSET_NAMES 28
#define N_ASSET_NAMES 28

static const int FEAT_N = 9;
static const int FEAT_WINDOW = 200;
static const int META_WINDOW = 200;
static const int UPDATE_EVERY = 5;
static const int TOP_K = 5;

static const double alpha = 2.0;
static const double beta  = 1.5;
static const double gamma = 2.0;
static const double MOMENTUM_THRESHOLD = 0.0;
static const double LAMBDA_META = 0.7;

static double clamp01(double x) { if(x<0) return 0; if(x>1) return 1; return x; }
static double sigmoid(double x) { if(x > 30) return 1.0; if(x < -30) return 0.0; return 1.0/(1.0 + exp(-x)); }

static const char* CURRENCY_BASE[] = { "EUR","GBP","USD","CHF","JPY","AUD","CAD","NZD" };
static const int N_CURRENCIES = 8;

static int getCurrencyExposure(int pairIdx, int ccy) {
  const char* pair = ASSET_NAMES[pairIdx];
  char base[4] = {pair[0], pair[1], pair[2], 0};
  char quote[4] = {pair[3], pair[4], pair[5], 0};
  if(strcmp(base, CURRENCY_BASE[ccy]) == 0) return 1;
  if(strcmp(quote, CURRENCY_BASE[ccy]) == 0) return -1;
  return 0;
}

static double exposureDist(int i, int j) {
  double dist = 0.0;
  for(int c=0; c<N_CURRENCIES; c++) {
    int ei = getCurrencyExposure(i, c);
    int ej = getCurrencyExposure(j, c);
    if(ei != 0 && ej != 0) dist += (ei == ej) ? 0.0 : 1.0;
    else if(ei != 0 || ej != 0) dist += 0.5;
  }
  return dist / (double)N_CURRENCIES;
}

class RollingBuffer {
public:
  int cap = 0, n = 0, head = 0;
  double* x = 0;
  RollingBuffer() {}
  ~RollingBuffer(){ shutdown(); }
  void init(int L){ shutdown(); cap = L; x = (double*)malloc((size_t)cap*sizeof(double)); if(!x) quit("OOM"); n = 0; head = 0; }
  void shutdown(){ if(x) free(x); x = 0; cap = 0; n = 0; head = 0; }
  void push(double v){ if(!x || cap<=0) return; x[head] = v; head = (head + 1) % cap; if(n < cap) n++; }
  double get(int i) const { int idx = head - 1 - i; while(idx < 0) idx += cap; return x[idx % cap]; }
};

static double corrOf(const RollingBuffer& a, const RollingBuffer& b, int L) {
  if(a.n < L || b.n < L || L < 5) return 0.0;
  double mx=0,my=0;
  for(int i=0;i<L;i++){ mx += a.get(i); my += b.get(i); }
  mx /= (double)L; my /= (double)L;
  double sxx=0, syy=0, sxy=0;
  for(int i=0;i<L;i++){ double dx = a.get(i)-mx; double dy = b.get(i)-my; sxx+=dx*dx; syy+=dy*dy; sxy+=dx*dy; }
  double den = sqrt(sxx*syy);
  if(den <= EPS) return 0.0;
  return sxy/den;
}

class WeightedGraph {
public:
  int n = 0;
  double* d = 0;
  WeightedGraph() {}
  ~WeightedGraph(){ shutdown(); }
  void init(int N){ shutdown(); n = N; d = (double*)malloc((size_t)n*n*sizeof(double)); if(!d) quit("OOM"); reset(); }
  void shutdown(){ if(d) free(d); d = 0; n = 0; }
  inline int pos(int r,int c) const { return r*n + c; }
  void reset(){ for(int r=0;r<n;r++) for(int c=0;c<n;c++) d[pos(r,c)] = (r==c) ? 0.0 : INF; }
  void allPairsShortest(){ for(int k=0;k<n;k++) for(int i=0;i<n;i++) for(int j=0;j<n;j++){ double cand = d[pos(i,k)] + d[pos(k,j)]; if(cand < d[pos(i,j)]) d[pos(i,j)] = cand; } }
  double wienerUndirectedLike() const { double W=0; for(int i=0;i<n;i++) for(int j=i+1;j<n;j++){ double dij = d[pos(i,j)], dji = d[pos(j,i)]; W += 0.5*(dij + dji); } return W; }
  double meanEdgeWeight() const { double sum=0; int cnt=0; for(int i=0;i<n;i++) for(int j=i+1;j<n;j++){ if(d[pos(i,j)] < INF){ sum += d[pos(i,j)]; cnt++; }} return cnt>0 ? sum/cnt : 0.0; }
  double edgeVariance() const { double mean = meanEdgeWeight(); double var=0; int cnt=0; for(int i=0;i<n;i++) for(int j=i+1;j<n;j++){ if(d[pos(i,j)] < INF){ double diff = d[pos(i,j)] - mean; var += diff*diff; cnt++; }} return cnt>0 ? var/cnt : 0.0; }
  double centrality(int node) const { double sum=0; for(int j=0;j<n;j++) if(j!=node && d[pos(node,j)] < INF) sum += d[pos(node,j)]; return sum; }
};

struct PairFeatures {
  double compactness, meanEdge, entropy, volCentrality;
  double regimeProb, pBullNext, entropyStat, bandwidth;
  double momentum;
};

class PairAspectGraph {
public:
  WeightedGraph G;
  RollingBuffer feat[FEAT_N];
  RollingBuffer featPrev[FEAT_N];
  PairFeatures pf;
  PairAspectGraph() { pf.compactness=0; pf.meanEdge=0; pf.entropy=0; pf.volCentrality=0; pf.regimeProb=0; pf.pBullNext=0; pf.entropyStat=0; pf.bandwidth=0; pf.momentum=0; }
  void init(){ G.init(FEAT_N); for(int k=0;k<FEAT_N;k++){ feat[k].init(FEAT_WINDOW); featPrev[k].init(FEAT_WINDOW); } }
  void shutdown(){ for(int k=0;k<FEAT_N;k++){ feat[k].shutdown(); featPrev[k].shutdown(); } G.shutdown(); }
  static double corrToDist(double corr){ double a = fabs(corr); if(a > 1.0) a = 1.0; return 1.0 - a; }
  void pushFeature(int k, double v){ feat[k].push(v); }
  void pushFeaturePrev(int k, double v){ featPrev[k].push(v); }
  double computeEntropy(int L) {
    if(feat[0].n < L) return 0.0;
    int bins = 10; int* hist = (int*)calloc(bins, sizeof(int));
    for(int i=0;i<L && i<feat[0].n; i++) { int bin = (int)(feat[0].get(i) * bins); if(bin<0) bin=0; if(bin>=bins) bin=bins-1; hist[bin]++; }
    double H = 0.0;
    for(int b=0;b<bins;b++) if(hist[b]>0) { double p = (double)hist[b]/(double)L; H -= p * log(p + EPS); }
    free(hist); return H;
  }
  void rebuildIfReady(){
    if(feat[0].n < FEAT_WINDOW){ pf.compactness = 0; return; }
    G.reset();
    for(int i=0;i<FEAT_N;i++){
      for(int j=i+1;j<FEAT_N;j++){
        double c = corrOf(feat[i], feat[j], FEAT_WINDOW);
        double w = corrToDist(c);
        if(i >= 7 && j >= 8) {
          double cp = (featPrev[i].n > 10) ? corrOf(featPrev[i], feat[j], min(featPrev[i].n, FEAT_WINDOW)) : 0;
          w = 0.5 * w + 0.5 * corrToDist(cp);
        }
        G.d[G.pos(i,j)] = w; G.d[G.pos(j,i)] = w;
      }
    }
    G.allPairsShortest();
    double W = G.wienerUndirectedLike();
    pf.compactness = 1.0/(1.0 + W);
    pf.meanEdge = G.meanEdgeWeight();
    pf.entropy = G.edgeVariance();
    pf.volCentrality = G.centrality(3) / (FEAT_N - 1);
    pf.momentum = feat[0].get(0);
    if(feat[8].n >= 20) {
      int upNext = 0;
      for(int i=0;i<19;i++) if(feat[0].get(i+1) > 0) upNext++;
      pf.pBullNext = (double)upNext / 19.0;
      pf.entropyStat = computeEntropy(20);
      pf.regimeProb = (pf.entropyStat < 2.0) ? 1.0 : 0.0;
      pf.bandwidth = feat[3].get(0) / (feat[3].get(10) + EPS);
    }
  }
};

class PairUniverseGraph {
public:
  WeightedGraph G;
  double couplingPenalty, clusterScore, usdExposure;
  PairUniverseGraph() : couplingPenalty(0), clusterScore(0), usdExposure(0) {}
  void init(){ G.init(N_ASSET_NAMES); }
  void shutdown(){ G.shutdown(); }
  static double corrToDist(double corr){ double a = fabs(corr); if(a > 1.0) a = 1.0; return 1.0 - a; }
  void rebuild(PairAspectGraph* pairs){
    G.reset();
    for(int i=0;i<N_ASSET_NAMES;i++){
      for(int j=i+1;j<N_ASSET_NAMES;j++){
        double c = corrOf(pairs[i].feat[0], pairs[j].feat[0], META_WINDOW);
        double w = LAMBDA_META * corrToDist(c) + (1.0 - LAMBDA_META) * exposureDist(i,j);
        G.d[G.pos(i,j)] = w; G.d[G.pos(j,i)] = w;
      }
    }
    G.allPairsShortest();
    double W = G.wienerUndirectedLike();
    couplingPenalty = 1.0/(1.0 + W);
    clusterScore = computeClusterScore();
    usdExposure = computeUSDExposure(pairs);
  }
  double computeClusterScore() { int clusters=0; for(int i=0;i<N_ASSET_NAMES;i++){ bool has=false; for(int j=0;j<N_ASSET_NAMES;j++) if(i!=j && G.d[G.pos(i,j)]<0.3){has=true; break;} if(!has) clusters++; } return (double)clusters/N_ASSET_NAMES; }
  double computeUSDExposure(PairAspectGraph* pairs){ double total=0; for(int i=0;i<N_ASSET_NAMES;i++) total += fabs((double)getCurrencyExposure(i,2)) * pairs[i].pf.compactness; return clamp01(total/5.0); }
};

static double computeRegimeCompactness(){ double W = ((Bar % 2)==0)?1.0:2.5; return 1.0/(1.0+W); }

class MomentumBiasStrategyV2 {
public:
  PairAspectGraph pairG[N_ASSET_NAMES];
  PairUniverseGraph metaG;
  int lastUpdateBar;
  MomentumBiasStrategyV2():lastUpdateBar(-999999){}
  void init(){ for(int i=0;i<N_ASSET_NAMES;i++) pairG[i].init(); metaG.init(); }
  void shutdown(){ metaG.shutdown(); for(int i=0;i<N_ASSET_NAMES;i++) pairG[i].shutdown(); }
  void updateFeaturesForAsset(int a){
    asset((char*)ASSET_NAMES[a]);
    vars C = series(priceClose(0));
    if(Bar < 20) return;
    double c0=C[0], c1=C[1], c12=C[12];
    double ret1=log(c0/c1), retN=log(c0/c12);
    double ma3=(C[0]+C[1]+C[2])/3.0, ma12=0; for(int i=0;i<12;i++) ma12+=C[i]; ma12/=12.0;
    double slope=ma3-ma12;
    double m=0,s=0; for(int i=0;i<20;i++){double ri=log(C[i]/C[i+1]); m+=ri;} m/=20.0;
    for(int i=0;i<20;i++){double ri=log(C[i]/C[i+1]),d=ri-m; s+=d*d;} double vol=sqrt(s/20.0);
    double rsiProxy=tanh(10.0*retN);
    double m20=0,s20=0; for(int i=0;i<20;i++) m20+=C[i]; m20/=20.0;
    for(int i=0;i<20;i++){double d=C[i]-m20; s20+=d*d;} s20=sqrt(s20/20.0)+1e-12;
    double rangePress=tanh(0.5*(c0-m20)/s20);
    double mv20=0,sv20=0; for(int i=0;i<20;i++) mv20+=(C[i]-C[i+1]); mv20/=20.0;
    for(int i=0;i<20;i++){double d=(C[i]-C[i+1])-mv20; sv20+=d*d;} double volOfVol=sqrt(sv20/20.0);
    double flowProxy=fabs(ret1);
    int up=0; for(int i=0;i<20;i++) if(C[i]>C[i+1]) up++;
    double markovProxy=2.0*((double)up/20.0-0.5);
    pairG[a].pushFeature(0,ret1); pairG[a].pushFeature(1,retN); pairG[a].pushFeature(2,slope);
    pairG[a].pushFeature(3,vol); pairG[a].pushFeature(4,retN); pairG[a].pushFeature(5,rsiProxy);
    pairG[a].pushFeature(6,rangePress); pairG[a].pushFeature(7,volOfVol); pairG[a].pushFeature(8,markovProxy);
    pairG[a].pushFeaturePrev(0,ret1);
  }
  void onBar(){
    for(int a=0;a<N_ASSET_NAMES;a++) updateFeaturesForAsset(a);
    if(UPDATE_EVERY>1 && (Bar%UPDATE_EVERY)!=0) return;
    if(lastUpdateBar==Bar) return;
    lastUpdateBar=Bar;
    for(int a=0;a<N_ASSET_NAMES;a++) pairG[a].rebuildIfReady();
    metaG.rebuild(pairG);
    double Creg=computeRegimeCompactness();
    double score[N_ASSET_NAMES]; int idx[N_ASSET_NAMES];
    for(int a=0;a<N_ASSET_NAMES;a++){
      double CA=pairG[a].pf.compactness;
      double mom=pairG[a].pf.momentum;
      double momFactor = (mom > MOMENTUM_THRESHOLD) ? 1.0 : 0.0;
      double x=(alpha*Creg + gamma*CA - beta*metaG.couplingPenalty) * momFactor;
      score[a]=sigmoid(x); idx[a]=a;
    }
    for(int i=0;i<TOP_K;i++) for(int j=i+1;j<N_ASSET_NAMES;j++) if(score[idx[j]]>score[idx[i]]){int t=idx[i]; idx[i]=idx[j]; idx[j]=t;}
    if((Bar%50)==0){
      printf("\n[MomentumBias_v2] Bar=%d Creg=%.4f Pcouple=%.4f Cluster=%.4f",Bar,Creg,metaG.couplingPenalty,metaG.clusterScore);
      for(int k=0;k<TOP_K;k++){int a=idx[k]; printf("\n  #%d %s CA=%.4f Mom=%.4f BW=%.4f Score=%.4f",k+1,ASSET_NAMES[a],pairG[a].pf.compactness,pairG[a].pf.momentum,pairG[a].pf.bandwidth,score[a]);}
    }
  }
};

static MomentumBiasStrategyV2* S = 0;

DLLFUNC void run(){
  if(is(INITRUN)){BarPeriod=60; LookBack=max(LookBack,FEAT_WINDOW+50); asset((char*)ASSET_NAMES[0]); if(!S){S=new MomentumBiasStrategyV2();S->init();}}
  if(is(EXITRUN)){if(S){S->shutdown(); delete S; S=0;} return;}
  if(!S || Bar<LookBack) return;
  S->onBar();
}

GraphWeaver-CL (openCL) [Re: TipmyPip] #489220
Yesterday at 20:57
Yesterday at 20:57
Joined: Sep 2017
Posts: 223
TipmyPip Online OP
Member
TipmyPip  Online OP
Member

Joined: Sep 2017
Posts: 223
CompactDominant v3 (GraphWeaver-CL) is a multi-asset ranking engine designed for Zorro64 as a compact, performance-minded DLL strategy. Instead of generating classic entry/exit signals per symbol, it continuously scores a universe of 28 FX pairs and prints a “Top-K” shortlist of the most attractive candidates every few update cycles. The design focus is clear: minimize memory overhead, keep computations deterministic, and prepare the heavy math parts for GPU acceleration via OpenCL.

At its core, the strategy turns market behavior into a dynamic asset graph. Every bar (BarPeriod = 60 minutes), it computes nine features per asset over a rolling window (FEAT_WINDOW = 200): short and medium log returns, volatility, a z-score style deviation, normalized range, “flow” (return × vol), a simple regime flag, volatility-of-volatility, and return persistence. These values are stored in a Structure-of-Arrays (SoA) buffer (FeatureBufferSoA). The SoA layout is intentional: it places all values of the same feature contiguously in memory, which improves CPU cache locality and also matches how GPU kernels prefer to read data (coalesced memory access).

Every UPDATE_EVERY bars, the engine builds a correlation matrix by averaging pairwise correlations across all nine features. Correlations are converted to distances using d = 1 ? |corr|, then blended with a precomputed fundamental distance based on currency exposure. This exposure distance is generated once at startup using an ExposureTable that assigns +1/?1/0 exposure per currency for every FX pair and derives pairwise distances from those vectors.

Next, the strategy runs Floyd–Warshall on the distance graph to estimate shortest-path connectivity. From the resulting path sums, it computes a “compactness” score: assets that sit in tightly connected neighborhoods (shorter effective distances) are treated as more central/robust. The final score is a sigmoid of a weighted combination: ALPHA × regime + GAMMA × compactness ? BETA × coupling penalty, where coupling penalizes assets that are too entangled with other already-compact nodes.

Enhancing OpenCL support (logic)

OpenCL here is scaffolded as an optional acceleration layer. The logic is: detect OpenCL at runtime ? keep identical outputs ? offload only the expensive parallel loops. The SoA buffers and preallocated slab allocators are the groundwork: correlations are the best GPU target because they are massively parallel (many asset pairs × many time steps). A complete enhancement would (1) dynamically load OpenCL, (2) create context/queue, (3) upload SoA feature blocks, (4) run a kernel that computes correlation (and optionally distance) in parallel, and (5) download only the final corr/dist matrices. If OpenCL is missing or errors occur, it cleanly falls back to the existing CPU path—same algorithm, same ranking behavior.

Code
// TGr06A_CompactDominant_v3.cpp - Zorro64 Strategy DLL
// Strategy A v3: Compactness-Dominant with MX06 OOP + OpenCL (real offload) + Memory Optimization
// Notes:
// - Keeps full CPU fallback.
// - OpenCL is optional: if OpenCL.dll missing / no device / kernel build fails -> CPU path.
// - OpenCL accelerates the heavy correlation matrix step by offloading pairwise correlations.
// - Correlation is computed in float on GPU; results are stored back into fvar corrMatrix.

#define _CRT_SECURE_NO_WARNINGS
#include <zorro.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <stddef.h>

#define INF 1e30
#define EPS 1e-12
#define N_ASSETS 28
#define FEAT_N 9
#define FEAT_WINDOW 200
#define UPDATE_EVERY 5
#define TOP_K 5

#define ALPHA 0.1
#define BETA 0.2
#define GAMMA 3.0
#define LAMBDA_META 0.7

#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef double fvar;
#endif

static const char* ASSET_NAMES[] = {
  "EURUSD","GBPUSD","USDCHF","USDJPY","AUDUSD","AUDCAD","AUDCHF","AUDJPY","AUDNZD",
  "CADJPY","CADCHF","EURAUD","EURCAD","EURCHF","EURGBP","EURJPY","EURNZD","GBPAUD",
  "GBPCAD","GBPCHF","GBPJPY","GBPNZD","NZDCAD","NZDCHF","NZDJPY","NZDUSD","USDCAD"
};
static const char* CURRENCIES[] = {"EUR","GBP","USD","CHF","JPY","AUD","CAD","NZD"};
#define N_CURRENCIES 8

// ---------------------------- Exposure Table ----------------------------

struct ExposureTable {
  int exposure[N_ASSETS][N_CURRENCIES];
  double exposureDist[N_ASSETS][N_ASSETS];

  void init() {
    for(int i=0;i<N_ASSETS;i++){
      for(int c=0;c<N_CURRENCIES;c++){
        exposure[i][c] = 0;
      }
    }
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        exposureDist[i][j] = 0.0;
      }
    }
  }

  inline double getDist(int i,int j) const { return exposureDist[i][j]; }
};

// ---------------------------- Slab Allocator ----------------------------

template<typename T>
class SlabAllocator {
public:
  T* data;
  int capacity;

  SlabAllocator() : data(NULL), capacity(0) {}
  ~SlabAllocator() { shutdown(); }

  void init(int size) {
    shutdown();
    capacity = size;
    data = (T*)malloc((size_t)capacity * sizeof(T));
    if(data) memset(data, 0, (size_t)capacity * sizeof(T));
  }

  void shutdown() {
    if(data) free(data);
    data = NULL;
    capacity = 0;
  }

  T& operator[](int i) { return data[i]; }
  const T& operator[](int i) const { return data[i]; }
};

// ---------------------------- Feature Buffer (SoA ring) ----------------------------

struct FeatureBufferSoA {
  SlabAllocator<fvar> buffer;
  int windowSize;
  int currentIndex;

  void init(int assets, int window) {
    windowSize = window;
    currentIndex = 0;
    buffer.init(FEAT_N * assets * window);
  }

  void shutdown() { buffer.shutdown(); }

  inline int offset(int feat,int asset,int t) const {
    return (feat * N_ASSETS + asset) * windowSize + t;
  }

  void push(int feat,int asset,fvar value) {
    buffer[offset(feat, asset, currentIndex)] = value;
    currentIndex = (currentIndex + 1) % windowSize;
  }

  // t=0 => most recent
  fvar get(int feat,int asset,int t) const {
    int idx = (currentIndex - 1 - t + windowSize) % windowSize;
    return buffer[offset(feat, asset, idx)];
  }
};

// ---------------------------- Minimal OpenCL (dynamic) ----------------------------

typedef struct _cl_platform_id*   cl_platform_id;
typedef struct _cl_device_id*     cl_device_id;
typedef struct _cl_context*       cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program*       cl_program;
typedef struct _cl_kernel*        cl_kernel;
typedef struct _cl_mem*           cl_mem;
typedef unsigned int              cl_uint;
typedef int                       cl_int;
typedef unsigned long long        cl_ulong;
typedef size_t                    cl_bool;

#define CL_SUCCESS 0
#define CL_DEVICE_TYPE_CPU (1ULL << 1)
#define CL_DEVICE_TYPE_GPU (1ULL << 2)
#define CL_MEM_READ_ONLY   (1ULL << 2)
#define CL_MEM_WRITE_ONLY  (1ULL << 1)
#define CL_MEM_READ_WRITE  (1ULL << 0)
#define CL_TRUE  1
#define CL_FALSE 0
#define CL_PROGRAM_BUILD_LOG 0x1183

class OpenCLBackend {
public:
  HMODULE hOpenCL;
  int ready;

  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kCorr;

  cl_mem bufFeat;
  cl_mem bufCorr;

  int featBytes;
  int corrBytes;

  cl_int (*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
  cl_int (*clGetDeviceIDs)(cl_platform_id, cl_ulong, cl_uint, cl_device_id*, cl_uint*);
  cl_context (*clCreateContext)(void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
  cl_command_queue (*clCreateCommandQueue)(cl_context, cl_device_id, cl_ulong, cl_int*);
  cl_program (*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
  cl_int (*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
  cl_int (*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
  cl_kernel (*clCreateKernel)(cl_program, const char*, cl_int*);
  cl_int (*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
  cl_mem (*clCreateBuffer)(cl_context, cl_ulong, size_t, void*, cl_int*);
  cl_int (*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
  cl_int (*clFinish)(cl_command_queue);
  cl_int (*clReleaseMemObject)(cl_mem);
  cl_int (*clReleaseKernel)(cl_kernel);
  cl_int (*clReleaseProgram)(cl_program);
  cl_int (*clReleaseCommandQueue)(cl_command_queue);
  cl_int (*clReleaseContext)(cl_context);

  OpenCLBackend()
  : hOpenCL(NULL), ready(0),
    platform(NULL), device(NULL), context(NULL), queue(NULL), program(NULL), kCorr(NULL),
    bufFeat(NULL), bufCorr(NULL),
    featBytes(0), corrBytes(0),
    clGetPlatformIDs(NULL), clGetDeviceIDs(NULL), clCreateContext(NULL), clCreateCommandQueue(NULL),
    clCreateProgramWithSource(NULL), clBuildProgram(NULL), clGetProgramBuildInfo(NULL),
    clCreateKernel(NULL), clSetKernelArg(NULL),
    clCreateBuffer(NULL), clEnqueueWriteBuffer(NULL), clEnqueueReadBuffer(NULL),
    clEnqueueNDRangeKernel(NULL), clFinish(NULL),
    clReleaseMemObject(NULL), clReleaseKernel(NULL), clReleaseProgram(NULL),
    clReleaseCommandQueue(NULL), clReleaseContext(NULL)
  {}

  int loadSymbol(void** fp, const char* name) {
    *fp = (void*)GetProcAddress(hOpenCL, name);
    return (*fp != NULL);
  }

  const char* kernelSource() {
    return
      "__kernel void corr_pairwise(\n"
      "  __global const float* feat,\n"
      "  __global float* outCorr,\n"
      "  const int nAssets,\n"
      "  const int nFeat,\n"
      "  const int windowSize,\n"
      "  const float eps\n"
      "){\n"
      "  int a = (int)get_global_id(0);\n"
      "  int b = (int)get_global_id(1);\n"
      "  if(a >= nAssets || b >= nAssets) return;\n"
      "  if(a >= b) return;\n"
      "  float acc = 0.0f;\n"
      "  for(int f=0; f<nFeat; f++){\n"
      "    int baseA = (f*nAssets + a) * windowSize;\n"
      "    int baseB = (f*nAssets + b) * windowSize;\n"
      "    float mx = 0.0f;\n"
      "    float my = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      mx += feat[baseA + t];\n"
      "      my += feat[baseB + t];\n"
      "    }\n"
      "    mx /= (float)windowSize;\n"
      "    my /= (float)windowSize;\n"
      "    float sxx = 0.0f;\n"
      "    float syy = 0.0f;\n"
      "    float sxy = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      float dx = feat[baseA + t] - mx;\n"
      "      float dy = feat[baseB + t] - my;\n"
      "      sxx += dx*dx;\n"
      "      syy += dy*dy;\n"
      "      sxy += dx*dy;\n"
      "    }\n"
      "    float den = sqrt(sxx*syy + eps);\n"
      "    float corr = (den > eps) ? (sxy/den) : 0.0f;\n"
      "    acc += corr;\n"
      "  }\n"
      "  outCorr[a*nAssets + b] = acc / (float)nFeat;\n"
      "}\n";
  }

  void printBuildLog() {
    if(!clGetProgramBuildInfo || !program || !device) return;
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if(logSize == 0) return;
    char* log = (char*)malloc(logSize + 1);
    if(!log) return;
    memset(log, 0, logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    printf("OpenCL build log:\n%s\n", log);
    free(log);
  }

  void init() {
    ready = 0;

    hOpenCL = LoadLibraryA("OpenCL.dll");
    if(!hOpenCL) {
      printf("OpenCL: CPU (OpenCL.dll missing)\n");
      return;
    }

    if(!loadSymbol((void**)&clGetPlatformIDs,       "clGetPlatformIDs")) return;
    if(!loadSymbol((void**)&clGetDeviceIDs,         "clGetDeviceIDs")) return;
    if(!loadSymbol((void**)&clCreateContext,        "clCreateContext")) return;
    if(!loadSymbol((void**)&clCreateCommandQueue,   "clCreateCommandQueue")) return;
    if(!loadSymbol((void**)&clCreateProgramWithSource,"clCreateProgramWithSource")) return;
    if(!loadSymbol((void**)&clBuildProgram,         "clBuildProgram")) return;
    if(!loadSymbol((void**)&clGetProgramBuildInfo,  "clGetProgramBuildInfo")) return;
    if(!loadSymbol((void**)&clCreateKernel,         "clCreateKernel")) return;
    if(!loadSymbol((void**)&clSetKernelArg,         "clSetKernelArg")) return;
    if(!loadSymbol((void**)&clCreateBuffer,         "clCreateBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueWriteBuffer,   "clEnqueueWriteBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueReadBuffer,    "clEnqueueReadBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueNDRangeKernel, "clEnqueueNDRangeKernel")) return;
    if(!loadSymbol((void**)&clFinish,               "clFinish")) return;
    if(!loadSymbol((void**)&clReleaseMemObject,     "clReleaseMemObject")) return;
    if(!loadSymbol((void**)&clReleaseKernel,        "clReleaseKernel")) return;
    if(!loadSymbol((void**)&clReleaseProgram,       "clReleaseProgram")) return;
    if(!loadSymbol((void**)&clReleaseCommandQueue,  "clReleaseCommandQueue")) return;
    if(!loadSymbol((void**)&clReleaseContext,       "clReleaseContext")) return;

    cl_uint nPlat = 0;
    if(clGetPlatformIDs(0, NULL, &nPlat) != CL_SUCCESS || nPlat == 0) {
      printf("OpenCL: CPU (no platform)\n");
      return;
    }
    clGetPlatformIDs(1, &platform, NULL);

    cl_uint nDev = 0;
    cl_int ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &nDev);
    if(ok != CL_SUCCESS || nDev == 0) {
      ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &nDev);
      if(ok != CL_SUCCESS || nDev == 0) {
        printf("OpenCL: CPU (no device)\n");
        return;
      }
    }

    cl_int err = 0;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS || !context) {
      printf("OpenCL: CPU (context fail)\n");
      return;
    }

    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS || !queue) {
      printf("OpenCL: CPU (queue fail)\n");
      return;
    }

    const char* src = kernelSource();
    program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    if(err != CL_SUCCESS || !program) {
      printf("OpenCL: CPU (program fail)\n");
      return;
    }

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if(err != CL_SUCCESS) {
      printf("OpenCL: CPU (build fail)\n");
      printBuildLog();
      return;
    }

    kCorr = clCreateKernel(program, "corr_pairwise", &err);
    if(err != CL_SUCCESS || !kCorr) {
      printf("OpenCL: CPU (kernel fail)\n");
      printBuildLog();
      return;
    }

    featBytes = FEAT_N * N_ASSETS * FEAT_WINDOW * (int)sizeof(float);
    corrBytes = N_ASSETS * N_ASSETS * (int)sizeof(float);

    bufFeat = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)featBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufFeat) {
      printf("OpenCL: CPU (bufFeat fail)\n");
      return;
    }

    bufCorr = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (size_t)corrBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufCorr) {
      printf("OpenCL: CPU (bufCorr fail)\n");
      return;
    }

    ready = 1;
    printf("OpenCL: READY (kernel+buffers)\n");
  }

  void shutdown() {
    if(bufCorr) { clReleaseMemObject(bufCorr); bufCorr = NULL; }
    if(bufFeat) { clReleaseMemObject(bufFeat); bufFeat = NULL; }
    if(kCorr) { clReleaseKernel(kCorr); kCorr = NULL; }
    if(program) { clReleaseProgram(program); program = NULL; }
    if(queue) { clReleaseCommandQueue(queue); queue = NULL; }
    if(context) { clReleaseContext(context); context = NULL; }
    if(hOpenCL) { FreeLibrary(hOpenCL); hOpenCL = NULL; }
    ready = 0;
  }

  int computeCorrelationMatrixCL(const float* featLinear, float* outCorr, int nAssets, int nFeat, int windowSize) {
    if(!ready) return 0;
    if(!featLinear || !outCorr) return 0;

    cl_int err = clEnqueueWriteBuffer(queue, bufFeat, CL_TRUE, 0, (size_t)featBytes, featLinear, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    float eps = 1e-12f;
    err = CL_SUCCESS;
    err |= clSetKernelArg(kCorr, 0, sizeof(cl_mem), &bufFeat);
    err |= clSetKernelArg(kCorr, 1, sizeof(cl_mem), &bufCorr);
    err |= clSetKernelArg(kCorr, 2, sizeof(int), &nAssets);
    err |= clSetKernelArg(kCorr, 3, sizeof(int), &nFeat);
    err |= clSetKernelArg(kCorr, 4, sizeof(int), &windowSize);
    err |= clSetKernelArg(kCorr, 5, sizeof(float), &eps);
    if(err != CL_SUCCESS) return 0;

    size_t global[2];
    global[0] = (size_t)nAssets;
    global[1] = (size_t)nAssets;

    err = clEnqueueNDRangeKernel(queue, kCorr, 2, NULL, global, NULL, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    err = clFinish(queue);
    if(err != CL_SUCCESS) return 0;

    err = clEnqueueReadBuffer(queue, bufCorr, CL_TRUE, 0, (size_t)corrBytes, outCorr, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    return 1;
  }
};

// ---------------------------- Strategy ----------------------------

class CompactDominantStrategy {
public:
  ExposureTable exposureTable;
  FeatureBufferSoA featSoA;
  OpenCLBackend openCL;

  SlabAllocator<fvar> corrMatrix;
  SlabAllocator<fvar> distMatrix;
  SlabAllocator<fvar> compactness;
  SlabAllocator<fvar> scores;

  SlabAllocator<float> featLinear;
  SlabAllocator<float> corrLinear;

  int barCount;
  int updateCount;

  CompactDominantStrategy() : barCount(0), updateCount(0) {}

  void init() {
    printf("CompactDominant_v3: Initializing...\n");

    exposureTable.init();
    featSoA.init(N_ASSETS, FEAT_WINDOW);

    corrMatrix.init(N_ASSETS * N_ASSETS);
    distMatrix.init(N_ASSETS * N_ASSETS);
    compactness.init(N_ASSETS);
    scores.init(N_ASSETS);

    featLinear.init(FEAT_N * N_ASSETS * FEAT_WINDOW);
    corrLinear.init(N_ASSETS * N_ASSETS);

    openCL.init();
    printf("CompactDominant_v3: Ready (OpenCL=%d)\n", openCL.ready);

    barCount = 0;
    updateCount = 0;
  }

  void shutdown() {
    printf("CompactDominant_v3: Shutting down...\n");

    openCL.shutdown();

    featSoA.shutdown();
    corrMatrix.shutdown();
    distMatrix.shutdown();
    compactness.shutdown();
    scores.shutdown();

    featLinear.shutdown();
    corrLinear.shutdown();
  }

  void computeFeatures(int assetIdx) {
    asset((char*)ASSET_NAMES[assetIdx]);

    vars C = series(priceClose(0));
    vars V = series(Volatility(C, 20));

    if(Bar < 1) return;

    fvar r1 = (fvar)log(C[0] / C[1]);
    fvar rN = (fvar)log(C[0] / C[12]);
    fvar vol = (fvar)V[0];
    fvar zscore = (fvar)((C[0] - C[50]) / (V[0] * 20.0 + EPS));
    fvar rangeP = (fvar)((C[0] - C[50]) / (C[0] + EPS));
    fvar flow = (fvar)(r1 * vol);
    fvar regime = (fvar)((vol > 0.001) ? 1.0 : 0.0);
    fvar volOfVol = (fvar)(vol * vol);
    fvar persistence = (fvar)fabs(r1);

    featSoA.push(0, assetIdx, r1);
    featSoA.push(1, assetIdx, rN);
    featSoA.push(2, assetIdx, vol);
    featSoA.push(3, assetIdx, zscore);
    featSoA.push(4, assetIdx, rangeP);
    featSoA.push(5, assetIdx, flow);
    featSoA.push(6, assetIdx, regime);
    featSoA.push(7, assetIdx, volOfVol);
    featSoA.push(8, assetIdx, persistence);
  }

  void computeCorrelationMatrixCPU() {
    for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = 0;

    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int b=a+1; b<N_ASSETS; b++){
          fvar mx = 0, my = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            mx += featSoA.get(f,a,t);
            my += featSoA.get(f,b,t);
          }
          mx /= (fvar)FEAT_WINDOW;
          my /= (fvar)FEAT_WINDOW;

          fvar sxx = 0, syy = 0, sxy = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            fvar dx = featSoA.get(f,a,t) - mx;
            fvar dy = featSoA.get(f,b,t) - my;
            sxx += dx*dx;
            syy += dy*dy;
            sxy += dx*dy;
          }

          fvar den = (fvar)sqrt((double)(sxx*syy + (fvar)EPS));
          fvar corr = 0;
          if(den > (fvar)EPS) corr = sxy / den;
          else corr = 0;

          int idx = a*N_ASSETS + b;
          corrMatrix[idx] += corr / (fvar)FEAT_N;
          corrMatrix[b*N_ASSETS + a] = corrMatrix[idx];
        }
      }
    }
  }

  void buildFeatLinear() {
    int idx = 0;
    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int t=0; t<FEAT_WINDOW; t++){
          featLinear[idx] = (float)featSoA.get(f, a, t);
          idx++;
        }
      }
    }
  }

  void computeCorrelationMatrix() {
    if(openCL.ready) {
      buildFeatLinear();

      for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrLinear[i] = 0.0f;

      int ok = openCL.computeCorrelationMatrixCL(
        featLinear.data,
        corrLinear.data,
        N_ASSETS,
        FEAT_N,
        FEAT_WINDOW
      );

      if(ok) {
        for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = (fvar)0;

        for(int a=0; a<N_ASSETS; a++){
          corrMatrix[a*N_ASSETS + a] = (fvar)1.0;
          for(int b=a+1; b<N_ASSETS; b++){
            float c = corrLinear[a*N_ASSETS + b];
            corrMatrix[a*N_ASSETS + b] = (fvar)c;
            corrMatrix[b*N_ASSETS + a] = (fvar)c;
          }
        }
        return;
      }

      printf("OpenCL: runtime fail -> CPU fallback\n");
      openCL.ready = 0;
    }

    computeCorrelationMatrixCPU();
  }

  void computeDistanceMatrix() {
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        if(i == j) {
          distMatrix[i*N_ASSETS + j] = (fvar)0;
        } else {
          fvar corrDist = (fvar)1.0 - (fvar)fabs((double)corrMatrix[i*N_ASSETS + j]);
          fvar expDist  = (fvar)exposureTable.getDist(i, j);
          fvar blended = (fvar)LAMBDA_META * corrDist + (fvar)(1.0 - (double)LAMBDA_META) * expDist;
          distMatrix[i*N_ASSETS + j] = blended;
        }
      }
    }
  }

  void floydWarshall() {
    fvar d[28][28];

    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        d[i][j] = distMatrix[i*N_ASSETS + j];
        if(i == j) d[i][j] = (fvar)0;
        if(d[i][j] < (fvar)0) d[i][j] = (fvar)INF;
      }
    }

    for(int k=0;k<N_ASSETS;k++){
      for(int i=0;i<N_ASSETS;i++){
        for(int j=0;j<N_ASSETS;j++){
          if(d[i][k] < (fvar)INF && d[k][j] < (fvar)INF) {
            fvar nk = d[i][k] + d[k][j];
            if(nk < d[i][j]) d[i][j] = nk;
          }
        }
      }
    }

    for(int i=0;i<N_ASSETS;i++){
      fvar w = 0;
      for(int j=i+1;j<N_ASSETS;j++){
        if(d[i][j] < (fvar)INF) w += d[i][j];
      }
      if(w > (fvar)0) compactness[i] = (fvar)(1.0 / (1.0 + (double)w));
      else compactness[i] = (fvar)0;
    }
  }

  void computeScores() {
    for(int i=0;i<N_ASSETS;i++){
      fvar coupling = 0;
      int count = 0;

      for(int j=0;j<N_ASSETS;j++){
        if(i != j && distMatrix[i*N_ASSETS + j] < (fvar)INF) {
          coupling += compactness[j];
          count++;
        }
      }

      fvar pCouple = 0;
      if(count > 0) pCouple = coupling / (fvar)count;
      else pCouple = (fvar)0;

      fvar regime = featSoA.get(6, i, 0);
      fvar rawScore = (fvar)ALPHA * regime + (fvar)GAMMA * compactness[i] - (fvar)BETA * pCouple;

      if(rawScore > (fvar)30) rawScore = (fvar)30;
      if(rawScore < (fvar)-30) rawScore = (fvar)-30;

      scores[i] = (fvar)(1.0 / (1.0 + exp(-(double)rawScore)));
    }
  }

  void onBar() {
    barCount++;

    for(int i=0;i<N_ASSETS;i++) computeFeatures(i);

    if(barCount % UPDATE_EVERY == 0) {
      updateCount++;

      computeCorrelationMatrix();
      computeDistanceMatrix();
      floydWarshall();
      computeScores();
      printTopK();
    }
  }

  void printTopK() {
    int indices[N_ASSETS];
    for(int i=0;i<N_ASSETS;i++) indices[i] = i;

    for(int i=0;i<TOP_K;i++){
      for(int j=i+1;j<N_ASSETS;j++){
        if(scores[indices[j]] > scores[indices[i]]) {
          int tmp = indices[i];
          indices[i] = indices[j];
          indices[j] = tmp;
        }
      }
    }

    if(updateCount % 10 == 0) {
      printf("===CompactDominant_v3 Top-K(update#%d,OpenCL=%d)===\n",
        updateCount, openCL.ready);

      for(int i=0;i<TOP_K;i++){
        int idx = indices[i];
        printf(" %d.%s: score=%.4f, C=%.4f\n", i+1, ASSET_NAMES[idx], (double)scores[idx], (double)compactness[idx]);
      }
    }
  }
};

// ---------------------------- Zorro DLL entry ----------------------------

static CompactDominantStrategy* S = NULL;

DLLFUNC void run()
{
  if(is(INITRUN)) {
    BarPeriod = 60;
    LookBack = max(LookBack, FEAT_WINDOW + 50);

    asset((char*)ASSET_NAMES[0]);

    if(!S) {
      S = new CompactDominantStrategy();
      S->init();
    }
  }

  if(is(EXITRUN)) {
    if(S) {
      S->shutdown();
      delete S;
      S = NULL;
    }
    return;
  }

  if(!S || Bar < LookBack)
    return;

  S->onBar();
}

Last edited by TipmyPip; Yesterday at 23:21.
CrowdAverseCL (openCL) [Re: TipmyPip] #489221
Yesterday at 21:07
Yesterday at 21:07
Joined: Sep 2017
Posts: 223
TipmyPip Online OP
Member
TipmyPip  Online OP
Member

Joined: Sep 2017
Posts: 223
CrowdAverse_v3 is a multi-asset, “anti-crowding” ranking engine for a 28-pair FX basket, packaged as a Zorro64 Strategy DLL. Instead of directly issuing trades, it continuously scores all pairs and prints the top candidates, aiming to favor assets that are informative (high internal variation) yet structurally less crowded in portfolio space.

At initialization, the strategy builds an ExposureTable by decomposing every FX symbol into base/quote currencies (EUR, GBP, USD, CHF, JPY, AUD, CAD, NZD). Each pair gets an 8-dimensional exposure vector (+1 base, ?1 quote, 0 otherwise). From that, it precomputes an exposure distance between every pair: pairs with similar currency legs are “closer” (more crowded) than pairs with disjoint or opposing exposures.

Every bar (BarPeriod = 60), the system computes 9 features per asset over time using a Structure-of-Arrays (SoA) ring buffer: short and medium log returns (ret1, retN), volatility and volatility-of-volatility, a z-style deviation from a mid reference, a normalized range proxy, a simple “flow” proxy (ret×vol), a regime flag, and persistence (abs(ret1)). This SoA layout plus a slab allocator reduces pointer chasing and keeps memory contiguous for faster matrix work.

Every 5 bars, it refreshes the correlation matrix by averaging feature-wise correlations across the 9 channels and converts that into a correlation distance (1 ? |corr|). It then blends correlation distance with exposure distance (with heavier emphasis on exposure via LAMBDA_META) to form a crowd-averse distance matrix. A Floyd-Warshall pass computes shortest paths to estimate each asset’s compactness (how “central” it is in the network). In parallel, an entropy proxy (variance of recent returns) is computed per asset to represent how “active/expressive” its return stream is.

Finally, each asset gets a score from a sigmoid of:
ALPHA·Entropy + GAMMA·Compactness ? BETA·PeerCompactness,
where PeerCompactness is the average compactness of its neighbors (a crowding penalty). Higher BETA makes the model explicitly avoid “where everyone already is.”

Enhancing OpenCL support (logic): the current OpenCLBackend only detects OpenCL.dll and sets a readiness flag. A real enhancement would offload the heavy O(N²·Window·Features) correlation build (and the N² distance blend) to the GPU by: creating a device context/queue, allocating persistent device buffers for the feature tensor and matrices, uploading only the newest feature slice each bar (ring-buffer aware), running kernels for mean/variance/covariance reductions, and reading back only the final score inputs. Floyd-Warshall can also be accelerated (blocked APSP or repeated min-plus steps), but a pragmatic first win is GPU correlation + distance, with a clean CPU fallback when OpenCL is absent or kernel compilation fails.

Code
// TGr06B_CrowdAverse_v3.cpp - Zorro64 Strategy DLL
// Strategy B v3: Crowd-Averse with MX06 OOP + OpenCL (real offload) + Memory Optimization
// Notes:
// - Keeps full CPU fallback.
// - OpenCL is optional: if OpenCL.dll missing / no device / kernel build fails -> CPU path.
// - OpenCL accelerates the heavy correlation matrix step by offloading pairwise correlations.
// - Correlation is computed in float on GPU; results are stored back into fvar corrMatrix.

#define _CRT_SECURE_NO_WARNINGS
#include <zorro.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <stddef.h>

#define INF 1e30
#define EPS 1e-12
#define N_ASSETS 28
#define FEAT_N 9
#define FEAT_WINDOW 200
#define UPDATE_EVERY 5
#define TOP_K 5

#define ALPHA 0.1
#define BETA 0.3
#define GAMMA 2.5
#define LAMBDA_META 0.5

#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef double fvar;
#endif

static const char* ASSET_NAMES[] = {
  "EURUSD","GBPUSD","USDCHF","USDJPY","AUDUSD","AUDCAD","AUDCHF","AUDJPY","AUDNZD",
  "CADJPY","CADCHF","EURAUD","EURCAD","EURCHF","EURGBP","EURJPY","EURNZD","GBPAUD",
  "GBPCAD","GBPCHF","GBPJPY","GBPNZD","NZDCAD","NZDCHF","NZDJPY","NZDUSD","USDCAD"
};
static const char* CURRENCIES[] = {"EUR","GBP","USD","CHF","JPY","AUD","CAD","NZD"};
#define N_CURRENCIES 8

// ---------------------------- Exposure Table ----------------------------

struct ExposureTable {
  int exposure[N_ASSETS][N_CURRENCIES];
  double exposureDist[N_ASSETS][N_ASSETS];

  void init() {
    for(int i=0;i<N_ASSETS;i++){
      for(int c=0;c<N_CURRENCIES;c++){
        exposure[i][c] = 0;
      }
    }
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        exposureDist[i][j] = 0.0;
      }
    }
  }

  inline double getDist(int i,int j) const { return exposureDist[i][j]; }
};

// ---------------------------- Slab Allocator ----------------------------

template<typename T>
class SlabAllocator {
public:
  T* data;
  int capacity;

  SlabAllocator() : data(NULL), capacity(0) {}
  ~SlabAllocator() { shutdown(); }

  void init(int size) {
    shutdown();
    capacity = size;
    data = (T*)malloc((size_t)capacity * sizeof(T));
    if(data) memset(data, 0, (size_t)capacity * sizeof(T));
  }

  void shutdown() {
    if(data) free(data);
    data = NULL;
    capacity = 0;
  }

  T& operator[](int i) { return data[i]; }
  const T& operator[](int i) const { return data[i]; }
};

// ---------------------------- Feature Buffer (SoA ring) ----------------------------

struct FeatureBufferSoA {
  SlabAllocator<fvar> buffer;
  int windowSize;
  int currentIndex;

  void init(int assets, int window) {
    windowSize = window;
    currentIndex = 0;
    buffer.init(FEAT_N * assets * window);
  }

  void shutdown() { buffer.shutdown(); }

  inline int offset(int feat,int asset,int t) const {
    return (feat * N_ASSETS + asset) * windowSize + t;
  }

  void push(int feat,int asset,fvar value) {
    buffer[offset(feat, asset, currentIndex)] = value;
    currentIndex = (currentIndex + 1) % windowSize;
  }

  // t=0 => most recent
  fvar get(int feat,int asset,int t) const {
    int idx = (currentIndex - 1 - t + windowSize) % windowSize;
    return buffer[offset(feat, asset, idx)];
  }
};

// ---------------------------- Minimal OpenCL (dynamic) ----------------------------

typedef struct _cl_platform_id*   cl_platform_id;
typedef struct _cl_device_id*     cl_device_id;
typedef struct _cl_context*       cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program*       cl_program;
typedef struct _cl_kernel*        cl_kernel;
typedef struct _cl_mem*           cl_mem;
typedef unsigned int              cl_uint;
typedef int                       cl_int;
typedef unsigned long long        cl_ulong;
typedef size_t                    cl_bool;

#define CL_SUCCESS 0
#define CL_DEVICE_TYPE_CPU (1ULL << 1)
#define CL_DEVICE_TYPE_GPU (1ULL << 2)
#define CL_MEM_READ_ONLY   (1ULL << 2)
#define CL_MEM_WRITE_ONLY  (1ULL << 1)
#define CL_MEM_READ_WRITE  (1ULL << 0)
#define CL_TRUE  1
#define CL_FALSE 0
#define CL_PROGRAM_BUILD_LOG 0x1183

class OpenCLBackend {
public:
  HMODULE hOpenCL;
  int ready;

  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kCorr;

  cl_mem bufFeat;
  cl_mem bufCorr;

  int featBytes;
  int corrBytes;

  cl_int (*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
  cl_int (*clGetDeviceIDs)(cl_platform_id, cl_ulong, cl_uint, cl_device_id*, cl_uint*);
  cl_context (*clCreateContext)(void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
  cl_command_queue (*clCreateCommandQueue)(cl_context, cl_device_id, cl_ulong, cl_int*);
  cl_program (*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
  cl_int (*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
  cl_int (*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
  cl_kernel (*clCreateKernel)(cl_program, const char*, cl_int*);
  cl_int (*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
  cl_mem (*clCreateBuffer)(cl_context, cl_ulong, size_t, void*, cl_int*);
  cl_int (*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
  cl_int (*clFinish)(cl_command_queue);
  cl_int (*clReleaseMemObject)(cl_mem);
  cl_int (*clReleaseKernel)(cl_kernel);
  cl_int (*clReleaseProgram)(cl_program);
  cl_int (*clReleaseCommandQueue)(cl_command_queue);
  cl_int (*clReleaseContext)(cl_context);

  OpenCLBackend()
  : hOpenCL(NULL), ready(0),
    platform(NULL), device(NULL), context(NULL), queue(NULL), program(NULL), kCorr(NULL),
    bufFeat(NULL), bufCorr(NULL),
    featBytes(0), corrBytes(0),
    clGetPlatformIDs(NULL), clGetDeviceIDs(NULL), clCreateContext(NULL), clCreateCommandQueue(NULL),
    clCreateProgramWithSource(NULL), clBuildProgram(NULL), clGetProgramBuildInfo(NULL),
    clCreateKernel(NULL), clSetKernelArg(NULL),
    clCreateBuffer(NULL), clEnqueueWriteBuffer(NULL), clEnqueueReadBuffer(NULL),
    clEnqueueNDRangeKernel(NULL), clFinish(NULL),
    clReleaseMemObject(NULL), clReleaseKernel(NULL), clReleaseProgram(NULL),
    clReleaseCommandQueue(NULL), clReleaseContext(NULL)
  {}

  int loadSymbol(void** fp, const char* name) {
    *fp = (void*)GetProcAddress(hOpenCL, name);
    return (*fp != NULL);
  }

  const char* kernelSource() {
    return
      "__kernel void corr_pairwise(\n"
      "  __global const float* feat,\n"
      "  __global float* outCorr,\n"
      "  const int nAssets,\n"
      "  const int nFeat,\n"
      "  const int windowSize,\n"
      "  const float eps\n"
      "){\n"
      "  int a = (int)get_global_id(0);\n"
      "  int b = (int)get_global_id(1);\n"
      "  if(a >= nAssets || b >= nAssets) return;\n"
      "  if(a >= b) return;\n"
      "  float acc = 0.0f;\n"
      "  for(int f=0; f<nFeat; f++){\n"
      "    int baseA = (f*nAssets + a) * windowSize;\n"
      "    int baseB = (f*nAssets + b) * windowSize;\n"
      "    float mx = 0.0f;\n"
      "    float my = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      mx += feat[baseA + t];\n"
      "      my += feat[baseB + t];\n"
      "    }\n"
      "    mx /= (float)windowSize;\n"
      "    my /= (float)windowSize;\n"
      "    float sxx = 0.0f;\n"
      "    float syy = 0.0f;\n"
      "    float sxy = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      float dx = feat[baseA + t] - mx;\n"
      "      float dy = feat[baseB + t] - my;\n"
      "      sxx += dx*dx;\n"
      "      syy += dy*dy;\n"
      "      sxy += dx*dy;\n"
      "    }\n"
      "    float den = sqrt(sxx*syy + eps);\n"
      "    float corr = (den > eps) ? (sxy/den) : 0.0f;\n"
      "    acc += corr;\n"
      "  }\n"
      "  outCorr[a*nAssets + b] = acc / (float)nFeat;\n"
      "}\n";
  }

  void printBuildLog() {
    if(!clGetProgramBuildInfo || !program || !device) return;
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if(logSize == 0) return;
    char* log = (char*)malloc(logSize + 1);
    if(!log) return;
    memset(log, 0, logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    printf("OpenCL build log:\n%s\n", log);
    free(log);
  }

  void init() {
    ready = 0;

    hOpenCL = LoadLibraryA("OpenCL.dll");
    if(!hOpenCL) {
      printf("OpenCL: CPU (OpenCL.dll missing)\n");
      return;
    }

    if(!loadSymbol((void**)&clGetPlatformIDs,       "clGetPlatformIDs")) return;
    if(!loadSymbol((void**)&clGetDeviceIDs,         "clGetDeviceIDs")) return;
    if(!loadSymbol((void**)&clCreateContext,        "clCreateContext")) return;
    if(!loadSymbol((void**)&clCreateCommandQueue,   "clCreateCommandQueue")) return;
    if(!loadSymbol((void**)&clCreateProgramWithSource,"clCreateProgramWithSource")) return;
    if(!loadSymbol((void**)&clBuildProgram,         "clBuildProgram")) return;
    if(!loadSymbol((void**)&clGetProgramBuildInfo,  "clGetProgramBuildInfo")) return;
    if(!loadSymbol((void**)&clCreateKernel,         "clCreateKernel")) return;
    if(!loadSymbol((void**)&clSetKernelArg,         "clSetKernelArg")) return;
    if(!loadSymbol((void**)&clCreateBuffer,         "clCreateBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueWriteBuffer,   "clEnqueueWriteBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueReadBuffer,    "clEnqueueReadBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueNDRangeKernel, "clEnqueueNDRangeKernel")) return;
    if(!loadSymbol((void**)&clFinish,               "clFinish")) return;
    if(!loadSymbol((void**)&clReleaseMemObject,     "clReleaseMemObject")) return;
    if(!loadSymbol((void**)&clReleaseKernel,        "clReleaseKernel")) return;
    if(!loadSymbol((void**)&clReleaseProgram,       "clReleaseProgram")) return;
    if(!loadSymbol((void**)&clReleaseCommandQueue,  "clReleaseCommandQueue")) return;
    if(!loadSymbol((void**)&clReleaseContext,       "clReleaseContext")) return;

    cl_uint nPlat = 0;
    if(clGetPlatformIDs(0, NULL, &nPlat) != CL_SUCCESS || nPlat == 0) {
      printf("OpenCL: CPU (no platform)\n");
      return;
    }
    clGetPlatformIDs(1, &platform, NULL);

    cl_uint nDev = 0;
    cl_int ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &nDev);
    if(ok != CL_SUCCESS || nDev == 0) {
      ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &nDev);
      if(ok != CL_SUCCESS || nDev == 0) {
        printf("OpenCL: CPU (no device)\n");
        return;
      }
    }

    cl_int err = 0;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS || !context) {
      printf("OpenCL: CPU (context fail)\n");
      return;
    }

    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS || !queue) {
      printf("OpenCL: CPU (queue fail)\n");
      return;
    }

    const char* src = kernelSource();
    program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    if(err != CL_SUCCESS || !program) {
      printf("OpenCL: CPU (program fail)\n");
      return;
    }

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if(err != CL_SUCCESS) {
      printf("OpenCL: CPU (build fail)\n");
      printBuildLog();
      return;
    }

    kCorr = clCreateKernel(program, "corr_pairwise", &err);
    if(err != CL_SUCCESS || !kCorr) {
      printf("OpenCL: CPU (kernel fail)\n");
      printBuildLog();
      return;
    }

    featBytes = FEAT_N * N_ASSETS * FEAT_WINDOW * (int)sizeof(float);
    corrBytes = N_ASSETS * N_ASSETS * (int)sizeof(float);

    bufFeat = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)featBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufFeat) {
      printf("OpenCL: CPU (bufFeat fail)\n");
      return;
    }

    bufCorr = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (size_t)corrBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufCorr) {
      printf("OpenCL: CPU (bufCorr fail)\n");
      return;
    }

    ready = 1;
    printf("OpenCL: READY (kernel+buffers)\n");
  }

  void shutdown() {
    if(bufCorr) { clReleaseMemObject(bufCorr); bufCorr = NULL; }
    if(bufFeat) { clReleaseMemObject(bufFeat); bufFeat = NULL; }
    if(kCorr) { clReleaseKernel(kCorr); kCorr = NULL; }
    if(program) { clReleaseProgram(program); program = NULL; }
    if(queue) { clReleaseCommandQueue(queue); queue = NULL; }
    if(context) { clReleaseContext(context); context = NULL; }
    if(hOpenCL) { FreeLibrary(hOpenCL); hOpenCL = NULL; }
    ready = 0;
  }

  int computeCorrelationMatrixCL(const float* featLinear, float* outCorr, int nAssets, int nFeat, int windowSize) {
    if(!ready) return 0;
    if(!featLinear || !outCorr) return 0;

    cl_int err = clEnqueueWriteBuffer(queue, bufFeat, CL_TRUE, 0, (size_t)featBytes, featLinear, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    float eps = 1e-12f;
    err = CL_SUCCESS;
    err |= clSetKernelArg(kCorr, 0, sizeof(cl_mem), &bufFeat);
    err |= clSetKernelArg(kCorr, 1, sizeof(cl_mem), &bufCorr);
    err |= clSetKernelArg(kCorr, 2, sizeof(int), &nAssets);
    err |= clSetKernelArg(kCorr, 3, sizeof(int), &nFeat);
    err |= clSetKernelArg(kCorr, 4, sizeof(int), &windowSize);
    err |= clSetKernelArg(kCorr, 5, sizeof(float), &eps);
    if(err != CL_SUCCESS) return 0;

    size_t global[2];
    global[0] = (size_t)nAssets;
    global[1] = (size_t)nAssets;

    err = clEnqueueNDRangeKernel(queue, kCorr, 2, NULL, global, NULL, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    err = clFinish(queue);
    if(err != CL_SUCCESS) return 0;

    err = clEnqueueReadBuffer(queue, bufCorr, CL_TRUE, 0, (size_t)corrBytes, outCorr, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    return 1;
  }
};

// ---------------------------- Strategy ----------------------------

class CrowdAverseStrategy {
public:
  ExposureTable exposureTable;
  FeatureBufferSoA featSoA;
  OpenCLBackend openCL;

  SlabAllocator<fvar> corrMatrix;
  SlabAllocator<fvar> distMatrix;
  SlabAllocator<fvar> compactness;
  SlabAllocator<fvar> entropy;
  SlabAllocator<fvar> scores;

  SlabAllocator<float> featLinear;
  SlabAllocator<float> corrLinear;

  int barCount;
  int updateCount;

  CrowdAverseStrategy() : barCount(0), updateCount(0) {}

  void init() {
    printf("CrowdAverse_v3: Initializing...\n");

    exposureTable.init();
    featSoA.init(N_ASSETS, FEAT_WINDOW);

    corrMatrix.init(N_ASSETS * N_ASSETS);
    distMatrix.init(N_ASSETS * N_ASSETS);
    compactness.init(N_ASSETS);
    entropy.init(N_ASSETS);
    scores.init(N_ASSETS);

    featLinear.init(FEAT_N * N_ASSETS * FEAT_WINDOW);
    corrLinear.init(N_ASSETS * N_ASSETS);

    openCL.init();
    printf("CrowdAverse_v3: Ready (OpenCL=%d)\n", openCL.ready);

    barCount = 0;
    updateCount = 0;
  }

  void shutdown() {
    printf("CrowdAverse_v3: Shutting down...\n");

    openCL.shutdown();

    featSoA.shutdown();
    corrMatrix.shutdown();
    distMatrix.shutdown();
    compactness.shutdown();
    entropy.shutdown();
    scores.shutdown();

    featLinear.shutdown();
    corrLinear.shutdown();
  }

  void computeFeatures(int assetIdx) {
    asset((char*)ASSET_NAMES[assetIdx]);

    vars C = series(priceClose(0));
    vars V = series(Volatility(C, 20));

    if(Bar < 1) return;

    fvar r1 = (fvar)log(C[0] / C[1]);
    fvar rN = (fvar)log(C[0] / C[12]);
    fvar vol = (fvar)V[0];
    fvar zscore = (fvar)((C[0] - C[50]) / (V[0] * 20.0 + EPS));
    fvar rangeP = (fvar)((C[0] - C[50]) / (C[0] + EPS));
    fvar flow = (fvar)(r1 * vol);
    fvar regime = (fvar)((vol > 0.001) ? 1.0 : 0.0);
    fvar volOfVol = (fvar)(vol * vol);
    fvar persistence = (fvar)fabs(r1);

    featSoA.push(0, assetIdx, r1);
    featSoA.push(1, assetIdx, rN);
    featSoA.push(2, assetIdx, vol);
    featSoA.push(3, assetIdx, zscore);
    featSoA.push(4, assetIdx, rangeP);
    featSoA.push(5, assetIdx, flow);
    featSoA.push(6, assetIdx, regime);
    featSoA.push(7, assetIdx, volOfVol);
    featSoA.push(8, assetIdx, persistence);
  }

  fvar computeEntropy(int assetIdx) {
    fvar mean = 0;
    for(int t=0; t<FEAT_WINDOW; t++) mean += featSoA.get(0, assetIdx, t);
    mean /= FEAT_WINDOW;
    fvar var = 0;
    for(int t=0; t<FEAT_WINDOW; t++) { fvar d = featSoA.get(0, assetIdx, t) - mean; var += d*d; }
    return (fvar)(var / FEAT_WINDOW);
  }

  void computeCorrelationMatrixCPU() {
    for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = 0;

    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int b=a+1; b<N_ASSETS; b++){
          fvar mx = 0, my = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            mx += featSoA.get(f,a,t);
            my += featSoA.get(f,b,t);
          }
          mx /= (fvar)FEAT_WINDOW;
          my /= (fvar)FEAT_WINDOW;

          fvar sxx = 0, syy = 0, sxy = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            fvar dx = featSoA.get(f,a,t) - mx;
            fvar dy = featSoA.get(f,b,t) - my;
            sxx += dx*dx;
            syy += dy*dy;
            sxy += dx*dy;
          }

          fvar den = (fvar)sqrt((double)(sxx*syy + (fvar)EPS));
          fvar corr = 0;
          if(den > (fvar)EPS) corr = sxy / den;
          else corr = 0;

          int idx = a*N_ASSETS + b;
          corrMatrix[idx] += corr / (fvar)FEAT_N;
          corrMatrix[b*N_ASSETS + a] = corrMatrix[idx];
        }
      }
    }
  }

  void buildFeatLinear() {
    int idx = 0;
    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int t=0; t<FEAT_WINDOW; t++){
          featLinear[idx] = (float)featSoA.get(f, a, t);
          idx++;
        }
      }
    }
  }

  void computeCorrelationMatrix() {
    if(openCL.ready) {
      buildFeatLinear();

      for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrLinear[i] = 0.0f;

      int ok = openCL.computeCorrelationMatrixCL(
        featLinear.data,
        corrLinear.data,
        N_ASSETS,
        FEAT_N,
        FEAT_WINDOW
      );

      if(ok) {
        for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = (fvar)0;

        for(int a=0; a<N_ASSETS; a++){
          corrMatrix[a*N_ASSETS + a] = (fvar)1.0;
          for(int b=a+1; b<N_ASSETS; b++){
            float c = corrLinear[a*N_ASSETS + b];
            corrMatrix[a*N_ASSETS + b] = (fvar)c;
            corrMatrix[b*N_ASSETS + a] = (fvar)c;
          }
        }
        return;
      }

      printf("OpenCL: runtime fail -> CPU fallback\n");
      openCL.ready = 0;
    }

    computeCorrelationMatrixCPU();
  }

  void computeDistanceMatrix() {
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        if(i == j) {
          distMatrix[i*N_ASSETS + j] = (fvar)0;
        } else {
          fvar corrDist = (fvar)1.0 - (fvar)fabs((double)corrMatrix[i*N_ASSETS + j]);
          fvar expDist  = (fvar)exposureTable.getDist(i, j);
          fvar blended = (fvar)LAMBDA_META * corrDist + (fvar)(1.0 - (double)LAMBDA_META) * expDist;
          distMatrix[i*N_ASSETS + j] = blended;
        }
      }
    }
  }

  void floydWarshall() {
    fvar d[28][28];

    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        d[i][j] = distMatrix[i*N_ASSETS + j];
        if(i == j) d[i][j] = (fvar)0;
        if(d[i][j] < (fvar)0) d[i][j] = (fvar)INF;
      }
    }

    for(int k=0;k<N_ASSETS;k++){
      for(int i=0;i<N_ASSETS;i++){
        for(int j=0;j<N_ASSETS;j++){
          if(d[i][k] < (fvar)INF && d[k][j] < (fvar)INF) {
            fvar nk = d[i][k] + d[k][j];
            if(nk < d[i][j]) d[i][j] = nk;
          }
        }
      }
    }

    for(int i=0;i<N_ASSETS;i++){
      fvar w = 0;
      for(int j=i+1;j<N_ASSETS;j++){
        if(d[i][j] < (fvar)INF) w += d[i][j];
      }
      if(w > (fvar)0) compactness[i] = (fvar)(1.0 / (1.0 + (double)w));
      else compactness[i] = (fvar)0;
      entropy[i] = computeEntropy(i);
    }
  }

  void computeScores() {
    for(int i=0;i<N_ASSETS;i++){
      fvar coupling = 0;
      int count = 0;

      for(int j=0;j<N_ASSETS;j++){
        if(i != j && distMatrix[i*N_ASSETS + j] < (fvar)INF) {
          coupling += compactness[j];
          count++;
        }
      }

      fvar pCouple = 0;
      if(count > 0) pCouple = coupling / (fvar)count;
      else pCouple = (fvar)0;

      fvar C_A = compactness[i];
      fvar Ent = entropy[i];

      fvar rawScore = (fvar)ALPHA * Ent + (fvar)GAMMA * C_A - (fvar)BETA * pCouple;

      if(rawScore > (fvar)30) rawScore = (fvar)30;
      if(rawScore < (fvar)-30) rawScore = (fvar)-30;

      scores[i] = (fvar)(1.0 / (1.0 + exp(-(double)rawScore)));
    }
  }

  void onBar() {
    barCount++;

    for(int i=0;i<N_ASSETS;i++) computeFeatures(i);

    if(barCount % UPDATE_EVERY == 0) {
      updateCount++;

      computeCorrelationMatrix();
      computeDistanceMatrix();
      floydWarshall();
      computeScores();
      printTopK();
    }
  }

  void printTopK() {
    int indices[N_ASSETS];
    for(int i=0;i<N_ASSETS;i++) indices[i] = i;

    for(int i=0;i<TOP_K;i++){
      for(int j=i+1;j<N_ASSETS;j++){
        if(scores[indices[j]] > scores[indices[i]]) {
          int tmp = indices[i];
          indices[i] = indices[j];
          indices[j] = tmp;
        }
      }
    }

    if(updateCount % 10 == 0) {
      printf("===CrowdAverse_v3 Top-K(update#%d,OpenCL=%d)===\n",
        updateCount, openCL.ready);

      for(int i=0;i<TOP_K;i++){
        int idx = indices[i];
        printf(" %d.%s: score=%.4f, C=%.4f, Ent=%.6f\n", i+1, ASSET_NAMES[idx], (double)scores[idx], (double)compactness[idx], (double)entropy[idx]);
      }
    }
  }
};

// ---------------------------- Zorro DLL entry ----------------------------

static CrowdAverseStrategy* S = NULL;

DLLFUNC void run()
{
  if(is(INITRUN)) {
    BarPeriod = 60;
    LookBack = max(LookBack, FEAT_WINDOW + 50);

    asset((char*)ASSET_NAMES[0]);

    if(!S) {
      S = new CrowdAverseStrategy();
      S->init();
    }
  }

  if(is(EXITRUN)) {
    if(S) {
      S->shutdown();
      delete S;
      S = NULL;
    }
    return;
  }

  if(!S || Bar < LookBack)
    return;

  S->onBar();
}

Last edited by TipmyPip; Yesterday at 23:13.
RegimeWeaver (OpenCL) [Re: TipmyPip] #489222
Yesterday at 21:43
Yesterday at 21:43
Joined: Sep 2017
Posts: 223
TipmyPip Online OP
Member
TipmyPip  Online OP
Member

Joined: Sep 2017
Posts: 223
MX06 RegimeWeaver is a multi-asset “market-state radar” that continuously ranks a basket of 28 FX pairs by how structurally attractive they look under the current volatility regime. It’s designed for Zorro64 as a Strategy DLL and emphasizes three things: regime awareness, cross-asset structure, and speed.

At every bar (here: 60-minute bars), the strategy cycles through all assets and builds a compact feature snapshot per asset. It tracks nine features over a rolling 200-bar window: short and medium log returns, volatility (via Volatility on the close series), a volatility-scaled deviation (z-like score), relative displacement vs. a mid lookback, a simple “flow” proxy (return × vol), a binary volatility regime flag, volatility-of-volatility, and short-term persistence (absolute return). These values are stored in a memory-efficient Structure-of-Arrays (SoA) ring buffer, so each feature can be accessed quickly across time without reallocations.

Every 4 bars (UPDATE_EVERY), the heavy lifting begins: it builds a pairwise correlation matrix across all assets, averaged across the nine feature channels. Correlations are converted to a distance measure using 1 ? |corr| (assets that behave similarly are “close”). That distance is then blended with a second, fundamental distance: an exposure-distance derived from base/quote currency overlaps (e.g., EURUSD vs. EURJPY share EUR exposure). The blend is controlled by LAMBDA_META (0.6): correlations drive most of the clustering, exposures stabilize it when correlations get noisy.

Next, the strategy runs a Floyd–Warshall pass over the distance graph to compute shortest paths and produces a “compactness” score per asset: assets that are centrally connected in the network (short average path distance) get higher compactness. In parallel, it detects a global regime by averaging volatility across all assets and mapping it into three states (calm ? medium ? high). Final scores combine regime + compactness ? neighbor-coupling pressure, then squash through a logistic function into 0..1, and the Top-K is printed periodically.

How OpenCL support is enhanced (real offload, with safe fallback)

OpenCL is used only to accelerate the most expensive step: computing the pairwise correlations. The strategy implements “enhanced” support in three practical ways:

Dynamic loading (optional dependency): It calls LoadLibraryA("OpenCL.dll") and resolves all OpenCL symbols via GetProcAddress. If the DLL is missing, a platform/device isn’t found, or any symbol is unavailable, it prints a message and continues on the CPU path.

Robust device selection + build diagnostics: It prefers a GPU (CL_DEVICE_TYPE_GPU) but falls back to a CPU OpenCL device if needed. Kernel compilation failures trigger a build-log dump, then the strategy cleanly reverts to CPU.

True compute offload with minimized transfers: The SoA feature buffer is linearized into a contiguous float array (featLinear) sized FEAT_N × N_ASSETS × FEAT_WINDOW. That single block is written once to the GPU buffer, a 2D NDRange kernel computes correlations for every (a,b) pair in parallel, and the resulting float correlation matrix is read back into corrLinear, then converted to the native fvar matrix. If runtime execution fails, OpenCL is disabled (ready=0) and the CPU implementation takes over immediately.

This design keeps performance portable: GPU acceleration when available, identical outputs (up to float precision) and guaranteed continuity when it’s not.

Code
// TGr06C_RegimeSwitcher_v3_OpenCL.cpp - Zorro64 Strategy DLL
// Strategy C v3: Regime-Switching with MX06 OOP + OpenCL (real offload) + Memory Optimization
// Notes:
// - Keeps full CPU fallback.
// - OpenCL is optional: if OpenCL.dll missing / no device / kernel build fails -> CPU path.
// - OpenCL accelerates the heavy correlation matrix step by offloading pairwise correlations.
// - Correlation is computed in float on GPU; results are stored back into fvar corrMatrix.

#define _CRT_SECURE_NO_WARNINGS
#include <zorro.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <stddef.h>

#define INF 1e30
#define EPS 1e-12
#define N_ASSETS 28
#define FEAT_N 9
#define FEAT_WINDOW 200
#define UPDATE_EVERY 4
#define TOP_K 5

#define ALPHA 0.5
#define BETA 0.2
#define GAMMA 2.0
#define LAMBDA_META 0.6

#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef double fvar;
#endif

static const char* ASSET_NAMES[] = {
  "EURUSD","GBPUSD","USDCHF","USDJPY","AUDUSD","AUDCAD","AUDCHF","AUDJPY","AUDNZD",
  "CADJPY","CADCHF","EURAUD","EURCAD","EURCHF","EURGBP","EURJPY","EURNZD","GBPAUD",
  "GBPCAD","GBPCHF","GBPJPY","GBPNZD","NZDCAD","NZDCHF","NZDJPY","NZDUSD","USDCAD"
};
static const char* CURRENCIES[] = {"EUR","GBP","USD","CHF","JPY","AUD","CAD","NZD"};
#define N_CURRENCIES 8

// ---------------------------- Exposure Table ----------------------------

struct ExposureTable {
  int exposure[N_ASSETS][N_CURRENCIES];
  double exposureDist[N_ASSETS][N_ASSETS];

  void init() {
    for(int i=0;i<N_ASSETS;i++){
      for(int c=0;c<N_CURRENCIES;c++){
        exposure[i][c] = 0;
      }
    }
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        exposureDist[i][j] = 0.0;
      }
    }
  }

  inline double getDist(int i,int j) const { return exposureDist[i][j]; }
};

// ---------------------------- Slab Allocator ----------------------------

template<typename T>
class SlabAllocator {
public:
  T* data;
  int capacity;

  SlabAllocator() : data(NULL), capacity(0) {}
  ~SlabAllocator() { shutdown(); }

  void init(int size) {
    shutdown();
    capacity = size;
    data = (T*)malloc((size_t)capacity * sizeof(T));
    if(data) memset(data, 0, (size_t)capacity * sizeof(T));
  }

  void shutdown() {
    if(data) free(data);
    data = NULL;
    capacity = 0;
  }

  T& operator[](int i) { return data[i]; }
  const T& operator[](int i) const { return data[i]; }
};

// ---------------------------- Feature Buffer (SoA ring) ----------------------------

struct FeatureBufferSoA {
  SlabAllocator<fvar> buffer;
  int windowSize;
  int currentIndex;

  void init(int assets, int window) {
    windowSize = window;
    currentIndex = 0;
    buffer.init(FEAT_N * assets * window);
  }

  void shutdown() { buffer.shutdown(); }

  inline int offset(int feat,int asset,int t) const {
    return (feat * N_ASSETS + asset) * windowSize + t;
  }

  void push(int feat,int asset,fvar value) {
    buffer[offset(feat, asset, currentIndex)] = value;
    currentIndex = (currentIndex + 1) % windowSize;
  }

  // t=0 => most recent
  fvar get(int feat,int asset,int t) const {
    int idx = (currentIndex - 1 - t + windowSize) % windowSize;
    return buffer[offset(feat, asset, idx)];
  }
};

// ---------------------------- Minimal OpenCL (dynamic) ----------------------------

typedef struct _cl_platform_id*   cl_platform_id;
typedef struct _cl_device_id*     cl_device_id;
typedef struct _cl_context*       cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program*       cl_program;
typedef struct _cl_kernel*        cl_kernel;
typedef struct _cl_mem*           cl_mem;
typedef unsigned int              cl_uint;
typedef int                       cl_int;
typedef unsigned long long        cl_ulong;
typedef size_t                    cl_bool;

#define CL_SUCCESS 0
#define CL_DEVICE_TYPE_CPU (1ULL << 1)
#define CL_DEVICE_TYPE_GPU (1ULL << 2)
#define CL_MEM_READ_ONLY   (1ULL << 2)
#define CL_MEM_WRITE_ONLY  (1ULL << 1)
#define CL_MEM_READ_WRITE  (1ULL << 0)
#define CL_TRUE  1
#define CL_FALSE 0
#define CL_PROGRAM_BUILD_LOG 0x1183

class OpenCLBackend {
public:
  HMODULE hOpenCL;
  int ready;

  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kCorr;

  cl_mem bufFeat;
  cl_mem bufCorr;

  int featBytes;
  int corrBytes;

  cl_int (*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
  cl_int (*clGetDeviceIDs)(cl_platform_id, cl_ulong, cl_uint, cl_device_id*, cl_uint*);
  cl_context (*clCreateContext)(void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
  cl_command_queue (*clCreateCommandQueue)(cl_context, cl_device_id, cl_ulong, cl_int*);
  cl_program (*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
  cl_int (*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
  cl_int (*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
  cl_kernel (*clCreateKernel)(cl_program, const char*, cl_int*);
  cl_int (*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
  cl_mem (*clCreateBuffer)(cl_context, cl_ulong, size_t, void*, cl_int*);
  cl_int (*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
  cl_int (*clFinish)(cl_command_queue);
  cl_int (*clReleaseMemObject)(cl_mem);
  cl_int (*clReleaseKernel)(cl_kernel);
  cl_int (*clReleaseProgram)(cl_program);
  cl_int (*clReleaseCommandQueue)(cl_command_queue);
  cl_int (*clReleaseContext)(cl_context);

  OpenCLBackend()
  : hOpenCL(NULL), ready(0),
    platform(NULL), device(NULL), context(NULL), queue(NULL), program(NULL), kCorr(NULL),
    bufFeat(NULL), bufCorr(NULL),
    featBytes(0), corrBytes(0),
    clGetPlatformIDs(NULL), clGetDeviceIDs(NULL), clCreateContext(NULL), clCreateCommandQueue(NULL),
    clCreateProgramWithSource(NULL), clBuildProgram(NULL), clGetProgramBuildInfo(NULL),
    clCreateKernel(NULL), clSetKernelArg(NULL),
    clCreateBuffer(NULL), clEnqueueWriteBuffer(NULL), clEnqueueReadBuffer(NULL),
    clEnqueueNDRangeKernel(NULL), clFinish(NULL),
    clReleaseMemObject(NULL), clReleaseKernel(NULL), clReleaseProgram(NULL),
    clReleaseCommandQueue(NULL), clReleaseContext(NULL)
  {}

  int loadSymbol(void** fp, const char* name) {
    *fp = (void*)GetProcAddress(hOpenCL, name);
    return (*fp != NULL);
  }

  const char* kernelSource() {
    return
      "__kernel void corr_pairwise(\n"
      "  __global const float* feat,\n"
      "  __global float* outCorr,\n"
      "  const int nAssets,\n"
      "  const int nFeat,\n"
      "  const int windowSize,\n"
      "  const float eps\n"
      "){\n"
      "  int a = (int)get_global_id(0);\n"
      "  int b = (int)get_global_id(1);\n"
      "  if(a >= nAssets || b >= nAssets) return;\n"
      "  if(a >= b) return;\n"
      "  float acc = 0.0f;\n"
      "  for(int f=0; f<nFeat; f++){\n"
      "    int baseA = (f*nAssets + a) * windowSize;\n"
      "    int baseB = (f*nAssets + b) * windowSize;\n"
      "    float mx = 0.0f;\n"
      "    float my = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      mx += feat[baseA + t];\n"
      "      my += feat[baseB + t];\n"
      "    }\n"
      "    mx /= (float)windowSize;\n"
      "    my /= (float)windowSize;\n"
      "    float sxx = 0.0f;\n"
      "    float syy = 0.0f;\n"
      "    float sxy = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      float dx = feat[baseA + t] - mx;\n"
      "      float dy = feat[baseB + t] - my;\n"
      "      sxx += dx*dx;\n"
      "      syy += dy*dy;\n"
      "      sxy += dx*dy;\n"
      "    }\n"
      "    float den = sqrt(sxx*syy + eps);\n"
      "    float corr = (den > eps) ? (sxy/den) : 0.0f;\n"
      "    acc += corr;\n"
      "  }\n"
      "  outCorr[a*nAssets + b] = acc / (float)nFeat;\n"
      "}\n";
  }

  void printBuildLog() {
    if(!clGetProgramBuildInfo || !program || !device) return;
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if(logSize == 0) return;
    char* log = (char*)malloc(logSize + 1);
    if(!log) return;
    memset(log, 0, logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    printf("OpenCL build log:\n%s\n", log);
    free(log);
  }

  void init() {
    ready = 0;

    hOpenCL = LoadLibraryA("OpenCL.dll");
    if(!hOpenCL) {
      printf("OpenCL: CPU (OpenCL.dll missing)\n");
      return;
    }

    if(!loadSymbol((void**)&clGetPlatformIDs,       "clGetPlatformIDs")) return;
    if(!loadSymbol((void**)&clGetDeviceIDs,         "clGetDeviceIDs")) return;
    if(!loadSymbol((void**)&clCreateContext,        "clCreateContext")) return;
    if(!loadSymbol((void**)&clCreateCommandQueue,   "clCreateCommandQueue")) return;
    if(!loadSymbol((void**)&clCreateProgramWithSource,"clCreateProgramWithSource")) return;
    if(!loadSymbol((void**)&clBuildProgram,         "clBuildProgram")) return;
    if(!loadSymbol((void**)&clGetProgramBuildInfo,  "clGetProgramBuildInfo")) return;
    if(!loadSymbol((void**)&clCreateKernel,         "clCreateKernel")) return;
    if(!loadSymbol((void**)&clSetKernelArg,         "clSetKernelArg")) return;

    if(!loadSymbol((void**)&clCreateBuffer,         "clCreateBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueWriteBuffer,   "clEnqueueWriteBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueReadBuffer,    "clEnqueueReadBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueNDRangeKernel, "clEnqueueNDRangeKernel")) return;
    if(!loadSymbol((void**)&clFinish,               "clFinish")) return;

    if(!loadSymbol((void**)&clReleaseMemObject,     "clReleaseMemObject")) return;
    if(!loadSymbol((void**)&clReleaseKernel,        "clReleaseKernel")) return;
    if(!loadSymbol((void**)&clReleaseProgram,       "clReleaseProgram")) return;
    if(!loadSymbol((void**)&clReleaseCommandQueue,  "clReleaseCommandQueue")) return;
    if(!loadSymbol((void**)&clReleaseContext,       "clReleaseContext")) return;

    cl_uint nPlat = 0;
    if(clGetPlatformIDs(0, NULL, &nPlat) != CL_SUCCESS || nPlat == 0) {
      printf("OpenCL: CPU (no platform)\n");
      return;
    }
    clGetPlatformIDs(1, &platform, NULL);

    cl_uint nDev = 0;
    cl_int ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &nDev);
    if(ok != CL_SUCCESS || nDev == 0) {
      ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &nDev);
      if(ok != CL_SUCCESS || nDev == 0) {
        printf("OpenCL: CPU (no device)\n");
        return;
      }
    }

    cl_int err = 0;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS || !context) {
      printf("OpenCL: CPU (context fail)\n");
      return;
    }

    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS || !queue) {
      printf("OpenCL: CPU (queue fail)\n");
      return;
    }

    const char* src = kernelSource();
    program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    if(err != CL_SUCCESS || !program) {
      printf("OpenCL: CPU (program fail)\n");
      return;
    }

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if(err != CL_SUCCESS) {
      printf("OpenCL: CPU (build fail)\n");
      printBuildLog();
      return;
    }

    kCorr = clCreateKernel(program, "corr_pairwise", &err);
    if(err != CL_SUCCESS || !kCorr) {
      printf("OpenCL: CPU (kernel fail)\n");
      printBuildLog();
      return;
    }

    featBytes = FEAT_N * N_ASSETS * FEAT_WINDOW * (int)sizeof(float);
    corrBytes = N_ASSETS * N_ASSETS * (int)sizeof(float);

    bufFeat = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)featBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufFeat) {
      printf("OpenCL: CPU (bufFeat fail)\n");
      return;
    }

    bufCorr = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (size_t)corrBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufCorr) {
      printf("OpenCL: CPU (bufCorr fail)\n");
      return;
    }

    ready = 1;
    printf("OpenCL: READY (kernel+buffers)\n");
  }

  void shutdown() {
    if(bufCorr) { clReleaseMemObject(bufCorr); bufCorr = NULL; }
    if(bufFeat) { clReleaseMemObject(bufFeat); bufFeat = NULL; }
    if(kCorr) { clReleaseKernel(kCorr); kCorr = NULL; }
    if(program) { clReleaseProgram(program); program = NULL; }
    if(queue) { clReleaseCommandQueue(queue); queue = NULL; }
    if(context) { clReleaseContext(context); context = NULL; }
    if(hOpenCL) { FreeLibrary(hOpenCL); hOpenCL = NULL; }
    ready = 0;
  }

  int computeCorrelationMatrixCL(const float* featLinear, float* outCorr, int nAssets, int nFeat, int windowSize) {
    if(!ready) return 0;
    if(!featLinear || !outCorr) return 0;

    cl_int err = clEnqueueWriteBuffer(queue, bufFeat, CL_TRUE, 0, (size_t)featBytes, featLinear, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    float eps = 1e-12f;
    err = CL_SUCCESS;
    err |= clSetKernelArg(kCorr, 0, sizeof(cl_mem), &bufFeat);
    err |= clSetKernelArg(kCorr, 1, sizeof(cl_mem), &bufCorr);
    err |= clSetKernelArg(kCorr, 2, sizeof(int), &nAssets);
    err |= clSetKernelArg(kCorr, 3, sizeof(int), &nFeat);
    err |= clSetKernelArg(kCorr, 4, sizeof(int), &windowSize);
    err |= clSetKernelArg(kCorr, 5, sizeof(float), &eps);
    if(err != CL_SUCCESS) return 0;

    size_t global[2];
    global[0] = (size_t)nAssets;
    global[1] = (size_t)nAssets;

    err = clEnqueueNDRangeKernel(queue, kCorr, 2, NULL, global, NULL, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    err = clFinish(queue);
    if(err != CL_SUCCESS) return 0;

    err = clEnqueueReadBuffer(queue, bufCorr, CL_TRUE, 0, (size_t)corrBytes, outCorr, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    return 1;
  }
};

// ---------------------------- Strategy ----------------------------

class RegimeSwitcherStrategy {
public:
  ExposureTable exposureTable;
  FeatureBufferSoA featSoA;
  OpenCLBackend openCL;

  SlabAllocator<fvar> corrMatrix;
  SlabAllocator<fvar> distMatrix;
  SlabAllocator<fvar> compactness;
  SlabAllocator<fvar> regime;
  SlabAllocator<fvar> scores;

  SlabAllocator<float> featLinear;
  SlabAllocator<float> corrLinear;

  int barCount;
  int updateCount;
  int currentRegime;

  RegimeSwitcherStrategy() : barCount(0), updateCount(0), currentRegime(0) {}

  void init() {
    printf("RegimeSwitcher_v3: Initializing...\n");

    exposureTable.init();
    featSoA.init(N_ASSETS, FEAT_WINDOW);

    corrMatrix.init(N_ASSETS * N_ASSETS);
    distMatrix.init(N_ASSETS * N_ASSETS);
    compactness.init(N_ASSETS);
    regime.init(N_ASSETS);
    scores.init(N_ASSETS);

    featLinear.init(FEAT_N * N_ASSETS * FEAT_WINDOW);
    corrLinear.init(N_ASSETS * N_ASSETS);

    openCL.init();
    printf("RegimeSwitcher_v3: Ready (OpenCL=%d)\n", openCL.ready);

    barCount = 0;
    updateCount = 0;
  }

  void shutdown() {
    printf("RegimeSwitcher_v3: Shutting down...\n");

    openCL.shutdown();

    featSoA.shutdown();
    corrMatrix.shutdown();
    distMatrix.shutdown();
    compactness.shutdown();
    regime.shutdown();
    scores.shutdown();

    featLinear.shutdown();
    corrLinear.shutdown();
  }

  void computeFeatures(int assetIdx) {
    asset((char*)ASSET_NAMES[assetIdx]);

    vars C = series(priceClose(0));
    vars V = series(Volatility(C, 20));

    if(Bar < 1) return;

    fvar r1 = (fvar)log(C[0] / C[1]);
    fvar rN = (fvar)log(C[0] / C[12]);
    fvar vol = (fvar)V[0];
    fvar zscore = (fvar)((C[0] - C[50]) / (V[0] * 20.0 + EPS));
    fvar rangeP = (fvar)((C[0] - C[50]) / (C[0] + EPS));
    fvar flow = (fvar)(r1 * vol);

    fvar reg = 0;
    if(vol > 0.001) reg = (fvar)1.0;
    else reg = (fvar)0.0;

    fvar volOfVol = (fvar)(vol * vol);
    fvar persistence = (fvar)fabs(r1);

    featSoA.push(0, assetIdx, r1);
    featSoA.push(1, assetIdx, rN);
    featSoA.push(2, assetIdx, vol);
    featSoA.push(3, assetIdx, zscore);
    featSoA.push(4, assetIdx, rangeP);
    featSoA.push(5, assetIdx, flow);
    featSoA.push(6, assetIdx, reg);
    featSoA.push(7, assetIdx, volOfVol);
    featSoA.push(8, assetIdx, persistence);
  }

  fvar detectRegime() {
    fvar v = 0;
    for(int i=0;i<N_ASSETS;i++) v += featSoA.get(2, i, 0);
    v /= (fvar)N_ASSETS;

    if(v > (fvar)0.0015) currentRegime = 2;
    else if(v > (fvar)0.0008) currentRegime = 1;
    else currentRegime = 0;

    return (fvar)currentRegime;
  }

  void computeCorrelationMatrixCPU() {
    for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = 0;

    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int b=a+1; b<N_ASSETS; b++){
          fvar mx = 0, my = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            mx += featSoA.get(f,a,t);
            my += featSoA.get(f,b,t);
          }
          mx /= (fvar)FEAT_WINDOW;
          my /= (fvar)FEAT_WINDOW;

          fvar sxx = 0, syy = 0, sxy = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            fvar dx = featSoA.get(f,a,t) - mx;
            fvar dy = featSoA.get(f,b,t) - my;
            sxx += dx*dx;
            syy += dy*dy;
            sxy += dx*dy;
          }

          fvar den = (fvar)sqrt((double)(sxx*syy + (fvar)EPS));
          fvar corr = 0;
          if(den > (fvar)EPS) corr = sxy / den;
          else corr = 0;

          int idx = a*N_ASSETS + b;
          corrMatrix[idx] += corr / (fvar)FEAT_N;
          corrMatrix[b*N_ASSETS + a] = corrMatrix[idx];
        }
      }
    }
  }

  void buildFeatLinear() {
    int idx = 0;
    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int t=0; t<FEAT_WINDOW; t++){
          featLinear[idx] = (float)featSoA.get(f, a, t);
          idx++;
        }
      }
    }
  }

  void computeCorrelationMatrix() {
    if(openCL.ready) {
      buildFeatLinear();

      for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrLinear[i] = 0.0f;

      int ok = openCL.computeCorrelationMatrixCL(
        featLinear.data,
        corrLinear.data,
        N_ASSETS,
        FEAT_N,
        FEAT_WINDOW
      );

      if(ok) {
        for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = (fvar)0;

        for(int a=0; a<N_ASSETS; a++){
          corrMatrix[a*N_ASSETS + a] = (fvar)1.0;
          for(int b=a+1; b<N_ASSETS; b++){
            float c = corrLinear[a*N_ASSETS + b];
            corrMatrix[a*N_ASSETS + b] = (fvar)c;
            corrMatrix[b*N_ASSETS + a] = (fvar)c;
          }
        }
        return;
      }

      printf("OpenCL: runtime fail -> CPU fallback\n");
      openCL.ready = 0;
    }

    computeCorrelationMatrixCPU();
  }

  void computeDistanceMatrix() {
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        if(i == j) {
          distMatrix[i*N_ASSETS + j] = (fvar)0;
        } else {
          fvar corrDist = (fvar)1.0 - (fvar)fabs((double)corrMatrix[i*N_ASSETS + j]);
          fvar expDist  = (fvar)exposureTable.getDist(i, j);

          fvar blended = (fvar)LAMBDA_META * corrDist + (fvar)(1.0 - (double)LAMBDA_META) * expDist;
          distMatrix[i*N_ASSETS + j] = blended;
        }
      }
    }
  }

  void floydWarshall() {
    fvar d[28][28];

    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        d[i][j] = distMatrix[i*N_ASSETS + j];
        if(i == j) d[i][j] = (fvar)0;
        if(d[i][j] < (fvar)0) d[i][j] = (fvar)INF;
      }
    }

    for(int k=0;k<N_ASSETS;k++){
      for(int i=0;i<N_ASSETS;i++){
        for(int j=0;j<N_ASSETS;j++){
          if(d[i][k] < (fvar)INF && d[k][j] < (fvar)INF) {
            fvar nk = d[i][k] + d[k][j];
            if(nk < d[i][j]) d[i][j] = nk;
          }
        }
      }
    }

    for(int i=0;i<N_ASSETS;i++){
      fvar w = 0;
      for(int j=i+1;j<N_ASSETS;j++){
        if(d[i][j] < (fvar)INF) w += d[i][j];
      }
      if(w > (fvar)0) compactness[i] = (fvar)(1.0 / (1.0 + (double)w));
      else compactness[i] = (fvar)0;

      regime[i] = detectRegime();
    }
  }

  void computeScores() {
    for(int i=0;i<N_ASSETS;i++){
      fvar coupling = 0;
      int count = 0;

      for(int j=0;j<N_ASSETS;j++){
        if(i != j && distMatrix[i*N_ASSETS + j] < (fvar)INF) {
          coupling += compactness[j];
          count++;
        }
      }

      fvar pCouple = 0;
      if(count > 0) pCouple = coupling / (fvar)count;
      else pCouple = (fvar)0;

      fvar rawScore = (fvar)ALPHA * regime[i] + (fvar)GAMMA * compactness[i] - (fvar)BETA * pCouple;

      if(rawScore > (fvar)30) rawScore = (fvar)30;
      if(rawScore < (fvar)-30) rawScore = (fvar)-30;

      scores[i] = (fvar)(1.0 / (1.0 + exp(-(double)rawScore)));
    }
  }

  void onBar() {
    barCount++;

    for(int i=0;i<N_ASSETS;i++) computeFeatures(i);

    if(barCount % UPDATE_EVERY == 0) {
      updateCount++;

      computeCorrelationMatrix();
      computeDistanceMatrix();
      floydWarshall();
      computeScores();
      printTopK();
    }
  }

  void printTopK() {
    int indices[N_ASSETS];
    for(int i=0;i<N_ASSETS;i++) indices[i] = i;

    for(int i=0;i<TOP_K;i++){
      for(int j=i+1;j<N_ASSETS;j++){
        if(scores[indices[j]] > scores[indices[i]]) {
          int tmp = indices[i];
          indices[i] = indices[j];
          indices[j] = tmp;
        }
      }
    }

    if(updateCount % 10 == 0) {
      printf("===RegimeSwitcher_v3 Top-K(update#%d,Reg=%d,OpenCL=%d)===\n",
        updateCount, currentRegime, openCL.ready);

      for(int i=0;i<TOP_K;i++){
        int idx = indices[i];
        printf(" %d.%s: score=%.4f\n", i+1, ASSET_NAMES[idx], (double)scores[idx]);
      }
    }
  }
};

// ---------------------------- Zorro DLL entry ----------------------------

static RegimeSwitcherStrategy* S = NULL;

DLLFUNC void run()
{
  if(is(INITRUN)) {
    BarPeriod = 60;
    LookBack = max(LookBack, FEAT_WINDOW + 50);

    asset((char*)ASSET_NAMES[0]);

    if(!S) {
      S = new RegimeSwitcherStrategy();
      S->init();
    }
  }

  if(is(EXITRUN)) {
    if(S) {
      S->shutdown();
      delete S;
      S = NULL;
    }
    return;
  }

  if(!S || Bar < LookBack)
    return;

  S->onBar();
}

Last edited by TipmyPip; Yesterday at 23:10.
AetherWeave-28 (OpenCL) [Re: TipmyPip] #489223
Yesterday at 21:49
Yesterday at 21:49
Joined: Sep 2017
Posts: 223
TipmyPip Online OP
Member
TipmyPip  Online OP
Member

Joined: Sep 2017
Posts: 223
AetherWeave-28 is a multi-asset ranking engine designed to continuously score a fixed universe of 28 FX pairs by blending short-term volatility, cross-asset similarity, and currency exposure structure into a single probability-like score. Rather than predicting direction, it builds a relative attractiveness map of the basket and prints the Top-K candidates on a schedule, making it suitable as a portfolio selector or as an upstream signal filter.

Every bar (BarPeriod = 60 minutes), the strategy computes a compact set of 9 features per asset and stores them in a structure-of-arrays ring buffer over a rolling 200-bar window. The features include 1-bar and multi-bar log returns, realized volatility, a volatility-normalized deviation (z-style), normalized range, a simple flow proxy (return × volatility), a regime flag, volatility-of-volatility, and return persistence. This rolling feature cube is the “state” used for cross-sectional comparisons and is updated for all assets on every bar.

Every 5 bars (UPDATE_EVERY), AetherWeave-28 performs its heavy analytics step: it constructs a pairwise correlation matrix between all assets by averaging feature-wise correlations across the 9 features. This correlation structure is then converted into a correlation distance (1 ? |corr|), which is blended with a currency exposure distance derived from base/quote currency signs. The blend weight LAMBDA_META controls whether the engine trusts statistical similarity (correlation) or structural similarity (shared currency exposure).

With the final distance matrix, the strategy runs Floyd–Warshall to compute all-pairs shortest path distances, treating the basket as a network. From these paths it derives each asset’s compactness—assets that are “closer” to the rest of the network get higher compactness. Finally, it produces a score per asset using a weighted mix:

+ ALPHA × volatility (risk/attention component)

+ GAMMA × compactness (network centrality component)

? BETA × average neighbor compactness (anti-crowding / diversification pressure)

The raw score is clipped and passed through a logistic transform, yielding a smooth 0..1 ranking value. Every 10 updates, the Top-K assets and diagnostics (score, compactness, volatility) are printed.

How the enhanced OpenCL support works (logic details)

OpenCL is used only to accelerate the most expensive part: the pairwise correlation computation (O(N² × FEAT_N × FEAT_WINDOW)). The enhancement is “real offload” because it moves that nested loop to a GPU/CPU OpenCL device:

Optional, safe initialization: the code dynamically loads OpenCL.dll at runtime and resolves function pointers. If the DLL is missing, no platform/device exists, context/queue creation fails, or kernel build fails, it prints a reason and keeps a full CPU fallback.

Kernel specialization: the corr_pairwise kernel runs on a 2D NDRange (nAssets, nAssets) where each work item computes one pair (a,b) with a<b. It computes mean and covariance over the 200-bar window for each feature, averages correlations across features, and writes a single float result to outCorr[a*nAssets + b].

Memory layout for throughput: the ring buffer (SoA) is flattened into a contiguous float array (featLinear) matching kernel expectations: [feature][asset][time]. This improves coalesced reads on the device.

Single upload / single download per update: on each UPDATE_EVERY event, the host writes the full feature cube to bufFeat, launches the kernel, finishes the queue, then reads bufCorr back into corrLinear.

Robust runtime fallback: if any enqueue, kernel launch, or readback fails, it prints “runtime fail -> CPU fallback” and disables OpenCL (ready = 0) so subsequent updates stay on CPU.

This design maximizes speed where it matters, while keeping deterministic behavior and reliability when OpenCL isn’t available.

Code
// TGr06D_VolAdjuster_v3.cpp - Zorro64 Strategy DLL
// Strategy D v3: Volatility-Adjusted with MX06 OOP + OpenCL (real offload) + Memory Optimization
// Notes:
// - Keeps full CPU fallback.
// - OpenCL is optional: if OpenCL.dll missing / no device / kernel build fails -> CPU path.
// - OpenCL accelerates the heavy correlation matrix step by offloading pairwise correlations.
// - Correlation is computed in float on GPU; results are stored back into fvar corrMatrix.

#define _CRT_SECURE_NO_WARNINGS
#include <zorro.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <stddef.h>

#define INF 1e30
#define EPS 1e-12
#define N_ASSETS 28
#define FEAT_N 9
#define FEAT_WINDOW 200
#define UPDATE_EVERY 5
#define TOP_K 5

#define ALPHA 0.1
#define BETA 0.2
#define GAMMA 4.0
#define LAMBDA_META 0.6

#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef double fvar;
#endif

static const char* ASSET_NAMES[] = {
  "EURUSD","GBPUSD","USDCHF","USDJPY","AUDUSD","AUDCAD","AUDCHF","AUDJPY","AUDNZD",
  "CADJPY","CADCHF","EURAUD","EURCAD","EURCHF","EURGBP","EURJPY","EURNZD","GBPAUD",
  "GBPCAD","GBPCHF","GBPJPY","GBPNZD","NZDCAD","NZDCHF","NZDJPY","NZDUSD","USDCAD"
};
static const char* CURRENCIES[] = {"EUR","GBP","USD","CHF","JPY","AUD","CAD","NZD"};
#define N_CURRENCIES 8

// ---------------------------- Exposure Table ----------------------------

struct ExposureTable {
  int exposure[N_ASSETS][N_CURRENCIES];
  double exposureDist[N_ASSETS][N_ASSETS];

  void init() {
    for(int i=0;i<N_ASSETS;i++){
      for(int c=0;c<N_CURRENCIES;c++){
        exposure[i][c] = 0;
      }
    }
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        exposureDist[i][j] = 0.0;
      }
    }
  }

  inline double getDist(int i,int j) const { return exposureDist[i][j]; }
};

// ---------------------------- Slab Allocator ----------------------------

template<typename T>
class SlabAllocator {
public:
  T* data;
  int capacity;

  SlabAllocator() : data(NULL), capacity(0) {}
  ~SlabAllocator() { shutdown(); }

  void init(int size) {
    shutdown();
    capacity = size;
    data = (T*)malloc((size_t)capacity * sizeof(T));
    if(data) memset(data, 0, (size_t)capacity * sizeof(T));
  }

  void shutdown() {
    if(data) free(data);
    data = NULL;
    capacity = 0;
  }

  T& operator[](int i) { return data[i]; }
  const T& operator[](int i) const { return data[i]; }
};

// ---------------------------- Feature Buffer (SoA ring) ----------------------------

struct FeatureBufferSoA {
  SlabAllocator<fvar> buffer;
  int windowSize;
  int currentIndex;

  void init(int assets, int window) {
    windowSize = window;
    currentIndex = 0;
    buffer.init(FEAT_N * assets * window);
  }

  void shutdown() { buffer.shutdown(); }

  inline int offset(int feat,int asset,int t) const {
    return (feat * N_ASSETS + asset) * windowSize + t;
  }

  void push(int feat,int asset,fvar value) {
    buffer[offset(feat, asset, currentIndex)] = value;
    currentIndex = (currentIndex + 1) % windowSize;
  }

  // t=0 => most recent
  fvar get(int feat,int asset,int t) const {
    int idx = (currentIndex - 1 - t + windowSize) % windowSize;
    return buffer[offset(feat, asset, idx)];
  }
};

// ---------------------------- Minimal OpenCL (dynamic) ----------------------------

typedef struct _cl_platform_id*   cl_platform_id;
typedef struct _cl_device_id*     cl_device_id;
typedef struct _cl_context*       cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program*       cl_program;
typedef struct _cl_kernel*        cl_kernel;
typedef struct _cl_mem*           cl_mem;
typedef unsigned int              cl_uint;
typedef int                       cl_int;
typedef unsigned long long        cl_ulong;
typedef size_t                    cl_bool;

#define CL_SUCCESS 0
#define CL_DEVICE_TYPE_CPU (1ULL << 1)
#define CL_DEVICE_TYPE_GPU (1ULL << 2)
#define CL_MEM_READ_ONLY   (1ULL << 2)
#define CL_MEM_WRITE_ONLY  (1ULL << 1)
#define CL_MEM_READ_WRITE  (1ULL << 0)
#define CL_TRUE  1
#define CL_FALSE 0
#define CL_PROGRAM_BUILD_LOG 0x1183

class OpenCLBackend {
public:
  HMODULE hOpenCL;
  int ready;

  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kCorr;

  cl_mem bufFeat;
  cl_mem bufCorr;

  int featBytes;
  int corrBytes;

  cl_int (*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
  cl_int (*clGetDeviceIDs)(cl_platform_id, cl_ulong, cl_uint, cl_device_id*, cl_uint*);
  cl_context (*clCreateContext)(void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
  cl_command_queue (*clCreateCommandQueue)(cl_context, cl_device_id, cl_ulong, cl_int*);
  cl_program (*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
  cl_int (*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
  cl_int (*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
  cl_kernel (*clCreateKernel)(cl_program, const char*, cl_int*);
  cl_int (*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
  cl_mem (*clCreateBuffer)(cl_context, cl_ulong, size_t, void*, cl_int*);
  cl_int (*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
  cl_int (*clFinish)(cl_command_queue);
  cl_int (*clReleaseMemObject)(cl_mem);
  cl_int (*clReleaseKernel)(cl_kernel);
  cl_int (*clReleaseProgram)(cl_program);
  cl_int (*clReleaseCommandQueue)(cl_command_queue);
  cl_int (*clReleaseContext)(cl_context);

  OpenCLBackend()
  : hOpenCL(NULL), ready(0),
    platform(NULL), device(NULL), context(NULL), queue(NULL), program(NULL), kCorr(NULL),
    bufFeat(NULL), bufCorr(NULL),
    featBytes(0), corrBytes(0),
    clGetPlatformIDs(NULL), clGetDeviceIDs(NULL), clCreateContext(NULL), clCreateCommandQueue(NULL),
    clCreateProgramWithSource(NULL), clBuildProgram(NULL), clGetProgramBuildInfo(NULL),
    clCreateKernel(NULL), clSetKernelArg(NULL),
    clCreateBuffer(NULL), clEnqueueWriteBuffer(NULL), clEnqueueReadBuffer(NULL),
    clEnqueueNDRangeKernel(NULL), clFinish(NULL),
    clReleaseMemObject(NULL), clReleaseKernel(NULL), clReleaseProgram(NULL),
    clReleaseCommandQueue(NULL), clReleaseContext(NULL)
  {}

  int loadSymbol(void** fp, const char* name) {
    *fp = (void*)GetProcAddress(hOpenCL, name);
    return (*fp != NULL);
  }

  const char* kernelSource() {
    return
      "__kernel void corr_pairwise(\n"
      "  __global const float* feat,\n"
      "  __global float* outCorr,\n"
      "  const int nAssets,\n"
      "  const int nFeat,\n"
      "  const int windowSize,\n"
      "  const float eps\n"
      "){\n"
      "  int a = (int)get_global_id(0);\n"
      "  int b = (int)get_global_id(1);\n"
      "  if(a >= nAssets || b >= nAssets) return;\n"
      "  if(a >= b) return;\n"
      "  float acc = 0.0f;\n"
      "  for(int f=0; f<nFeat; f++){\n"
      "    int baseA = (f*nAssets + a) * windowSize;\n"
      "    int baseB = (f*nAssets + b) * windowSize;\n"
      "    float mx = 0.0f;\n"
      "    float my = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      mx += feat[baseA + t];\n"
      "      my += feat[baseB + t];\n"
      "    }\n"
      "    mx /= (float)windowSize;\n"
      "    my /= (float)windowSize;\n"
      "    float sxx = 0.0f;\n"
      "    float syy = 0.0f;\n"
      "    float sxy = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      float dx = feat[baseA + t] - mx;\n"
      "      float dy = feat[baseB + t] - my;\n"
      "      sxx += dx*dx;\n"
      "      syy += dy*dy;\n"
      "      sxy += dx*dy;\n"
      "    }\n"
      "    float den = sqrt(sxx*syy + eps);\n"
      "    float corr = (den > eps) ? (sxy/den) : 0.0f;\n"
      "    acc += corr;\n"
      "  }\n"
      "  outCorr[a*nAssets + b] = acc / (float)nFeat;\n"
      "}\n";
  }

  void printBuildLog() {
    if(!clGetProgramBuildInfo || !program || !device) return;
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if(logSize == 0) return;
    char* log = (char*)malloc(logSize + 1);
    if(!log) return;
    memset(log, 0, logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    printf("OpenCL build log:\n%s\n", log);
    free(log);
  }

  void init() {
    ready = 0;

    hOpenCL = LoadLibraryA("OpenCL.dll");
    if(!hOpenCL) {
      printf("OpenCL: CPU (OpenCL.dll missing)\n");
      return;
    }

    if(!loadSymbol((void**)&clGetPlatformIDs,       "clGetPlatformIDs")) return;
    if(!loadSymbol((void**)&clGetDeviceIDs,         "clGetDeviceIDs")) return;
    if(!loadSymbol((void**)&clCreateContext,        "clCreateContext")) return;
    if(!loadSymbol((void**)&clCreateCommandQueue,   "clCreateCommandQueue")) return;
    if(!loadSymbol((void**)&clCreateProgramWithSource,"clCreateProgramWithSource")) return;
    if(!loadSymbol((void**)&clBuildProgram,         "clBuildProgram")) return;
    if(!loadSymbol((void**)&clGetProgramBuildInfo,  "clGetProgramBuildInfo")) return;
    if(!loadSymbol((void**)&clCreateKernel,         "clCreateKernel")) return;
    if(!loadSymbol((void**)&clSetKernelArg,         "clSetKernelArg")) return;
    if(!loadSymbol((void**)&clCreateBuffer,         "clCreateBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueWriteBuffer,   "clEnqueueWriteBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueReadBuffer,    "clEnqueueReadBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueNDRangeKernel, "clEnqueueNDRangeKernel")) return;
    if(!loadSymbol((void**)&clFinish,               "clFinish")) return;
    if(!loadSymbol((void**)&clReleaseMemObject,     "clReleaseMemObject")) return;
    if(!loadSymbol((void**)&clReleaseKernel,        "clReleaseKernel")) return;
    if(!loadSymbol((void**)&clReleaseProgram,       "clReleaseProgram")) return;
    if(!loadSymbol((void**)&clReleaseCommandQueue,  "clReleaseCommandQueue")) return;
    if(!loadSymbol((void**)&clReleaseContext,       "clReleaseContext")) return;

    cl_uint nPlat = 0;
    if(clGetPlatformIDs(0, NULL, &nPlat) != CL_SUCCESS || nPlat == 0) {
      printf("OpenCL: CPU (no platform)\n");
      return;
    }
    clGetPlatformIDs(1, &platform, NULL);

    cl_uint nDev = 0;
    cl_int ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &nDev);
    if(ok != CL_SUCCESS || nDev == 0) {
      ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &nDev);
      if(ok != CL_SUCCESS || nDev == 0) {
        printf("OpenCL: CPU (no device)\n");
        return;
      }
    }

    cl_int err = 0;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS || !context) {
      printf("OpenCL: CPU (context fail)\n");
      return;
    }

    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS || !queue) {
      printf("OpenCL: CPU (queue fail)\n");
      return;
    }

    const char* src = kernelSource();
    program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    if(err != CL_SUCCESS || !program) {
      printf("OpenCL: CPU (program fail)\n");
      return;
    }

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if(err != CL_SUCCESS) {
      printf("OpenCL: CPU (build fail)\n");
      printBuildLog();
      return;
    }

    kCorr = clCreateKernel(program, "corr_pairwise", &err);
    if(err != CL_SUCCESS || !kCorr) {
      printf("OpenCL: CPU (kernel fail)\n");
      printBuildLog();
      return;
    }

    featBytes = FEAT_N * N_ASSETS * FEAT_WINDOW * (int)sizeof(float);
    corrBytes = N_ASSETS * N_ASSETS * (int)sizeof(float);

    bufFeat = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)featBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufFeat) {
      printf("OpenCL: CPU (bufFeat fail)\n");
      return;
    }

    bufCorr = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (size_t)corrBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufCorr) {
      printf("OpenCL: CPU (bufCorr fail)\n");
      return;
    }

    ready = 1;
    printf("OpenCL: READY (kernel+buffers)\n");
  }

  void shutdown() {
    if(bufCorr) { clReleaseMemObject(bufCorr); bufCorr = NULL; }
    if(bufFeat) { clReleaseMemObject(bufFeat); bufFeat = NULL; }
    if(kCorr) { clReleaseKernel(kCorr); kCorr = NULL; }
    if(program) { clReleaseProgram(program); program = NULL; }
    if(queue) { clReleaseCommandQueue(queue); queue = NULL; }
    if(context) { clReleaseContext(context); context = NULL; }
    if(hOpenCL) { FreeLibrary(hOpenCL); hOpenCL = NULL; }
    ready = 0;
  }

  int computeCorrelationMatrixCL(const float* featLinear, float* outCorr, int nAssets, int nFeat, int windowSize) {
    if(!ready) return 0;
    if(!featLinear || !outCorr) return 0;

    cl_int err = clEnqueueWriteBuffer(queue, bufFeat, CL_TRUE, 0, (size_t)featBytes, featLinear, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    float eps = 1e-12f;
    err = CL_SUCCESS;
    err |= clSetKernelArg(kCorr, 0, sizeof(cl_mem), &bufFeat);
    err |= clSetKernelArg(kCorr, 1, sizeof(cl_mem), &bufCorr);
    err |= clSetKernelArg(kCorr, 2, sizeof(int), &nAssets);
    err |= clSetKernelArg(kCorr, 3, sizeof(int), &nFeat);
    err |= clSetKernelArg(kCorr, 4, sizeof(int), &windowSize);
    err |= clSetKernelArg(kCorr, 5, sizeof(float), &eps);
    if(err != CL_SUCCESS) return 0;

    size_t global[2];
    global[0] = (size_t)nAssets;
    global[1] = (size_t)nAssets;

    err = clEnqueueNDRangeKernel(queue, kCorr, 2, NULL, global, NULL, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    err = clFinish(queue);
    if(err != CL_SUCCESS) return 0;

    err = clEnqueueReadBuffer(queue, bufCorr, CL_TRUE, 0, (size_t)corrBytes, outCorr, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    return 1;
  }
};

// ---------------------------- Strategy ----------------------------

class VolAdjusterStrategy {
public:
  ExposureTable exposureTable;
  FeatureBufferSoA featSoA;
  OpenCLBackend openCL;

  SlabAllocator<fvar> corrMatrix;
  SlabAllocator<fvar> distMatrix;
  SlabAllocator<fvar> compactness;
  SlabAllocator<fvar> volatility;
  SlabAllocator<fvar> scores;

  SlabAllocator<float> featLinear;
  SlabAllocator<float> corrLinear;

  int barCount;
  int updateCount;

  VolAdjusterStrategy() : barCount(0), updateCount(0) {}

  void init() {
    printf("VolAdjuster_v3: Initializing...\n");

    exposureTable.init();
    featSoA.init(N_ASSETS, FEAT_WINDOW);

    corrMatrix.init(N_ASSETS * N_ASSETS);
    distMatrix.init(N_ASSETS * N_ASSETS);
    compactness.init(N_ASSETS);
    volatility.init(N_ASSETS);
    scores.init(N_ASSETS);

    featLinear.init(FEAT_N * N_ASSETS * FEAT_WINDOW);
    corrLinear.init(N_ASSETS * N_ASSETS);

    openCL.init();
    printf("VolAdjuster_v3: Ready (OpenCL=%d)\n", openCL.ready);

    barCount = 0;
    updateCount = 0;
  }

  void shutdown() {
    printf("VolAdjuster_v3: Shutting down...\n");

    openCL.shutdown();

    featSoA.shutdown();
    corrMatrix.shutdown();
    distMatrix.shutdown();
    compactness.shutdown();
    volatility.shutdown();
    scores.shutdown();

    featLinear.shutdown();
    corrLinear.shutdown();
  }

  void computeFeatures(int assetIdx) {
    asset((char*)ASSET_NAMES[assetIdx]);

    vars C = series(priceClose(0));
    vars V = series(Volatility(C, 20));

    if(Bar < 1) return;

    fvar r1 = (fvar)log(C[0] / C[1]);
    fvar rN = (fvar)log(C[0] / C[12]);
    fvar vol = (fvar)V[0];
    fvar zscore = (fvar)((C[0] - C[50]) / (V[0] * 20.0 + EPS));
    fvar rangeP = (fvar)((C[0] - C[50]) / (C[0] + EPS));
    fvar flow = (fvar)(r1 * vol);
    fvar regime = (fvar)((vol > 0.001) ? 1.0 : 0.0);
    fvar volOfVol = (fvar)(vol * vol);
    fvar persistence = (fvar)fabs(r1);

    featSoA.push(0, assetIdx, r1);
    featSoA.push(1, assetIdx, rN);
    featSoA.push(2, assetIdx, vol);
    featSoA.push(3, assetIdx, zscore);
    featSoA.push(4, assetIdx, rangeP);
    featSoA.push(5, assetIdx, flow);
    featSoA.push(6, assetIdx, regime);
    featSoA.push(7, assetIdx, volOfVol);
    featSoA.push(8, assetIdx, persistence);
  }

  void computeCorrelationMatrixCPU() {
    for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = 0;

    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int b=a+1; b<N_ASSETS; b++){
          fvar mx = 0, my = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            mx += featSoA.get(f,a,t);
            my += featSoA.get(f,b,t);
          }
          mx /= (fvar)FEAT_WINDOW;
          my /= (fvar)FEAT_WINDOW;

          fvar sxx = 0, syy = 0, sxy = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            fvar dx = featSoA.get(f,a,t) - mx;
            fvar dy = featSoA.get(f,b,t) - my;
            sxx += dx*dx;
            syy += dy*dy;
            sxy += dx*dy;
          }

          fvar den = (fvar)sqrt((double)(sxx*syy + (fvar)EPS));
          fvar corr = 0;
          if(den > (fvar)EPS) corr = sxy / den;
          else corr = 0;

          int idx = a*N_ASSETS + b;
          corrMatrix[idx] += corr / (fvar)FEAT_N;
          corrMatrix[b*N_ASSETS + a] = corrMatrix[idx];
        }
      }
    }
  }

  void buildFeatLinear() {
    int idx = 0;
    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int t=0; t<FEAT_WINDOW; t++){
          featLinear[idx] = (float)featSoA.get(f, a, t);
          idx++;
        }
      }
    }
  }

  void computeCorrelationMatrix() {
    if(openCL.ready) {
      buildFeatLinear();

      for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrLinear[i] = 0.0f;

      int ok = openCL.computeCorrelationMatrixCL(
        featLinear.data,
        corrLinear.data,
        N_ASSETS,
        FEAT_N,
        FEAT_WINDOW
      );

      if(ok) {
        for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = (fvar)0;

        for(int a=0; a<N_ASSETS; a++){
          corrMatrix[a*N_ASSETS + a] = (fvar)1.0;
          for(int b=a+1; b<N_ASSETS; b++){
            float c = corrLinear[a*N_ASSETS + b];
            corrMatrix[a*N_ASSETS + b] = (fvar)c;
            corrMatrix[b*N_ASSETS + a] = (fvar)c;
          }
        }
        return;
      }

      printf("OpenCL: runtime fail -> CPU fallback\n");
      openCL.ready = 0;
    }

    computeCorrelationMatrixCPU();
  }

  void computeDistanceMatrix() {
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        if(i == j) {
          distMatrix[i*N_ASSETS + j] = (fvar)0;
        } else {
          fvar corrDist = (fvar)1.0 - (fvar)fabs((double)corrMatrix[i*N_ASSETS + j]);
          fvar expDist  = (fvar)exposureTable.getDist(i, j);
          fvar blended = (fvar)LAMBDA_META * corrDist + (fvar)(1.0 - (double)LAMBDA_META) * expDist;
          distMatrix[i*N_ASSETS + j] = blended;
        }
      }
    }
  }

  void floydWarshall() {
    fvar d[28][28];

    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        d[i][j] = distMatrix[i*N_ASSETS + j];
        if(i == j) d[i][j] = (fvar)0;
        if(d[i][j] < (fvar)0) d[i][j] = (fvar)INF;
      }
    }

    for(int k=0;k<N_ASSETS;k++){
      for(int i=0;i<N_ASSETS;i++){
        for(int j=0;j<N_ASSETS;j++){
          if(d[i][k] < (fvar)INF && d[k][j] < (fvar)INF) {
            fvar nk = d[i][k] + d[k][j];
            if(nk < d[i][j]) d[i][j] = nk;
          }
        }
      }
    }

    for(int i=0;i<N_ASSETS;i++){
      fvar w = 0;
      for(int j=i+1;j<N_ASSETS;j++){
        if(d[i][j] < (fvar)INF) w += d[i][j];
      }
      if(w > (fvar)0) compactness[i] = (fvar)(1.0 / (1.0 + (double)w));
      else compactness[i] = (fvar)0;
      volatility[i] = featSoA.get(2, i, 0);
    }
  }

  void computeScores() {
    for(int i=0;i<N_ASSETS;i++){
      fvar coupling = 0;
      int count = 0;

      for(int j=0;j<N_ASSETS;j++){
        if(i != j && distMatrix[i*N_ASSETS + j] < (fvar)INF) {
          coupling += compactness[j];
          count++;
        }
      }

      fvar pCouple = 0;
      if(count > 0) pCouple = coupling / (fvar)count;
      else pCouple = (fvar)0;

      fvar rawScore = (fvar)ALPHA * volatility[i] + (fvar)GAMMA * compactness[i] - (fvar)BETA * pCouple;

      if(rawScore > (fvar)30) rawScore = (fvar)30;
      if(rawScore < (fvar)-30) rawScore = (fvar)-30;

      scores[i] = (fvar)(1.0 / (1.0 + exp(-(double)rawScore)));
    }
  }

  void onBar() {
    barCount++;

    for(int i=0;i<N_ASSETS;i++) computeFeatures(i);

    if(barCount % UPDATE_EVERY == 0) {
      updateCount++;

      computeCorrelationMatrix();
      computeDistanceMatrix();
      floydWarshall();
      computeScores();
      printTopK();
    }
  }

  void printTopK() {
    int indices[N_ASSETS];
    for(int i=0;i<N_ASSETS;i++) indices[i] = i;

    for(int i=0;i<TOP_K;i++){
      for(int j=i+1;j<N_ASSETS;j++){
        if(scores[indices[j]] > scores[indices[i]]) {
          int tmp = indices[i];
          indices[i] = indices[j];
          indices[j] = tmp;
        }
      }
    }

    if(updateCount % 10 == 0) {
      printf("===VolAdjuster_v3 Top-K(update#%d,OpenCL=%d)===\n",
        updateCount, openCL.ready);

      for(int i=0;i<TOP_K;i++){
        int idx = indices[i];
        printf(" %d.%s: score=%.4f, C=%.4f, V=%.6f\n", i+1, ASSET_NAMES[idx], (double)scores[idx], (double)compactness[idx], (double)volatility[idx]);
      }
    }
  }
};

// ---------------------------- Zorro DLL entry ----------------------------

static VolAdjusterStrategy* S = NULL;

DLLFUNC void run()
{
  if(is(INITRUN)) {
    BarPeriod = 60;
    LookBack = max(LookBack, FEAT_WINDOW + 50);

    asset((char*)ASSET_NAMES[0]);

    if(!S) {
      S = new VolAdjusterStrategy();
      S->init();
    }
  }

  if(is(EXITRUN)) {
    if(S) {
      S->shutdown();
      delete S;
      S = NULL;
    }
    return;
  }

  if(!S || Bar < LookBack)
    return;

  S->onBar();
}

Last edited by TipmyPip; Yesterday at 23:16.
“Atlas Momentum Mesh” (OpenCL) [Re: TipmyPip] #489224
Yesterday at 21:52
Yesterday at 21:52
Joined: Sep 2017
Posts: 223
TipmyPip Online OP
Member
TipmyPip  Online OP
Member

Joined: Sep 2017
Posts: 223
MX06 “Atlas Momentum Mesh” is a multi-asset, momentum-biased ranking engine built as a Zorro64 Strategy DLL, designed to process a fixed FX universe (28 major/minor pairs) on hourly bars and periodically surface a “best set” of candidates (Top-K). Its core idea is that raw momentum is more useful when interpreted inside a cross-asset structure: rather than ranking pairs by returns alone, the strategy blends (1) recent momentum, (2) how “central/compact” an asset is within a network of relationships, and (3) a penalty for being surrounded by overly similar neighbors.

On every bar the strategy computes a compact feature vector per asset and stores it in a memory-efficient structure-of-arrays (SoA) ring buffer. The feature set (9 signals) includes short and medium log returns, volatility, a z-style deviation vs. a longer lookback, simple range/flow proxies, a regime flag, vol-of-vol, and persistence. By maintaining these features in a fixed window (200 bars) and updating them every bar, the strategy can periodically rebuild a global view of market structure without repeated expensive historical scans.

Every UPDATE_EVERY bars (default 5), it performs its heavy step: building an asset-to-asset correlation matrix over the feature histories. This correlation view is then merged with a currency exposure distance table (base/quote currency mismatches) to create a blended distance matrix. A shortest-path pass (Floyd–Warshall) converts that distance matrix into a notion of “compactness” for each asset: assets that sit in tightly connected regions (low summed distances) receive higher compactness scores. Finally, a probability-like score is produced per asset via a logistic transform of a rawScore = GAMMA·momentum + ALPHA·compactness ? BETA·neighborCompactnessMean, capped to avoid extreme saturation. The Top-K assets by score are printed periodically, acting as a selection/ranking layer that can be plugged into downstream entry/exit logic.

Enhancing OpenCL support (the offload logic)
OpenCL support is implemented as a true optional accelerator focused specifically on the correlation matrix, the most compute-intensive O(N²·F·W) loop. The DLL dynamically loads OpenCL.dll at runtime, resolves required function pointers, selects a device (GPU first, CPU as fallback), compiles an embedded kernel (corr_pairwise), and allocates two device buffers: one for the linearized feature tensor and one for the output correlations. If any stage fails (missing DLL, no platform/device, context/queue errors, build failure, runtime enqueue failure), it cleanly falls back to the CPU implementation—so strategy behavior remains deterministic and deployable on machines without OpenCL.

The enhancement is the data-layout and execution pathway: the SoA ring buffer is flattened into a contiguous float array (featLinear) to match GPU memory access patterns, the kernel computes pairwise correlations for (a,b) with a<b in parallel, and the results are read back into corrLinear and copied into the strategy’s corrMatrix (fvar). This keeps the CPU path intact while letting supported systems offload the heaviest step, reducing wall-time without changing the scoring logic.

To all programmers, These strategies have been developed with compiling automation process, It is advised that if you use any LLM, proper Zorro documentation should be available to process the errors you come across in order to maintain compiler and broker data synchronization.

Code
// TGr06E_MomentumBias_v3.cpp - Zorro64 Strategy DLL
// Strategy E v3: Momentum-Biased with MX06 OOP + OpenCL (real offload) + Memory Optimization
// Notes:
// - Keeps full CPU fallback.
// - OpenCL is optional: if OpenCL.dll missing / no device / kernel build fails -> CPU path.
// - OpenCL accelerates the heavy correlation matrix step by offloading pairwise correlations.
// - Correlation is computed in float on GPU; results are stored back into fvar corrMatrix.

#define _CRT_SECURE_NO_WARNINGS
#include <zorro.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <stddef.h>

#define INF 1e30
#define EPS 1e-12
#define N_ASSETS 28
#define FEAT_N 9
#define FEAT_WINDOW 200
#define UPDATE_EVERY 5
#define TOP_K 5

#define ALPHA 0.1
#define BETA 0.2
#define GAMMA 3.5
#define LAMBDA_META 0.7

#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef double fvar;
#endif

static const char* ASSET_NAMES[] = {
  "EURUSD","GBPUSD","USDCHF","USDJPY","AUDUSD","AUDCAD","AUDCHF","AUDJPY","AUDNZD",
  "CADJPY","CADCHF","EURAUD","EURCAD","EURCHF","EURGBP","EURJPY","EURNZD","GBPAUD",
  "GBPCAD","GBPCHF","GBPJPY","GBPNZD","NZDCAD","NZDCHF","NZDJPY","NZDUSD","USDCAD"
};
static const char* CURRENCIES[] = {"EUR","GBP","USD","CHF","JPY","AUD","CAD","NZD"};
#define N_CURRENCIES 8

// ---------------------------- Exposure Table ----------------------------

struct ExposureTable {
  int exposure[N_ASSETS][N_CURRENCIES];
  double exposureDist[N_ASSETS][N_ASSETS];

  void init() {
    for(int i=0;i<N_ASSETS;i++){
      for(int c=0;c<N_CURRENCIES;c++){
        exposure[i][c] = 0;
      }
    }
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        exposureDist[i][j] = 0.0;
      }
    }
  }

  inline double getDist(int i,int j) const { return exposureDist[i][j]; }
};

// ---------------------------- Slab Allocator ----------------------------

template<typename T>
class SlabAllocator {
public:
  T* data;
  int capacity;

  SlabAllocator() : data(NULL), capacity(0) {}
  ~SlabAllocator() { shutdown(); }

  void init(int size) {
    shutdown();
    capacity = size;
    data = (T*)malloc((size_t)capacity * sizeof(T));
    if(data) memset(data, 0, (size_t)capacity * sizeof(T));
  }

  void shutdown() {
    if(data) free(data);
    data = NULL;
    capacity = 0;
  }

  T& operator[](int i) { return data[i]; }
  const T& operator[](int i) const { return data[i]; }
};

// ---------------------------- Feature Buffer (SoA ring) ----------------------------

struct FeatureBufferSoA {
  SlabAllocator<fvar> buffer;
  int windowSize;
  int currentIndex;

  void init(int assets, int window) {
    windowSize = window;
    currentIndex = 0;
    buffer.init(FEAT_N * assets * window);
  }

  void shutdown() { buffer.shutdown(); }

  inline int offset(int feat,int asset,int t) const {
    return (feat * N_ASSETS + asset) * windowSize + t;
  }

  void push(int feat,int asset,fvar value) {
    buffer[offset(feat, asset, currentIndex)] = value;
    currentIndex = (currentIndex + 1) % windowSize;
  }

  // t=0 => most recent
  fvar get(int feat,int asset,int t) const {
    int idx = (currentIndex - 1 - t + windowSize) % windowSize;
    return buffer[offset(feat, asset, idx)];
  }
};

// ---------------------------- Minimal OpenCL (dynamic) ----------------------------

typedef struct _cl_platform_id*   cl_platform_id;
typedef struct _cl_device_id*     cl_device_id;
typedef struct _cl_context*       cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program*       cl_program;
typedef struct _cl_kernel*        cl_kernel;
typedef struct _cl_mem*           cl_mem;
typedef unsigned int              cl_uint;
typedef int                       cl_int;
typedef unsigned long long        cl_ulong;
typedef size_t                    cl_bool;

#define CL_SUCCESS 0
#define CL_DEVICE_TYPE_CPU (1ULL << 1)
#define CL_DEVICE_TYPE_GPU (1ULL << 2)
#define CL_MEM_READ_ONLY   (1ULL << 2)
#define CL_MEM_WRITE_ONLY  (1ULL << 1)
#define CL_MEM_READ_WRITE  (1ULL << 0)
#define CL_TRUE  1
#define CL_FALSE 0
#define CL_PROGRAM_BUILD_LOG 0x1183

class OpenCLBackend {
public:
  HMODULE hOpenCL;
  int ready;

  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kCorr;

  cl_mem bufFeat;
  cl_mem bufCorr;

  int featBytes;
  int corrBytes;

  cl_int (*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
  cl_int (*clGetDeviceIDs)(cl_platform_id, cl_ulong, cl_uint, cl_device_id*, cl_uint*);
  cl_context (*clCreateContext)(void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
  cl_command_queue (*clCreateCommandQueue)(cl_context, cl_device_id, cl_ulong, cl_int*);
  cl_program (*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
  cl_int (*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
  cl_int (*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
  cl_kernel (*clCreateKernel)(cl_program, const char*, cl_int*);
  cl_int (*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
  cl_mem (*clCreateBuffer)(cl_context, cl_ulong, size_t, void*, cl_int*);
  cl_int (*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);
  cl_int (*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
  cl_int (*clFinish)(cl_command_queue);
  cl_int (*clReleaseMemObject)(cl_mem);
  cl_int (*clReleaseKernel)(cl_kernel);
  cl_int (*clReleaseProgram)(cl_program);
  cl_int (*clReleaseCommandQueue)(cl_command_queue);
  cl_int (*clReleaseContext)(cl_context);

  OpenCLBackend()
  : hOpenCL(NULL), ready(0),
    platform(NULL), device(NULL), context(NULL), queue(NULL), program(NULL), kCorr(NULL),
    bufFeat(NULL), bufCorr(NULL),
    featBytes(0), corrBytes(0),
    clGetPlatformIDs(NULL), clGetDeviceIDs(NULL), clCreateContext(NULL), clCreateCommandQueue(NULL),
    clCreateProgramWithSource(NULL), clBuildProgram(NULL), clGetProgramBuildInfo(NULL),
    clCreateKernel(NULL), clSetKernelArg(NULL),
    clCreateBuffer(NULL), clEnqueueWriteBuffer(NULL), clEnqueueReadBuffer(NULL),
    clEnqueueNDRangeKernel(NULL), clFinish(NULL),
    clReleaseMemObject(NULL), clReleaseKernel(NULL), clReleaseProgram(NULL),
    clReleaseCommandQueue(NULL), clReleaseContext(NULL)
  {}

  int loadSymbol(void** fp, const char* name) {
    *fp = (void*)GetProcAddress(hOpenCL, name);
    return (*fp != NULL);
  }

  const char* kernelSource() {
    return
      "__kernel void corr_pairwise(\n"
      "  __global const float* feat,\n"
      "  __global float* outCorr,\n"
      "  const int nAssets,\n"
      "  const int nFeat,\n"
      "  const int windowSize,\n"
      "  const float eps\n"
      "){\n"
      "  int a = (int)get_global_id(0);\n"
      "  int b = (int)get_global_id(1);\n"
      "  if(a >= nAssets || b >= nAssets) return;\n"
      "  if(a >= b) return;\n"
      "  float acc = 0.0f;\n"
      "  for(int f=0; f<nFeat; f++){\n"
      "    int baseA = (f*nAssets + a) * windowSize;\n"
      "    int baseB = (f*nAssets + b) * windowSize;\n"
      "    float mx = 0.0f;\n"
      "    float my = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      mx += feat[baseA + t];\n"
      "      my += feat[baseB + t];\n"
      "    }\n"
      "    mx /= (float)windowSize;\n"
      "    my /= (float)windowSize;\n"
      "    float sxx = 0.0f;\n"
      "    float syy = 0.0f;\n"
      "    float sxy = 0.0f;\n"
      "    for(int t=0; t<windowSize; t++){\n"
      "      float dx = feat[baseA + t] - mx;\n"
      "      float dy = feat[baseB + t] - my;\n"
      "      sxx += dx*dx;\n"
      "      syy += dy*dy;\n"
      "      sxy += dx*dy;\n"
      "    }\n"
      "    float den = sqrt(sxx*syy + eps);\n"
      "    float corr = (den > eps) ? (sxy/den) : 0.0f;\n"
      "    acc += corr;\n"
      "  }\n"
      "  outCorr[a*nAssets + b] = acc / (float)nFeat;\n"
      "}\n";
  }

  void printBuildLog() {
    if(!clGetProgramBuildInfo || !program || !device) return;
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if(logSize == 0) return;
    char* log = (char*)malloc(logSize + 1);
    if(!log) return;
    memset(log, 0, logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    printf("OpenCL build log:\n%s\n", log);
    free(log);
  }

  void init() {
    ready = 0;

    hOpenCL = LoadLibraryA("OpenCL.dll");
    if(!hOpenCL) {
      printf("OpenCL: CPU (OpenCL.dll missing)\n");
      return;
    }

    if(!loadSymbol((void**)&clGetPlatformIDs,       "clGetPlatformIDs")) return;
    if(!loadSymbol((void**)&clGetDeviceIDs,         "clGetDeviceIDs")) return;
    if(!loadSymbol((void**)&clCreateContext,        "clCreateContext")) return;
    if(!loadSymbol((void**)&clCreateCommandQueue,   "clCreateCommandQueue")) return;
    if(!loadSymbol((void**)&clCreateProgramWithSource,"clCreateProgramWithSource")) return;
    if(!loadSymbol((void**)&clBuildProgram,         "clBuildProgram")) return;
    if(!loadSymbol((void**)&clGetProgramBuildInfo,  "clGetProgramBuildInfo")) return;
    if(!loadSymbol((void**)&clCreateKernel,         "clCreateKernel")) return;
    if(!loadSymbol((void**)&clSetKernelArg,         "clSetKernelArg")) return;
    if(!loadSymbol((void**)&clCreateBuffer,         "clCreateBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueWriteBuffer,   "clEnqueueWriteBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueReadBuffer,    "clEnqueueReadBuffer")) return;
    if(!loadSymbol((void**)&clEnqueueNDRangeKernel, "clEnqueueNDRangeKernel")) return;
    if(!loadSymbol((void**)&clFinish,               "clFinish")) return;
    if(!loadSymbol((void**)&clReleaseMemObject,     "clReleaseMemObject")) return;
    if(!loadSymbol((void**)&clReleaseKernel,        "clReleaseKernel")) return;
    if(!loadSymbol((void**)&clReleaseProgram,       "clReleaseProgram")) return;
    if(!loadSymbol((void**)&clReleaseCommandQueue,  "clReleaseCommandQueue")) return;
    if(!loadSymbol((void**)&clReleaseContext,       "clReleaseContext")) return;

    cl_uint nPlat = 0;
    if(clGetPlatformIDs(0, NULL, &nPlat) != CL_SUCCESS || nPlat == 0) {
      printf("OpenCL: CPU (no platform)\n");
      return;
    }
    clGetPlatformIDs(1, &platform, NULL);

    cl_uint nDev = 0;
    cl_int ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &nDev);
    if(ok != CL_SUCCESS || nDev == 0) {
      ok = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &nDev);
      if(ok != CL_SUCCESS || nDev == 0) {
        printf("OpenCL: CPU (no device)\n");
        return;
      }
    }

    cl_int err = 0;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS || !context) {
      printf("OpenCL: CPU (context fail)\n");
      return;
    }

    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS || !queue) {
      printf("OpenCL: CPU (queue fail)\n");
      return;
    }

    const char* src = kernelSource();
    program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    if(err != CL_SUCCESS || !program) {
      printf("OpenCL: CPU (program fail)\n");
      return;
    }

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if(err != CL_SUCCESS) {
      printf("OpenCL: CPU (build fail)\n");
      printBuildLog();
      return;
    }

    kCorr = clCreateKernel(program, "corr_pairwise", &err);
    if(err != CL_SUCCESS || !kCorr) {
      printf("OpenCL: CPU (kernel fail)\n");
      printBuildLog();
      return;
    }

    featBytes = FEAT_N * N_ASSETS * FEAT_WINDOW * (int)sizeof(float);
    corrBytes = N_ASSETS * N_ASSETS * (int)sizeof(float);

    bufFeat = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)featBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufFeat) {
      printf("OpenCL: CPU (bufFeat fail)\n");
      return;
    }

    bufCorr = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (size_t)corrBytes, NULL, &err);
    if(err != CL_SUCCESS || !bufCorr) {
      printf("OpenCL: CPU (bufCorr fail)\n");
      return;
    }

    ready = 1;
    printf("OpenCL: READY (kernel+buffers)\n");
  }

  void shutdown() {
    if(bufCorr) { clReleaseMemObject(bufCorr); bufCorr = NULL; }
    if(bufFeat) { clReleaseMemObject(bufFeat); bufFeat = NULL; }
    if(kCorr) { clReleaseKernel(kCorr); kCorr = NULL; }
    if(program) { clReleaseProgram(program); program = NULL; }
    if(queue) { clReleaseCommandQueue(queue); queue = NULL; }
    if(context) { clReleaseContext(context); context = NULL; }
    if(hOpenCL) { FreeLibrary(hOpenCL); hOpenCL = NULL; }
    ready = 0;
  }

  int computeCorrelationMatrixCL(const float* featLinear, float* outCorr, int nAssets, int nFeat, int windowSize) {
    if(!ready) return 0;
    if(!featLinear || !outCorr) return 0;

    cl_int err = clEnqueueWriteBuffer(queue, bufFeat, CL_TRUE, 0, (size_t)featBytes, featLinear, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    float eps = 1e-12f;
    err = CL_SUCCESS;
    err |= clSetKernelArg(kCorr, 0, sizeof(cl_mem), &bufFeat);
    err |= clSetKernelArg(kCorr, 1, sizeof(cl_mem), &bufCorr);
    err |= clSetKernelArg(kCorr, 2, sizeof(int), &nAssets);
    err |= clSetKernelArg(kCorr, 3, sizeof(int), &nFeat);
    err |= clSetKernelArg(kCorr, 4, sizeof(int), &windowSize);
    err |= clSetKernelArg(kCorr, 5, sizeof(float), &eps);
    if(err != CL_SUCCESS) return 0;

    size_t global[2];
    global[0] = (size_t)nAssets;
    global[1] = (size_t)nAssets;

    err = clEnqueueNDRangeKernel(queue, kCorr, 2, NULL, global, NULL, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    err = clFinish(queue);
    if(err != CL_SUCCESS) return 0;

    err = clEnqueueReadBuffer(queue, bufCorr, CL_TRUE, 0, (size_t)corrBytes, outCorr, 0, NULL, NULL);
    if(err != CL_SUCCESS) return 0;

    return 1;
  }
};

// ---------------------------- Strategy ----------------------------

class MomentumBiasStrategy {
public:
  ExposureTable exposureTable;
  FeatureBufferSoA featSoA;
  OpenCLBackend openCL;

  SlabAllocator<fvar> corrMatrix;
  SlabAllocator<fvar> distMatrix;
  SlabAllocator<fvar> compactness;
  SlabAllocator<fvar> momentum;
  SlabAllocator<fvar> scores;

  SlabAllocator<float> featLinear;
  SlabAllocator<float> corrLinear;

  int barCount;
  int updateCount;
  int isReady;

  MomentumBiasStrategy() : barCount(0), updateCount(0), isReady(0) {}

  void init() {
    char buf[256];
    sprintf(buf, "MomentumBias_v3: Initializing...\n");
    printf(buf);
    OutputDebugStringA(buf);

    sprintf(buf, "1. exposureTable.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    exposureTable.init();

    sprintf(buf, "2. featSoA.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    featSoA.init(N_ASSETS, FEAT_WINDOW);

    sprintf(buf, "3. corrMatrix.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    corrMatrix.init(N_ASSETS * N_ASSETS);

    sprintf(buf, "4. distMatrix.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    distMatrix.init(N_ASSETS * N_ASSETS);

    sprintf(buf, "5. compactness.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    compactness.init(N_ASSETS);

    sprintf(buf, "6. momentum.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    momentum.init(N_ASSETS);

    sprintf(buf, "7. scores.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    scores.init(N_ASSETS);

    sprintf(buf, "8. featLinear.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    featLinear.init(FEAT_N * N_ASSETS * FEAT_WINDOW);

    sprintf(buf, "9. corrLinear.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    corrLinear.init(N_ASSETS * N_ASSETS);

    sprintf(buf, "10. openCL.init...\n");
    printf(buf);
    OutputDebugStringA(buf);
    openCL.init();

    sprintf(buf, "MomentumBias_v3: Ready (OpenCL=%d)\n", openCL.ready);
    printf(buf);
    OutputDebugStringA(buf);

    barCount = 0;
    updateCount = 0;
    isReady = 1;
  }

  void shutdown() {
    printf("MomentumBias_v3: Shutting down...\n");

    openCL.shutdown();

    featSoA.shutdown();
    corrMatrix.shutdown();
    distMatrix.shutdown();
    compactness.shutdown();
    momentum.shutdown();
    scores.shutdown();

    featLinear.shutdown();
    corrLinear.shutdown();
    isReady = 0;
  }

  void computeFeatures(int assetIdx) {
    asset((char*)ASSET_NAMES[assetIdx]);

    vars C = series(priceClose(0));
    vars V = series(Volatility(C, 20));

    if(Bar < 50) return;

    fvar r1 = (fvar)log(C[0] / C[1]);
    fvar rN = (fvar)log(C[0] / C[12]);
    fvar vol = (fvar)V[0];
    fvar zscore = (fvar)((C[0] - C[50]) / (V[0] * 20.0 + EPS));
    fvar rangeP = (fvar)((C[0] - C[50]) / (C[0] + EPS));
    fvar flow = (fvar)(r1 * vol);
    fvar regime = (fvar)((vol > 0.001) ? 1.0 : 0.0);
    fvar volOfVol = (fvar)(vol * vol);
    fvar persistence = (fvar)fabs(r1);

    featSoA.push(0, assetIdx, r1);
    featSoA.push(1, assetIdx, rN);
    featSoA.push(2, assetIdx, vol);
    featSoA.push(3, assetIdx, zscore);
    featSoA.push(4, assetIdx, rangeP);
    featSoA.push(5, assetIdx, flow);
    featSoA.push(6, assetIdx, regime);
    featSoA.push(7, assetIdx, volOfVol);
    featSoA.push(8, assetIdx, persistence);
  }

  void computeCorrelationMatrixCPU() {
    for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = 0;

    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int b=a+1; b<N_ASSETS; b++){
          fvar mx = 0, my = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            mx += featSoA.get(f,a,t);
            my += featSoA.get(f,b,t);
          }
          mx /= (fvar)FEAT_WINDOW;
          my /= (fvar)FEAT_WINDOW;

          fvar sxx = 0, syy = 0, sxy = 0;
          for(int t=0; t<FEAT_WINDOW; t++){
            fvar dx = featSoA.get(f,a,t) - mx;
            fvar dy = featSoA.get(f,b,t) - my;
            sxx += dx*dx;
            syy += dy*dy;
            sxy += dx*dy;
          }

          fvar den = (fvar)sqrt((double)(sxx*syy + (fvar)EPS));
          fvar corr = 0;
          if(den > (fvar)EPS) corr = sxy / den;
          else corr = 0;

          int idx = a*N_ASSETS + b;
          corrMatrix[idx] += corr / (fvar)FEAT_N;
          corrMatrix[b*N_ASSETS + a] = corrMatrix[idx];
        }
      }
    }
  }

  void buildFeatLinear() {
    int idx = 0;
    for(int f=0; f<FEAT_N; f++){
      for(int a=0; a<N_ASSETS; a++){
        for(int t=0; t<FEAT_WINDOW; t++){
          featLinear[idx] = (float)featSoA.get(f, a, t);
          idx++;
        }
      }
    }
  }

  void computeCorrelationMatrix() {
    if(openCL.ready) {
      buildFeatLinear();

      for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrLinear[i] = 0.0f;

      int ok = openCL.computeCorrelationMatrixCL(
        featLinear.data,
        corrLinear.data,
        N_ASSETS,
        FEAT_N,
        FEAT_WINDOW
      );

      if(ok) {
        for(int i=0;i<N_ASSETS*N_ASSETS;i++) corrMatrix[i] = (fvar)0;

        for(int a=0; a<N_ASSETS; a++){
          corrMatrix[a*N_ASSETS + a] = (fvar)1.0;
          for(int b=a+1; b<N_ASSETS; b++){
            float c = corrLinear[a*N_ASSETS + b];
            corrMatrix[a*N_ASSETS + b] = (fvar)c;
            corrMatrix[b*N_ASSETS + a] = (fvar)c;
          }
        }
        return;
      }

      printf("OpenCL: runtime fail -> CPU fallback\n");
      openCL.ready = 0;
    }

    computeCorrelationMatrixCPU();
  }

  void computeDistanceMatrix() {
    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        if(i == j) {
          distMatrix[i*N_ASSETS + j] = (fvar)0;
        } else {
          fvar corrDist = (fvar)1.0 - (fvar)fabs((double)corrMatrix[i*N_ASSETS + j]);
          fvar expDist  = (fvar)exposureTable.getDist(i, j);
          fvar blended = (fvar)LAMBDA_META * corrDist + (fvar)(1.0 - (double)LAMBDA_META) * expDist;
          distMatrix[i*N_ASSETS + j] = blended;
        }
      }
    }
  }

  void floydWarshall() {
    fvar d[28][28];

    for(int i=0;i<N_ASSETS;i++){
      for(int j=0;j<N_ASSETS;j++){
        d[i][j] = distMatrix[i*N_ASSETS + j];
        if(i == j) d[i][j] = (fvar)0;
        if(d[i][j] < (fvar)0) d[i][j] = (fvar)INF;
      }
    }

    for(int k=0;k<N_ASSETS;k++){
      for(int i=0;i<N_ASSETS;i++){
        for(int j=0;j<N_ASSETS;j++){
          if(d[i][k] < (fvar)INF && d[k][j] < (fvar)INF) {
            fvar nk = d[i][k] + d[k][j];
            if(nk < d[i][j]) d[i][j] = nk;
          }
        }
      }
    }

    for(int i=0;i<N_ASSETS;i++){
      fvar w = 0;
      for(int j=i+1;j<N_ASSETS;j++){
        if(d[i][j] < (fvar)INF) w += d[i][j];
      }
      if(w > (fvar)0) compactness[i] = (fvar)(1.0 / (1.0 + (double)w));
      else compactness[i] = (fvar)0;
      momentum[i] = featSoA.get(1, i, 0);
    }
  }

  void computeScores() {
    for(int i=0;i<N_ASSETS;i++){
      fvar coupling = 0;
      int count = 0;

      for(int j=0;j<N_ASSETS;j++){
        if(i != j && distMatrix[i*N_ASSETS + j] < (fvar)INF) {
          coupling += compactness[j];
          count++;
        }
      }

      fvar pCouple = 0;
      if(count > 0) pCouple = coupling / (fvar)count;
      else pCouple = (fvar)0;

      fvar rawScore = (fvar)GAMMA * momentum[i] + (fvar)ALPHA * compactness[i] - (fvar)BETA * pCouple;

      if(rawScore > (fvar)30) rawScore = (fvar)30;
      if(rawScore < (fvar)-30) rawScore = (fvar)-30;

      scores[i] = (fvar)(1.0 / (1.0 + exp(-(double)rawScore)));
    }
  }

  void onBar() {
    barCount++;

    for(int i=0;i<N_ASSETS;i++) computeFeatures(i);

    if(barCount % UPDATE_EVERY == 0) {
      updateCount++;

      computeCorrelationMatrix();
      computeDistanceMatrix();
      floydWarshall();
      computeScores();
      printTopK();
    }
  }

  void printTopK() {
    int indices[N_ASSETS];
    for(int i=0;i<N_ASSETS;i++) indices[i] = i;

    for(int i=0;i<TOP_K;i++){
      for(int j=i+1;j<N_ASSETS;j++){
        if(scores[indices[j]] > scores[indices[i]]) {
          int tmp = indices[i];
          indices[i] = indices[j];
          indices[j] = tmp;
        }
      }
    }

    if(updateCount % 10 == 0) {
      printf("===MomentumBias_v3 Top-K(update#%d,OpenCL=%d)===\n",
        updateCount, openCL.ready);

      for(int i=0;i<TOP_K;i++){
        int idx = indices[i];
        printf(" %d.%s: score=%.4f, M=%.4f, C=%.4f\n", i+1, ASSET_NAMES[idx], (double)scores[idx], (double)momentum[idx], (double)compactness[idx]);
      }
    }
  }
};

// ---------------------------- Zorro DLL entry ----------------------------

static MomentumBiasStrategy* S = NULL;

DLLFUNC void run()
{
  if(is(INITRUN)) {
    BarPeriod = 60;
    LookBack = max(LookBack, FEAT_WINDOW + 50);

    asset((char*)ASSET_NAMES[0]);

    if(!S) {
      S = new MomentumBiasStrategy();
      S->init();
    }
  }

  if(is(EXITRUN)) {
    if(S) {
      S->shutdown();
      delete S;
      S = NULL;
    }
    return;
  }

  if(!S) return;
  if(Bar < LookBack) return;
  if(!S->isReady) return;

  S->onBar();
}


Last edited by TipmyPip; 8 hours ago.
Page 16 of 16 1 2 14 15 16

Moderated by  Petra 

Powered by UBB.threads™ PHP Forum Software 7.7.1