Code
// demo_neuralODE_complex_382A_adaptive_main_opencl_nosdk.cpp (Zorro64 OpenCL C++)
//
// Headerless OpenCL dynamic loader (no CL/cl.h, no OpenCL.lib).
// Prints numerical outputs proving GPU drift is computed, and reports GPU usage.
// If OpenCL fails at any point -> CPU fallback.

#include <zorro.h>
#include <math.h>
#include <windows.h>

#define USE_OPENCL 1
#define CL_DEBUG  1   // <-- set to 0 if you want quiet mode

#define MAXS 4
#define TRI_A 6

// -------------------- Dimensions --------------------
#define N         48
#define K_SIZE    192   // MUST be literal: MAXS*N = 4*48 = 192

#define U_DIM     3
#define NOBS      12

// -------------------- Base knobs --------------------
#define T_END     20.0
#define DT_INIT   0.20
#define DT_MIN    1e-6
#define DT_MAX    0.50
#define TOL       1e-6

// -------------------- Shedding --------------------
#define MEM_BUDGET_MB 2000
static int G_ShedTriggered = 0;

// runtime knobs
static var G_Tol   = TOL;
static var G_DtMax = DT_MAX;

// budgets
static int G_MaxRejects  = 2000000;
static int G_DtMinHitMax = 500000;

// termination knobs
static var G_SteadyEps     = 1e-7;
static int G_SteadyNeed    = 250;
static var G_StateLimitRMS = 1e6;
static var G_ObsLimitRMS   = 1e6;

// SDE / hybrid knobs
static var G_SigmaBase      = 0.02;
static var G_JumpLambdaBase = 0.20;
static var G_JumpMagBase    = 0.20;
static var G_RegRate01      = 0.15;
static var G_RegRate10      = 0.10;

// -------------------- TIGHT_MEM types --------------------
#define TIGHT_MEM 1
#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef var fvar;
#endif

// -------------------- Global buffers --------------------
static fvar G_K[K_SIZE]  = { 0 };
static var  G_stage[N]   = { 0 };
static var  G_dh[N]      = { 0 };

static var  G_ycur[N]    = { 0 };
static var  G_full[N]    = { 0 };
static var  G_half[N]    = { 0 };
static var  G_mid[N]     = { 0 };

static var  Y0[N]        = { 0 };
static var  Yref[N]      = { 0 };
static var  Ysafe[N]     = { 0 };
static var  Ybug[N]      = { 0 };

static var  G_xhat[NOBS] = { 0 };
static var  G_xobs[NOBS] = { 0 };

static var  G_u[U_DIM]   = { 0 };

// -------------------- RNG --------------------
static unsigned int G_Rng = 1;

static void rng_seed(unsigned int s)
{
  if(s == 0) s = 1;
  G_Rng = s;
}

static var rng_u01()
{
  G_Rng = 1664525u * G_Rng + 1013904223u;
  unsigned int x = (G_Rng >> 8) & 0x00FFFFFFu;
  return ((var)x + 1.0) / 16777217.0;
}

static var rng_norm()
{
  var u1 = rng_u01();
  var u2 = rng_u01();
  var r  = sqrt(-2.0 * log(u1));
  var th = 6.283185307179586 * u2;
  return r * cos(th);
}

// -------------------- Utility --------------------
static int memMB()
{
  var m = memory(0);
  if(!(m > 0)) return 0;
  return (int)(m/(1024*1024));
}

static void shed_if_needed()
{
  if(G_ShedTriggered) return;

  int mb = memMB();
  if(mb >= MEM_BUDGET_MB)
  {
    G_ShedTriggered = 1;
    G_Tol *= 10.0;

    G_DtMax *= 0.5;
    if(G_DtMax < DT_MIN) G_DtMax = DT_MIN;

    G_MaxRejects = (int)(G_MaxRejects/2);
    if(G_MaxRejects < 10000) G_MaxRejects = 10000;

    G_DtMinHitMax = (int)(G_DtMinHitMax/2);
    if(G_DtMinHitMax < 10000) G_DtMinHitMax = 10000;
  }
}

// -------------------- Triangular indexing --------------------
static int Aidx(int i, int j) { return (i*(i-1))/2 + j; }

static var getA(float* Atri, int i, int j)
{
  if(i <= j) return 0;
  return (var)Atri[Aidx(i,j)];
}

static void setA(float* Atri, int i, int j, var v)
{
  if(i <= j) return;
  Atri[Aidx(i,j)] = (float)v;
}

static void zeroTableau(int* s, float* Atri, float* b, float* c)
{
  int i;
  *s = 0;
  for(i=0;i<TRI_A;i++) Atri[i] = 0;
  for(i=0;i<MAXS;i++) { b[i]=0; c[i]=0; }
}

// -------------------- Activation --------------------
static var softsign(var z) { return z / (1.0 + fabs(z)); }

// -------------------- Exogenous input u(t) --------------------
static int G_Regime = 0;

static void eval_u(var t)
{
  var u0 = sin(0.11*t) * 0.8;
  if(G_Regime) u0 += 0.35;

  var u1 = cos(0.37*t) * 0.6;
  var u2 = sin(2.30*t) * 0.3;

  G_u[0] = u0;
  G_u[1] = u1;
  G_u[2] = u2;
}

// -------------------- theta(t) --------------------
static void theta_t(var t, var* dampOut, var* ampOut, var* scaleOut, var* stiffOut)
{
  var damp  = -0.40;
  var amp   =  0.35;
  var scale =  0.06;

  var drift = sin(0.03*t) * 0.25;

  var u0 = G_u[0];
  var u1 = G_u[1];

  if(G_Regime)
  {
    amp   *= 1.15;
    scale *= 1.10;
    damp  *= 1.05;
  }

  amp   *= (1.0 + 0.30*drift + 0.15*u0);
  scale *= (1.0 + 0.20*drift + 0.10*u1);
  damp  *= (1.0 + 0.15*drift - 0.10*u0);

  *dampOut  = damp;
  *ampOut   = amp;
  *scaleOut = scale;
  *stiffOut = 2.8;
}

// -------------------- Regime switching --------------------
static void update_regime(var t, var dt)
{
  var u0 = G_u[0];
  var u1 = G_u[1];

  var stress = fabs(u0) + 0.5*fabs(u1);
  var r01 = G_RegRate01 * (1.0 + 0.8*stress);
  var r10 = G_RegRate10 * (1.0 + 0.5*stress);

  var p;
  if(G_Regime == 0)
  {
    p = 1.0 - exp(-r01*dt);
    if(rng_u01() < p) G_Regime = 1;
  }
  else
  {
    p = 1.0 - exp(-r10*dt);
    if(rng_u01() < p) G_Regime = 0;
  }
}

// -------------------- Jumps --------------------
static void apply_jump_if_needed(var t, var dt, var* y)
{
  var u0  = G_u[0];
  var lam = G_JumpLambdaBase * (1.0 + 0.7*fabs(u0));
  if(G_Regime) lam *= 1.25;

  var p = 1.0 - exp(-lam*dt);
  if(rng_u01() >= p) return;

  var uu = rng_u01() - 0.5;
  var j  = tan(3.141592653589793 * uu);
  if(j >  6.0) j =  6.0;
  if(j < -6.0) j = -6.0;

  var mag = G_JumpMagBase * (1.0 + 0.5*fabs(u0));
  if(G_Regime) mag *= 1.2;

  for(int k=0;k<N;k++)
  {
    if(rng_u01() < 0.25)
    {
      var delta = mag * j;
      delta *= (1.0 + 0.01*(k+1));
      y[k] += delta;
    }
  }
}

// -------------------- Observations --------------------
static void observe_from_hidden(var t, var* h)
{
  for(int k=0;k<NOBS;k++)
  {
    var a = h[k];
    var b = h[k + NOBS];

    var z = a + 0.30*b + 0.05*sin(0.17*t + 0.3*(k+1));
    var xhat = z / (1.0 + fabs(z));
    G_xhat[k] = xhat;

    G_xobs[k] = xhat + 0.01*rng_norm();
  }
}

// ===========================================================
//           OpenCL (headerless dynamic loader)
// ===========================================================
#if USE_OPENCL

typedef int                 cl_int;
typedef unsigned int        cl_uint;
typedef unsigned long long  cl_ulong;
typedef unsigned long       cl_bitfield;
typedef cl_bitfield         cl_device_type;
typedef cl_bitfield         cl_mem_flags;
typedef cl_uint             cl_bool;

typedef struct _cl_platform_id*   cl_platform_id;
typedef struct _cl_device_id*     cl_device_id;
typedef struct _cl_context*       cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program*       cl_program;
typedef struct _cl_kernel*        cl_kernel;
typedef struct _cl_mem*           cl_mem;

#define CL_SUCCESS              0
#define CL_TRUE                 1

#define CL_DEVICE_TYPE_ALL      (1ULL << 0)
#define CL_DEVICE_TYPE_GPU      (1ULL << 2)

#define CL_MEM_READ_ONLY        (1ULL << 2)
#define CL_MEM_WRITE_ONLY       (1ULL << 1)

#define CL_PROGRAM_BUILD_LOG    0x1183

typedef cl_int   (__stdcall *PFN_clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
typedef cl_int   (__stdcall *PFN_clGetDeviceIDs)(cl_platform_id, cl_device_type, cl_uint, cl_device_id*, cl_uint*);
typedef cl_context (__stdcall *PFN_clCreateContext)(const void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
typedef cl_command_queue (__stdcall *PFN_clCreateCommandQueue)(cl_context, cl_device_id, cl_bitfield, cl_int*);
typedef cl_program (__stdcall *PFN_clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
typedef cl_int   (__stdcall *PFN_clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
typedef cl_int   (__stdcall *PFN_clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
typedef cl_kernel(__stdcall *PFN_clCreateKernel)(cl_program, const char*, cl_int*);
typedef cl_mem   (__stdcall *PFN_clCreateBuffer)(cl_context, cl_mem_flags, size_t, void*, cl_int*);
typedef cl_int   (__stdcall *PFN_clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
typedef cl_int   (__stdcall *PFN_clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
typedef cl_int   (__stdcall *PFN_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
typedef cl_int   (__stdcall *PFN_clFinish)(cl_command_queue);
typedef cl_int   (__stdcall *PFN_clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);

typedef cl_int   (__stdcall *PFN_clReleaseMemObject)(cl_mem);
typedef cl_int   (__stdcall *PFN_clReleaseKernel)(cl_kernel);
typedef cl_int   (__stdcall *PFN_clReleaseProgram)(cl_program);
typedef cl_int   (__stdcall *PFN_clReleaseCommandQueue)(cl_command_queue);
typedef cl_int   (__stdcall *PFN_clReleaseContext)(cl_context);

static HMODULE gCL_Dll = 0;
static PFN_clGetPlatformIDs         p_clGetPlatformIDs = 0;
static PFN_clGetDeviceIDs           p_clGetDeviceIDs = 0;
static PFN_clCreateContext          p_clCreateContext = 0;
static PFN_clCreateCommandQueue     p_clCreateCommandQueue = 0;
static PFN_clCreateProgramWithSource p_clCreateProgramWithSource = 0;
static PFN_clBuildProgram           p_clBuildProgram = 0;
static PFN_clGetProgramBuildInfo    p_clGetProgramBuildInfo = 0;
static PFN_clCreateKernel           p_clCreateKernel = 0;
static PFN_clCreateBuffer           p_clCreateBuffer = 0;
static PFN_clSetKernelArg           p_clSetKernelArg = 0;
static PFN_clEnqueueWriteBuffer     p_clEnqueueWriteBuffer = 0;
static PFN_clEnqueueNDRangeKernel   p_clEnqueueNDRangeKernel = 0;
static PFN_clFinish                 p_clFinish = 0;
static PFN_clEnqueueReadBuffer      p_clEnqueueReadBuffer = 0;

static PFN_clReleaseMemObject       p_clReleaseMemObject = 0;
static PFN_clReleaseKernel          p_clReleaseKernel = 0;
static PFN_clReleaseProgram         p_clReleaseProgram = 0;
static PFN_clReleaseCommandQueue    p_clReleaseCommandQueue = 0;
static PFN_clReleaseContext         p_clReleaseContext = 0;

static int gCL_Ready = 0;
static int gCL_UsedCalls = 0;
static int gCL_FailedCalls = 0;
static cl_int gCL_LastErr = 0;

static cl_platform_id   gCL_Platform = 0;
static cl_device_id     gCL_Device   = 0;
static cl_context       gCL_Context  = 0;
static cl_command_queue gCL_Queue    = 0;
static cl_program       gCL_Program  = 0;
static cl_kernel        gCL_K_DH     = 0;

static cl_mem gCL_B_Stage = 0;
static cl_mem gCL_B_DH    = 0;

static const char* gCL_Source =
"__kernel void dh_kernel(__global const float* stage, __global float* dh,           \n"
"  float t, float damp, float amp, float scale, float stiff, int regime,           \n"
"  float u0, float u1, float u2)                                                   \n"
"{                                                                                \n"
"  int i = get_global_id(0);                                                       \n"
"  if(i >= 48) return;                                                             \n"
"  float ii = (float)(i+1);                                                        \n"
"  float sum = 0.0f;                                                               \n"
"  for(int j=0;j<48;j++)                                                           \n"
"  {                                                                              \n"
"    float jj = (float)(j+1);                                                      \n"
"    float bj = sin(0.17f*jj) * 0.03f;                                              \n"
"    float inj = u0 * 0.02f * sin(0.07f*jj + 0.19f*t);                              \n"
"    float hj = stage[j];                                                          \n"
"    float in = hj + bj + inj;                                                     \n"
"    float aj = in / (1.0f + fabs(in));                                            \n"
"    float ang = (0.013f*ii*jj) + (0.11f*ii) - (0.07f*jj);                          \n"
"    float ww = sin(ang) * scale;                                                  \n"
"    sum += ww * aj;                                                               \n"
"  }                                                                              \n"
"  float forcing = sin(t + 0.05f*ii) * amp;                                        \n"
"  forcing += sin(20.0f*t + 0.10f*ii) * 0.08f;                                     \n"
"  forcing += (u1 * 0.10f) + (0.05f*u2*sin(0.9f*t));                               \n"
"  float hi = stage[i];                                                            \n"
"  float self = ( (hi*1.8f) / (1.0f + fabs(hi*1.8f)) ) * 0.15f;                    \n"
"  float di = damp;                                                                \n"
"  if((i & 7) == 0) di *= stiff;                                                   \n"
"  float ghi = hi * di;                                                            \n"
"  dh[i] = sum + ghi + forcing + self;                                             \n"
"}                                                                                \n";

static void cl_release_all()
{
  if(gCL_B_DH && p_clReleaseMemObject)     { p_clReleaseMemObject(gCL_B_DH);     gCL_B_DH = 0; }
  if(gCL_B_Stage && p_clReleaseMemObject)  { p_clReleaseMemObject(gCL_B_Stage);  gCL_B_Stage = 0; }
  if(gCL_K_DH && p_clReleaseKernel)        { p_clReleaseKernel(gCL_K_DH);        gCL_K_DH = 0; }
  if(gCL_Program && p_clReleaseProgram)    { p_clReleaseProgram(gCL_Program);    gCL_Program = 0; }
  if(gCL_Queue && p_clReleaseCommandQueue) { p_clReleaseCommandQueue(gCL_Queue); gCL_Queue = 0; }
  if(gCL_Context && p_clReleaseContext)    { p_clReleaseContext(gCL_Context);    gCL_Context = 0; }

  gCL_Device = 0;
  gCL_Platform = 0;
  gCL_Ready = 0;

  if(gCL_Dll)
  {
    FreeLibrary(gCL_Dll);
    gCL_Dll = 0;
  }
}

static FARPROC cl_sym(const char* name)
{
  if(!gCL_Dll) return 0;
  return GetProcAddress(gCL_Dll, name);
}

static int cl_load()
{
  gCL_Dll = LoadLibraryA("OpenCL.dll");
  if(!gCL_Dll) return 0;

  p_clGetPlatformIDs = (PFN_clGetPlatformIDs)cl_sym("clGetPlatformIDs");
  p_clGetDeviceIDs = (PFN_clGetDeviceIDs)cl_sym("clGetDeviceIDs");
  p_clCreateContext = (PFN_clCreateContext)cl_sym("clCreateContext");
  p_clCreateCommandQueue = (PFN_clCreateCommandQueue)cl_sym("clCreateCommandQueue");
  p_clCreateProgramWithSource = (PFN_clCreateProgramWithSource)cl_sym("clCreateProgramWithSource");
  p_clBuildProgram = (PFN_clBuildProgram)cl_sym("clBuildProgram");
  p_clGetProgramBuildInfo = (PFN_clGetProgramBuildInfo)cl_sym("clGetProgramBuildInfo");
  p_clCreateKernel = (PFN_clCreateKernel)cl_sym("clCreateKernel");
  p_clCreateBuffer = (PFN_clCreateBuffer)cl_sym("clCreateBuffer");
  p_clSetKernelArg = (PFN_clSetKernelArg)cl_sym("clSetKernelArg");
  p_clEnqueueWriteBuffer = (PFN_clEnqueueWriteBuffer)cl_sym("clEnqueueWriteBuffer");
  p_clEnqueueNDRangeKernel = (PFN_clEnqueueNDRangeKernel)cl_sym("clEnqueueNDRangeKernel");
  p_clFinish = (PFN_clFinish)cl_sym("clFinish");
  p_clEnqueueReadBuffer = (PFN_clEnqueueReadBuffer)cl_sym("clEnqueueReadBuffer");

  p_clReleaseMemObject = (PFN_clReleaseMemObject)cl_sym("clReleaseMemObject");
  p_clReleaseKernel = (PFN_clReleaseKernel)cl_sym("clReleaseKernel");
  p_clReleaseProgram = (PFN_clReleaseProgram)cl_sym("clReleaseProgram");
  p_clReleaseCommandQueue = (PFN_clReleaseCommandQueue)cl_sym("clReleaseCommandQueue");
  p_clReleaseContext = (PFN_clReleaseContext)cl_sym("clReleaseContext");

  if(!p_clGetPlatformIDs || !p_clGetDeviceIDs || !p_clCreateContext || !p_clCreateCommandQueue ||
     !p_clCreateProgramWithSource || !p_clBuildProgram || !p_clCreateKernel || !p_clCreateBuffer ||
     !p_clSetKernelArg || !p_clEnqueueWriteBuffer || !p_clEnqueueNDRangeKernel || !p_clFinish ||
     !p_clEnqueueReadBuffer)
  {
    cl_release_all();
    return 0;
  }
  return 1;
}

static int cl_init()
{
  if(!cl_load()) return 0;

  cl_int err = CL_SUCCESS;

  cl_uint nPlatforms = 0;
  err = p_clGetPlatformIDs(0, 0, &nPlatforms);
  if(err != CL_SUCCESS || nPlatforms == 0) { cl_release_all(); return 0; }

  cl_platform_id platforms[8];
  if(nPlatforms > 8) nPlatforms = 8;
  err = p_clGetPlatformIDs(nPlatforms, platforms, &nPlatforms);
  if(err != CL_SUCCESS) { cl_release_all(); return 0; }

  cl_platform_id chosenP = 0;
  cl_device_id chosenD = 0;

  // choose GPU device if possible
  for(cl_uint p=0; p<nPlatforms; p++)
  {
    cl_uint nDev = 0;
    if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_GPU, 0, 0, &nDev) == CL_SUCCESS && nDev > 0)
    {
      cl_device_id devs[8];
      if(nDev > 8) nDev = 8;
      if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_GPU, nDev, devs, &nDev) == CL_SUCCESS)
      {
        chosenP = platforms[p];
        chosenD = devs[0];
        break;
      }
    }
  }

  // fallback: any device
  if(!chosenD)
  {
    for(cl_uint p=0; p<nPlatforms; p++)
    {
      cl_uint nDev = 0;
      if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, 0, &nDev) == CL_SUCCESS && nDev > 0)
      {
        cl_device_id devs[8];
        if(nDev > 8) nDev = 8;
        if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, nDev, devs, &nDev) == CL_SUCCESS)
        {
          chosenP = platforms[p];
          chosenD = devs[0];
          break;
        }
      }
    }
  }

  if(!chosenD) { cl_release_all(); return 0; }

  gCL_Platform = chosenP;
  gCL_Device   = chosenD;

  gCL_Context = p_clCreateContext(0, 1, &gCL_Device, 0, 0, &err);
  if(err != CL_SUCCESS || !gCL_Context) { cl_release_all(); return 0; }

  gCL_Queue = p_clCreateCommandQueue(gCL_Context, gCL_Device, 0, &err);
  if(err != CL_SUCCESS || !gCL_Queue) { cl_release_all(); return 0; }

  gCL_Program = p_clCreateProgramWithSource(gCL_Context, 1, &gCL_Source, 0, &err);
  if(err != CL_SUCCESS || !gCL_Program) { cl_release_all(); return 0; }

  // NOTE: you can optionally pass build options here (some drivers like it):
  // const char* opts = "-cl-fast-relaxed-math";
  // err = p_clBuildProgram(gCL_Program, 1, &gCL_Device, opts, 0, 0);
  err = p_clBuildProgram(gCL_Program, 1, &gCL_Device, 0, 0, 0);
  if(err != CL_SUCCESS)
  {
    if(p_clGetProgramBuildInfo)
    {
      char logbuf[4096] = {0};
      size_t logsz = 0;
      p_clGetProgramBuildInfo(gCL_Program, gCL_Device, CL_PROGRAM_BUILD_LOG, sizeof(logbuf), logbuf, &logsz);
      printf("\nOpenCL build failed: %s", logbuf);
    }
    cl_release_all();
    return 0;
  }

  gCL_K_DH = p_clCreateKernel(gCL_Program, "dh_kernel", &err);
  if(err != CL_SUCCESS || !gCL_K_DH) { cl_release_all(); return 0; }

  gCL_B_Stage = p_clCreateBuffer(gCL_Context, CL_MEM_READ_ONLY,  sizeof(float)*N, 0, &err);
  if(err != CL_SUCCESS || !gCL_B_Stage) { cl_release_all(); return 0; }

  gCL_B_DH = p_clCreateBuffer(gCL_Context, CL_MEM_WRITE_ONLY, sizeof(float)*N, 0, &err);
  if(err != CL_SUCCESS || !gCL_B_DH) { cl_release_all(); return 0; }

  gCL_Ready = 1;
  gCL_UsedCalls = 0;
  gCL_FailedCalls = 0;
  gCL_LastErr = 0;
  return 1;
}

static void cl_fail(cl_int err)
{
  gCL_LastErr = err;
  gCL_FailedCalls++;
  gCL_Ready = 0;
#if CL_DEBUG
  printf("\n[OpenCL] ERROR %d -> fallback to CPU", (int)err);
#endif
}

// GPU drift eval
static void f_theta_vec_gpu(var t)
{
  eval_u(t);

  var damp, amp, scale, stiff;
  theta_t(t, &damp, &amp, &scale, &stiff);

  float stageF[N];
  float dhF[N] = { 0 };

  for(int k=0;k<N;k++) stageF[k] = (float)G_stage[k];

  cl_int err = p_clEnqueueWriteBuffer(gCL_Queue, gCL_B_Stage, CL_TRUE, 0, sizeof(float)*N, stageF, 0, 0, 0);
  if(err != CL_SUCCESS) { cl_fail(err); return; }

  int arg = 0;
  float tf     = (float)t;
  float dampf  = (float)damp;
  float ampf   = (float)amp;
  float scalef = (float)scale;
  float stifff = (float)stiff;
  int regime   = G_Regime;
  float u0 = (float)G_u[0];
  float u1 = (float)G_u[1];
  float u2 = (float)G_u[2];

  err  = p_clSetKernelArg(gCL_K_DH, arg++, sizeof(cl_mem), &gCL_B_Stage);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(cl_mem), &gCL_B_DH);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &tf);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &dampf);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &ampf);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &scalef);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &stifff);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(int),    &regime);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &u0);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &u1);
  err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float),  &u2);

  if(err != CL_SUCCESS) { cl_fail(err); return; }

  size_t global = (size_t)N;
  err = p_clEnqueueNDRangeKernel(gCL_Queue, gCL_K_DH, 1, 0, &global, 0, 0, 0, 0);
  if(err != CL_SUCCESS) { cl_fail(err); return; }

  err = p_clFinish(gCL_Queue);
  if(err != CL_SUCCESS) { cl_fail(err); return; }

  err = p_clEnqueueReadBuffer(gCL_Queue, gCL_B_DH, CL_TRUE, 0, sizeof(float)*N, dhF, 0, 0, 0);
  if(err != CL_SUCCESS) { cl_fail(err); return; }

  for(int k=0;k<N;k++) G_dh[k] = (var)dhF[k];

  gCL_UsedCalls++;

#if CL_DEBUG
  // print only the first GPU call to prove it produces numbers
  static int printed = 0;
  if(!printed)
  {
    printed = 1;
    printf("\n[OpenCL] dh sample (t=%g): dh[0]=%.6g dh[1]=%.6g dh[2]=%.6g dh[3]=%.6g",
      t, (double)G_dh[0], (double)G_dh[1], (double)G_dh[2], (double)G_dh[3]);
  }
#endif
}
#endif // USE_OPENCL

// ===========================================================
//                    CPU drift eval
// ===========================================================
static void f_theta_vec_cpu(var t)
{
  eval_u(t);

  var damp, amp, scale, stiff;
  theta_t(t, &damp, &amp, &scale, &stiff);

  for(int i=0;i<N;i++)
  {
    var sum = 0;
    var ii = (i+1);

    for(int j=0;j<N;j++)
    {
      var jj = (j+1);

      var bj  = sin(0.17*jj) * 0.03;
      var inj = G_u[0] * 0.02 * sin(0.07*jj + 0.19*t);

      var hj = G_stage[j];
      var in = hj + bj + inj;
      var aj = softsign(in);

      var ang = (0.013*ii*jj) + (0.11*ii) - (0.07*jj);
      var ww  = sin(ang) * scale;

      sum += ww * aj;
    }

    var forcing = sin(t + 0.05*ii) * amp;
    forcing += sin(20.0*t + 0.10*ii) * 0.08;
    forcing += (G_u[1] * 0.10) + (0.05*G_u[2]*sin(0.9*t));

    var hi   = G_stage[i];
    var self = softsign(hi*1.8) * 0.15;

    var di = damp;
    if((i & 7) == 0) di *= stiff;

    var ghi = hi * di;

    G_dh[i] = sum + ghi + forcing + self;
  }
}

static void f_theta_vec(var t)
{
#if USE_OPENCL
  if(gCL_Ready)
  {
    f_theta_vec_gpu(t);
    if(gCL_Ready) return;
  }
#endif
  f_theta_vec_cpu(t);
}

// -------------------- Vector RK step --------------------
static void rk_step_vec(int s, float* Atri, float* b, float* c,
                        var t, var* y, var h, var* yout)
{
  if(s < 1)
  {
    for(int k=0;k<N;k++) yout[k] = y[k];
    return;
  }
  if(s > MAXS) s = MAXS;

  for(int i=0;i<s;i++)
    for(int k=0;k<N;k++)
      G_K[i*N + k] = (fvar)0;

  for(int i=0;i<s;i++)
  {
    for(int k=0;k<N;k++)
    {
      var acc = y[k];
      for(int j=0;j<i;j++)
        acc += ((var)G_K[j*N + k]) * getA(Atri,i,j) * h;

      G_stage[k] = acc;
    }

    var dt = ((var)c[i]) * h;

    f_theta_vec(t + dt);

    for(int k=0;k<N;k++)
      G_K[i*N + k] = (fvar)G_dh[k];
  }

  for(int k=0;k<N;k++)
  {
    var acc = y[k];
    for(int i=0;i<s;i++)
      acc += ((var)G_K[i*N + k]) * (var)b[i] * h;

    yout[k] = acc;
  }
}

static var err_norm(var* a, var* b)
{
  var s = 0;
  for(int k=0;k<N;k++) { var d = a[k] - b[k]; s += d*d; }
  return sqrt(s / (var)N);
}

static var rms_norm(var* a, int n)
{
  var s = 0;
  for(int k=0;k<n;k++) { var x = a[k]; s += x*x; }
  return sqrt(s / (var)n);
}

static void permute_method_into(int s, float* A, float* b, float* c,
                                int* p,
                                int* sOut, float* Aout, float* bout, float* cout)
{
  *sOut = s;
  if(*sOut < 0) *sOut = 0;
  if(*sOut > MAXS) *sOut = MAXS;

  for(int i=0;i<TRI_A;i++) Aout[i] = 0;
  for(int i=0;i<MAXS;i++)  { bout[i]=0; cout[i]=0; }

  for(int i=0;i<*sOut;i++)
  {
    int pi = p[i];
    if(pi < 0) pi = 0;
    if(pi >= s) pi = s-1;
    bout[i] = b[pi];
    cout[i] = c[pi];
  }

  for(int i=0;i<*sOut;i++)
    for(int j=0;j<i;j++)
    {
      int pi = p[i];
      int pj = p[j];
      if(pi < 0) pi = 0;
      if(pj < 0) pj = 0;
      if(pi >= s) pi = s-1;
      if(pj >= s) pj = s-1;
      setA(Aout, i, j, getA(A, pi, pj));
    }
}

static void make_swappable_rk4(int* s, float* Atri, float* b, float* c)
{
  zeroTableau(s, Atri, b, c);
  *s = 4;

  c[0] = 0.0f;
  c[1] = 0.30f;
  c[2] = 0.80f;
  c[3] = 1.00f;

  setA(Atri, 1, 0, 0.30);
  setA(Atri, 2, 0, 0.80);
  setA(Atri, 3, 1, 0.50);
  setA(Atri, 3, 2, 0.50);

  b[0] = 0.10f;
  b[1] = 0.20f;
  b[2] = 0.30f;
  b[3] = 0.40f;
}

static void apply_diffusion(var t, var dt, var* y)
{
  eval_u(t);

  var u0 = G_u[0];
  var u2 = G_u[2];

  var sigma = G_SigmaBase * (1.0 + 0.6*fabs(u0) + 0.3*fabs(u2));
  if(G_Regime) sigma *= 1.25;

  var sdt = sqrt(dt);

  for(int k=0;k<N;k++)
  {
    var gmul = 1.0 + (fabs(y[k]) * 0.15);
    y[k] += sigma * gmul * sdt * rng_norm();
  }
}

static void integrate_adaptive_sde(int s, float* A, float* b, float* c,
                                   var t0, var* y0, var tEnd,
                                   unsigned int seed,
                                   var* yOut,
                                   int* stepsOut, int* rejectsOut,
                                   var* dtMinOut,
                                   int* stopReasonOut)
{
  rng_seed(seed);
  G_Regime = 0;

  var t = t0;
  var dt = DT_INIT;
  var dtMinSeen = 1e9;

  for(int k=0;k<N;k++) G_ycur[k] = y0[k];

  int steps = 0, rejects = 0, iters = 0, dtMinHits = 0, steadyCount = 0;
  int stopReason = 0;

  while(t < tEnd)
  {
    if((iters & 255) == 0) shed_if_needed();
    iters++;

    if(dt < DT_MIN) dt = DT_MIN;
    if(dt > G_DtMax) dt = G_DtMax;
    if(t + dt > tEnd) dt = tEnd - t;

    if(dt < dtMinSeen) dtMinSeen = dt;
    if(dt <= DT_MIN) dtMinHits++;

    if(dtMinHits > G_DtMinHitMax) { stopReason = 4; break; }

    rk_step_vec(s, A, b, c, t, G_ycur, dt, G_full);

    var halfdt = dt * 0.5;
    rk_step_vec(s, A, b, c, t,        G_ycur, halfdt, G_mid);
    rk_step_vec(s, A, b, c, t+halfdt, G_mid,  halfdt, G_half);

    var e = err_norm(G_half, G_full);

    if(e <= G_Tol || dt <= DT_MIN)
    {
      for(int k=0;k<N;k++) G_ycur[k] = G_half[k];

      eval_u(t);
      update_regime(t, dt);

      apply_diffusion(t, dt, G_ycur);
      apply_jump_if_needed(t, dt, G_ycur);

      observe_from_hidden(t + dt, G_ycur);

      var yr = rms_norm(G_ycur, N);
      var xr = rms_norm(G_xobs, NOBS);
      if(yr > G_StateLimitRMS || xr > G_ObsLimitRMS)
      {
        stopReason = 2;
        t += dt; steps++;
        break;
      }

      for(int k=0;k<N;k++) G_stage[k] = G_ycur[k];
      f_theta_vec(t + dt);
      var dr = rms_norm(G_dh, N);

      if(dr < G_SteadyEps) steadyCount++; else steadyCount = 0;

      t += dt; steps++;
      if(steadyCount >= G_SteadyNeed) { stopReason = 1; break; }

      if(e < 1e-16) dt *= 2.0;
      else
      {
        var fac = pow(G_Tol / e, 0.2) * 0.9;
        if(fac < 0.5) fac = 0.5;
        if(fac > 2.0) fac = 2.0;
        dt *= fac;
      }
    }
    else
    {
      rejects++;
      if(rejects > G_MaxRejects) { stopReason = 3; break; }
      dt *= 0.5;
    }
  }

  for(int k=0;k<N;k++) yOut[k] = G_ycur[k];
  *stepsOut = steps;
  *rejectsOut = rejects;
  *dtMinOut = dtMinSeen;
  *stopReasonOut = stopReason;
}

// ===========================================================
//        One-time GPU vs CPU drift self-test (prints numbers)
// ===========================================================
static void drift_selftest()
{
  // stage = Y0 snapshot at t = 0.5 (arbitrary)
  for(int k=0;k<N;k++) G_stage[k] = Y0[k];

  var ttest = 0.5;

  // CPU
  var dh_cpu[N];
  f_theta_vec_cpu(ttest);
  for(int k=0;k<N;k++) dh_cpu[k] = G_dh[k];

#if USE_OPENCL
  if(gCL_Ready)
  {
    // GPU
    f_theta_vec_gpu(ttest);
    var dh_gpu[N];
    for(int k=0;k<N;k++) dh_gpu[k] = G_dh[k];

    // RMS diff
    var s = 0;
    for(int k=0;k<N;k++) { var d = dh_gpu[k] - dh_cpu[k]; s += d*d; }
    var rms = sqrt(s / (var)N);

    printf("\n[SelfTest] dh CPU vs GPU @t=%g: RMS diff = %.3e", ttest, (double)rms);
    printf("\n[SelfTest] CPU dh[0..3]=%.6g %.6g %.6g %.6g",
      (double)dh_cpu[0], (double)dh_cpu[1], (double)dh_cpu[2], (double)dh_cpu[3]);
    printf("\n[SelfTest] GPU dh[0..3]=%.6g %.6g %.6g %.6g",
      (double)dh_gpu[0], (double)dh_gpu[1], (double)dh_gpu[2], (double)dh_gpu[3]);
  }
  else
  {
    printf("\n[SelfTest] OpenCL not ready -> skipped GPU compare");
  }
#else
  printf("\n[SelfTest] USE_OPENCL=0 -> CPU only");
#endif
}

// -------------------- Entry point --------------------
DLLFUNC int main()
{
#if USE_OPENCL
  if(cl_init())
    printf("\nOpenCL: enabled (dynamic loader)");
  else
    printf("\nOpenCL: not available -> CPU fallback");
#endif

  // reset knobs
  G_ShedTriggered = 0;
  G_Tol   = TOL;
  G_DtMax = DT_MAX;

  G_MaxRejects  = 2000000;
  G_DtMinHitMax = 500000;

  G_SteadyEps     = 1e-7;
  G_SteadyNeed    = 250;
  G_StateLimitRMS = 1e6;
  G_ObsLimitRMS   = 1e6;

  G_SigmaBase       = 0.02;
  G_JumpLambdaBase  = 0.20;
  G_JumpMagBase     = 0.20;
  G_RegRate01       = 0.15;
  G_RegRate10       = 0.10;

  int   S = 0;
  float A[TRI_A];
  float b[MAXS];
  float c[MAXS];
  make_swappable_rk4(&S, A, b, c);

  int   SB = 0;
  float AB[TRI_A];
  float bB[MAXS];
  float cB[MAXS];

  int p[MAXS];
  p[0]=0; p[1]=2; p[2]=1; p[3]=3;
  permute_method_into(S, A, b, c, p, &SB, AB, bB, cB);

  int   SW = S;
  float AW[TRI_A];
  float bW[MAXS];
  float cW[MAXS];

  for(int i=0;i<TRI_A;i++) AW[i] = A[i];
  for(int i=0;i<MAXS;i++)  { bW[i]=0; cW[i]=0; }
  for(int i=0;i<SW;i++)
  {
    int pi = p[i];
    bW[i] = b[pi];
    cW[i] = c[pi];
  }

  for(int k=0;k<N;k++)
    Y0[k] = 0.2*sin(0.31*(k+1)) + 0.15*cos(0.17*(k+1))
          + 0.05*sin(0.07*(k+1)*(k+1));

  var t0 = 0.0;

  // seed for custom RNG
  var now = wdate(NOW);
  unsigned int MySeed = (unsigned int)utm(now);
  var frac = now - floor(now);
  var ms = frac * 86400000.0;
  MySeed = 1664525u * MySeed + (unsigned int)ms + 1013904223u;
  if(MySeed == 0) MySeed = 1;
  printf("\nMySeed=%u", MySeed);

  // ---- PROOF: print GPU numerical results + CPU comparison ----
  drift_selftest();

  int stepsRef=0,  rejRef=0;   var dtMinRef=0;  int stopRef=0;
  int stepsSafe=0, rejSafe=0;  var dtMinSafe=0; int stopSafe=0;
  int stepsBug=0,  rejBug=0;   var dtMinBug=0;  int stopBug=0;

  integrate_adaptive_sde(S,  A,  b,  c,  t0, Y0, T_END, MySeed, Yref,
                         &stepsRef,  &rejRef,  &dtMinRef,  &stopRef);

  integrate_adaptive_sde(SB, AB, bB, cB, t0, Y0, T_END, MySeed, Ysafe,
                         &stepsSafe, &rejSafe, &dtMinSafe, &stopSafe);

  integrate_adaptive_sde(SW, AW, bW, cW, t0, Y0, T_END, MySeed, Ybug,
                         &stepsBug,  &rejBug,  &dtMinBug,  &stopBug);

  var diffSafe = err_norm(Ysafe, Yref);
  var diffBug  = err_norm(Ybug,  Yref);

  printf("\n\nNeural SDE / Hybrid ODE integration (N=%d) to T=%g  baseTol=%g", N, T_END, TOL);

  printf("\nA) Reference tableau:");
  printf("\n   accepted=%d rejected=%d minDt=%g stop=%d", stepsRef, rejRef, dtMinRef, stopRef);

  printf("\nB) Theorem-382A safe stage-refactor:");
  printf("\n   accepted=%d rejected=%d minDt=%g stop=%d", stepsSafe, rejSafe, dtMinSafe, stopSafe);
  printf("\n   diff to reference (RMS) = %.3e", (double)diffSafe);

  printf("\nC) Buggy stage-refactor:");
  printf("\n   accepted=%d rejected=%d minDt=%g stop=%d", stepsBug, rejBug, dtMinBug, stopBug);
  printf("\n   diff to reference (RMS) = %.3e", (double)diffBug);

#if USE_OPENCL
  printf("\n\nOpenCL usage: gpu_calls=%d failed_calls=%d last_err=%d",
         gCL_UsedCalls, gCL_FailedCalls, (int)gCL_LastErr);
  cl_release_all();
#endif

  printf("\n\nStop codes: 0=tEnd, 1=steady-state, 2=event(blow-up), 3=rejectBudget, 4=dtMinBudget\n");

  return quit("done");
}