// Code
// demo_neuralODE_complex_382A_adaptive_main_cuda_nosdk.cpp (Zorro64 C++)
//
// Headerless CUDA (Driver API) + NVRTC runtime compilation (no cuda.h, no nvrtc.h, no .lib).
// - Loads nvcuda.dll and nvrtc64_*.dll dynamically.
// - Compiles CUDA C kernel to PTX at runtime, loads PTX, launches kernel.
// - Prints numerical outputs proving CUDA drift is computed, reports GPU usage.
// - If CUDA/NVRTC missing or any failure -> CPU fallback.

#include <zorro.h>
#include <math.h>
#include <windows.h>

#define USE_CUDA  1
#define CU_DEBUG  1   // set 0 to reduce prints

#define MAXS 4
#define TRI_A 6

// -------------------- Dimensions --------------------
#define N         48
#define K_SIZE    192

#define U_DIM     3
#define NOBS      12

// -------------------- Base knobs --------------------
#define T_END     20.0
#define DT_INIT   0.20
#define DT_MIN    1e-6
#define DT_MAX    0.50
#define TOL       1e-6

// -------------------- Shedding --------------------
#define MEM_BUDGET_MB 2000
static int G_ShedTriggered = 0;

// runtime knobs
static var G_Tol   = TOL;
static var G_DtMax = DT_MAX;

// budgets
static int G_MaxRejects  = 2000000;
static int G_DtMinHitMax = 500000;

// termination knobs
static var G_SteadyEps     = 1e-7;
static int G_SteadyNeed    = 250;
static var G_StateLimitRMS = 1e6;
static var G_ObsLimitRMS   = 1e6;

// SDE / hybrid knobs
static var G_SigmaBase      = 0.02;
static var G_JumpLambdaBase = 0.20;
static var G_JumpMagBase    = 0.20;
static var G_RegRate01      = 0.15;
static var G_RegRate10      = 0.10;

#define TIGHT_MEM 1
#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef var fvar;
#endif

// -------------------- Global buffers --------------------
static fvar G_K[K_SIZE]  = { 0 };
static var  G_stage[N]   = { 0 };
static var  G_dh[N]      = { 0 };

static var  G_ycur[N]    = { 0 };
static var  G_full[N]    = { 0 };
static var  G_half[N]    = { 0 };
static var  G_mid[N]     = { 0 };

static var  Y0[N]        = { 0 };
static var  Yref[N]      = { 0 };
static var  Ysafe[N]     = { 0 };
static var  Ybug[N]      = { 0 };

static var  G_xhat[NOBS] = { 0 };
static var  G_xobs[NOBS] = { 0 };

static var  G_u[U_DIM]   = { 0 };

// -------------------- RNG --------------------
static unsigned int G_Rng = 1;

// Seed the LCG; a zero seed is remapped to 1 so the stream never collapses.
static void rng_seed(unsigned int s)
{
  G_Rng = s ? s : 1u;
}

// Uniform draw in (0,1] using the Numerical-Recipes LCG constants;
// only the top 24 bits of the state are used for the mantissa.
static var rng_u01()
{
  G_Rng = 1664525u * G_Rng + 1013904223u;
  unsigned int hi24 = (G_Rng >> 8) & 0x00FFFFFFu;
  var u = ((var)hi24 + 1.0) / 16777217.0;
  return u;
}

// Standard normal draw via Box-Muller (cosine branch only; consumes
// exactly two uniforms per call, in order).
static var rng_norm()
{
  var a = rng_u01();
  var b = rng_u01();
  var radius = sqrt(-2.0 * log(a));
  var angle  = 6.283185307179586 * b;
  return radius * cos(angle);
}

// -------------------- Utility --------------------
static int memMB()
{
  var m = memory(0);
  if(!(m > 0)) return 0;
  return (int)(m/(1024*1024));
}

// One-shot load shedding: the first time memory crosses the budget,
// relax the error tolerance, tighten the step-size cap, and halve the
// reject/dt-floor budgets (with floors of 10000 each).
static void shed_if_needed()
{
  if(G_ShedTriggered) return;
  if(memMB() < MEM_BUDGET_MB) return;

  G_ShedTriggered = 1;
  G_Tol *= 10.0;

  G_DtMax *= 0.5;
  if(G_DtMax < DT_MIN) G_DtMax = DT_MIN;

  G_MaxRejects /= 2;
  if(G_MaxRejects < 10000) G_MaxRejects = 10000;

  G_DtMinHitMax /= 2;
  if(G_DtMinHitMax < 10000) G_DtMinHitMax = 10000;
}

// -------------------- Triangular indexing --------------------
// Packed strictly-lower-triangular index: row i holds i entries that
// start at offset i*(i-1)/2, so element (i,j) with j<i lives at base+j.
static int Aidx(int i, int j)
{
  return j + (i*(i-1))/2;
}

// Read A[i][j] from the packed lower triangle; the diagonal and upper
// triangle are structurally zero.
static var getA(float* Atri, int i, int j)
{
  if(j >= i) return 0;
  return (var)Atri[Aidx(i,j)];
}

// Write A[i][j] into the packed lower triangle; writes on or above the
// diagonal are silently ignored (those entries are structurally zero).
static void setA(float* Atri, int i, int j, var v)
{
  if(j >= i) return;
  Atri[Aidx(i,j)] = (float)v;
}

// Reset a Butcher tableau to the empty method: 0 stages, all-zero A
// (packed triangle), weights b and nodes c.
static void zeroTableau(int* s, float* Atri, float* b, float* c)
{
  *s = 0;
  for(int k=0;k<TRI_A;k++)
    Atri[k] = 0;
  for(int k=0;k<MAXS;k++)
  {
    b[k] = 0;
    c[k] = 0;
  }
}

// -------------------- Activation --------------------
// Softsign activation z/(1+|z|): smooth, bounded in (-1,1).
static var softsign(var z) { return z / (fabs(z) + 1.0); }

// -------------------- Exogenous input u(t) --------------------
static int G_Regime = 0;

// Fill the exogenous input vector G_u at time t; regime 1 adds a
// constant offset to the first channel only.
static void eval_u(var t)
{
  G_u[0] = sin(0.11*t) * 0.8 + (G_Regime ? 0.35 : 0.0);
  G_u[1] = cos(0.37*t) * 0.6;
  G_u[2] = sin(2.30*t) * 0.3;
}

// -------------------- theta(t) --------------------
// Time-varying model parameters theta(t): damping, forcing amplitude,
// coupling scale, and a fixed stiffness multiplier. Base values are
// modulated by a slow sinusoidal drift, the exogenous inputs in G_u,
// and (multiplicatively) by the active regime.
static void theta_t(var t, var* dampOut, var* ampOut, var* scaleOut, var* stiffOut)
{
  var drift = sin(0.03*t) * 0.25;
  var u0 = G_u[0];
  var u1 = G_u[1];

  var damp  = -0.40;
  var amp   =  0.35;
  var scale =  0.06;

  if(G_Regime)    // regime 1: more energetic forcing, slightly more damping
  {
    amp   *= 1.15;
    scale *= 1.10;
    damp  *= 1.05;
  }

  *ampOut   = amp   * (1.0 + 0.30*drift + 0.15*u0);
  *scaleOut = scale * (1.0 + 0.20*drift + 0.10*u1);
  *dampOut  = damp  * (1.0 + 0.15*drift - 0.10*u0);
  *stiffOut = 2.8;
}

// -------------------- Regime switching --------------------
// Two-state Markov regime switch. Hazard rates grow with an input
// "stress" level; the per-step switch probability over dt is
// 1 - exp(-rate*dt). Consumes exactly one uniform per call.
static void update_regime(var t, var dt)
{
  var stress = fabs(G_u[0]) + 0.5*fabs(G_u[1]);

  if(G_Regime == 0)
  {
    var rate = G_RegRate01 * (1.0 + 0.8*stress);
    if(rng_u01() < 1.0 - exp(-rate*dt))
      G_Regime = 1;
  }
  else
  {
    var rate = G_RegRate10 * (1.0 + 0.5*stress);
    if(rng_u01() < 1.0 - exp(-rate*dt))
      G_Regime = 0;
  }
}

// -------------------- Jumps --------------------
// Jump component: with probability 1-exp(-lambda*dt), draw one
// heavy-tailed (Cauchy-like, clamped to +-6) jump size and add it to a
// random ~25% subset of the state with a small per-component tilt.
static void apply_jump_if_needed(var t, var dt, var* y)
{
  var u0  = G_u[0];
  var lam = G_JumpLambdaBase * (1.0 + 0.7*fabs(u0));
  if(G_Regime) lam *= 1.25;

  var p = 1.0 - exp(-lam*dt);
  if(rng_u01() >= p) return;     // no jump event this step

  // tan of a uniform angle gives a Cauchy draw; clamp the tails
  var j = tan(3.141592653589793 * (rng_u01() - 0.5));
  if(j >  6.0) j =  6.0;
  if(j < -6.0) j = -6.0;

  var mag = G_JumpMagBase * (1.0 + 0.5*fabs(u0));
  if(G_Regime) mag *= 1.2;

  for(int k=0;k<N;k++)
  {
    if(rng_u01() < 0.25)
    {
      var delta = mag * j;
      delta *= (1.0 + 0.01*(k+1));  // component-dependent tilt
      y[k] += delta;
    }
  }
}

// -------------------- Observations --------------------
// Map the hidden state to NOBS observables: softsign of a linear blend
// of two hidden channels plus a slow sinusoid (G_xhat), then additive
// Gaussian measurement noise (G_xobs). Requires N >= 2*NOBS.
static void observe_from_hidden(var t, var* h)
{
  for(int k=0;k<NOBS;k++)
  {
    var a = h[k];
    var b = h[k + NOBS];
    var z = a + 0.30*b + 0.05*sin(0.17*t + 0.3*(k+1));

    var clean = z / (1.0 + fabs(z));
    G_xhat[k] = clean;
    G_xobs[k] = clean + 0.01*rng_norm();
  }
}

// ===========================================================
//                    CUDA (Driver API + NVRTC)
// ===========================================================
#if USE_CUDA

// --- minimal CUDA driver types ---
typedef int CUresult;
typedef int CUdevice;
typedef struct CUctx_st* CUcontext;
typedef struct CUmod_st* CUmodule;
typedef struct CUfunc_st* CUfunction;
typedef unsigned long long CUdeviceptr;

#define CUDA_SUCCESS 0

// function pointer types (nvcuda.dll)
typedef CUresult (__stdcall *PFN_cuInit)(unsigned int);
typedef CUresult (__stdcall *PFN_cuDeviceGetCount)(int*);
typedef CUresult (__stdcall *PFN_cuDeviceGet)(CUdevice*, int);
typedef CUresult (__stdcall *PFN_cuDeviceComputeCapability)(int*, int*, CUdevice);
typedef CUresult (__stdcall *PFN_cuCtxCreate_v2)(CUcontext*, unsigned int, CUdevice);
typedef CUresult (__stdcall *PFN_cuCtxDestroy_v2)(CUcontext);

typedef CUresult (__stdcall *PFN_cuModuleLoadDataEx)(CUmodule*, const void*, unsigned int, void*, void*);
typedef CUresult (__stdcall *PFN_cuModuleGetFunction)(CUfunction*, CUmodule, const char*);
typedef CUresult (__stdcall *PFN_cuModuleUnload)(CUmodule);

typedef CUresult (__stdcall *PFN_cuMemAlloc_v2)(CUdeviceptr*, size_t);
typedef CUresult (__stdcall *PFN_cuMemFree_v2)(CUdeviceptr);
typedef CUresult (__stdcall *PFN_cuMemcpyHtoD_v2)(CUdeviceptr, const void*, size_t);
typedef CUresult (__stdcall *PFN_cuMemcpyDtoH_v2)(void*, CUdeviceptr, size_t);
typedef CUresult (__stdcall *PFN_cuLaunchKernel)(
  CUfunction,
  unsigned int, unsigned int, unsigned int,
  unsigned int, unsigned int, unsigned int,
  unsigned int,
  void*,
  void**,
  void**);
typedef CUresult (__stdcall *PFN_cuCtxSynchronize)(void);

// --- minimal NVRTC types ---
typedef int nvrtcResult;
typedef struct _nvrtcProgram* nvrtcProgram;

#define NVRTC_SUCCESS 0

typedef nvrtcResult (__stdcall *PFN_nvrtcCreateProgram)(
  nvrtcProgram*,
  const char*,
  const char*,
  int,
  const char* const*,
  const char* const*);
typedef nvrtcResult (__stdcall *PFN_nvrtcCompileProgram)(nvrtcProgram, int, const char* const*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetProgramLogSize)(nvrtcProgram, size_t*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetProgramLog)(nvrtcProgram, char*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetPTXSize)(nvrtcProgram, size_t*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetPTX)(nvrtcProgram, char*);
typedef nvrtcResult (__stdcall *PFN_nvrtcDestroyProgram)(nvrtcProgram*);

// loaded DLLs
static HMODULE gCU_Dll = 0;
static HMODULE gNVRTC_Dll = 0;

// CUDA pointers
static PFN_cuInit p_cuInit = 0;
static PFN_cuDeviceGetCount p_cuDeviceGetCount = 0;
static PFN_cuDeviceGet p_cuDeviceGet = 0;
static PFN_cuDeviceComputeCapability p_cuDeviceComputeCapability = 0;
static PFN_cuCtxCreate_v2 p_cuCtxCreate = 0;
static PFN_cuCtxDestroy_v2 p_cuCtxDestroy = 0;

static PFN_cuModuleLoadDataEx p_cuModuleLoadDataEx = 0;
static PFN_cuModuleGetFunction p_cuModuleGetFunction = 0;
static PFN_cuModuleUnload p_cuModuleUnload = 0;

static PFN_cuMemAlloc_v2 p_cuMemAlloc = 0;
static PFN_cuMemFree_v2 p_cuMemFree = 0;
static PFN_cuMemcpyHtoD_v2 p_cuMemcpyHtoD = 0;
static PFN_cuMemcpyDtoH_v2 p_cuMemcpyDtoH = 0;
static PFN_cuLaunchKernel p_cuLaunchKernel = 0;
static PFN_cuCtxSynchronize p_cuCtxSynchronize = 0;

// NVRTC pointers
static PFN_nvrtcCreateProgram p_nvrtcCreateProgram = 0;
static PFN_nvrtcCompileProgram p_nvrtcCompileProgram = 0;
static PFN_nvrtcGetProgramLogSize p_nvrtcGetProgramLogSize = 0;
static PFN_nvrtcGetProgramLog p_nvrtcGetProgramLog = 0;
static PFN_nvrtcGetPTXSize p_nvrtcGetPTXSize = 0;
static PFN_nvrtcGetPTX p_nvrtcGetPTX = 0;
static PFN_nvrtcDestroyProgram p_nvrtcDestroyProgram = 0;

// CUDA state
static int gCU_Ready = 0;
static int gCU_UsedCalls = 0;
static int gCU_FailedCalls = 0;
static CUresult gCU_LastErr = 0;

static CUdevice  gCU_Device = 0;
static CUcontext gCU_Ctx = 0;
static CUmodule  gCU_Mod = 0;
static CUfunction gCU_Func = 0;

static CUdeviceptr gCU_dStage = 0; // float[N]
static CUdeviceptr gCU_dDh    = 0; // float[N]

// CUDA kernel source compiled at runtime by NVRTC.
// One thread per state component i; N=48 is hard-coded in the string
// (must match the host-side N macro) to avoid host-side string
// stitching. The math mirrors f_theta_vec_cpu(): coupling sum over a
// procedural weight matrix, damped linear self term (stiffened every
// 8th component), softsign self-feedback, and time/input forcing.
// NOTE(review): the kernel uses float and __sinf fast-math intrinsics,
// so it matches the double-precision CPU path only approximately — the
// RMS difference is reported by drift_selftest().
static const char* gCU_Source =
"extern \"C\" __global__ void dh_kernel(const float* stage, float* dh,              \n"
"  float t, float damp, float amp, float scale, float stiff, int regime,           \n"
"  float u0, float u1, float u2)                                                   \n"
"{                                                                                \n"
"  int i = (int)(blockIdx.x * blockDim.x + threadIdx.x);                            \n"
"  if(i >= 48) return;                                                             \n"
"  float ii = (float)(i+1);                                                        \n"
"  float sum = 0.0f;                                                               \n"
"  #pragma unroll                                                                  \n"
"  for(int j=0;j<48;j++)                                                           \n"
"  {                                                                              \n"
"    float jj = (float)(j+1);                                                      \n"
"    float bj  = __sinf(0.17f*jj) * 0.03f;                                          \n"
"    float inj = u0 * 0.02f * __sinf(0.07f*jj + 0.19f*t);                           \n"
"    float hj = stage[j];                                                          \n"
"    float in = hj + bj + inj;                                                     \n"
"    float aj = in / (1.0f + fabsf(in));                                           \n"
"    float ang = (0.013f*ii*jj) + (0.11f*ii) - (0.07f*jj);                          \n"
"    float ww = __sinf(ang) * scale;                                               \n"
"    sum += ww * aj;                                                               \n"
"  }                                                                              \n"
"  float forcing = __sinf(t + 0.05f*ii) * amp;                                      \n"
"  forcing += __sinf(20.0f*t + 0.10f*ii) * 0.08f;                                   \n"
"  forcing += (u1 * 0.10f) + (0.05f*u2*__sinf(0.9f*t));                             \n"
"  float hi = stage[i];                                                            \n"
"  float self = ((hi*1.8f) / (1.0f + fabsf(hi*1.8f))) * 0.15f;                      \n"
"  float di = damp;                                                                \n"
"  if((i & 7) == 0) di *= stiff;                                                   \n"
"  float ghi = hi * di;                                                            \n"
"  dh[i] = sum + ghi + forcing + self;                                             \n"
"}                                                                                \n";

// Record a CUDA driver error and permanently disable the GPU path;
// subsequent drift evaluations fall back to the CPU implementation.
static void cu_fail(CUresult e)
{
  gCU_FailedCalls++;
  gCU_LastErr = e;
  gCU_Ready = 0;
#if CU_DEBUG
  printf("\n[CUDA] ERROR %d -> fallback to CPU", (int)e);
#endif
}

// Probe a list of versioned NVRTC DLL names (newest first) and return
// the first one that loads; falls back to the generic "nvrtc64.dll".
static HMODULE load_nvrtc_any()
{
  static const char* names[] = {
    "nvrtc64_123_0.dll",
    "nvrtc64_122_0.dll",
    "nvrtc64_121_0.dll",
    "nvrtc64_120_0.dll",
    "nvrtc64_118_0.dll",
    "nvrtc64_117_0.dll",
    "nvrtc64_116_0.dll",
    "nvrtc64_115_0.dll",
    "nvrtc64_114_0.dll",
    "nvrtc64_113_0.dll",
    "nvrtc64_112_0.dll",
    "nvrtc64_111_0.dll",
    "nvrtc64_110_0.dll",
    "nvrtc64_102_0.dll",
    "nvrtc64_101_0.dll",
    "nvrtc64_100_0.dll",
    "nvrtc64_92.dll"
  };
  const int count = (int)(sizeof(names)/sizeof(names[0]));

  for(int i=0;i<count;i++)
  {
    HMODULE h = LoadLibraryA(names[i]);
    if(h) return h;
  }
  // last resort: generic name (some setups provide it)
  return LoadLibraryA("nvrtc64.dll");
}

// GetProcAddress wrapper that tolerates a null module handle.
static FARPROC sym(HMODULE h, const char* name)
{
  return h ? GetProcAddress(h, name) : 0;
}

// Tear down all CUDA/NVRTC state in reverse order of acquisition:
// device buffers -> module -> context -> DLLs. Every step is guarded
// on both the handle and the function pointer, so this is safe to call
// from any partially-initialized state and is idempotent.
static void cu_release_all()
{
  if(gCU_dDh && p_cuMemFree)    { p_cuMemFree(gCU_dDh);    gCU_dDh = 0; }
  if(gCU_dStage && p_cuMemFree) { p_cuMemFree(gCU_dStage); gCU_dStage = 0; }
  if(gCU_Mod && p_cuModuleUnload) { p_cuModuleUnload(gCU_Mod); gCU_Mod = 0; }
  if(gCU_Ctx && p_cuCtxDestroy) { p_cuCtxDestroy(gCU_Ctx); gCU_Ctx = 0; }

  gCU_Ready = 0;

  // unload the DLLs last; the function pointers above become dangling
  // after this, but they are only reached through the handle guards
  if(gNVRTC_Dll) { FreeLibrary(gNVRTC_Dll); gNVRTC_Dll = 0; }
  if(gCU_Dll)    { FreeLibrary(gCU_Dll);    gCU_Dll = 0; }
}

// Dynamically resolve all CUDA Driver API and NVRTC entry points.
// Returns 1 when every mandatory symbol was found, else 0 after
// releasing anything partially loaded. cuDeviceComputeCapability is
// deliberately NOT in the mandatory set (it is a legacy API); its
// absence is tolerated at the call site in cu_init().
static int cu_load()
{
  // CUDA driver
  gCU_Dll = LoadLibraryA("nvcuda.dll");
  if(!gCU_Dll) return 0;  // no NVIDIA driver installed

  p_cuInit = (PFN_cuInit)sym(gCU_Dll, "cuInit");
  p_cuDeviceGetCount = (PFN_cuDeviceGetCount)sym(gCU_Dll, "cuDeviceGetCount");
  p_cuDeviceGet = (PFN_cuDeviceGet)sym(gCU_Dll, "cuDeviceGet");
  p_cuDeviceComputeCapability = (PFN_cuDeviceComputeCapability)sym(gCU_Dll, "cuDeviceComputeCapability");
  // _v2 suffixed names are the current-ABI exports of the driver DLL
  p_cuCtxCreate = (PFN_cuCtxCreate_v2)sym(gCU_Dll, "cuCtxCreate_v2");
  p_cuCtxDestroy = (PFN_cuCtxDestroy_v2)sym(gCU_Dll, "cuCtxDestroy_v2");

  p_cuModuleLoadDataEx = (PFN_cuModuleLoadDataEx)sym(gCU_Dll, "cuModuleLoadDataEx");
  p_cuModuleGetFunction = (PFN_cuModuleGetFunction)sym(gCU_Dll, "cuModuleGetFunction");
  p_cuModuleUnload = (PFN_cuModuleUnload)sym(gCU_Dll, "cuModuleUnload");

  p_cuMemAlloc = (PFN_cuMemAlloc_v2)sym(gCU_Dll, "cuMemAlloc_v2");
  p_cuMemFree = (PFN_cuMemFree_v2)sym(gCU_Dll, "cuMemFree_v2");
  p_cuMemcpyHtoD = (PFN_cuMemcpyHtoD_v2)sym(gCU_Dll, "cuMemcpyHtoD_v2");
  p_cuMemcpyDtoH = (PFN_cuMemcpyDtoH_v2)sym(gCU_Dll, "cuMemcpyDtoH_v2");
  p_cuLaunchKernel = (PFN_cuLaunchKernel)sym(gCU_Dll, "cuLaunchKernel");
  p_cuCtxSynchronize = (PFN_cuCtxSynchronize)sym(gCU_Dll, "cuCtxSynchronize");

  // all driver symbols except cuDeviceComputeCapability are required
  if(!p_cuInit || !p_cuDeviceGetCount || !p_cuDeviceGet || !p_cuCtxCreate || !p_cuCtxDestroy ||
     !p_cuModuleLoadDataEx || !p_cuModuleGetFunction || !p_cuModuleUnload ||
     !p_cuMemAlloc || !p_cuMemFree || !p_cuMemcpyHtoD || !p_cuMemcpyDtoH ||
     !p_cuLaunchKernel || !p_cuCtxSynchronize)
  {
    cu_release_all();
    return 0;
  }

  // NVRTC
  gNVRTC_Dll = load_nvrtc_any();
  if(!gNVRTC_Dll)
  {
#if CU_DEBUG
    printf("\n[CUDA] NVRTC dll not found -> CPU fallback");
#endif
    cu_release_all();
    return 0;
  }

  p_nvrtcCreateProgram = (PFN_nvrtcCreateProgram)sym(gNVRTC_Dll, "nvrtcCreateProgram");
  p_nvrtcCompileProgram = (PFN_nvrtcCompileProgram)sym(gNVRTC_Dll, "nvrtcCompileProgram");
  p_nvrtcGetProgramLogSize = (PFN_nvrtcGetProgramLogSize)sym(gNVRTC_Dll, "nvrtcGetProgramLogSize");
  p_nvrtcGetProgramLog = (PFN_nvrtcGetProgramLog)sym(gNVRTC_Dll, "nvrtcGetProgramLog");
  p_nvrtcGetPTXSize = (PFN_nvrtcGetPTXSize)sym(gNVRTC_Dll, "nvrtcGetPTXSize");
  p_nvrtcGetPTX = (PFN_nvrtcGetPTX)sym(gNVRTC_Dll, "nvrtcGetPTX");
  p_nvrtcDestroyProgram = (PFN_nvrtcDestroyProgram)sym(gNVRTC_Dll, "nvrtcDestroyProgram");

  // all NVRTC symbols are required
  if(!p_nvrtcCreateProgram || !p_nvrtcCompileProgram || !p_nvrtcGetProgramLogSize ||
     !p_nvrtcGetProgramLog || !p_nvrtcGetPTXSize || !p_nvrtcGetPTX || !p_nvrtcDestroyProgram)
  {
    cu_release_all();
    return 0;
  }

  return 1;
}

// Full GPU bring-up: load DLLs/symbols, init the driver, create a
// context on device 0, compile gCU_Source to PTX with NVRTC, load the
// PTX, look up the kernel, and allocate the two device buffers.
// Returns 1 on success (gCU_Ready set); on ANY failure releases
// everything via cu_release_all() and returns 0 so the caller falls
// back to the CPU path.
static int cu_init()
{
  if(!cu_load()) return 0;

  CUresult e = p_cuInit(0);
  if(e != CUDA_SUCCESS) { cu_release_all(); return 0; }

  int count = 0;
  e = p_cuDeviceGetCount(&count);
  if(e != CUDA_SUCCESS || count <= 0) { cu_release_all(); return 0; }

  e = p_cuDeviceGet(&gCU_Device, 0);  // always uses the first device
  if(e != CUDA_SUCCESS) { cu_release_all(); return 0; }

  // legacy query, optional symbol; maj/min stay 0 when unavailable
  int maj = 0, min = 0;
  if(p_cuDeviceComputeCapability)
    p_cuDeviceComputeCapability(&maj, &min, gCU_Device);

  e = p_cuCtxCreate(&gCU_Ctx, 0, gCU_Device);
  if(e != CUDA_SUCCESS || !gCU_Ctx) { cu_release_all(); return 0; }

  // Compile kernel to PTX via NVRTC
  nvrtcProgram prog = 0;
  nvrtcResult r = p_nvrtcCreateProgram(&prog, gCU_Source, "dh_kernel.cu", 0, 0, 0);
  if(r != NVRTC_SUCCESS || !prog) { cu_release_all(); return 0; }

  // target the device's compute capability; archOpt[64] is ample for
  // the "compute_MM" pattern written here
  char archOpt[64];
  if(maj <= 0) { maj = 5; min = 2; } // conservative fallback
  sprintf(archOpt, "--gpu-architecture=compute_%d%d", maj, min);

  const char* opts[] = {
    archOpt,
    "--use_fast_math"
  };

  r = p_nvrtcCompileProgram(prog, (int)(sizeof(opts)/sizeof(opts[0])), opts);

  // fetch and print the compile log (also on success: may hold warnings)
  size_t logSize = 0;
  p_nvrtcGetProgramLogSize(prog, &logSize);
  if(logSize > 1)
  {
    char* logbuf = (char*)malloc(logSize + 1);
    if(logbuf)
    {
      logbuf[0] = 0;
      p_nvrtcGetProgramLog(prog, logbuf);
#if CU_DEBUG
      printf("\n[NVRTC] %s", logbuf);
#endif
      free(logbuf);
    }
  }

  if(r != NVRTC_SUCCESS)
  {
    p_nvrtcDestroyProgram(&prog);
    cu_release_all();
    return 0;
  }

  size_t ptxSize = 0;
  r = p_nvrtcGetPTXSize(prog, &ptxSize);
  if(r != NVRTC_SUCCESS || ptxSize < 8) { p_nvrtcDestroyProgram(&prog); cu_release_all(); return 0; }

  char* ptx = (char*)malloc(ptxSize + 1);
  if(!ptx) { p_nvrtcDestroyProgram(&prog); cu_release_all(); return 0; }
  ptx[0] = 0;

  r = p_nvrtcGetPTX(prog, ptx);
  p_nvrtcDestroyProgram(&prog);  // program no longer needed once PTX is out
  if(r != NVRTC_SUCCESS) { free(ptx); cu_release_all(); return 0; }

  // Load PTX into driver
  e = p_cuModuleLoadDataEx(&gCU_Mod, (const void*)ptx, 0, 0, 0);
  free(ptx);
  if(e != CUDA_SUCCESS || !gCU_Mod) { cu_release_all(); return 0; }

  e = p_cuModuleGetFunction(&gCU_Func, gCU_Mod, "dh_kernel");
  if(e != CUDA_SUCCESS || !gCU_Func) { cu_release_all(); return 0; }

  // Allocate device buffers (stage state in, drift out)
  e = p_cuMemAlloc(&gCU_dStage, sizeof(float)*N);
  if(e != CUDA_SUCCESS || !gCU_dStage) { cu_release_all(); return 0; }

  e = p_cuMemAlloc(&gCU_dDh, sizeof(float)*N);
  if(e != CUDA_SUCCESS || !gCU_dDh) { cu_release_all(); return 0; }

  gCU_Ready = 1;
  gCU_UsedCalls = 0;
  gCU_FailedCalls = 0;
  gCU_LastErr = 0;

#if CU_DEBUG
  printf("\n[CUDA] init OK (device cc=%d.%d)", maj, min);
#endif
  return 1;
}

// GPU drift eval (CUDA): evaluates dh = f_theta(G_stage, t) on the
// device and writes the result into G_dh. Host-side it mirrors the CPU
// path's parameter evaluation (eval_u + theta_t), then does
// HtoD copy -> kernel launch -> synchronize -> DtoH copy.
// On any driver error it calls cu_fail(), which clears gCU_Ready so
// the caller (f_theta_vec) re-runs this evaluation on the CPU.
static void f_theta_vec_cuda(var t)
{
  eval_u(t);

  var damp, amp, scale, stiff;
  theta_t(t, &damp, &amp, &scale, &stiff);

  // stage state is double (var) on the host; kernel works in float
  float stageF[N];
  float dhF[N] = {0};

  for(int k=0;k<N;k++) stageF[k] = (float)G_stage[k];

  CUresult e = p_cuMemcpyHtoD(gCU_dStage, stageF, sizeof(float)*N);
  if(e != CUDA_SUCCESS) { cu_fail(e); return; }

  // Kernel params (must match kernel signature)
  float tf = (float)t;
  float dampf  = (float)damp;
  float ampf   = (float)amp;
  float scalef = (float)scale;
  float stifff = (float)stiff;
  int regime = (int)G_Regime;
  float u0 = (float)G_u[0];
  float u1 = (float)G_u[1];
  float u2 = (float)G_u[2];

  // cuLaunchKernel takes pointers to each argument value
  void* params[] = {
    (void*)&gCU_dStage,
    (void*)&gCU_dDh,
    (void*)&tf,
    (void*)&dampf,
    (void*)&ampf,
    (void*)&scalef,
    (void*)&stifff,
    (void*)&regime,
    (void*)&u0,
    (void*)&u1,
    (void*)&u2
  };

  // launch: 1 block of 64 threads is enough for N=48
  unsigned int block = 64;
  unsigned int grid  = 1;

  e = p_cuLaunchKernel(gCU_Func,
                      grid,1,1,
                      block,1,1,
                      0, 0,           // no shared memory, default stream
                      params, 0);
  if(e != CUDA_SUCCESS) { cu_fail(e); return; }

  // synchronize so async execution errors surface here, before DtoH
  e = p_cuCtxSynchronize();
  if(e != CUDA_SUCCESS) { cu_fail(e); return; }

  e = p_cuMemcpyDtoH(dhF, gCU_dDh, sizeof(float)*N);
  if(e != CUDA_SUCCESS) { cu_fail(e); return; }

  for(int k=0;k<N;k++) G_dh[k] = (var)dhF[k];

  gCU_UsedCalls++;

#if CU_DEBUG
  // print a one-time sample as evidence the GPU result is being used
  static int printed = 0;
  if(!printed)
  {
    printed = 1;
    printf("\n[CUDA] dh sample (t=%g): dh[0]=%.6g dh[1]=%.6g dh[2]=%.6g dh[3]=%.6g",
      t, (double)G_dh[0], (double)G_dh[1], (double)G_dh[2], (double)G_dh[3]);
  }
#endif
}
#endif // USE_CUDA

// ===========================================================
//                    CPU drift eval
// ===========================================================
// CPU reference drift: for each component,
//   dh[i] = sum_j W(i,j)*softsign(h[j]+bias_j+input_j)
//         + damp_i*h[i] + softsign(1.8*h[i])*0.15 + forcing_i(t,u)
// Weights/biases are procedural (sinusoids of the indices), not stored.
// Reads the stage state from G_stage and writes the drift into G_dh.
static void f_theta_vec_cpu(var t)
{
  eval_u(t);

  var damp, amp, scale, stiff;
  theta_t(t, &damp, &amp, &scale, &stiff);

  for(int row=0; row<N; row++)
  {
    var ii = (row+1);

    // coupling term: procedural weight matrix times activated state
    var coupling = 0;
    for(int col=0; col<N; col++)
    {
      var jj = (col+1);

      var bj  = sin(0.17*jj) * 0.03;                   // procedural bias
      var inj = G_u[0] * 0.02 * sin(0.07*jj + 0.19*t); // input injection

      var act = softsign(G_stage[col] + bj + inj);

      var ang = (0.013*ii*jj) + (0.11*ii) - (0.07*jj);
      var wgt = sin(ang) * scale;                      // procedural weight

      coupling += wgt * act;
    }

    // external forcing: slow wave, fast wave, direct input terms
    var forcing = sin(t + 0.05*ii) * amp;
    forcing += sin(20.0*t + 0.10*ii) * 0.08;
    forcing += (G_u[1] * 0.10) + (0.05*G_u[2]*sin(0.9*t));

    var hcur = G_stage[row];
    var fb   = softsign(hcur*1.8) * 0.15;   // bounded self-feedback

    // every 8th component gets stiffer damping
    var di = damp;
    if((row & 7) == 0) di *= stiff;

    G_dh[row] = coupling + hcur*di + forcing + fb;
  }
}

// Drift dispatcher: prefer the GPU path while it is healthy. If the
// CUDA call fails it clears gCU_Ready itself, and this evaluation is
// redone on the CPU so no drift value is ever lost.
static void f_theta_vec(var t)
{
#if USE_CUDA
  if(gCU_Ready)
  {
    f_theta_vec_cuda(t);
    if(gCU_Ready) return;  // GPU result is valid
    // fall through: the GPU call failed and disabled itself
  }
#endif
  f_theta_vec_cpu(t);
}

// -------------------- Vector RK step --------------------
static void rk_step_vec(int s, float* Atri, float* b, float* c,
                        var t, var* y, var h, var* yout)
{
  if(s < 1)
  {
    for(int k=0;k<N;k++) yout[k] = y[k];
    return;
  }
  if(s > MAXS) s = MAXS;

  for(int i=0;i<s;i++)
    for(int k=0;k<N;k++)
      G_K[i*N + k] = (fvar)0;

  for(int i=0;i<s;i++)
  {
    for(int k=0;k<N;k++)
    {
      var acc = y[k];
      for(int j=0;j<i;j++)
        acc += ((var)G_K[j*N + k]) * getA(Atri,i,j) * h;

      G_stage[k] = acc;
    }

    var dt = ((var)c[i]) * h;

    f_theta_vec(t + dt);

    for(int k=0;k<N;k++)
      G_K[i*N + k] = (fvar)G_dh[k];
  }

  for(int k=0;k<N;k++)
  {
    var acc = y[k];
    for(int i=0;i<s;i++)
      acc += ((var)G_K[i*N + k]) * (var)b[i] * h;

    yout[k] = acc;
  }
}

// RMS of the componentwise difference between two N-vectors.
static var err_norm(var* a, var* b)
{
  var ss = 0;
  for(int k=0;k<N;k++)
  {
    var d = a[k] - b[k];
    ss += d*d;
  }
  return sqrt(ss / (var)N);
}

// Root-mean-square of an n-vector.
static var rms_norm(var* a, int n)
{
  var ss = 0;
  for(int k=0;k<n;k++)
    ss += a[k]*a[k];
  return sqrt(ss / (var)n);
}

// -------------------- Tableau permutation --------------------
// Build a stage-permuted copy of a Butcher tableau: output stage i
// takes its b/c entries (and its A couplings) from input stage p[i].
// The output stage count is clamped to [0,MAXS] and every permutation
// index is clamped into [0,s-1].
static void permute_method_into(int s, float* A, float* b, float* c,
                                int* p,
                                int* sOut, float* Aout, float* bout, float* cout)
{
  int stages = s;
  if(stages < 0) stages = 0;
  if(stages > MAXS) stages = MAXS;
  *sOut = stages;

  // start from an all-zero tableau
  for(int k=0;k<TRI_A;k++) Aout[k] = 0;
  for(int k=0;k<MAXS;k++)
  {
    bout[k] = 0;
    cout[k] = 0;
  }

  // permute weights and nodes
  for(int i=0;i<stages;i++)
  {
    int pi = p[i];
    if(pi < 0)  pi = 0;
    if(pi >= s) pi = s-1;
    bout[i] = b[pi];
    cout[i] = c[pi];
  }

  // permute the coupling matrix consistently with the stage order
  for(int i=0;i<stages;i++)
    for(int j=0;j<i;j++)
    {
      int pi = p[i];
      int pj = p[j];
      if(pi < 0)  pi = 0;
      if(pj < 0)  pj = 0;
      if(pi >= s) pi = s-1;
      if(pj >= s) pj = s-1;
      setA(Aout, i, j, getA(A, pi, pj));
    }
}

// Build the 4-stage explicit "swappable" tableau used as the reference
// method. NOTE(review): despite the rk4 name this is NOT the classical
// RK4 tableau — nodes, couplings and weights are ad-hoc (b sums to 1
// and each row of A sums to its node c, so the method is consistent).
// The irregular weights make stage permutations (permute_method_into)
// numerically visible, which is the point of the A/B/C comparison run
// from main().
static void make_swappable_rk4(int* s, float* Atri, float* b, float* c)
{
  zeroTableau(s, Atri, b, c);
  *s = 4;

  // stage nodes c_i
  c[0] = 0.0f;
  c[1] = 0.30f;
  c[2] = 0.80f;
  c[3] = 1.00f;

  // strictly-lower-triangular couplings (row sums match the nodes)
  setA(Atri, 1, 0, 0.30);
  setA(Atri, 2, 0, 0.80);
  setA(Atri, 3, 1, 0.50);
  setA(Atri, 3, 2, 0.50);

  // output weights b_i (sum to 1)
  b[0] = 0.10f;
  b[1] = 0.20f;
  b[2] = 0.30f;
  b[3] = 0.40f;
}

// Diffusion term (Euler-Maruyama): per-component Gaussian noise scaled
// by sqrt(dt), with input/regime-modulated volatility and a gain that
// grows with |y[k]| (multiplicative noise).
static void apply_diffusion(var t, var dt, var* y)
{
  eval_u(t);

  var sigma = G_SigmaBase * (1.0 + 0.6*fabs(G_u[0]) + 0.3*fabs(G_u[2]));
  if(G_Regime) sigma *= 1.25;

  var sqdt = sqrt(dt);

  for(int k=0;k<N;k++)
  {
    var gain = 1.0 + (fabs(y[k]) * 0.15);
    y[k] += sigma * gain * sqdt * rng_norm();
  }
}

// Adaptive step-doubling integrator for the hybrid SDE:
//  - deterministic drift advanced with the given RK tableau,
//  - local error estimated as RMS(full step vs two half steps),
//  - on acceptance the (more accurate) two-half-step state is kept,
//    then regime switching, diffusion, jumps and observation noise
//    are applied on top,
//  - several budgets can terminate the run early.
// Outputs: final state yOut, accepted/rejected step counts, smallest
// dt attempted, and stopReason: 0=tEnd reached, 1=steady state,
// 2=blow-up, 3=reject budget, 4=dt-floor budget.
static void integrate_adaptive_sde(int s, float* A, float* b, float* c,
                                   var t0, var* y0, var tEnd,
                                   unsigned int seed,
                                   var* yOut,
                                   int* stepsOut, int* rejectsOut,
                                   var* dtMinOut,
                                   int* stopReasonOut)
{
  rng_seed(seed);
  G_Regime = 0;

  var t = t0;
  var dt = DT_INIT;
  var dtMinSeen = 1e9;  // stays 1e9 if no step is ever attempted

  for(int k=0;k<N;k++) G_ycur[k] = y0[k];

  int steps = 0, rejects = 0, iters = 0, dtMinHits = 0, steadyCount = 0;
  int stopReason = 0;

  while(t < tEnd)
  {
    // cheap throttle: re-check the memory budget every 256 iterations
    if((iters & 255) == 0) shed_if_needed();
    iters++;

    // clamp dt to the allowed window and to the remaining interval
    if(dt < DT_MIN) dt = DT_MIN;
    if(dt > G_DtMax) dt = G_DtMax;
    if(t + dt > tEnd) dt = tEnd - t;

    if(dt < dtMinSeen) dtMinSeen = dt;
    if(dt <= DT_MIN) dtMinHits++;

    if(dtMinHits > G_DtMinHitMax) { stopReason = 4; break; }

    // step doubling: one full step vs two half steps from the same state
    rk_step_vec(s, A, b, c, t, G_ycur, dt, G_full);

    var halfdt = dt * 0.5;
    rk_step_vec(s, A, b, c, t,        G_ycur, halfdt, G_mid);
    rk_step_vec(s, A, b, c, t+halfdt, G_mid,  halfdt, G_half);

    var e = err_norm(G_half, G_full);

    // accept when within tolerance, or force-accept at the dt floor
    if(e <= G_Tol || dt <= DT_MIN)
    {
      // keep the more accurate two-half-step solution
      for(int k=0;k<N;k++) G_ycur[k] = G_half[k];

      eval_u(t);
      update_regime(t, dt);

      // stochastic parts act on the accepted state
      apply_diffusion(t, dt, G_ycur);
      apply_jump_if_needed(t, dt, G_ycur);

      observe_from_hidden(t + dt, G_ycur);

      // blow-up guard on both hidden state and observations
      var yr = rms_norm(G_ycur, N);
      var xr = rms_norm(G_xobs, NOBS);
      if(yr > G_StateLimitRMS || xr > G_ObsLimitRMS)
      {
        stopReason = 2;
        t += dt; steps++;
        break;
      }

      // steady-state test: drift RMS below eps for G_SteadyNeed
      // consecutive accepted steps
      for(int k=0;k<N;k++) G_stage[k] = G_ycur[k];
      f_theta_vec(t + dt);
      var dr = rms_norm(G_dh, N);

      if(dr < G_SteadyEps) steadyCount++; else steadyCount = 0;

      t += dt; steps++;
      if(steadyCount >= G_SteadyNeed) { stopReason = 1; break; }

      // standard controller: dt *= 0.9*(tol/e)^(1/5), factor in [0.5,2]
      if(e < 1e-16) dt *= 2.0;
      else
      {
        var fac = pow(G_Tol / e, 0.2) * 0.9;
        if(fac < 0.5) fac = 0.5;
        if(fac > 2.0) fac = 2.0;
        dt *= fac;
      }
    }
    else
    {
      // reject: halve the step and retry, bounded by G_MaxRejects
      rejects++;
      if(rejects > G_MaxRejects) { stopReason = 3; break; }
      dt *= 0.5;
    }
  }

  for(int k=0;k<N;k++) yOut[k] = G_ycur[k];
  *stepsOut = steps;
  *rejectsOut = rejects;
  *dtMinOut = dtMinSeen;
  *stopReasonOut = stopReason;
}

// self-test: CPU vs CUDA drift numbers
// Compare the CPU and CUDA drift at the initial state (t=0.5) and
// print the RMS difference plus the first few components of each.
// A small nonzero RMS is expected: the kernel runs in float with
// fast-math intrinsics while the CPU path runs in double.
// Side effect: overwrites G_stage and G_dh.
static void drift_selftest()
{
  for(int k=0;k<N;k++) G_stage[k] = Y0[k];
  var ttest = 0.5;

  var dh_cpu[N];
  f_theta_vec_cpu(ttest);
  for(int k=0;k<N;k++) dh_cpu[k] = G_dh[k];

#if USE_CUDA
  if(gCU_Ready)
  {
    // re-evaluate the same stage state on the GPU
    f_theta_vec_cuda(ttest);
    var dh_gpu[N];
    for(int k=0;k<N;k++) dh_gpu[k] = G_dh[k];

    var s = 0;
    for(int k=0;k<N;k++) { var d = dh_gpu[k] - dh_cpu[k]; s += d*d; }
    var rms = sqrt(s / (var)N);

    printf("\n[SelfTest] dh CPU vs CUDA @t=%g: RMS diff = %.3e", ttest, (double)rms);
    printf("\n[SelfTest] CPU dh[0..3]=%.6g %.6g %.6g %.6g",
      (double)dh_cpu[0], (double)dh_cpu[1], (double)dh_cpu[2], (double)dh_cpu[3]);
    printf("\n[SelfTest] CUDA dh[0..3]=%.6g %.6g %.6g %.6g",
      (double)dh_gpu[0], (double)dh_gpu[1], (double)dh_gpu[2], (double)dh_gpu[3]);
  }
  else
  {
    printf("\n[SelfTest] CUDA not ready -> skipped GPU compare");
  }
#endif
}

// -------------------- Entry point --------------------
// Zorro script entry point. Brings up CUDA (optional), resets all
// runtime knobs, builds three Butcher tableaus — (A) reference,
// (B) consistently stage-permuted, (C) inconsistently permuted
// (b/c swapped but A left alone, a deliberate "bug") — then runs the
// same seeded SDE integration with each and reports step statistics
// and RMS differences against the reference.
DLLFUNC int main()
{
#if USE_CUDA
  // try the GPU path; any failure leaves the whole run on the CPU
  if(cu_init())
    printf("\nCUDA: enabled (Driver API + NVRTC)");
  else
    printf("\nCUDA: not available -> CPU fallback");
#endif

  // reset knobs (globals persist across Zorro re-entries)
  G_ShedTriggered = 0;
  G_Tol   = TOL;
  G_DtMax = DT_MAX;

  G_MaxRejects  = 2000000;
  G_DtMinHitMax = 500000;

  G_SteadyEps     = 1e-7;
  G_SteadyNeed    = 250;
  G_StateLimitRMS = 1e6;
  G_ObsLimitRMS   = 1e6;

  G_SigmaBase       = 0.02;
  G_JumpLambdaBase  = 0.20;
  G_JumpMagBase     = 0.20;
  G_RegRate01       = 0.15;
  G_RegRate10       = 0.10;

  // (A) reference tableau
  int   S = 0;
  float A[TRI_A];
  float b[MAXS];
  float c[MAXS];
  make_swappable_rk4(&S, A, b, c);

  // (B) "safe" refactor: b, c AND A permuted consistently
  int   SB = 0;
  float AB[TRI_A];
  float bB[MAXS];
  float cB[MAXS];

  int p[MAXS];
  p[0]=0; p[1]=2; p[2]=1; p[3]=3;
  permute_method_into(S, A, b, c, p, &SB, AB, bB, cB);

  // (C) "buggy" refactor: b and c permuted but A copied unpermuted,
  // giving an inconsistent tableau — expected to diverge from (A)
  int   SW = S;
  float AW[TRI_A];
  float bW[MAXS];
  float cW[MAXS];

  for(int i=0;i<TRI_A;i++) AW[i] = A[i];
  for(int i=0;i<MAXS;i++)  { bW[i]=0; cW[i]=0; }
  for(int i=0;i<SW;i++)
  {
    int pi = p[i];
    bW[i] = b[pi];
    cW[i] = c[pi];
  }

  // deterministic pseudo-random initial state
  for(int k=0;k<N;k++)
    Y0[k] = 0.2*sin(0.31*(k+1)) + 0.15*cos(0.17*(k+1))
          + 0.05*sin(0.07*(k+1)*(k+1));

  var t0 = 0.0;

  // wall-clock based seed: date mixed with the millisecond fraction
  var now = wdate(NOW);
  unsigned int MySeed = (unsigned int)utm(now);
  var frac = now - floor(now);
  var ms = frac * 86400000.0;
  MySeed = 1664525u * MySeed + (unsigned int)ms + 1013904223u;
  if(MySeed == 0) MySeed = 1;
  printf("\nMySeed=%u", MySeed);

  drift_selftest();

  int stepsRef=0,  rejRef=0;   var dtMinRef=0;  int stopRef=0;
  int stepsSafe=0, rejSafe=0;  var dtMinSafe=0; int stopSafe=0;
  int stepsBug=0,  rejBug=0;   var dtMinBug=0;  int stopBug=0;

  // same seed and initial state for all three runs -> identical noise
  integrate_adaptive_sde(S,  A,  b,  c,  t0, Y0, T_END, MySeed, Yref,
                         &stepsRef,  &rejRef,  &dtMinRef,  &stopRef);

  integrate_adaptive_sde(SB, AB, bB, cB, t0, Y0, T_END, MySeed, Ysafe,
                         &stepsSafe, &rejSafe, &dtMinSafe, &stopSafe);

  integrate_adaptive_sde(SW, AW, bW, cW, t0, Y0, T_END, MySeed, Ybug,
                         &stepsBug,  &rejBug,  &dtMinBug,  &stopBug);

  var diffSafe = err_norm(Ysafe, Yref);
  var diffBug  = err_norm(Ybug,  Yref);

  printf("\n\nNeural SDE / Hybrid ODE integration (N=%d) to T=%g baseTol=%g", N, T_END, TOL);

  printf("\nA) Reference tableau:");
  printf("\n   accepted=%d rejected=%d minDt=%g stop=%d", stepsRef, rejRef, dtMinRef, stopRef);

  printf("\nB) Safe stage-refactor:");
  printf("\n   accepted=%d rejected=%d minDt=%g stop=%d", stepsSafe, rejSafe, dtMinSafe, stopSafe);
  printf("\n   diff to reference (RMS) = %.3e", (double)diffSafe);

  printf("\nC) Buggy stage-refactor:");
  printf("\n   accepted=%d rejected=%d minDt=%g stop=%d", stepsBug, rejBug, dtMinBug, stopBug);
  printf("\n   diff to reference (RMS) = %.3e", (double)diffBug);

#if USE_CUDA
  printf("\n\nCUDA usage: gpu_calls=%d failed_calls=%d last_err=%d",
         gCU_UsedCalls, gCU_FailedCalls, (int)gCU_LastErr);
  cu_release_all();
#endif

  printf("\n\nStop codes: 0=tEnd, 1=steady-state, 2=event(blow-up), 3=rejectBudget, 4=dtMinBudget\n");

  return quit("done");
}

// Last edited by TipmyPip; 4 hours ago.