// demo_neuralODE_complex_382A_adaptive_main_cuda_nosdk.cpp (Zorro64 C++)
//
// Headerless CUDA (Driver API) + NVRTC runtime compilation (no cuda.h, no nvrtc.h, no .lib).
// - Loads nvcuda.dll and nvrtc64_*.dll dynamically.
// - Compiles CUDA C kernel to PTX at runtime, loads PTX, launches kernel.
// - Prints numerical outputs proving CUDA drift is computed, reports GPU usage.
// - If CUDA/NVRTC missing or any failure -> CPU fallback.
#include <zorro.h>
#include <math.h>
#include <windows.h>
#define USE_CUDA 1
#define CU_DEBUG 1 // set 0 to reduce prints
#define MAXS 4 // max Runge-Kutta stages supported by the tableau buffers
#define TRI_A 6 // packed strictly-lower triangle of A: MAXS*(MAXS-1)/2 entries
// -------------------- Dimensions --------------------
#define N 48 // hidden state dimension (also hardcoded as 48 in the CUDA kernel source)
#define K_SIZE 192 // stage-derivative scratch size: MAXS*N
#define U_DIM 3 // exogenous input channels
#define NOBS 12 // observation dimension
// -------------------- Base knobs --------------------
#define T_END 20.0 // integration horizon
#define DT_INIT 0.20 // initial step size
#define DT_MIN 1e-6 // hard lower bound on dt
#define DT_MAX 0.50 // hard upper bound on dt (runtime copy in G_DtMax)
#define TOL 1e-6 // base local error tolerance (runtime copy in G_Tol)
// -------------------- Shedding --------------------
#define MEM_BUDGET_MB 2000 // memory budget before shed_if_needed() loosens the knobs
static int G_ShedTriggered = 0; // one-shot latch: shedding happens at most once per run
// runtime knobs (may be loosened once by shed_if_needed)
static var G_Tol = TOL;
static var G_DtMax = DT_MAX;
// budgets
static int G_MaxRejects = 2000000; // abort integration after this many rejected steps
static int G_DtMinHitMax = 500000; // abort after this many steps clamped at DT_MIN
// termination knobs
static var G_SteadyEps = 1e-7; // drift RMS below this counts as "steady"
static int G_SteadyNeed = 250; // consecutive steady steps required to stop early
static var G_StateLimitRMS = 1e6; // blow-up guard on state RMS
static var G_ObsLimitRMS = 1e6; // blow-up guard on observation RMS
// SDE / hybrid knobs
static var G_SigmaBase = 0.02; // base diffusion strength
static var G_JumpLambdaBase = 0.20; // base jump intensity
static var G_JumpMagBase = 0.20; // base jump magnitude
static var G_RegRate01 = 0.15; // regime 0 -> 1 switching rate
static var G_RegRate10 = 0.10; // regime 1 -> 0 switching rate
#define TIGHT_MEM 1
#ifdef TIGHT_MEM
typedef float fvar; // stage-derivative storage type; float halves G_K memory
#else
typedef var fvar;
#endif
// -------------------- Global buffers --------------------
static fvar G_K[K_SIZE] = { 0 }; // RK stage derivatives, laid out MAXS x N
static var G_stage[N] = { 0 }; // stage state read implicitly by f_theta_vec()
static var G_dh[N] = { 0 }; // drift output written by f_theta_vec()
static var G_ycur[N] = { 0 }; // current state inside the integrator
static var G_full[N] = { 0 }; // one-full-step solution (error estimate only)
static var G_half[N] = { 0 }; // two-half-step solution (accepted on success)
static var G_mid[N] = { 0 }; // midpoint state after the first half step
static var Y0[N] = { 0 }; // shared initial condition
static var Yref[N] = { 0 }; // result: reference tableau (variant A)
static var Ysafe[N] = { 0 }; // result: consistently permuted tableau (variant B)
static var Ybug[N] = { 0 }; // result: inconsistently permuted tableau (variant C)
static var G_xhat[NOBS] = { 0 }; // noise-free observations
static var G_xobs[NOBS] = { 0 }; // noisy observations
static var G_u[U_DIM] = { 0 }; // exogenous inputs, filled by eval_u()
// -------------------- RNG --------------------
static unsigned int G_Rng = 1; // LCG state; never 0 (see rng_seed)
// Seed the global LCG; a zero seed is remapped to 1 so the stream never
// degenerates into a fixed point.
static void rng_seed(unsigned int s)
{
    G_Rng = (s == 0) ? 1u : s;
}
// Uniform draw from the open interval (0,1) using a 32-bit LCG
// (Numerical Recipes constants). The 24 bits below the top byte form the
// mantissa; the +1 keeps the result strictly positive so log(u) in the
// Box-Muller transform is always finite.
static var rng_u01()
{
    G_Rng = G_Rng * 1664525u + 1013904223u;
    unsigned int mant = (G_Rng >> 8) & 0x00FFFFFFu;
    return ((var)mant + 1.0) / 16777217.0;
}
// Standard normal deviate via the Box-Muller transform (cosine branch).
// Consumes exactly two uniform draws in a fixed order, so replaying the
// same seed reproduces the same stream.
static var rng_norm()
{
    var radiusU = rng_u01();
    var angleU = rng_u01();
    var radius = sqrt(-2.0 * log(radiusU));
    return radius * cos(6.283185307179586 * angleU);
}
// -------------------- Utility --------------------
static int memMB()
{
var m = memory(0);
if(!(m > 0)) return 0;
return (int)(m/(1024*1024));
}
// One-shot load shedding: once the memory budget is exceeded, loosen the
// tolerance (x10), halve the max step size, and halve the reject/dt-min
// budgets (floored at 10000) to reduce pressure. Never triggers twice.
static void shed_if_needed()
{
    if(G_ShedTriggered) return;
    if(memMB() < MEM_BUDGET_MB) return;
    G_ShedTriggered = 1;
    G_Tol *= 10.0;
    G_DtMax *= 0.5;
    if(G_DtMax < DT_MIN) G_DtMax = DT_MIN;
    G_MaxRejects = (int)(G_MaxRejects/2);
    if(G_MaxRejects < 10000) G_MaxRejects = 10000;
    G_DtMinHitMax = (int)(G_DtMinHitMax/2);
    if(G_DtMinHitMax < 10000) G_DtMinHitMax = 10000;
}
// -------------------- Triangular indexing --------------------
// Index into the packed strictly-lower-triangular A matrix: row i (i >= 1)
// starts at offset i*(i-1)/2 and holds columns 0..i-1.
static int Aidx(int i, int j)
{
    int rowBase = (i * (i - 1)) / 2;
    return rowBase + j;
}
// Read A[i][j] from packed lower-triangular storage; entries on or above
// the diagonal are implicitly zero (explicit RK tableau).
static var getA(float* Atri, int i, int j)
{
    if(j >= i) return 0;
    return (var)Atri[Aidx(i,j)];
}
// Write A[i][j] into packed lower-triangular storage; writes on or above
// the diagonal are silently ignored (explicit RK tableau).
static void setA(float* Atri, int i, int j, var v)
{
    if(j < i)
        Atri[Aidx(i,j)] = (float)v;
}
// Reset a Butcher tableau: stage count to 0, packed A, b and c all zeroed.
static void zeroTableau(int* s, float* Atri, float* b, float* c)
{
    int i;
    *s = 0;
    for(i=0;i<TRI_A;i++)
        Atri[i] = 0;
    for(i=0;i<MAXS;i++)
    {
        b[i] = 0;
        c[i] = 0;
    }
}
// -------------------- Activation --------------------
// Smooth saturating activation z/(1+|z|), output in (-1,1).
static var softsign(var z)
{
    var den = 1.0 + fabs(z);
    return z / den;
}
// -------------------- Exogenous input u(t) --------------------
static int G_Regime = 0;
// Fill G_u[] with the 3-channel exogenous input u(t).
// Channel 0 gains a constant +0.35 bias while regime 1 is active.
static void eval_u(var t)
{
    var base0 = sin(0.11*t) * 0.8;
    if(G_Regime) base0 += 0.35;
    G_u[0] = base0;
    G_u[1] = cos(0.37*t) * 0.6;
    G_u[2] = sin(2.30*t) * 0.3;
}
// -------------------- theta(t) --------------------
// Time-varying model parameters theta(t): damping, forcing amplitude,
// coupling scale, and a fixed stiffness multiplier. Base values are
// modulated by a slow drift term, the exogenous inputs in G_u (eval_u
// must have been called for this t), and the current regime.
static void theta_t(var t, var* dampOut, var* ampOut, var* scaleOut, var* stiffOut)
{
    var drift = 0.25 * sin(0.03*t);
    var u0 = G_u[0];
    var u1 = G_u[1];
    var damp = -0.40;
    var amp = 0.35;
    var scale = 0.06;
    if(G_Regime)
    {
        // regime 1: slightly hotter dynamics
        amp *= 1.15;
        scale *= 1.10;
        damp *= 1.05;
    }
    *ampOut = amp * (1.0 + 0.30*drift + 0.15*u0);
    *scaleOut = scale * (1.0 + 0.20*drift + 0.10*u1);
    *dampOut = damp * (1.0 + 0.15*drift - 0.10*u0);
    *stiffOut = 2.8; // applied to every 8th state component (see drift eval)
}
// -------------------- Regime switching --------------------
// Two-state Markov regime switch, sampled once per accepted step.
// Switch probability over dt is p = 1 - exp(-rate*dt); rates are inflated
// by the input "stress" |u0| + 0.5|u1| (eval_u must be current for t).
// Consumes exactly one rng_u01() draw per call, so the call order inside
// the integrator fixes the shared random stream.
static void update_regime(var t, var dt)
{
var u0 = G_u[0];
var u1 = G_u[1];
var stress = fabs(u0) + 0.5*fabs(u1);
var r01 = G_RegRate01 * (1.0 + 0.8*stress); // 0 -> 1 rate
var r10 = G_RegRate10 * (1.0 + 0.5*stress); // 1 -> 0 rate
var p;
if(G_Regime == 0)
{
p = 1.0 - exp(-r01*dt);
if(rng_u01() < p) G_Regime = 1;
}
else
{
p = 1.0 - exp(-r10*dt);
if(rng_u01() < p) G_Regime = 0;
}
}
// -------------------- Jumps --------------------
// Jump component of the hybrid SDE. With probability 1 - exp(-lambda*dt)
// one heavy-tailed (Cauchy-like, tan of a uniform) jump size is drawn and
// clipped to [-6,6]; it is then added, scaled and slightly tilted per
// component, to a random ~25% subset of the state vector.
// NOTE(review): the number of rng_u01() draws consumed differs between the
// no-jump branch (1 draw) and the jump branch (2+N draws), so paths with
// the same seed diverge in RNG phase after the first jump decision.
static void apply_jump_if_needed(var t, var dt, var* y)
{
var u0 = G_u[0];
var lam = G_JumpLambdaBase * (1.0 + 0.7*fabs(u0)); // input-modulated intensity
if(G_Regime) lam *= 1.25;
var p = 1.0 - exp(-lam*dt); // jump probability over this step
if(rng_u01() >= p) return;
var uu = rng_u01() - 0.5;
var j = tan(3.141592653589793 * uu); // Cauchy-distributed jump size
if(j > 6.0) j = 6.0;
if(j < -6.0) j = -6.0; // clip the heavy tail
var mag = G_JumpMagBase * (1.0 + 0.5*fabs(u0));
if(G_Regime) mag *= 1.2;
for(int k=0;k<N;k++)
{
if(rng_u01() < 0.25) // each component jumps independently
{
var delta = mag * j;
delta *= (1.0 + 0.01*(k+1)); // slight per-component scaling
y[k] += delta;
}
}
}
// -------------------- Observations --------------------
// Map the hidden state h to NOBS observations: mix component k with its
// partner k+NOBS, add a small periodic bias, squash through softsign into
// G_xhat, and add i.i.d. Gaussian measurement noise into G_xobs.
// Consumes one rng_norm() draw per observation, in index order.
static void observe_from_hidden(var t, var* h)
{
    int k;
    for(k=0;k<NOBS;k++)
    {
        var z = h[k] + 0.30*h[k + NOBS] + 0.05*sin(0.17*t + 0.3*(k+1));
        var clean = z / (1.0 + fabs(z));
        G_xhat[k] = clean;
        G_xobs[k] = clean + 0.01*rng_norm();
    }
}
// ===========================================================
// CUDA (Driver API + NVRTC)
// ===========================================================
#if USE_CUDA
// --- minimal CUDA driver types ---
// Opaque handle/status types mirroring cuda.h, kept headerless on purpose.
typedef int CUresult;
typedef int CUdevice;
typedef struct CUctx_st* CUcontext;
typedef struct CUmod_st* CUmodule;
typedef struct CUfunc_st* CUfunction;
// Device pointers are 64-bit values, matching the *_v2 driver ABI.
typedef unsigned long long CUdeviceptr;
#define CUDA_SUCCESS 0
// function pointer types (nvcuda.dll); __stdcall is accepted but ignored on x64
typedef CUresult (__stdcall *PFN_cuInit)(unsigned int);
typedef CUresult (__stdcall *PFN_cuDeviceGetCount)(int*);
typedef CUresult (__stdcall *PFN_cuDeviceGet)(CUdevice*, int);
typedef CUresult (__stdcall *PFN_cuDeviceComputeCapability)(int*, int*, CUdevice);
typedef CUresult (__stdcall *PFN_cuCtxCreate_v2)(CUcontext*, unsigned int, CUdevice);
typedef CUresult (__stdcall *PFN_cuCtxDestroy_v2)(CUcontext);
typedef CUresult (__stdcall *PFN_cuModuleLoadDataEx)(CUmodule*, const void*, unsigned int, void*, void*);
typedef CUresult (__stdcall *PFN_cuModuleGetFunction)(CUfunction*, CUmodule, const char*);
typedef CUresult (__stdcall *PFN_cuModuleUnload)(CUmodule);
typedef CUresult (__stdcall *PFN_cuMemAlloc_v2)(CUdeviceptr*, size_t);
typedef CUresult (__stdcall *PFN_cuMemFree_v2)(CUdeviceptr);
typedef CUresult (__stdcall *PFN_cuMemcpyHtoD_v2)(CUdeviceptr, const void*, size_t);
typedef CUresult (__stdcall *PFN_cuMemcpyDtoH_v2)(void*, CUdeviceptr, size_t);
typedef CUresult (__stdcall *PFN_cuLaunchKernel)(
CUfunction,
unsigned int, unsigned int, unsigned int, // grid dims x,y,z
unsigned int, unsigned int, unsigned int, // block dims x,y,z
unsigned int, // dynamic shared memory bytes
void*, // stream (0 = default)
void**, // kernelParams
void**); // extra launch options
typedef CUresult (__stdcall *PFN_cuCtxSynchronize)(void);
// --- minimal NVRTC types ---
typedef int nvrtcResult;
typedef struct _nvrtcProgram* nvrtcProgram;
#define NVRTC_SUCCESS 0
typedef nvrtcResult (__stdcall *PFN_nvrtcCreateProgram)(
nvrtcProgram*,
const char*, // CUDA C source
const char*, // program name
int, // numHeaders
const char* const*, // headers
const char* const*); // includeNames
typedef nvrtcResult (__stdcall *PFN_nvrtcCompileProgram)(nvrtcProgram, int, const char* const*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetProgramLogSize)(nvrtcProgram, size_t*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetProgramLog)(nvrtcProgram, char*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetPTXSize)(nvrtcProgram, size_t*);
typedef nvrtcResult (__stdcall *PFN_nvrtcGetPTX)(nvrtcProgram, char*);
typedef nvrtcResult (__stdcall *PFN_nvrtcDestroyProgram)(nvrtcProgram*);
// loaded DLLs
static HMODULE gCU_Dll = 0;
static HMODULE gNVRTC_Dll = 0;
// CUDA entry points (resolved in cu_load)
static PFN_cuInit p_cuInit = 0;
static PFN_cuDeviceGetCount p_cuDeviceGetCount = 0;
static PFN_cuDeviceGet p_cuDeviceGet = 0;
static PFN_cuDeviceComputeCapability p_cuDeviceComputeCapability = 0;
static PFN_cuCtxCreate_v2 p_cuCtxCreate = 0;
static PFN_cuCtxDestroy_v2 p_cuCtxDestroy = 0;
static PFN_cuModuleLoadDataEx p_cuModuleLoadDataEx = 0;
static PFN_cuModuleGetFunction p_cuModuleGetFunction = 0;
static PFN_cuModuleUnload p_cuModuleUnload = 0;
static PFN_cuMemAlloc_v2 p_cuMemAlloc = 0;
static PFN_cuMemFree_v2 p_cuMemFree = 0;
static PFN_cuMemcpyHtoD_v2 p_cuMemcpyHtoD = 0;
static PFN_cuMemcpyDtoH_v2 p_cuMemcpyDtoH = 0;
static PFN_cuLaunchKernel p_cuLaunchKernel = 0;
static PFN_cuCtxSynchronize p_cuCtxSynchronize = 0;
// NVRTC entry points (resolved in cu_load)
static PFN_nvrtcCreateProgram p_nvrtcCreateProgram = 0;
static PFN_nvrtcCompileProgram p_nvrtcCompileProgram = 0;
static PFN_nvrtcGetProgramLogSize p_nvrtcGetProgramLogSize = 0;
static PFN_nvrtcGetProgramLog p_nvrtcGetProgramLog = 0;
static PFN_nvrtcGetPTXSize p_nvrtcGetPTXSize = 0;
static PFN_nvrtcGetPTX p_nvrtcGetPTX = 0;
static PFN_nvrtcDestroyProgram p_nvrtcDestroyProgram = 0;
// CUDA state
static int gCU_Ready = 0; // 1 while the GPU path is usable (cleared by cu_fail)
static int gCU_UsedCalls = 0; // successful kernel evaluations
static int gCU_FailedCalls = 0; // failed driver calls
static CUresult gCU_LastErr = 0; // last driver error code seen
static CUdevice gCU_Device = 0;
static CUcontext gCU_Ctx = 0;
static CUmodule gCU_Mod = 0;
static CUfunction gCU_Func = 0;
static CUdeviceptr gCU_dStage = 0; // float[N]
static CUdeviceptr gCU_dDh = 0; // float[N]
// CUDA kernel source, compiled to PTX by NVRTC at runtime.
// Mirrors f_theta_vec_cpu() with the state dimension hardcoded as 48 (= N)
// and fast-math float intrinsics (__sinf), so GPU results differ from the
// CPU path only by single-precision / fast-math rounding.
// The parameter list must match the params[] array in f_theta_vec_cuda().
static const char* gCU_Source =
"extern \"C\" __global__ void dh_kernel(const float* stage, float* dh, \n"
" float t, float damp, float amp, float scale, float stiff, int regime, \n"
" float u0, float u1, float u2) \n"
"{ \n"
" int i = (int)(blockIdx.x * blockDim.x + threadIdx.x); \n"
" if(i >= 48) return; \n"
" float ii = (float)(i+1); \n"
" float sum = 0.0f; \n"
" #pragma unroll \n"
" for(int j=0;j<48;j++) \n"
" { \n"
" float jj = (float)(j+1); \n"
" float bj = __sinf(0.17f*jj) * 0.03f; \n"
" float inj = u0 * 0.02f * __sinf(0.07f*jj + 0.19f*t); \n"
" float hj = stage[j]; \n"
" float in = hj + bj + inj; \n"
" float aj = in / (1.0f + fabsf(in)); \n"
" float ang = (0.013f*ii*jj) + (0.11f*ii) - (0.07f*jj); \n"
" float ww = __sinf(ang) * scale; \n"
" sum += ww * aj; \n"
" } \n"
" float forcing = __sinf(t + 0.05f*ii) * amp; \n"
" forcing += __sinf(20.0f*t + 0.10f*ii) * 0.08f; \n"
" forcing += (u1 * 0.10f) + (0.05f*u2*__sinf(0.9f*t)); \n"
" float hi = stage[i]; \n"
" float self = ((hi*1.8f) / (1.0f + fabsf(hi*1.8f))) * 0.15f; \n"
" float di = damp; \n"
" if((i & 7) == 0) di *= stiff; \n"
" float ghi = hi * di; \n"
" dh[i] = sum + ghi + forcing + self; \n"
"} \n";
// Record a CUDA driver error and permanently disable the GPU path for the
// rest of the run; f_theta_vec() checks gCU_Ready and falls back to CPU.
static void cu_fail(CUresult e)
{
    gCU_LastErr = e;
    gCU_FailedCalls++;
    gCU_Ready = 0;
#if CU_DEBUG
    printf("\n[CUDA] ERROR %d -> fallback to CPU", (int)e);
#endif
}
// try load nvrtc dll from a list of common names
// Probe for an NVRTC runtime DLL by versioned name, newest toolkit first.
// Returns the first handle that loads; falls back to the unversioned
// "nvrtc64.dll" (result may still be NULL, caller then uses the CPU path).
static HMODULE load_nvrtc_any()
{
    const char* names[] = {
        "nvrtc64_123_0.dll",
        "nvrtc64_122_0.dll",
        "nvrtc64_121_0.dll",
        "nvrtc64_120_0.dll",
        "nvrtc64_118_0.dll",
        "nvrtc64_117_0.dll",
        "nvrtc64_116_0.dll",
        "nvrtc64_115_0.dll",
        "nvrtc64_114_0.dll",
        "nvrtc64_113_0.dll",
        "nvrtc64_112_0.dll",
        "nvrtc64_111_0.dll",
        "nvrtc64_110_0.dll",
        "nvrtc64_102_0.dll",
        "nvrtc64_101_0.dll",
        "nvrtc64_100_0.dll",
        "nvrtc64_92.dll"
    };
    int count = (int)(sizeof(names)/sizeof(names[0]));
    int i;
    for(i=0;i<count;i++)
    {
        HMODULE hit = LoadLibraryA(names[i]);
        if(hit) return hit;
    }
    // last resort: generic name (some setups provide it)
    return LoadLibraryA("nvrtc64.dll");
}
// GetProcAddress wrapper that tolerates a NULL module handle.
static FARPROC sym(HMODULE h, const char* name)
{
    return h ? GetProcAddress(h, name) : 0;
}
// Release all CUDA/NVRTC resources in reverse acquisition order:
// device buffers -> module -> context -> DLL handles. Every step is
// guarded by both the handle and its resolved function pointer, so this
// is safe (and idempotent) from any partially-initialized state.
static void cu_release_all()
{
if(gCU_dDh && p_cuMemFree) { p_cuMemFree(gCU_dDh); gCU_dDh = 0; }
if(gCU_dStage && p_cuMemFree) { p_cuMemFree(gCU_dStage); gCU_dStage = 0; }
if(gCU_Mod && p_cuModuleUnload) { p_cuModuleUnload(gCU_Mod); gCU_Mod = 0; }
if(gCU_Ctx && p_cuCtxDestroy) { p_cuCtxDestroy(gCU_Ctx); gCU_Ctx = 0; }
gCU_Ready = 0;
if(gNVRTC_Dll) { FreeLibrary(gNVRTC_Dll); gNVRTC_Dll = 0; }
if(gCU_Dll) { FreeLibrary(gCU_Dll); gCU_Dll = 0; }
}
// Load nvcuda.dll and an NVRTC DLL and resolve every entry point used.
// Returns 1 on success; any failure releases everything already loaded
// and returns 0 so the caller falls back to the CPU path.
// cuDeviceComputeCapability is deliberately NOT in the mandatory set:
// it only feeds the NVRTC arch option, which has a fallback (cu_init).
static int cu_load()
{
// CUDA driver
gCU_Dll = LoadLibraryA("nvcuda.dll");
if(!gCU_Dll) return 0;
p_cuInit = (PFN_cuInit)sym(gCU_Dll, "cuInit");
p_cuDeviceGetCount = (PFN_cuDeviceGetCount)sym(gCU_Dll, "cuDeviceGetCount");
p_cuDeviceGet = (PFN_cuDeviceGet)sym(gCU_Dll, "cuDeviceGet");
p_cuDeviceComputeCapability = (PFN_cuDeviceComputeCapability)sym(gCU_Dll, "cuDeviceComputeCapability");
p_cuCtxCreate = (PFN_cuCtxCreate_v2)sym(gCU_Dll, "cuCtxCreate_v2");
p_cuCtxDestroy = (PFN_cuCtxDestroy_v2)sym(gCU_Dll, "cuCtxDestroy_v2");
p_cuModuleLoadDataEx = (PFN_cuModuleLoadDataEx)sym(gCU_Dll, "cuModuleLoadDataEx");
p_cuModuleGetFunction = (PFN_cuModuleGetFunction)sym(gCU_Dll, "cuModuleGetFunction");
p_cuModuleUnload = (PFN_cuModuleUnload)sym(gCU_Dll, "cuModuleUnload");
p_cuMemAlloc = (PFN_cuMemAlloc_v2)sym(gCU_Dll, "cuMemAlloc_v2");
p_cuMemFree = (PFN_cuMemFree_v2)sym(gCU_Dll, "cuMemFree_v2");
p_cuMemcpyHtoD = (PFN_cuMemcpyHtoD_v2)sym(gCU_Dll, "cuMemcpyHtoD_v2");
p_cuMemcpyDtoH = (PFN_cuMemcpyDtoH_v2)sym(gCU_Dll, "cuMemcpyDtoH_v2");
p_cuLaunchKernel = (PFN_cuLaunchKernel)sym(gCU_Dll, "cuLaunchKernel");
p_cuCtxSynchronize = (PFN_cuCtxSynchronize)sym(gCU_Dll, "cuCtxSynchronize");
if(!p_cuInit || !p_cuDeviceGetCount || !p_cuDeviceGet || !p_cuCtxCreate || !p_cuCtxDestroy ||
!p_cuModuleLoadDataEx || !p_cuModuleGetFunction || !p_cuModuleUnload ||
!p_cuMemAlloc || !p_cuMemFree || !p_cuMemcpyHtoD || !p_cuMemcpyDtoH ||
!p_cuLaunchKernel || !p_cuCtxSynchronize)
{
cu_release_all();
return 0;
}
// NVRTC
gNVRTC_Dll = load_nvrtc_any();
if(!gNVRTC_Dll)
{
#if CU_DEBUG
printf("\n[CUDA] NVRTC dll not found -> CPU fallback");
#endif
cu_release_all();
return 0;
}
p_nvrtcCreateProgram = (PFN_nvrtcCreateProgram)sym(gNVRTC_Dll, "nvrtcCreateProgram");
p_nvrtcCompileProgram = (PFN_nvrtcCompileProgram)sym(gNVRTC_Dll, "nvrtcCompileProgram");
p_nvrtcGetProgramLogSize = (PFN_nvrtcGetProgramLogSize)sym(gNVRTC_Dll, "nvrtcGetProgramLogSize");
p_nvrtcGetProgramLog = (PFN_nvrtcGetProgramLog)sym(gNVRTC_Dll, "nvrtcGetProgramLog");
p_nvrtcGetPTXSize = (PFN_nvrtcGetPTXSize)sym(gNVRTC_Dll, "nvrtcGetPTXSize");
p_nvrtcGetPTX = (PFN_nvrtcGetPTX)sym(gNVRTC_Dll, "nvrtcGetPTX");
p_nvrtcDestroyProgram = (PFN_nvrtcDestroyProgram)sym(gNVRTC_Dll, "nvrtcDestroyProgram");
if(!p_nvrtcCreateProgram || !p_nvrtcCompileProgram || !p_nvrtcGetProgramLogSize ||
!p_nvrtcGetProgramLog || !p_nvrtcGetPTXSize || !p_nvrtcGetPTX || !p_nvrtcDestroyProgram)
{
cu_release_all();
return 0;
}
return 1;
}
// Full GPU bring-up: driver init, device 0 selection, context creation,
// NVRTC compilation of gCU_Source to PTX (arch from the device compute
// capability, conservative fallback compute_52), module load, kernel
// lookup, and allocation of the two float[N] device buffers.
// Returns 1 and sets gCU_Ready on success; any failure tears everything
// down via cu_release_all() and returns 0 (CPU fallback).
static int cu_init()
{
if(!cu_load()) return 0;
CUresult e = p_cuInit(0);
if(e != CUDA_SUCCESS) { cu_release_all(); return 0; }
int count = 0;
e = p_cuDeviceGetCount(&count);
if(e != CUDA_SUCCESS || count <= 0) { cu_release_all(); return 0; }
e = p_cuDeviceGet(&gCU_Device, 0);
if(e != CUDA_SUCCESS) { cu_release_all(); return 0; }
// compute capability is optional (only used for the NVRTC arch option)
int maj = 0, min = 0;
if(p_cuDeviceComputeCapability)
p_cuDeviceComputeCapability(&maj, &min, gCU_Device);
e = p_cuCtxCreate(&gCU_Ctx, 0, gCU_Device);
if(e != CUDA_SUCCESS || !gCU_Ctx) { cu_release_all(); return 0; }
// Compile kernel to PTX via NVRTC
nvrtcProgram prog = 0;
nvrtcResult r = p_nvrtcCreateProgram(&prog, gCU_Source, "dh_kernel.cu", 0, 0, 0);
if(r != NVRTC_SUCCESS || !prog) { cu_release_all(); return 0; }
char archOpt[64];
if(maj <= 0) { maj = 5; min = 2; } // conservative fallback
sprintf(archOpt, "--gpu-architecture=compute_%d%d", maj, min);
const char* opts[] = {
archOpt,
"--use_fast_math"
};
r = p_nvrtcCompileProgram(prog, (int)(sizeof(opts)/sizeof(opts[0])), opts);
// fetch the build log regardless of outcome; it may hold warnings
size_t logSize = 0;
p_nvrtcGetProgramLogSize(prog, &logSize);
if(logSize > 1)
{
char* logbuf = (char*)malloc(logSize + 1);
if(logbuf)
{
logbuf[0] = 0;
p_nvrtcGetProgramLog(prog, logbuf);
#if CU_DEBUG
printf("\n[NVRTC] %s", logbuf);
#endif
free(logbuf);
}
}
if(r != NVRTC_SUCCESS)
{
p_nvrtcDestroyProgram(&prog);
cu_release_all();
return 0;
}
size_t ptxSize = 0;
r = p_nvrtcGetPTXSize(prog, &ptxSize);
if(r != NVRTC_SUCCESS || ptxSize < 8) { p_nvrtcDestroyProgram(&prog); cu_release_all(); return 0; }
char* ptx = (char*)malloc(ptxSize + 1);
if(!ptx) { p_nvrtcDestroyProgram(&prog); cu_release_all(); return 0; }
ptx[0] = 0;
r = p_nvrtcGetPTX(prog, ptx);
p_nvrtcDestroyProgram(&prog); // PTX copied out; program no longer needed
if(r != NVRTC_SUCCESS) { free(ptx); cu_release_all(); return 0; }
// Load PTX into driver
e = p_cuModuleLoadDataEx(&gCU_Mod, (const void*)ptx, 0, 0, 0);
free(ptx);
if(e != CUDA_SUCCESS || !gCU_Mod) { cu_release_all(); return 0; }
e = p_cuModuleGetFunction(&gCU_Func, gCU_Mod, "dh_kernel");
if(e != CUDA_SUCCESS || !gCU_Func) { cu_release_all(); return 0; }
// Allocate device buffers
e = p_cuMemAlloc(&gCU_dStage, sizeof(float)*N);
if(e != CUDA_SUCCESS || !gCU_dStage) { cu_release_all(); return 0; }
e = p_cuMemAlloc(&gCU_dDh, sizeof(float)*N);
if(e != CUDA_SUCCESS || !gCU_dDh) { cu_release_all(); return 0; }
gCU_Ready = 1;
gCU_UsedCalls = 0;
gCU_FailedCalls = 0;
gCU_LastErr = 0;
#if CU_DEBUG
printf("\n[CUDA] init OK (device cc=%d.%d)", maj, min);
#endif
return 1;
}
// GPU drift eval (CUDA): computes dh = f_theta(stage, t) with the
// NVRTC-compiled kernel. Reads G_stage[], writes G_dh[]; matches
// f_theta_vec_cpu() up to float/fast-math rounding. On any driver error
// cu_fail() clears gCU_Ready and this returns with G_dh untouched, so the
// dispatcher f_theta_vec() recomputes the same evaluation on the CPU.
static void f_theta_vec_cuda(var t)
{
    eval_u(t);
    var damp, amp, scale, stiff;
    theta_t(t, &damp, &amp, &scale, &stiff); // fixed: second arg was garbled to "&,"
    float stageF[N];
    float dhF[N] = {0};
    for(int k=0;k<N;k++) stageF[k] = (float)G_stage[k];
    CUresult e = p_cuMemcpyHtoD(gCU_dStage, stageF, sizeof(float)*N);
    if(e != CUDA_SUCCESS) { cu_fail(e); return; }
    // Kernel params: order and types must match the dh_kernel signature;
    // each entry is a pointer to the argument value (driver API convention).
    float tf = (float)t;
    float dampf = (float)damp;
    float ampf = (float)amp;
    float scalef = (float)scale;
    float stifff = (float)stiff;
    int regime = (int)G_Regime;
    float u0 = (float)G_u[0];
    float u1 = (float)G_u[1];
    float u2 = (float)G_u[2];
    void* params[] = {
        (void*)&gCU_dStage,
        (void*)&gCU_dDh,
        (void*)&tf,
        (void*)&dampf,
        (void*)&ampf,   // fixed: was garbled to "&f"
        (void*)&scalef,
        (void*)&stifff,
        (void*)&regime, // fixed: "&regime" was mangled to a non-identifier
        (void*)&u0,
        (void*)&u1,
        (void*)&u2
    };
    // launch: 1 block of 64 threads covers N=48 (kernel bounds-checks i>=48)
    unsigned int block = 64;
    unsigned int grid = 1;
    e = p_cuLaunchKernel(gCU_Func,
        grid,1,1,
        block,1,1,
        0, 0,
        params, 0);
    if(e != CUDA_SUCCESS) { cu_fail(e); return; }
    e = p_cuCtxSynchronize(); // surfaces async kernel-execution errors
    if(e != CUDA_SUCCESS) { cu_fail(e); return; }
    e = p_cuMemcpyDtoH(dhF, gCU_dDh, sizeof(float)*N);
    if(e != CUDA_SUCCESS) { cu_fail(e); return; }
    for(int k=0;k<N;k++) G_dh[k] = (var)dhF[k];
    gCU_UsedCalls++;
#if CU_DEBUG
    // print one numeric sample the first time, proving the GPU path ran
    static int printed = 0;
    if(!printed)
    {
        printed = 1;
        printf("\n[CUDA] dh sample (t=%g): dh[0]=%.6g dh[1]=%.6g dh[2]=%.6g dh[3]=%.6g",
            t, (double)G_dh[0], (double)G_dh[1], (double)G_dh[2], (double)G_dh[3]);
    }
#endif
}
#endif // USE_CUDA
// ===========================================================
// CPU drift eval
// ===========================================================
// CPU drift eval: reference implementation of dh = f_theta(stage, t).
// Reads the stage state from G_stage[] and writes the derivative into
// G_dh[]. Must stay numerically in sync with the CUDA kernel gCU_Source
// (which hardcodes N=48 and uses fast-math float intrinsics).
static void f_theta_vec_cpu(var t)
{
    eval_u(t);
    var damp, amp, scale, stiff;
    theta_t(t, &damp, &amp, &scale, &stiff); // fixed: second arg was garbled to "&,"
    for(int i=0;i<N;i++)
    {
        var sum = 0;
        var ii = (i+1);
        // dense coupling "layer": procedural weights sin(ang)*scale acting
        // on softsign-activated inputs (state + bias bj + injected input inj)
        for(int j=0;j<N;j++)
        {
            var jj = (j+1);
            var bj = sin(0.17*jj) * 0.03;
            var inj = G_u[0] * 0.02 * sin(0.07*jj + 0.19*t);
            var hj = G_stage[j];
            var in = hj + bj + inj;
            var aj = softsign(in);
            var ang = (0.013*ii*jj) + (0.11*ii) - (0.07*jj);
            var ww = sin(ang) * scale;
            sum += ww * aj;
        }
        // slow + fast periodic forcing plus exogenous-input coupling
        var forcing = sin(t + 0.05*ii) * amp;
        forcing += sin(20.0*t + 0.10*ii) * 0.08;
        forcing += (G_u[1] * 0.10) + (0.05*G_u[2]*sin(0.9*t));
        var hi = G_stage[i];
        var self = softsign(hi*1.8) * 0.15;
        // linear leak; every 8th component gets the stiffness multiplier
        var di = damp;
        if((i & 7) == 0) di *= stiff;
        var ghi = hi * di;
        G_dh[i] = sum + ghi + forcing + self;
    }
}
// Drift dispatcher: use the GPU while it is healthy. If a driver call
// fails during the GPU evaluation, cu_fail() clears gCU_Ready and the
// same evaluation is redone on the CPU, so G_dh is always valid on return.
static void f_theta_vec(var t)
{
    int haveResult = 0;
#if USE_CUDA
    if(gCU_Ready)
    {
        f_theta_vec_cuda(t);
        haveResult = gCU_Ready; // cleared mid-call on failure
    }
#endif
    if(!haveResult)
        f_theta_vec_cpu(t);
}
// -------------------- Vector RK step --------------------
// One explicit Runge-Kutta step for the full N-vector.
// s: stage count (clamped to MAXS); Atri/b/c: packed Butcher tableau;
// advances y -> yout over step h starting at time t.
// Clobbers the global scratch buffers G_K (stage derivatives), G_stage
// and G_dh (through f_theta_vec); y/yout must not alias those buffers.
static void rk_step_vec(int s, float* Atri, float* b, float* c,
var t, var* y, var h, var* yout)
{
if(s < 1)
{
// degenerate tableau: identity step
for(int k=0;k<N;k++) yout[k] = y[k];
return;
}
if(s > MAXS) s = MAXS;
// clear the stage-derivative scratch for the stages in use
for(int i=0;i<s;i++)
for(int k=0;k<N;k++)
G_K[i*N + k] = (fvar)0;
for(int i=0;i<s;i++)
{
// stage state: y + h * sum_{j<i} A[i][j] * K[j]
for(int k=0;k<N;k++)
{
var acc = y[k];
for(int j=0;j<i;j++)
acc += ((var)G_K[j*N + k]) * getA(Atri,i,j) * h;
G_stage[k] = acc;
}
var dt = ((var)c[i]) * h;
f_theta_vec(t + dt); // fills G_dh from G_stage
for(int k=0;k<N;k++)
G_K[i*N + k] = (fvar)G_dh[k];
}
// combine: yout = y + h * sum_i b[i] * K[i]
for(int k=0;k<N;k++)
{
var acc = y[k];
for(int i=0;i<s;i++)
acc += ((var)G_K[i*N + k]) * (var)b[i] * h;
yout[k] = acc;
}
}
// Root-mean-square difference between two N-vectors.
static var err_norm(var* a, var* b)
{
    var acc = 0;
    int k;
    for(k=0;k<N;k++)
    {
        var diff = a[k] - b[k];
        acc += diff*diff;
    }
    return sqrt(acc / (var)N);
}
// Root-mean-square of the first n entries of a.
static var rms_norm(var* a, int n)
{
    var acc = 0;
    int k;
    for(k=0;k<n;k++)
        acc += a[k]*a[k];
    return sqrt(acc / (var)n);
}
// -------------------- Tableau permutation --------------------
// Build a permuted copy of an RK tableau: output row/column i takes its
// entries from input row/column p[i] (indices clamped to [0, s-1]).
// Both the weights b, nodes c AND the coupling matrix A are permuted
// consistently; only the strictly-lower triangle of A is carried over.
static void permute_method_into(int s, float* A, float* b, float* c,
int* p,
int* sOut, float* Aout, float* bout, float* cout)
{
    int i, j;
    int stages = s;
    if(stages < 0) stages = 0;
    if(stages > MAXS) stages = MAXS;
    *sOut = stages;
    // start from a zeroed tableau
    for(i=0;i<TRI_A;i++) Aout[i] = 0;
    for(i=0;i<MAXS;i++) { bout[i] = 0; cout[i] = 0; }
    // permute weights and nodes
    for(i=0;i<stages;i++)
    {
        int src = p[i];
        if(src < 0) src = 0;
        if(src >= s) src = s-1;
        bout[i] = b[src];
        cout[i] = c[src];
    }
    // permute the strictly-lower coupling entries
    for(i=0;i<stages;i++)
    {
        for(j=0;j<i;j++)
        {
            int ri = p[i];
            int rj = p[j];
            if(ri < 0) ri = 0;
            if(ri >= s) ri = s-1;
            if(rj < 0) rj = 0;
            if(rj >= s) rj = s-1;
            setA(Aout, i, j, getA(A, ri, rj));
        }
    }
}
// Build the 4-stage demo tableau. Note this is NOT classic RK4: the
// weights are 0.1/0.2/0.3/0.4 and the nodes 0/0.3/0.8/1.0, chosen so that
// permuting stages has a visible effect in the comparison runs.
static void make_swappable_rk4(int* s, float* Atri, float* b, float* c)
{
    zeroTableau(s, Atri, b, c);
    *s = 4;
    c[0] = 0.0f;  c[1] = 0.30f; c[2] = 0.80f; c[3] = 1.00f;
    b[0] = 0.10f; b[1] = 0.20f; b[2] = 0.30f; b[3] = 0.40f;
    // strictly-lower coupling matrix
    setA(Atri, 1, 0, 0.30);
    setA(Atri, 2, 0, 0.80);
    setA(Atri, 3, 1, 0.50);
    setA(Atri, 3, 2, 0.50);
}
// Euler-Maruyama diffusion increment: y += sigma * g(y) * sqrt(dt) * xi.
// sigma is modulated by the inputs and regime; g(y) = 1 + 0.15|y| makes
// the noise multiplicative. Consumes one rng_norm() draw per component,
// in index order. Re-evaluates u(t) before use.
static void apply_diffusion(var t, var dt, var* y)
{
eval_u(t);
var u0 = G_u[0];
var u2 = G_u[2];
var sigma = G_SigmaBase * (1.0 + 0.6*fabs(u0) + 0.3*fabs(u2));
if(G_Regime) sigma *= 1.25;
var sdt = sqrt(dt);
for(int k=0;k<N;k++)
{
var gmul = 1.0 + (fabs(y[k]) * 0.15); // state-dependent noise gain
y[k] += sigma * gmul * sdt * rng_norm();
}
}
// Adaptive-step hybrid SDE integration from t0 to tEnd.
// Deterministic drift: explicit RK step with tableau (s,A,b,c); the local
// error is estimated by step doubling (one full step vs two half steps)
// and on acceptance the two-half-step solution G_half is kept.
// Stochastic parts (regime switch, diffusion, jumps) are applied only
// after an accepted deterministic step, then observations are generated.
// Outputs: yOut = final state; stepsOut/rejectsOut = accepted/rejected
// step counts; dtMinOut = smallest dt attempted; stopReasonOut:
//   0 = reached tEnd, 1 = steady state, 2 = blow-up guard,
//   3 = reject budget exhausted, 4 = dt-min budget exhausted.
// Reseeds the global RNG, so the same seed reproduces the same path.
static void integrate_adaptive_sde(int s, float* A, float* b, float* c,
var t0, var* y0, var tEnd,
unsigned int seed,
var* yOut,
int* stepsOut, int* rejectsOut,
var* dtMinOut,
int* stopReasonOut)
{
rng_seed(seed);
G_Regime = 0;
var t = t0;
var dt = DT_INIT;
var dtMinSeen = 1e9;
for(int k=0;k<N;k++) G_ycur[k] = y0[k];
int steps = 0, rejects = 0, iters = 0, dtMinHits = 0, steadyCount = 0;
int stopReason = 0;
while(t < tEnd)
{
if((iters & 255) == 0) shed_if_needed(); // cheap periodic memory check
iters++;
// clamp dt into [DT_MIN, G_DtMax] and land exactly on tEnd
if(dt < DT_MIN) dt = DT_MIN;
if(dt > G_DtMax) dt = G_DtMax;
if(t + dt > tEnd) dt = tEnd - t;
if(dt < dtMinSeen) dtMinSeen = dt;
if(dt <= DT_MIN) dtMinHits++;
if(dtMinHits > G_DtMinHitMax) { stopReason = 4; break; }
// step doubling: one full step vs two half steps
rk_step_vec(s, A, b, c, t, G_ycur, dt, G_full);
var halfdt = dt * 0.5;
rk_step_vec(s, A, b, c, t, G_ycur, halfdt, G_mid);
rk_step_vec(s, A, b, c, t+halfdt, G_mid, halfdt, G_half);
var e = err_norm(G_half, G_full);
if(e <= G_Tol || dt <= DT_MIN) // accept (forced at minimum dt)
{
for(int k=0;k<N;k++) G_ycur[k] = G_half[k]; // keep the finer solution
eval_u(t);
update_regime(t, dt);
apply_diffusion(t, dt, G_ycur);
apply_jump_if_needed(t, dt, G_ycur);
observe_from_hidden(t + dt, G_ycur);
// blow-up guard on state and observations
var yr = rms_norm(G_ycur, N);
var xr = rms_norm(G_xobs, NOBS);
if(yr > G_StateLimitRMS || xr > G_ObsLimitRMS)
{
stopReason = 2;
t += dt; steps++;
break;
}
// steady-state detection on the drift magnitude at the new state
for(int k=0;k<N;k++) G_stage[k] = G_ycur[k];
f_theta_vec(t + dt);
var dr = rms_norm(G_dh, N);
if(dr < G_SteadyEps) steadyCount++; else steadyCount = 0;
t += dt; steps++;
if(steadyCount >= G_SteadyNeed) { stopReason = 1; break; }
// step-size controller (exponent 0.2), factor clamped to [0.5, 2]
if(e < 1e-16) dt *= 2.0;
else
{
var fac = pow(G_Tol / e, 0.2) * 0.9;
if(fac < 0.5) fac = 0.5;
if(fac > 2.0) fac = 2.0;
dt *= fac;
}
}
else
{
// reject: halve the step and retry
rejects++;
if(rejects > G_MaxRejects) { stopReason = 3; break; }
dt *= 0.5;
}
}
for(int k=0;k<N;k++) yOut[k] = G_ycur[k];
*stepsOut = steps;
*rejectsOut = rejects;
*dtMinOut = dtMinSeen;
*stopReasonOut = stopReason;
}
// self-test: CPU vs CUDA drift numbers
// Self-test: evaluate the drift at a fixed time from the initial condition
// with the CPU backend and, if available, the CUDA backend; print the RMS
// difference and the first few components of each. A small nonzero RMS
// (double vs float plus fast-math) is expected. Clobbers G_stage and G_dh.
static void drift_selftest()
{
for(int k=0;k<N;k++) G_stage[k] = Y0[k];
var ttest = 0.5;
var dh_cpu[N];
f_theta_vec_cpu(ttest);
for(int k=0;k<N;k++) dh_cpu[k] = G_dh[k];
#if USE_CUDA
if(gCU_Ready)
{
f_theta_vec_cuda(ttest);
var dh_gpu[N];
for(int k=0;k<N;k++) dh_gpu[k] = G_dh[k];
var s = 0;
for(int k=0;k<N;k++) { var d = dh_gpu[k] - dh_cpu[k]; s += d*d; }
var rms = sqrt(s / (var)N);
printf("\n[SelfTest] dh CPU vs CUDA @t=%g: RMS diff = %.3e", ttest, (double)rms);
printf("\n[SelfTest] CPU dh[0..3]=%.6g %.6g %.6g %.6g",
(double)dh_cpu[0], (double)dh_cpu[1], (double)dh_cpu[2], (double)dh_cpu[3]);
printf("\n[SelfTest] CUDA dh[0..3]=%.6g %.6g %.6g %.6g",
(double)dh_gpu[0], (double)dh_gpu[1], (double)dh_gpu[2], (double)dh_gpu[3]);
}
else
{
printf("\n[SelfTest] CUDA not ready -> skipped GPU compare");
}
#endif
}
// -------------------- Entry point --------------------
// Entry point. Pipeline:
// 1) bring up CUDA (optional; CPU fallback otherwise),
// 2) reset all runtime knobs to their base values,
// 3) build three tableaus: A = reference; B = consistently permuted via
//    permute_method_into (b, c AND A together); C = b and c permuted but
//    A copied unpermuted -> the intentionally inconsistent variant,
// 4) seed the RNG from the wall clock, self-test CPU vs GPU drift,
// 5) integrate the same seeded SDE path with each tableau and report
//    step counts and RMS differences to the reference result.
DLLFUNC int main()
{
#if USE_CUDA
if(cu_init())
printf("\nCUDA: enabled (Driver API + NVRTC)");
else
printf("\nCUDA: not available -> CPU fallback");
#endif
// reset knobs (the script may be re-run within the same session)
G_ShedTriggered = 0;
G_Tol = TOL;
G_DtMax = DT_MAX;
G_MaxRejects = 2000000;
G_DtMinHitMax = 500000;
G_SteadyEps = 1e-7;
G_SteadyNeed = 250;
G_StateLimitRMS = 1e6;
G_ObsLimitRMS = 1e6;
G_SigmaBase = 0.02;
G_JumpLambdaBase = 0.20;
G_JumpMagBase = 0.20;
G_RegRate01 = 0.15;
G_RegRate10 = 0.10;
// A) reference tableau
int S = 0;
float A[TRI_A];
float b[MAXS];
float c[MAXS];
make_swappable_rk4(&S, A, b, c);
// B) "safe" refactor: stages permuted consistently (b, c and A together)
int SB = 0;
float AB[TRI_A];
float bB[MAXS];
float cB[MAXS];
int p[MAXS];
p[0]=0; p[1]=2; p[2]=1; p[3]=3; // swap stages 1 and 2
permute_method_into(S, A, b, c, p, &SB, AB, bB, cB);
// C) "buggy" refactor: b and c permuted, but A copied UNpermuted
int SW = S;
float AW[TRI_A];
float bW[MAXS];
float cW[MAXS];
for(int i=0;i<TRI_A;i++) AW[i] = A[i];
for(int i=0;i<MAXS;i++) { bW[i]=0; cW[i]=0; }
for(int i=0;i<SW;i++)
{
int pi = p[i];
bW[i] = b[pi];
cW[i] = c[pi];
}
// deterministic smooth initial condition
for(int k=0;k<N;k++)
Y0[k] = 0.2*sin(0.31*(k+1)) + 0.15*cos(0.17*(k+1))
+ 0.05*sin(0.07*(k+1)*(k+1));
var t0 = 0.0;
// wall-clock seed: UTC seconds mixed with sub-day milliseconds via LCG
var now = wdate(NOW);
unsigned int MySeed = (unsigned int)utm(now);
var frac = now - floor(now);
var ms = frac * 86400000.0;
MySeed = 1664525u * MySeed + (unsigned int)ms + 1013904223u;
if(MySeed == 0) MySeed = 1;
printf("\nMySeed=%u", MySeed);
drift_selftest();
int stepsRef=0, rejRef=0; var dtMinRef=0; int stopRef=0;
int stepsSafe=0, rejSafe=0; var dtMinSafe=0; int stopSafe=0;
int stepsBug=0, rejBug=0; var dtMinBug=0; int stopBug=0;
// identical seed -> identical random stream for all three variants
integrate_adaptive_sde(S, A, b, c, t0, Y0, T_END, MySeed, Yref,
&stepsRef, &rejRef, &dtMinRef, &stopRef);
integrate_adaptive_sde(SB, AB, bB, cB, t0, Y0, T_END, MySeed, Ysafe,
&stepsSafe, &rejSafe, &dtMinSafe, &stopSafe);
integrate_adaptive_sde(SW, AW, bW, cW, t0, Y0, T_END, MySeed, Ybug,
&stepsBug, &rejBug, &dtMinBug, &stopBug);
var diffSafe = err_norm(Ysafe, Yref);
var diffBug = err_norm(Ybug, Yref);
printf("\n\nNeural SDE / Hybrid ODE integration (N=%d) to T=%g baseTol=%g", N, T_END, TOL);
printf("\nA) Reference tableau:");
printf("\n accepted=%d rejected=%d minDt=%g stop=%d", stepsRef, rejRef, dtMinRef, stopRef);
printf("\nB) Safe stage-refactor:");
printf("\n accepted=%d rejected=%d minDt=%g stop=%d", stepsSafe, rejSafe, dtMinSafe, stopSafe);
printf("\n diff to reference (RMS) = %.3e", (double)diffSafe);
printf("\nC) Buggy stage-refactor:");
printf("\n accepted=%d rejected=%d minDt=%g stop=%d", stepsBug, rejBug, dtMinBug, stopBug);
printf("\n diff to reference (RMS) = %.3e", (double)diffBug);
#if USE_CUDA
printf("\n\nCUDA usage: gpu_calls=%d failed_calls=%d last_err=%d",
gCU_UsedCalls, gCU_FailedCalls, (int)gCU_LastErr);
cu_release_all();
#endif
printf("\n\nStop codes: 0=tEnd, 1=steady-state, 2=event(blow-up), 3=rejectBudget, 4=dtMinBudget\n");
return quit("done");
}