// demo_neuralODE_complex_382A_adaptive_main_opencl_nosdk.cpp (Zorro64 OpenCL C++)
//
// Headerless OpenCL dynamic loader (no CL/cl.h, no OpenCL.lib).
// Prints numerical outputs proving GPU drift is computed, and reports GPU usage.
// If OpenCL fails at any point -> CPU fallback.
#include <zorro.h>
#include <math.h>
#include <windows.h>
#define USE_OPENCL 1
#define CL_DEBUG 1 // <-- set to 0 if you want quiet mode
#define MAXS 4
#define TRI_A 6
// -------------------- Dimensions --------------------
#define N 48
#define K_SIZE 192 // MUST be literal: MAXS*N = 4*48 = 192
#define U_DIM 3
#define NOBS 12
// -------------------- Base knobs --------------------
#define T_END 20.0
#define DT_INIT 0.20
#define DT_MIN 1e-6
#define DT_MAX 0.50
#define TOL 1e-6
// -------------------- Shedding --------------------
#define MEM_BUDGET_MB 2000
static int G_ShedTriggered = 0;
// runtime knobs
static var G_Tol = TOL;
static var G_DtMax = DT_MAX;
// budgets
static int G_MaxRejects = 2000000;
static int G_DtMinHitMax = 500000;
// termination knobs
static var G_SteadyEps = 1e-7;
static int G_SteadyNeed = 250;
static var G_StateLimitRMS = 1e6;
static var G_ObsLimitRMS = 1e6;
// SDE / hybrid knobs
static var G_SigmaBase = 0.02;
static var G_JumpLambdaBase = 0.20;
static var G_JumpMagBase = 0.20;
static var G_RegRate01 = 0.15;
static var G_RegRate10 = 0.10;
// -------------------- TIGHT_MEM types --------------------
#define TIGHT_MEM 1
#ifdef TIGHT_MEM
typedef float fvar;
#else
typedef var fvar;
#endif
// -------------------- Global buffers --------------------
static fvar G_K[K_SIZE] = { 0 };
static var G_stage[N] = { 0 };
static var G_dh[N] = { 0 };
static var G_ycur[N] = { 0 };
static var G_full[N] = { 0 };
static var G_half[N] = { 0 };
static var G_mid[N] = { 0 };
static var Y0[N] = { 0 };
static var Yref[N] = { 0 };
static var Ysafe[N] = { 0 };
static var Ybug[N] = { 0 };
static var G_xhat[NOBS] = { 0 };
static var G_xobs[NOBS] = { 0 };
static var G_u[U_DIM] = { 0 };
// -------------------- RNG --------------------
static unsigned int G_Rng = 1;
static void rng_seed(unsigned int s)
{
if(s == 0) s = 1;
G_Rng = s;
}
static var rng_u01()
{
G_Rng = 1664525u * G_Rng + 1013904223u;
unsigned int x = (G_Rng >> 8) & 0x00FFFFFFu;
return ((var)x + 1.0) / 16777217.0;
}
static var rng_norm()
{
var u1 = rng_u01();
var u2 = rng_u01();
var r = sqrt(-2.0 * log(u1));
var th = 6.283185307179586 * u2;
return r * cos(th);
}
// -------------------- Utility --------------------
static int memMB()
{
var m = memory(0);
if(!(m > 0)) return 0;
return (int)(m/(1024*1024));
}
static void shed_if_needed()
{
if(G_ShedTriggered) return;
int mb = memMB();
if(mb >= MEM_BUDGET_MB)
{
G_ShedTriggered = 1;
G_Tol *= 10.0;
G_DtMax *= 0.5;
if(G_DtMax < DT_MIN) G_DtMax = DT_MIN;
G_MaxRejects = (int)(G_MaxRejects/2);
if(G_MaxRejects < 10000) G_MaxRejects = 10000;
G_DtMinHitMax = (int)(G_DtMinHitMax/2);
if(G_DtMinHitMax < 10000) G_DtMinHitMax = 10000;
}
}
// -------------------- Triangular indexing --------------------
static int Aidx(int i, int j) { return (i*(i-1))/2 + j; }
static var getA(float* Atri, int i, int j)
{
if(i <= j) return 0;
return (var)Atri[Aidx(i,j)];
}
static void setA(float* Atri, int i, int j, var v)
{
if(i <= j) return;
Atri[Aidx(i,j)] = (float)v;
}
static void zeroTableau(int* s, float* Atri, float* b, float* c)
{
int i;
*s = 0;
for(i=0;i<TRI_A;i++) Atri[i] = 0;
for(i=0;i<MAXS;i++) { b[i]=0; c[i]=0; }
}
// -------------------- Activation --------------------
static var softsign(var z) { return z / (1.0 + fabs(z)); }
// -------------------- Exogenous input u(t) --------------------
static int G_Regime = 0;
static void eval_u(var t)
{
var u0 = sin(0.11*t) * 0.8;
if(G_Regime) u0 += 0.35;
var u1 = cos(0.37*t) * 0.6;
var u2 = sin(2.30*t) * 0.3;
G_u[0] = u0;
G_u[1] = u1;
G_u[2] = u2;
}
// -------------------- theta(t) --------------------
static void theta_t(var t, var* dampOut, var* ampOut, var* scaleOut, var* stiffOut)
{
var damp = -0.40;
var amp = 0.35;
var scale = 0.06;
var drift = sin(0.03*t) * 0.25;
var u0 = G_u[0];
var u1 = G_u[1];
if(G_Regime)
{
amp *= 1.15;
scale *= 1.10;
damp *= 1.05;
}
amp *= (1.0 + 0.30*drift + 0.15*u0);
scale *= (1.0 + 0.20*drift + 0.10*u1);
damp *= (1.0 + 0.15*drift - 0.10*u0);
*dampOut = damp;
*ampOut = amp;
*scaleOut = scale;
*stiffOut = 2.8;
}
// -------------------- Regime switching --------------------
static void update_regime(var t, var dt)
{
var u0 = G_u[0];
var u1 = G_u[1];
var stress = fabs(u0) + 0.5*fabs(u1);
var r01 = G_RegRate01 * (1.0 + 0.8*stress);
var r10 = G_RegRate10 * (1.0 + 0.5*stress);
var p;
if(G_Regime == 0)
{
p = 1.0 - exp(-r01*dt);
if(rng_u01() < p) G_Regime = 1;
}
else
{
p = 1.0 - exp(-r10*dt);
if(rng_u01() < p) G_Regime = 0;
}
}
// -------------------- Jumps --------------------
static void apply_jump_if_needed(var t, var dt, var* y)
{
var u0 = G_u[0];
var lam = G_JumpLambdaBase * (1.0 + 0.7*fabs(u0));
if(G_Regime) lam *= 1.25;
var p = 1.0 - exp(-lam*dt);
if(rng_u01() >= p) return;
var uu = rng_u01() - 0.5;
var j = tan(3.141592653589793 * uu);
if(j > 6.0) j = 6.0;
if(j < -6.0) j = -6.0;
var mag = G_JumpMagBase * (1.0 + 0.5*fabs(u0));
if(G_Regime) mag *= 1.2;
for(int k=0;k<N;k++)
{
if(rng_u01() < 0.25)
{
var delta = mag * j;
delta *= (1.0 + 0.01*(k+1));
y[k] += delta;
}
}
}
// -------------------- Observations --------------------
static void observe_from_hidden(var t, var* h)
{
for(int k=0;k<NOBS;k++)
{
var a = h[k];
var b = h[k + NOBS];
var z = a + 0.30*b + 0.05*sin(0.17*t + 0.3*(k+1));
var xhat = z / (1.0 + fabs(z));
G_xhat[k] = xhat;
G_xobs[k] = xhat + 0.01*rng_norm();
}
}
// ===========================================================
// OpenCL (headerless dynamic loader)
// ===========================================================
#if USE_OPENCL
typedef int cl_int;
typedef unsigned int cl_uint;
typedef unsigned long long cl_ulong;
typedef unsigned long cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_bitfield cl_mem_flags;
typedef cl_uint cl_bool;
typedef struct _cl_platform_id* cl_platform_id;
typedef struct _cl_device_id* cl_device_id;
typedef struct _cl_context* cl_context;
typedef struct _cl_command_queue* cl_command_queue;
typedef struct _cl_program* cl_program;
typedef struct _cl_kernel* cl_kernel;
typedef struct _cl_mem* cl_mem;
#define CL_SUCCESS 0
#define CL_TRUE 1
#define CL_DEVICE_TYPE_ALL (1ULL << 0)
#define CL_DEVICE_TYPE_GPU (1ULL << 2)
#define CL_MEM_READ_ONLY (1ULL << 2)
#define CL_MEM_WRITE_ONLY (1ULL << 1)
#define CL_PROGRAM_BUILD_LOG 0x1183
typedef cl_int (__stdcall *PFN_clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
typedef cl_int (__stdcall *PFN_clGetDeviceIDs)(cl_platform_id, cl_device_type, cl_uint, cl_device_id*, cl_uint*);
typedef cl_context (__stdcall *PFN_clCreateContext)(const void*, cl_uint, const cl_device_id*, void*, void*, cl_int*);
typedef cl_command_queue (__stdcall *PFN_clCreateCommandQueue)(cl_context, cl_device_id, cl_bitfield, cl_int*);
typedef cl_program (__stdcall *PFN_clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
typedef cl_int (__stdcall *PFN_clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void*, void*);
typedef cl_int (__stdcall *PFN_clGetProgramBuildInfo)(cl_program, cl_device_id, cl_uint, size_t, void*, size_t*);
typedef cl_kernel(__stdcall *PFN_clCreateKernel)(cl_program, const char*, cl_int*);
typedef cl_mem (__stdcall *PFN_clCreateBuffer)(cl_context, cl_mem_flags, size_t, void*, cl_int*);
typedef cl_int (__stdcall *PFN_clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
typedef cl_int (__stdcall *PFN_clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const void*, void*);
typedef cl_int (__stdcall *PFN_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const void*, void*);
typedef cl_int (__stdcall *PFN_clFinish)(cl_command_queue);
typedef cl_int (__stdcall *PFN_clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const void*, void*);
typedef cl_int (__stdcall *PFN_clReleaseMemObject)(cl_mem);
typedef cl_int (__stdcall *PFN_clReleaseKernel)(cl_kernel);
typedef cl_int (__stdcall *PFN_clReleaseProgram)(cl_program);
typedef cl_int (__stdcall *PFN_clReleaseCommandQueue)(cl_command_queue);
typedef cl_int (__stdcall *PFN_clReleaseContext)(cl_context);
static HMODULE gCL_Dll = 0;
static PFN_clGetPlatformIDs p_clGetPlatformIDs = 0;
static PFN_clGetDeviceIDs p_clGetDeviceIDs = 0;
static PFN_clCreateContext p_clCreateContext = 0;
static PFN_clCreateCommandQueue p_clCreateCommandQueue = 0;
static PFN_clCreateProgramWithSource p_clCreateProgramWithSource = 0;
static PFN_clBuildProgram p_clBuildProgram = 0;
static PFN_clGetProgramBuildInfo p_clGetProgramBuildInfo = 0;
static PFN_clCreateKernel p_clCreateKernel = 0;
static PFN_clCreateBuffer p_clCreateBuffer = 0;
static PFN_clSetKernelArg p_clSetKernelArg = 0;
static PFN_clEnqueueWriteBuffer p_clEnqueueWriteBuffer = 0;
static PFN_clEnqueueNDRangeKernel p_clEnqueueNDRangeKernel = 0;
static PFN_clFinish p_clFinish = 0;
static PFN_clEnqueueReadBuffer p_clEnqueueReadBuffer = 0;
static PFN_clReleaseMemObject p_clReleaseMemObject = 0;
static PFN_clReleaseKernel p_clReleaseKernel = 0;
static PFN_clReleaseProgram p_clReleaseProgram = 0;
static PFN_clReleaseCommandQueue p_clReleaseCommandQueue = 0;
static PFN_clReleaseContext p_clReleaseContext = 0;
static int gCL_Ready = 0;
static int gCL_UsedCalls = 0;
static int gCL_FailedCalls = 0;
static cl_int gCL_LastErr = 0;
static cl_platform_id gCL_Platform = 0;
static cl_device_id gCL_Device = 0;
static cl_context gCL_Context = 0;
static cl_command_queue gCL_Queue = 0;
static cl_program gCL_Program = 0;
static cl_kernel gCL_K_DH = 0;
static cl_mem gCL_B_Stage = 0;
static cl_mem gCL_B_DH = 0;
static const char* gCL_Source =
"__kernel void dh_kernel(__global const float* stage, __global float* dh, \n"
" float t, float damp, float amp, float scale, float stiff, int regime, \n"
" float u0, float u1, float u2) \n"
"{ \n"
" int i = get_global_id(0); \n"
" if(i >= 48) return; \n"
" float ii = (float)(i+1); \n"
" float sum = 0.0f; \n"
" for(int j=0;j<48;j++) \n"
" { \n"
" float jj = (float)(j+1); \n"
" float bj = sin(0.17f*jj) * 0.03f; \n"
" float inj = u0 * 0.02f * sin(0.07f*jj + 0.19f*t); \n"
" float hj = stage[j]; \n"
" float in = hj + bj + inj; \n"
" float aj = in / (1.0f + fabs(in)); \n"
" float ang = (0.013f*ii*jj) + (0.11f*ii) - (0.07f*jj); \n"
" float ww = sin(ang) * scale; \n"
" sum += ww * aj; \n"
" } \n"
" float forcing = sin(t + 0.05f*ii) * amp; \n"
" forcing += sin(20.0f*t + 0.10f*ii) * 0.08f; \n"
" forcing += (u1 * 0.10f) + (0.05f*u2*sin(0.9f*t)); \n"
" float hi = stage[i]; \n"
" float self = ( (hi*1.8f) / (1.0f + fabs(hi*1.8f)) ) * 0.15f; \n"
" float di = damp; \n"
" if((i & 7) == 0) di *= stiff; \n"
" float ghi = hi * di; \n"
" dh[i] = sum + ghi + forcing + self; \n"
"} \n";
static void cl_release_all()
{
if(gCL_B_DH && p_clReleaseMemObject) { p_clReleaseMemObject(gCL_B_DH); gCL_B_DH = 0; }
if(gCL_B_Stage && p_clReleaseMemObject) { p_clReleaseMemObject(gCL_B_Stage); gCL_B_Stage = 0; }
if(gCL_K_DH && p_clReleaseKernel) { p_clReleaseKernel(gCL_K_DH); gCL_K_DH = 0; }
if(gCL_Program && p_clReleaseProgram) { p_clReleaseProgram(gCL_Program); gCL_Program = 0; }
if(gCL_Queue && p_clReleaseCommandQueue) { p_clReleaseCommandQueue(gCL_Queue); gCL_Queue = 0; }
if(gCL_Context && p_clReleaseContext) { p_clReleaseContext(gCL_Context); gCL_Context = 0; }
gCL_Device = 0;
gCL_Platform = 0;
gCL_Ready = 0;
if(gCL_Dll)
{
FreeLibrary(gCL_Dll);
gCL_Dll = 0;
}
}
static FARPROC cl_sym(const char* name)
{
if(!gCL_Dll) return 0;
return GetProcAddress(gCL_Dll, name);
}
static int cl_load()
{
gCL_Dll = LoadLibraryA("OpenCL.dll");
if(!gCL_Dll) return 0;
p_clGetPlatformIDs = (PFN_clGetPlatformIDs)cl_sym("clGetPlatformIDs");
p_clGetDeviceIDs = (PFN_clGetDeviceIDs)cl_sym("clGetDeviceIDs");
p_clCreateContext = (PFN_clCreateContext)cl_sym("clCreateContext");
p_clCreateCommandQueue = (PFN_clCreateCommandQueue)cl_sym("clCreateCommandQueue");
p_clCreateProgramWithSource = (PFN_clCreateProgramWithSource)cl_sym("clCreateProgramWithSource");
p_clBuildProgram = (PFN_clBuildProgram)cl_sym("clBuildProgram");
p_clGetProgramBuildInfo = (PFN_clGetProgramBuildInfo)cl_sym("clGetProgramBuildInfo");
p_clCreateKernel = (PFN_clCreateKernel)cl_sym("clCreateKernel");
p_clCreateBuffer = (PFN_clCreateBuffer)cl_sym("clCreateBuffer");
p_clSetKernelArg = (PFN_clSetKernelArg)cl_sym("clSetKernelArg");
p_clEnqueueWriteBuffer = (PFN_clEnqueueWriteBuffer)cl_sym("clEnqueueWriteBuffer");
p_clEnqueueNDRangeKernel = (PFN_clEnqueueNDRangeKernel)cl_sym("clEnqueueNDRangeKernel");
p_clFinish = (PFN_clFinish)cl_sym("clFinish");
p_clEnqueueReadBuffer = (PFN_clEnqueueReadBuffer)cl_sym("clEnqueueReadBuffer");
p_clReleaseMemObject = (PFN_clReleaseMemObject)cl_sym("clReleaseMemObject");
p_clReleaseKernel = (PFN_clReleaseKernel)cl_sym("clReleaseKernel");
p_clReleaseProgram = (PFN_clReleaseProgram)cl_sym("clReleaseProgram");
p_clReleaseCommandQueue = (PFN_clReleaseCommandQueue)cl_sym("clReleaseCommandQueue");
p_clReleaseContext = (PFN_clReleaseContext)cl_sym("clReleaseContext");
if(!p_clGetPlatformIDs || !p_clGetDeviceIDs || !p_clCreateContext || !p_clCreateCommandQueue ||
!p_clCreateProgramWithSource || !p_clBuildProgram || !p_clCreateKernel || !p_clCreateBuffer ||
!p_clSetKernelArg || !p_clEnqueueWriteBuffer || !p_clEnqueueNDRangeKernel || !p_clFinish ||
!p_clEnqueueReadBuffer)
{
cl_release_all();
return 0;
}
return 1;
}
static int cl_init()
{
if(!cl_load()) return 0;
cl_int err = CL_SUCCESS;
cl_uint nPlatforms = 0;
err = p_clGetPlatformIDs(0, 0, &nPlatforms);
if(err != CL_SUCCESS || nPlatforms == 0) { cl_release_all(); return 0; }
cl_platform_id platforms[8];
if(nPlatforms > 8) nPlatforms = 8;
err = p_clGetPlatformIDs(nPlatforms, platforms, &nPlatforms);
if(err != CL_SUCCESS) { cl_release_all(); return 0; }
cl_platform_id chosenP = 0;
cl_device_id chosenD = 0;
// choose GPU device if possible
for(cl_uint p=0; p<nPlatforms; p++)
{
cl_uint nDev = 0;
if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_GPU, 0, 0, &nDev) == CL_SUCCESS && nDev > 0)
{
cl_device_id devs[8];
if(nDev > 8) nDev = 8;
if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_GPU, nDev, devs, &nDev) == CL_SUCCESS)
{
chosenP = platforms[p];
chosenD = devs[0];
break;
}
}
}
// fallback: any device
if(!chosenD)
{
for(cl_uint p=0; p<nPlatforms; p++)
{
cl_uint nDev = 0;
if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, 0, &nDev) == CL_SUCCESS && nDev > 0)
{
cl_device_id devs[8];
if(nDev > 8) nDev = 8;
if(p_clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, nDev, devs, &nDev) == CL_SUCCESS)
{
chosenP = platforms[p];
chosenD = devs[0];
break;
}
}
}
}
if(!chosenD) { cl_release_all(); return 0; }
gCL_Platform = chosenP;
gCL_Device = chosenD;
gCL_Context = p_clCreateContext(0, 1, &gCL_Device, 0, 0, &err);
if(err != CL_SUCCESS || !gCL_Context) { cl_release_all(); return 0; }
gCL_Queue = p_clCreateCommandQueue(gCL_Context, gCL_Device, 0, &err);
if(err != CL_SUCCESS || !gCL_Queue) { cl_release_all(); return 0; }
gCL_Program = p_clCreateProgramWithSource(gCL_Context, 1, &gCL_Source, 0, &err);
if(err != CL_SUCCESS || !gCL_Program) { cl_release_all(); return 0; }
// NOTE: you can optionally pass build options here (some drivers like it):
// const char* opts = "-cl-fast-relaxed-math";
// err = p_clBuildProgram(gCL_Program, 1, &gCL_Device, opts, 0, 0);
err = p_clBuildProgram(gCL_Program, 1, &gCL_Device, 0, 0, 0);
if(err != CL_SUCCESS)
{
if(p_clGetProgramBuildInfo)
{
char logbuf[4096] = {0};
size_t logsz = 0;
p_clGetProgramBuildInfo(gCL_Program, gCL_Device, CL_PROGRAM_BUILD_LOG, sizeof(logbuf), logbuf, &logsz);
printf("\nOpenCL build failed: %s", logbuf);
}
cl_release_all();
return 0;
}
gCL_K_DH = p_clCreateKernel(gCL_Program, "dh_kernel", &err);
if(err != CL_SUCCESS || !gCL_K_DH) { cl_release_all(); return 0; }
gCL_B_Stage = p_clCreateBuffer(gCL_Context, CL_MEM_READ_ONLY, sizeof(float)*N, 0, &err);
if(err != CL_SUCCESS || !gCL_B_Stage) { cl_release_all(); return 0; }
gCL_B_DH = p_clCreateBuffer(gCL_Context, CL_MEM_WRITE_ONLY, sizeof(float)*N, 0, &err);
if(err != CL_SUCCESS || !gCL_B_DH) { cl_release_all(); return 0; }
gCL_Ready = 1;
gCL_UsedCalls = 0;
gCL_FailedCalls = 0;
gCL_LastErr = 0;
return 1;
}
static void cl_fail(cl_int err)
{
gCL_LastErr = err;
gCL_FailedCalls++;
gCL_Ready = 0;
#if CL_DEBUG
printf("\n[OpenCL] ERROR %d -> fallback to CPU", (int)err);
#endif
}
// GPU drift eval
static void f_theta_vec_gpu(var t)
{
eval_u(t);
var damp, amp, scale, stiff;
theta_t(t, &damp, &, &scale, &stiff);
float stageF[N];
float dhF[N] = { 0 };
for(int k=0;k<N;k++) stageF[k] = (float)G_stage[k];
cl_int err = p_clEnqueueWriteBuffer(gCL_Queue, gCL_B_Stage, CL_TRUE, 0, sizeof(float)*N, stageF, 0, 0, 0);
if(err != CL_SUCCESS) { cl_fail(err); return; }
int arg = 0;
float tf = (float)t;
float dampf = (float)damp;
float ampf = (float)amp;
float scalef = (float)scale;
float stifff = (float)stiff;
int regime = G_Regime;
float u0 = (float)G_u[0];
float u1 = (float)G_u[1];
float u2 = (float)G_u[2];
err = p_clSetKernelArg(gCL_K_DH, arg++, sizeof(cl_mem), &gCL_B_Stage);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(cl_mem), &gCL_B_DH);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &tf);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &dampf);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &f);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &scalef);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &stifff);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(int), ®ime);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &u0);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &u1);
err |= p_clSetKernelArg(gCL_K_DH, arg++, sizeof(float), &u2);
if(err != CL_SUCCESS) { cl_fail(err); return; }
size_t global = (size_t)N;
err = p_clEnqueueNDRangeKernel(gCL_Queue, gCL_K_DH, 1, 0, &global, 0, 0, 0, 0);
if(err != CL_SUCCESS) { cl_fail(err); return; }
err = p_clFinish(gCL_Queue);
if(err != CL_SUCCESS) { cl_fail(err); return; }
err = p_clEnqueueReadBuffer(gCL_Queue, gCL_B_DH, CL_TRUE, 0, sizeof(float)*N, dhF, 0, 0, 0);
if(err != CL_SUCCESS) { cl_fail(err); return; }
for(int k=0;k<N;k++) G_dh[k] = (var)dhF[k];
gCL_UsedCalls++;
#if CL_DEBUG
// print only the first GPU call to prove it produces numbers
static int printed = 0;
if(!printed)
{
printed = 1;
printf("\n[OpenCL] dh sample (t=%g): dh[0]=%.6g dh[1]=%.6g dh[2]=%.6g dh[3]=%.6g",
t, (double)G_dh[0], (double)G_dh[1], (double)G_dh[2], (double)G_dh[3]);
}
#endif
}
#endif // USE_OPENCL
// ===========================================================
// CPU drift eval
// ===========================================================
static void f_theta_vec_cpu(var t)
{
eval_u(t);
var damp, amp, scale, stiff;
theta_t(t, &damp, &, &scale, &stiff);
for(int i=0;i<N;i++)
{
var sum = 0;
var ii = (i+1);
for(int j=0;j<N;j++)
{
var jj = (j+1);
var bj = sin(0.17*jj) * 0.03;
var inj = G_u[0] * 0.02 * sin(0.07*jj + 0.19*t);
var hj = G_stage[j];
var in = hj + bj + inj;
var aj = softsign(in);
var ang = (0.013*ii*jj) + (0.11*ii) - (0.07*jj);
var ww = sin(ang) * scale;
sum += ww * aj;
}
var forcing = sin(t + 0.05*ii) * amp;
forcing += sin(20.0*t + 0.10*ii) * 0.08;
forcing += (G_u[1] * 0.10) + (0.05*G_u[2]*sin(0.9*t));
var hi = G_stage[i];
var self = softsign(hi*1.8) * 0.15;
var di = damp;
if((i & 7) == 0) di *= stiff;
var ghi = hi * di;
G_dh[i] = sum + ghi + forcing + self;
}
}
static void f_theta_vec(var t)
{
#if USE_OPENCL
if(gCL_Ready)
{
f_theta_vec_gpu(t);
if(gCL_Ready) return;
}
#endif
f_theta_vec_cpu(t);
}
// -------------------- Vector RK step --------------------
static void rk_step_vec(int s, float* Atri, float* b, float* c,
var t, var* y, var h, var* yout)
{
if(s < 1)
{
for(int k=0;k<N;k++) yout[k] = y[k];
return;
}
if(s > MAXS) s = MAXS;
for(int i=0;i<s;i++)
for(int k=0;k<N;k++)
G_K[i*N + k] = (fvar)0;
for(int i=0;i<s;i++)
{
for(int k=0;k<N;k++)
{
var acc = y[k];
for(int j=0;j<i;j++)
acc += ((var)G_K[j*N + k]) * getA(Atri,i,j) * h;
G_stage[k] = acc;
}
var dt = ((var)c[i]) * h;
f_theta_vec(t + dt);
for(int k=0;k<N;k++)
G_K[i*N + k] = (fvar)G_dh[k];
}
for(int k=0;k<N;k++)
{
var acc = y[k];
for(int i=0;i<s;i++)
acc += ((var)G_K[i*N + k]) * (var)b[i] * h;
yout[k] = acc;
}
}
static var err_norm(var* a, var* b)
{
var s = 0;
for(int k=0;k<N;k++) { var d = a[k] - b[k]; s += d*d; }
return sqrt(s / (var)N);
}
static var rms_norm(var* a, int n)
{
var s = 0;
for(int k=0;k<n;k++) { var x = a[k]; s += x*x; }
return sqrt(s / (var)n);
}
static void permute_method_into(int s, float* A, float* b, float* c,
int* p,
int* sOut, float* Aout, float* bout, float* cout)
{
*sOut = s;
if(*sOut < 0) *sOut = 0;
if(*sOut > MAXS) *sOut = MAXS;
for(int i=0;i<TRI_A;i++) Aout[i] = 0;
for(int i=0;i<MAXS;i++) { bout[i]=0; cout[i]=0; }
for(int i=0;i<*sOut;i++)
{
int pi = p[i];
if(pi < 0) pi = 0;
if(pi >= s) pi = s-1;
bout[i] = b[pi];
cout[i] = c[pi];
}
for(int i=0;i<*sOut;i++)
for(int j=0;j<i;j++)
{
int pi = p[i];
int pj = p[j];
if(pi < 0) pi = 0;
if(pj < 0) pj = 0;
if(pi >= s) pi = s-1;
if(pj >= s) pj = s-1;
setA(Aout, i, j, getA(A, pi, pj));
}
}
static void make_swappable_rk4(int* s, float* Atri, float* b, float* c)
{
zeroTableau(s, Atri, b, c);
*s = 4;
c[0] = 0.0f;
c[1] = 0.30f;
c[2] = 0.80f;
c[3] = 1.00f;
setA(Atri, 1, 0, 0.30);
setA(Atri, 2, 0, 0.80);
setA(Atri, 3, 1, 0.50);
setA(Atri, 3, 2, 0.50);
b[0] = 0.10f;
b[1] = 0.20f;
b[2] = 0.30f;
b[3] = 0.40f;
}
static void apply_diffusion(var t, var dt, var* y)
{
eval_u(t);
var u0 = G_u[0];
var u2 = G_u[2];
var sigma = G_SigmaBase * (1.0 + 0.6*fabs(u0) + 0.3*fabs(u2));
if(G_Regime) sigma *= 1.25;
var sdt = sqrt(dt);
for(int k=0;k<N;k++)
{
var gmul = 1.0 + (fabs(y[k]) * 0.15);
y[k] += sigma * gmul * sdt * rng_norm();
}
}
static void integrate_adaptive_sde(int s, float* A, float* b, float* c,
var t0, var* y0, var tEnd,
unsigned int seed,
var* yOut,
int* stepsOut, int* rejectsOut,
var* dtMinOut,
int* stopReasonOut)
{
rng_seed(seed);
G_Regime = 0;
var t = t0;
var dt = DT_INIT;
var dtMinSeen = 1e9;
for(int k=0;k<N;k++) G_ycur[k] = y0[k];
int steps = 0, rejects = 0, iters = 0, dtMinHits = 0, steadyCount = 0;
int stopReason = 0;
while(t < tEnd)
{
if((iters & 255) == 0) shed_if_needed();
iters++;
if(dt < DT_MIN) dt = DT_MIN;
if(dt > G_DtMax) dt = G_DtMax;
if(t + dt > tEnd) dt = tEnd - t;
if(dt < dtMinSeen) dtMinSeen = dt;
if(dt <= DT_MIN) dtMinHits++;
if(dtMinHits > G_DtMinHitMax) { stopReason = 4; break; }
rk_step_vec(s, A, b, c, t, G_ycur, dt, G_full);
var halfdt = dt * 0.5;
rk_step_vec(s, A, b, c, t, G_ycur, halfdt, G_mid);
rk_step_vec(s, A, b, c, t+halfdt, G_mid, halfdt, G_half);
var e = err_norm(G_half, G_full);
if(e <= G_Tol || dt <= DT_MIN)
{
for(int k=0;k<N;k++) G_ycur[k] = G_half[k];
eval_u(t);
update_regime(t, dt);
apply_diffusion(t, dt, G_ycur);
apply_jump_if_needed(t, dt, G_ycur);
observe_from_hidden(t + dt, G_ycur);
var yr = rms_norm(G_ycur, N);
var xr = rms_norm(G_xobs, NOBS);
if(yr > G_StateLimitRMS || xr > G_ObsLimitRMS)
{
stopReason = 2;
t += dt; steps++;
break;
}
for(int k=0;k<N;k++) G_stage[k] = G_ycur[k];
f_theta_vec(t + dt);
var dr = rms_norm(G_dh, N);
if(dr < G_SteadyEps) steadyCount++; else steadyCount = 0;
t += dt; steps++;
if(steadyCount >= G_SteadyNeed) { stopReason = 1; break; }
if(e < 1e-16) dt *= 2.0;
else
{
var fac = pow(G_Tol / e, 0.2) * 0.9;
if(fac < 0.5) fac = 0.5;
if(fac > 2.0) fac = 2.0;
dt *= fac;
}
}
else
{
rejects++;
if(rejects > G_MaxRejects) { stopReason = 3; break; }
dt *= 0.5;
}
}
for(int k=0;k<N;k++) yOut[k] = G_ycur[k];
*stepsOut = steps;
*rejectsOut = rejects;
*dtMinOut = dtMinSeen;
*stopReasonOut = stopReason;
}
// ===========================================================
// One-time GPU vs CPU drift self-test (prints numbers)
// ===========================================================
static void drift_selftest()
{
// stage = Y0 snapshot at t = 0.5 (arbitrary)
for(int k=0;k<N;k++) G_stage[k] = Y0[k];
var ttest = 0.5;
// CPU
var dh_cpu[N];
f_theta_vec_cpu(ttest);
for(int k=0;k<N;k++) dh_cpu[k] = G_dh[k];
#if USE_OPENCL
if(gCL_Ready)
{
// GPU
f_theta_vec_gpu(ttest);
var dh_gpu[N];
for(int k=0;k<N;k++) dh_gpu[k] = G_dh[k];
// RMS diff
var s = 0;
for(int k=0;k<N;k++) { var d = dh_gpu[k] - dh_cpu[k]; s += d*d; }
var rms = sqrt(s / (var)N);
printf("\n[SelfTest] dh CPU vs GPU @t=%g: RMS diff = %.3e", ttest, (double)rms);
printf("\n[SelfTest] CPU dh[0..3]=%.6g %.6g %.6g %.6g",
(double)dh_cpu[0], (double)dh_cpu[1], (double)dh_cpu[2], (double)dh_cpu[3]);
printf("\n[SelfTest] GPU dh[0..3]=%.6g %.6g %.6g %.6g",
(double)dh_gpu[0], (double)dh_gpu[1], (double)dh_gpu[2], (double)dh_gpu[3]);
}
else
{
printf("\n[SelfTest] OpenCL not ready -> skipped GPU compare");
}
#else
printf("\n[SelfTest] USE_OPENCL=0 -> CPU only");
#endif
}
// -------------------- Entry point --------------------
DLLFUNC int main()
{
#if USE_OPENCL
if(cl_init())
printf("\nOpenCL: enabled (dynamic loader)");
else
printf("\nOpenCL: not available -> CPU fallback");
#endif
// reset knobs
G_ShedTriggered = 0;
G_Tol = TOL;
G_DtMax = DT_MAX;
G_MaxRejects = 2000000;
G_DtMinHitMax = 500000;
G_SteadyEps = 1e-7;
G_SteadyNeed = 250;
G_StateLimitRMS = 1e6;
G_ObsLimitRMS = 1e6;
G_SigmaBase = 0.02;
G_JumpLambdaBase = 0.20;
G_JumpMagBase = 0.20;
G_RegRate01 = 0.15;
G_RegRate10 = 0.10;
int S = 0;
float A[TRI_A];
float b[MAXS];
float c[MAXS];
make_swappable_rk4(&S, A, b, c);
int SB = 0;
float AB[TRI_A];
float bB[MAXS];
float cB[MAXS];
int p[MAXS];
p[0]=0; p[1]=2; p[2]=1; p[3]=3;
permute_method_into(S, A, b, c, p, &SB, AB, bB, cB);
int SW = S;
float AW[TRI_A];
float bW[MAXS];
float cW[MAXS];
for(int i=0;i<TRI_A;i++) AW[i] = A[i];
for(int i=0;i<MAXS;i++) { bW[i]=0; cW[i]=0; }
for(int i=0;i<SW;i++)
{
int pi = p[i];
bW[i] = b[pi];
cW[i] = c[pi];
}
for(int k=0;k<N;k++)
Y0[k] = 0.2*sin(0.31*(k+1)) + 0.15*cos(0.17*(k+1))
+ 0.05*sin(0.07*(k+1)*(k+1));
var t0 = 0.0;
// seed for custom RNG
var now = wdate(NOW);
unsigned int MySeed = (unsigned int)utm(now);
var frac = now - floor(now);
var ms = frac * 86400000.0;
MySeed = 1664525u * MySeed + (unsigned int)ms + 1013904223u;
if(MySeed == 0) MySeed = 1;
printf("\nMySeed=%u", MySeed);
// ---- PROOF: print GPU numerical results + CPU comparison ----
drift_selftest();
int stepsRef=0, rejRef=0; var dtMinRef=0; int stopRef=0;
int stepsSafe=0, rejSafe=0; var dtMinSafe=0; int stopSafe=0;
int stepsBug=0, rejBug=0; var dtMinBug=0; int stopBug=0;
integrate_adaptive_sde(S, A, b, c, t0, Y0, T_END, MySeed, Yref,
&stepsRef, &rejRef, &dtMinRef, &stopRef);
integrate_adaptive_sde(SB, AB, bB, cB, t0, Y0, T_END, MySeed, Ysafe,
&stepsSafe, &rejSafe, &dtMinSafe, &stopSafe);
integrate_adaptive_sde(SW, AW, bW, cW, t0, Y0, T_END, MySeed, Ybug,
&stepsBug, &rejBug, &dtMinBug, &stopBug);
var diffSafe = err_norm(Ysafe, Yref);
var diffBug = err_norm(Ybug, Yref);
printf("\n\nNeural SDE / Hybrid ODE integration (N=%d) to T=%g baseTol=%g", N, T_END, TOL);
printf("\nA) Reference tableau:");
printf("\n accepted=%d rejected=%d minDt=%g stop=%d", stepsRef, rejRef, dtMinRef, stopRef);
printf("\nB) Theorem-382A safe stage-refactor:");
printf("\n accepted=%d rejected=%d minDt=%g stop=%d", stepsSafe, rejSafe, dtMinSafe, stopSafe);
printf("\n diff to reference (RMS) = %.3e", (double)diffSafe);
printf("\nC) Buggy stage-refactor:");
printf("\n accepted=%d rejected=%d minDt=%g stop=%d", stepsBug, rejBug, dtMinBug, stopBug);
printf("\n diff to reference (RMS) = %.3e", (double)diffBug);
#if USE_OPENCL
printf("\n\nOpenCL usage: gpu_calls=%d failed_calls=%d last_err=%d",
gCL_UsedCalls, gCL_FailedCalls, (int)gCL_LastErr);
cl_release_all();
#endif
printf("\n\nStop codes: 0=tEnd, 1=steady-state, 2=event(blow-up), 3=rejectBudget, 4=dtMinBudget\n");
return quit("done");
}