libmir · badnikhil · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/dub.json b/dub.json
@@ -17,21 +17,23 @@
         },
         {
             "name": "unittest",
-            "dflags" : ["-mdcompute-targets=cuda-210" ,"-oq"],
+            "dflags": ["-mdcompute-targets=cuda-800", "-oq"],
             "targetType": "executable",
-            "versions": ["DComputeTesting"],
+            "stringImportPaths": ["."],
+            "versions": ["DComputeTesting", "DComputeCUDA_800"],
         },
         {
             "name": "test-cuda",
             "dflags" : ["--mdcompute-targets=cuda-210", "-oq"],
             "targetType": "executable",
-            "versions": ["DComputeTestCUDA"],
+            "stringImportPaths": ["."],
+            "versions": ["DComputeTestCUDA", "DComputeCUDA_210"],
         },
         {
             "name": "test-ocl",
             "dflags" : ["--mdcompute-targets=ocl-200", "-oq"],
             "targetType": "executable",
             "versions": ["DComputeTestOpenCL"],
         },
-   ]
+    ]
 }
diff --git a/source/dcompute/driver/cuda/buffer.d b/source/dcompute/driver/cuda/buffer.d
@@ -11,13 +11,15 @@ struct Buffer(T)
 
     this(size_t elems)
     {
+        ensureInit();
         status = cast(Status)cuMemAlloc(&raw,elems * T.sizeof);
         checkErrors();
         hostMemory = null;
     }
 
     this(T[] arr)
     {
+        ensureInit();
         status = cast(Status)cuMemAlloc(&raw,arr.length * T.sizeof);
         checkErrors();
         hostMemory = arr;

diff --git a/source/dcompute/driver/cuda/package.d b/source/dcompute/driver/cuda/package.d
@@ -15,6 +15,7 @@ public import dcompute.driver.cuda.platform;
 public import dcompute.driver.cuda.program;
 public import dcompute.driver.cuda.queue;
 public import dcompute.driver.cuda.unified_buffer;
+public import dcompute.driver.cuda.runtime;
 
 enum Copy
 {
@@ -42,3 +43,4 @@ private template ReplaceTemplate(alias needle, alias replacement) {
         }
     }
 }
+
diff --git a/source/dcompute/driver/cuda/program.d b/source/dcompute/driver/cuda/program.d
@@ -38,7 +38,58 @@ struct Program
         checkErrors();
         return ret;
     }
-
+
+    /**
+     * Load a program from a PTX global variable injected into the binary by LDC.
+     *
+     * Relies on LDC embedding the device PTX into the binary as
+     * `__dcompute_ptx_<arch>_<module>` globals. That support exists only when the
+     * D frontend `__VERSION__` is >= 2113 (LDC >= 1.43). NOTE: released LDC 1.42.0
+     * *and* pre-1.43 master both report `__VERSION__` == 2112, so 2113 is the
+     * lowest frontend version guaranteed to carry the PTX-embedding fix. On any
+     * older compiler the `else` overload below fails with a clear message on use.
+     */
+    static if (__VERSION__ >= 2113)
+    static Program fromModule(string moduleName)()
+    {
+        version      (DComputeCUDA_1200) enum _arch = "cuda1200";
+        else version (DComputeCUDA_900)  enum _arch = "cuda900";
+        else version (DComputeCUDA_800)  enum _arch = "cuda800";
+        else version (DComputeCUDA_750)  enum _arch = "cuda750";
+        else version (DComputeCUDA_700)  enum _arch = "cuda700";
+        else version (DComputeCUDA_600)  enum _arch = "cuda600";
+        else version (DComputeCUDA_500)  enum _arch = "cuda500";
+        else version (DComputeCUDA_300)  enum _arch = "cuda300";
+        else version (DComputeCUDA_210)  enum _arch = "cuda210";
+        else static assert(false,
+            "Add a DComputeCUDA_XXX version to your dub config " ~
+            "matching your --mdcompute-targets=cuda-XXX dflag. " ~
+            "Example: \"versions\": [\"DComputeCUDA_800\"]");
+
+        import std.array : replace;
+
+        enum mangledName = moduleName.replace(".", "_");
+        enum symbolName = "__dcompute_ptx_" ~ _arch ~ "_" ~ mangledName;
+
+        mixin("pragma(mangle, \"" ~ symbolName ~ "\") extern(C) extern __gshared const char " ~ symbolName ~ ";");
+
+        Program ret;
+        mixin("status = cast(Status)cuModuleLoadData(&ret.raw, &" ~ symbolName ~ ");");
+        checkErrors();
+        return ret;
+    }
+    else
+    static Program fromModule(string moduleName)()
+    {
+        static assert(false,
+            "Program.fromModule (used directly or via launch!) requires an LDC that " ~
+            "embeds device PTX into the binary as `__dcompute_ptx_*` globals: D " ~
+            "frontend __VERSION__ >= 2113 (LDC >= 1.43). This compiler is too old — " ~
+            "both released 1.42.0 and pre-1.43 master report __VERSION__ 2112. " ~
+            "Upgrade LDC to >= 1.43, or generate the .ptx file and load it via " ~
+            "Program.fromFile / Program.fromString.");
+    }
+
     __gshared static Program globalProgram;
     //cuModuleLoadDataEx
     //cuModuleLoadFatBinary

diff --git a/source/dcompute/driver/cuda/runtime.d b/source/dcompute/driver/cuda/runtime.d
@@ -0,0 +1,159 @@
+/**
+ * dcompute.driver.cuda.runtime
+ *
+ *  initialisation of the CUDA runtime:
+ *
+ *   shared static this()  — runs once at program startup (before main()), on
+ *                           the main thread.  Initialises Platform, Device,
+ *                           Context and the embedded Program.
+ *
+ *   static this()         — runs once per thread (including the main thread)
+ *                           when that thread starts.  Creates the thread-local
+ *                           Queue and pushes the shared Context onto the
+ *                           thread's CUDA context stack.
+ *
+ *
+ * ensureInit() is kept as a defensive fallback.  It is a no-op once the
+ * module constructors have run, so calling it is always safe and costs only
+ * a single bool check on the fast path.
+ */
+module dcompute.driver.cuda.runtime;
+
+import dcompute.driver.cuda;
+import std.experimental.allocator.mallocator : Mallocator;
+
+// Global state (shared across every thread)
+// __gshared: lives in a single memory location, accessible from all threads.
+private __gshared Device  _defaultDevice;
+private __gshared Context _defaultContext;
+private __gshared bool    _platformReady = false; // safety-net flag
+
+// Thread-local state
+// plain `static`: D gives each thread its own copy automatically.
+private static Queue _threadQueue;
+private static bool  _threadReady = false; // safety-net flag
+
+// Primary init: shared static constructor
+// D runtime calls this exactly once, before main(), on the main thread,
+// in module-dependency order.  No locking needed here.
+shared static this()
+{
+    _initPlatform();
+}
+
+// Per-thread init: thread-local static constructor
+// D runtime calls this automatically for every thread (including main) when
+// that thread begins.  For the main thread it runs after shared static this().
+static this()
+{
+    _initThread();
+}
+
+// Public API
+
+/**
+ * Defensive fallback: ensures the runtime is fully initialised for the
+ * calling thread.
+ *
+ * Under normal operation this is a no-op (both flags are already true before
+ * main() starts).  It guards against unusual call sites — e.g. Buffer
+ * constructed at global scope, or code reached before module constructors
+ * have finished in a pathological import order.
+ *
+ * Cost on the fast path: two bool reads, no locking.
+ */
+void ensureInit()
+{
+    if (!_platformReady) _initPlatform(); // global guard (synchronized inside)
+    if (!_threadReady)   _initThread();   // per-thread guard (no lock, TLS)
+}
+
+/**
+ * Returns the default Device (same object from every thread).
+ */
+Device defaultDevice()
+{
+    return _defaultDevice;
+}
+
+/**
+ * Returns the default Queue for the *calling* thread.
+ * Each thread gets its own independent Queue — no contention, no locking.
+ */
+Queue defaultQueue()
+{
+    return _threadQueue;
+}
+
+// Private implementation
+
+private void _initPlatform()
+{
+    // Double-checked locking: fast no-lock path after first init.
+    if (_platformReady) return;
+
+    synchronized
+    {
+        if (_platformReady) return;
+
+        Platform.initialise();
+        // Device 0 is the default.  Users needing a specific device can call
+        // Platform.getDevices() and manage their own Context + Queue.
+
+        // TODO : Make multi-device usage better and easy.
+        _defaultDevice = Platform.getDevices(Mallocator.instance)[0];
+
+        // cuCtxCreate creates the context AND pushes it onto the calling
+        // thread's CUDA context stack.
+        _defaultContext = Context(_defaultDevice);
+
+        // Compile-time PTX embedding
+        // LDC compiles @compute modules to PTX first, then compiles host code,
+        // so import() resolves in a single build pass with no file I/O at
+        // runtime.
+        // The version ladder is pure data — one line per SM level.  Adding a
+        // new target only requires one new `else version` line here plus the
+        // matching "DComputeCUDA_XXX" entry in dub.json.
+
+        _platformReady = true;
+    }
+}
+
+private void _initThread()
+{
+    if (_threadReady) return;
+
+    // The thread that ran _initPlatform() already has _defaultContext on its
+    // CUDA context stack (cuCtxCreate pushes automatically).
+    // Every other thread must push it explicitly before creating a stream.
+    if (Context.current.raw != _defaultContext.raw)
+        Context.push(_defaultContext);
+
+    _threadQueue = Queue(false); // non-async stream, bound to this thread
+    _threadReady = true;
+}
+
+/**
+ * Launch kernel `k` on the calling thread's default Queue using the globally
+ * loaded Program.  Platform, Device, Context, Queue and Program are all
+ * initialised lazily on the first call — the user needs no boilerplate.
+ *
+ * Example:
+ *   launch!saxpy([N,1,1], [1,1,1], b_res, alpha, b_x, b_y, N);
+ *
+ * Params:
+ *   grid      = Grid  dimensions [x, y, z].
+ *   block     = Block dimensions [x, y, z].
+ *   args      = Kernel arguments (host types, Buffer/UnifiedBuffer ).
+ */
+auto launch(alias k)(uint[3] grid, uint[3] block,
+                     HostArgsOf!(typeof(k)) args)
+{
+    if (Program.globalProgram.raw is null)
+    {
+        import std.traits : moduleName;
+        ensureInit();
+        Program.globalProgram = Program.fromModule!(moduleName!(__traits(parent, k)))();
+    }
+    defaultQueue().enqueue!k(grid, block)(args);
+}