diff --git a/dub.json b/dub.json
index 3154710..67ea560 100644
--- a/dub.json
+++ b/dub.json
@@ -17,15 +17,17 @@
         },
         {
             "name": "unittest",
-            "dflags" : ["-mdcompute-targets=cuda-210" ,"-oq"],
+            "dflags": ["-mdcompute-targets=cuda-800", "-oq"],
             "targetType": "executable",
-            "versions": ["DComputeTesting"],
+            "stringImportPaths": ["."],
+            "versions": ["DComputeTesting", "DComputeCUDA_800"],
         },
         {
             "name": "test-cuda",
             "dflags" : ["--mdcompute-targets=cuda-210", "-oq"],
             "targetType": "executable",
-            "versions": ["DComputeTestCUDA"],
+            "stringImportPaths": ["."],
+            "versions": ["DComputeTestCUDA", "DComputeCUDA_210"],
         },
         {
             "name": "test-ocl",
@@ -33,5 +35,5 @@
             "targetType": "executable",
             "versions": ["DComputeTestOpenCL"],
         },
-   ]
+    ]
 }
diff --git a/source/dcompute/driver/cuda/buffer.d b/source/dcompute/driver/cuda/buffer.d
index b1c7b13..9fb9c7c 100644
--- a/source/dcompute/driver/cuda/buffer.d
+++ b/source/dcompute/driver/cuda/buffer.d
@@ -11,6 +11,7 @@ struct Buffer(T)
 
     this(size_t elems)
     {
+        ensureInit();
         status = cast(Status)cuMemAlloc(&raw,elems * T.sizeof);
         checkErrors();
         hostMemory = null;
@@ -18,6 +19,7 @@ struct Buffer(T)
 
     this(T[] arr)
     {
+        ensureInit();
         status = cast(Status)cuMemAlloc(&raw,arr.length * T.sizeof);
         checkErrors();
         hostMemory = arr;
diff --git a/source/dcompute/driver/cuda/package.d b/source/dcompute/driver/cuda/package.d
index 6c9ac93..ab71724 100644
--- a/source/dcompute/driver/cuda/package.d
+++ b/source/dcompute/driver/cuda/package.d
@@ -15,6 +15,7 @@ public import dcompute.driver.cuda.platform;
 public import dcompute.driver.cuda.program;
 public import dcompute.driver.cuda.queue;
 public import dcompute.driver.cuda.unified_buffer;
+public import dcompute.driver.cuda.runtime;
 
 enum Copy
 {
@@ -42,3 +43,4 @@ private template ReplaceTemplate(alias needle, alias replacement) {
         }
     }
 }
+
diff --git a/source/dcompute/driver/cuda/program.d b/source/dcompute/driver/cuda/program.d
index 86d9ca0..f0c3e08 100644
--- a/source/dcompute/driver/cuda/program.d
+++ b/source/dcompute/driver/cuda/program.d
@@ -38,7 +38,58 @@ struct Program
         checkErrors();
         return ret;
     }
-    
+
+    /**
+     * Load a program from a PTX global variable injected into the binary by LDC.
+     *
+     * Relies on LDC embedding the device PTX into the binary as
+     * `__dcompute_ptx_<arch>_<module>` globals. That support exists only when the
+     * D frontend `__VERSION__` is >= 2113 (LDC >= 1.43). NOTE: released LDC 1.42.0
+     * *and* pre-1.43 master both report `__VERSION__` == 2112, so 2113 is the
+     * lowest frontend version guaranteed to carry the PTX-embedding fix. On any
+     * older compiler the `else` overload below fails with a clear message on use.
+     */
+    static if (__VERSION__ >= 2113)
+    static Program fromModule(string moduleName)()
+    {
+        version      (DComputeCUDA_1200) enum _arch = "cuda1200";
+        else version (DComputeCUDA_900)  enum _arch = "cuda900";
+        else version (DComputeCUDA_800)  enum _arch = "cuda800";
+        else version (DComputeCUDA_750)  enum _arch = "cuda750";
+        else version (DComputeCUDA_700)  enum _arch = "cuda700";
+        else version (DComputeCUDA_600)  enum _arch = "cuda600";
+        else version (DComputeCUDA_500)  enum _arch = "cuda500";
+        else version (DComputeCUDA_300)  enum _arch = "cuda300";
+        else version (DComputeCUDA_210)  enum _arch = "cuda210";
+        else static assert(false,
+            "Add a DComputeCUDA_XXX version to your dub config " ~
+            "matching your --mdcompute-targets=cuda-XXX dflag. " ~
+            "Example: \"versions\": [\"DComputeCUDA_800\"]");
+
+        import std.array : replace;
+        
+        enum mangledName = moduleName.replace(".", "_");
+        enum symbolName = "__dcompute_ptx_" ~ _arch ~ "_" ~ mangledName;
+
+        mixin("pragma(mangle, \"" ~ symbolName ~ "\") extern(C) extern __gshared const char " ~ symbolName ~ ";");
+
+        Program ret;
+        mixin("status = cast(Status)cuModuleLoadData(&ret.raw, &" ~ symbolName ~ ");");
+        checkErrors();
+        return ret;
+    }
+    else
+    static Program fromModule(string moduleName)()
+    {
+        static assert(false,
+            "Program.fromModule (used directly or via launch!) requires an LDC that " ~
+            "embeds device PTX into the binary as `__dcompute_ptx_*` globals: D " ~
+            "frontend __VERSION__ >= 2113 (LDC >= 1.43). This compiler is too old — " ~
+            "both released 1.42.0 and pre-1.43 master report __VERSION__ 2112. " ~
+            "Upgrade LDC to >= 1.43, or generate the .ptx file and load it via " ~
+            "Program.fromFile / Program.fromString.");
+    }
+
     __gshared static Program globalProgram;
     //cuModuleLoadDataEx
     //cuModuleLoadFatBinary
diff --git a/source/dcompute/driver/cuda/runtime.d b/source/dcompute/driver/cuda/runtime.d
new file mode 100644
index 0000000..9dce1b3
--- /dev/null
+++ b/source/dcompute/driver/cuda/runtime.d
@@ -0,0 +1,159 @@
+/**
+ * dcompute.driver.cuda.runtime
+ *
+ *  initialisation of the CUDA runtime:
+ *
+ *   shared static this()  — runs once at program startup (before main()), on
+ *                           the main thread.  Initialises Platform, Device,
+ *                           Context and the embedded Program.
+ *
+ *   static this()         — runs once per thread (including the main thread)
+ *                           when that thread starts.  Creates the thread-local
+ *                           Queue and pushes the shared Context onto the
+ *                           thread's CUDA context stack.
+ *
+ *
+ * ensureInit() is kept as a defensive fallback.  It is a no-op once the
+ * module constructors have run, so calling it is always safe and costs only
+ * a single bool check on the fast path.
+ */
+module dcompute.driver.cuda.runtime;
+
+import dcompute.driver.cuda;
+import std.experimental.allocator.mallocator : Mallocator;
+
+// Global state (shared across every thread)
+// __gshared: lives in a single memory location, accessible from all threads.
+private __gshared Device  _defaultDevice;
+private __gshared Context _defaultContext;
+private __gshared bool    _platformReady = false; // safety-net flag
+
+// Thread-local state
+// plain `static`: D gives each thread its own copy automatically.
+private static Queue _threadQueue;
+private static bool  _threadReady = false; // safety-net flag
+
+// Primary init: shared static constructor
+// D runtime calls this exactly once, before main(), on the main thread,
+// in module-dependency order.  No locking needed here.
+shared static this()
+{
+    _initPlatform();
+}
+
+// Per-thread init: thread-local static constructor
+// D runtime calls this automatically for every thread (including main) when
+// that thread begins.  For the main thread it runs after shared static this().
+static this()
+{
+    _initThread();
+}
+
+// Public API
+
+/**
+ * Defensive fallback: ensures the runtime is fully initialised for the
+ * calling thread.
+ *
+ * Under normal operation this is a no-op (both flags are already true before
+ * main() starts).  It guards against unusual call sites — e.g. Buffer
+ * constructed at global scope, or code reached before module constructors
+ * have finished in a pathological import order.
+ *
+ * Cost on the fast path: two bool reads, no locking.
+ */
+void ensureInit()
+{
+    if (!_platformReady) _initPlatform(); // global guard (synchronized inside)
+    if (!_threadReady)   _initThread();   // per-thread guard (no lock, TLS)
+}
+
+/**
+ * Returns the default Device (same object from every thread).
+ */
+Device defaultDevice()
+{
+    return _defaultDevice;
+}
+
+/**
+ * Returns the default Queue for the *calling* thread.
+ * Each thread gets its own independent Queue — no contention, no locking.
+ */
+Queue defaultQueue()
+{
+    return _threadQueue;
+}
+
+// Private implementation
+
+private void _initPlatform()
+{
+    // Double-checked locking: fast no-lock path after first init.
+    if (_platformReady) return;
+
+    synchronized
+    {
+        if (_platformReady) return;
+
+        Platform.initialise();
+        // Device 0 is the default.  Users needing a specific device can call
+        // Platform.getDevices() and manage their own Context + Queue.
+        
+        // TODO : Make multi-device usage better and easy.
+        _defaultDevice = Platform.getDevices(Mallocator.instance)[0];
+
+        // cuCtxCreate creates the context AND pushes it onto the calling
+        // thread's CUDA context stack.
+        _defaultContext = Context(_defaultDevice);
+
+        // Compile-time PTX embedding
+        // LDC compiles @compute modules to PTX first, then compiles host code,
+        // so import() resolves in a single build pass with no file I/O at
+        // runtime.
+        // The version ladder is pure data — one line per SM level.  Adding a
+        // new target only requires one new `else version` line here plus the
+        // matching "DComputeCUDA_XXX" entry in dub.json.
+
+        _platformReady = true;
+    }
+}
+
+private void _initThread()
+{
+    if (_threadReady) return;
+
+    // The thread that ran _initPlatform() already has _defaultContext on its
+    // CUDA context stack (cuCtxCreate pushes automatically).
+    // Every other thread must push it explicitly before creating a stream.
+    if (Context.current.raw != _defaultContext.raw)
+        Context.push(_defaultContext);
+
+    _threadQueue = Queue(false); // non-async stream, bound to this thread
+    _threadReady = true;
+}
+
+/**
+ * Launch kernel `k` on the calling thread's default Queue using the globally
+ * loaded Program.  Platform, Device, Context, Queue and Program are all
+ * initialised lazily on the first call — the user needs no boilerplate.
+ *
+ * Example:
+ *   launch!saxpy([N,1,1], [1,1,1], b_res, alpha, b_x, b_y, N);
+ *
+ * Params:
+ *   grid      = Grid  dimensions [x, y, z].
+ *   block     = Block dimensions [x, y, z].
+ *   args      = Kernel arguments (host types, Buffer/UnifiedBuffer ).
+ */
+auto launch(alias k)(uint[3] grid, uint[3] block,
+                     HostArgsOf!(typeof(k)) args)
+{
+    if (Program.globalProgram.raw is null)
+    {
+        import std.traits : moduleName;
+        ensureInit();
+        Program.globalProgram = Program.fromModule!(moduleName!(__traits(parent, k)))();
+    }
+    defaultQueue().enqueue!k(grid, block)(args);
+}
diff --git a/source/dcompute/tests/main.d b/source/dcompute/tests/main.d
index bc1a918..271c213 100644
--- a/source/dcompute/tests/main.d
+++ b/source/dcompute/tests/main.d
@@ -1,3 +1,5 @@
+module dcompute.tests.main;
+
 version (DComputeTesting) {
     version = DComputeTestCUDA;
 }
@@ -112,61 +114,88 @@ int main(string[] args)
 
     version(DComputeTestCUDA)
     {
-        Platform.initialise();
-	
-        auto devs = Platform.getDevices(theAllocator);
-        auto dev   = devs[0]; 
-        auto ctx   = Context(dev); scope(exit) ctx.detach();
-
-        // Change the file to match your GPU.
-        version (Windows) {
-            Program.globalProgram = Program.fromFile("./kernels_cuda210_64.ptx");
-        } else {
-            Program.globalProgram = Program.fromFile("./kernels_cuda800_64.ptx");
-        }
-        auto q = Queue(false);
-
-        Buffer!(float) b_res, b_x, b_y;
-        b_res =  Buffer!(float)(res[]); scope(exit) b_res.release();
-        b_x   =  Buffer!(float)(x[]);   scope(exit) b_x.release();
-        b_y   =  Buffer!(float)(y[]);   scope(exit) b_y.release();
-
-        b_x.copy!(Copy.hostToDevice);
-        b_y.copy!(Copy.hostToDevice);
+        // The injected-PTX feature (Program.fromModule / launch!) needs an LDC that
+        // embeds device PTX into the binary: D frontend __VERSION__ >= 2113 (LDC >= 1.43).
+        // On older compilers these tests are skipped rather than failing to build.
+        static if (__VERSION__ >= 2113)
+        {
+        // 1. Manual test
+        {
+            Platform.initialise();
+            auto devs = Platform.getDevices(theAllocator);
+            auto dev   = devs[0]; 
+            auto ctx   = Context(dev); scope(exit) ctx.detach();
 
-        q.enqueue!(saxpy)
-                  ([N,1,1],[1,1,1])
-                  (b_res,alpha,b_x,b_y, N);
-        b_res.copy!(Copy.deviceToHost);
+            Program.globalProgram = Program.fromModule!("dcompute.tests.dummykernels")();
+            auto q = Queue(false);
 
-        // --- Unified Memory test (runs only when the device supports it) ---
-        if (dev.supportsUnifiedMemory)
-        {
-            writeln("\nDevice supports Unified Memory — running UnifiedBuffer test...");
+            Buffer!(float) b_res, b_x, b_y;
+            b_res =  Buffer!(float)(res[]); scope(exit) b_res.release();
+            b_x   =  Buffer!(float)(x[]);   scope(exit) b_x.release();
+            b_y   =  Buffer!(float)(y[]);   scope(exit) b_y.release();
 
-            // Allocate managed memory and initialise from host slices.
-            // No explicit H2D copy is needed; the runtime migrates pages.
-            auto ub_x   = UnifiedBuffer!float(x[]);   scope(exit) ub_x.release();
-            auto ub_y   = UnifiedBuffer!float(y[]);   scope(exit) ub_y.release();
-            auto ub_res = UnifiedBuffer!float(N);     scope(exit) ub_res.release();
+            b_x.copy!(Copy.hostToDevice);
+            b_y.copy!(Copy.hostToDevice);
 
             q.enqueue!(saxpy)
                       ([N,1,1],[1,1,1])
-                      (ub_res, alpha, ub_x, ub_y, N);
-
-            // Synchronise so that host can safely read results.
-            // (No D2H copy — the host slice is the same allocation.)
-            Context.sync();
-
-            foreach (i; 0 .. N)
-                enforce(ub_res.hostSlice[i] == alpha * x[i] + y[i],
-                        "Unified Memory verification failed at index " ~ i.to!string ~ "!");
+                      (b_res,alpha,b_x,b_y, N);
+            b_res.copy!(Copy.deviceToHost);
+            
+            // Validate 
+            foreach(i; 0 .. N) enforce(res[i] == alpha * x[i] + y[i]);
+            res[] = 0.0f; // reset
+            Program.globalProgram.raw = null; // reset global state so lazy init triggers
+        }
 
-            writeln("UnifiedBuffer test PASSED.");
+        // 2. New test
+        {
+            Buffer!(float) b_res, b_x, b_y;
+            b_res =  Buffer!(float)(res[]); scope(exit) b_res.release();
+            b_x   =  Buffer!(float)(x[]);   scope(exit) b_x.release();
+            b_y   =  Buffer!(float)(y[]);   scope(exit) b_y.release();
+
+            b_x.copy!(Copy.hostToDevice);
+            b_y.copy!(Copy.hostToDevice);
+    
+            launch!saxpy([N,1,1],[1,1,1], b_res, alpha, b_x, b_y, N);
+            b_res.copy!(Copy.deviceToHost);
+
+            //  Unified Memory test (runs only when the device supports it) 
+            if (defaultDevice().supportsUnifiedMemory)
+            {
+                writeln("\nDevice supports Unified Memory — running UnifiedBuffer test...");
+
+                // Allocate managed memory and initialise from host slices.
+                // No explicit H2D copy is needed; the runtime migrates pages.
+                auto ub_x   = UnifiedBuffer!float(x[]);   scope(exit) ub_x.release();
+                auto ub_y   = UnifiedBuffer!float(y[]);   scope(exit) ub_y.release();
+                auto ub_res = UnifiedBuffer!float(N);     scope(exit) ub_res.release();
+
+                launch!saxpy([N,1,1],[1,1,1], ub_res, alpha, ub_x, ub_y, N);
+
+                // Synchronise so that host can safely read results.
+                // (No D2H copy — the host slice is the same allocation.)
+                Context.sync();
+
+                foreach (i; 0 .. N)
+                    enforce(ub_res.hostSlice[i] == alpha * x[i] + y[i],
+                            "Unified Memory verification failed at index " ~ i.to!string ~ "!");
+
+                writeln("UnifiedBuffer test PASSED.");
+            }
+            else
+            {
+                writeln("\nDevice does not support Unified Memory — skipping UnifiedBuffer test.");
+            }
+        }
         }
         else
         {
-            writeln("\nDevice does not support Unified Memory — skipping UnifiedBuffer test.");
+            writeln("DCompute injected-PTX tests (fromModule / launch!) require LDC >= 1.43 " ~
+                    "(D frontend __VERSION__ >= 2113); this compiler reports an older " ~
+                    "__VERSION__ — skipping the CUDA embedded-PTX tests.");
+            return 0;
         }
     }