diff --git a/dub.json b/dub.json index 3154710..67ea560 100644 --- a/dub.json +++ b/dub.json @@ -17,15 +17,17 @@ }, { "name": "unittest", - "dflags" : ["-mdcompute-targets=cuda-210" ,"-oq"], + "dflags": ["-mdcompute-targets=cuda-800", "-oq"], "targetType": "executable", - "versions": ["DComputeTesting"], + "stringImportPaths": ["."], + "versions": ["DComputeTesting", "DComputeCUDA_800"], }, { "name": "test-cuda", "dflags" : ["--mdcompute-targets=cuda-210", "-oq"], "targetType": "executable", - "versions": ["DComputeTestCUDA"], + "stringImportPaths": ["."], + "versions": ["DComputeTestCUDA", "DComputeCUDA_210"], }, { "name": "test-ocl", @@ -33,5 +35,5 @@ "targetType": "executable", "versions": ["DComputeTestOpenCL"], }, - ] + ] } diff --git a/source/dcompute/driver/cuda/buffer.d b/source/dcompute/driver/cuda/buffer.d index b1c7b13..9fb9c7c 100644 --- a/source/dcompute/driver/cuda/buffer.d +++ b/source/dcompute/driver/cuda/buffer.d @@ -11,6 +11,7 @@ struct Buffer(T) this(size_t elems) { + ensureInit(); status = cast(Status)cuMemAlloc(&raw,elems * T.sizeof); checkErrors(); hostMemory = null; @@ -18,6 +19,7 @@ struct Buffer(T) this(T[] arr) { + ensureInit(); status = cast(Status)cuMemAlloc(&raw,arr.length * T.sizeof); checkErrors(); hostMemory = arr; diff --git a/source/dcompute/driver/cuda/package.d b/source/dcompute/driver/cuda/package.d index 6c9ac93..ab71724 100644 --- a/source/dcompute/driver/cuda/package.d +++ b/source/dcompute/driver/cuda/package.d @@ -15,6 +15,7 @@ public import dcompute.driver.cuda.platform; public import dcompute.driver.cuda.program; public import dcompute.driver.cuda.queue; public import dcompute.driver.cuda.unified_buffer; +public import dcompute.driver.cuda.runtime; enum Copy { @@ -42,3 +43,4 @@ private template ReplaceTemplate(alias needle, alias replacement) { } } } + diff --git a/source/dcompute/driver/cuda/program.d b/source/dcompute/driver/cuda/program.d index 86d9ca0..f0c3e08 100644 --- a/source/dcompute/driver/cuda/program.d +++ b/source/dcompute/driver/cuda/program.d @@ -38,7 +38,58 @@ struct Program checkErrors(); return ret; } - + + /** + * Load a program from a PTX global variable injected into the binary by LDC. + * + * Relies on LDC embedding the device PTX into the binary as + * `__dcompute_ptx__` globals. That support exists only when the + * D frontend `__VERSION__` is >= 2113 (LDC >= 1.43). NOTE: released LDC 1.42.0 + * *and* pre-1.43 master both report `__VERSION__` == 2112, so 2113 is the + * lowest frontend version guaranteed to carry the PTX-embedding fix. On any + * older compiler the `else` overload below fails with a clear message on use. + */ + static if (__VERSION__ >= 2113) + static Program fromModule(string moduleName)() + { + version (DComputeCUDA_1200) enum _arch = "cuda1200"; + else version (DComputeCUDA_900) enum _arch = "cuda900"; + else version (DComputeCUDA_800) enum _arch = "cuda800"; + else version (DComputeCUDA_750) enum _arch = "cuda750"; + else version (DComputeCUDA_700) enum _arch = "cuda700"; + else version (DComputeCUDA_600) enum _arch = "cuda600"; + else version (DComputeCUDA_500) enum _arch = "cuda500"; + else version (DComputeCUDA_300) enum _arch = "cuda300"; + else version (DComputeCUDA_210) enum _arch = "cuda210"; + else static assert(false, + "Add a DComputeCUDA_XXX version to your dub config " ~ + "matching your --mdcompute-targets=cuda-XXX dflag. " ~ + "Example: \"versions\": [\"DComputeCUDA_800\"]"); + + import std.array : replace; + + enum mangledName = moduleName.replace(".", "_"); + enum symbolName = "__dcompute_ptx_" ~ _arch ~ "_" ~ mangledName; + + mixin("pragma(mangle, \"" ~ symbolName ~ "\") extern(C) extern __gshared const char " ~ symbolName ~ ";"); + + Program ret; + mixin("status = cast(Status)cuModuleLoadData(&ret.raw, &" ~ symbolName ~ ");"); + checkErrors(); + return ret; + } + else + static Program fromModule(string moduleName)() + { + static assert(false, + "Program.fromModule (used directly or via launch!) requires an LDC that " ~ + "embeds device PTX into the binary as `__dcompute_ptx_*` globals: D " ~ + "frontend __VERSION__ >= 2113 (LDC >= 1.43). This compiler is too old — " ~ + "both released 1.42.0 and pre-1.43 master report __VERSION__ 2112. " ~ + "Upgrade LDC to >= 1.43, or generate the .ptx file and load it via " ~ + "Program.fromFile / Program.fromString."); + } + __gshared static Program globalProgram; //cuModuleLoadDataEx //cuModuleLoadFatBinary diff --git a/source/dcompute/driver/cuda/runtime.d b/source/dcompute/driver/cuda/runtime.d new file mode 100644 index 0000000..9dce1b3 --- /dev/null +++ b/source/dcompute/driver/cuda/runtime.d @@ -0,0 +1,159 @@ +/** + * dcompute.driver.cuda.runtime + * + * initialisation of the CUDA runtime: + * + * shared static this() — runs once at program startup (before main()), on + * the main thread. Initialises Platform, Device, + * Context and the embedded Program. + * + * static this() — runs once per thread (including the main thread) + * when that thread starts. Creates the thread-local + * Queue and pushes the shared Context onto the + * thread's CUDA context stack. + * + * + * ensureInit() is kept as a defensive fallback. It is a no-op once the + * module constructors have run, so calling it is always safe and costs only + * a single bool check on the fast path. + */ +module dcompute.driver.cuda.runtime; + +import dcompute.driver.cuda; +import std.experimental.allocator.mallocator : Mallocator; + +// Global state (shared across every thread) +// __gshared: lives in a single memory location, accessible from all threads. +private __gshared Device _defaultDevice; +private __gshared Context _defaultContext; +private __gshared bool _platformReady = false; // safety-net flag + +// Thread-local state +// plain `static`: D gives each thread its own copy automatically. +private static Queue _threadQueue; +private static bool _threadReady = false; // safety-net flag + +// Primary init: shared static constructor +// D runtime calls this exactly once, before main(), on the main thread, +// in module-dependency order. No locking needed here. +shared static this() +{ + _initPlatform(); +} + +// Per-thread init: thread-local static constructor +// D runtime calls this automatically for every thread (including main) when +// that thread begins. For the main thread it runs after shared static this(). +static this() +{ + _initThread(); +} + +// Public API + +/** + * Defensive fallback: ensures the runtime is fully initialised for the + * calling thread. + * + * Under normal operation this is a no-op (both flags are already true before + * main() starts). It guards against unusual call sites — e.g. Buffer + * constructed at global scope, or code reached before module constructors + * have finished in a pathological import order. + * + * Cost on the fast path: two bool reads, no locking. + */ +void ensureInit() +{ + if (!_platformReady) _initPlatform(); // global guard (synchronized inside) + if (!_threadReady) _initThread(); // per-thread guard (no lock, TLS) +} + +/** + * Returns the default Device (same object from every thread). + */ +Device defaultDevice() +{ + return _defaultDevice; +} + +/** + * Returns the default Queue for the *calling* thread. + * Each thread gets its own independent Queue — no contention, no locking. + */ +Queue defaultQueue() +{ + return _threadQueue; +} + +// Private implementation + +private void _initPlatform() +{ + // Double-checked locking: fast no-lock path after first init. + if (_platformReady) return; + + synchronized + { + if (_platformReady) return; + + Platform.initialise(); + // Device 0 is the default. Users needing a specific device can call + // Platform.getDevices() and manage their own Context + Queue. + + // TODO : Make multi-device usage better and easy. + _defaultDevice = Platform.getDevices(Mallocator.instance)[0]; + + // cuCtxCreate creates the context AND pushes it onto the calling + // thread's CUDA context stack. + _defaultContext = Context(_defaultDevice); + + // Compile-time PTX embedding + // LDC compiles @compute modules to PTX first, then compiles host code, + // so import() resolves in a single build pass with no file I/O at + // runtime. + // The version ladder is pure data — one line per SM level. Adding a + // new target only requires one new `else version` line here plus the + // matching "DComputeCUDA_XXX" entry in dub.json. + + _platformReady = true; + } +} + +private void _initThread() +{ + if (_threadReady) return; + + // The thread that ran _initPlatform() already has _defaultContext on its + // CUDA context stack (cuCtxCreate pushes automatically). + // Every other thread must push it explicitly before creating a stream. + if (Context.current.raw != _defaultContext.raw) + Context.push(_defaultContext); + + _threadQueue = Queue(false); // non-async stream, bound to this thread + _threadReady = true; +} + +/** + * Launch kernel `k` on the calling thread's default Queue using the globally + * loaded Program. Platform, Device, Context, Queue and Program are all + * initialised lazily on the first call — the user needs no boilerplate. + * + * Example: + * launch!saxpy([N,1,1], [1,1,1], b_res, alpha, b_x, b_y, N); + * + * Params: + * grid = Grid dimensions [x, y, z]. + * block = Block dimensions [x, y, z]. + * args = Kernel arguments (host types, Buffer/UnifiedBuffer ). + */ +auto launch(alias k)(uint[3] grid, uint[3] block, + HostArgsOf!(typeof(k)) args) +{ + if (Program.globalProgram.raw is null) + { + import std.traits : moduleName; + ensureInit(); + Program.globalProgram = Program.fromModule!(moduleName!(__traits(parent, k)))(); + } + defaultQueue().enqueue!k(grid, block)(args); +} diff --git a/source/dcompute/tests/main.d b/source/dcompute/tests/main.d index bc1a918..271c213 100644 --- a/source/dcompute/tests/main.d +++ b/source/dcompute/tests/main.d @@ -1,3 +1,5 @@ +module dcompute.tests.main; + version (DComputeTesting) { version = DComputeTestCUDA; } @@ -112,61 +114,88 @@ int main(string[] args) version(DComputeTestCUDA) { - Platform.initialise(); - - auto devs = Platform.getDevices(theAllocator); - auto dev = devs[0]; - auto ctx = Context(dev); scope(exit) ctx.detach(); - - // Change the file to match your GPU. - version (Windows) { - Program.globalProgram = Program.fromFile("./kernels_cuda210_64.ptx"); - } else { - Program.globalProgram = Program.fromFile("./kernels_cuda800_64.ptx"); - } - auto q = Queue(false); - - Buffer!(float) b_res, b_x, b_y; - b_res = Buffer!(float)(res[]); scope(exit) b_res.release(); - b_x = Buffer!(float)(x[]); scope(exit) b_x.release(); - b_y = Buffer!(float)(y[]); scope(exit) b_y.release(); - - b_x.copy!(Copy.hostToDevice); - b_y.copy!(Copy.hostToDevice); + // The injected-PTX feature (Program.fromModule / launch!) needs an LDC that + // embeds device PTX into the binary: D frontend __VERSION__ >= 2113 (LDC >= 1.43). + // On older compilers these tests are skipped rather than failing to build. + static if (__VERSION__ >= 2113) + { + // 1. Manual test + { + Platform.initialise(); + auto devs = Platform.getDevices(theAllocator); + auto dev = devs[0]; + auto ctx = Context(dev); scope(exit) ctx.detach(); - q.enqueue!(saxpy) - ([N,1,1],[1,1,1]) - (b_res,alpha,b_x,b_y, N); - b_res.copy!(Copy.deviceToHost); + Program.globalProgram = Program.fromModule!("dcompute.tests.dummykernels")(); + auto q = Queue(false); - // --- Unified Memory test (runs only when the device supports it) --- - if (dev.supportsUnifiedMemory) - { - writeln("\nDevice supports Unified Memory — running UnifiedBuffer test..."); + Buffer!(float) b_res, b_x, b_y; + b_res = Buffer!(float)(res[]); scope(exit) b_res.release(); + b_x = Buffer!(float)(x[]); scope(exit) b_x.release(); + b_y = Buffer!(float)(y[]); scope(exit) b_y.release(); - // Allocate managed memory and initialise from host slices. - // No explicit H2D copy is needed; the runtime migrates pages. - auto ub_x = UnifiedBuffer!float(x[]); scope(exit) ub_x.release(); - auto ub_y = UnifiedBuffer!float(y[]); scope(exit) ub_y.release(); - auto ub_res = UnifiedBuffer!float(N); scope(exit) ub_res.release(); + b_x.copy!(Copy.hostToDevice); + b_y.copy!(Copy.hostToDevice); q.enqueue!(saxpy) ([N,1,1],[1,1,1]) - (ub_res, alpha, ub_x, ub_y, N); - - // Synchronise so that host can safely read results. - // (No D2H copy — the host slice is the same allocation.) - Context.sync(); - - foreach (i; 0 .. N) - enforce(ub_res.hostSlice[i] == alpha * x[i] + y[i], - "Unified Memory verification failed at index " ~ i.to!string ~ "!"); + (b_res,alpha,b_x,b_y, N); + b_res.copy!(Copy.deviceToHost); + + // Validate + foreach(i; 0 .. N) enforce(res[i] == alpha * x[i] + y[i]); + res[] = 0.0f; // reset + Program.globalProgram.raw = null; // reset global state so lazy init triggers + } - writeln("UnifiedBuffer test PASSED."); + // 2. New test + { + Buffer!(float) b_res, b_x, b_y; + b_res = Buffer!(float)(res[]); scope(exit) b_res.release(); + b_x = Buffer!(float)(x[]); scope(exit) b_x.release(); + b_y = Buffer!(float)(y[]); scope(exit) b_y.release(); + + b_x.copy!(Copy.hostToDevice); + b_y.copy!(Copy.hostToDevice); + + launch!saxpy([N,1,1],[1,1,1], b_res, alpha, b_x, b_y, N); + b_res.copy!(Copy.deviceToHost); + + // Unified Memory test (runs only when the device supports it) + if (defaultDevice().supportsUnifiedMemory) + { + writeln("\nDevice supports Unified Memory — running UnifiedBuffer test..."); + + // Allocate managed memory and initialise from host slices. + // No explicit H2D copy is needed; the runtime migrates pages. + auto ub_x = UnifiedBuffer!float(x[]); scope(exit) ub_x.release(); + auto ub_y = UnifiedBuffer!float(y[]); scope(exit) ub_y.release(); + auto ub_res = UnifiedBuffer!float(N); scope(exit) ub_res.release(); + + launch!saxpy([N,1,1],[1,1,1], ub_res, alpha, ub_x, ub_y, N); + + // Synchronise so that host can safely read results. + // (No D2H copy — the host slice is the same allocation.) + Context.sync(); + + foreach (i; 0 .. N) + enforce(ub_res.hostSlice[i] == alpha * x[i] + y[i], + "Unified Memory verification failed at index " ~ i.to!string ~ "!"); + + writeln("UnifiedBuffer test PASSED."); + } + else + { + writeln("\nDevice does not support Unified Memory — skipping UnifiedBuffer test."); + } + } } else { - writeln("\nDevice does not support Unified Memory — skipping UnifiedBuffer test."); + writeln("DCompute injected-PTX tests (fromModule / launch!) require LDC >= 1.43 " ~ + "(D frontend __VERSION__ >= 2113); this compiler reports an older " ~ + "__VERSION__ — skipping the CUDA embedded-PTX tests."); + return 0; } }