Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions dub.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,23 @@
},
{
"name": "unittest",
"dflags" : ["-mdcompute-targets=cuda-210" ,"-oq"],
"dflags": ["-mdcompute-targets=cuda-800", "-oq"],
"targetType": "executable",
"versions": ["DComputeTesting"],
"stringImportPaths": ["."],
"versions": ["DComputeTesting", "DComputeCUDA_800"],
},
{
"name": "test-cuda",
"dflags" : ["--mdcompute-targets=cuda-210", "-oq"],
"targetType": "executable",
"versions": ["DComputeTestCUDA"],
"stringImportPaths": ["."],
"versions": ["DComputeTestCUDA", "DComputeCUDA_210"],
},
{
"name": "test-ocl",
"dflags" : ["--mdcompute-targets=ocl-200", "-oq"],
"targetType": "executable",
"versions": ["DComputeTestOpenCL"],
},
]
]
}
2 changes: 2 additions & 0 deletions source/dcompute/driver/cuda/buffer.d
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ struct Buffer(T)

this(size_t elems)
{
ensureInit();

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are these calls needed here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer these not to be here so that the user does not need to rely only on the runtime module.

status = cast(Status)cuMemAlloc(&raw,elems * T.sizeof);
checkErrors();
hostMemory = null;
}

this(T[] arr)
{
ensureInit();

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

status = cast(Status)cuMemAlloc(&raw,arr.length * T.sizeof);
checkErrors();
hostMemory = arr;
Expand Down
2 changes: 2 additions & 0 deletions source/dcompute/driver/cuda/package.d
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ public import dcompute.driver.cuda.platform;
public import dcompute.driver.cuda.program;
public import dcompute.driver.cuda.queue;
public import dcompute.driver.cuda.unified_buffer;
public import dcompute.driver.cuda.runtime;

enum Copy
{
Expand Down Expand Up @@ -42,3 +43,4 @@ private template ReplaceTemplate(alias needle, alias replacement) {
}
}
}

53 changes: 52 additions & 1 deletion source/dcompute/driver/cuda/program.d
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,58 @@ struct Program
checkErrors();
return ret;
}


/**
* Load a program from a PTX global variable injected into the binary by LDC.
*
* Relies on LDC embedding the device PTX into the binary as
* `__dcompute_ptx_<arch>_<module>` globals. That support exists only when the
* D frontend `__VERSION__` is >= 2113 (LDC >= 1.43). NOTE: released LDC 1.42.0
* *and* pre-1.43 master both report `__VERSION__` == 2112, so 2113 is the
* lowest frontend version guaranteed to carry the PTX-embedding fix. On any
* older compiler the `else` overload below fails with a clear message on use.
*/
static if (__VERSION__ >= 2113)
static Program fromModule(string moduleName)()
{
version (DComputeCUDA_1200) enum _arch = "cuda1200";
else version (DComputeCUDA_900) enum _arch = "cuda900";
else version (DComputeCUDA_800) enum _arch = "cuda800";
else version (DComputeCUDA_750) enum _arch = "cuda750";
else version (DComputeCUDA_700) enum _arch = "cuda700";
else version (DComputeCUDA_600) enum _arch = "cuda600";
else version (DComputeCUDA_500) enum _arch = "cuda500";
else version (DComputeCUDA_300) enum _arch = "cuda300";
else version (DComputeCUDA_210) enum _arch = "cuda210";
else static assert(false,
"Add a DComputeCUDA_XXX version to your dub config " ~
"matching your --mdcompute-targets=cuda-XXX dflag. " ~
"Example: \"versions\": [\"DComputeCUDA_800\"]");

import std.array : replace;

enum mangledName = moduleName.replace(".", "_");
enum symbolName = "__dcompute_ptx_" ~ _arch ~ "_" ~ mangledName;

mixin("pragma(mangle, \"" ~ symbolName ~ "\") extern(C) extern __gshared const char " ~ symbolName ~ ";");

Program ret;
mixin("status = cast(Status)cuModuleLoadData(&ret.raw, &" ~ symbolName ~ ");");
checkErrors();
return ret;
}
else
static Program fromModule(string moduleName)()
{
static assert(false,
"Program.fromModule (used directly or via launch!) requires an LDC that " ~
"embeds device PTX into the binary as `__dcompute_ptx_*` globals: D " ~
"frontend __VERSION__ >= 2113 (LDC >= 1.43). This compiler is too old — " ~
"both released 1.42.0 and pre-1.43 master report __VERSION__ 2112. " ~
"Upgrade LDC to >= 1.43, or generate the .ptx file and load it via " ~
"Program.fromFile / Program.fromString.");
}

__gshared static Program globalProgram;
//cuModuleLoadDataEx
//cuModuleLoadFatBinary
Expand Down
159 changes: 159 additions & 0 deletions source/dcompute/driver/cuda/runtime.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/**
* dcompute.driver.cuda.runtime
*
* initialisation of the CUDA runtime:
*
* shared static this() — runs once at program startup (before main()), on
* the main thread. Initialises Platform, Device,
* Context and the embedded Program.
*
* static this() — runs once per thread (including the main thread)
* when that thread starts. Creates the thread-local
* Queue and pushes the shared Context onto the
* thread's CUDA context stack.
*
*
* ensureInit() is kept as a defensive fallback. It is a no-op once the
* module constructors have run, so calling it is always safe and costs only
* a single bool check on the fast path.
*/
module dcompute.driver.cuda.runtime;

import dcompute.driver.cuda;
import std.experimental.allocator.mallocator : Mallocator;

// Global state (shared across every thread)
// __gshared: lives in a single memory location, accessible from all threads.
private __gshared Device _defaultDevice;
private __gshared Context _defaultContext;
private __gshared bool _platformReady = false; // safety-net flag

// Thread-local state
// plain `static`: D gives each thread its own copy automatically.
private static Queue _threadQueue;
private static bool _threadReady = false; // safety-net flag

// Primary init: shared static constructor
// D runtime calls this exactly once, before main(), on the main thread,
// in module-dependency order. No locking needed here.
shared static this()
{
_initPlatform();
}

// Per-thread init: thread-local static constructor
// D runtime calls this automatically for every thread (including main) when
// that thread begins. For the main thread it runs after shared static this().
static this()
{
_initThread();
}

// Public API

/**
* Defensive fallback: ensures the runtime is fully initialised for the
* calling thread.
*
* Under normal operation this is a no-op (both flags are already true before
* main() starts). It guards against unusual call sites — e.g. Buffer
* constructed at global scope, or code reached before module constructors
* have finished in a pathological import order.
*
* Cost on the fast path: two bool reads, no locking.
*/
void ensureInit()
{
if (!_platformReady) _initPlatform(); // global guard (synchronized inside)
if (!_threadReady) _initThread(); // per-thread guard (no lock, TLS)
}

/**
* Returns the default Device (same object from every thread).
*/
Device defaultDevice()
{
return _defaultDevice;
}

/**
* Returns the default Queue for the *calling* thread.
* Each thread gets its own independent Queue — no contention, no locking.
*/
Queue defaultQueue()
{
return _threadQueue;
}

// Private implementation

private void _initPlatform()
{
// Double-checked locking: fast no-lock path after first init.
if (_platformReady) return;

synchronized
{
if (_platformReady) return;

Platform.initialise();
// Device 0 is the default. Users needing a specific device can call
// Platform.getDevices() and manage their own Context + Queue.

// TODO : Make multi-device usage better and easy.
_defaultDevice = Platform.getDevices(Mallocator.instance)[0];

// cuCtxCreate creates the context AND pushes it onto the calling
// thread's CUDA context stack.
_defaultContext = Context(_defaultDevice);

// Compile-time PTX embedding
// LDC compiles @compute modules to PTX first, then compiles host code,
// so import() resolves in a single build pass with no file I/O at
// runtime.
// The version ladder is pure data — one line per SM level. Adding a
// new target only requires one new `else version` line here plus the
// matching "DComputeCUDA_XXX" entry in dub.json.

_platformReady = true;
}
}

private void _initThread()
{
if (_threadReady) return;

// The thread that ran _initPlatform() already has _defaultContext on its
// CUDA context stack (cuCtxCreate pushes automatically).
// Every other thread must push it explicitly before creating a stream.
if (Context.current.raw != _defaultContext.raw)
Context.push(_defaultContext);

_threadQueue = Queue(false); // non-async stream, bound to this thread
_threadReady = true;
}

/**
* Launch kernel `k` on the calling thread's default Queue using the globally
* loaded Program. Platform, Device, Context, Queue and Program are all
* initialised lazily on the first call — the user needs no boilerplate.
*
* Example:
* launch!saxpy([N,1,1], [1,1,1], b_res, alpha, b_x, b_y, N);
*
* Params:
* grid = Grid dimensions [x, y, z].
* block = Block dimensions [x, y, z].
* args = Kernel arguments (host types, Buffer/UnifiedBuffer ).
*/
auto launch(alias k)(uint[3] grid, uint[3] block,
HostArgsOf!(typeof(k)) args)
{
if (Program.globalProgram.raw is null)
{
import std.traits : moduleName;
ensureInit();
Program.globalProgram = Program.fromModule!(moduleName!(__traits(parent, k)))();
}
defaultQueue().enqueue!k(grid, block)(args);
}
Loading