From 87dda3a0129161c2dfeb75c1860d9aaff5877a9a Mon Sep 17 00:00:00 2001 From: Asadbek Date: Thu, 11 Jun 2026 14:51:29 +0200 Subject: [PATCH 1/9] Draft of METAL driver API --- dub.json | 1 + source/dcompute/driver/metal/buffer.d | 40 +++++++++++ source/dcompute/driver/metal/device.d | 17 +++++ source/dcompute/driver/metal/kernel.d | 12 ++++ source/dcompute/driver/metal/package.d | 19 +++++ source/dcompute/driver/metal/platform.d | 29 ++++++++ source/dcompute/driver/metal/program.d | 86 ++++++++++++++++++++++ source/dcompute/driver/metal/queue.d | 94 +++++++++++++++++++++++++ source/dcompute/std/index.d | 7 ++ source/dcompute/std/metal/index.d | 13 ++++ 10 files changed, 318 insertions(+) create mode 100644 source/dcompute/driver/metal/buffer.d create mode 100644 source/dcompute/driver/metal/device.d create mode 100644 source/dcompute/driver/metal/kernel.d create mode 100644 source/dcompute/driver/metal/package.d create mode 100644 source/dcompute/driver/metal/platform.d create mode 100644 source/dcompute/driver/metal/program.d create mode 100644 source/dcompute/driver/metal/queue.d create mode 100644 source/dcompute/std/metal/index.d diff --git a/dub.json b/dub.json index 3154710..80c9b25 100644 --- a/dub.json +++ b/dub.json @@ -7,6 +7,7 @@ "dependencies": { "derelict-cl" : "~>3.2.0", "bindbc-cuda": "~>0.1.0", + "metal-d": "~>0.5.2", "taggedalgebraic": "~>0.10.7" }, "configurations": [ diff --git a/source/dcompute/driver/metal/buffer.d b/source/dcompute/driver/metal/buffer.d new file mode 100644 index 0000000..eaac8c9 --- /dev/null +++ b/source/dcompute/driver/metal/buffer.d @@ -0,0 +1,40 @@ +module dcompute.driver.metal.buffer; +import metal; +import dcompute.driver.metal.program; +import dcompute.driver.metal; +import core.stdc.string; + +struct Buffer(T) +{ + MTLBuffer mtlBuffer; + + // Host memory associated with this buffer + T[] hostMemory; + + this(T[] array) + { + auto device = Program.globalProgram.device; + + size_t sizeInBytes = array.length * T.sizeof; + + mtlBuffer = device.newBuffer(sizeInBytes); + + hostMemory = array; + + if (array.ptr !is null && sizeInBytes > 0) + { + memcpy(mtlBuffer.contents(), array.ptr, sizeInBytes); + } + } + + T* contents() + { + return cast(T*) mtlBuffer.contents(); + } + + void release() + { + mtlBuffer = null; + hostMemory = null; + } +} \ No newline at end of file diff --git a/source/dcompute/driver/metal/device.d b/source/dcompute/driver/metal/device.d new file mode 100644 index 0000000..91a3552 --- /dev/null +++ b/source/dcompute/driver/metal/device.d @@ -0,0 +1,17 @@ +module dcompute.driver.metal.device; +import metal; + +struct Device +{ + MTLDevice raw; + + this(MTLDevice device) + { + raw = device; + } + + MTLBuffer newBuffer(size_t sizeInBytes) + { + return raw.newBuffer(sizeInBytes, MTLResourceOptions.StorageModeShared); + } +} \ No newline at end of file diff --git a/source/dcompute/driver/metal/kernel.d b/source/dcompute/driver/metal/kernel.d new file mode 100644 index 0000000..a188588 --- /dev/null +++ b/source/dcompute/driver/metal/kernel.d @@ -0,0 +1,12 @@ +module dcompute.driver.metal.kernel; +import metal.computepipeline; + +struct Kernel(F) if (is(F==function) || is(F==void)) +{ + MTLComputePipelineState pipelineState; + string name; + this(MTLComputePipelineState ps) + { + pipelineState = ps; + } +} \ No newline at end of file diff --git a/source/dcompute/driver/metal/package.d b/source/dcompute/driver/metal/package.d new file mode 100644 index 0000000..816c4b1 --- /dev/null +++ b/source/dcompute/driver/metal/package.d @@ -0,0 +1,19 @@ +module dcompute.driver.metal; +import dcompute.driver.metal.buffer; +import ldc.dcompute; +import std.range; +import std.meta; +import std.traits; + +template HostArgsOf(F) +{ + template toBuffer(T) + { + static if (is(T: Pointer!(n,U), uint n, U)) + alias toBuffer = Buffer!U; + else + alias toBuffer = T; + } + + alias HostArgsOf = staticMap!(toBuffer, Parameters!F); +} diff --git a/source/dcompute/driver/metal/platform.d b/source/dcompute/driver/metal/platform.d new file mode 100644 index 0000000..22b4d68 --- /dev/null +++ b/source/dcompute/driver/metal/platform.d @@ -0,0 +1,29 @@ +module dcompute.driver.metal.platform; + +import dcompute.driver.metal.device; +import metal.device; + +struct Platform +{ + static void initialize() + { + + } + + // static Device[] getDevices() + // { + // auto hardwareDevice = MTLCreateSystemDefaultDevice(); + + // if (hardwareDevice is null) { + // return []; + // } + + // return [ Device(hardwareDevice) ]; + // } + + static Device getDefaultDevice() + { + auto device = Device(MTLCreateSystemDefaultDevice()); + return device; + } +} \ No newline at end of file diff --git a/source/dcompute/driver/metal/program.d b/source/dcompute/driver/metal/program.d new file mode 100644 index 0000000..04737b8 --- /dev/null +++ b/source/dcompute/driver/metal/program.d @@ -0,0 +1,86 @@ +module dcompute.driver.metal.program; +import dcompute.driver.metal.device; +import dcompute.driver.metal.kernel; +import objc; +import foundation; +import core.stdc.stdio; +import std.string; +import std.path; +import metal.library; +import metal.device; + +struct Program +{ + MTLLibrary metalLibrary; + + Device device; + + Kernel!void getKernelByName(immutable(char)* name) + { + auto kName = fromStringz(name); + + auto kNameInNSString = NSString.create(kName); + + auto kernelFunction = metalLibrary.newFunctionWithName(kNameInNSString); + + if (kernelFunction is null) + { + printf("Error: Could not find kernel function %s in library.\n", name); + assert(0); + } + + NSError error; + + auto pipelineState = device.raw.newComputePipelineStateWithFunction( + kernelFunction, + MTLPipelineOption.None, + null, + error + ); + + if (pipelineState is null) + { + printf("Error: Backend compilation failed: %s\n", error.localizedDescription().ptr); + assert(0); + } + + return Kernel!void(pipelineState); + } + + Kernel!(typeof(k)) getKernel(alias k)() + { + return cast(typeof(return)) getKernelByName(k.mangleof.ptr); + } + + static Program fromFile(Device device, string path) + { + NSError error; + auto nsPath = NSString.create(absolutePath(path)); + + auto library = device.raw.newLibrary(NSURL.fromPath(nsPath), error); + + if (library is null) + { + printf("Error loading .metallib: %s\n", error.localizedDescription().ptr); + assert(0); + } + + foreach(function_; library.functionNames) + { + auto functionName = cast(NSString) function_; + + auto r = functionName.ptr(); + + printf("kernel: %s\n", r); + } + + return Program(library, device); + } + + __gshared static Program globalProgram; + + void unload() + { + metalLibrary = null; + } +} \ No newline at end of file diff --git a/source/dcompute/driver/metal/queue.d b/source/dcompute/driver/metal/queue.d new file mode 100644 index 0000000..95023c3 --- /dev/null +++ b/source/dcompute/driver/metal/queue.d @@ -0,0 +1,94 @@ +module dcompute.driver.metal.queue; +import dcompute.driver.metal.buffer; + +import dcompute.driver.metal; +import dcompute.driver.metal.device; +import dcompute.driver.metal.program; +import metal; +import metal.argument; +import metal.types; + +struct Queue +{ + Device device; + MTLCommandQueue commandQueue; + MTLCommandBuffer lastActiveBuffer; + + this (Device _device /*bool async*/) + { + device = _device; + commandQueue = device.raw.newCommandQueue(); + } + + auto enqueue(alias k)(uint[3] _grid, uint[3] _block) + { + static struct Call + { + Queue q; + uint[3] grid, block; + + this(Queue _q, uint[3] _grid, uint[3] _block) + { + q = _q; + grid = _grid; + block = _block; + } + + void opCall(HostArgsOf!(typeof(k)) args) + { + auto kernel = Program.globalProgram.getKernel!k(); + + auto commandBuffer = q.commandQueue.commandBuffer(); + + auto computeEncoder = commandBuffer.computeCommandEncoder(); + + computeEncoder.setComputePipelineState(kernel.pipelineState); + + foreach (i, arg; args) + { + static if (is(typeof(arg): Buffer!U, U)) + { + computeEncoder.setBuffer(arg.mtlBuffer, 0, i); + } + else static if (__traits(isPOD, typeof(arg)) && !is(typeof(arg) == class)) + { + computeEncoder.setBytes(&val, typeof(val).sizeof, i); + } + else + { + static assert(0, "Unsupported argument type for Metal kernel dispatch!"); + } + } + + // specify the grid size + auto threadgroupsPerGrid = MTLSize(grid[0], grid[1], grid[2]); + + // thread group size + auto threadsPerThreadgroup = MTLSize(block[0], block[1], block[2]); + + // dispatchThreads using compute encoder + computeEncoder.dispatchThreads(threadgroupsPerGrid, threadsPerThreadgroup); + + // commit the commandBuffer + computeEncoder.endEncoding(); + commandBuffer.commit(); + commandBuffer.waitUntilCompleted(); + + // q.lastActiveBuffer = commandBuffer; + } + } + + return Call(this, _grid, _block); + } + + void finish() { + import std.stdio; + printf("waiting until completed......\n"); + if (lastActiveBuffer !is null) { + lastActiveBuffer.waitUntilCompleted(); + lastActiveBuffer = null; + + printf("kernel call completed......\n"); + } + } +} \ No newline at end of file diff --git a/source/dcompute/std/index.d b/source/dcompute/std/index.d index 60abf6c..60af91f 100644 --- a/source/dcompute/std/index.d +++ b/source/dcompute/std/index.d @@ -4,6 +4,7 @@ import ldc.dcompute; private import ocl = dcompute.std.opencl.index; private import cuda = dcompute.std.cuda.index; +private import metal = dcompute.std.metal.index; /* Index Terminology @@ -80,6 +81,8 @@ struct GlobalIndex return ocl.get_global_id(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_x()*cuda.ntid_x() + cuda.tid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.thread_position_in_grid_x(0); else assert(0); } @@ -90,6 +93,8 @@ struct GlobalIndex return ocl.get_global_id(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_y()*cuda.ntid_y() + cuda.tid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.thread_position_in_grid_y(1); else assert(0); } @@ -100,6 +105,8 @@ struct GlobalIndex return ocl.get_global_id(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_z()*cuda.ntid_z() + cuda.tid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.thread_position_in_grid_z(2); else assert(0); } diff --git a/source/dcompute/std/metal/index.d b/source/dcompute/std/metal/index.d new file mode 100644 index 0000000..9ea1a13 --- /dev/null +++ b/source/dcompute/std/metal/index.d @@ -0,0 +1,13 @@ +@compute(CompileFor.deviceOnly) module dcompute.std.metal.index; + +import ldc.dcompute; +pure: nothrow: @nogc: +//tid = threadId +pragma(LDC_intrinsic, "air.get_global_id.i32") +uint thread_position_in_grid_x(uint dim); + +pragma(LDC_intrinsic, "air.get_global_id.i32") +uint thread_position_in_grid_y(uint dim); + +pragma(LDC_intrinsic, "air.get_global_id.i32") +uint thread_position_in_grid_z(uint dim); From ee435baad47897d12b520025d43c44ccb9d3524e Mon Sep 17 00:00:00 2001 From: Asadbek Date: Sat, 13 Jun 2026 19:11:19 +0200 Subject: [PATCH 2/9] Fix issue with encoding scalar values and remove unnecessary comments --- source/dcompute/driver/metal/kernel.d | 2 +- source/dcompute/driver/metal/queue.d | 27 ++++++++++----------------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/source/dcompute/driver/metal/kernel.d b/source/dcompute/driver/metal/kernel.d index a188588..58a736d 100644 --- a/source/dcompute/driver/metal/kernel.d +++ b/source/dcompute/driver/metal/kernel.d @@ -4,7 +4,7 @@ import metal.computepipeline; struct Kernel(F) if (is(F==function) || is(F==void)) { MTLComputePipelineState pipelineState; - string name; + this(MTLComputePipelineState ps) { pipelineState = ps; diff --git a/source/dcompute/driver/metal/queue.d b/source/dcompute/driver/metal/queue.d index 95023c3..c4b341c 100644 --- a/source/dcompute/driver/metal/queue.d +++ b/source/dcompute/driver/metal/queue.d @@ -14,6 +14,7 @@ struct Queue MTLCommandQueue commandQueue; MTLCommandBuffer lastActiveBuffer; + // TODO(asadbek): explore options to make the use of async execution with events this (Device _device /*bool async*/) { device = _device; @@ -24,10 +25,10 @@ struct Queue { static struct Call { - Queue q; + Queue* q; uint[3] grid, block; - this(Queue _q, uint[3] _grid, uint[3] _block) + this(Queue* _q, uint[3] _grid, uint[3] _block) { q = _q; grid = _grid; @@ -49,10 +50,9 @@ struct Queue static if (is(typeof(arg): Buffer!U, U)) { computeEncoder.setBuffer(arg.mtlBuffer, 0, i); - } - else static if (__traits(isPOD, typeof(arg)) && !is(typeof(arg) == class)) + } else static if (__traits(isScalar, typeof(arg))) { - computeEncoder.setBytes(&val, typeof(val).sizeof, i); + computeEncoder.setBytes(&arg, typeof(arg).sizeof, i); } else { @@ -60,35 +60,28 @@ struct Queue } } - // specify the grid size auto threadgroupsPerGrid = MTLSize(grid[0], grid[1], grid[2]); - // thread group size auto threadsPerThreadgroup = MTLSize(block[0], block[1], block[2]); - // dispatchThreads using compute encoder computeEncoder.dispatchThreads(threadgroupsPerGrid, threadsPerThreadgroup); - // commit the commandBuffer computeEncoder.endEncoding(); commandBuffer.commit(); - commandBuffer.waitUntilCompleted(); - // q.lastActiveBuffer = commandBuffer; + q.lastActiveBuffer = commandBuffer; } } - return Call(this, _grid, _block); + return Call(&this, _grid, _block); } void finish() { - import std.stdio; - printf("waiting until completed......\n"); - if (lastActiveBuffer !is null) { + if (lastActiveBuffer !is null) { lastActiveBuffer.waitUntilCompleted(); - lastActiveBuffer = null; + lastActiveBuffer.release(); - printf("kernel call completed......\n"); + lastActiveBuffer = null; } } } \ No newline at end of file From 6967d3a16a74d3c2962a84172c2fac2dad60331a Mon Sep 17 00:00:00 2001 From: Asadbek Date: Sat, 13 Jun 2026 19:39:55 +0200 Subject: [PATCH 3/9] Fix getDevices method --- source/dcompute/driver/metal/device.d | 19 +++++++++++++++---- source/dcompute/driver/metal/platform.d | 11 +++++++++-- source/dcompute/driver/metal/program.d | 13 ++----------- source/dcompute/driver/metal/queue.d | 2 +- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/source/dcompute/driver/metal/device.d b/source/dcompute/driver/metal/device.d index 91a3552..98a1454 100644 --- a/source/dcompute/driver/metal/device.d +++ b/source/dcompute/driver/metal/device.d @@ -3,15 +3,26 @@ import metal; struct Device { - MTLDevice raw; - + /** + A pointer to $(D MTLDevice). It is $(D void*) because upon storing array of $(D Device), + linker look for the $(D MTLDevice) but fails to + find it as it is Objective-C binding hence had to wrap it as such + */ + void* raw; + + @property MTLDevice mtlDevice() + { + return cast(MTLDevice) raw; + } + + this(MTLDevice device) { - raw = device; + raw = cast(void*)device; } MTLBuffer newBuffer(size_t sizeInBytes) { - return raw.newBuffer(sizeInBytes, MTLResourceOptions.StorageModeShared); + return mtlDevice.newBuffer(sizeInBytes, MTLResourceOptions.StorageModeShared); } } \ No newline at end of file diff --git a/source/dcompute/driver/metal/platform.d b/source/dcompute/driver/metal/platform.d index 22b4d68..e30280f 100644 --- a/source/dcompute/driver/metal/platform.d +++ b/source/dcompute/driver/metal/platform.d @@ -5,11 +5,18 @@ import metal.device; struct Platform { - static void initialize() + static Device[] getDevices() { + auto mtlDevices = MTLCopyAllDevices(); + auto devices = new Device[mtlDevices.length]; - } + for(int i=0;i < mtlDevices.length;i ++) + { + devices[i] = Device(mtlDevices[i]); + } + return devices; + } // static Device[] getDevices() // { // auto hardwareDevice = MTLCreateSystemDefaultDevice(); diff --git a/source/dcompute/driver/metal/program.d b/source/dcompute/driver/metal/program.d index 04737b8..ddef97a 100644 --- a/source/dcompute/driver/metal/program.d +++ b/source/dcompute/driver/metal/program.d @@ -31,7 +31,7 @@ struct Program NSError error; - auto pipelineState = device.raw.newComputePipelineStateWithFunction( + auto pipelineState = device.mtlDevice.newComputePipelineStateWithFunction( kernelFunction, MTLPipelineOption.None, null, @@ -57,7 +57,7 @@ struct Program NSError error; auto nsPath = NSString.create(absolutePath(path)); - auto library = device.raw.newLibrary(NSURL.fromPath(nsPath), error); + auto library = device.mtlDevice.newLibrary(NSURL.fromPath(nsPath), error); if (library is null) { @@ -65,15 +65,6 @@ struct Program assert(0); } - foreach(function_; library.functionNames) - { - auto functionName = cast(NSString) function_; - - auto r = functionName.ptr(); - - printf("kernel: %s\n", r); - } - return Program(library, device); } diff --git a/source/dcompute/driver/metal/queue.d b/source/dcompute/driver/metal/queue.d index c4b341c..88d10ac 100644 --- a/source/dcompute/driver/metal/queue.d +++ b/source/dcompute/driver/metal/queue.d @@ -18,7 +18,7 @@ struct Queue this (Device _device /*bool async*/) { device = _device; - commandQueue = device.raw.newCommandQueue(); + commandQueue = device.mtlDevice.newCommandQueue(); } auto enqueue(alias k)(uint[3] _grid, uint[3] _block) From 48af07ad7a5117cf843e2935f538fc5c41ef61be Mon Sep 17 00:00:00 2001 From: Asadbek Date: Sat, 13 Jun 2026 19:41:09 +0200 Subject: [PATCH 4/9] Remove unneccessary comment --- source/dcompute/driver/metal/platform.d | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/source/dcompute/driver/metal/platform.d b/source/dcompute/driver/metal/platform.d index e30280f..ddff71f 100644 --- a/source/dcompute/driver/metal/platform.d +++ b/source/dcompute/driver/metal/platform.d @@ -17,17 +17,7 @@ struct Platform return devices; } - // static Device[] getDevices() - // { - // auto hardwareDevice = MTLCreateSystemDefaultDevice(); - - // if (hardwareDevice is null) { - // return []; - // } - - // return [ Device(hardwareDevice) ]; - // } - + static Device getDefaultDevice() { auto device = Device(MTLCreateSystemDefaultDevice()); From c8d279b7fa20b573f49b91276c5a8355fa4f365e Mon Sep 17 00:00:00 2001 From: Asadbek Date: Mon, 15 Jun 2026 16:48:38 +0200 Subject: [PATCH 5/9] Add newlines at the end of files and move logic for pipeline state to Queue as it fits better there and store MTLFunction in kernel metadata instead --- source/dcompute/driver/metal/buffer.d | 7 +------ source/dcompute/driver/metal/device.d | 14 +++++++++++++- source/dcompute/driver/metal/kernel.d | 10 +++++----- source/dcompute/driver/metal/platform.d | 2 +- source/dcompute/driver/metal/program.d | 19 ++----------------- source/dcompute/driver/metal/queue.d | 22 ++++++++++++++++++++-- 6 files changed, 42 insertions(+), 32 deletions(-) diff --git a/source/dcompute/driver/metal/buffer.d b/source/dcompute/driver/metal/buffer.d index eaac8c9..c09f87b 100644 --- a/source/dcompute/driver/metal/buffer.d +++ b/source/dcompute/driver/metal/buffer.d @@ -20,11 +20,6 @@ struct Buffer(T) mtlBuffer = device.newBuffer(sizeInBytes); hostMemory = array; - - if (array.ptr !is null && sizeInBytes > 0) - { - memcpy(mtlBuffer.contents(), array.ptr, sizeInBytes); - } } T* contents() @@ -37,4 +32,4 @@ struct Buffer(T) mtlBuffer = null; hostMemory = null; } -} \ No newline at end of file +} diff --git a/source/dcompute/driver/metal/device.d b/source/dcompute/driver/metal/device.d index 98a1454..2f183a6 100644 --- a/source/dcompute/driver/metal/device.d +++ b/source/dcompute/driver/metal/device.d @@ -1,4 +1,6 @@ module dcompute.driver.metal.device; +import dcompute.driver.metal.buffer; +import core.stdc.string; import metal; struct Device @@ -25,4 +27,14 @@ struct Device { return mtlDevice.newBuffer(sizeInBytes, MTLResourceOptions.StorageModeShared); } -} \ No newline at end of file + + void copy(T)(Buffer!T buffer) + { + size_t sizeInBytes = buffer.hostMemory.length * T.sizeof; + + if (buffer.hostMemory.ptr !is null && sizeInBytes > 0) + { + memcpy(buffer.mtlBuffer.contents(), buffer.hostMemory.ptr, sizeInBytes); + } + } +} diff --git a/source/dcompute/driver/metal/kernel.d b/source/dcompute/driver/metal/kernel.d index 58a736d..c0596dc 100644 --- a/source/dcompute/driver/metal/kernel.d +++ b/source/dcompute/driver/metal/kernel.d @@ -1,12 +1,12 @@ module dcompute.driver.metal.kernel; -import metal.computepipeline; +import metal.library; struct Kernel(F) if (is(F==function) || is(F==void)) { - MTLComputePipelineState pipelineState; + MTLFunction kernelFunction; - this(MTLComputePipelineState ps) + this(MTLFunction _kernelFunction) { - pipelineState = ps; + kernelFunction = _kernelFunction; } -} \ No newline at end of file +} diff --git a/source/dcompute/driver/metal/platform.d b/source/dcompute/driver/metal/platform.d index ddff71f..5caa7be 100644 --- a/source/dcompute/driver/metal/platform.d +++ b/source/dcompute/driver/metal/platform.d @@ -23,4 +23,4 @@ struct Platform auto device = Device(MTLCreateSystemDefaultDevice()); return device; } -} \ No newline at end of file +} diff --git a/source/dcompute/driver/metal/program.d b/source/dcompute/driver/metal/program.d index ddef97a..bf48d23 100644 --- a/source/dcompute/driver/metal/program.d +++ b/source/dcompute/driver/metal/program.d @@ -29,22 +29,7 @@ struct Program assert(0); } - NSError error; - - auto pipelineState = device.mtlDevice.newComputePipelineStateWithFunction( - kernelFunction, - MTLPipelineOption.None, - null, - error - ); - - if (pipelineState is null) - { - printf("Error: Backend compilation failed: %s\n", error.localizedDescription().ptr); - assert(0); - } - - return Kernel!void(pipelineState); + return Kernel!void(kernelFunction); } Kernel!(typeof(k)) getKernel(alias k)() @@ -74,4 +59,4 @@ struct Program { metalLibrary = null; } -} \ No newline at end of file +} diff --git a/source/dcompute/driver/metal/queue.d b/source/dcompute/driver/metal/queue.d index 88d10ac..d19d308 100644 --- a/source/dcompute/driver/metal/queue.d +++ b/source/dcompute/driver/metal/queue.d @@ -7,6 +7,9 @@ import dcompute.driver.metal.program; import metal; import metal.argument; import metal.types; +import core.stdc.stdio; +import objc; +import foundation; struct Queue { @@ -37,13 +40,28 @@ struct Queue void opCall(HostArgsOf!(typeof(k)) args) { + NSError error; + auto kernel = Program.globalProgram.getKernel!k(); + auto pipelineState = q.device.mtlDevice.newComputePipelineStateWithFunction( + kernel.kernelFunction, + MTLPipelineOption.None, + null, + error + ); + + if (pipelineState is null) + { + printf("Error: Backend compilation failed: %s\n", error.localizedDescription().ptr); + assert(0); + } + auto commandBuffer = q.commandQueue.commandBuffer(); auto computeEncoder = commandBuffer.computeCommandEncoder(); - computeEncoder.setComputePipelineState(kernel.pipelineState); + computeEncoder.setComputePipelineState(pipelineState); foreach (i, arg; args) { @@ -84,4 +102,4 @@ struct Queue lastActiveBuffer = null; } } -} \ No newline at end of file +} From 90de5818eb50c73a41f8fada9a0973bb4b52bd6e Mon Sep 17 00:00:00 2001 From: Asadbek Date: Mon, 15 Jun 2026 17:04:59 +0200 Subject: [PATCH 6/9] Add makeBuffer on Device --- source/dcompute/driver/metal/buffer.d | 9 ++------- source/dcompute/driver/metal/device.d | 9 +++++++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/source/dcompute/driver/metal/buffer.d b/source/dcompute/driver/metal/buffer.d index c09f87b..3700a97 100644 --- a/source/dcompute/driver/metal/buffer.d +++ b/source/dcompute/driver/metal/buffer.d @@ -11,14 +11,9 @@ struct Buffer(T) // Host memory associated with this buffer T[] hostMemory; - this(T[] array) + this(MTLBuffer _mtlBuffer, T[] array) { - auto device = Program.globalProgram.device; - - size_t sizeInBytes = array.length * T.sizeof; - - mtlBuffer = device.newBuffer(sizeInBytes); - + mtlBuffer = _mtlBuffer; hostMemory = array; } diff --git a/source/dcompute/driver/metal/device.d b/source/dcompute/driver/metal/device.d index 2f183a6..688776f 100644 --- a/source/dcompute/driver/metal/device.d +++ b/source/dcompute/driver/metal/device.d @@ -28,13 +28,18 @@ struct Device return mtlDevice.newBuffer(sizeInBytes, MTLResourceOptions.StorageModeShared); } - void copy(T)(Buffer!T buffer) + Buffer!T makeBuffer(T)(T[] hostMemory) { - size_t sizeInBytes = buffer.hostMemory.length * T.sizeof; + size_t sizeInBytes = hostMemory.length * T.sizeof; + + auto mtlBuffer = newBuffer(sizeInBytes); + auto buffer = Buffer!T(mtlBuffer, hostMemory); if (buffer.hostMemory.ptr !is null && sizeInBytes > 0) { memcpy(buffer.mtlBuffer.contents(), buffer.hostMemory.ptr, sizeInBytes); } + + return buffer; } } From 061b96b6f36de973106b1fc849b1e8848f907d73 Mon Sep 17 00:00:00 2001 From: Asadbek Date: Thu, 18 Jun 2026 23:27:39 +0200 Subject: [PATCH 7/9] Add more index intrinsics and add metal target test case --- dub.json | 6 ++++ source/dcompute/driver/metal/package.d | 9 +++++- source/dcompute/std/index.d | 36 +++++++++++++++++++-- source/dcompute/std/metal/index.d | 21 +++++++++---- source/dcompute/tests/dummykernels.d | 43 +++++++++++++++++--------- source/dcompute/tests/main.d | 42 +++++++++++++++++++++++++ 6 files changed, 132 insertions(+), 25 deletions(-) diff --git a/dub.json b/dub.json index 80c9b25..7f14bee 100644 --- a/dub.json +++ b/dub.json @@ -34,5 +34,11 @@ "targetType": "executable", "versions": ["DComputeTestOpenCL"], }, + { + "name" : "test-metal", + "dflags": ["-mdcompute-targets=metal-400", "-version=LDC_DCompute","-oq"], + "targetType": "executable", + "versions": ["DComputeTestMetal"], + }, ] } diff --git a/source/dcompute/driver/metal/package.d b/source/dcompute/driver/metal/package.d index 816c4b1..1775bb4 100644 --- a/source/dcompute/driver/metal/package.d +++ b/source/dcompute/driver/metal/package.d @@ -1,10 +1,17 @@ module dcompute.driver.metal; -import dcompute.driver.metal.buffer; import ldc.dcompute; import std.range; import std.meta; import std.traits; +public import dcompute.driver.metal.buffer; +public import dcompute.driver.metal.device; +public import dcompute.driver.metal.kernel; +public import dcompute.driver.metal.platform; +public import dcompute.driver.metal.program; +public import dcompute.driver.metal.queue; + + template HostArgsOf(F) { template toBuffer(T) diff --git a/source/dcompute/std/index.d b/source/dcompute/std/index.d index 60af91f..54aefa8 100644 --- a/source/dcompute/std/index.d +++ b/source/dcompute/std/index.d @@ -47,6 +47,8 @@ struct GlobalDimension return ocl.get_global_size(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_x()*cuda.nctaid_x(); + else if (__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.threads_per_grid(0); else assert(0); } @@ -57,6 +59,8 @@ struct GlobalDimension return ocl.get_global_size(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_y()*cuda.nctaid_y(); + else if (__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.threads_per_grid(1); else assert(0); } @@ -67,6 +71,8 @@ struct GlobalDimension return ocl.get_global_size(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_z()*cuda.nctaid_z(); + else if (__dcompute_reflect(ReflectTarget.Metal,0)) + return metal.threads_per_grid(2); else assert(0); } @@ -82,7 +88,7 @@ struct GlobalIndex else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_x()*cuda.ntid_x() + cuda.tid_x(); else if(__dcompute_reflect(ReflectTarget.Metal,0)) - return metal.thread_position_in_grid_x(0); + return metal.thread_position_in_grid(0); else assert(0); } @@ -94,7 +100,7 @@ struct GlobalIndex else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_y()*cuda.ntid_y() + cuda.tid_y(); else if(__dcompute_reflect(ReflectTarget.Metal,0)) - return metal.thread_position_in_grid_y(1); + return metal.thread_position_in_grid(1); else assert(0); } @@ -106,7 +112,7 @@ struct GlobalIndex else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_z()*cuda.ntid_z() + cuda.tid_z(); else if(__dcompute_reflect(ReflectTarget.Metal,0)) - return metal.thread_position_in_grid_z(2); + return metal.thread_position_in_grid(2); else assert(0); } @@ -146,6 +152,8 @@ struct GroupDimension return ocl.get_num_groups(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.nctaid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threadgroups_per_grid(0); else assert(0); } @@ -156,6 +164,8 @@ struct GroupDimension return ocl.get_num_groups(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.nctaid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threadgroups_per_grid(1); else assert(0); } @@ -166,6 +176,8 @@ struct GroupDimension return ocl.get_num_groups(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.nctaid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threadgroups_per_grid(2); else assert(0); } @@ -180,6 +192,8 @@ struct GroupIndex return ocl.get_group_id(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threadgroup_position_in_grid(0); else assert(0); } @@ -190,6 +204,8 @@ struct GroupIndex return ocl.get_group_id(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threadgroup_position_in_grid(1); else assert(0); } @@ -200,6 +216,8 @@ struct GroupIndex return ocl.get_group_id(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ctaid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threadgroup_position_in_grid(2); else assert(0); } @@ -214,6 +232,8 @@ struct SharedDimension return ocl.get_local_size(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threads_per_threadgroup(0); else assert(0); } @@ -224,6 +244,8 @@ struct SharedDimension return ocl.get_local_size(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threads_per_threadgroup(1); else assert(0); @@ -235,6 +257,8 @@ struct SharedDimension return ocl.get_local_size(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.ntid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.threads_per_threadgroup(2); else assert(0); } @@ -249,6 +273,8 @@ struct SharedIndex return ocl.get_local_id(0); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.tid_x(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.thread_position_in_threadgroup(0); else assert(0); } @@ -259,6 +285,8 @@ struct SharedIndex return ocl.get_local_id(1); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.tid_y(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.thread_position_in_threadgroup(1); else assert(0); } @@ -269,6 +297,8 @@ struct SharedIndex return ocl.get_local_id(2); else if(__dcompute_reflect(ReflectTarget.CUDA,0)) return cuda.tid_z(); + else if(__dcompute_reflect(ReflectTarget.Metal, 0)) + return metal.thread_position_in_threadgroup(2); else assert(0); } diff --git a/source/dcompute/std/metal/index.d b/source/dcompute/std/metal/index.d index 9ea1a13..fdbc86f 100644 --- a/source/dcompute/std/metal/index.d +++ b/source/dcompute/std/metal/index.d @@ -2,12 +2,21 @@ import ldc.dcompute; pure: nothrow: @nogc: -//tid = threadId -pragma(LDC_intrinsic, "air.get_global_id.i32") -uint thread_position_in_grid_x(uint dim); pragma(LDC_intrinsic, "air.get_global_id.i32") -uint thread_position_in_grid_y(uint dim); +uint thread_position_in_grid(uint dim); -pragma(LDC_intrinsic, "air.get_global_id.i32") -uint thread_position_in_grid_z(uint dim); +pragma(LDC_intrinsic, "air.get_local_id.i32") +uint thread_position_in_threadgroup(uint dim); + +pragma(LDC_intrinsic, "air.get_local_size.i32") +uint threads_per_threadgroup(uint dim); + +pragma(LDC_intrinsic, "air.get_group_id.i32") +uint threadgroup_position_in_grid(uint dim); + +pragma(LDC_intrinsic, "air.get_global_size.i32") +uint threads_per_grid(uint dim); + +pragma(LDC_intrinsic, "air.get_num_groups.i32") +uint threadgroups_per_grid(uint dim); \ No newline at end of file diff --git a/source/dcompute/tests/dummykernels.d b/source/dcompute/tests/dummykernels.d index ab37d3a..daff4cd 100644 --- a/source/dcompute/tests/dummykernels.d +++ b/source/dcompute/tests/dummykernels.d @@ -5,21 +5,34 @@ pragma(LDC_no_moduleinfo); import ldc.dcompute; import dcompute.std.index; -@kernel() void saxpy(GlobalPointer!(float) res, - float alpha,GlobalPointer!(float) x, - GlobalPointer!(float) y, - size_t N) -{ - auto i = GlobalIndex.x; - if (i >= N) return; - res[i] = alpha*x[i] + y[i]; -} +version(DComputeTestMetal){ + @kernel() void saxpy(GlobalPointer!(float) res, + float alpha,GlobalPointer!(float) x, + GlobalPointer!(float) y, + size_t N) + { + auto i = GlobalIndex.x; + if (i >= N) return; + res[i] = alpha*x[i] + y[i]; + } +} else { + @kernel() void saxpy(GlobalPointer!(float) res, + float alpha,GlobalPointer!(float) x, + GlobalPointer!(float) y, + size_t N) + { + auto i = GlobalIndex.x; + if (i >= N) return; + res[i] = alpha*x[i] + y[i]; + } + -alias aagf = AutoIndexed!(GlobalPointer!(float)); + alias aagf = AutoIndexed!(GlobalPointer!(float)); -@kernel() void auto_index_test(aagf a, - aagf b, - aagf c) -{ - a = b + c; + @kernel() void auto_index_test(aagf a, + aagf b, + aagf c) + { + a = b + c; + } } diff --git a/source/dcompute/tests/main.d b/source/dcompute/tests/main.d index bc1a918..de99fa8 100644 --- a/source/dcompute/tests/main.d +++ b/source/dcompute/tests/main.d @@ -23,6 +23,8 @@ version(DComputeTestOpenCL) import dcompute.driver.ocl; else version(DComputeTestCUDA) import dcompute.driver.cuda; +else version(DComputeTestMetal) + import dcompute.driver.metal; else static assert(false, "Need to test something!"); @@ -170,6 +172,46 @@ int main(string[] args) } } + version(DComputeTestMetal) + { + auto devices = Platform.getDevices(); + + auto device = devices[0]; + + if (device.raw is null) + { + "Failed to fetch default device".writeln; + return 1; + } + + auto program = Program.fromFile(device, "./kernels_metal400_64.metallib"); + + Program.globalProgram = program; + + if (program.metalLibrary is null) + { + "Failed to load .metallibrary".writeln; + return 2; + } + + auto deviceX = device.makeBuffer!float(x); + auto deviceY = device.makeBuffer!float(y); + auto deviceRes = device.makeBuffer!float(res); + + auto queue = Queue(device); + + queue.enqueue!(saxpy) + ([N,1,1],[256,1,1]) + (deviceRes, alpha, deviceX, deviceY, N); + + queue.finish(); + + // Copy data from device buffer to host + auto contents = deviceRes.contents(); + + res = contents[0 .. res.length]; + } + foreach(i; 0 .. N) enforce(res[i] == alpha * x[i] + y[i]); writeln(res[]); From 954408dbc38eaee40bc4ca4cd5f7223c2a9ced35 Mon Sep 17 00:00:00 2001 From: Asadbek Date: Thu, 18 Jun 2026 23:32:13 +0200 Subject: [PATCH 8/9] remove extra space --- source/dcompute/driver/metal/device.d | 1 - 1 file changed, 1 deletion(-) diff --git a/source/dcompute/driver/metal/device.d b/source/dcompute/driver/metal/device.d index 688776f..847bf58 100644 --- a/source/dcompute/driver/metal/device.d +++ b/source/dcompute/driver/metal/device.d @@ -17,7 +17,6 @@ struct Device return cast(MTLDevice) raw; } - this(MTLDevice device) { raw = cast(void*)device; From 58b830838af85fe5c5efff144780eb559eba5e5f Mon Sep 17 00:00:00 2001 From: Asadbek Date: Fri, 19 Jun 2026 00:14:41 +0200 Subject: [PATCH 9/9] update block dimension --- source/dcompute/tests/main.d | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dcompute/tests/main.d b/source/dcompute/tests/main.d index de99fa8..0f1e524 100644 --- a/source/dcompute/tests/main.d +++ b/source/dcompute/tests/main.d @@ -201,7 +201,7 @@ int main(string[] args) auto queue = Queue(device); queue.enqueue!(saxpy) - ([N,1,1],[256,1,1]) + ([N,1,1],[1,1,1]) (deviceRes, alpha, deviceX, deviceY, N); queue.finish();