From 0ed8290d06a42d1c0e47e8fa491fde483df6205c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Dec 2025 13:47:51 -0400 Subject: [PATCH 01/16] Add test for kernels with multiple shared buffers --- test/localmem.jl | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test/localmem.jl b/test/localmem.jl index 9a34d97c6..7894ee61e 100644 --- a/test/localmem.jl +++ b/test/localmem.jl @@ -47,9 +47,26 @@ end end end +@kernel function many_localmem(A) + N = @uniform prod(@groupsize()) + @uniform begin + N2 = prod(@groupsize()) + end + I = @index(Global, Linear) + i = @index(Local, Linear) + lmem1 = @localmem Int (N,) # Ok iff groupsize is static + lmem2 = @localmem Int (N,) # Ok iff groupsize is static + @inbounds begin + lmem1[i] = i-1 + lmem2[i] = 1 + @synchronize + A[I] = lmem1[N2 - i + 1] + lmem2[N2 - i + 1] + end +end + function localmem_testsuite(backend, ArrayT) @testset "kernels" begin - @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16)) + @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16), many_localmem(backend(), 16)) A = ArrayT{Int}(undef, 64) kernel!(A, ndrange = size(A)) synchronize(backend()) From e79898180ff38c5abe32e86b8fb811c4d168efc0 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 12 Dec 2025 17:44:37 -0400 Subject: [PATCH 02/16] KernelIntrinsics Tweaks --- src/KernelAbstractions.jl | 2 +- src/intrinsics.jl | 4 ++-- test/intrinsics.jl | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index cdb4dd960..3881da55c 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -833,7 +833,7 @@ include("macros.jl") ### function Scratchpad end -SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims) +SharedMemory(::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(T, dims) __synchronize() = KI.barrier() diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 1811ad2f8..79efa4cbe 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -103,14 +103,14 @@ Returns the unique group ID. function get_group_id end """ - localmemory(T, dims) + localmemory(::Type{T}, dims) Declare memory that is local to a workgroup. !!! note Backend implementations **must** implement: ``` - @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims} + @device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} ``` As well as the on-device functionality. """ diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 97548c478..d4952a2d4 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -89,7 +89,7 @@ function intrinsics_testsuite(backend, AT) @test KI.kernel_max_work_group_size(kernel) isa Int @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 - kernel(results, workgroupsize = 4, numworkgroups = 4) + kernel(results; workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) host_results = Array(results) From 93753bb3eb8efc967c815a8015f5419b1aa9bfa2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 17 Dec 2025 13:53:19 -0400 Subject: [PATCH 03/16] Fix temporary AK compat --- .buildkite/pipeline.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f1221aac5..d25fd5fba 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -109,8 +109,7 @@ steps: julia -e 'println("--- :julia: Developing oneAPI") using Pkg Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics") - Pkg.develop(; name="AcceleratedKernels")' - sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml + Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -141,8 +140,7 @@ steps: command: | julia -e 'println("--- :julia: Developing AMDGPU") using Pkg - Pkg.develop(; name="AcceleratedKernels")' - sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.9, 0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml + Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e ' using Pkg Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics") From 300d432b7c0c78395ca6ddd9e4a1beb87bfbd79a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 15:39:43 -0400 Subject: [PATCH 04/16] Improve KI tests --- test/intrinsics.jl | 61 +++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index d4952a2d4..2c713b5f0 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -1,21 +1,23 @@ import KernelAbstractions.KernelIntrinsics as KI +struct KernelData + global_size::Int + global_id::Int + local_size::Int + local_id::Int + num_groups::Int + group_id::Int +end function test_intrinsics_kernel(results) - # Test all intrinsics return NamedTuples with x, y, z fields - global_size = KI.get_global_size() - global_id = KI.get_global_id() - local_size = KI.get_local_size() - local_id = KI.get_local_id() - num_groups = KI.get_num_groups() - group_id = KI.get_group_id() - - if UInt32(global_id.x) <= UInt32(global_size.x) - results[1, global_id.x] = global_id.x - results[2, global_id.x] = local_id.x - results[3, global_id.x] = group_id.x - results[4, global_id.x] = global_size.x - results[5, global_id.x] = local_size.x - results[6, global_id.x] = num_groups.x + i = KI.get_global_id().x + + if i <= length(results) + @inbounds results[i] = KernelData(KI.get_global_size().x, + KI.get_global_id().x, + KI.get_local_size().x, + KI.get_local_id().x, + KI.get_num_groups().x, + KI.get_group_id().x) end return end @@ -82,41 +84,40 @@ function intrinsics_testsuite(backend, AT) @test KI.multiprocessor_count(backend()) isa Int # Test with small kernel - N = 16 - results = AT(zeros(Int, 6, N)) + workgroupsize = 4 + numworkgroups = 4 + N = workgroupsize * numworkgroups + results = AT(Vector{KernelData}(undef, N)) kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(kernel) isa Int @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 - kernel(results; workgroupsize = 4, numworkgroups = 4) + kernel(results; workgroupsize, numworkgroups) KernelAbstractions.synchronize(backend()) host_results = Array(results) # Verify results make sense - for i in 1:N - global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i] + for (i, k_data) in enumerate(host_results) # Global IDs should be 1-based and sequential - @test global_id_x == i + @test k_data.global_id == i # Global size should match our ndrange - @test global_size_x == N + @test k_data.global_size == N - # Local size should be 4 (our workgroupsize) - @test local_size_x == 4 + @test k_data.local_size == workgroupsize - # Number of groups should be ceil(N/4) = 4 - @test num_groups_x == 4 + @test k_data.num_groups == numworkgroups # Group ID should be 1-based - expected_group = div(i - 1, 4) + 1 - @test group_id_x == expected_group + expected_group = div(i - 1, numworkgroups) + 1 + @test k_data.group_id == expected_group # Local ID should be 1-based within group - expected_local = ((i - 1) % 4) + 1 - @test local_id_x == expected_local + expected_local = ((i - 1) % workgroupsize) + 1 + @test k_data.local_id == expected_local end end end From 15597e4d165f0330f8c99e0b92e8cf1a2e07a18a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 16:04:53 -0400 Subject: [PATCH 05/16] Initial subgroups support --- src/intrinsics.jl | 115 ++++++++++++++++++++++++++++++++++++++++++++- test/intrinsics.jl | 54 +++++++++++++++++++++ 2 files changed, 167 insertions(+), 2 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 79efa4cbe..5f5528c7b 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -102,6 +102,78 @@ Returns the unique group ID. """ function get_group_id end +""" + get_sub_group_size()::UInt32 + +Returns the number of work-items in the sub-group. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_size()::UInt32 + ``` +""" +function get_sub_group_size end + +""" + get_max_sub_group_size()::UInt32 + +Returns the maximum sub-group size for sub-groups in the current workgroup. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_max_sub_group_size()::UInt32 + ``` +""" +function get_max_sub_group_size end + +""" + get_num_sub_groups()::UInt32 + +Returns the number of sub-groups in the current workgroup. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_num_sub_groups()::UInt32 + ``` +""" +function get_num_sub_groups end + +""" + get_sub_group_id()::UInt32 + +Returns the sub-group ID within the work-group. + +!!! note + 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_id()::UInt32 + ``` +""" +function get_sub_group_id end + +""" + get_sub_group_local_id()::UInt32 + +Returns the work-item ID within the current sub-group. + +!!! note + 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_local_id()::UInt32 + ``` +""" +function get_sub_group_local_id end + + """ localmemory(::Type{T}, dims) @@ -139,6 +211,29 @@ function barrier() error("Group barrier used outside kernel or not captured") end +""" + sub_group_barrier() + +After a `sub_group_barrier()` call, all read and writes to global and local memory +from each thread in the sub-group are visible in from all other threads in the +sub-group. + +This does **not** guarantee that a write from a thread in a certain sub-group will +be visible to a thread in a different sub-group. + +!!! note + `sub_group_barrier()` must be encountered by all workitems of a sub-group executing the kernel or by none at all. + +!!! note + Backend implementations **must** implement: + ``` + @device_override sub_group_barrier() + ``` +""" +function sub_group_barrier() + error("Sub-group barrier used outside kernel or not captured") +end + """ _print(args...) @@ -174,7 +269,7 @@ kernel on the host. Backends must also implement the on-device kernel launch functionality. """ -struct Kernel{B, Kern} +struct Kernel{B,Kern} backend::B kern::Kern end @@ -220,6 +315,22 @@ kernel launch with too big a workgroup is attempted. """ function max_work_group_size end +""" + sub_group_size(backend)::Int + +Returns a reasonable sub-group size supported by the currently +active device for the specified backend. This would typically +be 32, or 64 for devices that don't support 32. + +!!! note + Backend implementations **must** implement: + ``` + sub_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +function sub_group_size end + """ multiprocessor_count(backend::NewBackend)::Int @@ -299,7 +410,7 @@ There are a few keyword arguments that influence the behavior of `KI.@kernel`: """ macro kernel(backend, ex...) call = ex[end] - kwargs = map(ex[1:(end - 1)]) do kwarg + kwargs = map(ex[1:(end-1)]) do kwarg if kwarg isa Symbol :($kwarg = $kwarg) elseif Meta.isexpr(kwarg, :(=)) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 2c713b5f0..ebeea4cbe 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -21,6 +21,25 @@ function test_intrinsics_kernel(results) end return end +struct SubgroupData + sub_group_size::UInt32 + max_sub_group_size::UInt32 + num_sub_groups::UInt32 + sub_group_id::UInt32 + sub_group_local_id::UInt32 +end +function test_subgroup_kernel(results) + i = KI.get_global_id().x + + if i <= length(results) + @inbounds results[i] = SubgroupData(KI.get_sub_group_size(), + KI.get_max_sub_group_size(), + KI.get_num_sub_groups(), + KI.get_sub_group_id(), + KI.get_sub_group_local_id()) + end + return +end function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @@ -120,6 +139,41 @@ function intrinsics_testsuite(backend, AT) @test k_data.local_id == expected_local end end + + @testset "Subgroups" begin + @test KI.sub_group_size(backend()) isa Int + + # Test with small kernel + sg_size = 32 + sg_n = 2 + workgroupsize = sg_size * sg_n + numworkgroups = 2 + N = workgroupsize * numworkgroups + + results = AT(Vector{SubgroupData}(undef, N)) + kernel = KI.@kernel backend() launch = false test_subgroup_kernel(results) + + kernel(results; workgroupsize, numworkgroups) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for (i, sg_data) in enumerate(host_results) + @test sg_data.sub_group_size == sg_size + @test sg_data.max_sub_group_size == sg_size + @test sg_data.num_sub_groups == sg_n + + # Group ID should be 1-based + div(((i - 1) % workgroupsize), sg_n) + 1 + expected_sub_group = div(((i - 1) % workgroupsize), sg_size) + 1 + @test sg_data.sub_group_id == expected_sub_group + + # Local ID should be 1-based within group + expected_sg_local = ((i - 1) % sg_size) + 1 + @test sg_data.sub_group_local_id == expected_sg_local + end + end end return nothing end From 28b588441e74645dc5b692ed95817b01776a3092 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 21:42:15 -0400 Subject: [PATCH 06/16] [TEMP] --- test/testsuite.jl | 72 +++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/test/testsuite.jl b/test/testsuite.jl index 31b801b3d..f24e85fb0 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -41,57 +41,57 @@ include("convert.jl") include("specialfunctions.jl") function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{String}()) - @conditional_testset "Unittests" skip_tests begin - unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) - end + # @conditional_testset "Unittests" skip_tests begin + # unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) + # end - @conditional_testset "SpecialFunctions" skip_tests begin - specialfunctions_testsuite(backend) - end + # @conditional_testset "SpecialFunctions" skip_tests begin + # specialfunctions_testsuite(backend) + # end @conditional_testset "Intrinsics" skip_tests begin intrinsics_testsuite(backend, AT) end - @conditional_testset "Localmem" skip_tests begin - localmem_testsuite(backend, AT) - end + # @conditional_testset "Localmem" skip_tests begin + # localmem_testsuite(backend, AT) + # end - @conditional_testset "Private" skip_tests begin - private_testsuite(backend, AT) - end + # @conditional_testset "Private" skip_tests begin + # private_testsuite(backend, AT) + # end - @conditional_testset "Unroll" skip_tests begin - unroll_testsuite(backend, AT) - end + # @conditional_testset "Unroll" skip_tests begin + # unroll_testsuite(backend, AT) + # end - @testset "NDIteration" begin - nditeration_testsuite() - end + # @testset "NDIteration" begin + # nditeration_testsuite() + # end - @conditional_testset "copyto!" skip_tests begin - copyto_testsuite(backend, AT) - end + # @conditional_testset "copyto!" skip_tests begin + # copyto_testsuite(backend, AT) + # end - @conditional_testset "Devices" skip_tests begin - devices_testsuite(backend) - end + # @conditional_testset "Devices" skip_tests begin + # devices_testsuite(backend) + # end - @conditional_testset "Printing" skip_tests begin - printing_testsuite(backend) - end + # @conditional_testset "Printing" skip_tests begin + # printing_testsuite(backend) + # end - @conditional_testset "Reflection" skip_tests begin - reflection_testsuite(backend, backend_str, AT) - end + # @conditional_testset "Reflection" skip_tests begin + # reflection_testsuite(backend, backend_str, AT) + # end - @conditional_testset "Convert" skip_tests begin - convert_testsuite(backend, AT) - end + # @conditional_testset "Convert" skip_tests begin + # convert_testsuite(backend, AT) + # end - @conditional_testset "Examples" skip_tests begin - examples_testsuite(backend, backend_str) - end + # @conditional_testset "Examples" skip_tests begin + # examples_testsuite(backend, backend_str) + # end return end From 6f09cac618fefc38362645b3a4b4a35352a31cbf Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 21:43:21 -0400 Subject: [PATCH 07/16] [Temp] CI --- .buildkite/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d25fd5fba..0149d9db5 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -13,7 +13,7 @@ steps: command: | julia -e 'println("--- :julia: Developing CUDA") using Pkg - Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")' + Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -76,7 +76,7 @@ steps: command: | julia -e 'println("--- :julia: Developing Metal") using Pkg - Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")' + Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 From 59e635964b814f36c699c4cd85156832a592fa04 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 22:34:11 -0400 Subject: [PATCH 08/16] Add oneAPI branch --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0149d9db5..28cdaa6ff 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -108,7 +108,7 @@ steps: command: | julia -e 'println("--- :julia: Developing oneAPI") using Pkg - Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew") Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e 'println("--- :julia: Instantiating project") using Pkg From 08a82d438fdc5e0e701736697564978449497af5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 22:38:17 -0400 Subject: [PATCH 09/16] Adjust test --- test/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index ebeea4cbe..8dcddfc74 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -144,7 +144,7 @@ function intrinsics_testsuite(backend, AT) @test KI.sub_group_size(backend()) isa Int # Test with small kernel - sg_size = 32 + sg_size = KI.sub_group_size(backend()) sg_n = 2 workgroupsize = sg_size * sg_n numworkgroups = 2 From bd4ad1c293f78bdeacdf71fc6b2a8c269ef2953e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 22:39:02 -0400 Subject: [PATCH 10/16] Add CI for not-yet-existing branches --- .buildkite/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 28cdaa6ff..793fb9871 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -143,7 +143,7 @@ steps: Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e ' using Pkg - Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew") println("--- :julia: Instantiating project") Pkg.develop(; path=pwd())' || exit 3 @@ -174,7 +174,7 @@ steps: command: | julia -e 'println("--- :julia: Developing OpenCL") using Pkg - Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew") Pkg.develop(; name="SPIRVIntrinsics")' julia -e 'println("--- :julia: Instantiating project") using Pkg From 9acda75f3f7a4e81800bd40cb71d488079fa50b1 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 2 Jan 2026 00:01:15 -0400 Subject: [PATCH 11/16] Revert "[TEMP]" This reverts commit 956dc2e46a28bc307bc9471ca6972bfc2374c38f. --- test/testsuite.jl | 72 +++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/test/testsuite.jl b/test/testsuite.jl index f24e85fb0..31b801b3d 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -41,57 +41,57 @@ include("convert.jl") include("specialfunctions.jl") function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{String}()) - # @conditional_testset "Unittests" skip_tests begin - # unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) - # end + @conditional_testset "Unittests" skip_tests begin + unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) + end - # @conditional_testset "SpecialFunctions" skip_tests begin - # specialfunctions_testsuite(backend) - # end + @conditional_testset "SpecialFunctions" skip_tests begin + specialfunctions_testsuite(backend) + end @conditional_testset "Intrinsics" skip_tests begin intrinsics_testsuite(backend, AT) end - # @conditional_testset "Localmem" skip_tests begin - # localmem_testsuite(backend, AT) - # end + @conditional_testset "Localmem" skip_tests begin + localmem_testsuite(backend, AT) + end - # @conditional_testset "Private" skip_tests begin - # private_testsuite(backend, AT) - # end + @conditional_testset "Private" skip_tests begin + private_testsuite(backend, AT) + end - # @conditional_testset "Unroll" skip_tests begin - # unroll_testsuite(backend, AT) - # end + @conditional_testset "Unroll" skip_tests begin + unroll_testsuite(backend, AT) + end - # @testset "NDIteration" begin - # nditeration_testsuite() - # end + @testset "NDIteration" begin + nditeration_testsuite() + end - # @conditional_testset "copyto!" skip_tests begin - # copyto_testsuite(backend, AT) - # end + @conditional_testset "copyto!" skip_tests begin + copyto_testsuite(backend, AT) + end - # @conditional_testset "Devices" skip_tests begin - # devices_testsuite(backend) - # end + @conditional_testset "Devices" skip_tests begin + devices_testsuite(backend) + end - # @conditional_testset "Printing" skip_tests begin - # printing_testsuite(backend) - # end + @conditional_testset "Printing" skip_tests begin + printing_testsuite(backend) + end - # @conditional_testset "Reflection" skip_tests begin - # reflection_testsuite(backend, backend_str, AT) - # end + @conditional_testset "Reflection" skip_tests begin + reflection_testsuite(backend, backend_str, AT) + end - # @conditional_testset "Convert" skip_tests begin - # convert_testsuite(backend, AT) - # end + @conditional_testset "Convert" skip_tests begin + convert_testsuite(backend, AT) + end - # @conditional_testset "Examples" skip_tests begin - # examples_testsuite(backend, backend_str) - # end + @conditional_testset "Examples" skip_tests begin + examples_testsuite(backend, backend_str) + end return end From 77c6e2061838fcb15598857522a93dc3c6a0052a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 21 Nov 2025 21:32:27 -0400 Subject: [PATCH 12/16] `shfl_down` intrinsics Co-Authored-By: Anton Smirnov --- src/intrinsics.jl | 26 ++++++++++++++++++++++++++ test/intrinsics.jl | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5f5528c7b..05f2995e1 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -188,6 +188,32 @@ Declare memory that is local to a workgroup. """ localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) +""" + shfl_down(val::T, offset::Integer)::T where T + +Read `val` from a lane with higher id given by `offset`. + +!!! note + Backend implementations **must** implement: + ``` + @device_override shfl_down(val::T, offset::Integer)::T where T + ``` + As well as the on-device functionality. +""" +function shfl_down end + +""" + shfl_down_types(::Backend)::Vector{DataType} + +Returns a vector of `DataType`s supported on `backend` + +!!! note + Backend implementations **must** implement this function + only if they support `shfl_down` for any types. +""" +shfl_down_types(::Backend) = DataType[] + + """ barrier() diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 8dcddfc74..aedc0fd18 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -41,6 +41,32 @@ function test_subgroup_kernel(results) return end +# Do NOT use this kernel as an example for your code. +# It was written assuming one workgroup of size 32 and +# is only valid for those +function shfl_down_test_kernel(a, b) + # This is not valid + idx = KI.get_local_id().x + + temp = KI.localmemory(eltype(b), 32) + temp[idx] = a[idx] + + KI.barrier() + + if idx == 1 + value = temp[idx] + + value = value + KI.shfl_down(value, 16) + value = value + KI.shfl_down(value, 8) + value = value + KI.shfl_down(value, 4) + value = value + KI.shfl_down(value, 2) + value = value + KI.shfl_down(value, 1) + + b[idx] = value + end + return +end + function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @testset "Launch parameters" begin @@ -174,6 +200,18 @@ function intrinsics_testsuite(backend, AT) @test sg_data.sub_group_local_id == expected_sg_local end end + @testset "shfl_down(::$T)" for T in KI.shfl_down_types(backend()) + a = zeros(T, 32) + rand!(a, (1:4)) + + dev_a = AT(a) + dev_b = AT(zeros(T, 32)) + + KI.@kernel backend() workgroupsize=32 shfl_down_test_kernel(dev_a, dev_b) + + b = Array(dev_b) + @test sum(a) ≈ b[1] + end end return nothing end From 9e5f96543a5d1e90b76c8153b53bbdc3dad682de Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 2 Jan 2026 00:20:06 -0400 Subject: [PATCH 13/16] Add note about need to synchronize --- src/intrinsics.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 05f2995e1..e8d0e7438 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -192,6 +192,8 @@ localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) shfl_down(val::T, offset::Integer)::T where T Read `val` from a lane with higher id given by `offset`. +When writing kernels using this function, it should be +assumed that it is not synchronized. !!! note Backend implementations **must** implement: From f8692becd598d7d25b81225f05ef1caf6d616ec4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 2 Jan 2026 00:20:23 -0400 Subject: [PATCH 14/16] Fixup --- test/intrinsics.jl | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index aedc0fd18..e15096c95 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -44,11 +44,11 @@ end # Do NOT use this kernel as an example for your code. # It was written assuming one workgroup of size 32 and # is only valid for those -function shfl_down_test_kernel(a, b) +function shfl_down_test_kernel(a, b, ::Val{N}) where N # This is not valid - idx = KI.get_local_id().x + idx = KI.get_sub_group_local_id() - temp = KI.localmemory(eltype(b), 32) + temp = KI.localmemory(eltype(b), N) temp[idx] = a[idx] KI.barrier() @@ -56,11 +56,24 @@ function shfl_down_test_kernel(a, b) if idx == 1 value = temp[idx] + if KI.get_sub_group_size() > 32 + value = value + KI.shfl_down(value, 32) + KI.sub_group_barrier() + end value = value + KI.shfl_down(value, 16) + KI.sub_group_barrier() + value = value + KI.shfl_down(value, 8) + KI.sub_group_barrier() + value = value + KI.shfl_down(value, 4) + KI.sub_group_barrier() + value = value + KI.shfl_down(value, 2) + KI.sub_group_barrier() + value = value + KI.shfl_down(value, 1) + KI.sub_group_barrier() b[idx] = value end @@ -201,13 +214,14 @@ function intrinsics_testsuite(backend, AT) end end @testset "shfl_down(::$T)" for T in KI.shfl_down_types(backend()) - a = zeros(T, 32) + N = KI.sub_group_size(backend()) + a = zeros(T, N) rand!(a, (1:4)) dev_a = AT(a) - dev_b = AT(zeros(T, 32)) + dev_b = AT(zeros(T, N)) - KI.@kernel backend() workgroupsize=32 shfl_down_test_kernel(dev_a, dev_b) + KI.@kernel backend() workgroupsize=N shfl_down_test_kernel(dev_a, dev_b, Val(N)) b = Array(dev_b) @test sum(a) ≈ b[1] From 84730d2147eb041a639fedd164434cb8e7243736 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 2 Jan 2026 01:19:02 -0400 Subject: [PATCH 15/16] Fix `shfl_down` test --- test/intrinsics.jl | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index e15096c95..c1da42ec2 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -41,41 +41,21 @@ function test_subgroup_kernel(results) return end -# Do NOT use this kernel as an example for your code. -# It was written assuming one workgroup of size 32 and -# is only valid for those function shfl_down_test_kernel(a, b, ::Val{N}) where N - # This is not valid idx = KI.get_sub_group_local_id() - temp = KI.localmemory(eltype(b), N) - temp[idx] = a[idx] + val = a[idx] - KI.barrier() - - if idx == 1 - value = temp[idx] - - if KI.get_sub_group_size() > 32 - value = value + KI.shfl_down(value, 32) - KI.sub_group_barrier() - end - value = value + KI.shfl_down(value, 16) - KI.sub_group_barrier() - - value = value + KI.shfl_down(value, 8) - KI.sub_group_barrier() - - value = value + KI.shfl_down(value, 4) - KI.sub_group_barrier() - - value = value + KI.shfl_down(value, 2) - KI.sub_group_barrier() + offset = 0x00000001 + while offset < N + val += KI.shfl_down(val, offset) + offset <<= 1 + end - value = value + KI.shfl_down(value, 1) - KI.sub_group_barrier() + KI.sub_group_barrier() - b[idx] = value + if idx == 1 + b[idx] = val end return end @@ -215,8 +195,9 @@ function intrinsics_testsuite(backend, AT) end @testset "shfl_down(::$T)" for T in KI.shfl_down_types(backend()) N = KI.sub_group_size(backend()) - a = zeros(T, N) - rand!(a, (1:4)) + a = ones(T, N) + # a = zeros(T, N) + # rand!(a, (1:4)) dev_a = AT(a) dev_b = AT(zeros(T, N)) From cdb07c34621afa4237eb950d1f72b1cb38f46c56 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 2 Jan 2026 19:47:02 -0400 Subject: [PATCH 16/16] 1.10 fix in docstring --- src/intrinsics.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index e8d0e7438..d462bcc0c 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -189,7 +189,7 @@ Declare memory that is local to a workgroup. localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) """ - shfl_down(val::T, offset::Integer)::T where T + shfl_down(val::T, offset::Integer) where T Read `val` from a lane with higher id given by `offset`. When writing kernels using this function, it should be @@ -198,7 +198,7 @@ assumed that it is not synchronized. !!! note Backend implementations **must** implement: ``` - @device_override shfl_down(val::T, offset::Integer)::T where T + @device_override shfl_down(val::T, offset::Integer) where T ``` As well as the on-device functionality. """