From 8dd6b38b4a4936c00dccba659d1ee5154f785fec Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Wed, 4 Mar 2026 09:34:38 -0600
Subject: [PATCH 1/2] AcceleratedKernels accumulate and sort

---
Reviewer notes (placed below the "---" marker, so they are not part of the
commit message):
- Adds AcceleratedKernels as a dependency (compat "0.3.1, 0.4") and imports it
  in src/OpenCL.jl as AK.
- New src/accumulate.jl forwards Base.accumulate!, Base.accumulate, Base.cumsum
  and Base.cumprod on CLArray to their AK counterparts with OpenCLBackend() and
  a default block_size of _ACCUMULATE_BLOCK_SIZE (64).
- New src/sorting.jl forwards Base.sort!, Base.sortperm! and Base.sortperm on
  CLArray to AK; sort! and sortperm! return their first array argument.
- accumulate!/accumulate default to init = zero(eltype(A)) for every op.
  TODO(review): confirm this default is intended for non-additive operators
  (e.g. *, max, min).
- test/setup.jl now lists "acceleratedkernels" among the tests that require
  native execution; no test file of that name is part of this series.
  TODO(review): confirm it exists elsewhere.

 Project.toml      |  2 ++
 src/OpenCL.jl     |  4 ++++
 src/accumulate.jl | 17 +++++++++++++++++
 src/sorting.jl    |  3 +++
 test/setup.jl     |  2 +-
 5 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 src/accumulate.jl
 create mode 100644 src/sorting.jl

diff --git a/Project.toml b/Project.toml
index 828e36be..3eac341c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,6 +3,7 @@ uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2"
 version = "0.10.9"
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
@@ -25,6 +26,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 SPIRVIntrinsics = {path = "lib/intrinsics"}
 
 [compat]
+AcceleratedKernels = "0.3.1, 0.4"
 Adapt = "4"
 GPUArrays = "11.2.1"
 GPUCompiler = "1.7.1"
diff --git a/src/OpenCL.jl b/src/OpenCL.jl
index 7b2a52d9..812a76dd 100644
--- a/src/OpenCL.jl
+++ b/src/OpenCL.jl
@@ -10,6 +10,7 @@ using Random
 using Preferences
 
 import KernelAbstractions: KernelAbstractions
+import AcceleratedKernels as AK
 
 using Core: LLVMPtr
 
@@ -47,4 +48,7 @@ include("random.jl")
 include("OpenCLKernels.jl")
 import .OpenCLKernels: OpenCLBackend
 export OpenCLBackend
+
+include("sorting.jl")
+include("accumulate.jl")
 end
diff --git a/src/accumulate.jl b/src/accumulate.jl
new file mode 100644
index 00000000..2fa172b5
--- /dev/null
+++ b/src/accumulate.jl
@@ -0,0 +1,17 @@
+# Use a smaller block size to work around a scan correctness issue
+# with the Blelloch parallel prefix sum at larger block sizes (>=128).
+const _ACCUMULATE_BLOCK_SIZE = 64
+
+Base.accumulate!(op, B::CLArray, A::CLArray; init=zero(eltype(A)),
+                 block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+    AK.accumulate!(op, B, A, OpenCLBackend(); init, block_size, kwargs...)
+
+Base.accumulate(op, A::CLArray; init=zero(eltype(A)),
+                block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+    AK.accumulate(op, A, OpenCLBackend(); init, block_size, kwargs...)
+
+Base.cumsum(src::CLArray; block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+    AK.cumsum(src, OpenCLBackend(); block_size, kwargs...)
+
+Base.cumprod(src::CLArray; block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+    AK.cumprod(src, OpenCLBackend(); block_size, kwargs...)
diff --git a/src/sorting.jl b/src/sorting.jl
new file mode 100644
index 00000000..112ae104
--- /dev/null
+++ b/src/sorting.jl
@@ -0,0 +1,3 @@
+Base.sort!(x::CLArray; kwargs...) = (AK.sort!(x; kwargs...); return x)
+Base.sortperm!(ix::CLArray, x::CLArray; kwargs...) = (AK.sortperm!(ix, x; kwargs...); return ix)
+Base.sortperm(x::CLArray; kwargs...) = sortperm!(CLArray(1:length(x)), x; kwargs...)
diff --git a/test/setup.jl b/test/setup.jl
index 90337d36..d582a853 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -90,7 +90,7 @@ function runtests(f, name, platform_filter)
     end
 
     # some tests require native execution capabilities
-    requires_il = name in ["atomics", "execution", "intrinsics", "kernelabstractions"] ||
+    requires_il = name in ["acceleratedkernels", "atomics", "execution", "intrinsics", "kernelabstractions"] ||
         startswith(name, "gpuarrays/") || startswith(name, "device/")
 
     ex = quote

From 0a4fc04ad333be119a16de6d803a7bd828bb08a1 Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Wed, 4 Mar 2026 09:43:53 -0600
Subject: [PATCH 2/2] Format

---
Reviewer notes (placed below the "---" marker, so they are not part of the
commit message):
- Formatting-only change to src/accumulate.jl: the accumulate!/accumulate
  signatures are split across lines and keyword defaults gain spaces around
  "="; the forwarded AK calls are untouched context lines.

 src/accumulate.jl | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/accumulate.jl b/src/accumulate.jl
index 2fa172b5..e35a3ae9 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -2,16 +2,20 @@
 # with the Blelloch parallel prefix sum at larger block sizes (>=128).
 const _ACCUMULATE_BLOCK_SIZE = 64
 
-Base.accumulate!(op, B::CLArray, A::CLArray; init=zero(eltype(A)),
-                 block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+Base.accumulate!(
+    op, B::CLArray, A::CLArray; init = zero(eltype(A)),
+    block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...
+) =
     AK.accumulate!(op, B, A, OpenCLBackend(); init, block_size, kwargs...)
 
-Base.accumulate(op, A::CLArray; init=zero(eltype(A)),
-                block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+Base.accumulate(
+    op, A::CLArray; init = zero(eltype(A)),
+    block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...
+) =
     AK.accumulate(op, A, OpenCLBackend(); init, block_size, kwargs...)
 
-Base.cumsum(src::CLArray; block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+Base.cumsum(src::CLArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
     AK.cumsum(src, OpenCLBackend(); block_size, kwargs...)
 
-Base.cumprod(src::CLArray; block_size=_ACCUMULATE_BLOCK_SIZE, kwargs...) =
+Base.cumprod(src::CLArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
     AK.cumprod(src, OpenCLBackend(); block_size, kwargs...)