# Patch summary (unified git diff; the text below has lost its original line
# breaks, so the newlines must be restored before it can be applied).
#
# Purpose: wire AcceleratedKernels (imported as `AK`) into the OpenCL package so
# that scans and sorts on `CLArray` are forwarded to AK.
#
# Files touched, in order of appearance:
#   * Project.toml      - adds AcceleratedKernels to [deps] and to [compat]
#                         ("0.3.1, 0.4").
#   * src/OpenCL.jl     - adds `import AcceleratedKernels as AK` and includes
#                         "sorting.jl" and "accumulate.jl" after the
#                         `export OpenCLBackend` line.
#   * src/accumulate.jl - new file. Defines `_ACCUMULATE_BLOCK_SIZE = 64` and
#                         methods of `Base.accumulate!`, `Base.accumulate`,
#                         `Base.cumsum` and `Base.cumprod` for `CLArray`; each
#                         forwards to the same-named `AK` function, passing
#                         `OpenCLBackend()` and a `block_size` keyword that
#                         defaults to that constant. The embedded comment gives
#                         the reason for 64: a scan correctness issue at block
#                         sizes >= 128.
#   * src/sorting.jl    - new file. `Base.sort!` and `Base.sortperm!` call the
#                         `AK` equivalents and return the mutated array (`x` and
#                         `ix` respectively); `Base.sortperm` builds
#                         `CLArray(1:length(x))` and passes it to `sortperm!`.
#   * test/setup.jl     - adds "acceleratedkernels" to the list of test names
#                         for which `requires_il` is set.
#
# NOTE(review): `accumulate!` / `accumulate` default `init` to `zero(eltype(A))`
# for every `op`. Whether that is the right starting value for operators other
# than `+` depends on how AK.accumulate uses `init` - confirm against the
# AcceleratedKernels documentation. `cumsum` / `cumprod` do not set `init` here.
diff --git a/Project.toml b/Project.toml index 828e36be..3eac341c 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2" version = "0.10.9" [deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" @@ -25,6 +26,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" SPIRVIntrinsics = {path = "lib/intrinsics"} [compat] +AcceleratedKernels = "0.3.1, 0.4" Adapt = "4" GPUArrays = "11.2.1" GPUCompiler = "1.7.1" diff --git a/src/OpenCL.jl b/src/OpenCL.jl index 7b2a52d9..812a76dd 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -10,6 +10,7 @@ using Random using Preferences import KernelAbstractions: KernelAbstractions +import AcceleratedKernels as AK using Core: LLVMPtr @@ -47,4 +48,7 @@ include("random.jl") include("OpenCLKernels.jl") import .OpenCLKernels: OpenCLBackend export OpenCLBackend + +include("sorting.jl") +include("accumulate.jl") end diff --git a/src/accumulate.jl b/src/accumulate.jl new file mode 100644 index 00000000..e35a3ae9 --- /dev/null +++ b/src/accumulate.jl @@ -0,0 +1,21 @@ +# Use a smaller block size to work around a scan correctness issue +# with the Blelloch parallel prefix sum at larger block sizes (>=128). +const _ACCUMULATE_BLOCK_SIZE = 64 + +Base.accumulate!( + op, B::CLArray, A::CLArray; init = zero(eltype(A)), + block_size = _ACCUMULATE_BLOCK_SIZE, kwargs... +) = + AK.accumulate!(op, B, A, OpenCLBackend(); init, block_size, kwargs...) + +Base.accumulate( + op, A::CLArray; init = zero(eltype(A)), + block_size = _ACCUMULATE_BLOCK_SIZE, kwargs... +) = + AK.accumulate(op, A, OpenCLBackend(); init, block_size, kwargs...) + +Base.cumsum(src::CLArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) = + AK.cumsum(src, OpenCLBackend(); block_size, kwargs...) + +Base.cumprod(src::CLArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) 
= + AK.cumprod(src, OpenCLBackend(); block_size, kwargs...) diff --git a/src/sorting.jl b/src/sorting.jl new file mode 100644 index 00000000..112ae104 --- /dev/null +++ b/src/sorting.jl @@ -0,0 +1,3 @@ +Base.sort!(x::CLArray; kwargs...) = (AK.sort!(x; kwargs...); return x) +Base.sortperm!(ix::CLArray, x::CLArray; kwargs...) = (AK.sortperm!(ix, x; kwargs...); return ix) +Base.sortperm(x::CLArray; kwargs...) = sortperm!(CLArray(1:length(x)), x; kwargs...) diff --git a/test/setup.jl b/test/setup.jl index 90337d36..d582a853 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -90,7 +90,7 @@ function runtests(f, name, platform_filter) end # some tests require native execution capabilities - requires_il = name in ["atomics", "execution", "intrinsics", "kernelabstractions"] || + requires_il = name in ["acceleratedkernels", "atomics", "execution", "intrinsics", "kernelabstractions"] || startswith(name, "gpuarrays/") || startswith(name, "device/") ex = quote