diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d25fd5fba..648c77c98 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -13,7 +13,7 @@ steps: command: | julia -e 'println("--- :julia: Developing CUDA") using Pkg - Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")' + Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -76,7 +76,7 @@ steps: command: | julia -e 'println("--- :julia: Developing Metal") using Pkg - Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")' + Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -108,7 +108,7 @@ steps: command: | julia -e 'println("--- :julia: Developing oneAPI") using Pkg - Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew") Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e 'println("--- :julia: Instantiating project") using Pkg @@ -143,7 +143,7 @@ steps: Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e ' using Pkg - Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew") println("--- :julia: Instantiating project") Pkg.develop(; path=pwd())' || exit 3 @@ -174,7 +174,7 @@ steps: command: | julia -e 'println("--- :julia: Developing OpenCL") using Pkg - Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew") Pkg.develop(; name="SPIRVIntrinsics")' julia -e 'println("--- :julia: Instantiating project") using Pkg @@ -192,6 +192,28 @@ steps: env: OCL_ICD_FILENAMES: "libnvidia-opencl.so.1" JULIA_PROJECT: "@OpenCL" + - label: "{{matrix.version}} macos CPU" + matrix: + setup: + version: + - "1.10" + - "1.11" + - "1.12" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.version}}" + - JuliaCI/julia-test#v1: + test_args: "" + - JuliaCI/julia-coverage#v1: + codecov: true + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + timeout_in_minutes: 120 + env: + JULIA_PROJECT: "@cpu" + env: JULIA_PKG_SERVER: "" # it often struggles with our large artifacts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8de2ba4f5..cd7bb7ffc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: fail-fast: false matrix: version: ['1.10', '1.11', '1.12'] - os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-15, macOS-15-intel, windows-2022] + os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-15-intel, windows-2022] arch: [x64, arm64] pocl: [jll, local] exclude: @@ -32,15 +32,10 @@ jobs: arch: arm64 - os: ubuntu-24.04-arm arch: x64 - # macOS 13 is Intel-only, while macOS 14+ only support Apple Silicon - - os: macOS-15 - arch: x64 - os: macOS-15-intel arch: arm64 - os: macOS-15-intel pocl: local - - os: macOS-15 - pocl: local - os: windows-2022 pocl: local steps: diff --git a/Project.toml b/Project.toml index 7e22dd203..09501641e 100644 --- a/Project.toml +++ b/Project.toml @@ -40,7 +40,7 @@ LLVM = "9.4.1" LinearAlgebra = "1.6" MacroTools = "0.5" PrecompileTools = "1" -SPIRVIntrinsics = "0.5" +SPIRVIntrinsics = "0.5.7" SPIRV_LLVM_Backend_jll = "20" SPIRV_Tools_jll = "2024.4, 2025.1" SparseArrays = "<0.0.1, 1.6" diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 79efa4cbe..a11b03670 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -102,6 +102,78 @@ Returns the unique group ID. """ function get_group_id end +""" + get_sub_group_size()::UInt32 + +Returns the number of work-items in the sub-group. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_size()::UInt32 + ``` +""" +function get_sub_group_size end + +""" + get_max_sub_group_size()::UInt32 + +Returns the maximum sub-group size for sub-groups in the current workgroup. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_max_sub_group_size()::UInt32 + ``` +""" +function get_max_sub_group_size end + +""" + get_num_sub_groups()::UInt32 + +Returns the number of sub-groups in the current workgroup. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_num_sub_groups()::UInt32 + ``` +""" +function get_num_sub_groups end + +""" + get_sub_group_id()::UInt32 + +Returns the sub-group ID within the work-group. + +!!! note + 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_id()::UInt32 + ``` +""" +function get_sub_group_id end + +""" + get_sub_group_local_id()::UInt32 + +Returns the work-item ID within the current sub-group. + +!!! note + 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_local_id()::UInt32 + ``` +""" +function get_sub_group_local_id end + + """ localmemory(::Type{T}, dims) @@ -116,6 +188,40 @@ Declare memory that is local to a workgroup. """ localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) +""" + shfl_down(val::T, offset::Integer) where T + +Read `val` from a lane with higher id given by `offset`. + +!!! note + `shfl_down` must be encountered by all workitems of a sub-group executing the kernel or by none at all. + +!!! note + Backend implementations **must** implement: + ``` + @device_override shfl_down(val::T, offset::Integer) where T + ``` + As well as the on-device functionality. + + This implementation **must** be synchronizing. + That is, kernels using this function can safely assume that + they do **not** need a `sub_group_barrier` before calling + this function. +""" +function shfl_down end + +""" + shfl_down_types(::Backend)::Vector{DataType} + +Returns a vector of `DataType`s supported on `backend` + +!!! note + Backend implementations **must** implement this function + only if they support `shfl_down` for any types. +""" +shfl_down_types(::Backend) = DataType[] + + """ barrier() @@ -139,6 +245,29 @@ function barrier() error("Group barrier used outside kernel or not captured") end +""" + sub_group_barrier() + +After a `sub_group_barrier()` call, all read and writes to global and local memory +from each thread in the sub-group are visible in from all other threads in the +sub-group. + +This does **not** guarantee that a write from a thread in a certain sub-group will +be visible to a thread in a different sub-group. + +!!! note + `sub_group_barrier()` must be encountered by all workitems of a sub-group executing the kernel or by none at all. + +!!! note + Backend implementations **must** implement: + ``` + @device_override sub_group_barrier() + ``` +""" +function sub_group_barrier() + error("Sub-group barrier used outside kernel or not captured") +end + """ _print(args...) @@ -220,6 +349,22 @@ kernel launch with too big a workgroup is attempted. """ function max_work_group_size end +""" + sub_group_size(backend)::Int + +Returns a reasonable sub-group size supported by the currently +active device for the specified backend. This would typically +be 32, or 64 for devices that don't support 32. + +!!! note + Backend implementations **must** implement: + ``` + sub_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +function sub_group_size end + """ multiprocessor_count(backend::NewBackend)::Int diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index f23e20b0f..d3b4fd868 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -7,6 +7,8 @@ using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA import KernelAbstractions.KernelIntrinsics as KI +import SPIRVIntrinsics + import StaticArrays import Adapt @@ -174,10 +176,36 @@ end function KI.max_work_group_size(::POCLBackend)::Int return Int(device().max_work_group_size) end +function KI.sub_group_size(::POCLBackend)::Int + sg_sizes = cl.device().sub_group_sizes + if 32 in sg_sizes + return 32 + elseif 64 in sg_sizes + return 64 + elseif 16 in sg_sizes + return 16 + else + return 1 + end +end function KI.multiprocessor_count(::POCLBackend)::Int return Int(device().max_compute_units) end +function KI.shfl_down_types(::POCLBackend) + res = copy(SPIRVIntrinsics.gentypes) + + backend_extensions = cl.device().extensions + if "cl_khr_fp64" ∉ backend_extensions + res = setdiff(res, [Float64]) + end + if "cl_khr_fp16" ∉ backend_extensions + res = setdiff(res, [Float16]) + end + + return res +end + ## Indexing Functions @device_override @inline function KI.get_local_id() @@ -204,6 +232,16 @@ end return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end +@device_override KI.get_sub_group_size() = get_sub_group_size() + +@device_override KI.get_max_sub_group_size() = get_max_sub_group_size() + +@device_override KI.get_num_sub_groups() = get_num_sub_groups() + +@device_override KI.get_sub_group_id() = get_sub_group_id() + +@device_override KI.get_sub_group_local_id() = get_sub_group_local_id() + @device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) @@ -232,6 +270,14 @@ end work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end +@device_override @inline function KI.sub_group_barrier() + sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) +end + +@device_override function KI.shfl_down(val::T, offset::Integer) where {T} + sub_group_shuffle(val, get_sub_group_local_id() + offset) +end + @device_override @inline function KI._print(args...) POCL._print(args...) end diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl index 3d5930d40..31e0f43c0 100644 --- a/src/pocl/compiler/compilation.jl +++ b/src/pocl/compiler/compilation.jl @@ -1,6 +1,9 @@ ## gpucompiler interface -struct OpenCLCompilerParams <: AbstractCompilerParams end +Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams + sub_group_size::Int +end + const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams} @@ -19,7 +22,21 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) = in(fn, known_intrinsics) || contains(fn, "__spirv_") +function GPUCompiler.finish_module!( + @nospecialize(job::OpenCLCompilerJob), + mod::LLVM.Module, entry::LLVM.Function + ) + entry = invoke( + GPUCompiler.finish_module!, + Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function}, + job, mod, entry + ) + + # Set the subgroup size + metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))]) + return entry +end ## compiler implementation (cache, configure, compile, and link) # cache of compilation caches, per context @@ -45,14 +62,17 @@ function compiler_config(dev::cl.Device; kwargs...) end return config end -@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...) +@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, sub_group_size = 32, kwargs...) supports_fp16 = "cl_khr_fp16" in dev.extensions supports_fp64 = "cl_khr_fp64" in dev.extensions + if sub_group_size ∉ dev.sub_group_sizes + @error("$sub_group_size is not a valid sub-group size for this device.") + end # create GPUCompiler objects target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) - params = OpenCLCompilerParams() + params = OpenCLCompilerParams(; sub_group_size) return CompilerConfig(target, params; kernel, name, always_inline) end diff --git a/src/pocl/compiler/execution.jl b/src/pocl/compiler/execution.jl index dc47cb302..6c47952ee 100644 --- a/src/pocl/compiler/execution.jl +++ b/src/pocl/compiler/execution.jl @@ -4,7 +4,7 @@ export @opencl, clfunction, clconvert ## high-level @opencl interface const MACRO_KWARGS = [:launch] -const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :sub_group_size] const LAUNCH_KWARGS = [:global_size, :local_size, :queue] macro opencl(ex...) diff --git a/src/pocl/nanoOpenCL.jl b/src/pocl/nanoOpenCL.jl index a706710d1..82ec7d281 100644 --- a/src/pocl/nanoOpenCL.jl +++ b/src/pocl/nanoOpenCL.jl @@ -390,6 +390,8 @@ const CL_KERNEL_EXEC_INFO_SVM_PTRS = 0x11b6 const CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM = 0x11b7 +const CL_DEVICE_SUB_GROUP_SIZES_INTEL = 0x4108 + struct CLError <: Exception code::Cint end @@ -935,6 +937,14 @@ devices(p::Platform) = devices(p, CL_DEVICE_TYPE_ALL) return tuple([Int(r) for r in result]...) end + if s == :sub_group_sizes + res_size = Ref{Csize_t}() + clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size) + result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t)) + clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL) + return tuple([Int(r) for r in result]...) + end + if s == :max_image2d_shape width = Ref{Csize_t}() height = Ref{Csize_t}() diff --git a/src/pocl/pocl.jl b/src/pocl/pocl.jl index 1cc693c86..4638b574a 100644 --- a/src/pocl/pocl.jl +++ b/src/pocl/pocl.jl @@ -41,7 +41,7 @@ function queue() end using GPUCompiler -import LLVM +import LLVM: LLVM, MDNode, ConstantInt, metadata using Adapt ## device overrides diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 63216d32b..03e10970d 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -23,6 +23,46 @@ function test_intrinsics_kernel(results) end return end +struct SubgroupData + sub_group_size::UInt32 + max_sub_group_size::UInt32 + num_sub_groups::UInt32 + sub_group_id::UInt32 + sub_group_local_id::UInt32 +end +function test_subgroup_kernel(results) + i = KI.get_global_id().x + + if i <= length(results) + @inbounds results[i] = SubgroupData( + KI.get_sub_group_size(), + KI.get_max_sub_group_size(), + KI.get_num_sub_groups(), + KI.get_sub_group_id(), + KI.get_sub_group_local_id() + ) + end + return +end + +function shfl_down_test_kernel(a, b, ::Val{N}) where {N} + idx = KI.get_sub_group_local_id() + + val = a[idx] + + offset = 0x00000001 + while offset < N + val += KI.shfl_down(val, offset) + offset <<= 1 + end + + KI.sub_group_barrier() + + if idx == 1 + b[idx] = val + end + return +end function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @@ -122,6 +162,57 @@ function intrinsics_testsuite(backend, AT) @test k_data.local_id == expected_local end end + + @testset "Sub-groups" begin + @test KI.sub_group_size(backend()) isa Int + + # Test with small kernel + sg_size = KI.sub_group_size(backend()) + sg_n = 2 + workgroupsize = sg_size * sg_n + numworkgroups = 2 + N = workgroupsize * numworkgroups + + results = AT(Vector{SubgroupData}(undef, N)) + kernel = KI.@kernel backend() launch = false test_subgroup_kernel(results) + + kernel(results; workgroupsize, numworkgroups) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for (i, sg_data) in enumerate(host_results) + @test sg_data.sub_group_size == sg_size + @test sg_data.max_sub_group_size == sg_size + @test sg_data.num_sub_groups == sg_n + + # Group ID should be 1-based + expected_sub_group = div(((i - 1) % workgroupsize), sg_size) + 1 + @test sg_data.sub_group_id == expected_sub_group + + # Local ID should be 1-based within group + expected_sg_local = ((i - 1) % sg_size) + 1 + @test sg_data.sub_group_local_id == expected_sg_local + end + end + @testset "shfl_down" begin + @test !isempty(KI.shfl_down_types(backend())) + types_to_test = setdiff(KI.shfl_down_types(backend()), [Bool]) + @testset "$T" for T in types_to_test + N = KI.sub_group_size(backend()) + a = zeros(T, N) + rand!(a, (0:1)) + + dev_a = AT(a) + dev_b = AT(zeros(T, N)) + + KI.@kernel backend() workgroupsize = N shfl_down_test_kernel(dev_a, dev_b, Val(N)) + + b = Array(dev_b) + @test sum(a) ≈ b[1] + end + end end return nothing end