From e2d97abd0f1c642780fa5c83cb437bf3c77287c0 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Tue, 10 Mar 2026 16:04:25 +0100 Subject: [PATCH 01/12] Enhance automatic range computation for @parallel kernel calls and add metadata storage functionality --- src/parallel.jl | 73 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/src/parallel.jl b/src/parallel.jl index 4ba959f..e4070b7 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -34,6 +34,8 @@ See also: [`@init_parallel_stencil`](@ref) Declare the `kernelcall` parallel. The kernel will automatically be called as required by the package for parallelization selected with [`@init_parallel_kernel`](@ref). Synchronizes at the end of the call (if a stream is given via keyword arguments, then it synchronizes only this stream). The keyword argument `∇` triggers a parallel call to the gradient kernel instead of the kernel itself. The automatic differentiation is performed with the package Enzyme.jl (refer to the corresponding documentation for Enzyme-specific terms used below); Enzyme needs to be imported before ParallelStencil in order to have it load the corresponding extension. +Automatic computation of `ranges` for `@parallel ` is only possible if the number of parallel indices used by the kernel is equal to the number of dimensions of the highest-dimensional input arrays. Otherwise, specify the `ranges` manually with `@parallel ranges=... `. + !!! note "Runtime hardware selection" When KernelAbstractions is initialized, this wrapper consults [`current_hardware`](@ref) to determine the runtime hardware target. The symbol defaults to `:cpu` and can be switched to select other targets via [`select_hardware`](@ref). @@ -92,6 +94,9 @@ $(replace(ParallelKernel.PARALLEL_ASYNC_DOC, "@init_parallel_kernel" => "@init_p macro parallel_async(args...) 
check_initialized(__module__); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...)); end +const ERRMSG_AUTOMATIC_RANGES_PARALLEL = "@parallel : the ranges needed for the kernel call cannot be automatically computed (less parallel indices than dimensions of the input arrays); specify the ranges manually with @parallel ranges=... ." + + ## MACROS FORCING PACKAGE, IGNORING INITIALIZATION macro parallel_cuda(args...) check_initialized(__module__); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_CUDA)); end @@ -184,7 +189,12 @@ function parallel(source::LineNumberNode, caller::Module, args::Union{Symbol,Exp if (length(posargs) > 1) @ArgumentError("maximum one positional argument (ranges) is allowed in a @parallel memopt=true call.") end parallel_call_memopt(caller, posargs..., kernelarg, backend_kwargs_expr, async; kwargs...) else - ParallelKernel.parallel(caller, posargs..., backend_kwargs_expr..., configcall_kwarg_expr, kernelarg; package=package, async=async) + if isempty(posargs) + ranges = add_nb_parallel_indices_check(:(ParallelStencil.ParallelKernel.get_ranges($(configcall.args[2:end]...))), configcall) + ParallelKernel.parallel(caller, ranges, backend_kwargs_expr..., configcall_kwarg_expr, kernelarg; package=package, async=async) + else + ParallelKernel.parallel(caller, posargs..., backend_kwargs_expr..., configcall_kwarg_expr, kernelarg; package=package, async=async) + end end end end @@ -213,6 +223,9 @@ function parallel_indices(source::LineNumberNode, caller::Module, args::Union{Sy else metadata_module, metadata_function = kwargs.metadata_module, kwargs.metadata_function end + if !haskey(kwargs, :metadata_module) + store_metadata(metadata_module, caller, determine_nb_parallel_indices(caller, get_body(kernelarg), extract_tuple(indices_expr))) + end inbounds = haskey(kwargs, :inbounds) ? kwargs.inbounds : get_inbounds(caller) padding = haskey(kwargs, :padding) ? 
kwargs.padding : get_padding(caller) memopt = haskey(kwargs, :memopt) ? kwargs.memopt : get_memopt(caller) @@ -223,7 +236,11 @@ function parallel_indices(source::LineNumberNode, caller::Module, args::Union{Sy end else kwargs_expr = (:(inbounds=$inbounds), :(padding=$padding)) - ParallelKernel.parallel_indices(caller, posargs..., kwargs_expr..., kernelarg; package=package) + kernel = ParallelKernel.parallel_indices(caller, posargs..., kwargs_expr..., kernelarg; package=package) + quote + $kernel + $metadata_function + end end end end @@ -288,6 +305,9 @@ function parallel_kernel(metadata_module::Module, metadata_function::Expr, calle inbounds = haskey(kwargs, :inbounds) ? kwargs.inbounds : get_inbounds(caller) padding = haskey(kwargs, :padding) ? kwargs.padding : get_padding(caller) memopt = haskey(kwargs, :memopt) ? kwargs.memopt : get_memopt(caller) + if !haskey(kwargs, :metadata_module) + store_metadata(metadata_module, caller, ndims) + end indices = get_indices_expr(ndims).args indices_dir = get_indices_dir_expr(ndims).args body = get_body(kernel) @@ -330,13 +350,39 @@ function parallel_kernel(metadata_module::Module, metadata_function::Expr, calle if package == PKG_KERNELABSTRACTIONS kernel = :(ParallelStencil.ParallelKernel.@ka_kernel $kernel) end - return kernel # TODO: later could be here called parallel_indices instead of adding the threadids etc above. + return quote + $kernel + $metadata_function + end # TODO: later could be here called parallel_indices instead of adding the threadids etc above. 
end end ## @PARALLEL CALL FUNCTIONS +function add_nb_parallel_indices_check(ranges::Union{Symbol,Expr}, configcall::Expr) + checked_ranges = gensym_world("ranges", @__MODULE__) + nb_parallel_indices = gensym_world("nb_parallel_indices", @__MODULE__) + nb_input_dims = gensym_world("nb_input_dims", @__MODULE__) + metadata_call = create_metadata_call(configcall) + return quote + $checked_ranges = $ranges + $nb_parallel_indices = ($metadata_call).nb_parallel_indices + $nb_input_dims = ParallelStencil.get_nb_input_dims($(configcall.args[2:end]...)) + if $nb_input_dims != $nb_parallel_indices + ParallelStencil.@ArgumentError(ParallelStencil.ERRMSG_AUTOMATIC_RANGES_PARALLEL) + end + $checked_ranges + end +end + +get_nb_input_dims(args...) = maximum((get_nb_input_dims(arg) for arg in args); init=1) +get_nb_input_dims(t::T) where T<:Union{Tuple,NamedTuple} = get_nb_input_dims(t...) +get_nb_input_dims(A::AbstractArray) = ndims(A) +get_nb_input_dims(A::SubArray) = ndims(A.parent) +get_nb_input_dims(a::Number) = 1 +get_nb_input_dims(x) = isbitstype(typeof(x)) ? 1 : @ArgumentError("automatic detection of ranges not possible in @parallel : some kernel arguments are neither arrays nor scalars nor any other bitstypes nor (named) tuple containing any of the former. Specify ranges or nthreads and nblocks manually.") + function parallel_call_memopt(caller::Module, ranges::Union{Symbol,Expr}, kernelcall::Expr, backend_kwargs_expr::Array, async::Bool; memopt::Bool=false, configcall::Expr=kernelcall) if haskey(backend_kwargs_expr, :shmem) @KeywordArgumentError("@parallel : keyword `shmem` is not allowed when memopt=true is set.") end package = get_package(caller) @@ -372,7 +418,7 @@ function parallel_call_memopt(caller::Module, kernelcall::Expr, backend_kwargs_e metadata_module = metadata_call loopdim = :($(metadata_module).loopdim) is_parallel_kernel = :($(metadata_module).is_parallel_kernel) - ranges = :( ($is_parallel_kernel) ? 
ParallelStencil.get_ranges_memopt($nthreads_x_max, $nthreads_max_memopt, $loopdim, $(configcall.args[2:end]...)) : ParallelStencil.ParallelKernel.get_ranges($(configcall.args[2:end]...))) + ranges = add_nb_parallel_indices_check(:( ($is_parallel_kernel) ? ParallelStencil.get_ranges_memopt($nthreads_x_max, $nthreads_max_memopt, $loopdim, $(configcall.args[2:end]...)) : ParallelStencil.ParallelKernel.get_ranges($(configcall.args[2:end]...))), configcall) parallel_call_memopt(caller, ranges, kernelcall, backend_kwargs_expr, async; memopt=memopt, configcall=configcall) end @@ -493,11 +539,13 @@ function get_indices_dir_expr(ndims::Integer) end end +determine_nb_parallel_indices(caller::Module, body::Expr, indices) = count(index -> inexpr_walk(macroexpand(caller, body), index), indices) + ## FUNCTIONS TO CREATE METADATA STORAGE function create_metadata_storage(source::LineNumberNode, caller::Module, kernel::Expr) - kernelid = get_kernelid(get_name(kernel), source.file, source.line) + kernelid = get_kernelid(kernel, source.file, source.line) create_module(caller, MOD_METADATA_PS) topmodule = @eval(caller, $MOD_METADATA_PS) create_module(topmodule, kernelid) @@ -529,7 +577,22 @@ function create_metadata_call(configcall::Expr) return metadata_call end +function store_metadata(metadata_module::Module, caller::Module, nb_parallel_indices::Integer) + nonconst_metadata = get_nonconst_metadata(caller) + if nonconst_metadata || isdefined(metadata_module, :nb_parallel_indices) + storeexpr = quote + nb_parallel_indices = $nb_parallel_indices + end + else + storeexpr = quote + const nb_parallel_indices = $nb_parallel_indices + end + end + @eval(metadata_module, $storeexpr) +end + get_kernelid(kernelname, file, line) = Symbol("$(kernelname)_$(file)_$(line)") +get_kernelid(kernel::Expr, file, line) = Symbol("$(get_kernelid(get_name(kernel), file, line))_$(hash(string(kernel)))") get_meta_function(kernelname) = Symbol("$(META_FUNCTION_PREFIX)$(GENSYM_SEPARATOR)$(kernelname)") From 
3984549e38444b839ff70bb27f68d57ca7769c5b Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Tue, 10 Mar 2026 16:04:32 +0100 Subject: [PATCH 02/12] Update documentation for @parallel kernel calls to clarify range computation requirements --- src/ParallelKernel/parallel.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl index ce6b70b..c0e97f4 100644 --- a/src/ParallelKernel/parallel.jl +++ b/src/ParallelKernel/parallel.jl @@ -11,6 +11,8 @@ const PARALLEL_DOC = """ Declare the `kernelcall` parallel. The kernel will automatically be called as required by the package for parallelization selected with [`@init_parallel_kernel`](@ref). Synchronizes at the end of the call (if a stream is given via keyword arguments, then it synchronizes only this stream). The keyword argument `∇` triggers a parallel call to the gradient kernel instead of the kernel itself. The automatic differentiation is performed with the package Enzyme.jl (refer to the corresponding documentation for Enzyme-specific terms used below); Enzyme needs to be imported before ParallelKernel in order to have it load the corresponding extension. +Automatic computation of `ranges` for `@parallel ` is only possible if the number of parallel indices used by the kernel is equal to the number of dimensions of the highest-dimensional input arrays. Otherwise, specify the `ranges` manually with `@parallel ranges=... `. + !!! note "Runtime hardware selection" When KernelAbstractions is initialized, this wrapper consults [`current_hardware`](@ref) to determine the runtime hardware target. The symbol defaults to `:cpu` and can be switched to select other targets via [`select_hardware`](@ref). 
From ad8e065d8e51bbaa4d068c5d4520e4c9723208da Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Tue, 10 Mar 2026 16:04:41 +0100 Subject: [PATCH 03/12] Add tests for automatic range errors in parallel indices usage --- test/test_parallel.jl | 80 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 4 deletions(-) diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 005412b..1076519 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -55,7 +55,9 @@ eval(:( @testset "@parallel " begin # NOTE: calls must go to ParallelStencil.ParallelKernel.parallel and must therefore give the same result as in ParallelKernel, except for memopt tests (tests copied 1-to-1 from there). @static if $package == $PKG_CUDA call = @prettystring(1, @parallel f(A)) - @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 32) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call) + @test occursin("CUDA.@cuda", call) + @test occursin("ParallelStencil.ParallelKernel.get_ranges(A)", call) + @test occursin("nb_parallel_indices", call) @test 
occursin("CUDA.synchronize(CUDA.stream(); blocking = true)", call) call = @prettystring(1, @parallel ranges f(A)) @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32)) threads = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 32) stream = CUDA.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call) @@ -71,7 +73,9 @@ eval(:( @test occursin("CUDA.@cuda blocks = ParallelStencil.ParallelKernel.compute_nblocks(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)),", call) # NOTE: now it is a very long multi line expression; before it continued as follows: (1, 1, 16)), ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1))) threads = ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1)) stream = CUDA.stream() shmem = ((ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1)))[1] + 3) * ((ParallelStencil.compute_nthreads_memopt(cld.(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), (1, 1, 16)), 3, (-1:1, -1:1, -1:1)))[2] + 3) * sizeof(Float64) f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), 
(Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call) elseif $package == $PKG_AMDGPU call = @prettystring(1, @parallel f(A)) - @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 64)) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A))); nthreads_x_max = 64) stream = AMDGPU.stream() f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))", call) + @test occursin("AMDGPU.@roc", call) + @test occursin("ParallelStencil.ParallelKernel.get_ranges(A)", call) + @test occursin("nb_parallel_indices", call) @test occursin("AMDGPU.synchronize(AMDGPU.stream(); blocking = true)", call) call = @prettystring(1, @parallel ranges f(A)) @test occursin("AMDGPU.@roc gridsize = ParallelStencil.ParallelKernel.compute_nblocks(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)), ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 64)) groupsize = ParallelStencil.ParallelKernel.compute_nthreads(length.(ParallelStencil.ParallelKernel.promote_ranges(ranges)); nthreads_x_max = 64) stream = AMDGPU.stream() f(A, 
ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))", call) @@ -91,6 +95,8 @@ eval(:( @test occursin("handle(ParallelStencil.ParallelKernel.current_hardware(@__MODULE__()), :$PKG_KERNELABSTRACTIONS)", call) @test occursin("compute_nblocks", call) @test occursin("compute_nthreads", call) + @test occursin("ParallelStencil.ParallelKernel.get_ranges(A)", call) + @test occursin("nb_parallel_indices", call) @test !occursin("CUDA.@cuda", call) @test !occursin("AMDGPU.@roc", call) call = @prettystring(1, @parallel ranges f(A)) @@ -110,11 +116,17 @@ eval(:( # call = @prettystring(2, @parallel ranges memopt=true f(A)) # @test occursin("ParallelStencil.ParallelKernel.@ka", call) elseif @iscpu($package) - @test @prettystring(1, @parallel f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))" + call = @prettystring(1, @parallel f(A)) + @test occursin("f(A, ParallelStencil.ParallelKernel.promote_ranges(", call) + @test occursin("ParallelStencil.ParallelKernel.get_ranges(A)", call) + @test occursin("nb_parallel_indices", call) @test @prettystring(1, @parallel ranges f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), 
(Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))" @test @prettystring(1, @parallel nblocks nthreads f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.compute_ranges(nblocks .* nthreads)))[3])))" @test @prettystring(1, @parallel ranges nblocks nthreads f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))" - @test @prettystring(1, @parallel stream=mystream f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))" + call = @prettystring(1, @parallel stream=mystream f(A)) + @test occursin("f(A, ParallelStencil.ParallelKernel.promote_ranges(", call) + @test occursin("ParallelStencil.ParallelKernel.get_ranges(A)", call) + @test occursin("nb_parallel_indices", call) # @test @prettystring(2, @parallel memopt=true f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)), 
(Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ParallelStencil.ParallelKernel.get_ranges(A)))[3])))" @test @prettystring(2, @parallel ranges memopt=true f(A)) == "f(A, ParallelStencil.ParallelKernel.promote_ranges(ranges), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[1])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[2])), (Int64)(length((ParallelStencil.ParallelKernel.promote_ranges(ranges))[3])))" end; @@ -1230,6 +1242,66 @@ eval(:( @test_throws ArgumentError validate_body(:(a = b + 1; @all(A) = @all(B) + 1)) @test_throws ArgumentError validate_body(:(A = @all(B) + 1; @all(A) = @all(B) + 1)) end; + @testset "automatic ranges: error if not all parallel indices are used" begin + @parallel_indices (ix, iy, iz) function write_xy_plane!(A) + A[ix, iy, 1] = A[ix, iy, 1] + return + end + @parallel_indices (ix, iy) function write_y_line!(A) + A[1, iy] = A[1, iy] + return + end + A3 = @zeros(4, 5, 6) + A2 = @zeros(4, 5) + @test_throws ArgumentError @parallel write_xy_plane!(A3) + @test_throws ArgumentError @parallel write_y_line!(A2) + end; + @testset "automatic ranges: error if input array has more dimensions than parallel indices" begin + @parallel_indices (ix, iy) function write_xy_plane!(A) + A[ix, iy, 1] = A[ix, iy, 1] + A[ix, iy, 2] = A[ix, iy, 2] + return + end + @parallel_indices (ix) function write_x_line!(A) + A[ix, 1] = A[ix, 1] + A[ix, 2] = A[ix, 2] + return + end + A3 = @zeros(4, 5, 6) + A2 = @zeros(4, 5) + @test_throws ArgumentError @parallel write_xy_plane!(A3) + @test_throws ArgumentError @parallel write_x_line!(A2) + end; + @testset "automatic ranges (memopt): error if not all parallel indices are used" begin + @parallel_indices (ix, iy, iz) memopt=true function 
write_xy_plane!(A) + A[ix, iy, 1] = A[ix, iy, 1] + return + end + @parallel_indices (ix, iy) memopt=true function write_y_line!(A) + A[1, iy] = A[1, iy] + return + end + A3 = @zeros(4, 5, 6) + A2 = @zeros(4, 5) + @test_throws ArgumentError @parallel memopt=true write_xy_plane!(A3) + @test_throws ArgumentError @parallel memopt=true write_y_line!(A2) + end; + @testset "automatic ranges (memopt): error if input array has more dimensions than parallel indices" begin + @parallel_indices (ix, iy) memopt=true function write_xy_plane!(A) + A[ix, iy, 1] = A[ix, iy, 1] + A[ix, iy, 2] = A[ix, iy, 2] + return + end + @parallel_indices (ix) memopt=true function write_x_line!(A) + A[ix, 1] = A[ix, 1] + A[ix, 2] = A[ix, 2] + return + end + A3 = @zeros(4, 5, 6) + A2 = @zeros(4, 5) + @test_throws ArgumentError @parallel memopt=true write_xy_plane!(A3) + @test_throws ArgumentError @parallel memopt=true write_x_line!(A2) + end; @reset_parallel_stencil() end; end; From 5190eae8ec5130b3c4fee0211b5db5f37641a45a Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 11 Mar 2026 10:45:21 +0100 Subject: [PATCH 04/12] Enhance error handling in determine_nb_parallel_indices to ensure all parallel indices are utilized in the kernel body --- src/parallel.jl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/parallel.jl b/src/parallel.jl index e4070b7..5708aaf 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -539,7 +539,15 @@ function get_indices_dir_expr(ndims::Integer) end end -determine_nb_parallel_indices(caller::Module, body::Expr, indices) = count(index -> inexpr_walk(macroexpand(caller, body), index), indices) +function determine_nb_parallel_indices(caller::Module, body::Expr, indices) + body = macroexpand(caller, body) + used_indices = filter(index -> inexpr_walk(body, index), indices) + if 0 < length(used_indices) < length(indices) + unused_indices = filter(index -> !inexpr_walk(body, index), indices) + @ArgumentError("@parallel_indices: all parallel 
indices must be used in the kernel body (unused indices: $(join(string.(unused_indices), ", "))).") + end + return length(indices) +end ## FUNCTIONS TO CREATE METADATA STORAGE From 51d315d28ee1af0d72324925056fb1a8a76d1312 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 11 Mar 2026 10:45:32 +0100 Subject: [PATCH 05/12] Add parallel_indices import and enhance tests for automatic range errors --- test/test_parallel.jl | 89 ++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 43 deletions(-) diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 1076519..0fe8d38 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -2,7 +2,7 @@ using Test using ParallelStencil import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, PKG_KERNELABSTRACTIONS, @select_hardware, @current_hardware, INDICES, INDICES_INN, INDICES_DIR, ARRAYTYPES, FIELDTYPES, SCALARTYPES import ParallelStencil: @require, @prettystring, @gorgeousstring, @isgpu, @iscpu, interpolate -import ParallelStencil: checkargs_parallel, validate_body, parallel +import ParallelStencil: checkargs_parallel, validate_body, parallel, parallel_indices using ParallelStencil.Exceptions using ParallelStencil.FiniteDifferences3D using ParallelStencil.FieldAllocators @@ -40,6 +40,9 @@ end end Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered. +parallel_indices(args::Union{Symbol,Expr}...; package::Symbol=ParallelStencil.ParallelKernel.get_package(@__MODULE__)) = (ParallelStencil.checkargs_parallel_indices(args...); ParallelStencil.parallel_indices(LineNumberNode(@__LINE__, Symbol(@__FILE__)), @__MODULE__, args...; package=package)) + + @static for package in TEST_PACKAGES FloatDefault = (package == PKG_METAL) ? 
Float32 : Float64 # Metal does not support Float64 @@ -1243,65 +1246,65 @@ eval(:( @test_throws ArgumentError validate_body(:(A = @all(B) + 1; @all(A) = @all(B) + 1)) end; @testset "automatic ranges: error if not all parallel indices are used" begin - @parallel_indices (ix, iy, iz) function write_xy_plane!(A) - A[ix, iy, 1] = A[ix, iy, 1] + @test_throws ArgumentError parallel_indices(:((ix, iy, iz)), + :(function write_xy_plane!(A) + A[ix, iy, 1] = 2.0 * A[ix, iy, 1] return - end - @parallel_indices (ix, iy) function write_y_line!(A) - A[1, iy] = A[1, iy] + end)) + @test_throws ArgumentError parallel_indices(:((ix, iy)), + :(function write_y_line!(A) + A[1, iy] = 2.0 * A[1, iy] return - end - A3 = @zeros(4, 5, 6) - A2 = @zeros(4, 5) - @test_throws ArgumentError @parallel write_xy_plane!(A3) - @test_throws ArgumentError @parallel write_y_line!(A2) + end)) end; @testset "automatic ranges: error if input array has more dimensions than parallel indices" begin @parallel_indices (ix, iy) function write_xy_plane!(A) - A[ix, iy, 1] = A[ix, iy, 1] - A[ix, iy, 2] = A[ix, iy, 2] + A[ix, iy, 1] = 2.0 * A[ix, iy, 1] + A[ix, iy, 2] = 2.0 * A[ix, iy, 2] return end @parallel_indices (ix) function write_x_line!(A) - A[ix, 1] = A[ix, 1] - A[ix, 2] = A[ix, 2] + A[ix, 1] = 2.0 * A[ix, 1] + A[ix, 2] = 2.0 * A[ix, 2] return end - A3 = @zeros(4, 5, 6) - A2 = @zeros(4, 5) + A3 = @ones(4, 5, 2) + A2 = @ones(4, 2) @test_throws ArgumentError @parallel write_xy_plane!(A3) @test_throws ArgumentError @parallel write_x_line!(A2) + @parallel (1:size(A3,1), 1:size(A3,2)) write_xy_plane!(A3) + @parallel (1:size(A2,1)) write_x_line!(A2) + @test A3 == 2.0 .* @ones(4, 5, 2) + @test A2 == 2.0 .* @ones(4, 2) end; @testset "automatic ranges (memopt): error if not all parallel indices are used" begin - @parallel_indices (ix, iy, iz) memopt=true function write_xy_plane!(A) - A[ix, iy, 1] = A[ix, iy, 1] + @test_throws ArgumentError parallel_indices(:((ix, iy, iz)), :(memopt=true), + :(function 
write_xy_plane!(A, B) + A[ix, iy, 1] = 2.0 * A[ix, iy, 1] + B[ix, iy, 1] return - end - @parallel_indices (ix, iy) memopt=true function write_y_line!(A) - A[1, iy] = A[1, iy] + end)) + @test_throws ArgumentError parallel_indices(:((ix, iy)), :(memopt=true), + :(function write_y_line!(A, B) + A[1, iy] = 2.0 * A[1, iy] + B[1, iy] return - end - A3 = @zeros(4, 5, 6) - A2 = @zeros(4, 5) - @test_throws ArgumentError @parallel memopt=true write_xy_plane!(A3) - @test_throws ArgumentError @parallel memopt=true write_y_line!(A2) + end)) + end; + @static if $package != $PKG_KERNELABSTRACTIONS + @testset "automatic ranges (memopt): error if input array has more dimensions than parallel indices" begin + @parallel_indices (ix, iy, iz) memopt=true loopsize=3 optvars=B optranges=(B=(0:0,0:0,0:0),) function write_xy_plane!(A, B, D) + A[ix, iy, iz] = 2.0 * B[ix, iy, iz] + return + end + A3 = @zeros(4, 5, 6) + B3 = @ones(4, 5, 6) + D4 = @ones(4, 5, 6, 2) + @test_throws ArgumentError @parallel memopt=true write_xy_plane!(A3, B3, D4) + @static if $package in [$PKG_CUDA, $PKG_AMDGPU] + @parallel (1:size(A3,1), 1:size(A3,2), 1:size(A3,3)) memopt=true write_xy_plane!(A3, B3, D4) + @test A3 == 2.0 .* @ones(4, 5, 6) + end + end; end; - @testset "automatic ranges (memopt): error if input array has more dimensions than parallel indices" begin - @parallel_indices (ix, iy) memopt=true function write_xy_plane!(A) - A[ix, iy, 1] = A[ix, iy, 1] - A[ix, iy, 2] = A[ix, iy, 2] - return - end - @parallel_indices (ix) memopt=true function write_x_line!(A) - A[ix, 1] = A[ix, 1] - A[ix, 2] = A[ix, 2] - return - end - A3 = @zeros(4, 5, 6) - A2 = @zeros(4, 5) - @test_throws ArgumentError @parallel memopt=true write_xy_plane!(A3) - @test_throws ArgumentError @parallel memopt=true write_x_line!(A2) - end; @reset_parallel_stencil() end; end; From 14a377df018f83527c70cf726c026afcb4e461c0 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 11 Mar 2026 16:21:07 +0100 Subject: [PATCH 06/12] Refactor 
add_nb_parallel_indices_check function to streamline error handling for automatic range detection --- src/parallel.jl | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/parallel.jl b/src/parallel.jl index 5708aaf..0c17d14 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -360,29 +360,6 @@ end ## @PARALLEL CALL FUNCTIONS -function add_nb_parallel_indices_check(ranges::Union{Symbol,Expr}, configcall::Expr) - checked_ranges = gensym_world("ranges", @__MODULE__) - nb_parallel_indices = gensym_world("nb_parallel_indices", @__MODULE__) - nb_input_dims = gensym_world("nb_input_dims", @__MODULE__) - metadata_call = create_metadata_call(configcall) - return quote - $checked_ranges = $ranges - $nb_parallel_indices = ($metadata_call).nb_parallel_indices - $nb_input_dims = ParallelStencil.get_nb_input_dims($(configcall.args[2:end]...)) - if $nb_input_dims != $nb_parallel_indices - ParallelStencil.@ArgumentError(ParallelStencil.ERRMSG_AUTOMATIC_RANGES_PARALLEL) - end - $checked_ranges - end -end - -get_nb_input_dims(args...) = maximum((get_nb_input_dims(arg) for arg in args); init=1) -get_nb_input_dims(t::T) where T<:Union{Tuple,NamedTuple} = get_nb_input_dims(t...) -get_nb_input_dims(A::AbstractArray) = ndims(A) -get_nb_input_dims(A::SubArray) = ndims(A.parent) -get_nb_input_dims(a::Number) = 1 -get_nb_input_dims(x) = isbitstype(typeof(x)) ? 1 : @ArgumentError("automatic detection of ranges not possible in @parallel : some kernel arguments are neither arrays nor scalars nor any other bitstypes nor (named) tuple containing any of the former. 
Specify ranges or nthreads and nblocks manually.") - function parallel_call_memopt(caller::Module, ranges::Union{Symbol,Expr}, kernelcall::Expr, backend_kwargs_expr::Array, async::Bool; memopt::Bool=false, configcall::Expr=kernelcall) if haskey(backend_kwargs_expr, :shmem) @KeywordArgumentError("@parallel : keyword `shmem` is not allowed when memopt=true is set.") end package = get_package(caller) @@ -677,3 +654,21 @@ function create_onthefly_macro(caller, m, expr, var, indices, indices_dir) @eval(caller, $m_macro) return end + + +## FUNCTIONS TO CHECK THE AUTOMATIC DETERMINATION OF RANGES AND NB_PARALLEL_INDICES + +function add_nb_parallel_indices_check(ranges::Union{Symbol,Expr}, configcall::Expr) + metadata_call = create_metadata_call(configcall) + nb_parallel_indices = :(($metadata_call).nb_parallel_indices) + nb_input_dims = :(ParallelStencil.get_nb_input_dims($(configcall.args[2:end]...))) + errorcall = :(ParallelStencil.@ArgumentError(ParallelStencil.ERRMSG_AUTOMATIC_RANGES_PARALLEL)) + return :(($nb_input_dims != $nb_parallel_indices && $errorcall; $ranges)) +end + +get_nb_input_dims(args...) = maximum((get_nb_input_dims(arg) for arg in args); init=1) +get_nb_input_dims(t::T) where T<:Union{Tuple,NamedTuple} = get_nb_input_dims(t...) +get_nb_input_dims(A::AbstractArray) = ndims(A) +get_nb_input_dims(A::SubArray) = ndims(A.parent) +get_nb_input_dims(a::Number) = 1 +get_nb_input_dims(x) = isbitstype(typeof(x)) ? 1 : @ArgumentError("automatic detection of ranges not possible in @parallel : some kernel arguments are neither arrays nor scalars nor any other bitstypes nor (named) tuple containing any of the former. 
Specify ranges or nthreads and nblocks manually.") \ No newline at end of file From 33e68be582d8fe68a163acea6cc402d8f1af4192 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 11 Mar 2026 18:24:16 +0100 Subject: [PATCH 07/12] Refactor parallel_indices and parallel_kernel functions to improve code clarity and maintainability --- src/parallel.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parallel.jl b/src/parallel.jl index 0c17d14..ed79919 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -231,15 +231,15 @@ function parallel_indices(source::LineNumberNode, caller::Module, args::Union{Sy memopt = haskey(kwargs, :memopt) ? kwargs.memopt : get_memopt(caller) if memopt quote - $(parallel_indices_memopt(metadata_module, metadata_function, is_parallel_kernel, caller, package, posargs..., kernelarg; kwargs...)) #TODO: the package and numbertype will have to be passed here further once supported as kwargs (currently removed from call: package, numbertype, ) $metadata_function + $(parallel_indices_memopt(metadata_module, metadata_function, is_parallel_kernel, caller, package, posargs..., kernelarg; kwargs...)) #TODO: the package and numbertype will have to be passed here further once supported as kwargs (currently removed from call: package, numbertype, ) end else kwargs_expr = (:(inbounds=$inbounds), :(padding=$padding)) kernel = ParallelKernel.parallel_indices(caller, posargs..., kwargs_expr..., kernelarg; package=package) quote - $kernel $metadata_function + $kernel end end end @@ -343,16 +343,16 @@ function parallel_kernel(metadata_module::Module, metadata_function::Expr, calle if memopt expanded_kernel = macroexpand(caller, kernel) quote - $(parallel_indices_memopt(metadata_module, metadata_function, is_parallel_kernel, caller, package, get_indices_expr(ndims), expanded_kernel; kwargs...)) #TODO: the package and numbertype will have to be passed here further once supported as kwargs (currently removed from call: package, 
numbertype, ) $metadata_function + $(parallel_indices_memopt(metadata_module, metadata_function, is_parallel_kernel, caller, package, get_indices_expr(ndims), expanded_kernel; kwargs...)) #TODO: the package and numbertype will have to be passed here further once supported as kwargs (currently removed from call: package, numbertype, ) end else if package == PKG_KERNELABSTRACTIONS kernel = :(ParallelStencil.ParallelKernel.@ka_kernel $kernel) end return quote - $kernel $metadata_function + $kernel end # TODO: later could be here called parallel_indices instead of adding the threadids etc above. end end From 607ff2c3de4622466983f0de9adc464c521c8b08 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 11 Mar 2026 18:24:27 +0100 Subject: [PATCH 08/12] Add metadata macro for kernel call validation in unit tests --- src/shared.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/shared.jl b/src/shared.jl index 418d64c..4b5d078 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -97,3 +97,11 @@ check_nonconst_metadata(nonconst_metadata) = ( if !isa(nonconst_metadata, Bool) ## FUNCTIONS/MACROS FOR DIVERSE SYNTAX SUGAR hasmeta_PS(caller::Module) = isdefined(caller, MOD_METADATA_PS) + + +## FUNCTIONS AND MACROS FOR UNIT TESTS + +macro metadata(kernelcall) + if !is_call(kernelcall) @ArgumentError("@metadata: the argument must be a kernel call (obtained: $kernelcall).") end + return esc(create_metadata_call(kernelcall)) +end From e522083a3ea782a979f93658591549973eb225d8 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 13 Mar 2026 10:38:41 +0100 Subject: [PATCH 09/12] Add metadata support in parallel indices tests to enhance kernel information tracking --- test/test_parallel.jl | 62 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/test/test_parallel.jl b/test/test_parallel.jl index 0fe8d38..eb231c0 100644 --- a/test/test_parallel.jl +++ b/test/test_parallel.jl @@ -1,7 +1,7 @@ using Test using 
ParallelStencil import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, PKG_KERNELABSTRACTIONS, @select_hardware, @current_hardware, INDICES, INDICES_INN, INDICES_DIR, ARRAYTYPES, FIELDTYPES, SCALARTYPES -import ParallelStencil: @require, @prettystring, @gorgeousstring, @isgpu, @iscpu, interpolate +import ParallelStencil: @require, @prettystring, @gorgeousstring, @isgpu, @iscpu, interpolate, @metadata import ParallelStencil: checkargs_parallel, validate_body, parallel, parallel_indices using ParallelStencil.Exceptions using ParallelStencil.FiniteDifferences3D @@ -1230,8 +1230,58 @@ eval(:( end; @reset_parallel_stencil() end; - @testset "6. Exceptions" begin - @init_parallel_stencil($package, $FloatDefault, 3) + @testset "6. metadata" begin + @require !@is_initialized() + @init_parallel_stencil($package, $FloatDefault, 3, nonconst_metadata=true) + @require @is_initialized() + @testset "standard" begin + @parallel_indices (ix, iy, iz) function metadata_probe!(A, B, D) + A[ix, iy, iz] = 2.0 * B[ix, iy, iz] + return + end + A = @zeros(4, 5, 6) + B = @ones(4, 5, 6) + D = @ones(4, 5, 6, 2) + metadata = @metadata metadata_probe!(A, B, D) + metadata_symbols = sort(setdiff(names(metadata; all=true), names(metadata))) + @test metadata isa Module + @test length(names(metadata)) == 1 + @test metadata_symbols == [:nb_parallel_indices] + @test metadata.nb_parallel_indices == 3 + @test all(Array(A) .== 0) + end; + @static if $package != $PKG_KERNELABSTRACTIONS + @testset "memopt" begin + @parallel_indices (ix, iy, iz) memopt=true loopsize=3 optvars=B optranges=(B=(0:0,0:0,0:0),) function metadata_memopt_probe!(A, B, D) + A[ix, iy, iz] = 2.0 * B[ix, iy, iz] + return + end + A = @zeros(4, 5, 6) + B = @ones(4, 5, 6) + D = @ones(4, 5, 6, 2) + metadata = @metadata metadata_memopt_probe!(A, B, D) + metadata_symbols = sort(setdiff(names(metadata; all=true), names(metadata))) + @test metadata isa 
Module + @test length(names(metadata)) == 1 + @test metadata_symbols == [:is_parallel_kernel, :loopdim, :loopsize, :memopt, :nb_parallel_indices, :nonconst_metadata, :offsets, :optranges, :optvars, :stencilranges, :use_shmemhalos] + @test metadata.is_parallel_kernel == false + @test metadata.loopdim == 3 + @test metadata.loopsize == 3 + @test metadata.memopt == true + @test metadata.nb_parallel_indices == 3 + @test metadata.nonconst_metadata == true + @test metadata.offsets[:B][(0, 0)][0] == 1 + @test metadata.optranges[:B] == (0:0, 0:0, 0:0) + @test metadata.optvars == (:B,) + @test metadata.stencilranges == (B = (0:0, 0:0, 0:0),) + @test metadata.use_shmemhalos[:B] == true + @test all(Array(A) .== 0) + end; + end; + @reset_parallel_stencil() + end; + @testset "7. Exceptions" begin + @init_parallel_stencil($package, $FloatDefault, 3, nonconst_metadata=true) @require @is_initialized @testset "arguments @parallel" begin @test_throws ArgumentError checkargs_parallel(); # Error: isempty(args) @@ -1310,4 +1360,10 @@ eval(:( end; )) +eval(:( + @testset "$(basename(@__FILE__)) metadata (package: $(nameof($package)))" begin + + end; +)) + end == nothing || true; From 7fea7ca64d5326efac1d9201c6d6ea33df55bac8 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 13 Mar 2026 11:26:56 +0100 Subject: [PATCH 10/12] Enhance stderr filtering in runtests to suppress specific warnings and improve output clarity --- test/runtests.jl | 54 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 6ac1473..1cda930 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,15 +1,65 @@ # NOTE: This file contains many parts that are copied from the file runtests.jl from the Package MPI.jl. 
push!(LOAD_PATH, "../src") +const PREIMPORT_STDERR_SUPPRESSION_RULES = ( + (name="Metal OS support warnings", start=r"^┌ Error: Metal\.jl is only supported on macOS$", stop=r"^└ @ Metal .*$"), +) + +const ANSI_ESCAPE_REGEX = r"\e\[[0-9;]*m" + +function filter_stderr_content(text::AbstractString; rules=STDERR_SUPPRESSION_RULES) + isempty(text) && return text + lines = split(text, '\n'; keepempty=true) + filtered = String[] + active_stop = nothing + for line in lines + match_line = replace(line, ANSI_ESCAPE_REGEX => "") + if !isnothing(active_stop) + if occursin(active_stop, match_line) + active_stop = nothing + end + continue + end + matched = false + for rule in rules + if occursin(rule.start, match_line) + active_stop = rule.stop + matched = true + break + end + end + matched || push!(filtered, line) + end + return join(filtered, '\n') +end + +function import_with_filtered_stderr(modulename::Symbol; rules=PREIMPORT_STDERR_SUPPRESSION_RULES) + mktemp() do path, io + redirect_stderr(io) do + @eval import $(modulename) + end + flush(io) + close(io) + filtered = filter_stderr_content(read(path, String); rules=rules) + isempty(filtered) || print(Base.stderr, filtered) + end +end + import ParallelStencil # Precompile it. import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_KERNELABSTRACTIONS @static if (PKG_CUDA in SUPPORTED_PACKAGES) import CUDA end @static if (PKG_AMDGPU in SUPPORTED_PACKAGES) import AMDGPU end -@static if (PKG_METAL in SUPPORTED_PACKAGES) import Metal end +@static if (PKG_METAL in SUPPORTED_PACKAGES) import_with_filtered_stderr(:Metal) end @static if (PKG_KERNELABSTRACTIONS in SUPPORTED_PACKAGES) import KernelAbstractions end # KernelAbstractions does not require extra harness env vars beyond the existing CUDA/AMDGPU settings. 
excludedfiles = [ "test_excluded.jl", "test_incremental_compilation.jl", "test_revise.jl"]; # TODO: test_incremental_compilation has to be deactivated until Polyester support released +const STDERR_SUPPRESSION_RULES = ( + (name="metadata method overwrite warnings", start=r"^WARNING: Method definition .*###META.* overwritten.*$", stop=nothing), + (name="[T]Data module replacement warnings", start=r"^WARNING: replacing module [T]?Data\.$", stop=nothing), + (name="Metal OS support warnings", start=r"^┌ Error: Metal\.jl is only supported on macOS$", stop=r"^└ @ Metal .*$"), +) + function runtests(testfiles=String[]; stop_on_fail=false) exename = joinpath(Sys.BINDIR, Base.julia_exename()) testdir = pwd() @@ -61,7 +111,7 @@ function runtests(testfiles=String[]; stop_on_fail=false) stdout_content = read(stdout_path, String) stderr_content = read(stderr_path, String) print(stdout_content) - print(Base.stderr, stderr_content) + print(Base.stderr, filter_stderr_content(stderr_content)) catch ex println("Test Abort: a system-level exception occurred while running the test file $f :") println(ex) From c06e7880f217f4fc3e88715f80b8c431b867a049 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 13 Mar 2026 11:43:01 +0100 Subject: [PATCH 11/12] Enhance documentation for kernelcall and automatic range computation in parallel.jl --- src/ParallelKernel/parallel.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ParallelKernel/parallel.jl b/src/ParallelKernel/parallel.jl index c0e97f4..1138d4c 100644 --- a/src/ParallelKernel/parallel.jl +++ b/src/ParallelKernel/parallel.jl @@ -9,9 +9,10 @@ const PARALLEL_DOC = """ @parallel (...) configcall=... backendkwargs... kernelcall @parallel ∇=... ad_mode=... ad_annotations=... (...) backendkwargs... kernelcall -Declare the `kernelcall` parallel. The kernel will automatically be called as required by the package for parallelization selected with [`@init_parallel_kernel`](@ref). 
Synchronizes at the end of the call (if a stream is given via keyword arguments, then it synchronizes only this stream). The keyword argument `∇` triggers a parallel call to the gradient kernel instead of the kernel itself. The automatic differentiation is performed with the package Enzyme.jl (refer to the corresponding documentation for Enzyme-specific terms used below); Enzyme needs to be imported before ParallelKernel in order to have it load the corresponding extension. +Declare the `kernelcall` parallel. The kernel will automatically be called as required by the package for parallelization selected with [`@init_parallel_kernel`](@ref) (however, see below the note on automatic computation of `ranges`). Synchronizes at the end of the call (if a stream is given via keyword arguments, then it synchronizes only this stream). The keyword argument `∇` triggers a parallel call to the gradient kernel instead of the kernel itself. The automatic differentiation is performed with the package Enzyme.jl (refer to the corresponding documentation for Enzyme-specific terms used below); Enzyme needs to be imported before ParallelKernel in order to have it load the corresponding extension. -Automatic computation of `ranges` for `@parallel <kernelcall>` is only possible if the number of parallel indices used by the kernel is equal to the number of dimensions of the highest-dimensional input arrays. Otherwise, specify the `ranges` manually with `@parallel ranges=... <kernelcall>`. +!!! note "Automatic computation of `ranges`" + Automatic computation of `ranges` for `@parallel <kernelcall>` is only possible if the number of parallel indices used by the kernel is equal to the number of dimensions of the highest-dimensional input arrays. Otherwise, specify the `ranges` manually with `@parallel ranges=... <kernelcall>`. !!! note "Runtime hardware selection" When KernelAbstractions is initialized, this wrapper consults [`current_hardware`](@ref) to determine the runtime hardware target.
The symbol defaults to `:cpu` and can be switched to select other targets via [`select_hardware`](@ref). From f8c25fe4736cdebc6f2d8b5a627204b37f011de5 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 13 Mar 2026 11:43:06 +0100 Subject: [PATCH 12/12] Enhance documentation for kernelcall to clarify automatic range computation and add a note for manual specification of ranges --- src/parallel.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parallel.jl b/src/parallel.jl index ed79919..e07e211 100644 --- a/src/parallel.jl +++ b/src/parallel.jl @@ -32,9 +32,10 @@ See also: [`@init_parallel_stencil`](@ref) @parallel (...) memopt=... configcall=... backendkwargs... kernelcall @parallel ∇=... ad_mode=... ad_annotations=... (...) memopt=... backendkwargs... kernelcall -Declare the `kernelcall` parallel. The kernel will automatically be called as required by the package for parallelization selected with [`@init_parallel_kernel`](@ref). Synchronizes at the end of the call (if a stream is given via keyword arguments, then it synchronizes only this stream). The keyword argument `∇` triggers a parallel call to the gradient kernel instead of the kernel itself. The automatic differentiation is performed with the package Enzyme.jl (refer to the corresponding documentation for Enzyme-specific terms used below); Enzyme needs to be imported before ParallelStencil in order to have it load the corresponding extension. +Declare the `kernelcall` parallel. The kernel will automatically be called as required by the package for parallelization selected with [`@init_parallel_kernel`](@ref) (however, see below the note on automatic computation of `ranges`). Synchronizes at the end of the call (if a stream is given via keyword arguments, then it synchronizes only this stream). The keyword argument `∇` triggers a parallel call to the gradient kernel instead of the kernel itself. 
The automatic differentiation is performed with the package Enzyme.jl (refer to the corresponding documentation for Enzyme-specific terms used below); Enzyme needs to be imported before ParallelStencil in order to have it load the corresponding extension. -Automatic computation of `ranges` for `@parallel <kernelcall>` is only possible if the number of parallel indices used by the kernel is equal to the number of dimensions of the highest-dimensional input arrays. Otherwise, specify the `ranges` manually with `@parallel ranges=... <kernelcall>`. +!!! note "Automatic computation of `ranges`" + Automatic computation of `ranges` for `@parallel <kernelcall>` is only possible if the number of parallel indices used by the kernel is equal to the number of dimensions of the highest-dimensional input arrays. Otherwise, specify the `ranges` manually with `@parallel ranges=... <kernelcall>`. !!! note "Runtime hardware selection" When KernelAbstractions is initialized, this wrapper consults [`current_hardware`](@ref) to determine the runtime hardware target. The symbol defaults to `:cpu` and can be switched to select other targets via [`select_hardware`](@ref).