From ac64f8f0686c37445162e3e2e446517d00cc7f99 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 10:27:58 +0200 Subject: [PATCH 01/18] Adapt memory layout --- examples/convergence_linear_advection.jl | 3 +- examples/linear_advection_demo.jl | 3 +- src/CodexMach.jl | 10 + src/equations.jl | 41 ++++ src/fields.jl | 200 +++++++++++++++ src/time_integration.jl | 299 ++++++++++++++--------- test/runtests.jl | 22 +- 7 files changed, 452 insertions(+), 126 deletions(-) create mode 100644 src/fields.jl diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl index 8c15736..0da6bbf 100644 --- a/examples/convergence_linear_advection.jl +++ b/examples/convergence_linear_advection.jl @@ -46,7 +46,8 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), run_linear_advection!(state, problem; steps = steps, dt = dt) - u_num = solution(state) + u_field = solution(state) + u_num = scalar_component(u_field) u_exact = _exact_solution(problem, vel, final_time, u_num) err = sqrt(sum(abs2, u_num .- u_exact) / length(u_num)) diff --git a/examples/linear_advection_demo.jl b/examples/linear_advection_demo.jl index 09913e7..0ef6360 100644 --- a/examples/linear_advection_demo.jl +++ b/examples/linear_advection_demo.jl @@ -78,7 +78,8 @@ function run_linear_advection_demo(; nx::Int = 256, _write_diagnostics_csv(diagnostics_path, records) end - final_state = copy(solution(state)) + final_field = copy(solution(state)) + final_state = scalar_component(final_field) if state_path !== nothing _write_state_csv(state_path, problem, final_state) end diff --git a/src/CodexMach.jl b/src/CodexMach.jl index 90d9995..e0b17a3 100644 --- a/src/CodexMach.jl +++ b/src/CodexMach.jl @@ -8,6 +8,7 @@ simulation_timers() = _SIMULATION_TIMERS include("mesh.jl") include("boundary_conditions.jl") +include("fields.jl") include("equations.jl") include("time_integration.jl") include("simulation.jl") @@ -20,6 +21,15 @@ export greet, PeriodicBoundaryConditions, is_periodic, periodic_axes, + CellField, + cell_components, + ncomponents, + spatial_size, + component, + scalar_component, + allocate_cellfield, + allocate_like, + map_components!, LinearAdvection, LinearAdvectionProblem, velocity, diff --git a/src/equations.jl b/src/equations.jl index a307275..c129aaf 100644 --- a/src/equations.jl +++ b/src/equations.jl @@ -232,6 +232,43 @@ function primitive_variables(eq::CompressibleEuler, return (; rho = ρ, u = u, v = v, p = p) end +function primitive_variables(eq::CompressibleEuler, + conserved::CellField; + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) + size(conserved, 1) == 4 || + throw(ArgumentError("Conserved field must have first dimension of length 4")) + + nx, ny = spatial_size(conserved) + + T = component_eltype(conserved) + ρ = rho_out === nothing ? Array{float(T)}(undef, nx, ny) : rho_out + u = u_out === nothing ? similar(ρ) : u_out + v = v_out === nothing ? similar(ρ) : v_out + p = p_out === nothing ? similar(ρ) : p_out + + ρc = component(conserved, 1) + rhouc = component(conserved, 2) + rhovc = component(conserved, 3) + Ec = component(conserved, 4) + + @inbounds for j in 1:ny, i in 1:nx + prim = primitive_variables(eq, + ρc[i, j], + rhouc[i, j], + rhovc[i, j], + Ec[i, j]) + ρ[i, j] = prim.ρ + u[i, j] = prim.u + v[i, j] = prim.v + p[i, j] = prim.p + end + + return (; rho = ρ, u = u, v = v, p = p) +end + primitive_variables(problem::CompressibleEulerProblem, state; kwargs...) = primitive_variables(pde(problem), solution(state); kwargs...) @@ -239,3 +276,7 @@ primitive_variables(problem::CompressibleEulerProblem, primitive_variables(problem::CompressibleEulerProblem, conserved::AbstractArray{T,3}; kwargs...) where {T} = primitive_variables(pde(problem), conserved; kwargs...) + +primitive_variables(problem::CompressibleEulerProblem, + conserved::CellField; kwargs...) = + primitive_variables(pde(problem), conserved; kwargs...) diff --git a/src/fields.jl b/src/fields.jl new file mode 100644 index 0000000..4dca5fb --- /dev/null +++ b/src/fields.jl @@ -0,0 +1,200 @@ +""" + CellField(components...) + +Bundle one or more cell-centered arrays that share a common `(nx, ny)` shape. +Components are stored in a structure-of-arrays layout, which keeps per-variable +data contiguous for GPU and cache-friendly execution. `CellField` behaves like a +3D `AbstractArray` whose first dimension indexes the component. +""" +struct CellField{T,N,A<:NTuple{N,AbstractArray{T,2}}} <: AbstractArray{T,3} + components::A + function CellField(components::A) where {T,N,A<:NTuple{N,AbstractArray{T,2}}} + N > 0 || throw(ArgumentError("CellField must contain at least one component")) + dims = size(components[1]) + for (idx, comp) in pairs(components) + ndims(comp) == 2 || + throw(ArgumentError("Component $(idx) must be two-dimensional")) + size(comp) == dims || + throw(ArgumentError("Component $(idx) does not match CellField shape")) + end + return new{T,N,A}(components) + end +end + +CellField(component::AbstractArray{T,2}) where {T} = + CellField{T,1,Tuple{typeof(component)}}((component,)) + +""" + cell_components(field) + +Return the tuple of component arrays stored within `field`. +""" +cell_components(field::CellField) = field.components + +""" + ncomponents(field) + +Return the number of component arrays bundled inside `field`. +""" +ncomponents(::CellField{T,N}) where {T,N} = N + +""" + spatial_size(field) + +Return the `(nx, ny)` logical dimensions shared by all components. +""" +spatial_size(field::CellField) = size(cell_components(field)[1]) + +""" + component(field, i) + +Access the `i`-th component array stored inside `field`. +""" +component(field::CellField, i::Integer) = cell_components(field)[i] + +component(field::CellField, ::Val{i}) where {i} = component(field, i) + +component_eltype(::CellField{T}) where {T} = T + +scalar_component(field::CellField) = component(field, 1) + +Base.eltype(field::CellField) = component_eltype(field) + +Base.size(field::CellField) = (ncomponents(field), spatial_size(field)...) + +Base.axes(field::CellField) = (Base.OneTo(ncomponents(field)), + axes(component(field, 1))...) + +Base.IndexStyle(::Type{<:CellField}) = IndexCartesian() + +@inline function Base.getindex(field::CellField, i, j, k) + return component(field, i)[j, k] +end + +@inline function Base.getindex(field::CellField{T,1}, i, j) where {T} + return component(field, 1)[i, j] +end + +@inline function Base.getindex(field::CellField{T,1}, ::Colon, ::Colon) where {T} + return component(field, 1)[:, :] +end + +@inline function Base.getindex(field::CellField, I::CartesianIndex{3}) + return field[I[1], I[2], I[3]] +end + +@inline function Base.getindex(field::CellField, i, ::Colon, ::Colon) + return component(field, i)[:, :] +end + +@inline function Base.getindex(field::CellField, i, j, ::Colon) + return component(field, i)[j, :] +end + +@inline function Base.getindex(field::CellField, i, ::Colon, k) + return component(field, i)[:, k] +end + +@inline function Base.setindex!(field::CellField, value, i, j, k) + component(field, i)[j, k] = value + return field +end + +@inline function Base.setindex!(field::CellField{T,1}, value, i, j) where {T} + component(field, 1)[i, j] = value + return field +end + +@inline function Base.setindex!(field::CellField, value, I::CartesianIndex{3}) + field[I[1], I[2], I[3]] = value + return field +end + +""" + allocate_cellfield(array_type, T, dims, n) + +Allocate a `CellField` with `n` components, each created via +`array_type{T}(undef, dims...)`. +""" +function allocate_cellfield(array_type, + ::Type{T}, + dims::NTuple{2,<:Integer}, + n::Integer) where {T} + n > 0 || throw(ArgumentError("Number of components must be positive")) + dimsT = (Int(dims[1]), Int(dims[2])) + components = ntuple(_ -> array_type{T}(undef, dimsT...), n) + return CellField(components) +end + +""" + allocate_like(field) + +Allocate a new `CellField` whose components mirror the array types and shapes of +`field`. +""" +function allocate_like(field::CellField) + comps = ntuple(i -> similar(component(field, i)), ncomponents(field)) + return CellField(comps) +end + +""" + allocate_like(field, T) + +Allocate a new `CellField` with components similar to those in `field` but whose +entries have element type `T`. +""" +function allocate_like(field::CellField, ::Type{T}) where {T} + comps = ntuple(i -> similar(component(field, i), T, size(component(field, i))), + ncomponents(field)) + return CellField(comps) +end + +Base.fill!(field::CellField, value::Number) = begin + for comp in cell_components(field) + fill!(comp, value) + end + return field +end + +function Base.fill!(field::CellField, values::Union{Tuple,NTuple}) + length(values) == ncomponents(field) || + throw(ArgumentError("Tuple length must match number of components")) + for (comp, val) in zip(cell_components(field), values) + fill!(comp, val) + end + return field +end + +function Base.copyto!(dest::CellField, src::CellField) + ncomponents(dest) == ncomponents(src) || + throw(ArgumentError("Component count mismatch in copy")) + for i in 1:ncomponents(dest) + copyto!(component(dest, i), component(src, i)) + end + return dest +end + +Base.similar(field::CellField) = allocate_like(field) + +Base.similar(field::CellField, ::Type{T}) where {T} = allocate_like(field, T) + +""" + map_components!(f, dest, inputs...) + +Apply `f` to each component array in `dest`, passing the corresponding +components drawn from each `inputs...` collection. All inputs must either be +`CellField`s with matching component counts or scalar arrays reused for every +component. +""" +function map_components!(f, dest::CellField, inputs...) + n = ncomponents(dest) + nin = length(inputs) + for i in 1:n + args = ntuple(j -> begin + input = inputs[j] + input isa CellField ? component(input, i) : input + end, nin) + f(component(dest, i), args...) + end + return dest +end diff --git a/src/time_integration.jl b/src/time_integration.jl index 1ab79f7..4e6a07f 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -68,12 +68,12 @@ function LinearAdvectionState(problem::LinearAdvectionProblem; init = nothing) mesh_obj = mesh(problem) dims = size(mesh_obj) - field = _allocate_field(array_type, T, dims) - _initialize_field!(field, init, mesh_obj, T) + field = allocate_cellfield(array_type, T, dims, 1) + _initialize_scalar_field!(field, init, mesh_obj, T) - k1 = _allocate_field(array_type, T, dims) - k2 = _allocate_field(array_type, T, dims) - stage = _allocate_field(array_type, T, dims) + k1 = allocate_like(field) + k2 = allocate_like(field) + stage = allocate_like(field) fill!(k1, zero(T)) fill!(k2, zero(T)) fill!(stage, zero(T)) @@ -81,32 +81,26 @@ function LinearAdvectionState(problem::LinearAdvectionProblem; return LinearAdvectionState(field, RK2Workspace(k1, k2, stage)) end -function _allocate_field(array_type, ::Type{T}, dims::Tuple{Vararg{Int}}) where {T} - return array_type{T}(undef, dims...) -end - -function _initialize_field!(field, init, mesh::StructuredMesh, ::Type{T}) where {T} +function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh, ::Type{T}) where {T} + data = component(field, 1) if init === nothing - fill!(field, zero(T)) - return field + fill!(data, zero(T)) elseif init isa Number - fill!(field, T(init)) - return field + fill!(data, T(init)) elseif init isa AbstractArray - size(init) == size(field) || + size(init) == size(data) || throw(ArgumentError("Initializer array must match mesh dimensions")) - field .= T.(init) - return field + data .= T.(init) elseif init isa Function centers_x, centers_y = cell_centers(mesh) nx, ny = size(mesh) @inbounds for j in 1:ny, i in 1:nx - field[i, j] = T(init(centers_x[i], centers_y[j])) + data[i, j] = T(init(centers_x[i], centers_y[j])) end - return field else throw(ArgumentError("Unsupported initializer type $(typeof(init))")) end + return field end const _EulerVarCount = 4 @@ -119,12 +113,12 @@ function CompressibleEulerState(problem::CompressibleEulerProblem; dims = size(mesh_obj) eq = pde(problem) - field = _allocate_field(array_type, T, ( _EulerVarCount, dims[1], dims[2])) + field = allocate_cellfield(array_type, T, dims, _EulerVarCount) _initialize_euler_field!(field, init, mesh_obj, eq, T) - k1 = _allocate_field(array_type, T, size(field)) - k2 = _allocate_field(array_type, T, size(field)) - stage = _allocate_field(array_type, T, size(field)) + k1 = allocate_like(field) + k2 = allocate_like(field) + stage = allocate_like(field) fill!(k1, zero(T)) fill!(k2, zero(T)) fill!(stage, zero(T)) @@ -132,7 +126,11 @@ function CompressibleEulerState(problem::CompressibleEulerProblem; return CompressibleEulerState(field, RK2Workspace(k1, k2, stage)) end -function _initialize_euler_field!(field, init, mesh::StructuredMesh, equation::CompressibleEuler, ::Type{T}) where {T} +function _initialize_euler_field!(field::CellField, + init, + mesh::StructuredMesh, + equation::CompressibleEuler, + ::Type{T}) where {T} if init === nothing fill!(field, zero(T)) return field @@ -142,7 +140,9 @@ function _initialize_euler_field!(field, init, mesh::StructuredMesh, equation::C elseif init isa AbstractArray size(init) == size(field) || throw(ArgumentError("Initializer array must match (4, nx, ny) layout")) - field .= T.(init) + for comp in 1:ncomponents(field) + component(field, comp) .= T.(view(init, comp, :, :)) + end return field elseif init isa Function centers_x, centers_y = cell_centers(mesh) @@ -220,18 +220,42 @@ function _rk2_step!(state, problem, dt::Real) @timeit timer "RK2 step" begin @timeit timer "RHS compute" compute_rhs!(ws.k1, u, problem) - @timeit timer "Stage predictor" begin - @. ws.stage = u + dtT * ws.k1 - end + @timeit timer "Stage predictor" _rk2_stage_predict!(ws.stage, u, ws.k1, dtT) @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem) - @timeit timer "Solution update" begin - @. u = u + (dtT * half) * (ws.k1 + ws.k2) - end + @timeit timer "Solution update" _rk2_solution_update!(u, ws.k1, ws.k2, dtT * half) end return state end +@inline function _rk2_stage_predict!(stage::CellField, + u::CellField, + rhs::CellField, + α) + n = ncomponents(stage) + @inbounds for comp in 1:n + stage_comp = component(stage, comp) + u_comp = component(u, comp) + rhs_comp = component(rhs, comp) + @. stage_comp = u_comp + α * rhs_comp + end + return stage +end + +@inline function _rk2_solution_update!(u::CellField, + k1::CellField, + k2::CellField, + α) + n = ncomponents(u) + @inbounds for comp in 1:n + u_comp = component(u, comp) + k1_comp = component(k1, comp) + k2_comp = component(k2, comp) + @. u_comp = u_comp + α * (k1_comp + k2_comp) + end + return u +end + """ compute_rhs!(du, u, problem) @@ -239,9 +263,21 @@ Populate `du` with the spatial derivative of `u` for the PDE described by `problem`. Linear advection problems use a second-order upwind stencil, while compressible Euler problems employ slope-limited Rusanov fluxes. """ +function compute_rhs!(du::CellField, + u::CellField, + problem::LinearAdvectionProblem) + return _linear_advection_rhs!(component(du, 1), component(u, 1), problem) +end + function compute_rhs!(du::AbstractArray{T,2}, u::AbstractArray{T,2}, problem::LinearAdvectionProblem) where {T} + return _linear_advection_rhs!(du, u, problem) +end + +function _linear_advection_rhs!(du::AbstractArray{T,2}, + u::AbstractArray{T,2}, + problem::LinearAdvectionProblem) where {T} mesh_obj = mesh(problem) bc = boundary_conditions(problem) eq = pde(problem) @@ -297,9 +333,24 @@ function compute_rhs!(du::AbstractArray{T,2}, return du end +function compute_rhs!(du::CellField, + u::CellField, + problem::CompressibleEulerProblem) + return _compressible_euler_rhs!(du, u, problem) +end + function compute_rhs!(du::AbstractArray{T,3}, u::AbstractArray{T,3}, problem::CompressibleEulerProblem) where {T} + du_field = CellField(ntuple(i -> view(du, i, :, :), Val(_EulerVarCount))) + u_field = CellField(ntuple(i -> view(u, i, :, :), Val(_EulerVarCount))) + _compressible_euler_rhs!(du_field, u_field, problem) + return du +end + +function _compressible_euler_rhs!(du::CellField, + u::CellField, + problem::CompressibleEulerProblem) mesh_obj = mesh(problem) bc = boundary_conditions(problem) eq = pde(problem) @@ -313,13 +364,23 @@ function compute_rhs!(du::AbstractArray{T,3}, nx >= 3 || throw(ArgumentError("At least three cells required along x")) ny >= 3 || throw(ArgumentError("At least three cells required along y")) + T = component_eltype(u) dx, dy = spacing(mesh_obj) inv_dx = one(T) / T(dx) inv_dy = one(T) / T(dy) γ = gamma(eq) - fill!(du, zero(T)) + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + + dρ = component(du, 1) + drhou = component(du, 2) + drhov = component(du, 3) + dE = component(du, 4) + # x-direction fluxes @inbounds for j in 1:ny jm1 = j == 1 ? ny : j - 1 @@ -330,57 +391,57 @@ function compute_rhs!(du::AbstractArray{T,3}, im = i == 1 ? nx : i - 1 # Slopes for cell i - ΔLρ = u[1, i, j] - u[1, im, j] - ΔRρ = u[1, ip, j] - u[1, i, j] - ΔLrhox = u[2, i, j] - u[2, im, j] - ΔRrhox = u[2, ip, j] - u[2, i, j] - ΔLrhoy = u[3, i, j] - u[3, im, j] - ΔRrhoy = u[3, ip, j] - u[3, i, j] - ΔLE = u[4, i, j] - u[4, im, j] - ΔRE = u[4, ip, j] - u[4, i, j] + ΔLρ = ρ[i, j] - ρ[im, j] + ΔRρ = ρ[ip, j] - ρ[i, j] + ΔLrhox = rhou[i, j] - rhou[im, j] + ΔRrhox = rhou[ip, j] - rhou[i, j] + ΔLrhoy = rhov[i, j] - rhov[im, j] + ΔRrhoy = rhov[ip, j] - rhov[i, j] + ΔLE = E[i, j] - E[im, j] + ΔRE = E[ip, j] - E[i, j] sρ = _minmod(ΔLρ, ΔRρ) srhox = _minmod(ΔLrhox, ΔRrhox) srhoy = _minmod(ΔLrhoy, ΔRrhoy) sE = _minmod(ΔLE, ΔRE) - ρL = u[1, i, j] + T(0.5) * sρ - rhouL = u[2, i, j] + T(0.5) * srhox - rhovL = u[3, i, j] + T(0.5) * srhoy - EL = u[4, i, j] + T(0.5) * sE + ρL = ρ[i, j] + T(0.5) * sρ + rhouL = rhou[i, j] + T(0.5) * srhox + rhovL = rhov[i, j] + T(0.5) * srhoy + EL = E[i, j] + T(0.5) * sE # Slopes for cell ip - ΔLρ_ip = u[1, ip, j] - u[1, i, j] - ΔRρ_ip = u[1, ip2, j] - u[1, ip, j] - ΔLrhox_ip = u[2, ip, j] - u[2, i, j] - ΔRrhox_ip = u[2, ip2, j] - u[2, ip, j] - ΔLrhoy_ip = u[3, ip, j] - u[3, i, j] - ΔRrhoy_ip = u[3, ip2, j] - u[3, ip, j] - ΔLE_ip = u[4, ip, j] - u[4, i, j] - ΔRE_ip = u[4, ip2, j] - u[4, ip, j] + ΔLρ_ip = ρ[ip, j] - ρ[i, j] + ΔRρ_ip = ρ[ip2, j] - ρ[ip, j] + ΔLrhox_ip = rhou[ip, j] - rhou[i, j] + ΔRrhox_ip = rhou[ip2, j] - rhou[ip, j] + ΔLrhoy_ip = rhov[ip, j] - rhov[i, j] + ΔRrhoy_ip = rhov[ip2, j] - rhov[ip, j] + ΔLE_ip = E[ip, j] - E[i, j] + ΔRE_ip = E[ip2, j] - E[ip, j] sρ_ip = _minmod(ΔLρ_ip, ΔRρ_ip) srhox_ip = _minmod(ΔLrhox_ip, ΔRrhox_ip) srhoy_ip = _minmod(ΔLrhoy_ip, ΔRrhoy_ip) sE_ip = _minmod(ΔLE_ip, ΔRE_ip) - ρR = u[1, ip, j] - T(0.5) * sρ_ip - rhouR = u[2, ip, j] - T(0.5) * srhox_ip - rhovR = u[3, ip, j] - T(0.5) * srhoy_ip - ER = u[4, ip, j] - T(0.5) * sE_ip + ρR = ρ[ip, j] - T(0.5) * sρ_ip + rhouR = rhou[ip, j] - T(0.5) * srhox_ip + rhovR = rhov[ip, j] - T(0.5) * srhoy_ip + ER = E[ip, j] - T(0.5) * sE_ip flux1, flux2, flux3, flux4 = _rusanov_flux_x(eq, ρL, rhouL, rhovL, EL, ρR, rhouR, rhovR, ER) - du[1, i, j] -= flux1 * inv_dx - du[2, i, j] -= flux2 * inv_dx - du[3, i, j] -= flux3 * inv_dx - du[4, i, j] -= flux4 * inv_dx + dρ[i, j] -= flux1 * inv_dx + drhou[i, j] -= flux2 * inv_dx + drhov[i, j] -= flux3 * inv_dx + dE[i, j] -= flux4 * inv_dx - du[1, ip, j] += flux1 * inv_dx - du[2, ip, j] += flux2 * inv_dx - du[3, ip, j] += flux3 * inv_dx - du[4, ip, j] += flux4 * inv_dx + dρ[ip, j] += flux1 * inv_dx + drhou[ip, j] += flux2 * inv_dx + drhov[ip, j] += flux3 * inv_dx + dE[ip, j] += flux4 * inv_dx end end @@ -390,56 +451,56 @@ function compute_rhs!(du::AbstractArray{T,3}, jp2 = jp == ny ? 1 : jp + 1 jm = j == 1 ? ny : j - 1 for i in 1:nx - ΔLρ = u[1, i, j] - u[1, i, jm] - ΔRρ = u[1, i, jp] - u[1, i, j] - ΔLrhox = u[2, i, j] - u[2, i, jm] - ΔRrhox = u[2, i, jp] - u[2, i, j] - ΔLrhoy = u[3, i, j] - u[3, i, jm] - ΔRrhoy = u[3, i, jp] - u[3, i, j] - ΔLE = u[4, i, j] - u[4, i, jm] - ΔRE = u[4, i, jp] - u[4, i, j] + ΔLρ = ρ[i, j] - ρ[i, jm] + ΔRρ = ρ[i, jp] - ρ[i, j] + ΔLrhox = rhou[i, j] - rhou[i, jm] + ΔRrhox = rhou[i, jp] - rhou[i, j] + ΔLrhoy = rhov[i, j] - rhov[i, jm] + ΔRrhoy = rhov[i, jp] - rhov[i, j] + ΔLE = E[i, j] - E[i, jm] + ΔRE = E[i, jp] - E[i, j] sρ = _minmod(ΔLρ, ΔRρ) srhox = _minmod(ΔLrhox, ΔRrhox) srhoy = _minmod(ΔLrhoy, ΔRrhoy) sE = _minmod(ΔLE, ΔRE) - ρL = u[1, i, j] + T(0.5) * sρ - rhouL = u[2, i, j] + T(0.5) * srhox - rhovL = u[3, i, j] + T(0.5) * srhoy - EL = u[4, i, j] + T(0.5) * sE + ρL = ρ[i, j] + T(0.5) * sρ + rhouL = rhou[i, j] + T(0.5) * srhox + rhovL = rhov[i, j] + T(0.5) * srhoy + EL = E[i, j] + T(0.5) * sE - ΔLρ_jp = u[1, i, jp] - u[1, i, j] - ΔRρ_jp = u[1, i, jp2] - u[1, i, jp] - ΔLrhox_jp = u[2, i, jp] - u[2, i, j] - ΔRrhox_jp = u[2, i, jp2] - u[2, i, jp] - ΔLrhoy_jp = u[3, i, jp] - u[3, i, j] - ΔRrhoy_jp = u[3, i, jp2] - u[3, i, jp] - ΔLE_jp = u[4, i, jp] - u[4, i, j] - ΔRE_jp = u[4, i, jp2] - u[4, i, jp] + ΔLρ_jp = ρ[i, jp] - ρ[i, j] + ΔRρ_jp = ρ[i, jp2] - ρ[i, jp] + ΔLrhox_jp = rhou[i, jp] - rhou[i, j] + ΔRrhox_jp = rhou[i, jp2] - rhou[i, jp] + ΔLrhoy_jp = rhov[i, jp] - rhov[i, j] + ΔRrhoy_jp = rhov[i, jp2] - rhov[i, jp] + ΔLE_jp = E[i, jp] - E[i, j] + ΔRE_jp = E[i, jp2] - E[i, jp] sρ_jp = _minmod(ΔLρ_jp, ΔRρ_jp) srhox_jp = _minmod(ΔLrhox_jp, ΔRrhox_jp) srhoy_jp = _minmod(ΔLrhoy_jp, ΔRrhoy_jp) sE_jp = _minmod(ΔLE_jp, ΔRE_jp) - ρR = u[1, i, jp] - T(0.5) * sρ_jp - rhouR = u[2, i, jp] - T(0.5) * srhox_jp - rhovR = u[3, i, jp] - T(0.5) * srhoy_jp - ER = u[4, i, jp] - T(0.5) * sE_jp + ρR = ρ[i, jp] - T(0.5) * sρ_jp + rhouR = rhou[i, jp] - T(0.5) * srhox_jp + rhovR = rhov[i, jp] - T(0.5) * srhoy_jp + ER = E[i, jp] - T(0.5) * sE_jp flux1, flux2, flux3, flux4 = _rusanov_flux_y(eq, ρL, rhouL, rhovL, EL, ρR, rhouR, rhovR, ER) - du[1, i, j] -= flux1 * inv_dy - du[2, i, j] -= flux2 * inv_dy - du[3, i, j] -= flux3 * inv_dy - du[4, i, j] -= flux4 * inv_dy + dρ[i, j] -= flux1 * inv_dy + drhou[i, j] -= flux2 * inv_dy + drhov[i, j] -= flux3 * inv_dy + dE[i, j] -= flux4 * inv_dy - du[1, i, jp] += flux1 * inv_dy - du[2, i, jp] += flux2 * inv_dy - du[3, i, jp] += flux3 * inv_dy - du[4, i, jp] += flux4 * inv_dy + dρ[i, jp] += flux1 * inv_dy + drhou[i, jp] += flux2 * inv_dy + drhov[i, jp] += flux3 * inv_dy + dE[i, jp] += flux4 * inv_dy end end @@ -510,16 +571,21 @@ function cfl_number(problem::CompressibleEulerProblem, max_ax = zero(eltype(u)) max_ay = zero(eltype(u)) + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + @inbounds for j in 1:ny, i in 1:nx - ρ = u[1, i, j] - rhou = u[2, i, j] - rhov = u[3, i, j] - E = u[4, i, j] - invρ = one(ρ) / ρ - ux = rhou * invρ - uy = rhov * invρ - kinetic = 0.5 * ρ * (ux^2 + uy^2) - p = (γ - one(γ)) * (E - kinetic) + ρval = ρ[i, j] + rhouval = rhou[i, j] + rhovval = rhov[i, j] + Eval = E[i, j] + invρ = one(ρval) / ρval + ux = rhouval * invρ + uy = rhovval * invρ + kinetic = 0.5 * ρval * (ux^2 + uy^2) + p = (γ - one(γ)) * (Eval - kinetic) c = sqrt(abs(γ * p * invρ)) max_ax = max(max_ax, abs(ux) + c) max_ay = max(max_ay, abs(uy) + c) @@ -549,16 +615,21 @@ function stable_timestep(problem::CompressibleEulerProblem, max_ax = zero(eltype(u)) max_ay = zero(eltype(u)) + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + @inbounds for j in 1:ny, i in 1:nx - ρ = u[1, i, j] - rhou = u[2, i, j] - rhov = u[3, i, j] - E = u[4, i, j] - invρ = one(ρ) / ρ - ux = rhou * invρ - uy = rhov * invρ - kinetic = 0.5 * ρ * (ux^2 + uy^2) - p = (γ - one(γ)) * (E - kinetic) + ρval = ρ[i, j] + rhouval = rhou[i, j] + rhovval = rhov[i, j] + Eval = E[i, j] + invρ = one(ρval) / ρval + ux = rhouval * invρ + uy = rhovval * invρ + kinetic = 0.5 * ρval * (ux^2 + uy^2) + p = (γ - one(γ)) * (Eval - kinetic) c = sqrt(abs(γ * p * invρ)) max_ax = max(max_ax, abs(ux) + c) max_ay = max(max_ay, abs(uy) + c) diff --git a/test/runtests.jl b/test/runtests.jl index 61c5bc8..2d910c3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -47,27 +47,29 @@ end @testset "LinearAdvection state" begin problem = setup_linear_advection_problem(8, 4; velocity = (1.0, 0.0)) state = LinearAdvectionState(problem; init = 2.0) - u = solution(state) + u_field = solution(state) + u = scalar_component(u_field) ws = workspace(state) + @test size(u_field) == (1, 8, 4) @test size(u) == (8, 4) @test all(u .== 2.0) - @test size(ws.k1) == size(u) - @test size(ws.k2) == size(u) - @test size(ws.stage) == size(u) + @test size(ws.k1) == size(u_field) + @test size(ws.k2) == size(u_field) + @test size(ws.stage) == size(u_field) mesh = CodexMach.mesh(problem) init_fun(x, y) = x + y state_fun = LinearAdvectionState(problem; init = init_fun) - u_fun = solution(state_fun) + u_fun = scalar_component(solution(state_fun)) centers_x, centers_y = CodexMach.cell_centers(mesh) @inbounds for j in 1:size(u_fun, 2), i in 1:size(u_fun, 1) @test isapprox(u_fun[i, j], init_fun(centers_x[i], centers_y[j]); atol = 1e-12) end - fill!(u, 1.0) - compute_rhs!(ws.k1, u, problem) - @test all(isapprox.(ws.k1, 0.0; atol = 1e-12)) + fill!(u_field, 1.0) + compute_rhs!(ws.k1, u_field, problem) + @test all(isapprox.(scalar_component(ws.k1), 0.0; atol = 1e-12)) end @testset "LinearAdvection RK2" begin @@ -83,7 +85,7 @@ end rk2_step!(state, problem, dt) expected = [sin(2pi * (x - dt)) for x in centers_x] - u = solution(state) + u = scalar_component(solution(state)) @inbounds for i in 1:nx @test isapprox(u[i, 1], expected[i]; atol = 1e-3) end @@ -115,7 +117,7 @@ end state = LinearAdvectionState(problem; init = init_fun) result = run_linear_advection!(state, problem; steps = 4, dt = dt, record_cfl = true) expected = [sin(2pi * (x - 4 * dt)) for x in centers_x] - u = solution(state) + u = scalar_component(solution(state)) @inbounds for i in 1:nx @test isapprox(u[i, 1], expected[i]; atol = 1e-3) end From 2e466b71474c5b64178e9f430034df0adab3ea0c Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 10:46:46 +0200 Subject: [PATCH 02/18] Add backend infrastructure --- src/CodexMach.jl | 7 +++++++ src/backends.jl | 28 ++++++++++++++++++++++++++++ src/time_integration.jl | 36 ++++++++++++++++++++++++++++++------ test/runtests.jl | 10 ++++++++++ 4 files changed, 75 insertions(+), 6 deletions(-) create mode 100644 src/backends.jl diff --git a/src/CodexMach.jl b/src/CodexMach.jl index e0b17a3..987f86a 100644 --- a/src/CodexMach.jl +++ b/src/CodexMach.jl @@ -9,6 +9,7 @@ simulation_timers() = _SIMULATION_TIMERS include("mesh.jl") include("boundary_conditions.jl") include("fields.jl") +include("backends.jl") include("equations.jl") include("time_integration.jl") include("simulation.jl") @@ -30,6 +31,11 @@ export greet, allocate_cellfield, allocate_like, map_components!, + ExecutionBackend, + SerialBackend, + KernelAbstractionsBackend, + default_backend, + describe, LinearAdvection, LinearAdvectionProblem, velocity, @@ -41,6 +47,7 @@ export greet, RK2Workspace, solution, workspace, + backend, compute_rhs!, rk2_step!, cfl_number, diff --git a/src/backends.jl b/src/backends.jl new file mode 100644 index 0000000..6e838fb --- /dev/null +++ b/src/backends.jl @@ -0,0 +1,28 @@ +abstract type ExecutionBackend end + +struct SerialBackend <: ExecutionBackend +end + +struct KernelAbstractionsBackend{D,WS} <: ExecutionBackend + device::D + workgroupsize::WS +end + +function KernelAbstractionsBackend(device; workgroupsize = nothing) + return KernelAbstractionsBackend{typeof(device), typeof(workgroupsize)}(device, workgroupsize) +end + +KernelAbstractionsBackend(; device = :cpu, workgroupsize = nothing) = + KernelAbstractionsBackend(device; workgroupsize = workgroupsize) + +default_backend() = SerialBackend() + +function describe(backend::ExecutionBackend) + if backend isa SerialBackend + return "SerialBackend" + elseif backend isa KernelAbstractionsBackend + return "KernelAbstractionsBackend($(backend.device))" + else + return string(typeof(backend)) + end +end diff --git a/src/time_integration.jl b/src/time_integration.jl index 4e6a07f..efc371a 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -16,9 +16,10 @@ end Hold the cell-centered solution field alongside scratch buffers required by the RK2 integrator. """ -struct LinearAdvectionState{A,W} +struct LinearAdvectionState{A,W,B} solution::A workspace::W + backend::B end """ @@ -35,15 +36,23 @@ Access the Runge-Kutta scratch buffers bundled with `state`. """ workspace(state::LinearAdvectionState) = state.workspace +""" + backend(state) + +Return the execution backend associated with `state`. +""" +backend(state::LinearAdvectionState) = state.backend + """ CompressibleEulerState(solution, workspace) Hold the conserved variables for the compressible Euler system along with RK2 scratch storage. """ -struct CompressibleEulerState{A,W} +struct CompressibleEulerState{A,W,B} solution::A workspace::W + backend::B end """ @@ -60,12 +69,15 @@ Access the Runge-Kutta scratch buffers bundled with `state`. """ workspace(state::CompressibleEulerState) = state.workspace +backend(state::CompressibleEulerState) = state.backend + const _DefaultArrayType = Array function LinearAdvectionState(problem::LinearAdvectionProblem; T::Type = Float64, array_type = _DefaultArrayType, - init = nothing) + init = nothing, + backend::ExecutionBackend = default_backend()) mesh_obj = mesh(problem) dims = size(mesh_obj) field = allocate_cellfield(array_type, T, dims, 1) @@ -78,7 +90,7 @@ function LinearAdvectionState(problem::LinearAdvectionProblem; fill!(k2, zero(T)) fill!(stage, zero(T)) - return LinearAdvectionState(field, RK2Workspace(k1, k2, stage)) + return LinearAdvectionState(field, RK2Workspace(k1, k2, stage), backend) end function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh, ::Type{T}) where {T} @@ -108,7 +120,8 @@ const _EulerVarCount = 4 function CompressibleEulerState(problem::CompressibleEulerProblem; T::Type = Float64, array_type = _DefaultArrayType, - init = nothing) + init = nothing, + backend::ExecutionBackend = default_backend()) mesh_obj = mesh(problem) dims = size(mesh_obj) eq = pde(problem) @@ -123,7 +136,7 @@ function CompressibleEulerState(problem::CompressibleEulerProblem; fill!(k2, zero(T)) fill!(stage, zero(T)) - return CompressibleEulerState(field, RK2Workspace(k1, k2, stage)) + return CompressibleEulerState(field, RK2Workspace(k1, k2, stage), backend) end function _initialize_euler_field!(field::CellField, @@ -211,6 +224,17 @@ function rk2_step!(state::CompressibleEulerState, end function _rk2_step!(state, problem, dt::Real) + backend_obj = backend(state) + if backend_obj isa SerialBackend + return _rk2_step_serial!(state, problem, dt) + elseif backend_obj isa KernelAbstractionsBackend + throw(ArgumentError("KernelAbstractions backends are not yet implemented for time integration")) + else + throw(ArgumentError("Unsupported execution backend $(describe(backend_obj))")) + end +end + +function _rk2_step_serial!(state, problem, dt::Real) u = solution(state) ws = workspace(state) Tsol = eltype(u) diff --git a/test/runtests.jl b/test/runtests.jl index 2d910c3..5c107e1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,6 +51,8 @@ end u = scalar_component(u_field) ws = workspace(state) + @test backend(state) isa SerialBackend + @test size(u_field) == (1, 8, 4) @test size(u) == (8, 4) @test all(u .== 2.0) @@ -72,6 +74,14 @@ end @test all(isapprox.(scalar_component(ws.k1), 0.0; atol = 1e-12)) end +@testset "Backends" begin + problem = setup_linear_advection_problem(8, 4; velocity = (0.0, 0.0)) + ka_backend = KernelAbstractionsBackend(:cpu) + state = LinearAdvectionState(problem; init = 1.0, backend = ka_backend) + @test backend(state) === ka_backend + @test_throws ArgumentError run_linear_advection!(state, problem; steps = 1, dt = 0.1) +end + @testset "LinearAdvection RK2" begin nx = 64 problem = setup_linear_advection_problem(nx, 1; velocity = (1.0, 0.0)) From ba41fc559d1b549432a1393e0bb8ab5d87c0a1b1 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 12:30:54 +0200 Subject: [PATCH 03/18] Add KernelAbstractions dependency --- Project.toml | 2 + src/CodexMach.jl | 1 + src/kernel_abstractions_support.jl | 100 +++++++++++++++++++++++++++++ src/time_integration.jl | 80 +++++++++++++++++++++-- test/runtests.jl | 11 +++- 5 files changed, 185 insertions(+), 9 deletions(-) create mode 100644 src/kernel_abstractions_support.jl diff --git a/Project.toml b/Project.toml index 5e60efb..906bd9b 100644 --- a/Project.toml +++ b/Project.toml @@ -4,10 +4,12 @@ authors = ["Michael Schlottke-Lakemper "] version = "0.1.0" [deps] +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [compat] +KernelAbstractions = "0.9.38" Plots = "1.41.1" TimerOutputs = "0.5.29" julia = "1.11" diff --git a/src/CodexMach.jl b/src/CodexMach.jl index 987f86a..6301b6f 100644 --- a/src/CodexMach.jl +++ b/src/CodexMach.jl @@ -10,6 +10,7 @@ include("mesh.jl") include("boundary_conditions.jl") include("fields.jl") include("backends.jl") +include("kernel_abstractions_support.jl") include("equations.jl") include("time_integration.jl") include("simulation.jl") diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl new file mode 100644 index 0000000..693fafc --- /dev/null +++ b/src/kernel_abstractions_support.jl @@ -0,0 +1,100 @@ +using KernelAbstractions + +const _SUPPORTED_SYMBOLIC_DEVICES = (:cpu,) + +function _resolve_ka_device(spec) + if spec isa Symbol + spec === :cpu && return KernelAbstractions.CPU() + throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(_SUPPORTED_SYMBOLIC_DEVICES)")) + elseif KernelAbstractions.isdevice(spec) + return spec + else + throw(ArgumentError("KernelAbstractionsBackend device must be a Symbol or device, got $(typeof(spec))")) + end +end + +@kernel function _linear_advection_rhs_kernel!(du, u, ax, ay, inv2dx, inv2dy) + i, j = @index(Global, NTuple) + nx, ny = size(u) + if i <= nx && j <= ny + im1 = i == 1 ? nx : i - 1 + im2 = im1 == 1 ? nx : im1 - 1 + ip1 = i == nx ? 1 : i + 1 + ip2 = ip1 == nx ? 1 : ip1 + 1 + + jm1 = j == 1 ? ny : j - 1 + jm2 = jm1 == 1 ? ny : jm1 - 1 + jp1 = j == ny ? 1 : j + 1 + jp2 = jp1 == ny ? 1 : jp1 + 1 + + zero_ax = zero(ax) + dudx = zero_ax + if ax > zero_ax + dudx = (3 * u[i, j] - 4 * u[im1, j] + u[im2, j]) * inv2dx + elseif ax < zero_ax + dudx = (-3 * u[i, j] + 4 * u[ip1, j] - u[ip2, j]) * inv2dx + end + + zero_ay = zero(ay) + dudy = zero_ay + if ay > zero_ay + dudy = (3 * u[i, j] - 4 * u[i, jm1] + u[i, jm2]) * inv2dy + elseif ay < zero_ay + dudy = (-3 * u[i, j] + 4 * u[i, jp1] - u[i, jp2]) * inv2dy + end + + du[i, j] = -(ax * dudx + ay * dudy) + end +end + +@kernel function _rk2_stage_kernel!(stage, u, rhs, dt) + i, j = @index(Global, NTuple) + nx, ny = size(stage) + if i <= nx && j <= ny + stage[i, j] = u[i, j] + dt * rhs[i, j] + end +end + +@kernel function _rk2_update_kernel!(u, k1, k2, factor) + i, j = @index(Global, NTuple) + nx, ny = size(u) + if i <= nx && j <= ny + u[i, j] = u[i, j] + factor * (k1[i, j] + k2[i, j]) + end +end + +function linear_advection_rhs_kernel!(backend::KernelAbstractionsBackend, + du, u, + nx::Int, ny::Int, + ax, ay, + inv2dx, inv2dy) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _linear_advection_rhs_kernel!(device) : + _linear_advection_rhs_kernel!(device, backend.workgroupsize) + kernel(du, u, ax, ay, inv2dx, inv2dy; ndrange = (nx, ny)) + KernelAbstractions.synchronize(device) + return du +end + +function rk2_stage_kernel!(backend::KernelAbstractionsBackend, + stage, u, rhs, dt) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _rk2_stage_kernel!(device) : + _rk2_stage_kernel!(device, backend.workgroupsize) + kernel(stage, u, rhs, dt; ndrange = size(stage)) + KernelAbstractions.synchronize(device) + return stage +end + +function rk2_update_kernel!(backend::KernelAbstractionsBackend, + u, k1, k2, factor) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _rk2_update_kernel!(device) : + _rk2_update_kernel!(device, backend.workgroupsize) + kernel(u, k1, k2, factor; ndrange = size(u)) + KernelAbstractions.synchronize(device) + return u +end diff --git a/src/time_integration.jl b/src/time_integration.jl index efc371a..e8b47ef 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -214,23 +214,26 @@ Runge-Kutta step of size `dt`. function rk2_step!(state::LinearAdvectionState, problem::LinearAdvectionProblem, dt::Real) - return _rk2_step!(state, problem, dt) + backend_obj = backend(state) + if backend_obj isa SerialBackend + return _rk2_step_serial!(state, problem, dt) + elseif backend_obj isa KernelAbstractionsBackend + return _rk2_step_linear_advection_ka!(state, problem, dt, backend_obj) + else + throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for linear advection")) + end end function rk2_step!(state::CompressibleEulerState, problem::CompressibleEulerProblem, dt::Real) - return _rk2_step!(state, problem, dt) -end - -function _rk2_step!(state, problem, dt::Real) backend_obj = backend(state) if backend_obj isa SerialBackend return _rk2_step_serial!(state, problem, dt) elseif backend_obj isa KernelAbstractionsBackend - throw(ArgumentError("KernelAbstractions backends are not yet implemented for time integration")) + throw(ArgumentError("KernelAbstractions backends are not yet implemented for compressible Euler")) else - throw(ArgumentError("Unsupported execution backend $(describe(backend_obj))")) + throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for compressible Euler")) end end @@ -252,6 +255,69 @@ function _rk2_step_serial!(state, problem, dt::Real) return state end +function _rk2_step_linear_advection_ka!(state::LinearAdvectionState, + problem::LinearAdvectionProblem, + dt::Real, + backend_obj::KernelAbstractionsBackend) + u_field = solution(state) + ws = workspace(state) + mesh_obj = mesh(problem) + dims = size(mesh_obj) + nx, ny = dims + + u = scalar_component(u_field) + k1 = scalar_component(ws.k1) + k2 = scalar_component(ws.k2) + stage = scalar_component(ws.stage) + + Tsol = eltype(u) + dtT = convert(Tsol, dt) + half = convert(Tsol, 0.5) + + eq = pde(problem) + bc = boundary_conditions(problem) + axes = periodic_axes(bc) + vel = velocity(eq) + ax_raw, ay_raw = vel + ax = convert(Tsol, ax_raw) + ay = convert(Tsol, ay_raw) + dx, dy = spacing(mesh_obj) + inv2dx = ax == zero(ax) ? zero(Tsol) : Tsol(1) / (Tsol(2) * Tsol(dx)) + inv2dy = ay == zero(ay) ? zero(Tsol) : Tsol(1) / (Tsol(2) * Tsol(dy)) + + if ax != zero(ax) && !axes[1] + throw(ArgumentError("Periodic boundary conditions required along x for advection")) + end + if ay != zero(ay) && !axes[2] + throw(ArgumentError("Periodic boundary conditions required along y for advection")) + end + if ax != zero(ax) && nx < 3 + throw(ArgumentError("At least three cells along x are required for 2nd-order advection")) + end + if ay != zero(ay) && ny < 3 + throw(ArgumentError("At least three cells along y are required for 2nd-order advection")) + end + + timer = simulation_timers() + + @timeit timer "RK2 step" begin + @timeit timer "RHS compute" begin + linear_advection_rhs_kernel!(backend_obj, k1, u, nx, ny, ax, ay, inv2dx, inv2dy) + end + @timeit timer "Stage predictor" begin + rk2_stage_kernel!(backend_obj, stage, u, k1, dtT) + end + @timeit timer "RHS compute" begin + linear_advection_rhs_kernel!(backend_obj, k2, stage, nx, ny, ax, ay, inv2dx, inv2dy) + end + @timeit timer "Solution update" begin + rk2_update_kernel!(backend_obj, u, k1, k2, dtT * half) + end + end + + return state +end + @inline function _rk2_stage_predict!(stage::CellField, u::CellField, rhs::CellField, diff --git a/test/runtests.jl b/test/runtests.jl index 5c107e1..705bf30 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,11 +75,18 @@ end end @testset "Backends" begin - problem = setup_linear_advection_problem(8, 4; velocity = (0.0, 0.0)) + problem = setup_linear_advection_problem(8, 4; velocity = (0.5, 0.0)) ka_backend = KernelAbstractionsBackend(:cpu) state = LinearAdvectionState(problem; init = 1.0, backend = ka_backend) @test backend(state) === ka_backend - @test_throws ArgumentError run_linear_advection!(state, problem; steps = 1, dt = 0.1) + + dt = 0.1 + serial_state = LinearAdvectionState(problem; init = 1.0) + rk2_step!(serial_state, problem, dt) + rk2_step!(state, problem, dt) + serial_u = scalar_component(solution(serial_state)) + ka_u = scalar_component(solution(state)) + @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) end @testset "LinearAdvection RK2" begin From 7c10020ac65beafb6186ede7790c956b2a444b53 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 12:34:47 +0200 Subject: [PATCH 04/18] Update test --- test/runtests.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 705bf30..39a5e29 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -81,9 +81,12 @@ end @test backend(state) === ka_backend dt = 0.1 + steps = 5 serial_state = LinearAdvectionState(problem; init = 1.0) - rk2_step!(serial_state, problem, dt) - rk2_step!(state, problem, dt) + for _ in 1:steps + rk2_step!(serial_state, problem, dt) + rk2_step!(state, problem, dt) + end serial_u = scalar_component(solution(serial_state)) ka_u = scalar_component(solution(state)) @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) From e53255111c34caf789174da8eaf95d3174634ee1 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 12:38:47 +0200 Subject: [PATCH 05/18] Another KA update --- src/kernel_abstractions_support.jl | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl index 693fafc..6d6b8c9 100644 --- a/src/kernel_abstractions_support.jl +++ b/src/kernel_abstractions_support.jl @@ -6,7 +6,7 @@ function _resolve_ka_device(spec) if spec isa Symbol spec === :cpu && return KernelAbstractions.CPU() throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(_SUPPORTED_SYMBOLIC_DEVICES)")) - elseif KernelAbstractions.isdevice(spec) + elseif spec isa KernelAbstractions.AbstractDevice return spec else throw(ArgumentError("KernelAbstractionsBackend device must be a Symbol or device, got $(typeof(spec))")) @@ -48,18 +48,16 @@ end end @kernel function _rk2_stage_kernel!(stage, u, rhs, dt) - i, j = @index(Global, NTuple) - nx, ny = size(stage) - if i <= nx && j <= ny - stage[i, j] = u[i, j] + dt * rhs[i, j] + I = @index(Global) + if I <= length(stage) + stage[I] = u[I] + dt * rhs[I] end end @kernel function _rk2_update_kernel!(u, k1, k2, factor) - i, j = @index(Global, NTuple) - nx, ny = size(u) - if i <= nx && j <= ny - u[i, j] = u[i, j] + factor * (k1[i, j] + k2[i, j]) + I = @index(Global) + if I <= length(u) + u[I] = u[I] + factor * (k1[I] + k2[I]) end end @@ -83,7 +81,7 @@ function rk2_stage_kernel!(backend::KernelAbstractionsBackend, kernel = backend.workgroupsize === nothing ? _rk2_stage_kernel!(device) : _rk2_stage_kernel!(device, backend.workgroupsize) - kernel(stage, u, rhs, dt; ndrange = size(stage)) + kernel(stage, u, rhs, dt; ndrange = length(stage)) KernelAbstractions.synchronize(device) return stage end @@ -94,7 +92,7 @@ function rk2_update_kernel!(backend::KernelAbstractionsBackend, kernel = backend.workgroupsize === nothing ? _rk2_update_kernel!(device) : _rk2_update_kernel!(device, backend.workgroupsize) - kernel(u, k1, k2, factor; ndrange = size(u)) + kernel(u, k1, k2, factor; ndrange = length(u)) KernelAbstractions.synchronize(device) return u end From fc087792f8663bf78b8409e36ddace04e3bb947e Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 12:43:02 +0200 Subject: [PATCH 06/18] Add KA.jl backend for Euler --- src/time_integration.jl | 40 ++++++++++++++++++++++++++++++++- test/test_compressible_euler.jl | 28 +++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/src/time_integration.jl b/src/time_integration.jl index e8b47ef..0a69df2 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -231,7 +231,7 @@ function rk2_step!(state::CompressibleEulerState, if backend_obj isa SerialBackend return _rk2_step_serial!(state, problem, dt) elseif backend_obj isa KernelAbstractionsBackend - throw(ArgumentError("KernelAbstractions backends are not yet implemented for compressible Euler")) + return _rk2_step_euler_ka!(state, problem, dt, backend_obj) else throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for compressible Euler")) end @@ -318,6 +318,44 @@ function _rk2_step_linear_advection_ka!(state::LinearAdvectionState, return state end +function _rk2_step_euler_ka!(state::CompressibleEulerState, + problem::CompressibleEulerProblem, + dt::Real, + backend_obj::KernelAbstractionsBackend) + u_field = solution(state) + ws = workspace(state) + Tsol = eltype(u_field) + dtT = convert(Tsol, dt) + half = convert(Tsol, 0.5) + + timer = simulation_timers() + + @timeit timer "RK2 step" begin + @timeit timer "RHS compute" compute_rhs!(ws.k1, u_field, problem) + @timeit timer "Stage predictor" begin + for comp in 1:ncomponents(u_field) + rk2_stage_kernel!(backend_obj, + component(ws.stage, comp), + component(u_field, comp), + component(ws.k1, comp), + dtT) + end + end + @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem) + @timeit timer "Solution update" begin + for comp in 1:ncomponents(u_field) + rk2_update_kernel!(backend_obj, + component(u_field, comp), + component(ws.k1, comp), + component(ws.k2, comp), + dtT * half) + end + end + end + + return state +end + @inline function _rk2_stage_predict!(stage::CellField, u::CellField, rhs::CellField, diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl index f408a20..065d162 100644 --- a/test/test_compressible_euler.jl +++ b/test/test_compressible_euler.jl @@ -66,3 +66,31 @@ using CodexMach @test all(isapprox.(prim_buf.v, prim.v; atol = 1e-12)) @test all(isapprox.(prim_buf.p, prim.p; atol = 1e-12)) end + +@testset "Compressible Euler KA backend" begin + nx, ny = 16, 16 + prob = setup_compressible_euler_problem(nx, ny; + lengths = (1.0, 1.0), + origin = (0.0, 0.0), + gamma = 1.4) + + init(x, y) = (; rho = 1.0 + 0.05 * sinpi(x), + v1 = 0.02 * cospi(y), + v2 = 0.01 * sinpi(x + y), + p = 1.0 + 0.02 * cospi(x)) + + serial_state = CompressibleEulerState(prob; init = init, T = Float64) + ka_state = CompressibleEulerState(prob; init = init, T = Float64, + backend = KernelAbstractionsBackend(:cpu)) + + steps = 3 + dt = stable_timestep(prob, serial_state; cfl = 0.4) + for _ in 1:steps + rk2_step!(serial_state, prob, dt) + rk2_step!(ka_state, prob, dt) + end + + serial_u = solution(serial_state) + ka_u = solution(ka_state) + @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) +end From dfab12b8e9599ad67f12c5153df2348b60215c92 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 12:50:52 +0200 Subject: [PATCH 07/18] More KA implementation --- src/kernel_abstractions_support.jl | 248 +++++++++++++++++++++++++++++ src/time_integration.jl | 42 ++++- 2 files changed, 288 insertions(+), 2 deletions(-) diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl index 6d6b8c9..3ffabcc 100644 --- a/src/kernel_abstractions_support.jl +++ b/src/kernel_abstractions_support.jl @@ -13,6 +13,75 @@ function _resolve_ka_device(spec) end end +@inline function _ka_minmod(a, b) + S = promote_type(typeof(a), typeof(b)) + if a * b <= 0 + return zero(S) + end + return S(copysign(min(abs(a), abs(b)), a)) +end + +@inline function _ka_velocity_pressure(γ, ρ, rhou, rhov, E) + invρ = one(ρ) / ρ + ux = rhou * invρ + uy = rhov * invρ + kinetic = (one(ρ) / 2) * ρ * (ux^2 + uy^2) + p = (γ - one(γ)) * (E - kinetic) + return ux, uy, p +end + +@inline function _ka_rusanov_flux_x(γ, ρL, rhouL, rhovL, EL, + ρR, rhouR, rhovR, ER) + uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL) + uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER) + cL = sqrt(abs(γ * pL / ρL)) + cR = sqrt(abs(γ * pR / ρR)) + smax = max(abs(uxL) + cL, abs(uxR) + cR) + + FL1 = rhouL + FL2 = rhouL * uxL + pL + FL3 = rhouL * uyL + FL4 = (EL + pL) * uxL + + FR1 = rhouR + FR2 = rhouR * uxR + pR + FR3 = rhouR * uyR + FR4 = (ER + pR) * uxR + + flux1 = 0.5 * (FL1 + FR1) - 0.5 * smax * (ρR - ρL) + flux2 = 0.5 * (FL2 + FR2) - 0.5 * smax * (rhouR - rhouL) + flux3 = 0.5 * (FL3 + FR3) - 0.5 * smax * (rhovR - rhovL) + flux4 = 0.5 * (FL4 + FR4) - 0.5 * smax * (ER - EL) + + return flux1, flux2, flux3, flux4 +end + +@inline function _ka_rusanov_flux_y(γ, ρL, rhouL, rhovL, EL, + ρR, rhouR, rhovR, ER) + uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL) + uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER) + cL = sqrt(abs(γ * pL / ρL)) + cR = sqrt(abs(γ * pR / ρR)) + smax = max(abs(uyL) + cL, abs(uyR) + cR) + + GL1 = rhovL + GL2 = rhovL * uxL + GL3 = rhovL * uyL + pL + GL4 = (EL + pL) * uyL + + GR1 = rhovR + GR2 = rhovR * uxR + GR3 = rhovR * uyR + pR + GR4 = (ER + pR) * uyR + + flux1 = 0.5 * (GL1 + GR1) - 0.5 * smax * (ρR - ρL) + flux2 = 0.5 * (GL2 + GR2) - 0.5 * smax * (rhouR - rhouL) + flux3 = 0.5 * (GL3 + GR3) - 0.5 * smax * (rhovR - rhovL) + flux4 = 0.5 * (GL4 + GR4) - 0.5 * smax * (ER - EL) + + return flux1, flux2, flux3, flux4 +end + @kernel function _linear_advection_rhs_kernel!(du, u, ax, ay, inv2dx, inv2dy) i, j = @index(Global, NTuple) nx, ny = size(u) @@ -96,3 +165,182 @@ function rk2_update_kernel!(backend::KernelAbstractionsBackend, KernelAbstractions.synchronize(device) return u end + +@kernel function _compressible_euler_rhs_kernel!(dρ, drhou, drhov, dE, + ρ, rhou, rhov, E, + γ, inv_dx, inv_dy) + i, j = @index(Global, NTuple) + nx, ny = size(ρ) + if i <= nx && j <= ny + ip = i == nx ? 1 : i + 1 + ip2 = ip == nx ? 1 : ip + 1 + im = i == 1 ? nx : i - 1 + im2 = im == 1 ? nx : im - 1 + + jp = j == ny ? 1 : j + 1 + jp2 = jp == ny ? 1 : jp + 1 + jm = j == 1 ? ny : j - 1 + jm2 = jm == 1 ? ny : jm - 1 + + half = typeof(ρ[i, j])(0.5) + + # Slopes for cell i + ΔLρ = ρ[i, j] - ρ[im, j] + ΔRρ = ρ[ip, j] - ρ[i, j] + ΔLrhox = rhou[i, j] - rhou[im, j] + ΔRrhox = rhou[ip, j] - rhou[i, j] + ΔLrhoy = rhov[i, j] - rhov[im, j] + ΔRrhoy = rhov[ip, j] - rhov[i, j] + ΔLE = E[i, j] - E[im, j] + ΔRE = E[ip, j] - E[i, j] + + sρ = _ka_minmod(ΔLρ, ΔRρ) + srhox = _ka_minmod(ΔLrhox, ΔRrhox) + srhoy = _ka_minmod(ΔLrhoy, ΔRrhoy) + sE = _ka_minmod(ΔLE, ΔRE) + + ρL_plus = ρ[i, j] + half * sρ + rhouL_plus = rhou[i, j] + half * srhox + rhovL_plus = rhov[i, j] + half * srhoy + EL_plus = E[i, j] + half * sE + + # Slopes for cell ip + ΔLρ_ip = ρ[ip, j] - ρ[i, j] + ΔRρ_ip = ρ[ip2, j] - ρ[ip, j] + ΔLrhox_ip = rhou[ip, j] - rhou[i, j] + ΔRrhox_ip = rhou[ip2, j] - rhou[ip, j] + ΔLrhoy_ip = rhov[ip, j] - rhov[i, j] + ΔRrhoy_ip = rhov[ip2, j] - rhov[ip, j] + ΔLE_ip = E[ip, j] - E[i, j] + ΔRE_ip = E[ip2, j] - E[ip, j] + + sρ_ip = _ka_minmod(ΔLρ_ip, ΔRρ_ip) + srhox_ip = _ka_minmod(ΔLrhox_ip, ΔRrhox_ip) + srhoy_ip = _ka_minmod(ΔLrhoy_ip, ΔRrhoy_ip) + sE_ip = _ka_minmod(ΔLE_ip, ΔRE_ip) + + ρR_plus = ρ[ip, j] - half * sρ_ip + rhouR_plus = rhou[ip, j] - half * srhox_ip + rhovR_plus = rhov[ip, j] - half * srhoy_ip + ER_plus = E[ip, j] - half * sE_ip + + flux1_plus, flux2_plus, flux3_plus, flux4_plus = + _ka_rusanov_flux_x(γ, ρL_plus, rhouL_plus, rhovL_plus, EL_plus, + ρR_plus, rhouR_plus, rhovR_plus, ER_plus) + + # Interface i-1/2 + ΔLρ_im = ρ[im, j] - ρ[im2, j] + ΔRρ_im = ρ[i, j] - ρ[im, j] + ΔLrhox_im = rhou[im, j] - rhou[im2, j] + ΔRrhox_im = rhou[i, j] - rhou[im, j] + ΔLrhoy_im = rhov[im, j] - rhov[im2, j] + ΔRrhoy_im = rhov[i, j] - rhov[im, j] + ΔLE_im = E[im, j] - E[im2, j] + ΔRE_im = E[i, j] - E[im, j] + + sρ_im = _ka_minmod(ΔLρ_im, ΔRρ_im) + srhox_im = _ka_minmod(ΔLrhox_im, ΔRrhox_im) + srhoy_im = _ka_minmod(ΔLrhoy_im, ΔRrhoy_im) + sE_im = _ka_minmod(ΔLE_im, ΔRE_im) + + ρL_minus = ρ[im, j] + half * sρ_im + rhouL_minus = rhou[im, j] + half * srhox_im + rhovL_minus = rhov[im, j] + half * srhoy_im + EL_minus = E[im, j] + half * sE_im + + ρR_minus = ρ[i, j] - half * sρ + rhouR_minus = rhou[i, j] - half * srhox + rhovR_minus = rhov[i, j] - half * srhoy + ER_minus = E[i, j] - half * sE + + flux1_minus, flux2_minus, flux3_minus, flux4_minus = + _ka_rusanov_flux_x(γ, ρL_minus, rhouL_minus, rhovL_minus, EL_minus, + ρR_minus, rhouR_minus, rhovR_minus, ER_minus) + + # Y-direction interface j+1/2 + ΔLρ_y = ρ[i, j] - ρ[i, jm] + ΔRρ_y = ρ[i, jp] - ρ[i, j] + ΔLrhox_y = rhou[i, j] - rhou[i, jm] + ΔRrhox_y = rhou[i, jp] - rhou[i, j] + ΔLrhoy_y = rhov[i, j] - rhov[i, jm] + ΔRrhoy_y = rhov[i, jp] - rhov[i, j] + ΔLE_y = E[i, j] - E[i, jm] + ΔRE_y = E[i, jp] - E[i, j] + + sρ_y = _ka_minmod(ΔLρ_y, ΔRρ_y) + srhox_y = _ka_minmod(ΔLrhox_y, ΔRrhox_y) + srhoy_y = _ka_minmod(ΔLrhoy_y, ΔRrhoy_y) + sE_y = _ka_minmod(ΔLE_y, ΔRE_y) + + ρL_plus_y = ρ[i, j] + half * sρ_y + rhouL_plus_y = rhou[i, j] + half * srhox_y + rhovL_plus_y = rhov[i, j] + half * srhoy_y + EL_plus_y = E[i, j] + half * sE_y + + ΔLρ_jp = ρ[i, jp] - ρ[i, j] + ΔRρ_jp = ρ[i, jp2] - ρ[i, jp] + ΔLrhox_jp = rhou[i, jp] - rhou[i, j] + ΔRrhox_jp = rhou[i, jp2] - rhou[i, jp] + ΔLrhoy_jp = rhov[i, jp] - rhov[i, j] + ΔRrhoy_jp = rhov[i, jp2] - rhov[i, jp] + ΔLE_jp = E[i, jp] - E[i, j] + ΔRE_jp = E[i, jp2] - E[i, jp] + + sρ_jp = _ka_minmod(ΔLρ_jp, ΔRρ_jp) + srhox_jp = _ka_minmod(ΔLrhox_jp, ΔRrhox_jp) + srhoy_jp = _ka_minmod(ΔLrhoy_jp, ΔRrhoy_jp) + sE_jp = _ka_minmod(ΔLE_jp, ΔRE_jp) + + ρR_plus_y = ρ[i, jp] - half * sρ_jp + rhouR_plus_y = rhou[i, jp] - half * srhox_jp + rhovR_plus_y = rhov[i, jp] - half * srhoy_jp + ER_plus_y = E[i, jp] - half * sE_jp + + flux1_plus_y, flux2_plus_y, flux3_plus_y, flux4_plus_y = + _ka_rusanov_flux_y(γ, ρL_plus_y, rhouL_plus_y, rhovL_plus_y, EL_plus_y, + ρR_plus_y, rhouR_plus_y, rhovR_plus_y, ER_plus_y) + + # Interface j-1/2 + ΔLρ_jm = ρ[i, jm] - ρ[i, jm2] + ΔRρ_jm = ρ[i, j] - ρ[i, jm] + ΔLrhox_jm = rhou[i, jm] - rhou[i, jm2] + ΔRrhox_jm = rhou[i, j] - rhou[i, jm] + ΔLrhoy_jm = rhov[i, jm] - rhov[i, jm2] + ΔRrhoy_jm = rhov[i, j] - rhov[i, jm] + ΔLE_jm = E[i, jm] - E[i, jm2] + ΔRE_jm = E[i, j] - E[i, jm] + + sρ_jm = _ka_minmod(ΔLρ_jm, ΔRρ_jm) + srhox_jm = _ka_minmod(ΔLrhox_jm, ΔRrhox_jm) + srhoy_jm = _ka_minmod(ΔLrhoy_jm, ΔRrhoy_jm) + sE_jm = _ka_minmod(ΔLE_jm, ΔRE_jm) + + ρL_minus_y = ρ[i, jm] + half * sρ_jm + rhouL_minus_y = rhou[i, jm] + half * srhox_jm + rhovL_minus_y = rhov[i, jm] + half * srhoy_jm + EL_minus_y = E[i, jm] + half * sE_jm + + ρR_minus_y = ρ[i, j] - half * sρ_y + rhouR_minus_y = rhou[i, j] - half * srhox_y + rhovR_minus_y = rhov[i, j] - half * srhoy_y + ER_minus_y = E[i, j] - half * sE_y + + flux1_minus_y, flux2_minus_y, flux3_minus_y, flux4_minus_y = + _ka_rusanov_flux_y(γ, ρL_minus_y, rhouL_minus_y, rhovL_minus_y, EL_minus_y, + ρR_minus_y, rhouR_minus_y, rhovR_minus_y, ER_minus_y) + + dρ_val = -(flux1_plus - flux1_minus) * inv_dx - + (flux1_plus_y - flux1_minus_y) * inv_dy + drhou_val = -(flux2_plus - flux2_minus) * inv_dx - + (flux2_plus_y - flux2_minus_y) * inv_dy + drhov_val = -(flux3_plus - flux3_minus) * inv_dx - + (flux3_plus_y - flux3_minus_y) * inv_dy + dE_val = -(flux4_plus - flux4_minus) * inv_dx - + (flux4_plus_y - flux4_minus_y) * inv_dy + + dρ[i, j] = dρ_val + drhou[i, j] = drhou_val + drhov[i, j] = drhov_val + dE[i, j] = dE_val + end +end diff --git a/src/time_integration.jl b/src/time_integration.jl index 0a69df2..69beb25 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -331,7 +331,7 @@ function _rk2_step_euler_ka!(state::CompressibleEulerState, timer = simulation_timers() @timeit timer "RK2 step" begin - @timeit timer "RHS compute" compute_rhs!(ws.k1, u_field, problem) + @timeit timer "RHS compute" _compressible_euler_rhs_ka!(backend_obj, ws.k1, u_field, problem) @timeit timer "Stage predictor" begin for comp in 1:ncomponents(u_field) rk2_stage_kernel!(backend_obj, @@ -341,7 +341,7 @@ function _rk2_step_euler_ka!(state::CompressibleEulerState, dtT) end end - @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem) + @timeit timer "RHS compute" _compressible_euler_rhs_ka!(backend_obj, ws.k2, ws.stage, problem) @timeit timer "Solution update" begin for comp in 1:ncomponents(u_field) rk2_update_kernel!(backend_obj, @@ -356,6 +356,44 @@ function _rk2_step_euler_ka!(state::CompressibleEulerState, return state end +function _compressible_euler_rhs_ka!(backend::KernelAbstractionsBackend, + du::CellField, + u::CellField, + problem::CompressibleEulerProblem) + mesh_obj = mesh(problem) + nx, ny = size(mesh_obj) + eq = pde(problem) + T = component_eltype(u) + dx, dy = spacing(mesh_obj) + inv_dx = T(1) / T(dx) + inv_dy = T(1) / T(dy) + γ = T(gamma(eq)) + + for comp in 1:ncomponents(du) + fill!(component(du, comp), zero(T)) + end + + dρ = component(du, 1) + drhou = component(du, 2) + drhov = component(du, 3) + dE = component(du, 4) + + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _compressible_euler_rhs_kernel!(device) : + _compressible_euler_rhs_kernel!(device, backend.workgroupsize) + kernel(dρ, drhou, drhov, dE, + ρ, rhou, rhov, E, + γ, inv_dx, inv_dy; ndrange = (nx, ny)) + KernelAbstractions.synchronize(device) + return du +end + @inline function _rk2_stage_predict!(stage::CellField, u::CellField, rhs::CellField, From 290d3f46c0deee14c2a86d58647e7624923732c5 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 12:56:54 +0200 Subject: [PATCH 08/18] Add actual GPU backends as package extensions --- Project.toml | 8 ++++++++ ext/CodexMachCUDAExt.jl | 11 +++++++++++ ext/CodexMachMetalExt.jl | 11 +++++++++++ src/CodexMach.jl | 2 ++ src/backends.jl | 10 +++------- src/kernel_abstractions_support.jl | 17 ++++++++++++++--- test/runtests.jl | 10 ++++++++++ test/test_compressible_euler.jl | 10 ++++++++++ 8 files changed, 69 insertions(+), 10 deletions(-) create mode 100644 ext/CodexMachCUDAExt.jl create mode 100644 ext/CodexMachMetalExt.jl diff --git a/Project.toml b/Project.toml index 906bd9b..6d72bac 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,14 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36" + +[extensions] +CodexMachCUDAExt = "CUDA" +CodexMachMetalExt = "Metal" + [compat] KernelAbstractions = "0.9.38" Plots = "1.41.1" diff --git a/ext/CodexMachCUDAExt.jl b/ext/CodexMachCUDAExt.jl new file mode 100644 index 0000000..6c510e6 --- /dev/null +++ b/ext/CodexMachCUDAExt.jl @@ -0,0 +1,11 @@ +module CodexMachCUDAExt + +using CodexMach +using KernelAbstractions +using CUDA + +function __init__() + CodexMach.register_backend!(:cuda, () -> KernelAbstractions.CUDADevice()) +end + +end # module diff --git a/ext/CodexMachMetalExt.jl b/ext/CodexMachMetalExt.jl new file mode 100644 index 0000000..cfc9abe --- /dev/null +++ b/ext/CodexMachMetalExt.jl @@ -0,0 +1,11 @@ +module CodexMachMetalExt + +using CodexMach +using KernelAbstractions +using Metal + +function __init__() + CodexMach.register_backend!(:metal, () -> KernelAbstractions.MetalDevice()) +end + +end # module diff --git a/src/CodexMach.jl b/src/CodexMach.jl index 6301b6f..fd9675d 100644 --- a/src/CodexMach.jl +++ b/src/CodexMach.jl @@ -32,6 +32,8 @@ export greet, allocate_cellfield, allocate_like, map_components!, + register_backend!, + available_backends, ExecutionBackend, SerialBackend, KernelAbstractionsBackend, diff --git a/src/backends.jl b/src/backends.jl index 6e838fb..208d927 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -18,11 +18,7 @@ KernelAbstractionsBackend(; device = :cpu, workgroupsize = nothing) = default_backend() = SerialBackend() function describe(backend::ExecutionBackend) - if backend isa SerialBackend - return "SerialBackend" - elseif backend isa KernelAbstractionsBackend - return "KernelAbstractionsBackend($(backend.device))" - else - return string(typeof(backend)) - end + backend isa SerialBackend && return "SerialBackend" + backend isa KernelAbstractionsBackend && return "KernelAbstractionsBackend($(backend.device))" + return string(typeof(backend)) end diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl index 3ffabcc..d9ee63a 100644 --- a/src/kernel_abstractions_support.jl +++ b/src/kernel_abstractions_support.jl @@ -1,11 +1,20 @@ using KernelAbstractions -const _SUPPORTED_SYMBOLIC_DEVICES = (:cpu,) +const _DEVICE_REGISTRY = Dict{Symbol,Function}() + +function register_backend!(name::Symbol, factory::Function) + _DEVICE_REGISTRY[name] = factory + return name +end + +available_backends() = collect(keys(_DEVICE_REGISTRY)) function _resolve_ka_device(spec) if spec isa Symbol - spec === :cpu && return KernelAbstractions.CPU() - throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(_SUPPORTED_SYMBOLIC_DEVICES)")) + factory = get(_DEVICE_REGISTRY, spec) do + throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(join(sort!(collect(keys(_DEVICE_REGISTRY))), ", "))")) + end + return factory() elseif spec isa KernelAbstractions.AbstractDevice return spec else @@ -344,3 +353,5 @@ end dE[i, j] = dE_val end end + +register_backend!(:cpu, () -> KernelAbstractions.CPU()) diff --git a/test/runtests.jl b/test/runtests.jl index 39a5e29..d328c10 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -90,6 +90,16 @@ end serial_u = scalar_component(solution(serial_state)) ka_u = scalar_component(solution(state)) @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) + + if :metal in available_backends() + metal_state = LinearAdvectionState(problem; init = 1.0, + backend = KernelAbstractionsBackend(:metal)) + for _ in 1:steps + rk2_step!(metal_state, problem, dt) + end + metal_u = scalar_component(solution(metal_state)) + @test all(isapprox.(metal_u, serial_u; atol = 1e-10)) + end end @testset "LinearAdvection RK2" begin diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl index 065d162..683b341 100644 --- a/test/test_compressible_euler.jl +++ b/test/test_compressible_euler.jl @@ -93,4 +93,14 @@ end serial_u = solution(serial_state) ka_u = solution(ka_state) @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) + + if :metal in available_backends() + metal_state = CompressibleEulerState(prob; init = init, T = Float64, + backend = KernelAbstractionsBackend(:metal)) + for _ in 1:steps + rk2_step!(metal_state, prob, dt) + end + metal_u = solution(metal_state) + @test all(isapprox.(metal_u, serial_u; atol = 1e-10)) + end end From 09c3a394bdedc1353e08bc0c9870bc8b87bd176c Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 13:10:56 +0200 Subject: [PATCH 09/18] Make Metal a guarded test --- Project.toml | 4 ++++ test/runtests.jl | 2 +- test/test_compressible_euler.jl | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 6d72bac..155df3f 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,9 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +[extras] +Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36" + [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36" @@ -18,6 +21,7 @@ CodexMachMetalExt = "Metal" [compat] KernelAbstractions = "0.9.38" +Metal = "1" Plots = "1.41.1" TimerOutputs = "0.5.29" julia = "1.11" diff --git a/test/runtests.jl b/test/runtests.jl index d328c10..2bf623a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -91,7 +91,7 @@ end ka_u = scalar_component(solution(state)) @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) - if :metal in available_backends() + if Sys.isapple() && :metal in available_backends() metal_state = LinearAdvectionState(problem; init = 1.0, backend = KernelAbstractionsBackend(:metal)) for _ in 1:steps diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl index 683b341..a42a17e 100644 --- a/test/test_compressible_euler.jl +++ b/test/test_compressible_euler.jl @@ -94,7 +94,7 @@ end ka_u = solution(ka_state) @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) - if :metal in available_backends() + if Sys.isapple() && :metal in available_backends() metal_state = CompressibleEulerState(prob; init = init, T = Float64, backend = KernelAbstractionsBackend(:metal)) for _ in 1:steps From d71e718861ca86eb15088f142258a2948daddaae Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 13:55:58 +0200 Subject: [PATCH 10/18] Fix backend guards --- Project.toml | 2 ++ test/runtests.jl | 12 +++++++++++- test/test_compressible_euler.jl | 12 +++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 155df3f..2dd69e6 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [extras] Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" @@ -21,6 +22,7 @@ CodexMachMetalExt = "Metal" [compat] KernelAbstractions = "0.9.38" +CUDA = "5" Metal = "1" Plots = "1.41.1" TimerOutputs = "0.5.29" diff --git a/test/runtests.jl b/test/runtests.jl index 2bf623a..6791eed 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -91,7 +91,7 @@ end ka_u = scalar_component(solution(state)) @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) - if Sys.isapple() && :metal in available_backends() + if :metal in available_backends() metal_state = LinearAdvectionState(problem; init = 1.0, backend = KernelAbstractionsBackend(:metal)) for _ in 1:steps @@ -100,6 +100,16 @@ end metal_u = scalar_component(solution(metal_state)) @test all(isapprox.(metal_u, serial_u; atol = 1e-10)) end + + if :cuda in available_backends() + cuda_state = LinearAdvectionState(problem; init = 1.0, + backend = KernelAbstractionsBackend(:cuda)) + for _ in 1:steps + rk2_step!(cuda_state, problem, dt) + end + cuda_u = scalar_component(solution(cuda_state)) + @test all(isapprox.(cuda_u, serial_u; atol = 1e-10)) + end end @testset "LinearAdvection RK2" begin diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl index a42a17e..5717eb6 100644 --- a/test/test_compressible_euler.jl +++ b/test/test_compressible_euler.jl @@ -94,7 +94,7 @@ end ka_u = solution(ka_state) @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) - if Sys.isapple() && :metal in available_backends() + if :metal in available_backends() metal_state = CompressibleEulerState(prob; init = init, T = Float64, backend = KernelAbstractionsBackend(:metal)) for _ in 1:steps @@ -103,4 +103,14 @@ end metal_u = solution(metal_state) @test all(isapprox.(metal_u, serial_u; atol = 1e-10)) end + + if :cuda in available_backends() + cuda_state = CompressibleEulerState(prob; init = init, T = Float64, + backend = KernelAbstractionsBackend(:cuda)) + for _ in 1:steps + rk2_step!(cuda_state, prob, dt) + end + cuda_u = solution(cuda_state) + @test all(isapprox.(cuda_u, serial_u; atol = 1e-10)) + end end From 6d6c5fb36aabd1ebc357cec90126ce469f651f53 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 15:25:37 +0200 Subject: [PATCH 11/18] Fix UUIDs --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 2dd69e6..b6671ca 100644 --- a/Project.toml +++ b/Project.toml @@ -9,12 +9,12 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [extras] -Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" [extensions] CodexMachCUDAExt = "CUDA" From 94b0545f18d35bbfc9411bde851af95c1ca6710a Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 15:50:08 +0200 Subject: [PATCH 12/18] First working Metal implementation --- examples/profile_backends.jl | 56 ++++++++++++++++++++++++++++++ ext/CodexMachCUDAExt.jl | 2 ++ ext/CodexMachMetalExt.jl | 5 ++- src/backends.jl | 16 +++++++++ src/kernel_abstractions_support.jl | 20 ++++++----- src/time_integration.jl | 40 +++++++++++++++------ 6 files changed, 119 insertions(+), 20 deletions(-) create mode 100644 examples/profile_backends.jl diff --git a/examples/profile_backends.jl b/examples/profile_backends.jl new file mode 100644 index 0000000..f82012f --- /dev/null +++ b/examples/profile_backends.jl @@ -0,0 +1,56 @@ +using CodexMach +try + using Metal + CodexMach.register_backend!(:metal, () -> Metal.MetalBackend()) + println("Metal backend registered") +catch err + println("Metal unavailable: ", err) +end + +function profile(label, setup) + println("\n== $label ==") + for (name, state, prob, dt, steps) in setup() + GC.gc() + elapsed = @elapsed for _ in 1:steps + rk2_step!(state, prob, dt) + end + println(rpad(name, 10), ": ", round(elapsed, digits=3), " s") + end +end + +nx = 1024 +ny = 1024 +steps = 100 +prob_la = setup_linear_advection_problem(nx, ny; velocity=(1.0, 0.5)) +dt_la = 1e-3 +profile_T = Float32 + +function advection_cases() + cases = Tuple{String,Any,Any,Float64,Int}[] + push!(cases, ("Serial", LinearAdvectionState(prob_la; init=1.0, T=profile_T), prob_la, dt_la, steps)) + push!(cases, ("KA CPU", LinearAdvectionState(prob_la; init=1.0, T=profile_T, backend=KernelAbstractionsBackend(:cpu)), prob_la, dt_la, steps)) + if :metal in available_backends() + push!(cases, ("Metal", LinearAdvectionState(prob_la; init=1.0, T=profile_T, backend=KernelAbstractionsBackend(:metal)), prob_la, dt_la, steps)) + end + return cases +end + +profile("Linear Advection $(nx)x$(ny), steps=$(steps)", advection_cases) + +prob_eu = setup_compressible_euler_problem(nx÷2, ny÷2; gamma=1.4) +init(x,y) = (; rho = 1.0 + 0.1 * sinpi(x), v1 = 0.05*cospi(y), v2 = 0.02*sinpi(x+y), p = 1.0 + 0.05*cospi(x)) +serial_state = CompressibleEulerState(prob_eu; init=init) +dt_eu = stable_timestep(prob_eu, serial_state; cfl=0.3) +steps_eu = 100 + +function euler_cases() + cases = Tuple{String,Any,Any,Float64,Int}[] + push!(cases, ("Serial", CompressibleEulerState(prob_eu; init=init, T=profile_T), prob_eu, dt_eu, steps_eu)) + push!(cases, ("KA CPU", CompressibleEulerState(prob_eu; init=init, T=profile_T, backend=KernelAbstractionsBackend(:cpu)), prob_eu, dt_eu, steps_eu)) + if :metal in available_backends() + push!(cases, ("Metal", CompressibleEulerState(prob_eu; init=init, T=profile_T, backend=KernelAbstractionsBackend(:metal)), prob_eu, dt_eu, steps_eu)) + end + return cases +end + +profile("Compressible Euler $(nx÷2)x$(ny÷2), steps=$(steps_eu)", euler_cases) diff --git a/ext/CodexMachCUDAExt.jl b/ext/CodexMachCUDAExt.jl index 6c510e6..2b47f55 100644 --- a/ext/CodexMachCUDAExt.jl +++ b/ext/CodexMachCUDAExt.jl @@ -8,4 +8,6 @@ function __init__() CodexMach.register_backend!(:cuda, () -> KernelAbstractions.CUDADevice()) end +CodexMach._default_array_type_from_symbol(::Val{:cuda}) = CUDA.CuArray + end # module diff --git a/ext/CodexMachMetalExt.jl b/ext/CodexMachMetalExt.jl index cfc9abe..2fa8915 100644 --- a/ext/CodexMachMetalExt.jl +++ b/ext/CodexMachMetalExt.jl @@ -5,7 +5,10 @@ using KernelAbstractions using Metal function __init__() - CodexMach.register_backend!(:metal, () -> KernelAbstractions.MetalDevice()) + CodexMach.register_backend!(:metal, () -> Metal.MetalBackend()) end +CodexMach._default_array_type_from_symbol(::Val{:metal}) = Metal.MtlArray +CodexMach._default_array_type_from_spec(::Metal.MetalBackend) = Metal.MtlArray + end # module diff --git a/src/backends.jl b/src/backends.jl index 208d927..d699814 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -22,3 +22,19 @@ function describe(backend::ExecutionBackend) backend isa KernelAbstractionsBackend && return "KernelAbstractionsBackend($(backend.device))" return string(typeof(backend)) end + +function default_array_type(::SerialBackend) + return Array +end + +function default_array_type(backend::KernelAbstractionsBackend) + return _default_array_type_from_spec(backend.device) +end + +_default_array_type_from_spec(::Any) = Array + +function _default_array_type_from_spec(spec::Symbol) + return _default_array_type_from_symbol(Val(spec)) +end + +_default_array_type_from_symbol(::Val) = Array diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl index d9ee63a..ba3966f 100644 --- a/src/kernel_abstractions_support.jl +++ b/src/kernel_abstractions_support.jl @@ -41,6 +41,8 @@ end @inline function _ka_rusanov_flux_x(γ, ρL, rhouL, rhovL, EL, ρR, rhouR, rhovR, ER) + T = promote_type(typeof(ρL), typeof(ρR)) + half = inv(T(2)) uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL) uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER) cL = sqrt(abs(γ * pL / ρL)) @@ -57,16 +59,18 @@ end FR3 = rhouR * uyR FR4 = (ER + pR) * uxR - flux1 = 0.5 * (FL1 + FR1) - 0.5 * smax * (ρR - ρL) - flux2 = 0.5 * (FL2 + FR2) - 0.5 * smax * (rhouR - rhouL) - flux3 = 0.5 * (FL3 + FR3) - 0.5 * smax * (rhovR - rhovL) - flux4 = 0.5 * (FL4 + FR4) - 0.5 * smax * (ER - EL) + flux1 = half * (FL1 + FR1) - half * smax * (ρR - ρL) + flux2 = half * (FL2 + FR2) - half * smax * (rhouR - rhouL) + flux3 = half * (FL3 + FR3) - half * smax * (rhovR - rhovL) + flux4 = half * (FL4 + FR4) - half * smax * (ER - EL) return flux1, flux2, flux3, flux4 end @inline function _ka_rusanov_flux_y(γ, ρL, rhouL, rhovL, EL, ρR, rhouR, rhovR, ER) + T = promote_type(typeof(ρL), typeof(ρR)) + half = inv(T(2)) uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL) uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER) cL = sqrt(abs(γ * pL / ρL)) @@ -83,10 +87,10 @@ end GR3 = rhovR * uyR + pR GR4 = (ER + pR) * uyR - flux1 = 0.5 * (GL1 + GR1) - 0.5 * smax * (ρR - ρL) - flux2 = 0.5 * (GL2 + GR2) - 0.5 * smax * (rhouR - rhouL) - flux3 = 0.5 * (GL3 + GR3) - 0.5 * smax * (rhovR - rhovL) - flux4 = 0.5 * (GL4 + GR4) - 0.5 * smax * (ER - EL) + flux1 = half * (GL1 + GR1) - half * smax * (ρR - ρL) + flux2 = half * (GL2 + GR2) - half * smax * (rhouR - rhouL) + flux3 = half * (GL3 + GR3) - half * smax * (rhovR - rhovL) + flux4 = half * (GL4 + GR4) - half * smax * (ER - EL) return flux1, flux2, flux3, flux4 end diff --git a/src/time_integration.jl b/src/time_integration.jl index 69beb25..3853572 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -71,16 +71,15 @@ workspace(state::CompressibleEulerState) = state.workspace backend(state::CompressibleEulerState) = state.backend -const _DefaultArrayType = Array - function LinearAdvectionState(problem::LinearAdvectionProblem; T::Type = Float64, - array_type = _DefaultArrayType, + array_type = nothing, init = nothing, backend::ExecutionBackend = default_backend()) mesh_obj = mesh(problem) dims = size(mesh_obj) - field = allocate_cellfield(array_type, T, dims, 1) + array_type_val = array_type === nothing ? default_array_type(backend) : array_type + field = allocate_cellfield(array_type_val, T, dims, 1) _initialize_scalar_field!(field, init, mesh_obj, T) k1 = allocate_like(field) @@ -106,8 +105,16 @@ function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh, elseif init isa Function centers_x, centers_y = cell_centers(mesh) nx, ny = size(mesh) - @inbounds for j in 1:ny, i in 1:nx - data[i, j] = T(init(centers_x[i], centers_y[j])) + if data isa Array + @inbounds for j in 1:ny, i in 1:nx + data[i, j] = T(init(centers_x[i], centers_y[j])) + end + else + host = Array{T}(undef, nx, ny) + @inbounds for j in 1:ny, i in 1:nx + host[i, j] = T(init(centers_x[i], centers_y[j])) + end + data .= host end else throw(ArgumentError("Unsupported initializer type $(typeof(init))")) @@ -119,14 +126,15 @@ const _EulerVarCount = 4 function CompressibleEulerState(problem::CompressibleEulerProblem; T::Type = Float64, - array_type = _DefaultArrayType, + array_type = nothing, init = nothing, backend::ExecutionBackend = default_backend()) mesh_obj = mesh(problem) dims = size(mesh_obj) eq = pde(problem) - field = allocate_cellfield(array_type, T, dims, _EulerVarCount) + array_type_val = array_type === nothing ? default_array_type(backend) : array_type + field = allocate_cellfield(array_type_val, T, dims, _EulerVarCount) _initialize_euler_field!(field, init, mesh_obj, eq, T) k1 = allocate_like(field) @@ -161,9 +169,19 @@ function _initialize_euler_field!(field::CellField, centers_x, centers_y = cell_centers(mesh) nx, ny = size(mesh) γ = gamma(equation) - @inbounds for j in 1:ny, i in 1:nx - ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j]) - _store_conserved!(field, i, j, ρ, u, v, p, γ, T) + density = component(field, 1) + if density isa Array + @inbounds for j in 1:ny, i in 1:nx + ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j]) + _store_conserved!(field, i, j, ρ, u, v, p, γ, T) + end + else + host_field = allocate_cellfield(Array, T, (nx, ny), _EulerVarCount) + @inbounds for j in 1:ny, i in 1:nx + ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j]) + _store_conserved!(host_field, i, j, ρ, u, v, p, γ, T) + end + copyto!(field, host_field) end return field else From c581f29ecbdcb8d0f5af3d824abc2a8d35c587c9 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 15:59:04 +0200 Subject: [PATCH 13/18] Allow running examples with KA.jl --- examples/convergence_linear_advection.jl | 27 ++++++++++++++++++++--- examples/kelvin_helmholtz_euler.jl | 28 ++++++++++++++++++++---- examples/linear_advection_demo.jl | 27 ++++++++++++++++++++++- 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl index 0da6bbf..612e52f 100644 --- a/examples/convergence_linear_advection.jl +++ b/examples/convergence_linear_advection.jl @@ -5,7 +5,8 @@ using Printf """ run_convergence_study(; base_resolution=(16, 16), levels=5, lengths=(1.0, 1.0), velocity=(1.0, 0.5), - final_time=0.5, cfl=0.45) + final_time=0.5, cfl=0.45, + backend=default_backend(), state_eltype=nothing) Execute a grid refinement study for periodic linear advection with a sinusoidal initial condition on a square domain. Prints the L₂ error for each resolution, @@ -16,7 +17,9 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), lengths::Tuple{<:Real,<:Real} = (1.0, 1.0), velocity::Tuple{<:Real,<:Real} = (1.0, 0.5), final_time::Real = 0.5, - cfl::Real = 0.45) + cfl::Real = 0.45, + backend::Union{ExecutionBackend,Symbol} = default_backend(), + state_eltype::Union{Nothing,Type} = nothing) @assert levels >= 2 "Need at least two levels to measure convergence" Lx, Ly = float(lengths[1]), float(lengths[2]) @@ -31,6 +34,8 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), println(" level nx ny dt L2 error EOC") init = _sinusoidal_initializer(Lx, Ly) + backend_obj = _resolve_example_backend(backend) + state_T = isnothing(state_eltype) ? _default_state_eltype(backend_obj) : state_eltype for level in 0:levels-1 nx = nx0 * 2^level @@ -38,7 +43,7 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), problem = setup_linear_advection_problem(nx, ny; lengths = (Lx, Ly), velocity = vel) - state = LinearAdvectionState(problem; init = init) + state = LinearAdvectionState(problem; init = init, T = state_T, backend = backend_obj) dt_stable = stable_timestep(problem; cfl = cfl) steps = max(1, ceil(Int, final_time / dt_stable)) @@ -75,6 +80,22 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), errors = errors, eocs = eocs) end +function _resolve_example_backend(spec::ExecutionBackend) + return spec +end + +function _resolve_example_backend(spec::Symbol) + return KernelAbstractionsBackend(spec) +end + +function _default_state_eltype(::ExecutionBackend) + return Float64 +end + +function _default_state_eltype(::KernelAbstractionsBackend) + return Float32 +end + function _sinusoidal_initializer(Lx::Float64, Ly::Float64) function init(x, y) return sin(2pi * x / Lx) * sin(2pi * y / Ly) diff --git a/examples/kelvin_helmholtz_euler.jl b/examples/kelvin_helmholtz_euler.jl index cf2ac25..3d33b57 100644 --- a/examples/kelvin_helmholtz_euler.jl +++ b/examples/kelvin_helmholtz_euler.jl @@ -5,7 +5,7 @@ using Plots """ run_kelvin_helmholtz(; nx=256, ny=256, gamma=1.4, final_time=1.5, - cfl=0.45, T=Float32, log_every=25, + cfl=0.45, T=Float32, backend=default_backend(), log_every=25, diagnostics_path=nothing, pdf_path=nothing, animation_path=nothing, @@ -22,7 +22,8 @@ function run_kelvin_helmholtz(; nx::Int = 256, gamma::Real = 1.4, final_time::Real = 1.5, cfl::Real = 0.45, - T::Type = Float32, + T::Union{Type,Nothing} = Float32, + backend::Union{ExecutionBackend,Symbol} = default_backend(), log_every::Integer = 25, diagnostics_path::Union{Nothing,AbstractString} = nothing, pdf_path::Union{Nothing,AbstractString} = nothing, @@ -42,8 +43,11 @@ function run_kelvin_helmholtz(; nx::Int = 256, gamma = gamma, boundary_conditions = PeriodicBoundaryConditions()) - init = _kelvin_helmholtz_initializer(T) - state = CompressibleEulerState(problem; T = T, init = init) + backend_obj = _resolve_example_backend(backend) + state_T = isnothing(T) ? _default_state_eltype(backend_obj) : T + + init = _kelvin_helmholtz_initializer(state_T) + state = CompressibleEulerState(problem; T = state_T, init = init, backend = backend_obj) sim_time = 0.0 step = 0 @@ -112,6 +116,22 @@ function run_kelvin_helmholtz(; nx::Int = 256, animation_path = animation_path) end +function _resolve_example_backend(spec::ExecutionBackend) + return spec +end + +function _resolve_example_backend(spec::Symbol) + return KernelAbstractionsBackend(spec) +end + +function _default_state_eltype(::ExecutionBackend) + return Float64 +end + +function _default_state_eltype(::KernelAbstractionsBackend) + return Float32 +end + function _kelvin_helmholtz_initializer(::Type{T}) where {T} slope = T(15) offset = T(7.5) diff --git a/examples/linear_advection_demo.jl b/examples/linear_advection_demo.jl index 0ef6360..1ed2348 100644 --- a/examples/linear_advection_demo.jl +++ b/examples/linear_advection_demo.jl @@ -4,6 +4,7 @@ using CodexMach """ run_linear_advection_demo(; nx=256, ny=128, lengths=(2.0, 1.0), velocity=(1.0, 0.0), cfl=0.4, steps=100, + backend=default_backend(), state_eltype=nothing, sample_every=nothing, diagnostics_path=nothing, state_path=nothing, match_return_period=true) @@ -22,6 +23,8 @@ function run_linear_advection_demo(; nx::Int = 256, velocity::Tuple{<:Real,<:Real} = (1.0, 0.0), cfl::Real = 0.4, steps::Int = 100, + backend::Union{ExecutionBackend,Symbol} = default_backend(), + state_eltype::Union{Nothing,Type} = nothing, sample_every::Union{Nothing,Int} = nothing, diagnostics_path::Union{Nothing,AbstractString} = nothing, state_path::Union{Nothing,AbstractString} = nothing, @@ -37,8 +40,14 @@ function run_linear_advection_demo(; nx::Int = 256, problem = setup_linear_advection_problem(nx_cells, ny_cells; velocity = velocity, lengths = lengths_tuple) + backend_obj = _resolve_example_backend(backend) + Tstate = isnothing(state_eltype) ? _default_state_eltype(backend_obj) : state_eltype + init_field = _sine_blob_initializer(lengths_tuple) - state = LinearAdvectionState(problem; init = init_field) + state = LinearAdvectionState(problem; + init = init_field, + T = Tstate, + backend = backend_obj) dt_cfl = stable_timestep(problem; cfl = cfl) target_time = _return_period(lengths_tuple, velocity) @@ -92,6 +101,22 @@ function run_linear_advection_demo(; nx::Int = 256, target_time = target_time)) end +function _resolve_example_backend(spec::ExecutionBackend) + return spec +end + +function _resolve_example_backend(spec::Symbol) + return KernelAbstractionsBackend(spec) +end + +function _default_state_eltype(::ExecutionBackend) + return Float64 +end + +function _default_state_eltype(::KernelAbstractionsBackend) + return Float32 +end + function _write_diagnostics_csv(path::AbstractString, records::Vector{NamedTuple{(:step, :time, :rms, :cfl), NTuple{4,Float64}}}) open(path, "w") do io From 765b21cb360d4011ce9a17b91f447344cef1291a Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 16:03:54 +0200 Subject: [PATCH 14/18] Document stuff --- README.md | 43 ++++++++++++++++++++++++++++++++++++------- docs/src/examples.md | 43 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a4526c2..90b53a1 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,16 @@ julia --project=. examples/linear_advection_demo.jl ``` It prints periodic RMS diagnostics along with the CFL number used for the run. -Tune parameters by editing the keyword arguments in `run_linear_advection_demo`. +The driver defaults to the serial backend with `Float64` storage, but you can +switch to a registered KernelAbstractions backend (dropping to `Float32` by +default on GPUs) via keyword arguments: + +```bash +julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' +``` + +Override the precision explicitly with `state_eltype=Float64` if you want to run +the GPU case in double precision (CUDA) or stay on `Float32` for Metal. To capture diagnostics, pass output paths (created if missing): @@ -73,6 +82,12 @@ convergence (EOC), and finishes with the average EOC. Edit `run_convergence_study` inside the script to adjust the final time, CFL target, or refinement levels. +To exercise a GPU backend, reuse the `backend` keyword: + +```bash +julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' +``` + ## Kelvin-Helmholtz instability The compressible Euler path is exercised by @@ -84,12 +99,26 @@ julia --project=. examples/kelvin_helmholtz_euler.jl ``` By default the driver uses a 256×256 mesh, RK2 time stepping with adaptive CFL -control, and prints periodic log messages. Pass a file path via -`diagnostics_path` to capture per-step CFL and kinetic-energy measurements. The -routine returns the final state so you can post-process density, vorticity, or -other derived fields. Supply `pdf_path` to snapshot the terminal density field, -and `animation_path` (MP4 or GIF) plus `animation_every`/`animation_fps` to -produce a time-resolved movie. +control, and prints periodic log messages. Set `backend=:metal` (or `:cuda`) +to run the scenario on a GPU; the helper automatically falls back to `Float32` +storage for KernelAbstractions backends while keeping `Float64` for the serial +path. Pass a file path via `diagnostics_path` to capture per-step CFL and +kinetic-energy measurements. The routine returns the final state so you can +post-process density, vorticity, or other derived fields. Supply `pdf_path` to +snapshot the terminal density field, and `animation_path` (MP4 or GIF) plus +`animation_every`/`animation_fps` to produce a time-resolved movie. + +## Backend profiling + +Compare the serial and GPU paths with the profiling helper: + +```bash +julia --project=. examples/profile_backends.jl +``` + +It benchmarks the RK2 loops for linear advection and compressible Euler across +the serial and any registered KernelAbstractions backends (CPU, Metal, CUDA), +forcing `Float32` to keep Metal hardware supported. ## Maintainer diff --git a/docs/src/examples.md b/docs/src/examples.md index 2e8d9b1..5078adc 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -24,7 +24,16 @@ until it returns to its starting position. By default it: The function returns a named tuple with the step size, final time, and diagnostic history. When you pass `diagnostics_path="diagnostics.csv"` it writes a CSV mirroring the on-screen log, and `state_path="state.csv"` captures the final -solution for later visualisation. +solution for later visualisation. By default the demo runs on the serial backend +with `Float64` storage, but you can switch to a KernelAbstractions backend while +automatically selecting a GPU-friendly element type: + +``` +julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' +``` + +Override the precision explicitly with `state_eltype=Float64` (for CUDA) or +`state_eltype=Float32` if you want to minimise device memory traffic. To plot the diagnostics or the final field, call @@ -43,7 +52,15 @@ julia --project=. examples/convergence_linear_advection.jl The convergence driver repeats the sinusoidal advection problem on a hierarchy of meshes. It prints the L² error for each resolution along with the observed order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once -the mesh is sufficiently fine. +the mesh is sufficiently fine. To assess GPU correctness, pass a backend symbol +when calling the driver: + +``` +julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' +``` + +The serial path remains the default, so simple invocations continue to run on +the CPU without any extra keywords. ## Kelvin–Helmholtz Instability (Compressible Euler) @@ -54,7 +71,16 @@ julia --project=. examples/kelvin_helmholtz_euler.jl This example launches a Kelvin–Helmholtz roll-up on a periodic square domain using the compressible Euler equations and the slope-limited Rusanov fluxes. The script logs CFL numbers, tracks the volume-averaged kinetic energy, and can -produce publication-ready figures: +produce publication-ready figures. To exercise GPU execution, pass +`backend=:metal` (or `:cuda` if available); the helper will automatically fall +back to `Float32` storage on GPU backends while keeping `Float64` on the serial +path: + +``` +julia --project=. -e 'include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)' +``` + +Further output customisation includes: - set `diagnostics_path="diagnostics.csv"` to capture the per-step log; - set `pdf_path="khi_density.pdf"` for a static density snapshot at `t = final_time`; @@ -65,6 +91,17 @@ A stable run with the default settings reaches `t = 1.5` in roughly a thousand steps with CFL ≈ 0.4–0.45. The density field develops rolled vortices that match the reference figures checked into the repository (`khi_*.pdf/mp4`). +## Backend Profiling + +``` +julia --project=. examples/profile_backends.jl +``` + +This utility benchmarks the serial driver against available KernelAbstractions +backends for both linear advection and compressible Euler. It uses `Float32` +storage so Metal-backed GPUs remain supported and prints per-backend wall-clock +timings for quick regressions after kernel changes. + ## Tips - All drivers expose keyword arguments for mesh size, timestep control, and file From 6354616e4800dbee2be9bf72364c23bb7d5b2516 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 16:54:51 +0200 Subject: [PATCH 15/18] Update for EUler GPU stuff --- docs/src/api.md | 14 +++ examples/kelvin_helmholtz_euler.jl | 33 ++++-- src/equations.jl | 170 ++++++++++++++++++++++------- src/kernel_abstractions_support.jl | 40 +++++++ src/time_integration.jl | 95 ++++++++++------ 5 files changed, 275 insertions(+), 77 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 197d5ce..42212d9 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -62,6 +62,20 @@ stable_timestep cfl_number ``` +## Cell Fields + +```@docs +CellField +allocate_cellfield +allocate_like +map_components! +cell_components +component +ncomponents +spatial_size +backend +``` + ## Simulation Drivers ```@docs diff --git a/examples/kelvin_helmholtz_euler.jl b/examples/kelvin_helmholtz_euler.jl index 3d33b57..9f038a9 100644 --- a/examples/kelvin_helmholtz_euler.jl +++ b/examples/kelvin_helmholtz_euler.jl @@ -53,7 +53,8 @@ function run_kelvin_helmholtz(; nx::Int = 256, step = 0 last_cfl = NaN records = diagnostics_path === nothing ? nothing : Vector{NamedTuple{(:step, :time, :cfl, :kinetic_energy),NTuple{4,Float64}}}() - prim_buffers = primitive_variables(problem, solution(state)) + prim_buffers = primitive_variables(problem, solution(state); + backend = CodexMach.backend(state)) last_progress = time() centers_x, centers_y = cell_centers(mesh(problem)) animation_obj = animation_path === nothing ? nothing : Animation() @@ -152,19 +153,31 @@ function _volume_average_kinetic_energy(state::CompressibleEulerState, problem::CompressibleEulerProblem, buffers) prim = primitive_variables(problem, solution(state); + backend = CodexMach.backend(state), rho_out = buffers.rho, u_out = buffers.u, v_out = buffers.v, p_out = buffers.p) - nx, ny = size(prim.rho) - total = zero(eltype(prim.rho)) - half = convert(eltype(prim.rho), 0.5) - @inbounds for j in 1:ny, i in 1:nx - vel2 = prim.u[i, j]^2 + prim.v[i, j]^2 - total += half * prim.rho[i, j] * vel2 + ρ = prim.rho + u = prim.u + v = prim.v + nx, ny = size(ρ) + if ρ isa Array + total = zero(eltype(ρ)) + half = convert(eltype(ρ), 0.5) + @inbounds for j in 1:ny, i in 1:nx + vel2 = u[i, j]^2 + v[i, j]^2 + total += half * ρ[i, j] * vel2 + end + return float(total) / (nx * ny), prim + else + T = eltype(ρ) + half = inv(convert(T, 2)) + vel2 = u .* u .+ v .* v + total = sum(ρ .* vel2 .* half) + return float(total) / (nx * ny), prim end - return float(total) / (nx * ny), prim end function _write_khi_diagnostics(path::AbstractString, @@ -185,6 +198,7 @@ function _write_khi_pdf(path::AbstractString, centers_x, centers_y) prim = primitive_variables(problem, solution(state); + backend = CodexMach.backend(state), rho_out = buffers.rho, u_out = buffers.u, v_out = buffers.v, @@ -196,9 +210,10 @@ function _write_khi_pdf(path::AbstractString, end function _density_plot(centers_x, centers_y, density; title::AbstractString) + dens = density isa Array ? density : Array(density) heatmap(centers_x, centers_y, - density'; + dens'; xlabel = "x", ylabel = "y", title = title, diff --git a/src/equations.jl b/src/equations.jl index c129aaf..563f300 100644 --- a/src/equations.jl +++ b/src/equations.jl @@ -201,28 +201,38 @@ Convert a conserved-field array with layout `(4, nx, ny)` to primitive variables. Optionally provide preallocated output arrays via the keyword arguments. Returns a named tuple `(rho, u, v, p)`. """ -function primitive_variables(eq::CompressibleEuler, - conserved::AbstractArray{T,3}; - rho_out = nothing, - u_out = nothing, - v_out = nothing, - p_out = nothing) where {T} - size(conserved, 1) == 4 || - throw(ArgumentError("Conserved field must have first dimension of length 4")) +function _primitive_variables_cpu(eq::CompressibleEuler, + conserved::CellField; + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) + nx, ny = spatial_size(conserved) - nx, ny = size(conserved, 2), size(conserved, 3) + T = component_eltype(conserved) + out_T = float(T) - ρ = rho_out === nothing ? Array{float(T)}(undef, nx, ny) : rho_out + ρ = rho_out === nothing ? Array{out_T}(undef, nx, ny) : rho_out u = u_out === nothing ? similar(ρ) : u_out v = v_out === nothing ? similar(ρ) : v_out p = p_out === nothing ? similar(ρ) : p_out + size(ρ) == (nx, ny) || + throw(ArgumentError("rho_out size mismatch")) + (size(u) == (nx, ny) && size(v) == (nx, ny) && size(p) == (nx, ny)) || + throw(ArgumentError("Primitive output buffers must match conserved field shape")) + + ρc = component(conserved, 1) + rhouc = component(conserved, 2) + rhovc = component(conserved, 3) + Ec = component(conserved, 4) + @inbounds for j in 1:ny, i in 1:nx prim = primitive_variables(eq, - conserved[1, i, j], - conserved[2, i, j], - conserved[3, i, j], - conserved[4, i, j]) + ρc[i, j], + rhouc[i, j], + rhovc[i, j], + Ec[i, j]) ρ[i, j] = prim.ρ u[i, j] = prim.u v[i, j] = prim.v @@ -232,34 +242,34 @@ function primitive_variables(eq::CompressibleEuler, return (; rho = ρ, u = u, v = v, p = p) end -function primitive_variables(eq::CompressibleEuler, - conserved::CellField; - rho_out = nothing, - u_out = nothing, - v_out = nothing, - p_out = nothing) +function _primitive_variables_cpu(eq::CompressibleEuler, + conserved::AbstractArray{T,3}; + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) where {T} size(conserved, 1) == 4 || throw(ArgumentError("Conserved field must have first dimension of length 4")) - nx, ny = spatial_size(conserved) + nx, ny = size(conserved, 2), size(conserved, 3) + out_T = float(T) - T = component_eltype(conserved) - ρ = rho_out === nothing ? Array{float(T)}(undef, nx, ny) : rho_out + ρ = rho_out === nothing ? Array{out_T}(undef, nx, ny) : rho_out u = u_out === nothing ? similar(ρ) : u_out v = v_out === nothing ? similar(ρ) : v_out p = p_out === nothing ? similar(ρ) : p_out - ρc = component(conserved, 1) - rhouc = component(conserved, 2) - rhovc = component(conserved, 3) - Ec = component(conserved, 4) + size(ρ) == (nx, ny) || + throw(ArgumentError("rho_out size mismatch")) + (size(u) == (nx, ny) && size(v) == (nx, ny) && size(p) == (nx, ny)) || + throw(ArgumentError("Primitive output buffers must match conserved field shape")) @inbounds for j in 1:ny, i in 1:nx prim = primitive_variables(eq, - ρc[i, j], - rhouc[i, j], - rhovc[i, j], - Ec[i, j]) + conserved[1, i, j], + conserved[2, i, j], + conserved[3, i, j], + conserved[4, i, j]) ρ[i, j] = prim.ρ u[i, j] = prim.u v[i, j] = prim.v @@ -269,14 +279,100 @@ function primitive_variables(eq::CompressibleEuler, return (; rho = ρ, u = u, v = v, p = p) end +function _primitive_variables_gpu(eq::CompressibleEuler, + conserved::CellField, + backend::KernelAbstractionsBackend; + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) + size(conserved, 1) == 4 || + throw(ArgumentError("Conserved field must have first dimension of length 4")) + + ρc = component(conserved, 1) + rhouc = component(conserved, 2) + rhovc = component(conserved, 3) + Ec = component(conserved, 4) + + ρ = rho_out === nothing ? similar(ρc) : rho_out + u = u_out === nothing ? similar(ρc) : u_out + v = v_out === nothing ? similar(ρc) : v_out + p = p_out === nothing ? similar(ρc) : p_out + + size(ρ) == size(ρc) || + throw(ArgumentError("rho_out size mismatch")) + (size(u) == size(ρc) && size(v) == size(ρc) && size(p) == size(ρc)) || + throw(ArgumentError("Primitive output buffers must match conserved field shape")) + + T = eltype(ρc) + γT = convert(T, gamma(eq)) + γm1 = γT - one(γT) + epsT = eps(T) + + primitive_variables_kernel!(backend, + ρ, u, v, p, + ρc, rhouc, rhovc, Ec, + γm1, epsT) + + return (; rho = ρ, u = u, v = v, p = p) +end + +function primitive_variables(eq::CompressibleEuler, + conserved::CellField; + backend::ExecutionBackend = SerialBackend(), + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) + if backend isa KernelAbstractionsBackend + return _primitive_variables_gpu(eq, conserved, backend; + rho_out = rho_out, + u_out = u_out, + v_out = v_out, + p_out = p_out) + else + return _primitive_variables_cpu(eq, conserved; + rho_out = rho_out, + u_out = u_out, + v_out = v_out, + p_out = p_out) + end +end + +function primitive_variables(eq::CompressibleEuler, + conserved::AbstractArray{T,3}; + backend::ExecutionBackend = SerialBackend(), + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) where {T} + backend isa KernelAbstractionsBackend && + throw(ArgumentError("primitive_variables does not support generic AbstractArray inputs on GPU")) + return _primitive_variables_cpu(eq, conserved; + rho_out = rho_out, + u_out = u_out, + v_out = v_out, + p_out = p_out) +end + primitive_variables(problem::CompressibleEulerProblem, - state; kwargs...) = - primitive_variables(pde(problem), solution(state); kwargs...) + state; + backend::ExecutionBackend = backend(state), kwargs...) = + primitive_variables(pde(problem), + solution(state); + backend = backend, + kwargs...) primitive_variables(problem::CompressibleEulerProblem, - conserved::AbstractArray{T,3}; kwargs...) where {T} = - primitive_variables(pde(problem), conserved; kwargs...) + conserved::AbstractArray{T,3}; + backend::ExecutionBackend = SerialBackend(), kwargs...) where {T} = + primitive_variables(pde(problem), conserved; + backend = backend, + kwargs...) primitive_variables(problem::CompressibleEulerProblem, - conserved::CellField; kwargs...) = - primitive_variables(pde(problem), conserved; kwargs...) + conserved::CellField; + backend::ExecutionBackend = SerialBackend(), kwargs...) = + primitive_variables(pde(problem), conserved; + backend = backend, + kwargs...) diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl index ba3966f..ad3c2e5 100644 --- a/src/kernel_abstractions_support.jl +++ b/src/kernel_abstractions_support.jl @@ -143,6 +143,31 @@ end end end +@kernel function _primitive_variables_kernel!(ρ_out, u_out, v_out, p_out, + ρc, rhouc, rhovc, Ec, + γm1, epsT) + i, j = @index(Global, NTuple) + nx, ny = size(ρc) + if i <= nx && j <= ny + ρval = ρc[i, j] + ρval = ρval < epsT ? epsT : ρval + invρ = one(ρval) / ρval + ux = rhouc[i, j] * invρ + uy = rhovc[i, j] * invρ + half = inv(convert(typeof(ρval), 2)) + kinetic = half * ρval * (ux^2 + uy^2) + internal = Ec[i, j] - kinetic + internal = internal < epsT ? epsT : internal + p = γm1 * internal + p = p < epsT ? epsT : p + + ρ_out[i, j] = ρval + u_out[i, j] = ux + v_out[i, j] = uy + p_out[i, j] = p + end +end + function linear_advection_rhs_kernel!(backend::KernelAbstractionsBackend, du, u, nx::Int, ny::Int, @@ -179,6 +204,21 @@ function rk2_update_kernel!(backend::KernelAbstractionsBackend, return u end +function primitive_variables_kernel!(backend::KernelAbstractionsBackend, + ρ_out, u_out, v_out, p_out, + ρc, rhouc, rhovc, Ec, + γm1, epsT) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _primitive_variables_kernel!(device) : + _primitive_variables_kernel!(device, backend.workgroupsize) + kernel(ρ_out, u_out, v_out, p_out, + ρc, rhouc, rhovc, Ec, + γm1, epsT; ndrange = size(ρc)) + KernelAbstractions.synchronize(device) + return ρ_out, u_out, v_out, p_out +end + @kernel function _compressible_euler_rhs_kernel!(dρ, drhou, drhov, dE, ρ, rhou, rhov, E, γ, inv_dx, inv_dy) diff --git a/src/time_integration.jl b/src/time_integration.jl index 3853572..29595df 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -749,31 +749,8 @@ function cfl_number(problem::CompressibleEulerProblem, mesh_obj = mesh(problem) dx, dy = spacing(mesh_obj) γ = gamma(pde(problem)) - u = solution(state) - nx, ny = size(mesh_obj) - - max_ax = zero(eltype(u)) - max_ay = zero(eltype(u)) - - ρ = component(u, 1) - rhou = component(u, 2) - rhov = component(u, 3) - E = component(u, 4) - - @inbounds for j in 1:ny, i in 1:nx - ρval = ρ[i, j] - rhouval = rhou[i, j] - rhovval = rhov[i, j] - Eval = E[i, j] - invρ = one(ρval) / ρval - ux = rhouval * invρ - uy = rhovval * invρ - kinetic = 0.5 * ρval * (ux^2 + uy^2) - p = (γ - one(γ)) * (Eval - kinetic) - c = sqrt(abs(γ * p * invρ)) - max_ax = max(max_ax, abs(ux) + c) - max_ay = max(max_ay, abs(uy) + c) - end + backend_obj = backend(state) + max_ax, max_ay = _euler_characteristics(solution(state), γ, backend_obj) dtT = float(dt) return dtT * (max_ax / dx + max_ay / dy) @@ -790,12 +767,49 @@ function stable_timestep(problem::CompressibleEulerProblem, cfl::Real = 0.45) cfl > 0 || throw(ArgumentError("CFL target must be positive")) + backend_obj = backend(state) + if backend_obj isa KernelAbstractionsBackend + return _stable_timestep_gpu(problem, state, cfl, backend_obj) + else + return _stable_timestep_cpu(problem, state, cfl) + end +end + +function _stable_timestep_cpu(problem::CompressibleEulerProblem, + state::CompressibleEulerState, + cfl::Real) mesh_obj = mesh(problem) dx, dy = spacing(mesh_obj) γ = gamma(pde(problem)) - u = solution(state) - nx, ny = size(mesh_obj) + max_ax, max_ay = _euler_characteristics(solution(state), γ, SerialBackend()) + + denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) + + (max_ay == 0 ? zero(Float64) : max_ay / dy) + + iszero(denom) && return Inf + + return float(cfl) / denom +end + +function _stable_timestep_gpu(problem::CompressibleEulerProblem, + state::CompressibleEulerState, + cfl::Real, + backend_obj::KernelAbstractionsBackend) + mesh_obj = mesh(problem) + dx, dy = spacing(mesh_obj) + γ = gamma(pde(problem)) + max_ax, max_ay = _euler_characteristics(solution(state), γ, backend_obj) + denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) + + (max_ay == 0 ? zero(Float64) : max_ay / dy) + + iszero(denom) && return Inf + + return float(cfl) / denom +end + +function _euler_characteristics(u, γ, ::ExecutionBackend) + nx, ny = size(component(u, 1)) max_ax = zero(eltype(u)) max_ay = zero(eltype(u)) @@ -819,12 +833,31 @@ function stable_timestep(problem::CompressibleEulerProblem, max_ay = max(max_ay, abs(uy) + c) end - denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) + - (max_ay == 0 ? zero(Float64) : max_ay / dy) + return float(max_ax), float(max_ay) +end - iszero(denom) && return Inf +function _euler_characteristics(u, γ, backend::KernelAbstractionsBackend) + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) - return float(cfl) / denom + T = eltype(ρ) + γT = convert(T, γ) + half = inv(convert(T, 2)) + epsT = eps(T) + + ux = rhou ./ ρ + uy = rhov ./ ρ + vel2 = ux .* ux .+ uy .* uy + kinetic = half .* ρ .* vel2 + internal = max.(E .- kinetic, epsT) + p = max.((γT - one(γT)) .* internal, epsT) + c = sqrt.(abs.(γT .* p ./ ρ)) + + max_ax = float(maximum(abs.(ux) .+ c)) + max_ay = float(maximum(abs.(uy) .+ c)) + return max_ax, max_ay end # Utility micro-kernels for Euler RHS From 38e0c1e0708fad05dfd645b9bbce2f336fc2f184 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 17:04:31 +0200 Subject: [PATCH 16/18] Fix docstrings? --- docs/src/api.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/api.md b/docs/src/api.md index 42212d9..b6d32b7 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -48,6 +48,7 @@ CompressibleEulerProblem setup_compressible_euler_problem CompressibleEulerState primitive_variables +_primitive_variables_cpu ``` ## Time Integration From a6163897d3e61318e74ee8b0ec3fb24d71914bc9 Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 17:14:40 +0200 Subject: [PATCH 17/18] Fix linear advection example --- examples/convergence_linear_advection.jl | 3 ++- src/time_integration.jl | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl index 612e52f..f970647 100644 --- a/examples/convergence_linear_advection.jl +++ b/examples/convergence_linear_advection.jl @@ -52,7 +52,8 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), run_linear_advection!(state, problem; steps = steps, dt = dt) u_field = solution(state) - u_num = scalar_component(u_field) + u_num_backend = scalar_component(u_field) + u_num = Array(u_num_backend) # copy to host for error evaluation u_exact = _exact_solution(problem, vel, final_time, u_num) err = sqrt(sum(abs2, u_num .- u_exact) / length(u_num)) diff --git a/src/time_integration.jl b/src/time_integration.jl index 29595df..cc42eb8 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -114,7 +114,7 @@ function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh, @inbounds for j in 1:ny, i in 1:nx host[i, j] = T(init(centers_x[i], centers_y[j])) end - data .= host + copyto!(data, host) end else throw(ArgumentError("Unsupported initializer type $(typeof(init))")) From dfa6a90dd16b4754bff7e0c74fd8c03adb518d1e Mon Sep 17 00:00:00 2001 From: Michael Schlottke-Lakemper Date: Tue, 7 Oct 2025 17:19:31 +0200 Subject: [PATCH 18/18] Update docs --- README.md | 50 +++++++++++++++++--------------- docs/src/examples.md | 69 ++++++++++++++++++++++---------------------- 2 files changed, 61 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 90b53a1..31fd0aa 100644 --- a/README.md +++ b/README.md @@ -27,30 +27,28 @@ The repository ships with a bare `docs/` folder ready to host Documenter.jl-base ## Example simulation -The `examples/linear_advection_demo.jl` script wires together mesh generation, -problem setup, RK2 time integration, and the high-level driver. Run it from the -repository root: +Run the linear advection demo on the CPU: ```bash -julia --project=. examples/linear_advection_demo.jl +julia --project=run examples/linear_advection_demo.jl ``` -It prints periodic RMS diagnostics along with the CFL number used for the run. -The driver defaults to the serial backend with `Float64` storage, but you can -switch to a registered KernelAbstractions backend (dropping to `Float32` by -default on GPUs) via keyword arguments: +Run the same demo on a Metal-capable GPU: ```bash -julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' +julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' ``` -Override the precision explicitly with `state_eltype=Float64` if you want to run -the GPU case in double precision (CUDA) or stay on `Float32` for Metal. +The script wires together mesh generation, problem setup, RK2 time integration, +and the high-level driver. It prints periodic RMS diagnostics along with the CFL +number used for the run. Override the precision explicitly with +`state_eltype=Float64` if you want to run the GPU case in double precision (CUDA) +or stay on `Float32` for Metal. To capture diagnostics, pass output paths (created if missing): ```bash -julia --project=. examples/linear_advection_demo.jl diagnostics.csv final_state.csv +julia --project=run examples/linear_advection_demo.jl diagnostics.csv final_state.csv ``` The first file lists sampled step/time/RMS/CFL data, and the second stores the @@ -62,7 +60,7 @@ To render the sampled diagnostics (and optionally the final field), use the helper script: ```bash -julia --project=. examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png +julia --project=run examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png ``` Install `Plots.jl` in your environment first (`import Pkg; Pkg.add("Plots")`). @@ -70,10 +68,16 @@ The script falls back to a readable error if the dependency is missing. ## Convergence study -To verify spatial accuracy, run the periodic convergence sweep: +CPU run: ```bash -julia --project=. examples/convergence_linear_advection.jl +julia --project=run examples/convergence_linear_advection.jl +``` + +Metal GPU run: + +```bash +julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' ``` The driver evolves a sinusoidal field across five nested grids, prints the L₂ @@ -82,20 +86,18 @@ convergence (EOC), and finishes with the average EOC. Edit `run_convergence_study` inside the script to adjust the final time, CFL target, or refinement levels. -To exercise a GPU backend, reuse the `backend` keyword: +## Kelvin-Helmholtz instability + +CPU run: ```bash -julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' +julia --project=run examples/kelvin_helmholtz_euler.jl ``` -## Kelvin-Helmholtz instability - -The compressible Euler path is exercised by -`examples/kelvin_helmholtz_euler.jl`, which seeds a periodic Kelvin-Helmholtz -roll-up on a square box: +Metal GPU run: ```bash -julia --project=. examples/kelvin_helmholtz_euler.jl +julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)' ``` By default the driver uses a 256×256 mesh, RK2 time stepping with adaptive CFL @@ -113,7 +115,7 @@ snapshot the terminal density field, and `animation_path` (MP4 or GIF) plus Compare the serial and GPU paths with the profiling helper: ```bash -julia --project=. examples/profile_backends.jl +julia --project=run examples/profile_backends.jl ``` It benchmarks the RK2 loops for linear advection and compressible Euler across diff --git a/docs/src/examples.md b/docs/src/examples.md index 5078adc..5b75e0a 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -5,13 +5,21 @@ CurrentModule = CodexMach ``` The `examples/` directory contains reproducible scripts that exercise the core -numerics. You can run each script directly with Julia once the project -environment has been instantiated (`julia --project=.`). +numerics. You can run each script directly with Julia once the `run/` project +environment has been instantiated (`julia --project=run`). ## Linear Advection Demo +CPU: + +``` +julia --project=run examples/linear_advection_demo.jl +``` + +Metal GPU: + ``` -julia --project=. examples/linear_advection_demo.jl +julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' ``` This script transports a smooth sine blob around a rectangular periodic domain @@ -24,63 +32,56 @@ until it returns to its starting position. By default it: The function returns a named tuple with the step size, final time, and diagnostic history. When you pass `diagnostics_path="diagnostics.csv"` it writes a CSV mirroring the on-screen log, and `state_path="state.csv"` captures the final -solution for later visualisation. By default the demo runs on the serial backend -with `Float64` storage, but you can switch to a KernelAbstractions backend while -automatically selecting a GPU-friendly element type: - -``` -julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' -``` - -Override the precision explicitly with `state_eltype=Float64` (for CUDA) or -`state_eltype=Float32` if you want to minimise device memory traffic. +solution for later visualisation. Override the precision explicitly with +`state_eltype=Float64` (for CUDA) or `state_eltype=Float32` if you want to minimise +device memory traffic. To plot the diagnostics or the final field, call ``` -julia --project=. examples/plot_linear_advection.jl diagnostics.csv state.csv +julia --project=run examples/plot_linear_advection.jl diagnostics.csv state.csv ``` which generates a PDF overlaying the analytic and numerical solutions. ## Linear Advection Convergence Study +CPU: + ``` -julia --project=. examples/convergence_linear_advection.jl +julia --project=run examples/convergence_linear_advection.jl ``` -The convergence driver repeats the sinusoidal advection problem on a hierarchy -of meshes. It prints the L² error for each resolution along with the observed -order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once -the mesh is sufficiently fine. To assess GPU correctness, pass a backend symbol -when calling the driver: +Metal GPU: ``` -julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' +julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' ``` -The serial path remains the default, so simple invocations continue to run on -the CPU without any extra keywords. +The convergence driver repeats the sinusoidal advection problem on a hierarchy +of meshes. It prints the L² error for each resolution along with the observed +order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once +the mesh is sufficiently fine. The serial path remains the default, so simple +invocations continue to run on the CPU without any extra keywords. ## Kelvin–Helmholtz Instability (Compressible Euler) +CPU: + ``` -julia --project=. examples/kelvin_helmholtz_euler.jl +julia --project=run examples/kelvin_helmholtz_euler.jl ``` -This example launches a Kelvin–Helmholtz roll-up on a periodic square domain -using the compressible Euler equations and the slope-limited Rusanov fluxes. The -script logs CFL numbers, tracks the volume-averaged kinetic energy, and can -produce publication-ready figures. To exercise GPU execution, pass -`backend=:metal` (or `:cuda` if available); the helper will automatically fall -back to `Float32` storage on GPU backends while keeping `Float64` on the serial -path: +Metal GPU: ``` -julia --project=. -e 'include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)' +julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)' ``` -Further output customisation includes: +This example launches a Kelvin–Helmholtz roll-up on a periodic square domain +using the compressible Euler equations and the slope-limited Rusanov fluxes. The +script logs CFL numbers, tracks the volume-averaged kinetic energy, and can +produce publication-ready figures. Further output customisation includes: - set `diagnostics_path="diagnostics.csv"` to capture the per-step log; - set `pdf_path="khi_density.pdf"` for a static density snapshot at `t = final_time`; @@ -94,7 +95,7 @@ the reference figures checked into the repository (`khi_*.pdf/mp4`). ## Backend Profiling ``` -julia --project=. examples/profile_backends.jl +julia --project=run examples/profile_backends.jl ``` This utility benchmarks the serial driver against available KernelAbstractions