From ac64f8f0686c37445162e3e2e446517d00cc7f99 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 10:27:58 +0200
Subject: [PATCH 01/18] Adapt memory layout

---
 examples/convergence_linear_advection.jl |   3 +-
 examples/linear_advection_demo.jl        |   3 +-
 src/CodexMach.jl                         |  10 +
 src/equations.jl                         |  41 ++++
 src/fields.jl                            | 200 +++++++++++++++
 src/time_integration.jl                  | 299 ++++++++++++++---------
 test/runtests.jl                         |  22 +-
 7 files changed, 452 insertions(+), 126 deletions(-)
 create mode 100644 src/fields.jl

diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl
index 8c15736..0da6bbf 100644
--- a/examples/convergence_linear_advection.jl
+++ b/examples/convergence_linear_advection.jl
@@ -46,7 +46,8 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
 
         run_linear_advection!(state, problem; steps = steps, dt = dt)
 
-        u_num = solution(state)
+        u_field = solution(state)
+        u_num = scalar_component(u_field)
         u_exact = _exact_solution(problem, vel, final_time, u_num)
 
         err = sqrt(sum(abs2, u_num .- u_exact) / length(u_num))
diff --git a/examples/linear_advection_demo.jl b/examples/linear_advection_demo.jl
index 09913e7..0ef6360 100644
--- a/examples/linear_advection_demo.jl
+++ b/examples/linear_advection_demo.jl
@@ -78,7 +78,8 @@ function run_linear_advection_demo(; nx::Int = 256,
         _write_diagnostics_csv(diagnostics_path, records)
     end
 
-    final_state = copy(solution(state))
+    final_field = copy(solution(state))
+    final_state = scalar_component(final_field)
     if state_path !== nothing
         _write_state_csv(state_path, problem, final_state)
     end
diff --git a/src/CodexMach.jl b/src/CodexMach.jl
index 90d9995..e0b17a3 100644
--- a/src/CodexMach.jl
+++ b/src/CodexMach.jl
@@ -8,6 +8,7 @@ simulation_timers() = _SIMULATION_TIMERS
 
 include("mesh.jl")
 include("boundary_conditions.jl")
+include("fields.jl")
 include("equations.jl")
 include("time_integration.jl")
 include("simulation.jl")
@@ -20,6 +21,15 @@ export greet,
        PeriodicBoundaryConditions,
        is_periodic,
        periodic_axes,
+       CellField,
+       cell_components,
+       ncomponents,
+       spatial_size,
+       component,
+       scalar_component,
+       allocate_cellfield,
+       allocate_like,
+       map_components!,
        LinearAdvection,
        LinearAdvectionProblem,
        velocity,
diff --git a/src/equations.jl b/src/equations.jl
index a307275..c129aaf 100644
--- a/src/equations.jl
+++ b/src/equations.jl
@@ -232,6 +232,43 @@ function primitive_variables(eq::CompressibleEuler,
     return (; rho = ρ, u = u, v = v, p = p)
 end
 
+function primitive_variables(eq::CompressibleEuler,
+                             conserved::CellField;
+                             rho_out = nothing,
+                             u_out = nothing,
+                             v_out = nothing,
+                             p_out = nothing)
+    size(conserved, 1) == 4 ||
+        throw(ArgumentError("Conserved field must have first dimension of length 4"))
+
+    nx, ny = spatial_size(conserved)
+
+    T = component_eltype(conserved)
+    ρ = rho_out === nothing ? Array{float(T)}(undef, nx, ny) : rho_out
+    u = u_out === nothing ? similar(ρ) : u_out
+    v = v_out === nothing ? similar(ρ) : v_out
+    p = p_out === nothing ? similar(ρ) : p_out
+
+    ρc = component(conserved, 1)
+    rhouc = component(conserved, 2)
+    rhovc = component(conserved, 3)
+    Ec = component(conserved, 4)
+
+    @inbounds for j in 1:ny, i in 1:nx
+        prim = primitive_variables(eq,
+                                   ρc[i, j],
+                                   rhouc[i, j],
+                                   rhovc[i, j],
+                                   Ec[i, j])
+        ρ[i, j] = prim.ρ
+        u[i, j] = prim.u
+        v[i, j] = prim.v
+        p[i, j] = prim.p
+    end
+
+    return (; rho = ρ, u = u, v = v, p = p)
+end
+
 primitive_variables(problem::CompressibleEulerProblem,
                     state; kwargs...) =
     primitive_variables(pde(problem), solution(state); kwargs...)
@@ -239,3 +276,7 @@ primitive_variables(problem::CompressibleEulerProblem,
 primitive_variables(problem::CompressibleEulerProblem,
                     conserved::AbstractArray{T,3}; kwargs...) where {T} =
     primitive_variables(pde(problem), conserved; kwargs...)
+
+primitive_variables(problem::CompressibleEulerProblem,
+                    conserved::CellField; kwargs...) =
+    primitive_variables(pde(problem), conserved; kwargs...)
diff --git a/src/fields.jl b/src/fields.jl
new file mode 100644
index 0000000..4dca5fb
--- /dev/null
+++ b/src/fields.jl
@@ -0,0 +1,200 @@
+"""
+    CellField(components...)
+
+Bundle one or more cell-centered arrays that share a common `(nx, ny)` shape.
+Components are stored in a structure-of-arrays layout, which keeps per-variable
+data contiguous for GPU and cache-friendly execution. `CellField` behaves like a
+3D `AbstractArray` whose first dimension indexes the component.
+"""
+struct CellField{T,N,A<:NTuple{N,AbstractArray{T,2}}} <: AbstractArray{T,3}
+    components::A
+    function CellField(components::A) where {T,N,A<:NTuple{N,AbstractArray{T,2}}}
+        N > 0 || throw(ArgumentError("CellField must contain at least one component"))
+        dims = size(components[1])
+        for (idx, comp) in pairs(components)
+            ndims(comp) == 2 ||
+                throw(ArgumentError("Component $(idx) must be two-dimensional"))
+            size(comp) == dims ||
+                throw(ArgumentError("Component $(idx) does not match CellField shape"))
+        end
+        return new{T,N,A}(components)
+    end
+end
+
+CellField(component::AbstractArray{T,2}) where {T} =
+    CellField{T,1,Tuple{typeof(component)}}((component,))
+
+"""
+    cell_components(field)
+
+Return the tuple of component arrays stored within `field`.
+"""
+cell_components(field::CellField) = field.components
+
+"""
+    ncomponents(field)
+
+Return the number of component arrays bundled inside `field`.
+"""
+ncomponents(::CellField{T,N}) where {T,N} = N
+
+"""
+    spatial_size(field)
+
+Return the `(nx, ny)` logical dimensions shared by all components.
+"""
+spatial_size(field::CellField) = size(cell_components(field)[1])
+
+"""
+    component(field, i)
+
+Access the `i`-th component array stored inside `field`.
+"""
+component(field::CellField, i::Integer) = cell_components(field)[i]
+
+component(field::CellField, ::Val{i}) where {i} = component(field, i)
+
+component_eltype(::CellField{T}) where {T} = T
+
+scalar_component(field::CellField) = component(field, 1)
+
+Base.eltype(field::CellField) = component_eltype(field)
+
+Base.size(field::CellField) = (ncomponents(field), spatial_size(field)...)
+
+Base.axes(field::CellField) = (Base.OneTo(ncomponents(field)),
+                               axes(component(field, 1))...)
+
+Base.IndexStyle(::Type{<:CellField}) = IndexCartesian()
+
+@inline function Base.getindex(field::CellField, i, j, k)
+    return component(field, i)[j, k]
+end
+
+@inline function Base.getindex(field::CellField{T,1}, i, j) where {T}
+    return component(field, 1)[i, j]
+end
+
+@inline function Base.getindex(field::CellField{T,1}, ::Colon, ::Colon) where {T}
+    return component(field, 1)[:, :]
+end
+
+@inline function Base.getindex(field::CellField, I::CartesianIndex{3})
+    return field[I[1], I[2], I[3]]
+end
+
+@inline function Base.getindex(field::CellField, i, ::Colon, ::Colon)
+    return component(field, i)[:, :]
+end
+
+@inline function Base.getindex(field::CellField, i, j, ::Colon)
+    return component(field, i)[j, :]
+end
+
+@inline function Base.getindex(field::CellField, i, ::Colon, k)
+    return component(field, i)[:, k]
+end
+
+@inline function Base.setindex!(field::CellField, value, i, j, k)
+    component(field, i)[j, k] = value
+    return field
+end
+
+@inline function Base.setindex!(field::CellField{T,1}, value, i, j) where {T}
+    component(field, 1)[i, j] = value
+    return field
+end
+
+@inline function Base.setindex!(field::CellField, value, I::CartesianIndex{3})
+    field[I[1], I[2], I[3]] = value
+    return field
+end
+
+"""
+    allocate_cellfield(array_type, T, dims, n)
+
+Allocate a `CellField` with `n` components, each created via
+`array_type{T}(undef, dims...)`.
+"""
+function allocate_cellfield(array_type,
+                            ::Type{T},
+                            dims::NTuple{2,<:Integer},
+                            n::Integer) where {T}
+    n > 0 || throw(ArgumentError("Number of components must be positive"))
+    dimsT = (Int(dims[1]), Int(dims[2]))
+    components = ntuple(_ -> array_type{T}(undef, dimsT...), n)
+    return CellField(components)
+end
+
+"""
+    allocate_like(field)
+
+Allocate a new `CellField` whose components mirror the array types and shapes of
+`field`.
+"""
+function allocate_like(field::CellField)
+    comps = ntuple(i -> similar(component(field, i)), ncomponents(field))
+    return CellField(comps)
+end
+
+"""
+    allocate_like(field, T)
+
+Allocate a new `CellField` with components similar to those in `field` but whose
+entries have element type `T`.
+"""
+function allocate_like(field::CellField, ::Type{T}) where {T}
+    comps = ntuple(i -> similar(component(field, i), T, size(component(field, i))),
+                   ncomponents(field))
+    return CellField(comps)
+end
+
+Base.fill!(field::CellField, value::Number) = begin
+    for comp in cell_components(field)
+        fill!(comp, value)
+    end
+    return field
+end
+
+function Base.fill!(field::CellField, values::Union{Tuple,NTuple})
+    length(values) == ncomponents(field) ||
+        throw(ArgumentError("Tuple length must match number of components"))
+    for (comp, val) in zip(cell_components(field), values)
+        fill!(comp, val)
+    end
+    return field
+end
+
+function Base.copyto!(dest::CellField, src::CellField)
+    ncomponents(dest) == ncomponents(src) ||
+        throw(ArgumentError("Component count mismatch in copy"))
+    for i in 1:ncomponents(dest)
+        copyto!(component(dest, i), component(src, i))
+    end
+    return dest
+end
+
+Base.similar(field::CellField) = allocate_like(field)
+
+Base.similar(field::CellField, ::Type{T}) where {T} = allocate_like(field, T)
+
+"""
+    map_components!(f, dest, inputs...)
+
+Apply `f` to each component array in `dest`, passing the corresponding
+components drawn from each `inputs...` collection. All inputs must either be
+`CellField`s with matching component counts or scalar arrays reused for every
+component.
+"""
+function map_components!(f, dest::CellField, inputs...)
+    n = ncomponents(dest)
+    nin = length(inputs)
+    for i in 1:n
+        args = ntuple(j -> begin
+                input = inputs[j]
+                input isa CellField ? component(input, i) : input
+            end, nin)
+        f(component(dest, i), args...)
+    end
+    return dest
+end
diff --git a/src/time_integration.jl b/src/time_integration.jl
index 1ab79f7..4e6a07f 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -68,12 +68,12 @@ function LinearAdvectionState(problem::LinearAdvectionProblem;
                               init = nothing)
     mesh_obj = mesh(problem)
     dims = size(mesh_obj)
-    field = _allocate_field(array_type, T, dims)
-    _initialize_field!(field, init, mesh_obj, T)
+    field = allocate_cellfield(array_type, T, dims, 1)
+    _initialize_scalar_field!(field, init, mesh_obj, T)
 
-    k1 = _allocate_field(array_type, T, dims)
-    k2 = _allocate_field(array_type, T, dims)
-    stage = _allocate_field(array_type, T, dims)
+    k1 = allocate_like(field)
+    k2 = allocate_like(field)
+    stage = allocate_like(field)
     fill!(k1, zero(T))
     fill!(k2, zero(T))
     fill!(stage, zero(T))
@@ -81,32 +81,26 @@ function LinearAdvectionState(problem::LinearAdvectionProblem;
     return LinearAdvectionState(field, RK2Workspace(k1, k2, stage))
 end
 
-function _allocate_field(array_type, ::Type{T}, dims::Tuple{Vararg{Int}}) where {T}
-    return array_type{T}(undef, dims...)
-end
-
-function _initialize_field!(field, init, mesh::StructuredMesh, ::Type{T}) where {T}
+function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh, ::Type{T}) where {T}
+    data = component(field, 1)
     if init === nothing
-        fill!(field, zero(T))
-        return field
+        fill!(data, zero(T))
     elseif init isa Number
-        fill!(field, T(init))
-        return field
+        fill!(data, T(init))
     elseif init isa AbstractArray
-        size(init) == size(field) ||
+        size(init) == size(data) ||
             throw(ArgumentError("Initializer array must match mesh dimensions"))
-        field .= T.(init)
-        return field
+        data .= T.(init)
     elseif init isa Function
         centers_x, centers_y = cell_centers(mesh)
         nx, ny = size(mesh)
         @inbounds for j in 1:ny, i in 1:nx
-            field[i, j] = T(init(centers_x[i], centers_y[j]))
+            data[i, j] = T(init(centers_x[i], centers_y[j]))
         end
-        return field
     else
         throw(ArgumentError("Unsupported initializer type $(typeof(init))"))
     end
+    return field
 end
 
 const _EulerVarCount = 4
@@ -119,12 +113,12 @@ function CompressibleEulerState(problem::CompressibleEulerProblem;
     dims = size(mesh_obj)
     eq = pde(problem)
 
-    field = _allocate_field(array_type, T, ( _EulerVarCount, dims[1], dims[2]))
+    field = allocate_cellfield(array_type, T, dims, _EulerVarCount)
     _initialize_euler_field!(field, init, mesh_obj, eq, T)
 
-    k1 = _allocate_field(array_type, T, size(field))
-    k2 = _allocate_field(array_type, T, size(field))
-    stage = _allocate_field(array_type, T, size(field))
+    k1 = allocate_like(field)
+    k2 = allocate_like(field)
+    stage = allocate_like(field)
     fill!(k1, zero(T))
     fill!(k2, zero(T))
     fill!(stage, zero(T))
@@ -132,7 +126,11 @@ function CompressibleEulerState(problem::CompressibleEulerProblem;
     return CompressibleEulerState(field, RK2Workspace(k1, k2, stage))
 end
 
-function _initialize_euler_field!(field, init, mesh::StructuredMesh, equation::CompressibleEuler, ::Type{T}) where {T}
+function _initialize_euler_field!(field::CellField,
+                                  init,
+                                  mesh::StructuredMesh,
+                                  equation::CompressibleEuler,
+                                  ::Type{T}) where {T}
     if init === nothing
         fill!(field, zero(T))
         return field
@@ -142,7 +140,9 @@ function _initialize_euler_field!(field, init, mesh::StructuredMesh, equation::C
     elseif init isa AbstractArray
         size(init) == size(field) ||
             throw(ArgumentError("Initializer array must match (4, nx, ny) layout"))
-        field .= T.(init)
+        for comp in 1:ncomponents(field)
+            component(field, comp) .= T.(view(init, comp, :, :))
+        end
         return field
     elseif init isa Function
         centers_x, centers_y = cell_centers(mesh)
@@ -220,18 +220,42 @@ function _rk2_step!(state, problem, dt::Real)
 
     @timeit timer "RK2 step" begin
         @timeit timer "RHS compute" compute_rhs!(ws.k1, u, problem)
-        @timeit timer "Stage predictor" begin
-            @. ws.stage = u + dtT * ws.k1
-        end
+        @timeit timer "Stage predictor" _rk2_stage_predict!(ws.stage, u, ws.k1, dtT)
         @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem)
-        @timeit timer "Solution update" begin
-            @. u = u + (dtT * half) * (ws.k1 + ws.k2)
-        end
+        @timeit timer "Solution update" _rk2_solution_update!(u, ws.k1, ws.k2, dtT * half)
     end
 
     return state
 end
 
+@inline function _rk2_stage_predict!(stage::CellField,
+                                     u::CellField,
+                                     rhs::CellField,
+                                     α)
+    n = ncomponents(stage)
+    @inbounds for comp in 1:n
+        stage_comp = component(stage, comp)
+        u_comp = component(u, comp)
+        rhs_comp = component(rhs, comp)
+        @. stage_comp = u_comp + α * rhs_comp
+    end
+    return stage
+end
+
+@inline function _rk2_solution_update!(u::CellField,
+                                       k1::CellField,
+                                       k2::CellField,
+                                       α)
+    n = ncomponents(u)
+    @inbounds for comp in 1:n
+        u_comp = component(u, comp)
+        k1_comp = component(k1, comp)
+        k2_comp = component(k2, comp)
+        @. u_comp = u_comp + α * (k1_comp + k2_comp)
+    end
+    return u
+end
+
 """
     compute_rhs!(du, u, problem)
 
@@ -239,9 +263,21 @@ Populate `du` with the spatial derivative of `u` for the PDE described by
 `problem`. Linear advection problems use a second-order upwind stencil, while
 compressible Euler problems employ slope-limited Rusanov fluxes.
 """
+function compute_rhs!(du::CellField,
+                      u::CellField,
+                      problem::LinearAdvectionProblem)
+    return _linear_advection_rhs!(component(du, 1), component(u, 1), problem)
+end
+
 function compute_rhs!(du::AbstractArray{T,2},
                       u::AbstractArray{T,2},
                       problem::LinearAdvectionProblem) where {T}
+    return _linear_advection_rhs!(du, u, problem)
+end
+
+function _linear_advection_rhs!(du::AbstractArray{T,2},
+                                u::AbstractArray{T,2},
+                                problem::LinearAdvectionProblem) where {T}
     mesh_obj = mesh(problem)
     bc = boundary_conditions(problem)
     eq = pde(problem)
@@ -297,9 +333,24 @@ function compute_rhs!(du::AbstractArray{T,2},
     return du
 end
 
+function compute_rhs!(du::CellField,
+                      u::CellField,
+                      problem::CompressibleEulerProblem)
+    return _compressible_euler_rhs!(du, u, problem)
+end
+
 function compute_rhs!(du::AbstractArray{T,3},
                       u::AbstractArray{T,3},
                       problem::CompressibleEulerProblem) where {T}
+    du_field = CellField(ntuple(i -> view(du, i, :, :), Val(_EulerVarCount)))
+    u_field = CellField(ntuple(i -> view(u, i, :, :), Val(_EulerVarCount)))
+    _compressible_euler_rhs!(du_field, u_field, problem)
+    return du
+end
+
+function _compressible_euler_rhs!(du::CellField,
+                                  u::CellField,
+                                  problem::CompressibleEulerProblem)
     mesh_obj = mesh(problem)
     bc = boundary_conditions(problem)
     eq = pde(problem)
@@ -313,13 +364,23 @@ function compute_rhs!(du::AbstractArray{T,3},
     nx >= 3 || throw(ArgumentError("At least three cells required along x"))
     ny >= 3 || throw(ArgumentError("At least three cells required along y"))
 
+    T = component_eltype(u)
     dx, dy = spacing(mesh_obj)
     inv_dx = one(T) / T(dx)
     inv_dy = one(T) / T(dy)
     γ = gamma(eq)
-
     fill!(du, zero(T))
 
+    ρ = component(u, 1)
+    rhou = component(u, 2)
+    rhov = component(u, 3)
+    E = component(u, 4)
+
+    dρ = component(du, 1)
+    drhou = component(du, 2)
+    drhov = component(du, 3)
+    dE = component(du, 4)
+
     # x-direction fluxes
     @inbounds for j in 1:ny
         jm1 = j == 1 ? ny : j - 1
@@ -330,57 +391,57 @@ function compute_rhs!(du::AbstractArray{T,3},
             im = i == 1 ? nx : i - 1
 
             # Slopes for cell i
-            ΔLρ = u[1, i, j] - u[1, im, j]
-            ΔRρ = u[1, ip, j] - u[1, i, j]
-            ΔLrhox = u[2, i, j] - u[2, im, j]
-            ΔRrhox = u[2, ip, j] - u[2, i, j]
-            ΔLrhoy = u[3, i, j] - u[3, im, j]
-            ΔRrhoy = u[3, ip, j] - u[3, i, j]
-            ΔLE = u[4, i, j] - u[4, im, j]
-            ΔRE = u[4, ip, j] - u[4, i, j]
+            ΔLρ = ρ[i, j] - ρ[im, j]
+            ΔRρ = ρ[ip, j] - ρ[i, j]
+            ΔLrhox = rhou[i, j] - rhou[im, j]
+            ΔRrhox = rhou[ip, j] - rhou[i, j]
+            ΔLrhoy = rhov[i, j] - rhov[im, j]
+            ΔRrhoy = rhov[ip, j] - rhov[i, j]
+            ΔLE = E[i, j] - E[im, j]
+            ΔRE = E[ip, j] - E[i, j]
 
             sρ = _minmod(ΔLρ, ΔRρ)
             srhox = _minmod(ΔLrhox, ΔRrhox)
             srhoy = _minmod(ΔLrhoy, ΔRrhoy)
             sE = _minmod(ΔLE, ΔRE)
 
-            ρL = u[1, i, j] + T(0.5) * sρ
-            rhouL = u[2, i, j] + T(0.5) * srhox
-            rhovL = u[3, i, j] + T(0.5) * srhoy
-            EL = u[4, i, j] + T(0.5) * sE
+            ρL = ρ[i, j] + T(0.5) * sρ
+            rhouL = rhou[i, j] + T(0.5) * srhox
+            rhovL = rhov[i, j] + T(0.5) * srhoy
+            EL = E[i, j] + T(0.5) * sE
 
             # Slopes for cell ip
-            ΔLρ_ip = u[1, ip, j] - u[1, i, j]
-            ΔRρ_ip = u[1, ip2, j] - u[1, ip, j]
-            ΔLrhox_ip = u[2, ip, j] - u[2, i, j]
-            ΔRrhox_ip = u[2, ip2, j] - u[2, ip, j]
-            ΔLrhoy_ip = u[3, ip, j] - u[3, i, j]
-            ΔRrhoy_ip = u[3, ip2, j] - u[3, ip, j]
-            ΔLE_ip = u[4, ip, j] - u[4, i, j]
-            ΔRE_ip = u[4, ip2, j] - u[4, ip, j]
+            ΔLρ_ip = ρ[ip, j] - ρ[i, j]
+            ΔRρ_ip = ρ[ip2, j] - ρ[ip, j]
+            ΔLrhox_ip = rhou[ip, j] - rhou[i, j]
+            ΔRrhox_ip = rhou[ip2, j] - rhou[ip, j]
+            ΔLrhoy_ip = rhov[ip, j] - rhov[i, j]
+            ΔRrhoy_ip = rhov[ip2, j] - rhov[ip, j]
+            ΔLE_ip = E[ip, j] - E[i, j]
+            ΔRE_ip = E[ip2, j] - E[ip, j]
 
             sρ_ip = _minmod(ΔLρ_ip, ΔRρ_ip)
             srhox_ip = _minmod(ΔLrhox_ip, ΔRrhox_ip)
             srhoy_ip = _minmod(ΔLrhoy_ip, ΔRrhoy_ip)
             sE_ip = _minmod(ΔLE_ip, ΔRE_ip)
 
-            ρR = u[1, ip, j] - T(0.5) * sρ_ip
-            rhouR = u[2, ip, j] - T(0.5) * srhox_ip
-            rhovR = u[3, ip, j] - T(0.5) * srhoy_ip
-            ER = u[4, ip, j] - T(0.5) * sE_ip
+            ρR = ρ[ip, j] - T(0.5) * sρ_ip
+            rhouR = rhou[ip, j] - T(0.5) * srhox_ip
+            rhovR = rhov[ip, j] - T(0.5) * srhoy_ip
+            ER = E[ip, j] - T(0.5) * sE_ip
 
             flux1, flux2, flux3, flux4 = _rusanov_flux_x(eq, ρL, rhouL, rhovL, EL,
                                                          ρR, rhouR, rhovR, ER)
 
-            du[1, i, j] -= flux1 * inv_dx
-            du[2, i, j] -= flux2 * inv_dx
-            du[3, i, j] -= flux3 * inv_dx
-            du[4, i, j] -= flux4 * inv_dx
+            dρ[i, j] -= flux1 * inv_dx
+            drhou[i, j] -= flux2 * inv_dx
+            drhov[i, j] -= flux3 * inv_dx
+            dE[i, j] -= flux4 * inv_dx
 
-            du[1, ip, j] += flux1 * inv_dx
-            du[2, ip, j] += flux2 * inv_dx
-            du[3, ip, j] += flux3 * inv_dx
-            du[4, ip, j] += flux4 * inv_dx
+            dρ[ip, j] += flux1 * inv_dx
+            drhou[ip, j] += flux2 * inv_dx
+            drhov[ip, j] += flux3 * inv_dx
+            dE[ip, j] += flux4 * inv_dx
         end
     end
 
@@ -390,56 +451,56 @@ function compute_rhs!(du::AbstractArray{T,3},
         jp2 = jp == ny ? 1 : jp + 1
         jm = j == 1 ? ny : j - 1
         for i in 1:nx
-            ΔLρ = u[1, i, j] - u[1, i, jm]
-            ΔRρ = u[1, i, jp] - u[1, i, j]
-            ΔLrhox = u[2, i, j] - u[2, i, jm]
-            ΔRrhox = u[2, i, jp] - u[2, i, j]
-            ΔLrhoy = u[3, i, j] - u[3, i, jm]
-            ΔRrhoy = u[3, i, jp] - u[3, i, j]
-            ΔLE = u[4, i, j] - u[4, i, jm]
-            ΔRE = u[4, i, jp] - u[4, i, j]
+            ΔLρ = ρ[i, j] - ρ[i, jm]
+            ΔRρ = ρ[i, jp] - ρ[i, j]
+            ΔLrhox = rhou[i, j] - rhou[i, jm]
+            ΔRrhox = rhou[i, jp] - rhou[i, j]
+            ΔLrhoy = rhov[i, j] - rhov[i, jm]
+            ΔRrhoy = rhov[i, jp] - rhov[i, j]
+            ΔLE = E[i, j] - E[i, jm]
+            ΔRE = E[i, jp] - E[i, j]
 
             sρ = _minmod(ΔLρ, ΔRρ)
             srhox = _minmod(ΔLrhox, ΔRrhox)
             srhoy = _minmod(ΔLrhoy, ΔRrhoy)
             sE = _minmod(ΔLE, ΔRE)
 
-            ρL = u[1, i, j] + T(0.5) * sρ
-            rhouL = u[2, i, j] + T(0.5) * srhox
-            rhovL = u[3, i, j] + T(0.5) * srhoy
-            EL = u[4, i, j] + T(0.5) * sE
+            ρL = ρ[i, j] + T(0.5) * sρ
+            rhouL = rhou[i, j] + T(0.5) * srhox
+            rhovL = rhov[i, j] + T(0.5) * srhoy
+            EL = E[i, j] + T(0.5) * sE
 
-            ΔLρ_jp = u[1, i, jp] - u[1, i, j]
-            ΔRρ_jp = u[1, i, jp2] - u[1, i, jp]
-            ΔLrhox_jp = u[2, i, jp] - u[2, i, j]
-            ΔRrhox_jp = u[2, i, jp2] - u[2, i, jp]
-            ΔLrhoy_jp = u[3, i, jp] - u[3, i, j]
-            ΔRrhoy_jp = u[3, i, jp2] - u[3, i, jp]
-            ΔLE_jp = u[4, i, jp] - u[4, i, j]
-            ΔRE_jp = u[4, i, jp2] - u[4, i, jp]
+            ΔLρ_jp = ρ[i, jp] - ρ[i, j]
+            ΔRρ_jp = ρ[i, jp2] - ρ[i, jp]
+            ΔLrhox_jp = rhou[i, jp] - rhou[i, j]
+            ΔRrhox_jp = rhou[i, jp2] - rhou[i, jp]
+            ΔLrhoy_jp = rhov[i, jp] - rhov[i, j]
+            ΔRrhoy_jp = rhov[i, jp2] - rhov[i, jp]
+            ΔLE_jp = E[i, jp] - E[i, j]
+            ΔRE_jp = E[i, jp2] - E[i, jp]
 
             sρ_jp = _minmod(ΔLρ_jp, ΔRρ_jp)
             srhox_jp = _minmod(ΔLrhox_jp, ΔRrhox_jp)
             srhoy_jp = _minmod(ΔLrhoy_jp, ΔRrhoy_jp)
             sE_jp = _minmod(ΔLE_jp, ΔRE_jp)
 
-            ρR = u[1, i, jp] - T(0.5) * sρ_jp
-            rhouR = u[2, i, jp] - T(0.5) * srhox_jp
-            rhovR = u[3, i, jp] - T(0.5) * srhoy_jp
-            ER = u[4, i, jp] - T(0.5) * sE_jp
+            ρR = ρ[i, jp] - T(0.5) * sρ_jp
+            rhouR = rhou[i, jp] - T(0.5) * srhox_jp
+            rhovR = rhov[i, jp] - T(0.5) * srhoy_jp
+            ER = E[i, jp] - T(0.5) * sE_jp
 
             flux1, flux2, flux3, flux4 = _rusanov_flux_y(eq, ρL, rhouL, rhovL, EL,
                                                          ρR, rhouR, rhovR, ER)
 
-            du[1, i, j] -= flux1 * inv_dy
-            du[2, i, j] -= flux2 * inv_dy
-            du[3, i, j] -= flux3 * inv_dy
-            du[4, i, j] -= flux4 * inv_dy
+            dρ[i, j] -= flux1 * inv_dy
+            drhou[i, j] -= flux2 * inv_dy
+            drhov[i, j] -= flux3 * inv_dy
+            dE[i, j] -= flux4 * inv_dy
 
-            du[1, i, jp] += flux1 * inv_dy
-            du[2, i, jp] += flux2 * inv_dy
-            du[3, i, jp] += flux3 * inv_dy
-            du[4, i, jp] += flux4 * inv_dy
+            dρ[i, jp] += flux1 * inv_dy
+            drhou[i, jp] += flux2 * inv_dy
+            drhov[i, jp] += flux3 * inv_dy
+            dE[i, jp] += flux4 * inv_dy
         end
     end
 
@@ -510,16 +571,21 @@ function cfl_number(problem::CompressibleEulerProblem,
     max_ax = zero(eltype(u))
     max_ay = zero(eltype(u))
 
+    ρ = component(u, 1)
+    rhou = component(u, 2)
+    rhov = component(u, 3)
+    E = component(u, 4)
+
     @inbounds for j in 1:ny, i in 1:nx
-        ρ = u[1, i, j]
-        rhou = u[2, i, j]
-        rhov = u[3, i, j]
-        E = u[4, i, j]
-        invρ = one(ρ) / ρ
-        ux = rhou * invρ
-        uy = rhov * invρ
-        kinetic = 0.5 * ρ * (ux^2 + uy^2)
-        p = (γ - one(γ)) * (E - kinetic)
+        ρval = ρ[i, j]
+        rhouval = rhou[i, j]
+        rhovval = rhov[i, j]
+        Eval = E[i, j]
+        invρ = one(ρval) / ρval
+        ux = rhouval * invρ
+        uy = rhovval * invρ
+        kinetic = 0.5 * ρval * (ux^2 + uy^2)
+        p = (γ - one(γ)) * (Eval - kinetic)
         c = sqrt(abs(γ * p * invρ))
         max_ax = max(max_ax, abs(ux) + c)
         max_ay = max(max_ay, abs(uy) + c)
@@ -549,16 +615,21 @@ function stable_timestep(problem::CompressibleEulerProblem,
     max_ax = zero(eltype(u))
     max_ay = zero(eltype(u))
 
+    ρ = component(u, 1)
+    rhou = component(u, 2)
+    rhov = component(u, 3)
+    E = component(u, 4)
+
     @inbounds for j in 1:ny, i in 1:nx
-        ρ = u[1, i, j]
-        rhou = u[2, i, j]
-        rhov = u[3, i, j]
-        E = u[4, i, j]
-        invρ = one(ρ) / ρ
-        ux = rhou * invρ
-        uy = rhov * invρ
-        kinetic = 0.5 * ρ * (ux^2 + uy^2)
-        p = (γ - one(γ)) * (E - kinetic)
+        ρval = ρ[i, j]
+        rhouval = rhou[i, j]
+        rhovval = rhov[i, j]
+        Eval = E[i, j]
+        invρ = one(ρval) / ρval
+        ux = rhouval * invρ
+        uy = rhovval * invρ
+        kinetic = 0.5 * ρval * (ux^2 + uy^2)
+        p = (γ - one(γ)) * (Eval - kinetic)
         c = sqrt(abs(γ * p * invρ))
         max_ax = max(max_ax, abs(ux) + c)
         max_ay = max(max_ay, abs(uy) + c)
diff --git a/test/runtests.jl b/test/runtests.jl
index 61c5bc8..2d910c3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -47,27 +47,29 @@ end
 @testset "LinearAdvection state" begin
     problem = setup_linear_advection_problem(8, 4; velocity = (1.0, 0.0))
     state = LinearAdvectionState(problem; init = 2.0)
-    u = solution(state)
+    u_field = solution(state)
+    u = scalar_component(u_field)
     ws = workspace(state)
 
+    @test size(u_field) == (1, 8, 4)
     @test size(u) == (8, 4)
     @test all(u .== 2.0)
-    @test size(ws.k1) == size(u)
-    @test size(ws.k2) == size(u)
-    @test size(ws.stage) == size(u)
+    @test size(ws.k1) == size(u_field)
+    @test size(ws.k2) == size(u_field)
+    @test size(ws.stage) == size(u_field)
 
     mesh = CodexMach.mesh(problem)
     init_fun(x, y) = x + y
     state_fun = LinearAdvectionState(problem; init = init_fun)
-    u_fun = solution(state_fun)
+    u_fun = scalar_component(solution(state_fun))
     centers_x, centers_y = CodexMach.cell_centers(mesh)
     @inbounds for j in 1:size(u_fun, 2), i in 1:size(u_fun, 1)
         @test isapprox(u_fun[i, j], init_fun(centers_x[i], centers_y[j]); atol = 1e-12)
     end
 
-    fill!(u, 1.0)
-    compute_rhs!(ws.k1, u, problem)
-    @test all(isapprox.(ws.k1, 0.0; atol = 1e-12))
+    fill!(u_field, 1.0)
+    compute_rhs!(ws.k1, u_field, problem)
+    @test all(isapprox.(scalar_component(ws.k1), 0.0; atol = 1e-12))
 end
 
 @testset "LinearAdvection RK2" begin
@@ -83,7 +85,7 @@ end
     rk2_step!(state, problem, dt)
 
     expected = [sin(2pi * (x - dt)) for x in centers_x]
-    u = solution(state)
+    u = scalar_component(solution(state))
     @inbounds for i in 1:nx
         @test isapprox(u[i, 1], expected[i]; atol = 1e-3)
     end
@@ -115,7 +117,7 @@ end
     state = LinearAdvectionState(problem; init = init_fun)
     result = run_linear_advection!(state, problem; steps = 4, dt = dt, record_cfl = true)
     expected = [sin(2pi * (x - 4 * dt)) for x in centers_x]
-    u = solution(state)
+    u = scalar_component(solution(state))
     @inbounds for i in 1:nx
         @test isapprox(u[i, 1], expected[i]; atol = 1e-3)
     end

From 2e466b71474c5b64178e9f430034df0adab3ea0c Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 10:46:46 +0200
Subject: [PATCH 02/18] Add backend infrastructure

---
 src/CodexMach.jl        |  7 +++++++
 src/backends.jl         | 28 ++++++++++++++++++++++++++++
 src/time_integration.jl | 36 ++++++++++++++++++++++++++++++------
 test/runtests.jl        | 10 ++++++++++
 4 files changed, 75 insertions(+), 6 deletions(-)
 create mode 100644 src/backends.jl

diff --git a/src/CodexMach.jl b/src/CodexMach.jl
index e0b17a3..987f86a 100644
--- a/src/CodexMach.jl
+++ b/src/CodexMach.jl
@@ -9,6 +9,7 @@ simulation_timers() = _SIMULATION_TIMERS
 include("mesh.jl")
 include("boundary_conditions.jl")
 include("fields.jl")
+include("backends.jl")
 include("equations.jl")
 include("time_integration.jl")
 include("simulation.jl")
@@ -30,6 +31,11 @@ export greet,
        allocate_cellfield,
        allocate_like,
        map_components!,
+       ExecutionBackend,
+        SerialBackend,
+        KernelAbstractionsBackend,
+        default_backend,
+        describe,
        LinearAdvection,
        LinearAdvectionProblem,
        velocity,
@@ -41,6 +47,7 @@ export greet,
        RK2Workspace,
        solution,
        workspace,
+        backend,
        compute_rhs!,
        rk2_step!,
        cfl_number,
diff --git a/src/backends.jl b/src/backends.jl
new file mode 100644
index 0000000..6e838fb
--- /dev/null
+++ b/src/backends.jl
@@ -0,0 +1,28 @@
+abstract type ExecutionBackend end
+
+struct SerialBackend <: ExecutionBackend
+end
+
+struct KernelAbstractionsBackend{D,WS} <: ExecutionBackend
+    device::D
+    workgroupsize::WS
+end
+
+function KernelAbstractionsBackend(device; workgroupsize = nothing)
+    return KernelAbstractionsBackend{typeof(device), typeof(workgroupsize)}(device, workgroupsize)
+end
+
+KernelAbstractionsBackend(; device = :cpu, workgroupsize = nothing) =
+    KernelAbstractionsBackend(device; workgroupsize = workgroupsize)
+
+default_backend() = SerialBackend()
+
+function describe(backend::ExecutionBackend)
+    if backend isa SerialBackend
+        return "SerialBackend"
+    elseif backend isa KernelAbstractionsBackend
+        return "KernelAbstractionsBackend($(backend.device))"
+    else
+        return string(typeof(backend))
+    end
+end
diff --git a/src/time_integration.jl b/src/time_integration.jl
index 4e6a07f..efc371a 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -16,9 +16,10 @@ end
 Hold the cell-centered solution field alongside scratch buffers required by the
 RK2 integrator.
 """
-struct LinearAdvectionState{A,W}
+struct LinearAdvectionState{A,W,B}
     solution::A
     workspace::W
+    backend::B
 end
 
 """
@@ -35,15 +36,23 @@ Access the Runge-Kutta scratch buffers bundled with `state`.
 """
 workspace(state::LinearAdvectionState) = state.workspace
 
+"""
+    backend(state)
+
+Return the execution backend associated with `state`.
+"""
+backend(state::LinearAdvectionState) = state.backend
+
 """
     CompressibleEulerState(solution, workspace)
 
 Hold the conserved variables for the compressible Euler system along with RK2
 scratch storage.
 """
-struct CompressibleEulerState{A,W}
+struct CompressibleEulerState{A,W,B}
     solution::A
     workspace::W
+    backend::B
 end
 
 """
@@ -60,12 +69,15 @@ Access the Runge-Kutta scratch buffers bundled with `state`.
 """
 workspace(state::CompressibleEulerState) = state.workspace
 
+backend(state::CompressibleEulerState) = state.backend
+
 const _DefaultArrayType = Array
 
 function LinearAdvectionState(problem::LinearAdvectionProblem;
                               T::Type = Float64,
                               array_type = _DefaultArrayType,
-                              init = nothing)
+                              init = nothing,
+                              backend::ExecutionBackend = default_backend())
     mesh_obj = mesh(problem)
     dims = size(mesh_obj)
     field = allocate_cellfield(array_type, T, dims, 1)
@@ -78,7 +90,7 @@ function LinearAdvectionState(problem::LinearAdvectionProblem;
     fill!(k2, zero(T))
     fill!(stage, zero(T))
 
-    return LinearAdvectionState(field, RK2Workspace(k1, k2, stage))
+    return LinearAdvectionState(field, RK2Workspace(k1, k2, stage), backend)
 end
 
 function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh, ::Type{T}) where {T}
@@ -108,7 +120,8 @@ const _EulerVarCount = 4
 function CompressibleEulerState(problem::CompressibleEulerProblem;
                                 T::Type = Float64,
                                 array_type = _DefaultArrayType,
-                                init = nothing)
+                                init = nothing,
+                                backend::ExecutionBackend = default_backend())
     mesh_obj = mesh(problem)
     dims = size(mesh_obj)
     eq = pde(problem)
@@ -123,7 +136,7 @@ function CompressibleEulerState(problem::CompressibleEulerProblem;
     fill!(k2, zero(T))
     fill!(stage, zero(T))
 
-    return CompressibleEulerState(field, RK2Workspace(k1, k2, stage))
+    return CompressibleEulerState(field, RK2Workspace(k1, k2, stage), backend)
 end
 
 function _initialize_euler_field!(field::CellField,
@@ -211,6 +224,17 @@ function rk2_step!(state::CompressibleEulerState,
 end
 
 function _rk2_step!(state, problem, dt::Real)
+    backend_obj = backend(state)
+    if backend_obj isa SerialBackend
+        return _rk2_step_serial!(state, problem, dt)
+    elseif backend_obj isa KernelAbstractionsBackend
+        throw(ArgumentError("KernelAbstractions backends are not yet implemented for time integration"))
+    else
+        throw(ArgumentError("Unsupported execution backend $(describe(backend_obj))"))
+    end
+end
+
+function _rk2_step_serial!(state, problem, dt::Real)
     u = solution(state)
     ws = workspace(state)
     Tsol = eltype(u)
diff --git a/test/runtests.jl b/test/runtests.jl
index 2d910c3..5c107e1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -51,6 +51,8 @@ end
     u = scalar_component(u_field)
     ws = workspace(state)
 
+    @test backend(state) isa SerialBackend
+
     @test size(u_field) == (1, 8, 4)
     @test size(u) == (8, 4)
     @test all(u .== 2.0)
@@ -72,6 +74,14 @@ end
     @test all(isapprox.(scalar_component(ws.k1), 0.0; atol = 1e-12))
 end
 
+@testset "Backends" begin
+    problem = setup_linear_advection_problem(8, 4; velocity = (0.0, 0.0))
+    ka_backend = KernelAbstractionsBackend(:cpu)
+    state = LinearAdvectionState(problem; init = 1.0, backend = ka_backend)
+    @test backend(state) === ka_backend
+    @test_throws ArgumentError run_linear_advection!(state, problem; steps = 1, dt = 0.1)
+end
+
 @testset "LinearAdvection RK2" begin
     nx = 64
     problem = setup_linear_advection_problem(nx, 1; velocity = (1.0, 0.0))

From ba41fc559d1b549432a1393e0bb8ab5d87c0a1b1 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 12:30:54 +0200
Subject: [PATCH 03/18] Add KernelAbstractions dependency

---
 Project.toml                       |   2 +
 src/CodexMach.jl                   |   1 +
 src/kernel_abstractions_support.jl | 100 +++++++++++++++++++++++++++++
 src/time_integration.jl            |  80 +++++++++++++++++++++--
 test/runtests.jl                   |  11 +++-
 5 files changed, 185 insertions(+), 9 deletions(-)
 create mode 100644 src/kernel_abstractions_support.jl

diff --git a/Project.toml b/Project.toml
index 5e60efb..906bd9b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,10 +4,12 @@ authors = ["Michael Schlottke-Lakemper <michael.schlottke-lakemper@uni-a.de>"]
 version = "0.1.0"
 
 [deps]
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
 [compat]
+KernelAbstractions = "0.9.38"
 Plots = "1.41.1"
 TimerOutputs = "0.5.29"
 julia = "1.11"
diff --git a/src/CodexMach.jl b/src/CodexMach.jl
index 987f86a..6301b6f 100644
--- a/src/CodexMach.jl
+++ b/src/CodexMach.jl
@@ -10,6 +10,7 @@ include("mesh.jl")
 include("boundary_conditions.jl")
 include("fields.jl")
 include("backends.jl")
+include("kernel_abstractions_support.jl")
 include("equations.jl")
 include("time_integration.jl")
 include("simulation.jl")
diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl
new file mode 100644
index 0000000..693fafc
--- /dev/null
+++ b/src/kernel_abstractions_support.jl
@@ -0,0 +1,100 @@
+using KernelAbstractions
+
+const _SUPPORTED_SYMBOLIC_DEVICES = (:cpu,)
+
+function _resolve_ka_device(spec)
+    if spec isa Symbol
+        spec === :cpu && return KernelAbstractions.CPU()
+        throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(_SUPPORTED_SYMBOLIC_DEVICES)"))
+    elseif KernelAbstractions.isdevice(spec)
+        return spec
+    else
+        throw(ArgumentError("KernelAbstractionsBackend device must be a Symbol or device, got $(typeof(spec))"))
+    end
+end
+
+@kernel function _linear_advection_rhs_kernel!(du, u, ax, ay, inv2dx, inv2dy)
+    i, j = @index(Global, NTuple)
+    nx, ny = size(u)
+    if i <= nx && j <= ny
+        im1 = i == 1 ? nx : i - 1
+        im2 = im1 == 1 ? nx : im1 - 1
+        ip1 = i == nx ? 1 : i + 1
+        ip2 = ip1 == nx ? 1 : ip1 + 1
+
+        jm1 = j == 1 ? ny : j - 1
+        jm2 = jm1 == 1 ? ny : jm1 - 1
+        jp1 = j == ny ? 1 : j + 1
+        jp2 = jp1 == ny ? 1 : jp1 + 1
+
+        zero_ax = zero(ax)
+        dudx = zero_ax
+        if ax > zero_ax
+            dudx = (3 * u[i, j] - 4 * u[im1, j] + u[im2, j]) * inv2dx
+        elseif ax < zero_ax
+            dudx = (-3 * u[i, j] + 4 * u[ip1, j] - u[ip2, j]) * inv2dx
+        end
+
+        zero_ay = zero(ay)
+        dudy = zero_ay
+        if ay > zero_ay
+            dudy = (3 * u[i, j] - 4 * u[i, jm1] + u[i, jm2]) * inv2dy
+        elseif ay < zero_ay
+            dudy = (-3 * u[i, j] + 4 * u[i, jp1] - u[i, jp2]) * inv2dy
+        end
+
+        du[i, j] = -(ax * dudx + ay * dudy)
+    end
+end
+
+@kernel function _rk2_stage_kernel!(stage, u, rhs, dt)
+    i, j = @index(Global, NTuple)
+    nx, ny = size(stage)
+    if i <= nx && j <= ny
+        stage[i, j] = u[i, j] + dt * rhs[i, j]
+    end
+end
+
+@kernel function _rk2_update_kernel!(u, k1, k2, factor)
+    i, j = @index(Global, NTuple)
+    nx, ny = size(u)
+    if i <= nx && j <= ny
+        u[i, j] = u[i, j] + factor * (k1[i, j] + k2[i, j])
+    end
+end
+
+function linear_advection_rhs_kernel!(backend::KernelAbstractionsBackend,
+                                      du, u,
+                                      nx::Int, ny::Int,
+                                      ax, ay,
+                                      inv2dx, inv2dy)
+    device = _resolve_ka_device(backend.device)
+    kernel = backend.workgroupsize === nothing ?
+        _linear_advection_rhs_kernel!(device) :
+        _linear_advection_rhs_kernel!(device, backend.workgroupsize)
+    kernel(du, u, ax, ay, inv2dx, inv2dy; ndrange = (nx, ny))
+    KernelAbstractions.synchronize(device)
+    return du
+end
+
+function rk2_stage_kernel!(backend::KernelAbstractionsBackend,
+                           stage, u, rhs, dt)
+    device = _resolve_ka_device(backend.device)
+    kernel = backend.workgroupsize === nothing ?
+        _rk2_stage_kernel!(device) :
+        _rk2_stage_kernel!(device, backend.workgroupsize)
+    kernel(stage, u, rhs, dt; ndrange = size(stage))
+    KernelAbstractions.synchronize(device)
+    return stage
+end
+
+function rk2_update_kernel!(backend::KernelAbstractionsBackend,
+                            u, k1, k2, factor)
+    device = _resolve_ka_device(backend.device)
+    kernel = backend.workgroupsize === nothing ?
+        _rk2_update_kernel!(device) :
+        _rk2_update_kernel!(device, backend.workgroupsize)
+    kernel(u, k1, k2, factor; ndrange = size(u))
+    KernelAbstractions.synchronize(device)
+    return u
+end
diff --git a/src/time_integration.jl b/src/time_integration.jl
index efc371a..e8b47ef 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -214,23 +214,26 @@ Runge-Kutta step of size `dt`.
 function rk2_step!(state::LinearAdvectionState,
                    problem::LinearAdvectionProblem,
                    dt::Real)
-    return _rk2_step!(state, problem, dt)
+    backend_obj = backend(state)
+    if backend_obj isa SerialBackend
+        return _rk2_step_serial!(state, problem, dt)
+    elseif backend_obj isa KernelAbstractionsBackend
+        return _rk2_step_linear_advection_ka!(state, problem, dt, backend_obj)
+    else
+        throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for linear advection"))
+    end
 end
 
 function rk2_step!(state::CompressibleEulerState,
                    problem::CompressibleEulerProblem,
                    dt::Real)
-    return _rk2_step!(state, problem, dt)
-end
-
-function _rk2_step!(state, problem, dt::Real)
     backend_obj = backend(state)
     if backend_obj isa SerialBackend
         return _rk2_step_serial!(state, problem, dt)
     elseif backend_obj isa KernelAbstractionsBackend
-        throw(ArgumentError("KernelAbstractions backends are not yet implemented for time integration"))
+        throw(ArgumentError("KernelAbstractions backends are not yet implemented for compressible Euler"))
     else
-        throw(ArgumentError("Unsupported execution backend $(describe(backend_obj))"))
+        throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for compressible Euler"))
     end
 end
 
@@ -252,6 +255,69 @@ function _rk2_step_serial!(state, problem, dt::Real)
     return state
 end
 
+function _rk2_step_linear_advection_ka!(state::LinearAdvectionState,
+                                        problem::LinearAdvectionProblem,
+                                        dt::Real,
+                                        backend_obj::KernelAbstractionsBackend)
+    u_field = solution(state)
+    ws = workspace(state)
+    mesh_obj = mesh(problem)
+    dims = size(mesh_obj)
+    nx, ny = dims
+
+    u = scalar_component(u_field)
+    k1 = scalar_component(ws.k1)
+    k2 = scalar_component(ws.k2)
+    stage = scalar_component(ws.stage)
+
+    Tsol = eltype(u)
+    dtT = convert(Tsol, dt)
+    half = convert(Tsol, 0.5)
+
+    eq = pde(problem)
+    bc = boundary_conditions(problem)
+    axes = periodic_axes(bc)
+    vel = velocity(eq)
+    ax_raw, ay_raw = vel
+    ax = convert(Tsol, ax_raw)
+    ay = convert(Tsol, ay_raw)
+    dx, dy = spacing(mesh_obj)
+    inv2dx = ax == zero(ax) ? zero(Tsol) : Tsol(1) / (Tsol(2) * Tsol(dx))
+    inv2dy = ay == zero(ay) ? zero(Tsol) : Tsol(1) / (Tsol(2) * Tsol(dy))
+
+    if ax != zero(ax) && !axes[1]
+        throw(ArgumentError("Periodic boundary conditions required along x for advection"))
+    end
+    if ay != zero(ay) && !axes[2]
+        throw(ArgumentError("Periodic boundary conditions required along y for advection"))
+    end
+    if ax != zero(ax) && nx < 3
+        throw(ArgumentError("At least three cells along x are required for 2nd-order advection"))
+    end
+    if ay != zero(ay) && ny < 3
+        throw(ArgumentError("At least three cells along y are required for 2nd-order advection"))
+    end
+
+    timer = simulation_timers()
+
+    @timeit timer "RK2 step" begin
+        @timeit timer "RHS compute" begin
+            linear_advection_rhs_kernel!(backend_obj, k1, u, nx, ny, ax, ay, inv2dx, inv2dy)
+        end
+        @timeit timer "Stage predictor" begin
+            rk2_stage_kernel!(backend_obj, stage, u, k1, dtT)
+        end
+        @timeit timer "RHS compute" begin
+            linear_advection_rhs_kernel!(backend_obj, k2, stage, nx, ny, ax, ay, inv2dx, inv2dy)
+        end
+        @timeit timer "Solution update" begin
+            rk2_update_kernel!(backend_obj, u, k1, k2, dtT * half)
+        end
+    end
+
+    return state
+end
+
 @inline function _rk2_stage_predict!(stage::CellField,
                                      u::CellField,
                                      rhs::CellField,
diff --git a/test/runtests.jl b/test/runtests.jl
index 5c107e1..705bf30 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -75,11 +75,18 @@ end
 end
 
 @testset "Backends" begin
-    problem = setup_linear_advection_problem(8, 4; velocity = (0.0, 0.0))
+    problem = setup_linear_advection_problem(8, 4; velocity = (0.5, 0.0))
     ka_backend = KernelAbstractionsBackend(:cpu)
     state = LinearAdvectionState(problem; init = 1.0, backend = ka_backend)
     @test backend(state) === ka_backend
-    @test_throws ArgumentError run_linear_advection!(state, problem; steps = 1, dt = 0.1)
+
+    dt = 0.1
+    serial_state = LinearAdvectionState(problem; init = 1.0)
+    rk2_step!(serial_state, problem, dt)
+    rk2_step!(state, problem, dt)
+    serial_u = scalar_component(solution(serial_state))
+    ka_u = scalar_component(solution(state))
+    @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
 end
 
 @testset "LinearAdvection RK2" begin

From 7c10020ac65beafb6186ede7790c956b2a444b53 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 12:34:47 +0200
Subject: [PATCH 04/18] Update test

---
 test/runtests.jl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 705bf30..39a5e29 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -81,9 +81,12 @@ end
     @test backend(state) === ka_backend
 
     dt = 0.1
+    steps = 5
     serial_state = LinearAdvectionState(problem; init = 1.0)
-    rk2_step!(serial_state, problem, dt)
-    rk2_step!(state, problem, dt)
+    for _ in 1:steps
+        rk2_step!(serial_state, problem, dt)
+        rk2_step!(state, problem, dt)
+    end
     serial_u = scalar_component(solution(serial_state))
     ka_u = scalar_component(solution(state))
     @test all(isapprox.(ka_u, serial_u; atol = 1e-10))

From e53255111c34caf789174da8eaf95d3174634ee1 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 12:38:47 +0200
Subject: [PATCH 05/18] Another KA update

---
 src/kernel_abstractions_support.jl | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl
index 693fafc..6d6b8c9 100644
--- a/src/kernel_abstractions_support.jl
+++ b/src/kernel_abstractions_support.jl
@@ -6,7 +6,7 @@ function _resolve_ka_device(spec)
     if spec isa Symbol
         spec === :cpu && return KernelAbstractions.CPU()
         throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(_SUPPORTED_SYMBOLIC_DEVICES)"))
-    elseif KernelAbstractions.isdevice(spec)
+    elseif spec isa KernelAbstractions.AbstractDevice
         return spec
     else
         throw(ArgumentError("KernelAbstractionsBackend device must be a Symbol or device, got $(typeof(spec))"))
@@ -48,18 +48,16 @@ end
 end
 
 @kernel function _rk2_stage_kernel!(stage, u, rhs, dt)
-    i, j = @index(Global, NTuple)
-    nx, ny = size(stage)
-    if i <= nx && j <= ny
-        stage[i, j] = u[i, j] + dt * rhs[i, j]
+    I = @index(Global)
+    if I <= length(stage)
+        stage[I] = u[I] + dt * rhs[I]
     end
 end
 
 @kernel function _rk2_update_kernel!(u, k1, k2, factor)
-    i, j = @index(Global, NTuple)
-    nx, ny = size(u)
-    if i <= nx && j <= ny
-        u[i, j] = u[i, j] + factor * (k1[i, j] + k2[i, j])
+    I = @index(Global)
+    if I <= length(u)
+        u[I] = u[I] + factor * (k1[I] + k2[I])
     end
 end
 
@@ -83,7 +81,7 @@ function rk2_stage_kernel!(backend::KernelAbstractionsBackend,
     kernel = backend.workgroupsize === nothing ?
         _rk2_stage_kernel!(device) :
         _rk2_stage_kernel!(device, backend.workgroupsize)
-    kernel(stage, u, rhs, dt; ndrange = size(stage))
+    kernel(stage, u, rhs, dt; ndrange = length(stage))
     KernelAbstractions.synchronize(device)
     return stage
 end
@@ -94,7 +92,7 @@ function rk2_update_kernel!(backend::KernelAbstractionsBackend,
     kernel = backend.workgroupsize === nothing ?
         _rk2_update_kernel!(device) :
         _rk2_update_kernel!(device, backend.workgroupsize)
-    kernel(u, k1, k2, factor; ndrange = size(u))
+    kernel(u, k1, k2, factor; ndrange = length(u))
     KernelAbstractions.synchronize(device)
     return u
 end

From fc087792f8663bf78b8409e36ddace04e3bb947e Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 12:43:02 +0200
Subject: [PATCH 06/18] Add KA.jl backend for Euler

---
 src/time_integration.jl         | 40 ++++++++++++++++++++++++++++++++-
 test/test_compressible_euler.jl | 28 +++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/src/time_integration.jl b/src/time_integration.jl
index e8b47ef..0a69df2 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -231,7 +231,7 @@ function rk2_step!(state::CompressibleEulerState,
     if backend_obj isa SerialBackend
         return _rk2_step_serial!(state, problem, dt)
     elseif backend_obj isa KernelAbstractionsBackend
-        throw(ArgumentError("KernelAbstractions backends are not yet implemented for compressible Euler"))
+        return _rk2_step_euler_ka!(state, problem, dt, backend_obj)
     else
         throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for compressible Euler"))
     end
@@ -318,6 +318,44 @@ function _rk2_step_linear_advection_ka!(state::LinearAdvectionState,
     return state
 end
 
+function _rk2_step_euler_ka!(state::CompressibleEulerState,
+                             problem::CompressibleEulerProblem,
+                             dt::Real,
+                             backend_obj::KernelAbstractionsBackend)
+    u_field = solution(state)
+    ws = workspace(state)
+    Tsol = eltype(u_field)
+    dtT = convert(Tsol, dt)
+    half = convert(Tsol, 0.5)
+
+    timer = simulation_timers()
+
+    @timeit timer "RK2 step" begin
+        @timeit timer "RHS compute" compute_rhs!(ws.k1, u_field, problem)
+        @timeit timer "Stage predictor" begin
+            for comp in 1:ncomponents(u_field)
+                rk2_stage_kernel!(backend_obj,
+                                  component(ws.stage, comp),
+                                  component(u_field, comp),
+                                  component(ws.k1, comp),
+                                  dtT)
+            end
+        end
+        @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem)
+        @timeit timer "Solution update" begin
+            for comp in 1:ncomponents(u_field)
+                rk2_update_kernel!(backend_obj,
+                                   component(u_field, comp),
+                                   component(ws.k1, comp),
+                                   component(ws.k2, comp),
+                                   dtT * half)
+            end
+        end
+    end
+
+    return state
+end
+
 @inline function _rk2_stage_predict!(stage::CellField,
                                      u::CellField,
                                      rhs::CellField,
diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl
index f408a20..065d162 100644
--- a/test/test_compressible_euler.jl
+++ b/test/test_compressible_euler.jl
@@ -66,3 +66,31 @@ using CodexMach
     @test all(isapprox.(prim_buf.v, prim.v; atol = 1e-12))
     @test all(isapprox.(prim_buf.p, prim.p; atol = 1e-12))
 end
+
+@testset "Compressible Euler KA backend" begin
+    nx, ny = 16, 16
+    prob = setup_compressible_euler_problem(nx, ny;
+                                            lengths = (1.0, 1.0),
+                                            origin = (0.0, 0.0),
+                                            gamma = 1.4)
+
+    init(x, y) = (; rho = 1.0 + 0.05 * sinpi(x),
+                   v1 = 0.02 * cospi(y),
+                   v2 = 0.01 * sinpi(x + y),
+                   p = 1.0 + 0.02 * cospi(x))
+
+    serial_state = CompressibleEulerState(prob; init = init, T = Float64)
+    ka_state = CompressibleEulerState(prob; init = init, T = Float64,
+                                      backend = KernelAbstractionsBackend(:cpu))
+
+    steps = 3
+    dt = stable_timestep(prob, serial_state; cfl = 0.4)
+    for _ in 1:steps
+        rk2_step!(serial_state, prob, dt)
+        rk2_step!(ka_state, prob, dt)
+    end
+
+    serial_u = solution(serial_state)
+    ka_u = solution(ka_state)
+    @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
+end

From dfab12b8e9599ad67f12c5153df2348b60215c92 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 12:50:52 +0200
Subject: [PATCH 07/18] More KA implementation

---
 src/kernel_abstractions_support.jl | 248 +++++++++++++++++++++++++++++
 src/time_integration.jl            |  42 ++++-
 2 files changed, 288 insertions(+), 2 deletions(-)

diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl
index 6d6b8c9..3ffabcc 100644
--- a/src/kernel_abstractions_support.jl
+++ b/src/kernel_abstractions_support.jl
@@ -13,6 +13,75 @@ function _resolve_ka_device(spec)
     end
 end
 
+@inline function _ka_minmod(a, b)
+    S = promote_type(typeof(a), typeof(b))
+    if a * b <= 0
+        return zero(S)
+    end
+    return S(copysign(min(abs(a), abs(b)), a))
+end
+
+@inline function _ka_velocity_pressure(γ, ρ, rhou, rhov, E)
+    invρ = one(ρ) / ρ
+    ux = rhou * invρ
+    uy = rhov * invρ
+    kinetic = (one(ρ) / 2) * ρ * (ux^2 + uy^2)
+    p = (γ - one(γ)) * (E - kinetic)
+    return ux, uy, p
+end
+
+@inline function _ka_rusanov_flux_x(γ, ρL, rhouL, rhovL, EL,
+                                    ρR, rhouR, rhovR, ER)
+    uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL)
+    uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER)
+    cL = sqrt(abs(γ * pL / ρL))
+    cR = sqrt(abs(γ * pR / ρR))
+    smax = max(abs(uxL) + cL, abs(uxR) + cR)
+
+    FL1 = rhouL
+    FL2 = rhouL * uxL + pL
+    FL3 = rhouL * uyL
+    FL4 = (EL + pL) * uxL
+
+    FR1 = rhouR
+    FR2 = rhouR * uxR + pR
+    FR3 = rhouR * uyR
+    FR4 = (ER + pR) * uxR
+
+    flux1 = 0.5 * (FL1 + FR1) - 0.5 * smax * (ρR - ρL)
+    flux2 = 0.5 * (FL2 + FR2) - 0.5 * smax * (rhouR - rhouL)
+    flux3 = 0.5 * (FL3 + FR3) - 0.5 * smax * (rhovR - rhovL)
+    flux4 = 0.5 * (FL4 + FR4) - 0.5 * smax * (ER - EL)
+
+    return flux1, flux2, flux3, flux4
+end
+
+@inline function _ka_rusanov_flux_y(γ, ρL, rhouL, rhovL, EL,
+                                    ρR, rhouR, rhovR, ER)
+    uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL)
+    uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER)
+    cL = sqrt(abs(γ * pL / ρL))
+    cR = sqrt(abs(γ * pR / ρR))
+    smax = max(abs(uyL) + cL, abs(uyR) + cR)
+
+    GL1 = rhovL
+    GL2 = rhovL * uxL
+    GL3 = rhovL * uyL + pL
+    GL4 = (EL + pL) * uyL
+
+    GR1 = rhovR
+    GR2 = rhovR * uxR
+    GR3 = rhovR * uyR + pR
+    GR4 = (ER + pR) * uyR
+
+    flux1 = 0.5 * (GL1 + GR1) - 0.5 * smax * (ρR - ρL)
+    flux2 = 0.5 * (GL2 + GR2) - 0.5 * smax * (rhouR - rhouL)
+    flux3 = 0.5 * (GL3 + GR3) - 0.5 * smax * (rhovR - rhovL)
+    flux4 = 0.5 * (GL4 + GR4) - 0.5 * smax * (ER - EL)
+
+    return flux1, flux2, flux3, flux4
+end
+
 @kernel function _linear_advection_rhs_kernel!(du, u, ax, ay, inv2dx, inv2dy)
     i, j = @index(Global, NTuple)
     nx, ny = size(u)
@@ -96,3 +165,182 @@ function rk2_update_kernel!(backend::KernelAbstractionsBackend,
     KernelAbstractions.synchronize(device)
     return u
 end
+
+@kernel function _compressible_euler_rhs_kernel!(dρ, drhou, drhov, dE,
+                                                 ρ, rhou, rhov, E,
+                                                 γ, inv_dx, inv_dy)
+    i, j = @index(Global, NTuple)
+    nx, ny = size(ρ)
+    if i <= nx && j <= ny
+        ip = i == nx ? 1 : i + 1
+        ip2 = ip == nx ? 1 : ip + 1
+        im = i == 1 ? nx : i - 1
+        im2 = im == 1 ? nx : im - 1
+
+        jp = j == ny ? 1 : j + 1
+        jp2 = jp == ny ? 1 : jp + 1
+        jm = j == 1 ? ny : j - 1
+        jm2 = jm == 1 ? ny : jm - 1
+
+        half = typeof(ρ[i, j])(0.5)
+
+        # Slopes for cell i
+        ΔLρ = ρ[i, j] - ρ[im, j]
+        ΔRρ = ρ[ip, j] - ρ[i, j]
+        ΔLrhox = rhou[i, j] - rhou[im, j]
+        ΔRrhox = rhou[ip, j] - rhou[i, j]
+        ΔLrhoy = rhov[i, j] - rhov[im, j]
+        ΔRrhoy = rhov[ip, j] - rhov[i, j]
+        ΔLE = E[i, j] - E[im, j]
+        ΔRE = E[ip, j] - E[i, j]
+
+        sρ = _ka_minmod(ΔLρ, ΔRρ)
+        srhox = _ka_minmod(ΔLrhox, ΔRrhox)
+        srhoy = _ka_minmod(ΔLrhoy, ΔRrhoy)
+        sE = _ka_minmod(ΔLE, ΔRE)
+
+        ρL_plus = ρ[i, j] + half * sρ
+        rhouL_plus = rhou[i, j] + half * srhox
+        rhovL_plus = rhov[i, j] + half * srhoy
+        EL_plus = E[i, j] + half * sE
+
+        # Slopes for cell ip
+        ΔLρ_ip = ρ[ip, j] - ρ[i, j]
+        ΔRρ_ip = ρ[ip2, j] - ρ[ip, j]
+        ΔLrhox_ip = rhou[ip, j] - rhou[i, j]
+        ΔRrhox_ip = rhou[ip2, j] - rhou[ip, j]
+        ΔLrhoy_ip = rhov[ip, j] - rhov[i, j]
+        ΔRrhoy_ip = rhov[ip2, j] - rhov[ip, j]
+        ΔLE_ip = E[ip, j] - E[i, j]
+        ΔRE_ip = E[ip2, j] - E[ip, j]
+
+        sρ_ip = _ka_minmod(ΔLρ_ip, ΔRρ_ip)
+        srhox_ip = _ka_minmod(ΔLrhox_ip, ΔRrhox_ip)
+        srhoy_ip = _ka_minmod(ΔLrhoy_ip, ΔRrhoy_ip)
+        sE_ip = _ka_minmod(ΔLE_ip, ΔRE_ip)
+
+        ρR_plus = ρ[ip, j] - half * sρ_ip
+        rhouR_plus = rhou[ip, j] - half * srhox_ip
+        rhovR_plus = rhov[ip, j] - half * srhoy_ip
+        ER_plus = E[ip, j] - half * sE_ip
+
+        flux1_plus, flux2_plus, flux3_plus, flux4_plus =
+            _ka_rusanov_flux_x(γ, ρL_plus, rhouL_plus, rhovL_plus, EL_plus,
+                               ρR_plus, rhouR_plus, rhovR_plus, ER_plus)
+
+        # Interface i-1/2
+        ΔLρ_im = ρ[im, j] - ρ[im2, j]
+        ΔRρ_im = ρ[i, j] - ρ[im, j]
+        ΔLrhox_im = rhou[im, j] - rhou[im2, j]
+        ΔRrhox_im = rhou[i, j] - rhou[im, j]
+        ΔLrhoy_im = rhov[im, j] - rhov[im2, j]
+        ΔRrhoy_im = rhov[i, j] - rhov[im, j]
+        ΔLE_im = E[im, j] - E[im2, j]
+        ΔRE_im = E[i, j] - E[im, j]
+
+        sρ_im = _ka_minmod(ΔLρ_im, ΔRρ_im)
+        srhox_im = _ka_minmod(ΔLrhox_im, ΔRrhox_im)
+        srhoy_im = _ka_minmod(ΔLrhoy_im, ΔRrhoy_im)
+        sE_im = _ka_minmod(ΔLE_im, ΔRE_im)
+
+        ρL_minus = ρ[im, j] + half * sρ_im
+        rhouL_minus = rhou[im, j] + half * srhox_im
+        rhovL_minus = rhov[im, j] + half * srhoy_im
+        EL_minus = E[im, j] + half * sE_im
+
+        ρR_minus = ρ[i, j] - half * sρ
+        rhouR_minus = rhou[i, j] - half * srhox
+        rhovR_minus = rhov[i, j] - half * srhoy
+        ER_minus = E[i, j] - half * sE
+
+        flux1_minus, flux2_minus, flux3_minus, flux4_minus =
+            _ka_rusanov_flux_x(γ, ρL_minus, rhouL_minus, rhovL_minus, EL_minus,
+                               ρR_minus, rhouR_minus, rhovR_minus, ER_minus)
+
+        # Y-direction interface j+1/2
+        ΔLρ_y = ρ[i, j] - ρ[i, jm]
+        ΔRρ_y = ρ[i, jp] - ρ[i, j]
+        ΔLrhox_y = rhou[i, j] - rhou[i, jm]
+        ΔRrhox_y = rhou[i, jp] - rhou[i, j]
+        ΔLrhoy_y = rhov[i, j] - rhov[i, jm]
+        ΔRrhoy_y = rhov[i, jp] - rhov[i, j]
+        ΔLE_y = E[i, j] - E[i, jm]
+        ΔRE_y = E[i, jp] - E[i, j]
+
+        sρ_y = _ka_minmod(ΔLρ_y, ΔRρ_y)
+        srhox_y = _ka_minmod(ΔLrhox_y, ΔRrhox_y)
+        srhoy_y = _ka_minmod(ΔLrhoy_y, ΔRrhoy_y)
+        sE_y = _ka_minmod(ΔLE_y, ΔRE_y)
+
+        ρL_plus_y = ρ[i, j] + half * sρ_y
+        rhouL_plus_y = rhou[i, j] + half * srhox_y
+        rhovL_plus_y = rhov[i, j] + half * srhoy_y
+        EL_plus_y = E[i, j] + half * sE_y
+
+        ΔLρ_jp = ρ[i, jp] - ρ[i, j]
+        ΔRρ_jp = ρ[i, jp2] - ρ[i, jp]
+        ΔLrhox_jp = rhou[i, jp] - rhou[i, j]
+        ΔRrhox_jp = rhou[i, jp2] - rhou[i, jp]
+        ΔLrhoy_jp = rhov[i, jp] - rhov[i, j]
+        ΔRrhoy_jp = rhov[i, jp2] - rhov[i, jp]
+        ΔLE_jp = E[i, jp] - E[i, j]
+        ΔRE_jp = E[i, jp2] - E[i, jp]
+
+        sρ_jp = _ka_minmod(ΔLρ_jp, ΔRρ_jp)
+        srhox_jp = _ka_minmod(ΔLrhox_jp, ΔRrhox_jp)
+        srhoy_jp = _ka_minmod(ΔLrhoy_jp, ΔRrhoy_jp)
+        sE_jp = _ka_minmod(ΔLE_jp, ΔRE_jp)
+
+        ρR_plus_y = ρ[i, jp] - half * sρ_jp
+        rhouR_plus_y = rhou[i, jp] - half * srhox_jp
+        rhovR_plus_y = rhov[i, jp] - half * srhoy_jp
+        ER_plus_y = E[i, jp] - half * sE_jp
+
+        flux1_plus_y, flux2_plus_y, flux3_plus_y, flux4_plus_y =
+            _ka_rusanov_flux_y(γ, ρL_plus_y, rhouL_plus_y, rhovL_plus_y, EL_plus_y,
+                               ρR_plus_y, rhouR_plus_y, rhovR_plus_y, ER_plus_y)
+
+        # Interface j-1/2
+        ΔLρ_jm = ρ[i, jm] - ρ[i, jm2]
+        ΔRρ_jm = ρ[i, j] - ρ[i, jm]
+        ΔLrhox_jm = rhou[i, jm] - rhou[i, jm2]
+        ΔRrhox_jm = rhou[i, j] - rhou[i, jm]
+        ΔLrhoy_jm = rhov[i, jm] - rhov[i, jm2]
+        ΔRrhoy_jm = rhov[i, j] - rhov[i, jm]
+        ΔLE_jm = E[i, jm] - E[i, jm2]
+        ΔRE_jm = E[i, j] - E[i, jm]
+
+        sρ_jm = _ka_minmod(ΔLρ_jm, ΔRρ_jm)
+        srhox_jm = _ka_minmod(ΔLrhox_jm, ΔRrhox_jm)
+        srhoy_jm = _ka_minmod(ΔLrhoy_jm, ΔRrhoy_jm)
+        sE_jm = _ka_minmod(ΔLE_jm, ΔRE_jm)
+
+        ρL_minus_y = ρ[i, jm] + half * sρ_jm
+        rhouL_minus_y = rhou[i, jm] + half * srhox_jm
+        rhovL_minus_y = rhov[i, jm] + half * srhoy_jm
+        EL_minus_y = E[i, jm] + half * sE_jm
+
+        ρR_minus_y = ρ[i, j] - half * sρ_y
+        rhouR_minus_y = rhou[i, j] - half * srhox_y
+        rhovR_minus_y = rhov[i, j] - half * srhoy_y
+        ER_minus_y = E[i, j] - half * sE_y
+
+        flux1_minus_y, flux2_minus_y, flux3_minus_y, flux4_minus_y =
+            _ka_rusanov_flux_y(γ, ρL_minus_y, rhouL_minus_y, rhovL_minus_y, EL_minus_y,
+                               ρR_minus_y, rhouR_minus_y, rhovR_minus_y, ER_minus_y)
+
+        dρ_val = -(flux1_plus - flux1_minus) * inv_dx -
+                 (flux1_plus_y - flux1_minus_y) * inv_dy
+        drhou_val = -(flux2_plus - flux2_minus) * inv_dx -
+                   (flux2_plus_y - flux2_minus_y) * inv_dy
+        drhov_val = -(flux3_plus - flux3_minus) * inv_dx -
+                   (flux3_plus_y - flux3_minus_y) * inv_dy
+        dE_val = -(flux4_plus - flux4_minus) * inv_dx -
+                 (flux4_plus_y - flux4_minus_y) * inv_dy
+
+        dρ[i, j] = dρ_val
+        drhou[i, j] = drhou_val
+        drhov[i, j] = drhov_val
+        dE[i, j] = dE_val
+    end
+end
diff --git a/src/time_integration.jl b/src/time_integration.jl
index 0a69df2..69beb25 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -331,7 +331,7 @@ function _rk2_step_euler_ka!(state::CompressibleEulerState,
     timer = simulation_timers()
 
     @timeit timer "RK2 step" begin
-        @timeit timer "RHS compute" compute_rhs!(ws.k1, u_field, problem)
+        @timeit timer "RHS compute" _compressible_euler_rhs_ka!(backend_obj, ws.k1, u_field, problem)
         @timeit timer "Stage predictor" begin
             for comp in 1:ncomponents(u_field)
                 rk2_stage_kernel!(backend_obj,
@@ -341,7 +341,7 @@ function _rk2_step_euler_ka!(state::CompressibleEulerState,
                                   dtT)
             end
         end
-        @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem)
+        @timeit timer "RHS compute" _compressible_euler_rhs_ka!(backend_obj, ws.k2, ws.stage, problem)
         @timeit timer "Solution update" begin
             for comp in 1:ncomponents(u_field)
                 rk2_update_kernel!(backend_obj,
@@ -356,6 +356,44 @@ function _rk2_step_euler_ka!(state::CompressibleEulerState,
     return state
 end
 
+function _compressible_euler_rhs_ka!(backend::KernelAbstractionsBackend,
+                                     du::CellField,
+                                     u::CellField,
+                                     problem::CompressibleEulerProblem)
+    mesh_obj = mesh(problem)
+    nx, ny = size(mesh_obj)
+    eq = pde(problem)
+    T = component_eltype(u)
+    dx, dy = spacing(mesh_obj)
+    inv_dx = T(1) / T(dx)
+    inv_dy = T(1) / T(dy)
+    γ = T(gamma(eq))
+
+    for comp in 1:ncomponents(du)
+        fill!(component(du, comp), zero(T))
+    end
+
+    dρ = component(du, 1)
+    drhou = component(du, 2)
+    drhov = component(du, 3)
+    dE = component(du, 4)
+
+    ρ = component(u, 1)
+    rhou = component(u, 2)
+    rhov = component(u, 3)
+    E = component(u, 4)
+
+    device = _resolve_ka_device(backend.device)
+    kernel = backend.workgroupsize === nothing ?
+        _compressible_euler_rhs_kernel!(device) :
+        _compressible_euler_rhs_kernel!(device, backend.workgroupsize)
+    kernel(dρ, drhou, drhov, dE,
+           ρ, rhou, rhov, E,
+           γ, inv_dx, inv_dy; ndrange = (nx, ny))
+    KernelAbstractions.synchronize(device)
+    return du
+end
+
 @inline function _rk2_stage_predict!(stage::CellField,
                                      u::CellField,
                                      rhs::CellField,

From 290d3f46c0deee14c2a86d58647e7624923732c5 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 12:56:54 +0200
Subject: [PATCH 08/18] Add actual GPU backends as package extensions

---
 Project.toml                       |  8 ++++++++
 ext/CodexMachCUDAExt.jl            | 11 +++++++++++
 ext/CodexMachMetalExt.jl           | 11 +++++++++++
 src/CodexMach.jl                   |  2 ++
 src/backends.jl                    | 10 +++-------
 src/kernel_abstractions_support.jl | 17 ++++++++++++++---
 test/runtests.jl                   | 10 ++++++++++
 test/test_compressible_euler.jl    | 10 ++++++++++
 8 files changed, 69 insertions(+), 10 deletions(-)
 create mode 100644 ext/CodexMachCUDAExt.jl
 create mode 100644 ext/CodexMachMetalExt.jl

diff --git a/Project.toml b/Project.toml
index 906bd9b..6d72bac 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,14 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36"
+
+[extensions]
+CodexMachCUDAExt = "CUDA"
+CodexMachMetalExt = "Metal"
+
 [compat]
 KernelAbstractions = "0.9.38"
 Plots = "1.41.1"
diff --git a/ext/CodexMachCUDAExt.jl b/ext/CodexMachCUDAExt.jl
new file mode 100644
index 0000000..6c510e6
--- /dev/null
+++ b/ext/CodexMachCUDAExt.jl
@@ -0,0 +1,11 @@
+module CodexMachCUDAExt
+
+using CodexMach
+using KernelAbstractions
+using CUDA
+
+function __init__()
+    CodexMach.register_backend!(:cuda, () -> KernelAbstractions.CUDADevice())
+end
+
+end # module
diff --git a/ext/CodexMachMetalExt.jl b/ext/CodexMachMetalExt.jl
new file mode 100644
index 0000000..cfc9abe
--- /dev/null
+++ b/ext/CodexMachMetalExt.jl
@@ -0,0 +1,11 @@
+module CodexMachMetalExt
+
+using CodexMach
+using KernelAbstractions
+using Metal
+
+function __init__()
+    CodexMach.register_backend!(:metal, () -> KernelAbstractions.MetalDevice())
+end
+
+end # module
diff --git a/src/CodexMach.jl b/src/CodexMach.jl
index 6301b6f..fd9675d 100644
--- a/src/CodexMach.jl
+++ b/src/CodexMach.jl
@@ -32,6 +32,8 @@ export greet,
        allocate_cellfield,
        allocate_like,
        map_components!,
+       register_backend!,
+        available_backends,
        ExecutionBackend,
         SerialBackend,
         KernelAbstractionsBackend,
diff --git a/src/backends.jl b/src/backends.jl
index 6e838fb..208d927 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -18,11 +18,7 @@ KernelAbstractionsBackend(; device = :cpu, workgroupsize = nothing) =
 default_backend() = SerialBackend()
 
 function describe(backend::ExecutionBackend)
-    if backend isa SerialBackend
-        return "SerialBackend"
-    elseif backend isa KernelAbstractionsBackend
-        return "KernelAbstractionsBackend($(backend.device))"
-    else
-        return string(typeof(backend))
-    end
+    backend isa SerialBackend && return "SerialBackend"
+    backend isa KernelAbstractionsBackend && return "KernelAbstractionsBackend($(backend.device))"
+    return string(typeof(backend))
 end
diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl
index 3ffabcc..d9ee63a 100644
--- a/src/kernel_abstractions_support.jl
+++ b/src/kernel_abstractions_support.jl
@@ -1,11 +1,20 @@
 using KernelAbstractions
 
-const _SUPPORTED_SYMBOLIC_DEVICES = (:cpu,)
+const _DEVICE_REGISTRY = Dict{Symbol,Function}()
+
+function register_backend!(name::Symbol, factory::Function)
+    _DEVICE_REGISTRY[name] = factory
+    return name
+end
+
+available_backends() = collect(keys(_DEVICE_REGISTRY))
 
 function _resolve_ka_device(spec)
     if spec isa Symbol
-        spec === :cpu && return KernelAbstractions.CPU()
-        throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(_SUPPORTED_SYMBOLIC_DEVICES)"))
+        factory = get(_DEVICE_REGISTRY, spec) do
+            throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(join(sort!(collect(keys(_DEVICE_REGISTRY))), ", "))"))
+        end
+        return factory()
     elseif spec isa KernelAbstractions.AbstractDevice
         return spec
     else
@@ -344,3 +353,5 @@ end
         dE[i, j] = dE_val
     end
 end
+
+register_backend!(:cpu, () -> KernelAbstractions.CPU())
diff --git a/test/runtests.jl b/test/runtests.jl
index 39a5e29..d328c10 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -90,6 +90,16 @@ end
     serial_u = scalar_component(solution(serial_state))
     ka_u = scalar_component(solution(state))
     @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
+
+    if :metal in available_backends()
+        metal_state = LinearAdvectionState(problem; init = 1.0,
+                                           backend = KernelAbstractionsBackend(:metal))
+        for _ in 1:steps
+            rk2_step!(metal_state, problem, dt)
+        end
+        metal_u = scalar_component(solution(metal_state))
+        @test all(isapprox.(metal_u, serial_u; atol = 1e-10))
+    end
 end
 
 @testset "LinearAdvection RK2" begin
diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl
index 065d162..683b341 100644
--- a/test/test_compressible_euler.jl
+++ b/test/test_compressible_euler.jl
@@ -93,4 +93,14 @@ end
     serial_u = solution(serial_state)
     ka_u = solution(ka_state)
     @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
+
+    if :metal in available_backends()
+        metal_state = CompressibleEulerState(prob; init = init, T = Float64,
+                                             backend = KernelAbstractionsBackend(:metal))
+        for _ in 1:steps
+            rk2_step!(metal_state, prob, dt)
+        end
+        metal_u = solution(metal_state)
+        @test all(isapprox.(metal_u, serial_u; atol = 1e-10))
+    end
 end

From 09c3a394bdedc1353e08bc0c9870bc8b87bd176c Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 13:10:56 +0200
Subject: [PATCH 09/18] Make Metal a guarded test

---
 Project.toml                    | 4 ++++
 test/runtests.jl                | 2 +-
 test/test_compressible_euler.jl | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 6d72bac..155df3f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,9 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
+[extras]
+Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36"
+
 [weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36"
@@ -18,6 +21,7 @@ CodexMachMetalExt = "Metal"
 
 [compat]
 KernelAbstractions = "0.9.38"
+Metal = "1"
 Plots = "1.41.1"
 TimerOutputs = "0.5.29"
 julia = "1.11"
diff --git a/test/runtests.jl b/test/runtests.jl
index d328c10..2bf623a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -91,7 +91,7 @@ end
     ka_u = scalar_component(solution(state))
     @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
 
-    if :metal in available_backends()
+    if Sys.isapple() && :metal in available_backends()
         metal_state = LinearAdvectionState(problem; init = 1.0,
                                            backend = KernelAbstractionsBackend(:metal))
         for _ in 1:steps
diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl
index 683b341..a42a17e 100644
--- a/test/test_compressible_euler.jl
+++ b/test/test_compressible_euler.jl
@@ -94,7 +94,7 @@ end
     ka_u = solution(ka_state)
     @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
 
-    if :metal in available_backends()
+    if Sys.isapple() && :metal in available_backends()
         metal_state = CompressibleEulerState(prob; init = init, T = Float64,
                                              backend = KernelAbstractionsBackend(:metal))
         for _ in 1:steps

From d71e718861ca86eb15088f142258a2948daddaae Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 13:55:58 +0200
Subject: [PATCH 10/18] Fix backend guards

---
 Project.toml                    |  2 ++
 test/runtests.jl                | 12 +++++++++++-
 test/test_compressible_euler.jl | 12 +++++++++++-
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 155df3f..2dd69e6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
 [extras]
 Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
@@ -21,6 +22,7 @@ CodexMachMetalExt = "Metal"
 
 [compat]
 KernelAbstractions = "0.9.38"
+CUDA = "5"
 Metal = "1"
 Plots = "1.41.1"
 TimerOutputs = "0.5.29"
diff --git a/test/runtests.jl b/test/runtests.jl
index 2bf623a..6791eed 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -91,7 +91,7 @@ end
     ka_u = scalar_component(solution(state))
     @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
 
-    if Sys.isapple() && :metal in available_backends()
+    if :metal in available_backends()
         metal_state = LinearAdvectionState(problem; init = 1.0,
                                            backend = KernelAbstractionsBackend(:metal))
         for _ in 1:steps
@@ -100,6 +100,16 @@ end
         metal_u = scalar_component(solution(metal_state))
         @test all(isapprox.(metal_u, serial_u; atol = 1e-10))
     end
+
+    if :cuda in available_backends()
+        cuda_state = LinearAdvectionState(problem; init = 1.0,
+                                          backend = KernelAbstractionsBackend(:cuda))
+        for _ in 1:steps
+            rk2_step!(cuda_state, problem, dt)
+        end
+        cuda_u = scalar_component(solution(cuda_state))
+        @test all(isapprox.(cuda_u, serial_u; atol = 1e-10))
+    end
 end
 
 @testset "LinearAdvection RK2" begin
diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl
index a42a17e..5717eb6 100644
--- a/test/test_compressible_euler.jl
+++ b/test/test_compressible_euler.jl
@@ -94,7 +94,7 @@ end
     ka_u = solution(ka_state)
     @test all(isapprox.(ka_u, serial_u; atol = 1e-10))
 
-    if Sys.isapple() && :metal in available_backends()
+    if :metal in available_backends()
         metal_state = CompressibleEulerState(prob; init = init, T = Float64,
                                              backend = KernelAbstractionsBackend(:metal))
         for _ in 1:steps
@@ -103,4 +103,14 @@ end
         metal_u = solution(metal_state)
         @test all(isapprox.(metal_u, serial_u; atol = 1e-10))
     end
+
+    if :cuda in available_backends()
+        cuda_state = CompressibleEulerState(prob; init = init, T = Float64,
+                                            backend = KernelAbstractionsBackend(:cuda))
+        for _ in 1:steps
+            rk2_step!(cuda_state, prob, dt)
+        end
+        cuda_u = solution(cuda_state)
+        @test all(isapprox.(cuda_u, serial_u; atol = 1e-10))
+    end
 end

From 6d6c5fb36aabd1ebc357cec90126ce469f651f53 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 15:25:37 +0200
Subject: [PATCH 11/18] Fix UUIDs

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2dd69e6..b6671ca 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,12 +9,12 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
 [extras]
-Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-Metal = "dde4c033-4e86-5a71-bd2e-8d6045ea3b36"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 
 [extensions]
 CodexMachCUDAExt = "CUDA"

From 94b0545f18d35bbfc9411bde851af95c1ca6710a Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 15:50:08 +0200
Subject: [PATCH 12/18] First working Metal implementation

---
 examples/profile_backends.jl       | 56 ++++++++++++++++++++++++++++++
 ext/CodexMachCUDAExt.jl            |  2 ++
 ext/CodexMachMetalExt.jl           |  5 ++-
 src/backends.jl                    | 16 +++++++++
 src/kernel_abstractions_support.jl | 20 ++++++-----
 src/time_integration.jl            | 40 +++++++++++++++------
 6 files changed, 119 insertions(+), 20 deletions(-)
 create mode 100644 examples/profile_backends.jl

diff --git a/examples/profile_backends.jl b/examples/profile_backends.jl
new file mode 100644
index 0000000..f82012f
--- /dev/null
+++ b/examples/profile_backends.jl
@@ -0,0 +1,56 @@
+using CodexMach
+try
+    using Metal
+    CodexMach.register_backend!(:metal, () -> Metal.MetalBackend())
+    println("Metal backend registered")
+catch err
+    println("Metal unavailable: ", err)
+end
+
+function profile(label, setup)
+    println("\n== $label ==")
+    for (name, state, prob, dt, steps) in setup()
+        GC.gc()
+        elapsed = @elapsed for _ in 1:steps
+            rk2_step!(state, prob, dt)
+        end
+        println(rpad(name, 10), ": ", round(elapsed, digits=3), " s")
+    end
+end
+
+nx = 1024
+ny = 1024
+steps = 100
+prob_la = setup_linear_advection_problem(nx, ny; velocity=(1.0, 0.5))
+dt_la = 1e-3
+profile_T = Float32
+
+function advection_cases()
+    cases = Tuple{String,Any,Any,Float64,Int}[]
+    push!(cases, ("Serial", LinearAdvectionState(prob_la; init=1.0, T=profile_T), prob_la, dt_la, steps))
+    push!(cases, ("KA CPU", LinearAdvectionState(prob_la; init=1.0, T=profile_T, backend=KernelAbstractionsBackend(:cpu)), prob_la, dt_la, steps))
+    if :metal in available_backends()
+        push!(cases, ("Metal", LinearAdvectionState(prob_la; init=1.0, T=profile_T, backend=KernelAbstractionsBackend(:metal)), prob_la, dt_la, steps))
+    end
+    return cases
+end
+
+profile("Linear Advection $(nx)x$(ny), steps=$(steps)", advection_cases)
+
+prob_eu = setup_compressible_euler_problem(nx÷2, ny÷2; gamma=1.4)
+init(x,y) = (; rho = 1.0 + 0.1 * sinpi(x), v1 = 0.05*cospi(y), v2 = 0.02*sinpi(x+y), p = 1.0 + 0.05*cospi(x))
+serial_state = CompressibleEulerState(prob_eu; init=init)
+dt_eu = stable_timestep(prob_eu, serial_state; cfl=0.3)
+steps_eu = 100
+
+function euler_cases()
+    cases = Tuple{String,Any,Any,Float64,Int}[]
+    push!(cases, ("Serial", CompressibleEulerState(prob_eu; init=init, T=profile_T), prob_eu, dt_eu, steps_eu))
+    push!(cases, ("KA CPU", CompressibleEulerState(prob_eu; init=init, T=profile_T, backend=KernelAbstractionsBackend(:cpu)), prob_eu, dt_eu, steps_eu))
+    if :metal in available_backends()
+        push!(cases, ("Metal", CompressibleEulerState(prob_eu; init=init, T=profile_T, backend=KernelAbstractionsBackend(:metal)), prob_eu, dt_eu, steps_eu))
+    end
+    return cases
+end
+
+profile("Compressible Euler $(nx÷2)x$(ny÷2), steps=$(steps_eu)", euler_cases)
diff --git a/ext/CodexMachCUDAExt.jl b/ext/CodexMachCUDAExt.jl
index 6c510e6..2b47f55 100644
--- a/ext/CodexMachCUDAExt.jl
+++ b/ext/CodexMachCUDAExt.jl
@@ -8,4 +8,6 @@ function __init__()
     CodexMach.register_backend!(:cuda, () -> KernelAbstractions.CUDADevice())
 end
 
+CodexMach._default_array_type_from_symbol(::Val{:cuda}) = CUDA.CuArray
+
 end # module
diff --git a/ext/CodexMachMetalExt.jl b/ext/CodexMachMetalExt.jl
index cfc9abe..2fa8915 100644
--- a/ext/CodexMachMetalExt.jl
+++ b/ext/CodexMachMetalExt.jl
@@ -5,7 +5,10 @@ using KernelAbstractions
 using Metal
 
 function __init__()
-    CodexMach.register_backend!(:metal, () -> KernelAbstractions.MetalDevice())
+    CodexMach.register_backend!(:metal, () -> Metal.MetalBackend())
 end
 
+CodexMach._default_array_type_from_symbol(::Val{:metal}) = Metal.MtlArray
+CodexMach._default_array_type_from_spec(::Metal.MetalBackend) = Metal.MtlArray
+
 end # module
diff --git a/src/backends.jl b/src/backends.jl
index 208d927..d699814 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -22,3 +22,19 @@ function describe(backend::ExecutionBackend)
     backend isa KernelAbstractionsBackend && return "KernelAbstractionsBackend($(backend.device))"
     return string(typeof(backend))
 end
+
+function default_array_type(::SerialBackend)
+    return Array
+end
+
+function default_array_type(backend::KernelAbstractionsBackend)
+    return _default_array_type_from_spec(backend.device)
+end
+
+_default_array_type_from_spec(::Any) = Array
+
+function _default_array_type_from_spec(spec::Symbol)
+    return _default_array_type_from_symbol(Val(spec))
+end
+
+_default_array_type_from_symbol(::Val) = Array
diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl
index d9ee63a..ba3966f 100644
--- a/src/kernel_abstractions_support.jl
+++ b/src/kernel_abstractions_support.jl
@@ -41,6 +41,8 @@ end
 
 @inline function _ka_rusanov_flux_x(γ, ρL, rhouL, rhovL, EL,
                                     ρR, rhouR, rhovR, ER)
+    T = promote_type(typeof(ρL), typeof(ρR))
+    half = inv(T(2))
     uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL)
     uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER)
     cL = sqrt(abs(γ * pL / ρL))
@@ -57,16 +59,18 @@ end
     FR3 = rhouR * uyR
     FR4 = (ER + pR) * uxR
 
-    flux1 = 0.5 * (FL1 + FR1) - 0.5 * smax * (ρR - ρL)
-    flux2 = 0.5 * (FL2 + FR2) - 0.5 * smax * (rhouR - rhouL)
-    flux3 = 0.5 * (FL3 + FR3) - 0.5 * smax * (rhovR - rhovL)
-    flux4 = 0.5 * (FL4 + FR4) - 0.5 * smax * (ER - EL)
+    flux1 = half * (FL1 + FR1) - half * smax * (ρR - ρL)
+    flux2 = half * (FL2 + FR2) - half * smax * (rhouR - rhouL)
+    flux3 = half * (FL3 + FR3) - half * smax * (rhovR - rhovL)
+    flux4 = half * (FL4 + FR4) - half * smax * (ER - EL)
 
     return flux1, flux2, flux3, flux4
 end
 
 @inline function _ka_rusanov_flux_y(γ, ρL, rhouL, rhovL, EL,
                                     ρR, rhouR, rhovR, ER)
+    T = promote_type(typeof(ρL), typeof(ρR))
+    half = inv(T(2))
     uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL)
     uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER)
     cL = sqrt(abs(γ * pL / ρL))
@@ -83,10 +87,10 @@ end
     GR3 = rhovR * uyR + pR
     GR4 = (ER + pR) * uyR
 
-    flux1 = 0.5 * (GL1 + GR1) - 0.5 * smax * (ρR - ρL)
-    flux2 = 0.5 * (GL2 + GR2) - 0.5 * smax * (rhouR - rhouL)
-    flux3 = 0.5 * (GL3 + GR3) - 0.5 * smax * (rhovR - rhovL)
-    flux4 = 0.5 * (GL4 + GR4) - 0.5 * smax * (ER - EL)
+    flux1 = half * (GL1 + GR1) - half * smax * (ρR - ρL)
+    flux2 = half * (GL2 + GR2) - half * smax * (rhouR - rhouL)
+    flux3 = half * (GL3 + GR3) - half * smax * (rhovR - rhovL)
+    flux4 = half * (GL4 + GR4) - half * smax * (ER - EL)
 
     return flux1, flux2, flux3, flux4
 end
diff --git a/src/time_integration.jl b/src/time_integration.jl
index 69beb25..3853572 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -71,16 +71,15 @@ workspace(state::CompressibleEulerState) = state.workspace
 
 backend(state::CompressibleEulerState) = state.backend
 
-const _DefaultArrayType = Array
-
 function LinearAdvectionState(problem::LinearAdvectionProblem;
                               T::Type = Float64,
-                              array_type = _DefaultArrayType,
+                              array_type = nothing,
                               init = nothing,
                               backend::ExecutionBackend = default_backend())
     mesh_obj = mesh(problem)
     dims = size(mesh_obj)
-    field = allocate_cellfield(array_type, T, dims, 1)
+    array_type_val = array_type === nothing ? default_array_type(backend) : array_type
+    field = allocate_cellfield(array_type_val, T, dims, 1)
     _initialize_scalar_field!(field, init, mesh_obj, T)
 
     k1 = allocate_like(field)
@@ -106,8 +105,16 @@ function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh,
     elseif init isa Function
         centers_x, centers_y = cell_centers(mesh)
         nx, ny = size(mesh)
-        @inbounds for j in 1:ny, i in 1:nx
-            data[i, j] = T(init(centers_x[i], centers_y[j]))
+        if data isa Array
+            @inbounds for j in 1:ny, i in 1:nx
+                data[i, j] = T(init(centers_x[i], centers_y[j]))
+            end
+        else
+            host = Array{T}(undef, nx, ny)
+            @inbounds for j in 1:ny, i in 1:nx
+                host[i, j] = T(init(centers_x[i], centers_y[j]))
+            end
+            data .= host
         end
     else
         throw(ArgumentError("Unsupported initializer type $(typeof(init))"))
@@ -119,14 +126,15 @@ const _EulerVarCount = 4
 
 function CompressibleEulerState(problem::CompressibleEulerProblem;
                                 T::Type = Float64,
-                                array_type = _DefaultArrayType,
+                                array_type = nothing,
                                 init = nothing,
                                 backend::ExecutionBackend = default_backend())
     mesh_obj = mesh(problem)
     dims = size(mesh_obj)
     eq = pde(problem)
 
-    field = allocate_cellfield(array_type, T, dims, _EulerVarCount)
+    array_type_val = array_type === nothing ? default_array_type(backend) : array_type
+    field = allocate_cellfield(array_type_val, T, dims, _EulerVarCount)
     _initialize_euler_field!(field, init, mesh_obj, eq, T)
 
     k1 = allocate_like(field)
@@ -161,9 +169,19 @@ function _initialize_euler_field!(field::CellField,
         centers_x, centers_y = cell_centers(mesh)
         nx, ny = size(mesh)
         γ = gamma(equation)
-        @inbounds for j in 1:ny, i in 1:nx
-            ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j])
-            _store_conserved!(field, i, j, ρ, u, v, p, γ, T)
+        density = component(field, 1)
+        if density isa Array
+            @inbounds for j in 1:ny, i in 1:nx
+                ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j])
+                _store_conserved!(field, i, j, ρ, u, v, p, γ, T)
+            end
+        else
+            host_field = allocate_cellfield(Array, T, (nx, ny), _EulerVarCount)
+            @inbounds for j in 1:ny, i in 1:nx
+                ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j])
+                _store_conserved!(host_field, i, j, ρ, u, v, p, γ, T)
+            end
+            copyto!(field, host_field)
         end
         return field
     else

From c581f29ecbdcb8d0f5af3d824abc2a8d35c587c9 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 15:59:04 +0200
Subject: [PATCH 13/18] Allow running examples with KA.jl

---
 examples/convergence_linear_advection.jl | 27 ++++++++++++++++++++---
 examples/kelvin_helmholtz_euler.jl       | 28 ++++++++++++++++++++----
 examples/linear_advection_demo.jl        | 27 ++++++++++++++++++++++-
 3 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl
index 0da6bbf..612e52f 100644
--- a/examples/convergence_linear_advection.jl
+++ b/examples/convergence_linear_advection.jl
@@ -5,7 +5,8 @@ using Printf
 """
     run_convergence_study(; base_resolution=(16, 16), levels=5,
                              lengths=(1.0, 1.0), velocity=(1.0, 0.5),
-                             final_time=0.5, cfl=0.45)
+                             final_time=0.5, cfl=0.45,
+                             backend=default_backend(), state_eltype=nothing)
 
 Execute a grid refinement study for periodic linear advection with a sinusoidal
 initial condition on a square domain. Prints the L₂ error for each resolution,
@@ -16,7 +17,9 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
                                 lengths::Tuple{<:Real,<:Real} = (1.0, 1.0),
                                 velocity::Tuple{<:Real,<:Real} = (1.0, 0.5),
                                 final_time::Real = 0.5,
-                                cfl::Real = 0.45)
+                                cfl::Real = 0.45,
+                                backend::Union{ExecutionBackend,Symbol} = default_backend(),
+                                state_eltype::Union{Nothing,Type} = nothing)
     @assert levels >= 2 "Need at least two levels to measure convergence"
 
     Lx, Ly = float(lengths[1]), float(lengths[2])
@@ -31,6 +34,8 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
     println(" level    nx    ny        dt          L2 error        EOC")
 
     init = _sinusoidal_initializer(Lx, Ly)
+    backend_obj = _resolve_example_backend(backend)
+    state_T = isnothing(state_eltype) ? _default_state_eltype(backend_obj) : state_eltype
 
     for level in 0:levels-1
         nx = nx0 * 2^level
@@ -38,7 +43,7 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
         problem = setup_linear_advection_problem(nx, ny;
                                                  lengths = (Lx, Ly),
                                                  velocity = vel)
-        state = LinearAdvectionState(problem; init = init)
+        state = LinearAdvectionState(problem; init = init, T = state_T, backend = backend_obj)
 
         dt_stable = stable_timestep(problem; cfl = cfl)
         steps = max(1, ceil(Int, final_time / dt_stable))
@@ -75,6 +80,22 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
             errors = errors, eocs = eocs)
 end
 
+function _resolve_example_backend(spec::ExecutionBackend)
+    return spec
+end
+
+function _resolve_example_backend(spec::Symbol)
+    return KernelAbstractionsBackend(spec)
+end
+
+function _default_state_eltype(::ExecutionBackend)
+    return Float64
+end
+
+function _default_state_eltype(::KernelAbstractionsBackend)
+    return Float32
+end
+
 function _sinusoidal_initializer(Lx::Float64, Ly::Float64)
     function init(x, y)
         return sin(2pi * x / Lx) * sin(2pi * y / Ly)
diff --git a/examples/kelvin_helmholtz_euler.jl b/examples/kelvin_helmholtz_euler.jl
index cf2ac25..3d33b57 100644
--- a/examples/kelvin_helmholtz_euler.jl
+++ b/examples/kelvin_helmholtz_euler.jl
@@ -5,7 +5,7 @@ using Plots
 
 """
     run_kelvin_helmholtz(; nx=256, ny=256, gamma=1.4, final_time=1.5,
-                           cfl=0.45, T=Float32, log_every=25,
+                           cfl=0.45, T=Float32, backend=default_backend(), log_every=25,
                            diagnostics_path=nothing,
                            pdf_path=nothing,
                            animation_path=nothing,
@@ -22,7 +22,8 @@ function run_kelvin_helmholtz(; nx::Int = 256,
                                gamma::Real = 1.4,
                                final_time::Real = 1.5,
                                cfl::Real = 0.45,
-                               T::Type = Float32,
+                               T::Union{Type,Nothing} = Float32,
+                               backend::Union{ExecutionBackend,Symbol} = default_backend(),
                                log_every::Integer = 25,
                                diagnostics_path::Union{Nothing,AbstractString} = nothing,
                                pdf_path::Union{Nothing,AbstractString} = nothing,
@@ -42,8 +43,11 @@ function run_kelvin_helmholtz(; nx::Int = 256,
                                                gamma = gamma,
                                                boundary_conditions = PeriodicBoundaryConditions())
 
-    init = _kelvin_helmholtz_initializer(T)
-    state = CompressibleEulerState(problem; T = T, init = init)
+    backend_obj = _resolve_example_backend(backend)
+    state_T = isnothing(T) ? _default_state_eltype(backend_obj) : T
+
+    init = _kelvin_helmholtz_initializer(state_T)
+    state = CompressibleEulerState(problem; T = state_T, init = init, backend = backend_obj)
 
     sim_time = 0.0
     step = 0
@@ -112,6 +116,22 @@ function run_kelvin_helmholtz(; nx::Int = 256,
             animation_path = animation_path)
 end
 
+function _resolve_example_backend(spec::ExecutionBackend)
+    return spec
+end
+
+function _resolve_example_backend(spec::Symbol)
+    return KernelAbstractionsBackend(spec)
+end
+
+function _default_state_eltype(::ExecutionBackend)
+    return Float64
+end
+
+function _default_state_eltype(::KernelAbstractionsBackend)
+    return Float32
+end
+
 function _kelvin_helmholtz_initializer(::Type{T}) where {T}
     slope = T(15)
     offset = T(7.5)
diff --git a/examples/linear_advection_demo.jl b/examples/linear_advection_demo.jl
index 0ef6360..1ed2348 100644
--- a/examples/linear_advection_demo.jl
+++ b/examples/linear_advection_demo.jl
@@ -4,6 +4,7 @@ using CodexMach
 """
     run_linear_advection_demo(; nx=256, ny=128, lengths=(2.0, 1.0),
                                velocity=(1.0, 0.0), cfl=0.4, steps=100,
+                               backend=default_backend(), state_eltype=nothing,
                                sample_every=nothing,
                                diagnostics_path=nothing, state_path=nothing,
                                match_return_period=true)
@@ -22,6 +23,8 @@ function run_linear_advection_demo(; nx::Int = 256,
                                     velocity::Tuple{<:Real,<:Real} = (1.0, 0.0),
                                     cfl::Real = 0.4,
                                     steps::Int = 100,
+                                    backend::Union{ExecutionBackend,Symbol} = default_backend(),
+                                    state_eltype::Union{Nothing,Type} = nothing,
                                     sample_every::Union{Nothing,Int} = nothing,
                                     diagnostics_path::Union{Nothing,AbstractString} = nothing,
                                     state_path::Union{Nothing,AbstractString} = nothing,
@@ -37,8 +40,14 @@ function run_linear_advection_demo(; nx::Int = 256,
     problem = setup_linear_advection_problem(nx_cells, ny_cells;
                                              velocity = velocity,
                                              lengths = lengths_tuple)
+    backend_obj = _resolve_example_backend(backend)
+    Tstate = isnothing(state_eltype) ? _default_state_eltype(backend_obj) : state_eltype
+
     init_field = _sine_blob_initializer(lengths_tuple)
-    state = LinearAdvectionState(problem; init = init_field)
+    state = LinearAdvectionState(problem;
+                                 init = init_field,
+                                 T = Tstate,
+                                 backend = backend_obj)
 
     dt_cfl = stable_timestep(problem; cfl = cfl)
     target_time = _return_period(lengths_tuple, velocity)
@@ -92,6 +101,22 @@ function run_linear_advection_demo(; nx::Int = 256,
                             target_time = target_time))
 end
 
+function _resolve_example_backend(spec::ExecutionBackend)
+    return spec
+end
+
+function _resolve_example_backend(spec::Symbol)
+    return KernelAbstractionsBackend(spec)
+end
+
+function _default_state_eltype(::ExecutionBackend)
+    return Float64
+end
+
+function _default_state_eltype(::KernelAbstractionsBackend)
+    return Float32
+end
+
 function _write_diagnostics_csv(path::AbstractString,
                                 records::Vector{NamedTuple{(:step, :time, :rms, :cfl), NTuple{4,Float64}}})
     open(path, "w") do io

From 765b21cb360d4011ce9a17b91f447344cef1291a Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 16:03:54 +0200
Subject: [PATCH 14/18] Document stuff

---
 README.md            | 43 ++++++++++++++++++++++++++++++++++++-------
 docs/src/examples.md | 43 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index a4526c2..90b53a1 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,16 @@ julia --project=. examples/linear_advection_demo.jl
 ```
 
 It prints periodic RMS diagnostics along with the CFL number used for the run.
-Tune parameters by editing the keyword arguments in `run_linear_advection_demo`.
+The driver defaults to the serial backend with `Float64` storage, but you can
+switch to a registered KernelAbstractions backend (dropping to `Float32` by
+default on GPUs) via keyword arguments:
+
+```bash
+julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
+```
+
+Override the precision explicitly with `state_eltype=Float64` if you want to run
+the GPU case in double precision (CUDA) or stay on `Float32` for Metal.
 
 To capture diagnostics, pass output paths (created if missing):
 
@@ -73,6 +82,12 @@ convergence (EOC), and finishes with the average EOC. Edit
 `run_convergence_study` inside the script to adjust the final time, CFL target,
 or refinement levels.
 
+To exercise a GPU backend, reuse the `backend` keyword:
+
+```bash
+julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
+```
+
 ## Kelvin-Helmholtz instability
 
 The compressible Euler path is exercised by
@@ -84,12 +99,26 @@ julia --project=. examples/kelvin_helmholtz_euler.jl
 ```
 
 By default the driver uses a 256×256 mesh, RK2 time stepping with adaptive CFL
-control, and prints periodic log messages. Pass a file path via
-`diagnostics_path` to capture per-step CFL and kinetic-energy measurements. The
-routine returns the final state so you can post-process density, vorticity, or
-other derived fields. Supply `pdf_path` to snapshot the terminal density field,
-and `animation_path` (MP4 or GIF) plus `animation_every`/`animation_fps` to
-produce a time-resolved movie.
+control, and prints periodic log messages. Set `backend=:metal` (or `:cuda`)
+to run the scenario on a GPU; the helper automatically falls back to `Float32`
+storage for KernelAbstractions backends while keeping `Float64` for the serial
+path. Pass a file path via `diagnostics_path` to capture per-step CFL and
+kinetic-energy measurements. The routine returns the final state so you can
+post-process density, vorticity, or other derived fields. Supply `pdf_path` to
+snapshot the terminal density field, and `animation_path` (MP4 or GIF) plus
+`animation_every`/`animation_fps` to produce a time-resolved movie.
+
+## Backend profiling
+
+Compare the serial and GPU paths with the profiling helper:
+
+```bash
+julia --project=. examples/profile_backends.jl
+```
+
+It benchmarks the RK2 loops for linear advection and compressible Euler across
+the serial and any registered KernelAbstractions backends (CPU, Metal, CUDA),
+forcing `Float32` to keep Metal hardware supported.
 
 ## Maintainer
 
diff --git a/docs/src/examples.md b/docs/src/examples.md
index 2e8d9b1..5078adc 100644
--- a/docs/src/examples.md
+++ b/docs/src/examples.md
@@ -24,7 +24,16 @@ until it returns to its starting position. By default it:
 The function returns a named tuple with the step size, final time, and diagnostic
 history. When you pass `diagnostics_path="diagnostics.csv"` it writes a CSV
 mirroring the on-screen log, and `state_path="state.csv"` captures the final
-solution for later visualisation.
+solution for later visualisation. By default the demo runs on the serial backend
+with `Float64` storage, but you can switch to a KernelAbstractions backend while
+automatically selecting a GPU-friendly element type:
+
+```
+julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
+```
+
+Override the precision explicitly with `state_eltype=Float64` (for CUDA) or
+`state_eltype=Float32` if you want to minimise device memory traffic.
 
 To plot the diagnostics or the final field, call
 
@@ -43,7 +52,15 @@ julia --project=. examples/convergence_linear_advection.jl
 The convergence driver repeats the sinusoidal advection problem on a hierarchy
 of meshes. It prints the L² error for each resolution along with the observed
 order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once
-the mesh is sufficiently fine.
+the mesh is sufficiently fine. To assess GPU correctness, pass a backend symbol
+when calling the driver:
+
+```
+julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
+```
+
+The serial path remains the default, so simple invocations continue to run on
+the CPU without any extra keywords.
 
 ## Kelvin–Helmholtz Instability (Compressible Euler)
 
@@ -54,7 +71,16 @@ julia --project=. examples/kelvin_helmholtz_euler.jl
 This example launches a Kelvin–Helmholtz roll-up on a periodic square domain
 using the compressible Euler equations and the slope-limited Rusanov fluxes. The
 script logs CFL numbers, tracks the volume-averaged kinetic energy, and can
-produce publication-ready figures:
+produce publication-ready figures. To exercise GPU execution, pass
+`backend=:metal` (or `:cuda` if available); the helper will automatically fall
+back to `Float32` storage on GPU backends while keeping `Float64` on the serial
+path:
+
+```
+julia --project=. -e 'include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)'
+```
+
+Further output customisation includes:
 
 - set `diagnostics_path="diagnostics.csv"` to capture the per-step log;
 - set `pdf_path="khi_density.pdf"` for a static density snapshot at `t = final_time`;
@@ -65,6 +91,17 @@ A stable run with the default settings reaches `t = 1.5` in roughly a thousand
 steps with CFL ≈ 0.4–0.45. The density field develops rolled vortices that match
 the reference figures checked into the repository (`khi_*.pdf/mp4`).
 
+## Backend Profiling
+
+```
+julia --project=. examples/profile_backends.jl
+```
+
+This utility benchmarks the serial driver against available KernelAbstractions
+backends for both linear advection and compressible Euler. It uses `Float32`
+storage so Metal-backed GPUs remain supported and prints per-backend wall-clock
+timings for quick regressions after kernel changes.
+
 ## Tips
 
 - All drivers expose keyword arguments for mesh size, timestep control, and file

From 6354616e4800dbee2be9bf72364c23bb7d5b2516 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 16:54:51 +0200
Subject: [PATCH 15/18] Update for EUler GPU stuff

---
 docs/src/api.md                    |  14 +++
 examples/kelvin_helmholtz_euler.jl |  33 ++++--
 src/equations.jl                   | 170 ++++++++++++++++++++++-------
 src/kernel_abstractions_support.jl |  40 +++++++
 src/time_integration.jl            |  95 ++++++++++------
 5 files changed, 275 insertions(+), 77 deletions(-)

diff --git a/docs/src/api.md b/docs/src/api.md
index 197d5ce..42212d9 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -62,6 +62,20 @@ stable_timestep
 cfl_number
 ```
 
+## Cell Fields
+
+```@docs
+CellField
+allocate_cellfield
+allocate_like
+map_components!
+cell_components
+component
+ncomponents
+spatial_size
+backend
+```
+
 ## Simulation Drivers
 
 ```@docs
diff --git a/examples/kelvin_helmholtz_euler.jl b/examples/kelvin_helmholtz_euler.jl
index 3d33b57..9f038a9 100644
--- a/examples/kelvin_helmholtz_euler.jl
+++ b/examples/kelvin_helmholtz_euler.jl
@@ -53,7 +53,8 @@ function run_kelvin_helmholtz(; nx::Int = 256,
     step = 0
     last_cfl = NaN
     records = diagnostics_path === nothing ? nothing : Vector{NamedTuple{(:step, :time, :cfl, :kinetic_energy),NTuple{4,Float64}}}()
-    prim_buffers = primitive_variables(problem, solution(state))
+    prim_buffers = primitive_variables(problem, solution(state);
+                                       backend = CodexMach.backend(state))
     last_progress = time()
     centers_x, centers_y = cell_centers(mesh(problem))
     animation_obj = animation_path === nothing ? nothing : Animation()
@@ -152,19 +153,31 @@ function _volume_average_kinetic_energy(state::CompressibleEulerState,
                                         problem::CompressibleEulerProblem,
                                         buffers)
     prim = primitive_variables(problem, solution(state);
+                                backend = CodexMach.backend(state),
                                 rho_out = buffers.rho,
                                 u_out = buffers.u,
                                 v_out = buffers.v,
                                 p_out = buffers.p)
 
-    nx, ny = size(prim.rho)
-    total = zero(eltype(prim.rho))
-    half = convert(eltype(prim.rho), 0.5)
-    @inbounds for j in 1:ny, i in 1:nx
-        vel2 = prim.u[i, j]^2 + prim.v[i, j]^2
-        total += half * prim.rho[i, j] * vel2
+    ρ = prim.rho
+    u = prim.u
+    v = prim.v
+    nx, ny = size(ρ)
+    if ρ isa Array
+        total = zero(eltype(ρ))
+        half = convert(eltype(ρ), 0.5)
+        @inbounds for j in 1:ny, i in 1:nx
+            vel2 = u[i, j]^2 + v[i, j]^2
+            total += half * ρ[i, j] * vel2
+        end
+        return float(total) / (nx * ny), prim
+    else
+        T = eltype(ρ)
+        half = inv(convert(T, 2))
+        vel2 = u .* u .+ v .* v
+        total = sum(ρ .* vel2 .* half)
+        return float(total) / (nx * ny), prim
     end
-    return float(total) / (nx * ny), prim
 end
 
 function _write_khi_diagnostics(path::AbstractString,
@@ -185,6 +198,7 @@ function _write_khi_pdf(path::AbstractString,
                         centers_x,
                         centers_y)
     prim = primitive_variables(problem, solution(state);
+                                backend = CodexMach.backend(state),
                                 rho_out = buffers.rho,
                                 u_out = buffers.u,
                                 v_out = buffers.v,
@@ -196,9 +210,10 @@ function _write_khi_pdf(path::AbstractString,
 end
 
 function _density_plot(centers_x, centers_y, density; title::AbstractString)
+    dens = density isa Array ? density : Array(density)
     heatmap(centers_x,
             centers_y,
-            density';
+            dens';
             xlabel = "x",
             ylabel = "y",
             title = title,
diff --git a/src/equations.jl b/src/equations.jl
index c129aaf..563f300 100644
--- a/src/equations.jl
+++ b/src/equations.jl
@@ -201,28 +201,38 @@ Convert a conserved-field array with layout `(4, nx, ny)` to primitive
 variables. Optionally provide preallocated output arrays via the keyword
 arguments. Returns a named tuple `(rho, u, v, p)`.
 """
-function primitive_variables(eq::CompressibleEuler,
-                             conserved::AbstractArray{T,3};
-                             rho_out = nothing,
-                             u_out = nothing,
-                             v_out = nothing,
-                             p_out = nothing) where {T}
-    size(conserved, 1) == 4 ||
-        throw(ArgumentError("Conserved field must have first dimension of length 4"))
+function _primitive_variables_cpu(eq::CompressibleEuler,
+                                  conserved::CellField;
+                                  rho_out = nothing,
+                                  u_out = nothing,
+                                  v_out = nothing,
+                                  p_out = nothing)
+    nx, ny = spatial_size(conserved)
 
-    nx, ny = size(conserved, 2), size(conserved, 3)
+    T = component_eltype(conserved)
+    out_T = float(T)
 
-    ρ = rho_out === nothing ? Array{float(T)}(undef, nx, ny) : rho_out
+    ρ = rho_out === nothing ? Array{out_T}(undef, nx, ny) : rho_out
     u = u_out === nothing ? similar(ρ) : u_out
     v = v_out === nothing ? similar(ρ) : v_out
     p = p_out === nothing ? similar(ρ) : p_out
 
+    size(ρ) == (nx, ny) ||
+        throw(ArgumentError("rho_out size mismatch"))
+    (size(u) == (nx, ny) && size(v) == (nx, ny) && size(p) == (nx, ny)) ||
+        throw(ArgumentError("Primitive output buffers must match conserved field shape"))
+
+    ρc = component(conserved, 1)
+    rhouc = component(conserved, 2)
+    rhovc = component(conserved, 3)
+    Ec = component(conserved, 4)
+
     @inbounds for j in 1:ny, i in 1:nx
         prim = primitive_variables(eq,
-                                   conserved[1, i, j],
-                                   conserved[2, i, j],
-                                   conserved[3, i, j],
-                                   conserved[4, i, j])
+                                   ρc[i, j],
+                                   rhouc[i, j],
+                                   rhovc[i, j],
+                                   Ec[i, j])
         ρ[i, j] = prim.ρ
         u[i, j] = prim.u
         v[i, j] = prim.v
@@ -232,34 +242,34 @@ function primitive_variables(eq::CompressibleEuler,
     return (; rho = ρ, u = u, v = v, p = p)
 end
 
-function primitive_variables(eq::CompressibleEuler,
-                             conserved::CellField;
-                             rho_out = nothing,
-                             u_out = nothing,
-                             v_out = nothing,
-                             p_out = nothing)
+function _primitive_variables_cpu(eq::CompressibleEuler,
+                                  conserved::AbstractArray{T,3};
+                                  rho_out = nothing,
+                                  u_out = nothing,
+                                  v_out = nothing,
+                                  p_out = nothing) where {T}
     size(conserved, 1) == 4 ||
         throw(ArgumentError("Conserved field must have first dimension of length 4"))
 
-    nx, ny = spatial_size(conserved)
+    nx, ny = size(conserved, 2), size(conserved, 3)
+    out_T = float(T)
 
-    T = component_eltype(conserved)
-    ρ = rho_out === nothing ? Array{float(T)}(undef, nx, ny) : rho_out
+    ρ = rho_out === nothing ? Array{out_T}(undef, nx, ny) : rho_out
     u = u_out === nothing ? similar(ρ) : u_out
     v = v_out === nothing ? similar(ρ) : v_out
     p = p_out === nothing ? similar(ρ) : p_out
 
-    ρc = component(conserved, 1)
-    rhouc = component(conserved, 2)
-    rhovc = component(conserved, 3)
-    Ec = component(conserved, 4)
+    size(ρ) == (nx, ny) ||
+        throw(ArgumentError("rho_out size mismatch"))
+    (size(u) == (nx, ny) && size(v) == (nx, ny) && size(p) == (nx, ny)) ||
+        throw(ArgumentError("Primitive output buffers must match conserved field shape"))
 
     @inbounds for j in 1:ny, i in 1:nx
         prim = primitive_variables(eq,
-                                   ρc[i, j],
-                                   rhouc[i, j],
-                                   rhovc[i, j],
-                                   Ec[i, j])
+                                   conserved[1, i, j],
+                                   conserved[2, i, j],
+                                   conserved[3, i, j],
+                                   conserved[4, i, j])
         ρ[i, j] = prim.ρ
         u[i, j] = prim.u
         v[i, j] = prim.v
@@ -269,14 +279,100 @@ function primitive_variables(eq::CompressibleEuler,
     return (; rho = ρ, u = u, v = v, p = p)
 end
 
+function _primitive_variables_gpu(eq::CompressibleEuler,
+                                  conserved::CellField,
+                                  backend::KernelAbstractionsBackend;
+                                  rho_out = nothing,
+                                  u_out = nothing,
+                                  v_out = nothing,
+                                  p_out = nothing)
+    size(conserved, 1) == 4 ||
+        throw(ArgumentError("Conserved field must have first dimension of length 4"))
+
+    ρc = component(conserved, 1)
+    rhouc = component(conserved, 2)
+    rhovc = component(conserved, 3)
+    Ec = component(conserved, 4)
+
+    ρ = rho_out === nothing ? similar(ρc) : rho_out
+    u = u_out === nothing ? similar(ρc) : u_out
+    v = v_out === nothing ? similar(ρc) : v_out
+    p = p_out === nothing ? similar(ρc) : p_out
+
+    size(ρ) == size(ρc) ||
+        throw(ArgumentError("rho_out size mismatch"))
+    (size(u) == size(ρc) && size(v) == size(ρc) && size(p) == size(ρc)) ||
+        throw(ArgumentError("Primitive output buffers must match conserved field shape"))
+
+    T = eltype(ρc)
+    γT = convert(T, gamma(eq))
+    γm1 = γT - one(γT)
+    epsT = eps(T)
+
+    primitive_variables_kernel!(backend,
+                                ρ, u, v, p,
+                                ρc, rhouc, rhovc, Ec,
+                                γm1, epsT)
+
+    return (; rho = ρ, u = u, v = v, p = p)
+end
+
+function primitive_variables(eq::CompressibleEuler,
+                             conserved::CellField;
+                             backend::ExecutionBackend = SerialBackend(),
+                             rho_out = nothing,
+                             u_out = nothing,
+                             v_out = nothing,
+                             p_out = nothing)
+    if backend isa KernelAbstractionsBackend
+        return _primitive_variables_gpu(eq, conserved, backend;
+                                        rho_out = rho_out,
+                                        u_out = u_out,
+                                        v_out = v_out,
+                                        p_out = p_out)
+    else
+        return _primitive_variables_cpu(eq, conserved;
+                                        rho_out = rho_out,
+                                        u_out = u_out,
+                                        v_out = v_out,
+                                        p_out = p_out)
+    end
+end
+
+function primitive_variables(eq::CompressibleEuler,
+                             conserved::AbstractArray{T,3};
+                             backend::ExecutionBackend = SerialBackend(),
+                             rho_out = nothing,
+                             u_out = nothing,
+                             v_out = nothing,
+                             p_out = nothing) where {T}
+    backend isa KernelAbstractionsBackend &&
+        throw(ArgumentError("primitive_variables does not support generic AbstractArray inputs on GPU"))
+    return _primitive_variables_cpu(eq, conserved;
+                                     rho_out = rho_out,
+                                     u_out = u_out,
+                                     v_out = v_out,
+                                     p_out = p_out)
+end
+
 primitive_variables(problem::CompressibleEulerProblem,
-                    state; kwargs...) =
-    primitive_variables(pde(problem), solution(state); kwargs...)
+                    state;
+                    backend::ExecutionBackend = backend(state), kwargs...) =
+    primitive_variables(pde(problem),
+                        solution(state);
+                        backend = backend,
+                        kwargs...)
 
 primitive_variables(problem::CompressibleEulerProblem,
-                    conserved::AbstractArray{T,3}; kwargs...) where {T} =
-    primitive_variables(pde(problem), conserved; kwargs...)
+                    conserved::AbstractArray{T,3};
+                    backend::ExecutionBackend = SerialBackend(), kwargs...) where {T} =
+    primitive_variables(pde(problem), conserved;
+                        backend = backend,
+                        kwargs...)
 
 primitive_variables(problem::CompressibleEulerProblem,
-                    conserved::CellField; kwargs...) =
-    primitive_variables(pde(problem), conserved; kwargs...)
+                    conserved::CellField;
+                    backend::ExecutionBackend = SerialBackend(), kwargs...) =
+    primitive_variables(pde(problem), conserved;
+                        backend = backend,
+                        kwargs...)
diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl
index ba3966f..ad3c2e5 100644
--- a/src/kernel_abstractions_support.jl
+++ b/src/kernel_abstractions_support.jl
@@ -143,6 +143,31 @@ end
     end
 end
 
+@kernel function _primitive_variables_kernel!(ρ_out, u_out, v_out, p_out,
+                                              ρc, rhouc, rhovc, Ec,
+                                              γm1, epsT)
+    i, j = @index(Global, NTuple)
+    nx, ny = size(ρc)
+    if i <= nx && j <= ny
+        ρval = ρc[i, j]
+        ρval = ρval < epsT ? epsT : ρval
+        invρ = one(ρval) / ρval
+        ux = rhouc[i, j] * invρ
+        uy = rhovc[i, j] * invρ
+        half = inv(convert(typeof(ρval), 2))
+        kinetic = half * ρval * (ux^2 + uy^2)
+        internal = Ec[i, j] - kinetic
+        internal = internal < epsT ? epsT : internal
+        p = γm1 * internal
+        p = p < epsT ? epsT : p
+
+        ρ_out[i, j] = ρval
+        u_out[i, j] = ux
+        v_out[i, j] = uy
+        p_out[i, j] = p
+    end
+end
+
 function linear_advection_rhs_kernel!(backend::KernelAbstractionsBackend,
                                       du, u,
                                       nx::Int, ny::Int,
@@ -179,6 +204,21 @@ function rk2_update_kernel!(backend::KernelAbstractionsBackend,
     return u
 end
 
+function primitive_variables_kernel!(backend::KernelAbstractionsBackend,
+                                     ρ_out, u_out, v_out, p_out,
+                                     ρc, rhouc, rhovc, Ec,
+                                     γm1, epsT)
+    device = _resolve_ka_device(backend.device)
+    kernel = backend.workgroupsize === nothing ?
+        _primitive_variables_kernel!(device) :
+        _primitive_variables_kernel!(device, backend.workgroupsize)
+    kernel(ρ_out, u_out, v_out, p_out,
+           ρc, rhouc, rhovc, Ec,
+           γm1, epsT; ndrange = size(ρc))
+    KernelAbstractions.synchronize(device)
+    return ρ_out, u_out, v_out, p_out
+end
+
 @kernel function _compressible_euler_rhs_kernel!(dρ, drhou, drhov, dE,
                                                  ρ, rhou, rhov, E,
                                                  γ, inv_dx, inv_dy)
diff --git a/src/time_integration.jl b/src/time_integration.jl
index 3853572..29595df 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -749,31 +749,8 @@ function cfl_number(problem::CompressibleEulerProblem,
     mesh_obj = mesh(problem)
     dx, dy = spacing(mesh_obj)
     γ = gamma(pde(problem))
-    u = solution(state)
-    nx, ny = size(mesh_obj)
-
-    max_ax = zero(eltype(u))
-    max_ay = zero(eltype(u))
-
-    ρ = component(u, 1)
-    rhou = component(u, 2)
-    rhov = component(u, 3)
-    E = component(u, 4)
-
-    @inbounds for j in 1:ny, i in 1:nx
-        ρval = ρ[i, j]
-        rhouval = rhou[i, j]
-        rhovval = rhov[i, j]
-        Eval = E[i, j]
-        invρ = one(ρval) / ρval
-        ux = rhouval * invρ
-        uy = rhovval * invρ
-        kinetic = 0.5 * ρval * (ux^2 + uy^2)
-        p = (γ - one(γ)) * (Eval - kinetic)
-        c = sqrt(abs(γ * p * invρ))
-        max_ax = max(max_ax, abs(ux) + c)
-        max_ay = max(max_ay, abs(uy) + c)
-    end
+    backend_obj = backend(state)
+    max_ax, max_ay = _euler_characteristics(solution(state), γ, backend_obj)
 
     dtT = float(dt)
     return dtT * (max_ax / dx + max_ay / dy)
@@ -790,12 +767,49 @@ function stable_timestep(problem::CompressibleEulerProblem,
                          cfl::Real = 0.45)
     cfl > 0 || throw(ArgumentError("CFL target must be positive"))
 
+    backend_obj = backend(state)
+    if backend_obj isa KernelAbstractionsBackend
+        return _stable_timestep_gpu(problem, state, cfl, backend_obj)
+    else
+        return _stable_timestep_cpu(problem, state, cfl)
+    end
+end
+
+function _stable_timestep_cpu(problem::CompressibleEulerProblem,
+                              state::CompressibleEulerState,
+                              cfl::Real)
     mesh_obj = mesh(problem)
     dx, dy = spacing(mesh_obj)
     γ = gamma(pde(problem))
-    u = solution(state)
-    nx, ny = size(mesh_obj)
+    max_ax, max_ay = _euler_characteristics(solution(state), γ, SerialBackend())
+
+    denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) +
+            (max_ay == 0 ? zero(Float64) : max_ay / dy)
+
+    iszero(denom) && return Inf
+
+    return float(cfl) / denom
+end
+
+function _stable_timestep_gpu(problem::CompressibleEulerProblem,
+                              state::CompressibleEulerState,
+                              cfl::Real,
+                              backend_obj::KernelAbstractionsBackend)
+    mesh_obj = mesh(problem)
+    dx, dy = spacing(mesh_obj)
+    γ = gamma(pde(problem))
+    max_ax, max_ay = _euler_characteristics(solution(state), γ, backend_obj)
 
+    denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) +
+            (max_ay == 0 ? zero(Float64) : max_ay / dy)
+
+    iszero(denom) && return Inf
+
+    return float(cfl) / denom
+end
+
+function _euler_characteristics(u, γ, ::ExecutionBackend)
+    nx, ny = size(component(u, 1))
     max_ax = zero(eltype(u))
     max_ay = zero(eltype(u))
 
@@ -819,12 +833,31 @@ function stable_timestep(problem::CompressibleEulerProblem,
         max_ay = max(max_ay, abs(uy) + c)
     end
 
-    denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) +
-            (max_ay == 0 ? zero(Float64) : max_ay / dy)
+    return float(max_ax), float(max_ay)
+end
 
-    iszero(denom) && return Inf
+function _euler_characteristics(u, γ, backend::KernelAbstractionsBackend)
+    ρ = component(u, 1)
+    rhou = component(u, 2)
+    rhov = component(u, 3)
+    E = component(u, 4)
 
-    return float(cfl) / denom
+    T = eltype(ρ)
+    γT = convert(T, γ)
+    half = inv(convert(T, 2))
+    epsT = eps(T)
+
+    ux = rhou ./ ρ
+    uy = rhov ./ ρ
+    vel2 = ux .* ux .+ uy .* uy
+    kinetic = half .* ρ .* vel2
+    internal = max.(E .- kinetic, epsT)
+    p = max.((γT - one(γT)) .* internal, epsT)
+    c = sqrt.(abs.(γT .* p ./ ρ))
+
+    max_ax = float(maximum(abs.(ux) .+ c))
+    max_ay = float(maximum(abs.(uy) .+ c))
+    return max_ax, max_ay
 end
 
 # Utility micro-kernels for Euler RHS

From 38e0c1e0708fad05dfd645b9bbce2f336fc2f184 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 17:04:31 +0200
Subject: [PATCH 16/18] Fix docstrings?

---
 docs/src/api.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/src/api.md b/docs/src/api.md
index 42212d9..b6d32b7 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -48,6 +48,7 @@ CompressibleEulerProblem
 setup_compressible_euler_problem
 CompressibleEulerState
 primitive_variables
+_primitive_variables_cpu
 ```
 
 ## Time Integration

From a6163897d3e61318e74ee8b0ec3fb24d71914bc9 Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 17:14:40 +0200
Subject: [PATCH 17/18] Fix linear advection example

---
 examples/convergence_linear_advection.jl | 3 ++-
 src/time_integration.jl                  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl
index 612e52f..f970647 100644
--- a/examples/convergence_linear_advection.jl
+++ b/examples/convergence_linear_advection.jl
@@ -52,7 +52,8 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
         run_linear_advection!(state, problem; steps = steps, dt = dt)
 
         u_field = solution(state)
-        u_num = scalar_component(u_field)
+        u_num_backend = scalar_component(u_field)
+        u_num = Array(u_num_backend)  # copy to host for error evaluation
         u_exact = _exact_solution(problem, vel, final_time, u_num)
 
         err = sqrt(sum(abs2, u_num .- u_exact) / length(u_num))
diff --git a/src/time_integration.jl b/src/time_integration.jl
index 29595df..cc42eb8 100644
--- a/src/time_integration.jl
+++ b/src/time_integration.jl
@@ -114,7 +114,7 @@ function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh,
             @inbounds for j in 1:ny, i in 1:nx
                 host[i, j] = T(init(centers_x[i], centers_y[j]))
             end
-            data .= host
+            copyto!(data, host)
         end
     else
         throw(ArgumentError("Unsupported initializer type $(typeof(init))"))

From dfa6a90dd16b4754bff7e0c74fd8c03adb518d1e Mon Sep 17 00:00:00 2001
From: Michael Schlottke-Lakemper <michael@sloede.com>
Date: Tue, 7 Oct 2025 17:19:31 +0200
Subject: [PATCH 18/18] Update docs

---
 README.md            | 50 +++++++++++++++++---------------
 docs/src/examples.md | 69 ++++++++++++++++++++++----------------------
 2 files changed, 61 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 90b53a1..31fd0aa 100644
--- a/README.md
+++ b/README.md
@@ -27,30 +27,28 @@ The repository ships with a bare `docs/` folder ready to host Documenter.jl-base
 
 ## Example simulation
 
-The `examples/linear_advection_demo.jl` script wires together mesh generation,
-problem setup, RK2 time integration, and the high-level driver. Run it from the
-repository root:
+Run the linear advection demo on the CPU:
 
 ```bash
-julia --project=. examples/linear_advection_demo.jl
+julia --project=run examples/linear_advection_demo.jl
 ```
 
-It prints periodic RMS diagnostics along with the CFL number used for the run.
-The driver defaults to the serial backend with `Float64` storage, but you can
-switch to a registered KernelAbstractions backend (dropping to `Float32` by
-default on GPUs) via keyword arguments:
+Run the same demo on a Metal-capable GPU:
 
 ```bash
-julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
+julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
 ```
 
-Override the precision explicitly with `state_eltype=Float64` if you want to run
-the GPU case in double precision (CUDA) or stay on `Float32` for Metal.
+The script wires together mesh generation, problem setup, RK2 time integration,
+and the high-level driver. It prints periodic RMS diagnostics along with the CFL
+number used for the run. Override the precision explicitly with
+`state_eltype=Float64` if you want to run the GPU case in double precision (CUDA)
+or stay on `Float32` for Metal.
 
 To capture diagnostics, pass output paths (created if missing):
 
 ```bash
-julia --project=. examples/linear_advection_demo.jl diagnostics.csv final_state.csv
+julia --project=run examples/linear_advection_demo.jl diagnostics.csv final_state.csv
 ```
 
 The first file lists sampled step/time/RMS/CFL data, and the second stores the
@@ -62,7 +60,7 @@ To render the sampled diagnostics (and optionally the final field), use the
 helper script:
 
 ```bash
-julia --project=. examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png
+julia --project=run examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png
 ```
 
 Install `Plots.jl` in your environment first (`import Pkg; Pkg.add("Plots")`).
@@ -70,10 +68,16 @@ The script falls back to a readable error if the dependency is missing.
 
 ## Convergence study
 
-To verify spatial accuracy, run the periodic convergence sweep:
+CPU run:
 
 ```bash
-julia --project=. examples/convergence_linear_advection.jl
+julia --project=run examples/convergence_linear_advection.jl
+```
+
+Metal GPU run:
+
+```bash
+julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
 ```
 
 The driver evolves a sinusoidal field across five nested grids, prints the L₂
@@ -82,20 +86,18 @@ convergence (EOC), and finishes with the average EOC. Edit
 `run_convergence_study` inside the script to adjust the final time, CFL target,
 or refinement levels.
 
-To exercise a GPU backend, reuse the `backend` keyword:
+## Kelvin-Helmholtz instability
+
+CPU run:
 
 ```bash
-julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
+julia --project=run examples/kelvin_helmholtz_euler.jl
 ```
 
-## Kelvin-Helmholtz instability
-
-The compressible Euler path is exercised by
-`examples/kelvin_helmholtz_euler.jl`, which seeds a periodic Kelvin-Helmholtz
-roll-up on a square box:
+Metal GPU run:
 
 ```bash
-julia --project=. examples/kelvin_helmholtz_euler.jl
+julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)'
 ```
 
 By default the driver uses a 256×256 mesh, RK2 time stepping with adaptive CFL
@@ -113,7 +115,7 @@ snapshot the terminal density field, and `animation_path` (MP4 or GIF) plus
 Compare the serial and GPU paths with the profiling helper:
 
 ```bash
-julia --project=. examples/profile_backends.jl
+julia --project=run examples/profile_backends.jl
 ```
 
 It benchmarks the RK2 loops for linear advection and compressible Euler across
diff --git a/docs/src/examples.md b/docs/src/examples.md
index 5078adc..5b75e0a 100644
--- a/docs/src/examples.md
+++ b/docs/src/examples.md
@@ -5,13 +5,21 @@ CurrentModule = CodexMach
 ```
 
 The `examples/` directory contains reproducible scripts that exercise the core
-numerics. You can run each script directly with Julia once the project
-environment has been instantiated (`julia --project=.`).
+numerics. You can run each script directly with Julia once the `run/` project
+environment has been instantiated (`julia --project=run`).
 
 ## Linear Advection Demo
 
+CPU:
+
+```
+julia --project=run examples/linear_advection_demo.jl
+```
+
+Metal GPU:
+
 ```
-julia --project=. examples/linear_advection_demo.jl
+julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
 ```
 
 This script transports a smooth sine blob around a rectangular periodic domain
@@ -24,63 +32,56 @@ until it returns to its starting position. By default it:
 The function returns a named tuple with the step size, final time, and diagnostic
 history. When you pass `diagnostics_path="diagnostics.csv"` it writes a CSV
 mirroring the on-screen log, and `state_path="state.csv"` captures the final
-solution for later visualisation. By default the demo runs on the serial backend
-with `Float64` storage, but you can switch to a KernelAbstractions backend while
-automatically selecting a GPU-friendly element type:
-
-```
-julia --project=. -e 'include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
-```
-
-Override the precision explicitly with `state_eltype=Float64` (for CUDA) or
-`state_eltype=Float32` if you want to minimise device memory traffic.
+solution for later visualisation. Override the precision explicitly with
+`state_eltype=Float64` (for CUDA) or `state_eltype=Float32` if you want to minimise
+device memory traffic.
 
 To plot the diagnostics or the final field, call
 
 ```
-julia --project=. examples/plot_linear_advection.jl diagnostics.csv state.csv
+julia --project=run examples/plot_linear_advection.jl diagnostics.csv state.csv
 ```
 
 which generates a PDF overlaying the analytic and numerical solutions.
 
 ## Linear Advection Convergence Study
 
+CPU:
+
 ```
-julia --project=. examples/convergence_linear_advection.jl
+julia --project=run examples/convergence_linear_advection.jl
 ```
 
-The convergence driver repeats the sinusoidal advection problem on a hierarchy
-of meshes. It prints the L² error for each resolution along with the observed
-order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once
-the mesh is sufficiently fine. To assess GPU correctness, pass a backend symbol
-when calling the driver:
+Metal GPU:
 
 ```
-julia --project=. -e 'include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
+julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
 ```
 
-The serial path remains the default, so simple invocations continue to run on
-the CPU without any extra keywords.
+The convergence driver repeats the sinusoidal advection problem on a hierarchy
+of meshes. It prints the L² error for each resolution along with the observed
+order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once
+the mesh is sufficiently fine. The serial path remains the default, so simple
+invocations continue to run on the CPU without any extra keywords.
 
 ## Kelvin–Helmholtz Instability (Compressible Euler)
 
+CPU:
+
 ```
-julia --project=. examples/kelvin_helmholtz_euler.jl
+julia --project=run examples/kelvin_helmholtz_euler.jl
 ```
 
-This example launches a Kelvin–Helmholtz roll-up on a periodic square domain
-using the compressible Euler equations and the slope-limited Rusanov fluxes. The
-script logs CFL numbers, tracks the volume-averaged kinetic energy, and can
-produce publication-ready figures. To exercise GPU execution, pass
-`backend=:metal` (or `:cuda` if available); the helper will automatically fall
-back to `Float32` storage on GPU backends while keeping `Float64` on the serial
-path:
+Metal GPU:
 
 ```
-julia --project=. -e 'include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)'
+julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)'
 ```
 
-Further output customisation includes:
+This example launches a Kelvin–Helmholtz roll-up on a periodic square domain
+using the compressible Euler equations and the slope-limited Rusanov fluxes. The
+script logs CFL numbers, tracks the volume-averaged kinetic energy, and can
+produce publication-ready figures. Further output customisation includes:
 
 - set `diagnostics_path="diagnostics.csv"` to capture the per-step log;
 - set `pdf_path="khi_density.pdf"` for a static density snapshot at `t = final_time`;
@@ -94,7 +95,7 @@ the reference figures checked into the repository (`khi_*.pdf/mp4`).
 ## Backend Profiling
 
 ```
-julia --project=. examples/profile_backends.jl
+julia --project=run examples/profile_backends.jl
 ```
 
 This utility benchmarks the serial driver against available KernelAbstractions