4 changes: 4 additions & 0 deletions .github/actionlint.yaml
@@ -0,0 +1,4 @@
self-hosted-runner:
labels:
# Custom label for GPU-enabled self-hosted runners
- gpu
Contributor:
What does this do?

Contributor Author:
That's to address this linting error: https://github.com/firedrakeproject/firedrake/actions/runs/22885137596/job/66433010084. I will add a comment to the file to that effect.

143 changes: 143 additions & 0 deletions .github/workflows/core.yml
@@ -23,6 +23,10 @@ on:
description: Whether to test using macOS
type: boolean
default: false
test_gpu:
description: Whether to test using CUDA-enabled PETSc
type: boolean
default: false
deploy_website:
description: Whether to deploy the website
type: boolean
@@ -54,6 +58,10 @@ on:
description: Whether to test using macOS
type: boolean
default: false
test_gpu:
description: Whether to test using CUDA-enabled PETSc
type: boolean
default: false
deploy_website:
description: Whether to deploy the website
type: boolean
@@ -465,6 +473,141 @@ jobs:
run: |
find . -delete

test_gpu:
name: Build and test Firedrake (Linux CUDA)
runs-on: [self-hosted, Linux, gpu]
container:
image: ubuntu:latest
options: --gpus all
if: inputs.test_gpu
env:
OMPI_ALLOW_RUN_AS_ROOT: 1
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
OMP_NUM_THREADS: 1
OPENBLAS_NUM_THREADS: 1
FIREDRAKE_CI: 1
PYOP2_SPMD_STRICT: 1
# Disable fast math as it exposes compiler bugs
PYOP2_CFLAGS: -fno-fast-math
# NOTE: One should occasionally update test_durations.json by running
# 'make test_durations' inside a 'firedrake:latest' Docker image.
EXTRA_PYTEST_ARGS: --splitting-algorithm least_duration --timeout=600 --timeout-method=thread -o faulthandler_timeout=660 --durations-path=./firedrake-repo/tests/test_durations.json --durations=50
PYTEST_MPI_MAX_NPROCS: 8
PETSC_OPTIONS: -use_gpu_aware_mpi 0
EXTRA_OPTIONS: -use_gpu_aware_mpi 0
steps:
- name: Confirm Nvidia GPUs are enabled
# The presence of the nvidia-smi command indicates that the Nvidia drivers
# have successfully been imported into the container; there is no point
# continuing if nvidia-smi is not present
run: nvidia-smi

- name: Fix HOME
# For unknown reasons GitHub Actions overwrites HOME to /github/home,
# which will break everything unless fixed
# (https://github.com/actions/runner/issues/863)
run: echo "HOME=/root" >> "$GITHUB_ENV"


# Git is needed for actions/checkout and Python for firedrake-configure
# curl needed for adding new deb repositories to ubuntu
- name: Install system dependencies (1)
run: |
apt-get update
apt-get -y install git python3 curl


- name: Pre-run cleanup
# Make sure the current directory is empty
run: find . -delete

- uses: actions/checkout@v5
with:
path: firedrake-repo
ref: ${{ inputs.source_ref }}

- name: Add Nvidia CUDA deb repositories
run: |
deburl=$( python3 ./firedrake-repo/scripts/firedrake-configure --show-extra-repo-pkg-url --gpu-arch cuda )
Contributor:
This process definitely seems better.

debfile=$( basename "${deburl}" )
curl -fsSLO "${deburl}"
dpkg -i "${debfile}"
apt-get update

- name: Install system dependencies (2)
run: |
apt-get -y install \
$(python3 ./firedrake-repo/scripts/firedrake-configure --arch default --gpu-arch cuda --show-system-packages)
apt-get -y install python3-venv
: # Dependencies needed to run the test suite
apt-get -y install fonts-dejavu graphviz graphviz-dev parallel poppler-utils

- name: Install PETSc
run: |
if [ ${{ inputs.target_branch }} = 'release' ]; then
git clone --depth 1 \
--branch $(python3 ./firedrake-repo/scripts/firedrake-configure --gpu-arch cuda --show-petsc-version) \
https://gitlab.com/petsc/petsc.git
else
git clone --depth 1 https://gitlab.com/petsc/petsc.git
fi
cd petsc
python3 ../firedrake-repo/scripts/firedrake-configure \
--arch default --gpu-arch cuda --show-petsc-configure-options | \
xargs -L1 ./configure --with-make-np=4
make
make check
{
echo "PETSC_DIR=/__w/firedrake/firedrake/petsc"
echo "PETSC_ARCH=arch-firedrake-default-cuda"
echo "SLEPC_DIR=/__w/firedrake/firedrake/petsc/arch-firedrake-default-cuda"
} >> "$GITHUB_ENV"

- name: Install Firedrake
id: install
run: |
export $(python3 ./firedrake-repo/scripts/firedrake-configure --arch default --gpu-arch cuda --show-env)
python3 -m venv venv
. venv/bin/activate

: # Empty the pip cache to ensure that everything is compiled from scratch
pip cache purge

if [ ${{ inputs.target_branch }} = 'release' ]; then
EXTRA_PIP_FLAGS=''
else
: # Install build dependencies
pip install "$PETSC_DIR"/src/binding/petsc4py
pip install -r ./firedrake-repo/requirements-build.txt

: # We have to pass '--no-build-isolation' to use a custom petsc4py
EXTRA_PIP_FLAGS='--no-build-isolation'
fi

pip install --verbose $EXTRA_PIP_FLAGS \
--no-binary h5py \
'./firedrake-repo[check]'

firedrake-clean
pip list

- name: Run smoke tests
run: |
. venv/bin/activate
firedrake-check
timeout-minutes: 10

- name: Verify GPU usage
run: |
. venv/bin/activate
export PETSC_OPTIONS="${PETSC_OPTIONS} -log_view_gpu_time -log_view"
python3 ./firedrake-repo/tests/firedrake/offload/test_poisson_offloading_pc.py
Contributor:
What is this doing?

Contributor Author:
This is a final sanity check on my end: I wanted to confirm that the GPU was being used as expected. The final column of the profiling output (https://github.com/firedrakeproject/firedrake/actions/runs/23132801814/job/67189609416) shows the fraction of work done on the GPU, and I wanted to make sure that the low-level vector and matrix operations were actually happening there, which they are. Besides, this change is in a 'DROP BEFORE MERGE' commit, so it will be going away.


- name: Post-run cleanup
if: always()
run: |
find . -delete

lint:
name: Lint codebase
runs-on: ubuntu-latest
2 changes: 2 additions & 0 deletions .github/workflows/pr.yml
@@ -12,4 +12,6 @@ jobs:
target_branch: ${{ github.base_ref }}
# Only run macOS tests if the PR is labelled 'macOS'
test_macos: ${{ contains(github.event.pull_request.labels.*.name, 'macOS') }}
# Only run GPU tests if the PR is labelled 'gpu'
test_gpu: ${{ contains(github.event.pull_request.labels.*.name, 'gpu') }}
secrets: inherit
2 changes: 1 addition & 1 deletion firedrake/__init__.py
@@ -88,7 +88,7 @@ def init_petsc():
ASMLinesmoothPC, ASMExtrudedStarPC, AssembledPC, AuxiliaryOperatorPC,
MassInvPC, PCDPC, PatchPC, PlaneSmoother, PatchSNES, P1PC, P1SNES,
LORPC, GTMGPC, PMGPC, PMGSNES, HypreAMS, HypreADS, FDMPC,
PoissonFDMPC, TwoLevelPC, HiptmairPC, FacetSplitPC, BDDCPC
PoissonFDMPC, TwoLevelPC, HiptmairPC, FacetSplitPC, BDDCPC, OffloadPC
)
from firedrake.mesh import ( # noqa: F401
Mesh, ExtrudedMesh, VertexOnlyMesh, RelabeledMesh,
1 change: 1 addition & 0 deletions firedrake/preconditioners/__init__.py
@@ -9,6 +9,7 @@
AssembledPC, AuxiliaryOperatorPC
)
from firedrake.preconditioners.massinv import MassInvPC # noqa: F401
from firedrake.preconditioners.offload import OffloadPC # noqa: F401
from firedrake.preconditioners.pcd import PCDPC # noqa: F401
from firedrake.preconditioners.patch import ( # noqa: F401
PatchPC, PlaneSmoother, PatchSNES
99 changes: 99 additions & 0 deletions firedrake/preconditioners/offload.py
@@ -0,0 +1,99 @@
from firedrake.preconditioners.assembled import AssembledPC
from firedrake.petsc import PETSc
from firedrake.utils import device_matrix_type
from firedrake.logging import logger
from functools import cache
import warnings

import firedrake.dmhooks as dmhooks

__all__ = ("OffloadPC",)


@cache
def offload_mat_type(pc_comm_rank) -> str | None:
mat_type = device_matrix_type()
if mat_type is None:
if pc_comm_rank == 0:
warnings.warn(
Contributor:
For Python warnings there is a difference between warnings.warn and logger.warning. The former indicates a genuine problem and the latter indicates a likely problem. I wonder if the first warning here should be a warnings.warn and the second a logger.warning.

Contributor Author:
Thanks, I wasn't aware of the distinction. Ironic, given that if you call logging.captureWarnings(True), the function used by warnings.warn to show the message is monkey-patched to one that calls logging.warning. Will update accordingly.

"This installation of Firedrake is not GPU-enabled, therefore OffloadPC "
"will do nothing. For this preconditioner to function correctly PETSc "
"will need to be rebuilt with some GPU capability (e.g. '--with-cuda=1')."
)
return None
try:
dev = PETSc.Device.create()
except PETSc.Error:
if pc_comm_rank == 0:
logger.warning(
"This installation of Firedrake is GPU-enabled, but no GPU device has "
"been detected. OffloadPC will do nothing on this host."
)
return None
if dev.getDeviceType() == "HOST":
raise RuntimeError(
"A GPU-enabled Firedrake build has been detected, and GPU hardware has "
"been detected, but a GPU device was unable to be initialised."
)
dev.destroy()
return mat_type
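The captureWarnings behaviour mentioned in the review thread can be demonstrated in isolation with the standard library alone; this minimal sketch (the handler and message are illustrative) shows warnings.warn being rerouted through the 'py.warnings' logger:

```python
import logging
import warnings

# logging.captureWarnings(True) reroutes warnings.warn messages to the
# 'py.warnings' logger instead of the usual stderr display hook.
logging.captureWarnings(True)

records = []

class ListHandler(logging.Handler):
    # Collect every record the 'py.warnings' logger emits
    def emit(self, record):
        records.append(record)

py_warnings_logger = logging.getLogger("py.warnings")
py_warnings_logger.addHandler(ListHandler())
py_warnings_logger.setLevel(logging.WARNING)

warnings.warn("no GPU device detected")

print(len(records))                                          # 1
print("no GPU device detected" in records[0].getMessage())   # True
```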


class OffloadPC(AssembledPC):
Contributor:
Should the name be more precise and refer to "GPU"?

"""Offload PC from CPU to GPU and back.

Internally this makes a PETSc PC object that can be controlled by
options using the extra options prefix ``offload_``.
"""

_prefix = "offload_"

def initialize(self, pc):
# Check if our PETSc installation is GPU enabled
super().initialize(pc)
Contributor:
Should we be calling AssembledPC.initialize()? It seems to me that this will trigger reassembly and we are not really making use of it.

Contributor Author:
I'm working my way through understanding how PETSc manages GPU devices, and it does appear that when I drop super().initialize(pc) I get:

[0] MatConvert() at /g/data/fp50/admin/gpu-testing/apps/petsc/3.24.4/src/mat/interface/matrix.c:4390
[0] Object is in wrong state
[0] Not for unassembled matrix

So the matrix does have to be assembled, but the previous PC might have performed assembly, in which case it wouldn't be necessary. I think this might be an argument in favour of having device offload as an optional step in AssembledPC?

self.offload_mat_type = offload_mat_type(pc.comm.rank)
if self.offload_mat_type is not None:
with PETSc.Log.Event("Event: initialize offload"):
A, P = pc.getOperators()

# Convert matrix to the device type (e.g. aijcusparse)
with PETSc.Log.Event("Event: matrix offload"):
P_cu = P.convert(self.offload_mat_type) # todo

# Transfer nullspace
P_cu.setNullSpace(P.getNullSpace())
P_cu.setTransposeNullSpace(P.getTransposeNullSpace())
P_cu.setNearNullSpace(P.getNearNullSpace())

# Update preconditioner with GPU matrix
self.pc.setOperators(A, P_cu)
Contributor (@pbrubeck, Mar 10, 2026):
P lives in the GPU but A still lives in the CPU. When pc_type = ksp, the more sensible thing would be to have both A and P in the GPU, don't you agree?

Contributor (@pbrubeck, Mar 10, 2026):
Is this because A cannot live in the GPU if it is matfree? Do we benefit much from A being matfree if P is not matfree?

Contributor Author (@dsroberts, Mar 17, 2026):
I genuinely don't know. On the surface it makes sense, but are there cases in which A or P can't be offloaded? Do we need to do an if not isinstance(A, ImplicitMatrix): check before attempting to offload?

Contributor:
We certainly want PETSc options to give the user finer control over what to offload. If the original A and P point to the same Mat instance then it is reasonable to offload them both (as the same instance). This could be the default.

# Convert vectors to CUDA, solve, and copy the solution back to the CPU
def apply(self, pc, x, y):
if self.offload_mat_type is None:
self.pc.apply(x, y)
else:
with PETSc.Log.Event("Event: apply offload"):
dm = pc.getDM()
with dmhooks.add_hooks(dm, self, appctx=self._ctx_ref):
with PETSc.Log.Event("Event: vectors offload"):
y_cu = PETSc.Vec() # begin
y_cu.createCUDAWithArrays(y)
Comment on lines +80 to +81
Contributor:
Suggested change
- y_cu = PETSc.Vec()  # begin
- y_cu.createCUDAWithArrays(y)
+ y_cu = PETSc.Vec().createCUDAWithArrays(y)

x_cu = PETSc.Vec()
# Passing a vec into another vec doesn't work because original is locked
x_cu.createCUDAWithArrays(x.array_r)
Comment on lines +82 to +84
Contributor:
Suggested change
- x_cu = PETSc.Vec()
- # Passing a vec into another vec doesn't work because original is locked
- x_cu.createCUDAWithArrays(x.array_r)
+ # Passing a vec into another vec doesn't work because original is locked
+ x_cu = PETSc.Vec().createCUDAWithArrays(x.array_r)

with PETSc.Log.Event("Event: solve"):
self.pc.apply(x_cu, y_cu)
# Touch the host array to synchronize the vector back from the device
tmp = y_cu.array_r # noqa: F841
Contributor:
Suggested change
- tmp = y_cu.array_r  # noqa: F841
+ y_cu.array_r

with PETSc.Log.Event("Event: vectors copy back"):
y.copy(y_cu)

def applyTranspose(self, pc, X, Y):
raise NotImplementedError

def view(self, pc, viewer=None):
super().view(pc, viewer)
if hasattr(self, "pc"):
viewer.printfASCII("PC to solve on GPU\n")
self.pc.view(viewer)
8 changes: 8 additions & 0 deletions firedrake/utils.py
@@ -23,6 +23,14 @@
SLATE_SUPPORTS_COMPLEX = False


def device_matrix_type() -> str | None:
_device_mat_type_map = {"cuda": "aijcusparse"}
for device, mat_type in _device_mat_type_map.items():
if device in petsctools.get_external_packages():
return mat_type
return None
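The lookup in device_matrix_type can be exercised without a PETSc build by injecting the external-package list instead of querying petsctools; the function name device_matrix_type_from is illustrative, not part of the PR:

```python
# Illustrative stand-in for device_matrix_type(): the real helper queries
# petsctools.get_external_packages(); here the package list is a parameter
# so the mapping can be tested without a PETSc installation.
_DEVICE_MAT_TYPE_MAP = {"cuda": "aijcusparse"}

def device_matrix_type_from(external_packages):
    # Return the device matrix type for the first supported device found
    for device, mat_type in _DEVICE_MAT_TYPE_MAP.items():
        if device in external_packages:
            return mat_type
    return None

print(device_matrix_type_from(["hypre", "cuda"]))  # aijcusparse
print(device_matrix_type_from(["hypre"]))          # None
```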


def _new_uid(comm):
uid = comm.Get_attr(FIREDRAKE_UID)
if uid is None:
2 changes: 2 additions & 0 deletions scripts/firedrake-check
@@ -23,6 +23,8 @@ TESTS = {
"tests/firedrake/regression/test_matrix_free.py::test_fieldsplitting[parameters3-cofunc_rhs-variational]",
# near nullspace
"tests/firedrake/regression/test_nullspace.py::test_near_nullspace",
# GPU offload
"tests/firedrake/offload/test_poisson_offloading_pc.py::test_poisson_offload"
),
2: (
# HDF5/checkpointing