diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 8e4cd7d..e14e19e 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -26,9 +26,10 @@ jobs:
       matrix:
         version:
           - "1"
-          - "lts"
+          # - "lts"
         os:
           - ubuntu-latest
+          - windows-latest
           - macos-latest
     uses: LuxDL/Lux.jl/.github/workflows/CommonCI.yml@main
     with:
@@ -42,35 +43,3 @@ jobs:
       julia_version: "1.11"
       downgrade_testing: true
       project: "."
-
-  invalidations:
-    # Only run on PRs to the default branch.
-    # In the PR trigger above branches can be specified only explicitly whereas this check should work for master, main, or any other default branch
-    if: github.base_ref == github.event.repository.default_branch
-    runs-on: ubuntu-latest
-    steps:
-      - uses: julia-actions/setup-julia@v2
-        with:
-          version: "1"
-      - uses: actions/checkout@v6
-      - uses: julia-actions/julia-buildpkg@v1
-      - uses: julia-actions/julia-invalidations@v1
-        id: invs_pr
-
-      - uses: actions/checkout@v6
-        with:
-          ref: ${{ github.event.repository.default_branch }}
-      - uses: julia-actions/julia-buildpkg@v1
-      - uses: julia-actions/julia-invalidations@v1
-        id: invs_default
-
-      - name: Report invalidation counts
-        run: |
-          echo "Invalidations on default branch: ${{ steps.invs_default.outputs.total }} (${{ steps.invs_default.outputs.deps }} via deps)" >> $GITHUB_STEP_SUMMARY
-          echo "This branch: ${{ steps.invs_pr.outputs.total }} (${{ steps.invs_pr.outputs.deps }} via deps)" >> $GITHUB_STEP_SUMMARY
-      - name: Check if the PR does increase number of invalidations
-        if: steps.invs_pr.outputs.total > steps.invs_default.outputs.total
-        run: exit 1
-
-env:
-  BACKEND_GROUP: "CPU"
diff --git a/.github/workflows/QualityCheck.yml b/.github/workflows/QualityCheck.yml
index 0f2086f..2088103 100644
--- a/.github/workflows/QualityCheck.yml
+++ b/.github/workflows/QualityCheck.yml
@@ -3,14 +3,6 @@ name: Code Quality Check
 on: [pull_request]
 
 jobs:
-  code-style:
-    name: Format Suggestions
-    runs-on: ubuntu-latest
-    steps:
-      - uses: julia-actions/julia-format@v4
-        with:
-          version: "1"
-
   typos-check:
     name: Spell Check with Typos
     runs-on: ubuntu-latest
diff --git a/Project.toml b/Project.toml
index 1bb9795..d7981d4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "NeuralOperators"
 uuid = "ea5c82af-86e5-48da-8ee1-382d6ad7af4b"
 authors = ["Avik Pal <avikpal@mit.edu>"]
-version = "0.6.2"
+version = "0.6.3"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
@@ -16,10 +16,14 @@ WeightInitializers = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d"
 [compat]
 AbstractFFTs = "1.5.0"
 ConcreteStructs = "0.2.3"
-Lux = "1.13"
-LuxCore = "1.2"
-LuxLib = "1.8"
-NNlib = "0.9.30"
+Lux = "1.31.3"
+LuxCore = "1.5.1"
+LuxLib = "1.15.1"
+NNlib = "0.9.33"
 Random = "1.10"
-WeightInitializers = "1"
+WeightInitializers = "1.3"
 julia = "1.10"
+
+[workspace]
+projects = ["test", "docs"]
+
diff --git a/docs/Project.toml b/docs/Project.toml
index 7667376..95690e0 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -6,6 +6,7 @@ CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Git = "d7ba0133-e1db-5d97-8f8c-041e4b3a1eb2"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
@@ -14,12 +15,13 @@ Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
+Scratch = "6c6a2e73-6563-6170-7368-637461726353"
 
-[sources.NeuralOperators]
-path = ".."
+[sources]
+NeuralOperators = {path = ".."}
 
 [compat]
-AlgebraOfGraphics = "0.10.7, 0.11"
+AlgebraOfGraphics = "0.10.7, 0.11, 0.12"
 CSV = "0.10"
 CairoMakie = "0.13, 0.14, 0.15"
 CondaPkg = "0.2.23"
@@ -27,7 +29,7 @@ DataDeps = "0.7.13"
 DataFrames = "1"
 Documenter = "1.7.0"
 Lux = "1"
-MAT = "0.10.7"
+MAT = "0.10.7, 0.11"
 MLUtils = "0.4.4"
 NeuralOperators = "0.6"
 Optimisers = "0.4"
diff --git a/docs/make.jl b/docs/make.jl
index dd7f093..4b9eec4 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,7 +1,14 @@
 using Documenter, NeuralOperators
 
-cp("./docs/Manifest.toml", "./docs/src/assets/Manifest.toml"; force = true)
-cp("./docs/Project.toml", "./docs/src/assets/Project.toml"; force = true)
+# Anchor every path at this file so the build does not depend on the working directory.
+let root = dirname(@__DIR__), assets = joinpath(@__DIR__, "src", "assets")
+    # Prefer the manifest at the repository root and fall back to a docs-local one;
+    # when neither exists nothing is copied.
+    found = filter(isfile, joinpath.([root, @__DIR__], "Manifest.toml"))
+    isempty(found) || cp(first(found), joinpath(assets, "Manifest.toml"); force = true)
+    project = joinpath(@__DIR__, "Project.toml")
+    isfile(project) && cp(project, joinpath(assets, "Project.toml"); force = true)
+end
 
 ENV["GKSwstype"] = "100"
 ENV["DATADEPS_ALWAYS_ACCEPT"] = true
diff --git a/docs/src/tutorials/burgers_deeponet.md b/docs/src/tutorials/burgers_deeponet.md
index 12581e0..f77983e 100644
--- a/docs/src/tutorials/burgers_deeponet.md
+++ b/docs/src/tutorials/burgers_deeponet.md
@@ -3,9 +3,8 @@
 ## Data Loading
 
 ```@example burgers
-using DataDeps, MAT, MLUtils
+using MAT, MLUtils, Printf, DataDeps
 using PythonCall, CondaPkg # For `gdown`
-using Printf
 
 const gdown = pyimport("gdown")
 
diff --git a/docs/src/tutorials/double_pendulum.gif b/docs/src/tutorials/double_pendulum.gif
index b1b57b4..008823e 100644
Binary files a/docs/src/tutorials/double_pendulum.gif and b/docs/src/tutorials/double_pendulum.gif differ
diff --git a/docs/src/tutorials/double_pendulum.md b/docs/src/tutorials/double_pendulum.md
index 665e10c..d2ef14c 100644
--- a/docs/src/tutorials/double_pendulum.md
+++ b/docs/src/tutorials/double_pendulum.md
@@ -3,37 +3,27 @@
 ## Data Loading
 
 ```julia
-using DataDeps, CSV, MLUtils, DataFrames
+using CSV, MLUtils, DataFrames, Git, Scratch
 using Printf
 
-register(
-    DataDep(
-        "DoublePendulumChaotic",
-        """
-        Dataset was generated on the basis of 21 individual runs of a double pendulum.
-        Each of the recorded sequences lasted around 40s and consisted of around 17500 frames.
-
-          * `x_red`: Horizontal pixel coordinate of the red point (the central pivot to the
-            first pendulum)
-          * `y_red`: Vertical pixel coordinate of the red point (the central pivot to the first
-            pendulum)
-          * `x_green`: Horizontal pixel coordinate of the green point (the first pendulum)
-          * `y_green`: Vertical pixel coordinate of the green point (the first pendulum)
-          * `x_blue`: Horizontal pixel coordinate of the blue point (the second pendulum)
-          * `y_blue`: Vertical pixel coordinate of the blue point (the second pendulum)
-
-        Page: https://developer.ibm.com/exchanges/data/all/double-pendulum-chaotic/
-        """,
-        "https://dax-cdn.cdn.appdomain.cloud/dax-double-pendulum-chaotic/2.0.1/double-pendulum-chaotic.tar.gz",
-        "4ca743b4b783094693d313ebedc2e8e53cf29821ee8b20abd99f8fb4c0866f8d";
-        post_fetch_method=unpack,
-    ),
-)
+# The dataset ships inside the tiny-differentiable-simulator repository.
+# Clone it once into a scratch space and return the path of its
+# `data/ibm-double-pendulum` directory.
+function get_dataset_path()
+    scratch_path = @get_scratch!("double_pendulum")
+    dataset_path = joinpath(scratch_path, "data", "ibm-double-pendulum")
+    if !isdir(dataset_path)
+        repo_url = "https://github.com/erwincoumans/tiny-differentiable-simulator"
+        # Only the checked-out files are read, so skip the repository history.
+        run(Git.git(["clone", "--depth", "1", repo_url, scratch_path]))
+    end
+    return dataset_path
+end
 
 function get_data(; i=0, n=-1)
-    data_path = joinpath(datadep"DoublePendulumChaotic", "original", "dpc_dataset_csv")
+    data_path = get_dataset_path()
     df = CSV.read(
-        joinpath(data_path, "$i.csv"),
+        joinpath(data_path, "original", "dpc_dataset_csv", "$i.csv"),
         DataFrame;
         header=[:x_red, :y_red, :x_green, :y_green, :x_blue, :y_blue],
     )
@@ -112,14 +102,9 @@ function train_model!(model, ps, st, trainloader, testloader; epochs=20)
     train_state = Training.TrainState(model, ps, st, AdamW(; eta=3.0f-4, lambda=1.0f-5))
 
     (xtest, ytest) = first(testloader)
-    prediction_loss_compiled = Reactant.with_config(;
-        convolution_precision=PrecisionConfig.HIGH,
-        dot_general_precision=PrecisionConfig.HIGH,
-    ) do
-        @compile prediction_loss(
-            model, xtest, train_state.parameters, train_state.states, ytest
-        )
-    end
+    prediction_loss_compiled = @compile prediction_loss(
+        model, xtest, train_state.parameters, train_state.states, ytest
+    )
 
     for epoch in 1:epochs
         for data in trainloader
@@ -167,15 +152,8 @@ n = 5
 inferenced_data = x_data[:, :, :, 1:1]
 for i in 1:n
     input_data = inferenced_data[:, :, :, i:i] |> xdev
-    prediction = first(
-        Reactant.with_config(;
-            convolution_precision=PrecisionConfig.HIGH,
-            dot_general_precision=PrecisionConfig.HIGH,
-        ) do
-            @jit fno(input_data, ps_trained, st_trained)
-        end,
-    )
-    inferenced_data = cat(inferenced_data, cdev(prediction); dims=4)
+    prediction = @jit fno(input_data, ps_trained, st_trained)
+    inferenced_data = cat(inferenced_data, cdev(prediction[1]); dims=4)
 end
 inferenced_data = cat([inferenced_data[:, :, :, i] for i in 1:n]...; dims=1)[:, :, 1]'
 
diff --git a/docs/src/tutorials/poisson_equation.md b/docs/src/tutorials/poisson_equation.md
index 294e825..17e600c 100644
--- a/docs/src/tutorials/poisson_equation.md
+++ b/docs/src/tutorials/poisson_equation.md
@@ -94,9 +94,9 @@ function predict(model, f_input, x_input, ps, st)
     return vec(pred .* max_u)
 end
 
-compiled_predict_fn = Reactant.with_config(; dot_general_precision=PrecisionConfig.HIGH) do
-    @compile predict(deeponet, f_data[:, 1:1], x_data, ps_trained, st_trained)
-end
+compiled_predict_fn = @compile predict(
+    deeponet, f_data[:, 1:1], x_data, ps_trained, st_trained
+)
 
 # Testing and visualization
 begin
diff --git a/test/Project.toml b/test/Project.toml
index 20701c0..5f960fa 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,39 +1,39 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
-FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
+FastTransforms = "057dd010-8810-581a-b7be-e3fc3b93f78c"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
 LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
 LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
+NeuralOperators = "ea5c82af-86e5-48da-8ee1-382d6ad7af4b"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-ReTestItems = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
-Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+
+[sources]
+NeuralOperators = {path = ".."}
+Reactant = {url = "https://github.com/EnzymeAD/Reactant.jl", rev = "ap/fix_complex_fd"}
 
 [compat]
 Aqua = "0.8.7"
 Documenter = "1.5.0"
 Enzyme = "0.13.48"
 ExplicitImports = "1.9.0"
-FFTW = "1.9.0"
-Hwloc = "3.2"
 Lux = "1"
 LuxCore = "1"
 LuxLib = "1.2"
 LuxTestUtils = "1.1.2, 2"
 Optimisers = "0.4"
+ParallelTestRunner = "2.1"
 Random = "1.10"
-ReTestItems = "1.24.0"
-Reactant = "0.2.130"
-Reexport = "1.2.2"
+Reactant = "0.2.203"
 StableRNGs = "1.0.2"
 Test = "1.10"
-Zygote = "0.7"
diff --git a/test/doctests.jl b/test/doctests.jl
new file mode 100644
index 0000000..5fe0e0e
--- /dev/null
+++ b/test/doctests.jl
@@ -0,0 +1,11 @@
+using NeuralOperators, Test, Documenter, FastTransforms
+
+@testset "Doctests: Quality Assurance" begin
+    DocMeta.setdocmeta!(
+        NeuralOperators,
+        :DocTestSetup,
+        :(using Lux, NeuralOperators, Random);
+        recursive = true,
+    )
+    doctest(NeuralOperators; manual = false)
+end
diff --git a/test/fno_tests.jl b/test/fno_tests.jl
deleted file mode 100644
index 41cded5..0000000
--- a/test/fno_tests.jl
+++ /dev/null
@@ -1,73 +0,0 @@
-@testitem "Fourier Neural Operator" setup = [SharedTestSetup] begin
-    rng = StableRNG(12345)
-
-    setups = [
-        (
-            modes = (16,),
-            chs = (2, 64, 64, 64, 64, 64, 128, 1),
-            x_size = (1024, 2, 5),
-            y_size = (1024, 1, 5),
-            shift = false,
-        ),
-        (
-            modes = (16, 16),
-            chs = (2, 64, 64, 64, 64, 64, 128, 4),
-            x_size = (32, 32, 2, 5),
-            y_size = (32, 32, 4, 5),
-            shift = false,
-        ),
-        (
-            modes = (16, 16),
-            chs = (2, 64, 64, 64, 64, 64, 128, 4),
-            x_size = (32, 32, 2, 5),
-            y_size = (32, 32, 4, 5),
-            shift = true,
-        ),
-    ]
-
-    @testset "$(length(setup.modes))D | shift=$(setup.shift)" for setup in setups
-        fno = FourierNeuralOperator(; setup.chs, setup.modes, setup.shift)
-        display(fno)
-        ps, st = Lux.setup(rng, fno)
-
-        x = rand(rng, Float32, setup.x_size...)
-        y = rand(rng, Float32, setup.y_size...)
-
-        @test size(first(fno(x, ps, st))) == setup.y_size
-
-        ps_ra, st_ra = (ps, st) |> reactant_device()
-        x_ra, y_ra = (x, y) |> reactant_device()
-
-        res = first(fno(x, ps, st))
-        res_ra, _ = Reactant.with_config(;
-            dot_general_precision = PrecisionConfig.HIGH,
-            convolution_precision = PrecisionConfig.HIGH,
-        ) do
-            @jit fno(x_ra, ps_ra, st_ra)
-        end
-        @test res_ra ≈ res atol = 1.0f-2 rtol = 1.0f-2
-
-        @test begin
-            l2, l1 = train!(
-                MSELoss(), AutoEnzyme(), fno, ps_ra, st_ra, [(x_ra, y_ra)]; epochs = 10
-            )
-            l2 < l1
-        end
-
-        @testset "check gradients" begin
-            ∂x_zyg, ∂ps_zyg = zygote_gradient(fno, x, ps, st)
-
-            ∂x_ra, ∂ps_ra = Reactant.with_config(;
-                dot_general_precision = PrecisionConfig.HIGH,
-                convolution_precision = PrecisionConfig.HIGH,
-            ) do
-                @jit enzyme_gradient(fno, x_ra, ps_ra, st_ra)
-            end
-            ∂x_ra, ∂ps_ra = (∂x_ra, ∂ps_ra) |> cpu_device()
-
-            # TODO: is zygote off here?
-            @test ∂x_zyg ≈ ∂x_ra atol = 1.0f-2 rtol = 1.0f-2 skip = setup.shift
-            @test check_approx(∂ps_zyg, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2) skip = setup.shift
-        end
-    end
-end
diff --git a/test/layers/layers_testsetup.jl b/test/layers/layers_testsetup.jl
new file mode 100644
index 0000000..f0a910a
--- /dev/null
+++ b/test/layers/layers_testsetup.jl
@@ -0,0 +1,52 @@
+using NeuralOperators, Test
+
+include("../shared_testsetup.jl")
+
+# Problem sizes shared by the spectral layer tests: 1D, 2D, and 2D with `shift` enabled.
+const LAYERS_SETUPS = [
+    (; m = (4,), x_size = (8, 2, 2), y_size = (8, 4, 2), shift = false),
+    (; m = (4, 4), x_size = (8, 8, 1, 2), y_size = (8, 8, 4, 2), shift = false),
+    (; m = (4, 4), x_size = (8, 8, 1, 2), y_size = (8, 8, 4, 2), shift = true),
+]
+
+# Exercise the spectral layer constructor `op` on every entry of `setups`.
+#
+# Each setup builds a `Chain` of a size-1-kernel `Conv` followed by `op`, then checks:
+#   * the output shape of the plain forward pass,
+#   * that the Reactant-compiled forward pass matches it, and
+#   * that Enzyme gradients match finite-difference gradients (both run through Reactant).
+function run_op_tests(op, setups)
+    rng = StableRNG(12345)
+    xdev = reactant_device(; force = true)
+
+    @testset "$(length(setup.m))D | shift=$(setup.shift)" for setup in setups
+        in_chs = setup.x_size[end - 1]
+        out_chs = setup.y_size[end - 1]
+        ch = 4 => out_chs
+
+        l1 = Conv(ntuple(_ -> 1, length(setup.m)), in_chs => first(ch))
+        m = Chain(l1, op(ch, setup.m; setup.shift))
+        display(m)
+        ps, st = Lux.setup(rng, m)
+
+        x = rand(rng, Float32, setup.x_size...)
+        # One reference forward pass serves both the shape check and the comparison below.
+        res = first(m(x, ps, st))
+        @test size(res) == setup.y_size
+
+        ps_ra, st_ra = xdev((ps, st))
+        x_ra = xdev(x)
+
+        res_ra, _ = @jit m(x_ra, ps_ra, st_ra)
+        @test res_ra ≈ res atol = 1.0f-2 rtol = 1.0f-2
+
+        @testset "check gradients" begin
+            ∂x_fd, ∂ps_fd = ∇sumabs2_reactant_fd(m, x_ra, ps_ra, st_ra)
+            ∂x_ra, ∂ps_ra = ∇sumabs2_reactant(m, x_ra, ps_ra, st_ra)
+
+            @test ∂x_fd ≈ ∂x_ra atol = 1.0f-2 rtol = 1.0f-2
+            @test check_approx(∂ps_fd, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2)
+        end
+    end
+    return nothing
+end
diff --git a/test/layers/spectral_conv_tests.jl b/test/layers/spectral_conv_tests.jl
new file mode 100644
index 0000000..2184b92
--- /dev/null
+++ b/test/layers/spectral_conv_tests.jl
@@ -0,0 +1,5 @@
+include("layers_testsetup.jl")
+
+@testset "SpectralConv" begin
+    run_op_tests(SpectralConv, LAYERS_SETUPS)
+end
diff --git a/test/layers/spectral_kernel_tests.jl b/test/layers/spectral_kernel_tests.jl
new file mode 100644
index 0000000..63d4ff1
--- /dev/null
+++ b/test/layers/spectral_kernel_tests.jl
@@ -0,0 +1,5 @@
+include("layers_testsetup.jl")
+
+@testset "SpectralKernel" begin
+    run_op_tests(SpectralKernel, LAYERS_SETUPS)
+end
diff --git a/test/layers_tests.jl b/test/layers_tests.jl
deleted file mode 100644
index ae2e864..0000000
--- a/test/layers_tests.jl
+++ /dev/null
@@ -1,66 +0,0 @@
-@testitem "SpectralConv & SpectralKernel" setup = [SharedTestSetup] begin
-    rng = StableRNG(12345)
-
-    opconv = [SpectralConv, SpectralKernel]
-    setups = [
-        (; m = (16,), x_size = (1024, 2, 5), y_size = (1024, 16, 5), shift = false),
-        (; m = (10, 10), x_size = (22, 22, 1, 5), y_size = (22, 22, 16, 5), shift = false),
-        (; m = (10, 10), x_size = (22, 22, 1, 5), y_size = (22, 22, 16, 5), shift = true),
-    ]
-
-    rdev = reactant_device()
-
-    @testset "$(op) $(length(setup.m))D | shift=$(setup.shift)" for op in opconv,
-            setup in setups
-
-        in_chs = setup.x_size[end - 1]
-        out_chs = setup.y_size[end - 1]
-        ch = 4 => out_chs
-
-        l1 = Conv(ntuple(_ -> 1, length(setup.m)), in_chs => first(ch))
-        m = Chain(l1, op(ch, setup.m; setup.shift))
-        display(m)
-        ps, st = Lux.setup(rng, m)
-
-        x = rand(rng, Float32, setup.x_size...)
-        @test size(first(m(x, ps, st))) == setup.y_size
-        res = first(m(x, ps, st))
-
-        ps_ra, st_ra = rdev((ps, st))
-        x_ra = rdev(x)
-        y_ra = rdev(rand(rng, Float32, setup.y_size...))
-
-        res_ra, _ = Reactant.with_config(;
-            dot_general_precision = PrecisionConfig.HIGH,
-            convolution_precision = PrecisionConfig.HIGH,
-        ) do
-            @jit m(x_ra, ps_ra, st_ra)
-        end
-        @test res_ra ≈ res atol = 1.0f-2 rtol = 1.0f-2 skip = (
-            setup.shift && op === SpectralConv
-        )
-
-        @test begin
-            l2, l1 = train!(
-                MSELoss(), AutoEnzyme(), m, ps_ra, st_ra, [(x_ra, y_ra)]; epochs = 10
-            )
-            l2 < l1
-        end
-
-        @testset "check gradients" begin
-            ∂x_zyg, ∂ps_zyg = zygote_gradient(m, x, ps, st)
-
-            ∂x_ra, ∂ps_ra = Reactant.with_config(;
-                dot_general_precision = PrecisionConfig.HIGH,
-                convolution_precision = PrecisionConfig.HIGH,
-            ) do
-                @jit enzyme_gradient(m, x_ra, ps_ra, st_ra)
-            end
-            ∂x_ra, ∂ps_ra = (∂x_ra, ∂ps_ra) |> cpu_device()
-
-            # TODO: is zygote off here?
-            @test ∂x_zyg ≈ ∂x_ra atol = 1.0f-2 rtol = 1.0f-2 skip = setup.shift
-            @test check_approx(∂ps_zyg, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2) skip = setup.shift
-        end
-    end
-end
diff --git a/test/deeponet_tests.jl b/test/models/deeponet_tests.jl
similarity index 56%
rename from test/deeponet_tests.jl
rename to test/models/deeponet_tests.jl
index 3f32e1d..4577b86 100644
--- a/test/deeponet_tests.jl
+++ b/test/models/deeponet_tests.jl
@@ -1,4 +1,8 @@
-@testitem "DeepONet" setup = [SharedTestSetup] begin
+using NeuralOperators, Test
+
+include("../shared_testsetup.jl")
+
+@testset "DeepONet" begin
     rng = StableRNG(12345)
 
     setups = [
@@ -20,7 +24,7 @@
         ),
     ]
 
-    xdev = reactant_device()
+    xdev = reactant_device(; force = true)
 
     @testset "$(setup.name)" for setup in setups
         u = rand(Float32, setup.u_size...)
@@ -35,20 +39,18 @@
         ps_ra, st_ra = (ps, st) |> xdev
         u_ra, y_ra = (u, y) |> xdev
 
+        pred_ra = @jit deeponet((u_ra, y_ra), ps_ra, st_ra)
+        @test first(pred_ra) ≈ pred atol = 1.0f-2 rtol = 1.0f-2
+
         @testset "check gradients" begin
-            ∂u_zyg, ∂ps_zyg = zygote_gradient(deeponet, (u, y), ps, st)
-
-            ∂u_ra, ∂ps_ra = Reactant.with_config(;
-                dot_general_precision = PrecisionConfig.HIGH,
-                convolution_precision = PrecisionConfig.HIGH,
-            ) do
-                @jit enzyme_gradient(deeponet, (u_ra, y_ra), ps_ra, st_ra)
-            end
-            ∂u_ra, ∂ps_ra = (∂u_ra, ∂ps_ra) |> cpu_device()
-
-            @test ∂u_zyg[1] ≈ ∂u_ra[1] atol = 1.0f-2 rtol = 1.0f-2
-            @test ∂u_zyg[2] ≈ ∂u_ra[2] atol = 1.0f-2 rtol = 1.0f-2
-            @test check_approx(∂ps_zyg, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2)
+            (∂u_fd, ∂y_fd), ∂ps_fd = ∇sumabs2_reactant_fd(
+                deeponet, (u_ra, y_ra), ps_ra, st_ra
+            )
+            (∂u_ra, ∂y_ra), ∂ps_ra = ∇sumabs2_reactant(deeponet, (u_ra, y_ra), ps_ra, st_ra)
+
+            @test ∂u_fd ≈ ∂u_ra atol = 1.0f-2 rtol = 1.0f-2
+            @test ∂y_fd ≈ ∂y_ra atol = 1.0f-2 rtol = 1.0f-2
+            @test check_approx(∂ps_fd, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2)
         end
     end
 end
diff --git a/test/models/fno_tests.jl b/test/models/fno_tests.jl
new file mode 100644
index 0000000..c17bcb9
--- /dev/null
+++ b/test/models/fno_tests.jl
@@ -0,0 +1,63 @@
+using NeuralOperators, Test
+
+include("../shared_testsetup.jl")
+
+# End-to-end checks for `FourierNeuralOperator`: output shape, agreement between the
+# plain forward pass and the Reactant-compiled one, and agreement between Enzyme
+# gradients and finite-difference gradients (both run through Reactant).
+@testset "Fourier Neural Operator" begin
+    rng = StableRNG(12345)
+
+    # Configurations covered: 1D, 2D, and 2D with `shift` enabled.
+    setups = [
+        (
+            modes = (4,),
+            chs = (2, 4, 4, 4, 1),
+            x_size = (8, 2, 2),
+            y_size = (8, 1, 2),
+            shift = false,
+        ),
+        (
+            modes = (4, 4),
+            chs = (2, 4, 4, 4, 4),
+            x_size = (8, 8, 2, 2),
+            y_size = (8, 8, 4, 2),
+            shift = false,
+        ),
+        (
+            modes = (4, 4),
+            chs = (2, 4, 4, 4, 4),
+            x_size = (8, 8, 2, 2),
+            y_size = (8, 8, 4, 2),
+            shift = true,
+        ),
+    ]
+
+    xdev = reactant_device(; force = true)
+
+    @testset "$(length(setup.modes))D | shift=$(setup.shift)" for setup in setups
+        fno = FourierNeuralOperator(; setup.chs, setup.modes, setup.shift)
+        display(fno)
+        ps, st = Lux.setup(rng, fno)
+
+        x = rand(rng, Float32, setup.x_size...)
+
+        # One reference forward pass serves both the shape check and the comparison below.
+        res = first(fno(x, ps, st))
+        @test size(res) == setup.y_size
+
+        ps_ra, st_ra = (ps, st) |> xdev
+        x_ra = xdev(x)
+
+        res_ra, _ = @jit fno(x_ra, ps_ra, st_ra)
+        @test res_ra ≈ res atol = 1.0f-2 rtol = 1.0f-2
+
+        @testset "check gradients" begin
+            ∂x_fd, ∂ps_fd = ∇sumabs2_reactant_fd(fno, x_ra, ps_ra, st_ra)
+            ∂x_ra, ∂ps_ra = ∇sumabs2_reactant(fno, x_ra, ps_ra, st_ra)
+
+            @test ∂x_fd ≈ ∂x_ra atol = 1.0f-2 rtol = 1.0f-2
+            @test check_approx(∂ps_fd, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2)
+        end
+    end
+end
diff --git a/test/nomad_tests.jl b/test/models/nomad_tests.jl
similarity index 57%
rename from test/nomad_tests.jl
rename to test/models/nomad_tests.jl
index c340dc4..b4ecdc7 100644
--- a/test/nomad_tests.jl
+++ b/test/models/nomad_tests.jl
@@ -1,4 +1,8 @@
-@testitem "NOMAD" setup = [SharedTestSetup] begin
+using NeuralOperators, Test
+
+include("../shared_testsetup.jl")
+
+@testset "NOMAD" begin
     rng = StableRNG(12345)
 
     setups = [
@@ -20,7 +24,7 @@
         ),
     ]
 
-    xdev = reactant_device()
+    xdev = reactant_device(; force = true)
 
     @testset "$(setup.name)" for setup in setups
         u = rand(Float32, setup.u_size...)
@@ -35,20 +39,16 @@
         ps_ra, st_ra = xdev((ps, st))
         u_ra, y_ra = xdev(u), xdev(y)
 
+        pred_ra, _ = @jit nomad((u_ra, y_ra), ps_ra, st_ra)
+        @test pred_ra ≈ pred atol = 1.0f-2 rtol = 1.0f-2
+
         @testset "check gradients" begin
-            ∂u_zyg, ∂ps_zyg = zygote_gradient(nomad, (u, y), ps, st)
-
-            ∂u_ra, ∂ps_ra = Reactant.with_config(;
-                dot_general_precision = PrecisionConfig.HIGH,
-                convolution_precision = PrecisionConfig.HIGH,
-            ) do
-                @jit enzyme_gradient(nomad, (u_ra, y_ra), ps_ra, st_ra)
-            end
-            ∂u_ra, ∂ps_ra = (∂u_ra, ∂ps_ra) |> cpu_device()
-
-            @test ∂u_zyg[1] ≈ ∂u_ra[1] atol = 1.0f-2 rtol = 1.0f-2
-            @test ∂u_zyg[2] ≈ ∂u_ra[2] atol = 1.0f-2 rtol = 1.0f-2
-            @test check_approx(∂ps_zyg, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2)
+            (∂u_fd, ∂y_fd), ∂ps_fd = ∇sumabs2_reactant_fd(nomad, (u_ra, y_ra), ps_ra, st_ra)
+            (∂u_ra, ∂y_ra), ∂ps_ra = ∇sumabs2_reactant(nomad, (u_ra, y_ra), ps_ra, st_ra)
+
+            @test ∂u_fd ≈ ∂u_ra atol = 1.0f-2 rtol = 1.0f-2
+            @test ∂y_fd ≈ ∂y_ra atol = 1.0f-2 rtol = 1.0f-2
+            @test check_approx(∂ps_fd, ∂ps_ra; atol = 1.0f-2, rtol = 1.0f-2)
         end
     end
 end
diff --git a/test/qa_tests.jl b/test/qa_tests.jl
index 24b4aea..6e42a2c 100644
--- a/test/qa_tests.jl
+++ b/test/qa_tests.jl
@@ -1,29 +1,10 @@
-@testitem "doctests: Quality Assurance" tags = [:qa] begin
-    using Documenter, NeuralOperators
-
-    DocMeta.setdocmeta!(
-        NeuralOperators,
-        :DocTestSetup,
-        :(using Lux, NeuralOperators, Random);
-        recursive = true,
-    )
-    doctest(NeuralOperators; manual = false)
-end
-
-@testitem "Aqua: Quality Assurance" tags = [:qa] begin
-    using Aqua
+using NeuralOperators, Test, ExplicitImports, Aqua
 
+@testset "Aqua: Quality Assurance" begin
     Aqua.test_all(NeuralOperators; ambiguities = false)
     Aqua.test_ambiguities(NeuralOperators; recursive = false)
 end
 
-@testitem "Explicit Imports: Quality Assurance" tags = [:qa] begin
-    using ExplicitImports, Lux
-
-    # Skip our own packages
-    @test check_no_implicit_imports(NeuralOperators; skip = (Base, Core, Lux)) === nothing
-    @test check_no_stale_explicit_imports(NeuralOperators) === nothing
-    @test check_no_self_qualified_accesses(NeuralOperators) === nothing
-    @test check_all_explicit_imports_via_owners(NeuralOperators) === nothing
-    @test check_all_qualified_accesses_via_owners(NeuralOperators) === nothing
+@testset "Explicit Imports: Quality Assurance" begin
+    test_explicit_imports(NeuralOperators; all_qualified_accesses_are_public = false)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 7b8141d..86f90f5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,16 +1,24 @@
-using ReTestItems, Test, Hwloc, NeuralOperators, Reactant
+using NeuralOperators, Test, ParallelTestRunner
 
-const BACKEND_GROUP = lowercase(get(ENV, "BACKEND_GROUP", "all"))
+parsed_args = parse_args(@isdefined(TEST_ARGS) ? TEST_ARGS : ARGS)
 
-const RETESTITEMS_NWORKER_THREADS = parse(
-    Int, get(ENV, "RETESTITEMS_NWORKER_THREADS", string(Hwloc.num_virtual_cores()))
+# Find all tests
+testsuite = find_tests(@__DIR__)
+
+filter_tests!(testsuite, parsed_args)
+
+# Remove shared setup files that shouldn't be run directly
+delete!(testsuite, "shared_testsetup")
+delete!(testsuite, "layers/layers_testsetup")
+
+total_jobs = min(
+    something(parsed_args.jobs, ParallelTestRunner.default_njobs()), length(keys(testsuite))
 )
 
-@testset "NeuralOperators.jl Tests" begin
-    ReTestItems.runtests(
-        NeuralOperators;
-        nworkers = 1,
-        nworker_threads = RETESTITEMS_NWORKER_THREADS,
-        testitem_timeout = 3600,
-    )
+withenv(
+    "XLA_REACTANT_GPU_MEM_FRACTION" => 1 / (total_jobs + 0.1),
+    "XLA_REACTANT_GPU_PREALLOCATE" => false,
+    "JULIA_CUDA_HARD_MEMORY_LIMIT" => "$(100 / (total_jobs + 0.1))%",
+) do
+    runtests(NeuralOperators, parsed_args; testsuite)
 end
diff --git a/test/shared_testsetup.jl b/test/shared_testsetup.jl
index afe7d2c..a7bfc09 100644
--- a/test/shared_testsetup.jl
+++ b/test/shared_testsetup.jl
@@ -1,39 +1,30 @@
-@testsetup module SharedTestSetup
-import Reexport: @reexport
-
-@reexport using Lux, Zygote, Optimisers, Random, StableRNGs, Reactant, Enzyme
+using Lux, Optimisers, Random, StableRNGs, Reactant, Enzyme, FastTransforms
 using LuxTestUtils: check_approx
-using FFTW
-
-const BACKEND_GROUP = lowercase(get(ENV, "BACKEND_GROUP", "All"))
-
-train!(args...; kwargs...) = train!(MSELoss(), AutoZygote(), args...; kwargs...)
-
-function train!(loss, backend, model, ps, st, data; epochs = 10)
-    l1 = @jit loss(model, ps, st, first(data))
-
-    tstate = Training.TrainState(model, ps, st, Adam(0.01f0))
-    for _ in 1:epochs, (x, y) in data
-        _, _, _, tstate = Training.single_train_step!(backend, loss, (x, y), tstate)
-    end
-
-    l2 = @jit loss(model, tstate.parameters, tstate.states, first(data))
-
-    return l2, l1
-end
 
 sumabs2first(model, x, ps, st) = sum(abs2, first(model(x, ps, st)))
 
-function zygote_gradient(model, x, ps, st)
-    return Zygote.gradient(sumabs2first, model, x, ps, st)[2:3]
+function ∇sumabs2_reactant_fd(model, x, ps, st)
+    _, ∂x_fd, ∂ps_fd, _ = @jit Reactant.TestUtils.finite_difference_gradient(
+        sumabs2first, Const(model), f64(x), f64(ps), Const(f64(st))
+    )
+    return ∂x_fd, ∂ps_fd
 end
 
-function enzyme_gradient(model, x, ps, st)
-    return Enzyme.gradient(Reverse, sumabs2first, Const(model), x, ps, Const(st))[2:3]
+function ∇sumabs2_enzyme(model, x, ps, st)
+    dx = Enzyme.make_zero(x)
+    dps = Enzyme.make_zero(ps)
+    Enzyme.autodiff(
+        Enzyme.Reverse,
+        sumabs2first,
+        Active,
+        Const(model),
+        Duplicated(x, dx),
+        Duplicated(ps, dps),
+        Const(st),
+    )
+    return dx, dps
 end
 
-export check_approx
-export BACKEND_GROUP, train!
-export zygote_gradient, enzyme_gradient
-
+function ∇sumabs2_reactant(model, x, ps, st)
+    return @jit ∇sumabs2_enzyme(model, x, ps, st)
 end