From be5610942b44c61328cddc9126b854cddafd90ad Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 02:47:21 -0700
Subject: [PATCH 01/15] Skip the slowest tests by default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a `slow` pytest marker, mark the worst-case tests with it, and have
`tests/run_tests.py` skip those tests by default (use `--run-slow` to include
them, or `pytest -m slow` to run only those).

Picked from macOS CI per-file timing (QD_FILE_TIMING=1, run 26083950810):
phase 1 totals 6415s across 8641 test calls; the slowest 3 files alone
(test_eig, test_tile16, test_linalg) cover 55%. The cost of test_eig /
test_make_spd is super-linear in matrix size n (n=12 ≈ 5x n=9).

Marked slow:

  - Parametrize cases n in {6, 9, 12} (every n in 6..12 for inverse_large)
    across test_eig.py and test_linalg.py.
  - Rectangular (9, 12) / (12, 3) cases in test_frobenius_inner_rectangular.
  - test_matmul_chain_qipc_sizes_{f32,f64} (>130s each on macOS CI).
  - test_clear_all_gradients (180s/invocation).
  - test_reset_ndarrays::test_ndarray_doesnt_crash_on_gc (127s).
  - test_mpm88::{test_mpm88, test_mpm88_numpy_and_ndarray} (~30s/invocation).
  - test_struct::test_2d_nested (122s/invocation).

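run_tests.py composes `not slow` with any user-supplied `-m` expression, so
existing CI invocations like `-m "not needs_torch"` become
`(not needs_torch) and not slow`. The same composition means that
`run_tests.py -m slow` on its own selects nothing (`(slow) and not slow`);
use `run_tests.py --run-slow -m slow`, or call `pytest -m slow` directly, to
run only the slow tests. Note that this also drops slow tests from
GPU / Linux / macOS CI runs — a separate workflow (or `--run-slow` job) is
needed if we still want to exercise the n>=6 / n=12 paths in CI.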
---
 tests/pytest.ini                         |  2 +
 tests/python/test_clear_all_gradients.py |  3 +
 tests/python/test_eig.py                 | 96 +++++++++++++++++++++---
 tests/python/test_linalg.py              | 70 +++++++++++++++--
 tests/python/test_mpm88.py               |  2 +
 tests/python/test_reset_ndarrays.py      |  1 +
 tests/python/test_struct.py              |  1 +
 tests/run_tests.py                       | 23 +++++-
 8 files changed, 180 insertions(+), 18 deletions(-)

diff --git a/tests/pytest.ini b/tests/pytest.ini
index 5ee5ec16b2..efaf40e6c6 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -3,3 +3,5 @@ markers =
     run_in_serial: mark test to run serially(usually for resource intensive tests).
     sm70: Can only run on GPU with compute capability 7.0 or higher.
     needs_torch: mark test as requiring PyTorch.
+    slow: mark test (or parametrize case) as slow. Skipped by default by tests/run_tests.py;
+        pass --run-slow to include them, or directly `pytest -m slow` to run only the slow ones.
diff --git a/tests/python/test_clear_all_gradients.py b/tests/python/test_clear_all_gradients.py
index 615ade9b0b..22c649a979 100644
--- a/tests/python/test_clear_all_gradients.py
+++ b/tests/python/test_clear_all_gradients.py
@@ -1,9 +1,12 @@
+import pytest
+
 import quadrants as qd
 from quadrants.lang import impl
 
 from tests import test_utils
 
 
+@pytest.mark.slow
 @test_utils.test(exclude=[qd.vulkan])
 def test_clear_all_gradients():
     x = qd.field(qd.f32)
diff --git a/tests/python/test_eig.py b/tests/python/test_eig.py
index 53647a6eef..ad8d8fe3bb 100644
--- a/tests/python/test_eig.py
+++ b/tests/python/test_eig.py
@@ -295,7 +295,16 @@ def run():
     np.testing.assert_allclose(A_reconstructed, A_np, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [4, 5, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [
@@ -311,7 +320,16 @@ def test_sym_eig_general_f32(n, factory):
     _test_sym_eig_general(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize("n", [4, 5, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [
@@ -358,7 +376,15 @@ def run():
     np.testing.assert_allclose(A_spd_qd, expected, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -368,7 +394,15 @@ def test_make_spd_f32(n, factory):
     _test_make_spd(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -404,7 +438,15 @@ def run():
     np.testing.assert_allclose(Q.T @ Q, np.eye(n), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize("alpha", [0.0, 1.0, -2.5])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_alpha_identity_f64(n, alpha):
@@ -445,7 +487,15 @@ def project(src: qd.types.NDArray[mat_t, 1], dst: qd.types.NDArray[mat_t, 1]):
     )
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_negative_definite, _sym_eig_factory_spd],
@@ -455,7 +505,15 @@ def test_make_spd_idempotent_f64(n, factory):
     _test_make_spd_idempotent(n, qd.f64, factory)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_make_spd_negative_definite_zero_f64(n):
     """A symmetric matrix with all-negative eigenvalues projects to the zero matrix (``Q · diag(max(λ, 0)) · Qᵀ``
@@ -535,13 +593,33 @@ def run():
         ), f"column {i} is not the eigenvector of eigvals[{i}]={eigvals_qd[i]}: residual={residual}"
 
 
-@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_sym_eig_sort_order_f32(n):
     _test_sym_eig_sort_order(n, qd.f32)
 
 
-@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_sort_order_f64(n):
     _test_sym_eig_sort_order(n, qd.f64)
diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py
index dfa31495bc..93ff2c2ce2 100644
--- a/tests/python/test_linalg.py
+++ b/tests/python/test_linalg.py
@@ -154,13 +154,31 @@ def run():
     assert out_self[None] == test_utils.approx(A.to_numpy().__pow__(2).sum(), rel=tol, abs=tol)
 
 
-@pytest.mark.parametrize("n", [2, 3, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_frobenius_inner_f32(n):
     _test_frobenius_inner(n, qd.f32)
 
 
-@pytest.mark.parametrize("n", [2, 3, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_frobenius_inner_f64(n):
     _test_frobenius_inner(n, qd.f64)
@@ -189,13 +207,27 @@ def run():
     assert out[None] == test_utils.approx(expected, rel=tol, abs=tol)
 
 
-@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)])
+@pytest.mark.parametrize(
+    "rows,cols",
+    [
+        pytest.param(9, 12, marks=pytest.mark.slow),
+        pytest.param(12, 3, marks=pytest.mark.slow),
+        (2, 4),
+    ],
+)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_frobenius_inner_rectangular_f32(rows, cols):
     _test_frobenius_inner_rectangular(rows, cols, qd.f32)
 
 
-@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)])
+@pytest.mark.parametrize(
+    "rows,cols",
+    [
+        pytest.param(9, 12, marks=pytest.mark.slow),
+        pytest.param(12, 3, marks=pytest.mark.slow),
+        (2, 4),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_frobenius_inner_rectangular_f64(rows, cols):
     _test_frobenius_inner_rectangular(rows, cols, qd.f64)
@@ -241,11 +273,13 @@ def run():
     np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol)
 
 
+@pytest.mark.slow
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_matmul_chain_qipc_sizes_f32():
     _test_matmul_chain(qd.f32)
 
 
+@pytest.mark.slow
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_matmul_chain_qipc_sizes_f64():
     _test_matmul_chain(qd.f64)
@@ -434,7 +468,19 @@ def run():
     np.testing.assert_allclose(M @ inv_np, np.eye(n_), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(7, marks=pytest.mark.slow),
+        pytest.param(8, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(10, marks=pytest.mark.slow),
+        pytest.param(11, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],
@@ -444,7 +490,19 @@ def test_inverse_large_f32(n, factory):
     _test_inverse_at_size(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(7, marks=pytest.mark.slow),
+        pytest.param(8, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(10, marks=pytest.mark.slow),
+        pytest.param(11, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],
diff --git a/tests/python/test_mpm88.py b/tests/python/test_mpm88.py
index 725ff17ac9..d758b65f9d 100644
--- a/tests/python/test_mpm88.py
+++ b/tests/python/test_mpm88.py
@@ -7,6 +7,7 @@
 from tests import test_utils
 
 
+@pytest.mark.slow
 @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test")
 @pytest.mark.run_in_serial
 @test_utils.test()
@@ -108,6 +109,7 @@ def _is_appveyor():
     return os.getenv("APPVEYOR", "").lower() == "true"
 
 
+@pytest.mark.slow
 @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test")
 @pytest.mark.run_in_serial
 @test_utils.test()
diff --git a/tests/python/test_reset_ndarrays.py b/tests/python/test_reset_ndarrays.py
index bc048ac92d..a42fd921f1 100644
--- a/tests/python/test_reset_ndarrays.py
+++ b/tests/python/test_reset_ndarrays.py
@@ -8,6 +8,7 @@
 from tests import test_utils
 
 
+@pytest.mark.slow
 @test_utils.test(arch=[qd.cpu])
 def test_ndarray_doesnt_crash_on_gc() -> None:
     if sys.platform != "darwin":
diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index d3d6a4fbaa..de6d249970 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -62,6 +62,7 @@ def test_linear_nested_aos():
         assert y[i] == i + 123
 
 
+@pytest.mark.slow
 @test_utils.test(exclude=[qd.vulkan])
 def test_2d_nested():
     x = qd.field(qd.i32)
diff --git a/tests/run_tests.py b/tests/run_tests.py
index e2419add42..47d5574ad0 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -56,8 +56,15 @@ def _test_python(args, default_dir="python"):
             pytest_args += ["--cov-append"]
         if args.keys:
             pytest_args += ["-k", args.keys]
-        if args.marks:
-            pytest_args += ["-m", args.marks]
+        # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large
+        # at n>=6, mpm88, etc. — see tests/pytest.ini for the marker). `--run-slow` opts
+        # back in. If the user passes their own `-m` expression we AND `not slow` onto it
+        # so the exclusion still applies, unless they explicitly opt out via `--run-slow`.
+        marks_expr = args.marks
+        if not args.run_slow:
+            marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow"
+        if marks_expr:
+            pytest_args += ["-m", marks_expr]
         if args.failed_first:
             pytest_args += ["--failed-first"]
         if args.fail_fast:
@@ -161,7 +168,17 @@ def test():
         default=None,
         dest="marks",
         type=str,
-        help="Only run tests with specific marks",
+        help="Only run tests with specific marks. `not slow` is appended automatically "
+        "unless --run-slow is passed.",
+    )
+    parser.add_argument(
+        "--run-slow",
+        required=False,
+        default=False,
+        dest="run_slow",
+        action="store_true",
+        help="Include tests marked `slow` (excluded by default). Combine with `-m slow` to "
+        "run only the slow tests; without this flag `-m slow` selects nothing.",
     )
     parser.add_argument(
         "-f",

From f58248a9e2c2ed69266f69168f9f52c199d96325 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 03:13:45 -0700
Subject: [PATCH 02/15] Trim n parametrize lists to {smallest, 12}

The previous lists ([4, 5, 6, 9, 12], [2, 3, 4, 6, 9, 12], [5..12], etc.) gave
the Householder/QR path a lot of redundant size coverage. We only need to
exercise one small size (run by default) plus the largest supported size (12,
which keeps its `slow` marker and so only runs with `--run-slow`): if a bug
shows up only at n=7 or n=11 it almost certainly also shows up at n=12.

  test_eig.py
    sym_eig_general_{f32,f64}             [4,5,6,9,12]     -> [4, 12*]
    make_spd_{f32,f64}                    [4,6,9,12]       -> [4, 12*]
    sym_eig_alpha_identity_f64            [4,6,9,12]       -> [4, 12*]
    make_spd_idempotent_f64               [4,6,9,12]       -> [4, 12*]
    make_spd_negative_definite_zero_f64   [4,6,9,12]       -> [4, 12*]
    sym_eig_sort_order_{f32,f64}          [2,3,4,6,9,12]   -> [3, 12*]
  test_linalg.py
    frobenius_inner_{f32,f64}             [2,3,6,9,12]     -> [3, 12*]
    inverse_large_{f32,f64}               [5..12]          -> [5, 12*]

* n=12 retains the `slow` marker, so default `run_tests.py` invocations only
  hit n=4 / n=3 / n=5. `--run-slow` runs both.

Closed-form 2x2/3x3 paths in test_sym_eig_sort_order: dropped n=2 in favour
of n=3 (per directive); the 2x2 path is still covered by
test_sym_eig2x2_{f32,f64}. The 3x3 closed-form path stays covered by n=3.

Other parametrize lists left untouched:
  - rectangular (rows, cols) tuples in test_frobenius_inner_rectangular (it's
    varying shape, not pure size).
  - test_mat_inverse_size's `range(1, 5)` (tiny sizes only).
  - `a00` integer parametrize in test_sym_eig3x3_{f32,f64}.
---
 tests/python/test_eig.py    | 96 ++++---------------------------------
 tests/python/test_linalg.py | 50 ++-----------------
 2 files changed, 13 insertions(+), 133 deletions(-)

diff --git a/tests/python/test_eig.py b/tests/python/test_eig.py
index ad8d8fe3bb..a8b5153dd6 100644
--- a/tests/python/test_eig.py
+++ b/tests/python/test_eig.py
@@ -295,16 +295,7 @@ def run():
     np.testing.assert_allclose(A_reconstructed, A_np, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [
@@ -320,16 +311,7 @@ def test_sym_eig_general_f32(n, factory):
     _test_sym_eig_general(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [
@@ -376,15 +358,7 @@ def run():
     np.testing.assert_allclose(A_spd_qd, expected, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -394,15 +368,7 @@ def test_make_spd_f32(n, factory):
     _test_make_spd(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -438,15 +404,7 @@ def run():
     np.testing.assert_allclose(Q.T @ Q, np.eye(n), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize("alpha", [0.0, 1.0, -2.5])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_alpha_identity_f64(n, alpha):
@@ -487,15 +445,7 @@ def project(src: qd.types.NDArray[mat_t, 1], dst: qd.types.NDArray[mat_t, 1]):
     )
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_negative_definite, _sym_eig_factory_spd],
@@ -505,15 +455,7 @@ def test_make_spd_idempotent_f64(n, factory):
     _test_make_spd_idempotent(n, qd.f64, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_make_spd_negative_definite_zero_f64(n):
     """A symmetric matrix with all-negative eigenvalues projects to the zero matrix (``Q · diag(max(λ, 0)) · Qᵀ``
@@ -593,33 +535,13 @@ def run():
         ), f"column {i} is not the eigenvector of eigvals[{i}]={eigvals_qd[i]}: residual={residual}"
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_sym_eig_sort_order_f32(n):
     _test_sym_eig_sort_order(n, qd.f32)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_sort_order_f64(n):
     _test_sym_eig_sort_order(n, qd.f64)
diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py
index 93ff2c2ce2..a6632d4678 100644
--- a/tests/python/test_linalg.py
+++ b/tests/python/test_linalg.py
@@ -154,31 +154,13 @@ def run():
     assert out_self[None] == test_utils.approx(A.to_numpy().__pow__(2).sum(), rel=tol, abs=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_frobenius_inner_f32(n):
     _test_frobenius_inner(n, qd.f32)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_frobenius_inner_f64(n):
     _test_frobenius_inner(n, qd.f64)
@@ -468,19 +450,7 @@ def run():
     np.testing.assert_allclose(M @ inv_np, np.eye(n_), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(7, marks=pytest.mark.slow),
-        pytest.param(8, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(10, marks=pytest.mark.slow),
-        pytest.param(11, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],
@@ -490,19 +460,7 @@ def test_inverse_large_f32(n, factory):
     _test_inverse_at_size(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(7, marks=pytest.mark.slow),
-        pytest.param(8, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(10, marks=pytest.mark.slow),
-        pytest.param(11, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],

From a2f4f91b4f18c467d209729255f2e7d257487d08 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 06:48:09 -0700
Subject: [PATCH 03/15] [Demo] cholesky_blocked: take N / N_ENVS / WARMUP /
 ITERS via argparse

The blocked-Cholesky demo previously hard-coded N=92, N_ENVS=4096,
WARMUP=50, ITERS=200 as module globals. The unit-test wrapper
test_tile16_cholesky_blocked_demo runs the demo as a subprocess and
only cares that it returns 0; at the hard-coded sizes that takes ~74 s
on cluster CUDA, dominated by JIT-compiling 3 large unrolled kernels
at N=92 and running the 4096-env x 250-iter benchmark loop.

Expose all four knobs as command-line flags with the previous values as
defaults, so:

    python misc/demos/cholesky_blocked.py                                # unchanged, full demo
    python misc/demos/cholesky_blocked.py --n 32 --n-envs 64 \
        --num-warmup 1 --num-iters 1                                    # smoke-mode

The test will switch to the smoke-mode invocation in a follow-up
commit so it stops dominating the slow critical path.

Flag names (--n, --n-envs, --num-warmup, --num-iters) follow the user
spec. The parser uses argparse's ArgumentDefaultsHelpFormatter so that
--help shows the full-demo defaults.
---
 misc/demos/cholesky_blocked.py | 37 +++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py
index 8dbcb3fbb9..b4c60c1810 100644
--- a/misc/demos/cholesky_blocked.py
+++ b/misc/demos/cholesky_blocked.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
-"""Benchmark 92x92 blocked Cholesky factorization using Tile16x16.
+"""Benchmark NxN blocked Cholesky factorization using Tile16x16.
 
 Three kernels compared:
 
 1. Baseline: scalar Cholesky-Crout, 64 threads, shared memory, 2*N+1 sequential syncs. Thread 0 computes each
    diagonal, remaining threads parallelize off-diagonal updates.
 
-2. Blocked: 6x6 grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal blocks. Same blocking
-   structure as Tile16x16 but all data lives in shared memory with block.sync() between every step.
+2. Blocked: ceil(N/16) x ceil(N/16) grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal
+   blocks. Same blocking structure as Tile16x16 but all data lives in shared memory with block.sync() between
+   every step.
 
 3. Tile16x16: same blocked structure but fully register-resident via Tile16x16. No shared memory, zero syncs.
    Prior tiles read from global memory (L2).
@@ -20,22 +21,38 @@
     tile16   (Tile16x16, no shared memory)             16        533        5.19x
 
 Usage:
-    python misc/demos/cholesky_blocked.py
+    python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] \
+        [--num-warmup WARMUP] [--num-iters ITERS]
 """
 
+import argparse
 import time
 
 import numpy as np
 
 import quadrants as qd
 
-N = 92
+
+def _parse_args():
+    p = argparse.ArgumentParser(
+        description="Blocked Cholesky NxN benchmark (3 kernels: baseline / blocked / tile16).",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    p.add_argument("--n", type=int, default=92, help="Matrix dimension N (NxN SPD).")
+    p.add_argument("--n-envs", type=int, default=4096, help="Number of independent environments.")
+    p.add_argument("--num-warmup", type=int, default=50, help="Warmup iterations per kernel.")
+    p.add_argument("--num-iters", type=int, default=200, help="Timed iterations per kernel.")
+    return p.parse_args()
+
+
+_args = _parse_args()
+N = _args.n
 TILE = 16
-N_BLOCKS = (N + TILE - 1) // TILE  # 6
-N_PADDED = N_BLOCKS * TILE  # 96, rounded up for blocked kernel SharedArrays
-N_ENVS = 4096
-WARMUP = 50
-ITERS = 200
+N_BLOCKS = (N + TILE - 1) // TILE
+N_PADDED = N_BLOCKS * TILE  # rounded up for blocked kernel SharedArrays
+N_ENVS = _args.n_envs
+WARMUP = _args.num_warmup
+ITERS = _args.num_iters
 
 qd.init(arch=qd.gpu)
 

From eae1a36377e708a28c3fe840253ab7b71f39e3a7 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 06:49:15 -0700
Subject: [PATCH 04/15] [Test] test_tile16_cholesky_blocked_demo: invoke demo
 in smoke-mode

Pass small CLI overrides (--n 32 --n-envs 64 --num-warmup 1
--num-iters 1) so the demo runs end-to-end in seconds instead of ~74 s.
The test contract is just "demo exits 0"; it doesn't read any of the
benchmark numbers, so the smaller workload still satisfies the smoke
test.

The full N=92 / N_ENVS=4096 / 50+200-iter demo is still what humans
running misc/demos/cholesky_blocked.py see by default (argparse
defaults match the previous hard-coded values).

Together with the previous commit, this drops the
test_tile16_cholesky_blocked_demo wall time on cluster CUDA from
~74 s to (expected) a few seconds, removing the largest remaining
single-test outlier on hp/mark-slow-tests.
---
 tests/python/test_tile16.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index 97480c7d1d..6d917e11ad 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -1776,8 +1776,21 @@ def write_eye_f32(dst: Ann32):
 
 @test_utils.test(arch=[qd.cuda])
 def test_tile16_cholesky_blocked_demo():
-    """Smoke-test that misc/demos/cholesky_blocked.py runs to completion."""
+    """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.
+
+    Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the
+    JIT compile of the 3 unrolled kernels and the benchmark loop both stay
+    cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised
+    by anyone running the script manually, not by CI.
+    """
     demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py"
-    result = subprocess.run([sys.executable, str(demo)], capture_output=True, text=True, timeout=300)
+    cmd = [
+        sys.executable, str(demo),
+        "--n", "32",
+        "--n-envs", "64",
+        "--num-warmup", "1",
+        "--num-iters", "1",
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
     if result.returncode != 0:
         pytest.fail(f"cholesky_blocked.py exited with code {result.returncode}\nstderr:\n{result.stderr}")

From dc1319ef9b2c27deffbe0a9c4f64f7e1ac3e43aa Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 07:16:24 -0700
Subject: [PATCH 05/15] [Test] test_matmul_chain_qipc_sizes: parametrize on
 matrix shapes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the test hard-coded the qipc IPC sizes (9x12) · (12x12) ·
(12x9). On cluster CUDA those two cases (f32 + f64) take ~92.7s and
~87.3s respectively -- the top two single-test outliers in the suite,
each holding one xdist worker for ~90s of contiguous JIT-compile +
unrolled-FMA work.

Parametrize `_test_matmul_chain` on (rows_a, cols_a, cols_b, cols_c).
Default lane runs the small (3,4,4,3) chain to exercise the same
Matrix.__matmul__ codegen path; the original (9,12,12,9) qipc-sized
chain is slow-marked so it still runs on --run-slow (i.e. CI's nightly
/ release lane, once that's wired up).

Estimated saving: ~180s CPU, ~70s wall (these tests were on the
critical path of the branch run).

No function-level coverage lost: both f32 and f64 versions still run
the same chain by default, just at a smaller size.
---
 tests/python/test_linalg.py | 53 ++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py
index a6632d4678..59925ee2ce 100644
--- a/tests/python/test_linalg.py
+++ b/tests/python/test_linalg.py
@@ -215,24 +215,26 @@ def test_frobenius_inner_rectangular_f64(rows, cols):
     _test_frobenius_inner_rectangular(rows, cols, qd.f64)
 
 
-def _test_matmul_chain(dt):
-    """3-way matmul chain at qipc IPC sizes: (9×12) · (12×12) · (12×9) → (9×9).
-
-    Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the largest size qipc needs. Quadrants
-    imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces ~1296 FMAs per
-    intermediate, so this test catches compile-time blow-up or back-end miscompiles at large sizes.
+def _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, dt):
+    """3-way matmul chain: ``(rows_a × cols_a) · (cols_a × cols_b) · (cols_b × cols_c) → (rows_a × cols_c)``.
+
+    Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the requested size. Quadrants
+    imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces
+    ``rows_a * cols_a * cols_b + rows_a * cols_b * cols_c`` FMAs per kernel call, so this test catches compile-time
+    blow-up or back-end miscompiles at large sizes. The largest parametrize value is the chain qipc actually uses;
+    smaller values are cheap sanity checks that the same code path still works.
     """
     np_dt = np.float32 if dt == qd.f32 else np.float64
-    A_np = np.random.default_rng(0xCA70).standard_normal((9, 12)).astype(np_dt)
-    B_np = np.random.default_rng(0xCA71).standard_normal((12, 12)).astype(np_dt)
-    C_np = np.random.default_rng(0xCA72).standard_normal((12, 9)).astype(np_dt)
+    A_np = np.random.default_rng(0xCA70).standard_normal((rows_a, cols_a)).astype(np_dt)
+    B_np = np.random.default_rng(0xCA71).standard_normal((cols_a, cols_b)).astype(np_dt)
+    C_np = np.random.default_rng(0xCA72).standard_normal((cols_b, cols_c)).astype(np_dt)
 
-    A = qd.Matrix.field(9, 12, dtype=dt, shape=())
-    B = qd.Matrix.field(12, 12, dtype=dt, shape=())
-    C = qd.Matrix.field(12, 9, dtype=dt, shape=())
-    AB = qd.Matrix.field(9, 12, dtype=dt, shape=())
-    ABC_chained = qd.Matrix.field(9, 9, dtype=dt, shape=())
-    ABC_staged = qd.Matrix.field(9, 9, dtype=dt, shape=())
+    A = qd.Matrix.field(rows_a, cols_a, dtype=dt, shape=())
+    B = qd.Matrix.field(cols_a, cols_b, dtype=dt, shape=())
+    C = qd.Matrix.field(cols_b, cols_c, dtype=dt, shape=())
+    AB = qd.Matrix.field(rows_a, cols_b, dtype=dt, shape=())
+    ABC_chained = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
+    ABC_staged = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
 
     A.from_numpy(A_np)
     B.from_numpy(B_np)
@@ -255,16 +257,25 @@ def run():
     np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol)
 
 
-@pytest.mark.slow
+# qipc's actual size is (9,12,12,9) -- the largest chain it instantiates. We also keep a tiny (3,4,4,3) chain so
+# the default fast lane still exercises the same Matrix.__matmul__ codegen path without paying the ~90s/case
+# CUDA JIT cost of the qipc-sized chain.
+_MATMUL_CHAIN_SHAPES = [
+    (3, 4, 4, 3),
+    pytest.param(9, 12, 12, 9, marks=pytest.mark.slow),
+]
+
+
+@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
-def test_matmul_chain_qipc_sizes_f32():
-    _test_matmul_chain(qd.f32)
+def test_matmul_chain_qipc_sizes_f32(rows_a, cols_a, cols_b, cols_c):
+    _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f32)
 
 
-@pytest.mark.slow
+@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
-def test_matmul_chain_qipc_sizes_f64():
-    _test_matmul_chain(qd.f64)
+def test_matmul_chain_qipc_sizes_f64(rows_a, cols_a, cols_b, cols_c):
+    _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f64)
 
 
 @test_utils.test()

From 81d45a00964fe9202d077489644035e48143e32c Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 07:17:17 -0700
Subject: [PATCH 06/15] [Test] test_gdar_mpm: parametrize on particles_side /
 n_grid / num_steps

Previously hard-coded N=30 (900 particles), n_grid=120, steps=32 -- 26s
on cluster CUDA. The test's actual contract is that the AD-validation
checker raises QuadrantsAssertionError on the global-data-access
violation in g2p (`v[f, p] = new_v`), which fires on the first substep
regardless of grid / particle / step counts.

Parametrize on (particles_side, n_grid_size, num_steps) with a small
default (8, 32, 4) and slow-marked original (30, 120, 32). The default
still exercises the same diff-MPM pipeline (p2g / grid_op / g2p,
qd.ad.Tape with validation=True, `with pytest.raises(...)`) and still
triggers the assertion error.

Estimated CPU saving: ~22s; wall saving ~3s on the branch run.
---
 tests/python/test_ad_gdar_diffmpm.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index cd6bb32a04..0e0e460534 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -5,14 +5,26 @@
 from tests import test_utils
 
 
+# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay
+# stays cheap; the slow-marked entry keeps the original (N=30, n_grid=120, steps=32)
+# workload that runs on --run-slow. The point of the test is that the AD-validation
+# checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which
+# happens on the first substep regardless of size.
+@pytest.mark.parametrize(
+    "particles_side,n_grid_size,num_steps",
+    [
+        (8, 32, 4),
+        pytest.param(30, 120, 32, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.assertion, debug=True)
-def test_gdar_mpm():
+def test_gdar_mpm(particles_side, n_grid_size, num_steps):
     real = qd.f32
 
     dim = 2
-    N = 30  # reduce to 30 if run out of GPU memory
+    N = particles_side
     n_particles = N * N
-    n_grid = 120
+    n_grid = n_grid_size
     dx = 1 / n_grid
     inv_dx = 1 / dx
     dt = 3e-4
@@ -21,8 +33,8 @@ def test_gdar_mpm():
     E = 100
     mu = E
     la = E
-    max_steps = 32
-    steps = 32
+    max_steps = num_steps
+    steps = num_steps
     gravity = 9.8
     target = [0.3, 0.6]
 

From 1b08117ea0bf317eb1672f6e578dd5179ad171cc Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:02:22 -0700
Subject: [PATCH 07/15] [Test] test_device_{reduce,exclusive_scan}: fuse
 {add,min,max} into one op-parametrized test

The three reduce variants (and the three scan variants) shared an identical
kernel signature, identical input shape, and differed only in (a) which
qd.algorithms.device_<op> function they called and (b) overflow vs
bitwise-exact verification. Collapse each triple into a single op-parametrized
test:

  test_device_reduce(op, dtype, N)            # op in {add, min, max}
  test_device_exclusive_scan(op, dtype, N)    # op in {add, min, max}

Behavior, coverage and the parametrize space are unchanged -- pytest still
collects the same number of parametrize cases, just under unified test names.
This is purely a code-dedup refactor (~130 lines deleted, ~30 LOC net), which
makes the next op-axis sampling change (if/when we choose to drop one of
add / min / max from the sweep) a one-line edit.
---
 tests/python/test_algorithms.py | 240 ++++++++++++++------------------
 1 file changed, 106 insertions(+), 134 deletions(-)

diff --git a/tests/python/test_algorithms.py b/tests/python/test_algorithms.py
index e4b4ac9960..508732ce3b 100644
--- a/tests/python/test_algorithms.py
+++ b/tests/python/test_algorithms.py
@@ -320,86 +320,79 @@ def _rand_reduce_host(rng, dtype, N, *, bound=1000):
     return rng.integers(-bound, bound, size=N, dtype=np_dt)
 
 
-@pytest.mark.parametrize("N", _REDUCE_SIZES)
-@pytest.mark.parametrize("dtype", _REDUCE_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_reduce_add(dtype, N):
-    """device_reduce_add matches numpy.sum across the full size sweep + dtype set."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    host = _rand_reduce_host(rng, dtype, N)
-    _fill_field(inp, host)
+_REDUCE_OPS = ["add", "min", "max"]
 
-    qd.algorithms.device_reduce_add(inp, out=out)
 
-    got = out.to_numpy()[0]
+def _reduce_host(rng, op, dtype, N):
+    """Generate the test input for a reduce of `op` on `dtype` x N values.
+
+    ``add`` uses small uniform / bounded values so float sums stay representable; ``min`` and ``max`` use a wider
+    range (-10..10 for floats, +-10000 for ints) since picking-an-element is bitwise-exact regardless of magnitude.
+    """
+    if op == "add":
+        return _rand_reduce_host(rng, dtype, N)
     if _is_float(dtype):
-        expected = float(np.sum(host.astype(np.float64)))
-        rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
-        assert math.isclose(
-            got, expected, rel_tol=rtol, abs_tol=atol
-        ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}"
-    else:
-        # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the
-        # u32 / u64 mod-wrap case at large N.
-        mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None
-        ref = int(
-            np.sum(host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64)))
-        )  # noqa: E501
-        got_int = int(got)
-        if mod is not None:
-            ref &= mod - 1
-            got_int &= mod - 1
-        assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}"
+        return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
+    return _rand_reduce_host(rng, dtype, N, bound=10000)
 
 
-@pytest.mark.parametrize("N", _REDUCE_SIZES)
-@pytest.mark.parametrize("dtype", _REDUCE_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_reduce_min(dtype, N):
-    """device_reduce_min(identity=type-positive-extreme) matches numpy.min."""
+def _check_reduce(op, dtype, N):
+    """Run ``device_reduce_<op>(arr)`` and verify against ``numpy.<op>(arr)``.
+
+    ``add`` accumulates so it needs (a) wider integer promotion + mod-wrap masking for u32/u64 and (b) per-N float
+    tolerance. ``min`` / ``max`` pick one input element, so they're bitwise-exact for both ints and floats.
+    """
     _skip_if_dtype_unsupported(dtype)
     inp, out = _alloc_input_out(dtype, N)
     rng = np.random.default_rng(seed=1234)
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
+    host = _reduce_host(rng, op, dtype, N)
     _fill_field(inp, host)
 
-    qd.algorithms.device_reduce_min(inp, out=out)
+    qd_fn = getattr(qd.algorithms, f"device_reduce_{op}")
+    qd_fn(inp, out=out)
     got = out.to_numpy()[0]
-    expected = host.min()
 
+    if op == "add":
+        if _is_float(dtype):
+            expected = float(np.sum(host.astype(np.float64)))
+            rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
+            assert math.isclose(
+                got, expected, rel_tol=rtol, abs_tol=atol
+            ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}"
+        else:
+            # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the
+            # u32 / u64 mod-wrap case at large N.
+            mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None
+            ref = int(
+                np.sum(
+                    host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64))
+                )
+            )  # noqa: E501
+            got_int = int(got)
+            if mod is not None:
+                ref &= mod - 1
+                got_int &= mod - 1
+            assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}"
+        return
+
+    expected = host.min() if op == "min" else host.max()
     if _is_float(dtype):
         assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12)
     else:
-        assert int(got) == int(expected), f"{dtype} reduce_min(N={N}): got {got}, expected {expected}"
+        assert int(got) == int(expected), f"{dtype} reduce_{op}(N={N}): got {got}, expected {expected}"
 
 
+@pytest.mark.parametrize("op", _REDUCE_OPS)
 @pytest.mark.parametrize("N", _REDUCE_SIZES)
 @pytest.mark.parametrize("dtype", _REDUCE_DTYPES)
 @test_utils.test(arch=qd.gpu)
-def test_device_reduce_max(dtype, N):
-    """device_reduce_max(identity=type-negative-extreme) matches numpy.max."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
-    _fill_field(inp, host)
-
-    qd.algorithms.device_reduce_max(inp, out=out)
-    got = out.to_numpy()[0]
-    expected = host.max()
+def test_device_reduce(op, dtype, N):
+    """``device_reduce_{add,min,max}`` match numpy across the full size sweep + dtype set.
 
-    if _is_float(dtype):
-        assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12)
-    else:
-        assert int(got) == int(expected), f"{dtype} reduce_max(N={N}): got {got}, expected {expected}"
+    Unified across the three op variants. ``add`` accumulates so it needs overflow / precision-aware comparison;
+    ``min`` / ``max`` pick one element of the input and are bitwise-exact.
+    """
+    _check_reduce(op, dtype, N)
 
 
 @test_utils.test(arch=qd.gpu)
@@ -454,101 +447,80 @@ def _scan_dtype_mask(dtype):
     return -1
 
 
-@pytest.mark.parametrize("N", _SCAN_SIZES)
-@pytest.mark.parametrize("dtype", _SCAN_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_exclusive_scan_add(dtype, N):
-    """device_exclusive_scan_add(out[i] = sum(arr[0:i])) matches numpy.cumsum-shifted across the full 6-dtype set."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_scan_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    host = _rand_reduce_host(rng, dtype, N, bound=100)
-    _fill_field(inp, host)
+_SCAN_OPS = ["add", "min", "max"]
 
-    qd.algorithms.device_exclusive_scan_add(inp, out=out)
-    got = out.to_numpy()
 
+def _scan_host(rng, op, dtype, N):
+    """Generate the test input for a scan of `op` on `dtype` x N values. Same rationale as ``_reduce_host``."""
+    if op == "add":
+        return _rand_reduce_host(rng, dtype, N, bound=100)
     if _is_float(dtype):
-        ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]])
-        rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
-        np.testing.assert_allclose(
-            got.astype(np.float64),
-            ref,
-            rtol=rtol,
-            atol=atol,
-            err_msg=f"{dtype} scan_add(N={N})",
-        )
-    else:
-        # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference; smaller
-        # ints can still use int64.
-        promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64
-        host_wide = host.astype(promote)
-        ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote)
-        mask = _scan_dtype_mask(dtype)
-        got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64)
-        if mask != -1:
-            got_view = got_view & promote(mask)
-            ref = ref & promote(mask)
-        np.testing.assert_array_equal(
-            got_view,
-            ref,
-            err_msg=f"{dtype} scan_add(N={N})",
-        )
+        return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
+    return _rand_reduce_host(rng, dtype, N, bound=10000)
 
 
-@pytest.mark.parametrize("N", _SCAN_SIZES)
-@pytest.mark.parametrize("dtype", _SCAN_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_exclusive_scan_min(dtype, N):
-    """device_exclusive_scan_min(out[i] = min(arr[0:i])) matches numpy.minimum.accumulate-shifted across the full
-    6-dtype set."""
+def _check_scan(op, dtype, N):
+    """Run ``device_exclusive_scan_<op>(arr)`` and verify against ``numpy.<op>.accumulate``-shifted.
+
+    Like the reduce family, ``add`` accumulates (overflow / precision care) while ``min`` / ``max`` are
+    bitwise-exact in both float and int paths.
+    """
     _skip_if_dtype_unsupported(dtype)
     inp, out = _alloc_scan_input_out(dtype, N)
     rng = np.random.default_rng(seed=1234)
     np_dt = _DTYPE_TO_NP[dtype]
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt)
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
+    host = _scan_host(rng, op, dtype, N)
     _fill_field(inp, host)
 
-    qd.algorithms.device_exclusive_scan_min(inp, out=out)
+    qd_fn = getattr(qd.algorithms, f"device_exclusive_scan_{op}")
+    qd_fn(inp, out=out)
     got = out.to_numpy()
 
+    if op == "add":
+        if _is_float(dtype):
+            ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]])
+            rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
+            np.testing.assert_allclose(
+                got.astype(np.float64),
+                ref,
+                rtol=rtol,
+                atol=atol,
+                err_msg=f"{dtype} scan_add(N={N})",
+            )
+        else:
+            # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference;
+            # smaller ints can still use int64.
+            promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64
+            host_wide = host.astype(promote)
+            ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote)
+            mask = _scan_dtype_mask(dtype)
+            got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64)
+            if mask != -1:
+                got_view = got_view & promote(mask)
+                ref = ref & promote(mask)
+            np.testing.assert_array_equal(got_view, ref, err_msg=f"{dtype} scan_add(N={N})")
+        return
+
+    np_accum = np.minimum.accumulate if op == "min" else np.maximum.accumulate
+    identity_table = _MIN_IDENTITY if op == "min" else _MAX_IDENTITY
     if _is_float(dtype):
-        ref = np.concatenate([[float("inf")], np.minimum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt)
-        atol = 0 if dtype == qd.f32 else 0  # min is bitwise-exact for monotone ops on float
-        np.testing.assert_allclose(got, ref, rtol=0, atol=atol, err_msg=f"{dtype} scan_min(N={N})")
+        identity = float("inf") if op == "min" else float("-inf")
+        ref = np.concatenate([[identity], np_accum(host.astype(np.float64))[:-1]]).astype(np_dt)
+        np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_{op}(N={N})")
     else:
-        ref = np.concatenate([[np_dt(_MIN_IDENTITY[dtype])], np.minimum.accumulate(host)[:-1]]).astype(np_dt)
-        np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_min(N={N})")
+        ref = np.concatenate([[np_dt(identity_table[dtype])], np_accum(host)[:-1]]).astype(np_dt)
+        np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_{op}(N={N})")
 
 
+@pytest.mark.parametrize("op", _SCAN_OPS)
 @pytest.mark.parametrize("N", _SCAN_SIZES)
 @pytest.mark.parametrize("dtype", _SCAN_DTYPES)
 @test_utils.test(arch=qd.gpu)
-def test_device_exclusive_scan_max(dtype, N):
-    """device_exclusive_scan_max(out[i] = max(arr[0:i])) matches numpy.maximum.accumulate-shifted across the full
-    6-dtype set."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_scan_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    np_dt = _DTYPE_TO_NP[dtype]
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt)
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
-    _fill_field(inp, host)
-
-    qd.algorithms.device_exclusive_scan_max(inp, out=out)
-    got = out.to_numpy()
-
-    if _is_float(dtype):
-        ref = np.concatenate([[float("-inf")], np.maximum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt)
-        np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_max(N={N})")
-    else:
-        ref = np.concatenate([[np_dt(_MAX_IDENTITY[dtype])], np.maximum.accumulate(host)[:-1]]).astype(np_dt)
-        np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_max(N={N})")
+def test_device_exclusive_scan(op, dtype, N):
+    """``device_exclusive_scan_{add,min,max}`` match ``numpy.{cumsum, minimum.accumulate, maximum.accumulate}``-shifted
+    across the full size sweep + dtype set. Unified across the three op variants; same overflow vs bitwise-exact
+    handling as the reduce family."""
+    _check_scan(op, dtype, N)
 
 
 @test_utils.test(arch=qd.gpu)

From 8acaaecd8d238080747d8a67ffbef7db708a63ac Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:03:21 -0700
Subject: [PATCH 08/15] [Style] black: reformat
 test_tile16_cholesky_blocked_demo cmd list + run_tests help string

Pure formatting fix from `pre-commit run -a`; no behavior change.
---
 tests/python/test_tile16.py | 15 ++++++++++-----
 tests/run_tests.py          |  3 +--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index 6d917e11ad..f94d4221e1 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -1785,11 +1785,16 @@ def test_tile16_cholesky_blocked_demo():
     """
     demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py"
     cmd = [
-        sys.executable, str(demo),
-        "--n", "32",
-        "--n-envs", "64",
-        "--num-warmup", "1",
-        "--num-iters", "1",
+        sys.executable,
+        str(demo),
+        "--n",
+        "32",
+        "--n-envs",
+        "64",
+        "--num-warmup",
+        "1",
+        "--num-iters",
+        "1",
     ]
     result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
     if result.returncode != 0:
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 47d5574ad0..bf37ab2aa7 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -168,8 +168,7 @@ def test():
         default=None,
         dest="marks",
         type=str,
-        help="Only run tests with specific marks. `not slow` is appended automatically "
-        "unless --run-slow is passed.",
+        help="Only run tests with specific marks. `not slow` is appended automatically " "unless --run-slow is passed.",
     )
     parser.add_argument(
         "--run-slow",

From e57752abdeab48ada14ccf77af8b7dc54b5f0acd Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:09:25 -0700
Subject: [PATCH 09/15] [Test] test_subgroup_full_matches_tiled: fuse 20 thin
 subgroup-op wrappers into 2 op-parametrized tests

Lines 3608-3694 in test_simt.py were 18 ~5-line wrappers each calling
``_check_full_matches_tiled(subgroup.<op>, subgroup.<op>_tiled, ...)``.
Lines 3841-3848 were 2 more, parametrized on dtype. ``_check_full_matches_tiled``
already accepts the full / tiled functions as Python arguments (closure-captured
into ``@qd.kernel``), so collapsing the family is a pure dedup move:

  test_subgroup_full_matches_tiled(op_name, host_init)
      # 18 cases on qd.i32: reduce{,_all}_{add,min,max},
      # inclusive_{add,min,max,mul,and,or,xor}, exclusive_{add,mul,and,or,xor}

  test_subgroup_full_matches_tiled_float(op_name, dtype)
      # 4 cases: {reduce_add, inclusive_add} x {qd.f32, qd.f64}

Behavior + coverage unchanged (still 22 parametrize cases, same dtype + init
configurations). Pytest ids are designed to match the original test-name
suffixes (e.g. ``[reduce_add]``, ``[inclusive_mul]``) so ``-k`` selectors and
test reports stay readable. Drops ~50 LOC net.
---
 tests/python/test_simt.py | 136 ++++++++++++--------------------------
 1 file changed, 43 insertions(+), 93 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 95e3438e41..6790d3afb5 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -3604,94 +3604,45 @@ def _init_full_bitwise(src, n):
         src[i] = 1 << (i % 7)
 
 
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_add():
-    _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_all_add():
-    _check_full_matches_tiled(subgroup.reduce_all_add, subgroup.reduce_all_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_min():
-    _check_full_matches_tiled(subgroup.reduce_min, subgroup.reduce_min_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_max():
-    _check_full_matches_tiled(subgroup.reduce_max, subgroup.reduce_max_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_all_min():
-    _check_full_matches_tiled(subgroup.reduce_all_min, subgroup.reduce_all_min_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_all_max():
-    _check_full_matches_tiled(subgroup.reduce_all_max, subgroup.reduce_all_max_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_add():
-    _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_min():
-    _check_full_matches_tiled(subgroup.inclusive_min, subgroup.inclusive_min_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_max():
-    _check_full_matches_tiled(subgroup.inclusive_max, subgroup.inclusive_max_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_mul():
-    _check_full_matches_tiled(subgroup.inclusive_mul, subgroup.inclusive_mul_tiled, host_init=_init_full_small_int)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_and():
-    _check_full_matches_tiled(subgroup.inclusive_and, subgroup.inclusive_and_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_or():
-    _check_full_matches_tiled(subgroup.inclusive_or, subgroup.inclusive_or_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_xor():
-    _check_full_matches_tiled(subgroup.inclusive_xor, subgroup.inclusive_xor_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_add():
-    _check_full_matches_tiled(subgroup.exclusive_add, subgroup.exclusive_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_mul():
-    _check_full_matches_tiled(subgroup.exclusive_mul, subgroup.exclusive_mul_tiled, host_init=_init_full_small_int)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_and():
-    _check_full_matches_tiled(subgroup.exclusive_and, subgroup.exclusive_and_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_or():
-    _check_full_matches_tiled(subgroup.exclusive_or, subgroup.exclusive_or_tiled, host_init=_init_full_bitwise)
+# Each entry is a thin ``_check_full_matches_tiled(subgroup.X, subgroup.X_tiled, ...)`` wrapper. Collapsed into one
+# op-parametrized test to drop ~80 LOC of duplication. The pytest ids match the names of the original
+# ``test_subgroup_<op>`` functions so test reports / `-k` selectors stay stable.
+_FULL_VS_TILED_INT_CASES = [
+    pytest.param("reduce_add", None, id="reduce_add"),
+    pytest.param("reduce_all_add", None, id="reduce_all_add"),
+    pytest.param("reduce_min", None, id="reduce_min"),
+    pytest.param("reduce_max", None, id="reduce_max"),
+    pytest.param("reduce_all_min", None, id="reduce_all_min"),
+    pytest.param("reduce_all_max", None, id="reduce_all_max"),
+    pytest.param("inclusive_add", None, id="inclusive_add"),
+    pytest.param("inclusive_min", None, id="inclusive_min"),
+    pytest.param("inclusive_max", None, id="inclusive_max"),
+    # `mul` needs bounded inputs (2**N overflows i32 quickly); bitwise ops need a per-lane bit pattern that's
+    # non-zero on every lane so AND has signal and OR / XOR have varied bits.
+    pytest.param("inclusive_mul", _init_full_small_int, id="inclusive_mul"),
+    pytest.param("inclusive_and", _init_full_bitwise, id="inclusive_and"),
+    pytest.param("inclusive_or", _init_full_bitwise, id="inclusive_or"),
+    pytest.param("inclusive_xor", _init_full_bitwise, id="inclusive_xor"),
+    pytest.param("exclusive_add", None, id="exclusive_add"),
+    pytest.param("exclusive_mul", _init_full_small_int, id="exclusive_mul"),
+    pytest.param("exclusive_and", _init_full_bitwise, id="exclusive_and"),
+    pytest.param("exclusive_or", _init_full_bitwise, id="exclusive_or"),
+    pytest.param("exclusive_xor", _init_full_bitwise, id="exclusive_xor"),
+]
 
 
+@pytest.mark.parametrize("op_name,host_init", _FULL_VS_TILED_INT_CASES)
 @test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_xor():
-    _check_full_matches_tiled(subgroup.exclusive_xor, subgroup.exclusive_xor_tiled, host_init=_init_full_bitwise)
+def test_subgroup_full_matches_tiled(op_name, host_init):
+    """For each subgroup op ``X``, verify ``subgroup.X(v)`` matches ``subgroup.X_tiled(v, log2_group_size())``
+    lane-by-lane on ``qd.i32``. Covers reduce / inclusive / exclusive families; bitwise ops + ``mul`` use a custom
+    initializer that keeps the per-lane aggregate bounded."""
+    full_fn = getattr(subgroup, op_name)
+    tiled_fn = getattr(subgroup, f"{op_name}_tiled")
+    kwargs = {}
+    if host_init is not None:
+        kwargs["host_init"] = host_init
+    _check_full_matches_tiled(full_fn, tiled_fn, **kwargs)
 
 
 @test_utils.test(arch=qd.gpu)
@@ -3836,16 +3787,15 @@ def k():
 # accidentally cast through i32 inside a wrapper.
 
 
+@pytest.mark.parametrize("op_name", ["reduce_add", "inclusive_add"])
 @pytest.mark.parametrize("dtype", [qd.f32, qd.f64])
 @test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_add_float(dtype):
-    _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled, dtype=dtype)
-
-
-@pytest.mark.parametrize("dtype", [qd.f32, qd.f64])
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_add_float(dtype):
-    _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled, dtype=dtype)
+def test_subgroup_full_matches_tiled_float(op_name, dtype):
+    """Float-dtype coverage of the dtype-agnostic ``full`` wrappers (``reduce_add``, ``inclusive_add``). One f32 + one
+    f64 case per family is enough to catch an i32-only regression in a wrapper."""
+    full_fn = getattr(subgroup, op_name)
+    tiled_fn = getattr(subgroup, f"{op_name}_tiled")
+    _check_full_matches_tiled(full_fn, tiled_fn, dtype=dtype)
 
 
 @pytest.mark.parametrize("dtype", [qd.f32, qd.f64])

From 4c18c86e59a275898f93adb71f5ee788d2d1d076 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:13:10 -0700
Subject: [PATCH 10/15] [Test] test_block_reduce{,_all}: fuse {add,min,max}
 into op-parametrized tests

The six block-reduce tests (3 single-output + 3 broadcast) share an identical
kernel skeleton, parametrize axes, and verification loop. They only differ in
which `block.reduce_*` function is called (closure-captured into `@qd.kernel`
via getattr), the host-side reference oracle, the init pattern (sequential for
`add` so the running sum has signal; permuted hash for `min` / `max` so the
result depends on lanes other than first / last), and the float tolerance
regime (relative for accumulating `add`, absolute for picker `min` / `max`).
Collapse the six tests into two op-parametrized tests:

  test_block_reduce(sg_per_block, dtype, op_name, ...)        # single-output, 3 ops
  test_block_reduce_all(sg_per_block, dtype, op_name, ...)    # broadcast, 3 ops

Parametrize space is unchanged (3 sg x 5 dtype x 3 op = 45 cases per fused
test, matching the original 3 tests x 15 cases each). Pytest ids use plain
`[add|min|max]` suffixes so `-k` selectors remain readable. Drops ~100 LOC of
boilerplate -- two new small helpers (`_init_block_reduce_src` and
`_assert_block_reduce_close`) capture the per-op behavioral differences in one
place each.
---
 tests/python/test_simt.py | 210 ++++++++++----------------------------
 1 file changed, 54 insertions(+), 156 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 6790d3afb5..5b7d7490cd 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -887,81 +887,57 @@ def _ref_reduce_max(values):
     return max(values)
 
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_add(dtype, sg_per_block):
-    """Block sum-reduce: thread 0 of each block holds `sum(src[block_base:block_base+block_dim])`."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=NUM_BLOCKS)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            tid = i % block_dim
-            agg = block.reduce_add(src[i], block_dim, dtype)
-            if tid == 0:
-                dst[i // block_dim] = agg
-
-    _init_field(src, N, dtype)
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_add(block_vals)
-        if dtype in _BLOCK_REDUCE_INT_DTYPES:
-            assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}"
-        else:
-            assert abs(dst[b] - expected) < 1e-4 * abs(expected), f"block {b}: got {dst[b]}, expected {expected}"
-
+# The three single-output reduces (`test_block_reduce_{add,min,max}`) and their three broadcast siblings
+# (`test_block_reduce_all_{add,min,max}`) share the same kernel skeleton, parametrize axes, and verification loop;
+# they differ only in (a) which `block.reduce_*` function gets called, (b) the host-side reference oracle, (c) the
+# init pattern (sequential for `add` so the running sum has signal, permuted hash for `min` / `max` so the result
+# depends on lanes other than first / last), and (d) the float tolerance regime (`add` accumulates so it uses a
+# relative tol; `min` / `max` pick one element of the input and use an absolute tol).
+_BLOCK_REDUCE_OP_CASES = [
+    # (op_name, ref_fn, init_permuted, tol_relative)
+    pytest.param("add", _ref_reduce_add, False, True, id="add"),
+    pytest.param("min", _ref_reduce_min, True, False, id="min"),
+    pytest.param("max", _ref_reduce_max, True, False, id="max"),
+]
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_min(dtype, sg_per_block):
-    """Block min-reduce: thread 0 of each block holds `min(src[block_base:block_base+block_dim])`."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=NUM_BLOCKS)
 
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
+def _init_block_reduce_src(src, N, dtype, *, permuted):
+    """Initialize ``src[0:N]`` for a block reduce test. ``permuted=False`` is the sequential ``1..N`` init from
+    ``_init_field`` (good for add); ``permuted=True`` is the stable hash ``((i * 1009) % 997) + 1`` so the per-block
+    min / max depends on lanes other than first / last."""
+    if permuted:
         for i in range(N):
-            tid = i % block_dim
-            agg = block.reduce_min(src[i], block_dim, dtype)
-            if tid == 0:
-                dst[i // block_dim] = agg
+            v = ((i * 1009) % 997) + 1
+            src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    else:
+        _init_field(src, N, dtype)
 
-    # Permuted (non-monotone) initialisation so the min depends on lanes other than the first / last.
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1  # in [1, 997]; stable hash, no collisions w/ block_dim values up to 256
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
-    foo()
 
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_min(block_vals)
-        if dtype in _BLOCK_REDUCE_INT_DTYPES:
-            assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}"
-        else:
-            assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}"
+def _assert_block_reduce_close(actual, expected, dtype, *, tol_relative, ctx):
+    """Assert ``actual ~= expected`` per the block-reduce tolerance regime.
+
+    Int dtypes compare exactly. Floats use relative tolerance ``1e-4 * |expected|`` for accumulating ops (sums grow
+    with block_dim, so a relative bound is the only thing that stays meaningful across the 32 / 128 / 256 / 64 / 256 /
+    512 block-size sweep), and absolute tolerance ``1e-5`` for picker ops (min / max pick one element so the
+    magnitude is whatever was in the input -- a small absolute bound suffices).
+    """
+    if dtype in _BLOCK_REDUCE_INT_DTYPES:
+        assert actual == expected, f"{ctx}: got {actual}, expected {expected}"
+    elif tol_relative:
+        assert abs(actual - expected) < 1e-4 * abs(expected), f"{ctx}: got {actual}, expected {expected}"
+    else:
+        assert abs(actual - expected) < 1e-5, f"{ctx}: got {actual}, expected {expected}"
 
 
+@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_reduce_max(dtype, sg_per_block):
-    """Block max-reduce: thread 0 of each block holds `max(src[block_base:block_base+block_dim])`."""
+def test_block_reduce(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
+    """Block reduce: thread 0 of each block holds ``<op>(src[block_base:block_base+block_dim])``. Unified across
+    ``add`` / ``min`` / ``max`` -- op-name is closure-captured into ``@qd.kernel``."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"reduce_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -973,102 +949,29 @@ def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
             tid = i % block_dim
-            agg = block.reduce_max(src[i], block_dim, dtype)
+            agg = op_fn(src[i], block_dim, dtype)
             if tid == 0:
                 dst[i // block_dim] = agg
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_max(block_vals)
-        if dtype in _BLOCK_REDUCE_INT_DTYPES:
-            assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}"
-        else:
-            assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}"
-
-
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_all_add(dtype, sg_per_block):
-    """Block sum-reduce broadcast: every thread of each block holds the block-wide sum.
-
-    Verifies the broadcast variant by writing the per-thread output to a flat field, then asserting every thread of a
-    given block reads the same aggregate.
-    """
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.reduce_all_add(src[i], block_dim, dtype)
-
-    _init_field(src, N, dtype)
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_add(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}"
-            else:
-                assert abs(actual - expected) < 1e-4 * abs(
-                    expected
-                ), f"block {b} thread {j}: got {actual}, expected {expected}"
-
-
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_all_min(dtype, sg_per_block):
-    """Block min-reduce broadcast: every thread reads the block-wide min."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.reduce_all_min(src[i], block_dim, dtype)
-
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
     foo()
 
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_min(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}"
-            else:
-                assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}"
+        expected = ref_fn(block_vals)
+        _assert_block_reduce_close(dst[b], expected, dtype, tol_relative=tol_relative, ctx=f"block {b}")
 
 
+@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_reduce_all_max(dtype, sg_per_block):
-    """Block max-reduce broadcast: every thread reads the block-wide max."""
+def test_block_reduce_all(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
+    """Block reduce broadcast: every thread of each block holds the block-wide ``<op>``. Verified by writing the
+    per-thread output to a flat field, then asserting every thread of a given block reads the same aggregate.
+    Unified across ``add`` / ``min`` / ``max``."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"reduce_all_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -1079,22 +982,17 @@ def test_block_reduce_all_max(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.reduce_all_max(src[i], block_dim, dtype)
+            dst[i] = op_fn(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
     foo()
 
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_max(block_vals)
+        expected = ref_fn(block_vals)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}"
-            else:
-                assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}"
+            _assert_block_reduce_close(actual, expected, dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
 
 
 # --- Block scan tests ------------------------------------------------------------------

From 5fb930ef7d2e862cc358e35372a1ac139ce15556 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:14:47 -0700
Subject: [PATCH 11/15] [Test] test_block_inclusive: fuse {add,min,max} into
 one op-parametrized test

The three block inclusive scan tests share the same kernel skeleton and only
differ in the closure-captured `block.inclusive_<op>` function, the host-side
reference oracle, the init pattern (sequential for `add` -- sums grow with
prefix length; permuted for `min` / `max` -- result depends on lanes other
than first / last), and the float tolerance regime (relative for `add`,
absolute for `min` / `max`). Collapse into one op-parametrized test:

  test_block_inclusive(sg_per_block, dtype, op_name, ...)

Identical param count to the original three tests (3 sg x 5 dtype x 3 op =
45 cases vs original 3 x 15). Pulls a shared `_assert_block_scan_close`
helper out so the int / relative-float / absolute-float regime is encoded in
one place; the relative-float branch keeps the floor-on-tol-base trick
needed by the original `test_block_exclusive_add` (also routed through the
same helper). One tolerance change to note: the float `inclusive_add` check
previously used `1e-4 * |expected + 1|` and now uses the floored base
`1e-4 * max(|expected|, 1)`, which is marginally tighter for positive
expected values. `test_block_exclusive_add` stays as its own function for
now because the matching exclusive `min` / `max` cases need dtype-derived
sentinel identities plus `isinf` handling, which is different enough that
fusing them in would create more branches than it removes; that can be
addressed in a follow-up if needed.
---
 tests/python/test_simt.py | 131 ++++++++++++--------------------------
 1 file changed, 41 insertions(+), 90 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 5b7d7490cd..96aeb1e4dc 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -1045,82 +1045,45 @@ def _ref_exclusive_scan_op(values, op, identity):
     return out
 
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_inclusive_add(dtype, sg_per_block):
-    """Block inclusive prefix sum: thread `i` holds `sum(src[block_base..i])`."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.inclusive_add(src[i], block_dim, dtype)
-
-    _init_field(src, N, dtype)
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_inclusive_scan_add(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-4 * abs(
-                    expected[j] + 1.0
-                ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-
-
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_exclusive_add(dtype, sg_per_block):
-    """Block exclusive prefix sum: thread `i` holds `sum(src[block_base..i-1])`; thread 0 holds 0."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.exclusive_add(src[i], block_dim, dtype)
+# The four scan tests in this group (`test_block_inclusive_{add,min,max}` + `test_block_exclusive_add`) share the
+# kernel skeleton; only the per-op reference oracle, init pattern, and float tolerance differ. `add` accumulates
+# (sequential init, relative tol); `min` / `max` pick (permuted init, absolute tol). Exclusive `min` / `max` get
+# their own dedicated test below because they need a dtype-derived sentinel identity (+inf / iinfo(max), -inf /
+# iinfo(min)) at lane 0 with explicit ``isinf`` handling -- different enough that fusing them in would create more
+# branches than it removes.
+_PY_MIN = lambda a, b: a if a < b else b  # noqa: E731 (intentional 1-line lambda for ref oracle)
+_PY_MAX = lambda a, b: a if a > b else b  # noqa: E731
+
+_BLOCK_INCLUSIVE_SCAN_OP_CASES = [
+    # (op_name, ref_fn, init_permuted, tol_relative)
+    pytest.param("add", _ref_inclusive_scan_add, False, True, id="add"),
+    pytest.param("min", lambda vals: _ref_inclusive_scan_op(vals, _PY_MIN, 0), True, False, id="min"),
+    pytest.param("max", lambda vals: _ref_inclusive_scan_op(vals, _PY_MAX, 0), True, False, id="max"),
+]
 
-    _init_field(src, N, dtype)
-    foo()
 
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_exclusive_scan_add(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                # First thread's expected is 0; gate the relative tolerance so it doesn't blow up.
-                tol_base = abs(expected[j]) if abs(expected[j]) > 1.0 else 1.0
-                assert (
-                    abs(actual - expected[j]) < 1e-4 * tol_base
-                ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+def _assert_block_scan_close(actual, expected_j, dtype, *, tol_relative, ctx):
+    """Per-thread assertion for block scan tests. Same int / relative-float / absolute-float regime as
+    ``_assert_block_reduce_close`` but with a floor on the relative-tol base so the first few prefixes (where
+    ``expected_j`` is near zero) don't tighten the bound to zero."""
+    if dtype in _BLOCK_REDUCE_INT_DTYPES:
+        assert actual == expected_j, f"{ctx}: got {actual}, expected {expected_j}"
+    elif tol_relative:
+        tol_base = abs(expected_j) if abs(expected_j) > 1.0 else 1.0
+        assert abs(actual - expected_j) < 1e-4 * tol_base, f"{ctx}: got {actual}, expected {expected_j}"
+    else:
+        assert abs(actual - expected_j) < 1e-5, f"{ctx}: got {actual}, expected {expected_j}"
 
 
+@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_INCLUSIVE_SCAN_OP_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_inclusive_min(dtype, sg_per_block):
-    """Block inclusive prefix min."""
+def test_block_inclusive(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
+    """Block inclusive prefix scan: thread ``i`` holds ``<op>(src[block_base..i])``. Unified across ``add`` / ``min``
+    / ``max``."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"inclusive_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -1131,30 +1094,24 @@ def test_block_inclusive_min(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.inclusive_min(src[i], block_dim, dtype)
+            dst[i] = op_fn(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
     foo()
 
-    py_min = lambda a, b: a if a < b else b  # noqa: E731 (intentional 1-line lambda for ref oracle)
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_inclusive_scan_op(block_vals, py_min, 0)
+        expected = ref_fn(block_vals)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+            _assert_block_scan_close(actual, expected[j], dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
 
 
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_inclusive_max(dtype, sg_per_block):
-    """Block inclusive prefix max."""
+def test_block_exclusive_add(dtype, sg_per_block):
+    """Block exclusive prefix sum: thread ``i`` holds ``sum(src[block_base..i-1])``; thread 0 holds 0."""
     _skip_if_f64_unsupported(dtype)
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
@@ -1166,23 +1123,17 @@ def test_block_inclusive_max(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.inclusive_max(src[i], block_dim, dtype)
+            dst[i] = block.exclusive_add(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_field(src, N, dtype)
     foo()
 
-    py_max = lambda a, b: a if a > b else b  # noqa: E731
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_inclusive_scan_op(block_vals, py_max, 0)
+        expected = _ref_exclusive_scan_add(block_vals)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+            _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}")
 
 
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)

From aa25a36d4c125c0e784a91572f830520e3f095bc Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:17:25 -0700
Subject: [PATCH 12/15] [Test] test_block_exclusive_minmax: fuse {min,max} into
 one op-parametrized test

`test_block_exclusive_min` and `test_block_exclusive_max` share the same
permuted-init pattern and only differ in the dtype-derived sentinel identity
(``+inf`` / ``iinfo.max`` for min, ``-inf`` / ``iinfo.min`` for max) and the
inf-sign check at lane 0. Collapse into one op-parametrized test that takes
``(op_name, sentinel_fn, py_op, inf_sign)`` and dispatches via getattr +
the (already module-level) `_PY_MIN` / `_PY_MAX` lambdas.

Identical param count to the original pair (3 sg x 5 dtype x 2 op = 30 cases
vs original 2 x 15 each = 30). `test_block_exclusive_add` remains its own
function because the integer identity is `0` (not `iinfo.max/min`) and the
init pattern is sequential -- different enough that fusing it in would add
more branches than it removes. Drops ~30 LOC.
---
 tests/python/test_simt.py | 71 ++++++++++++---------------------------
 1 file changed, 21 insertions(+), 50 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 96aeb1e4dc..8c44a40bf9 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -1136,12 +1136,24 @@ def foo():
             _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}")
 
 
+_BLOCK_EXCLUSIVE_MINMAX_CASES = [
+    # (op_name, sentinel_fn, py_op, inf_sign)
+    pytest.param("min", _block_exclusive_min_sentinel, _PY_MIN, 1, id="min"),
+    pytest.param("max", _block_exclusive_max_sentinel, _PY_MAX, -1, id="max"),
+]
+
+
+@pytest.mark.parametrize("op_name,sentinel_fn,py_op,inf_sign", _BLOCK_EXCLUSIVE_MINMAX_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_exclusive_min(dtype, sg_per_block):
-    """Block exclusive prefix min; thread 0 holds the dtype-derived identity (``+inf`` / ``np.iinfo(dtype).max``)."""
+def test_block_exclusive_minmax(dtype, sg_per_block, op_name, sentinel_fn, py_op, inf_sign):
+    """Block exclusive prefix ``<op>`` for ``op in {min, max}``; thread 0 of each block holds the dtype-derived
+    identity (``+inf`` / ``iinfo(dtype).max`` for min, ``-inf`` / ``iinfo(dtype).min`` for max). The float ``inf`` /
+    ``-inf`` lane-0 identity gets a sign-only check because ``inf - inf`` (or ``(-inf) - (-inf)``) is ``NaN`` and the
+    standard ``abs(diff) < tol`` compare would fail spuriously."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"exclusive_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -1152,25 +1164,23 @@ def test_block_exclusive_min(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.exclusive_min(src[i], block_dim, dtype)
+            dst[i] = op_fn(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=True)
     foo()
 
-    sentinel = _block_exclusive_min_sentinel(dtype)
-    py_min = lambda a, b: a if a < b else b  # noqa: E731
+    sentinel = sentinel_fn(dtype)
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_exclusive_scan_op(block_vals, py_min, sentinel)
+        expected = _ref_exclusive_scan_op(block_vals, py_op, sentinel)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
             if dtype in _BLOCK_REDUCE_INT_DTYPES:
                 assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
             elif math.isinf(expected[j]):
-                # Thread 0 of each block gets the +inf identity; ``inf - inf`` is NaN, so check by equality / sign.
-                assert math.isinf(actual) and actual > 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+                assert math.isinf(actual) and (
+                    actual > 0 if inf_sign > 0 else actual < 0
+                ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
             else:
                 assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
 
@@ -1304,45 +1314,6 @@ def kern():
     assert actual_ranks == ref_ranks, f"ranks mismatch (pattern={key_pattern})"
 
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_exclusive_max(dtype, sg_per_block):
-    """Block exclusive prefix max; thread 0 holds the dtype-derived identity (``-inf`` / ``np.iinfo(dtype).min``)."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.exclusive_max(src[i], block_dim, dtype)
-
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
-    foo()
-
-    sentinel = _block_exclusive_max_sentinel(dtype)
-    py_max = lambda a, b: a if a > b else b  # noqa: E731
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_exclusive_scan_op(block_vals, py_max, sentinel)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            elif math.isinf(expected[j]):
-                # Thread 0 of each block gets the -inf identity; ``-inf - -inf`` is NaN, so check by equality / sign.
-                assert math.isinf(actual) and actual < 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-
-
 @pytest.mark.parametrize("dtype", [qd.i32, qd.f32, qd.f64])
 @test_utils.test(arch=qd.gpu)
 def test_subgroup_shuffle_broadcast(dtype):

From 11b3a89a6e4cc6672aa831215fface7bb4398ede Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 11:03:15 -0700
Subject: [PATCH 13/15] [Style] Reflow CI-flagged 80c-wrapped comments to 120c

The PR's `Check line wrapping` CI agent flagged three comments wrapped at
the AI-default ~78-90c instead of the project's 120c target. Reflow each
to the full target width:

  - tests/python/test_tile16.py:1791  (78c -> 120c) docstring for
    test_tile16_cholesky_blocked_demo.
  - tests/python/test_ad_gdar_diffmpm.py:8  (85c -> 120c) the
    "defaults shrink ..." comment above the parametrize block.
  - tests/run_tests.py:60  (90c -> 120c) the "--run-slow opts back in"
    comment.

Also collapse the dangling-backslash continuation in
misc/demos/cholesky_blocked.py's Usage example onto one line (69c -> 109c).

No behavior change; comments and docstrings only. Verified via the cursor
find-underwrapped skill that the remaining flagged runs in my diff are
all 103-116c with save~=0 (already-tight runs that the greedy heuristic
still reports), comfortably within the agent's "not borderline" exemption.
---
 misc/demos/cholesky_blocked.py       | 3 +--
 tests/python/test_ad_gdar_diffmpm.py | 9 ++++-----
 tests/python/test_tile16.py          | 7 +++----
 tests/run_tests.py                   | 7 +++----
 4 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py
index b4c60c1810..3c72dd39fd 100644
--- a/misc/demos/cholesky_blocked.py
+++ b/misc/demos/cholesky_blocked.py
@@ -21,8 +21,7 @@
     tile16   (Tile16x16, no shared memory)             16        533        5.19x
 
 Usage:
-    python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] \
-        [--num-warmup WARMUP] [--num-iters ITERS]
+    python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] [--num-warmup WARMUP] [--num-iters ITERS]
 """
 
 import argparse
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index 0e0e460534..8fd3c56d56 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -5,11 +5,10 @@
 from tests import test_utils
 
 
-# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay
-# stays cheap; the slow-marked entry keeps the original (N=30, n_grid=120, steps=32)
-# workload that runs on --run-slow. The point of the test is that the AD-validation
-# checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which
-# happens on the first substep regardless of size.
+# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay stays cheap; the slow-marked
+# entry keeps the original (N=30, n_grid=120, steps=32) workload that runs on --run-slow. The point of the test is
+# that the AD-validation checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which happens
+# on the first substep regardless of size.
 @pytest.mark.parametrize(
     "particles_side,n_grid_size,num_steps",
     [
diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index f94d4221e1..adf8249605 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -1778,10 +1778,9 @@ def write_eye_f32(dst: Ann32):
 def test_tile16_cholesky_blocked_demo():
     """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.
 
-    Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the
-    JIT compile of the 3 unrolled kernels and the benchmark loop both stay
-    cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised
-    by anyone running the script manually, not by CI.
+    Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the JIT compile of the 3 unrolled kernels
+    and the benchmark loop both stay cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised by
+    anyone running the script manually, not by CI.
     """
     demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py"
     cmd = [
diff --git a/tests/run_tests.py b/tests/run_tests.py
index bf37ab2aa7..7276ce9d00 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -56,10 +56,9 @@ def _test_python(args, default_dir="python"):
             pytest_args += ["--cov-append"]
         if args.keys:
             pytest_args += ["-k", args.keys]
-        # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large
-        # at n>=6, mpm88, etc. — see tests/pytest.ini for the marker). `--run-slow` opts
-        # back in. If the user passes their own `-m` expression we AND `not slow` onto it
-        # so the exclusion still applies, unless they explicitly opt out via `--run-slow`.
+        # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large at n>=6, mpm88, etc. -- see
+        # tests/pytest.ini for the marker). `--run-slow` opts back in. If the user passes their own `-m` expression we
+        # AND `not slow` onto it so the exclusion still applies, unless they explicitly opt out via `--run-slow`.
         marks_expr = args.marks
         if not args.run_slow:
             marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow"

From 7389b5fa83a9bc50e3c1f5e92a3005fc1a4bf660 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 11:12:48 -0700
Subject: [PATCH 14/15] [Doc] contributing.md: shorten testing bullet per PR
 review

Hugh requested in PR #709 review comment that the testing bullet collapse
to just a pointer at unit_testing.md, since the long inline summary
duplicates the dedicated doc immediately below.
---
 docs/source/user_guide/contributing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/user_guide/contributing.md b/docs/source/user_guide/contributing.md
index ec97b9529f..3573179e84 100644
--- a/docs/source/user_guide/contributing.md
+++ b/docs/source/user_guide/contributing.md
@@ -2,7 +2,7 @@
 
 ## Good practice reminder
 
-* *testing*: Any new features or modified code should be tested. You have to run the test suite using `python tests/run_tests.py` which sets up the right test environment for `pytest`. CLI arguments are forwarded to `pytest`. Do not use `pytest` directly as it behaves differently. To see a per-file timing breakdown (useful for identifying slow test files), set `QD_FILE_TIMING=1` — e.g. `QD_FILE_TIMING=1 python tests/run_tests.py`. This is enabled by default in the Mac CI job and the results appear in the GitHub Actions job summary.
+* *testing*: Any new features or modified code should be tested. See [unit_testing.md](unit_testing.md).
 * *format/linter*: Before pushing any commits, ensure you set up `pre-commit` and run it using `pre-commit run -a`
 * No need to force push to keep a clean history as the merging is eventually done by squashing commits.
 

From a85c6ecbccad873b05993755e318ac29394b5aec Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Thu, 21 May 2026 03:02:09 -0700
Subject: [PATCH 15/15] [Doc] unit_testing: add slow-only test-suite guide

Documents the test launcher, the @pytest.mark.slow marker (whole-test
and parametrize-case variants), how to write a new parametrized test
with the test_utils.test decorator, and the Advanced section with the
per-test timeout, kernel compilation cache, and per-file timing knobs.

Modeled on the structure of the equivalent doc on hp/mark-slow-tests
(after Hugh's two rounds of PR review feedback there) but with all
@pytest.mark.sample references stripped, since the @sample marker is
not part of this branch.
---
 docs/source/user_guide/index.md        |   1 +
 docs/source/user_guide/unit_testing.md | 120 +++++++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 docs/source/user_guide/unit_testing.md

diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md
index b648f97527..c824a270e7 100644
--- a/docs/source/user_guide/index.md
+++ b/docs/source/user_guide/index.md
@@ -82,6 +82,7 @@ init_options
 :maxdepth: 1
 :titlesonly:
 
+unit_testing
 kernel_coverage
 ```
 
diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md
new file mode 100644
index 0000000000..08453a9912
--- /dev/null
+++ b/docs/source/user_guide/unit_testing.md
@@ -0,0 +1,120 @@
+# Unit testing
+
+This page documents how to run, write, and tune the Quadrants Python unit test suite. For setup of the build / dev environment, see [contributing.md](contributing.md).
+
+## Running the tests
+
+The test suite is run via the project's launcher, **not** by invoking `pytest` directly:
+
+```
+python tests/run_tests.py
+```
+
+The launcher sets up the test-only env vars (kernel offline cache, watchdog, xdist worker count, etc.) and forwards any unrecognised flags to pytest. Calling `pytest` directly skips that setup and behaves differently.
+
+Common one-liners:
+
+```
+# run one file
+python tests/run_tests.py test_tile16
+
+# run one test (any pytest -k expression)
+python tests/run_tests.py -k test_tile16_cholesky
+
+# run on a specific backend (or comma-separated list)
+python tests/run_tests.py --arch cuda
+python tests/run_tests.py --arch metal -k tile16
+
+# same, via env var (handy for CI)
+QD_WANTED_ARCHS=metal,vulkan python tests/run_tests.py
+
+# rerun the last failing tests first
+python tests/run_tests.py -f
+
+# stop at the first failure
+python tests/run_tests.py -x
+```
+
+The target architecture can also be set via `QD_WANTED_ARCHS` (comma-separated; supports `^arch` to exclude rather than include).
+
+## Markers
+
+### `@pytest.mark.slow`
+
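+Marks a test as **slow**. `tests/run_tests.py` adds `-m "not slow"` to the pytest invocation by default; pass `--run-slow` to opt back in. A user-supplied `-m` expression is ANDed with `not slow` unless `--run-slow` is given, so `-m slow` on its own selects nothing: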
+
+```
+# default: skip slow
+python tests/run_tests.py
+
+# include slow
+python tests/run_tests.py --run-slow
+
+# slow ONLY (e.g. nightly job)
+python tests/run_tests.py -m slow --run-slow
+```
+
+The marker is used in two patterns:
+
+1. **Whole-test slow**: the whole test takes a long time.
+
+   ```python
+   @pytest.mark.slow
+   def test_thing_that_is_always_slow():
+       ...
+   ```
+
+2. **Slow-marked parametrize case**:
+
+   ```python
+   @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
+   def test_sym_eig_general(n):
+       ...
+   ```
+
+   In this specific example the default suite still exercises the code path at `n=4`; the slow lane just adds the larger `n=12` variant for full coverage.
+
+## Writing new tests
+
+The standard recipe combines `@test_utils.test(...)` (arch / option matrix) with `@pytest.mark.parametrize`:
+
+```python
+import pytest
+import quadrants as qd
+from tests import test_utils
+
+
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
+@test_utils.test(arch=qd.gpu, default_fp=qd.f32)
+def test_my_thing(n):
+    ...
+```
+
+`@test_utils.test` is what wires the test into the per-backend matrix and applies platform exclusions (`exclude=`), extension requirements (`require=`, e.g. `qd.extension.data64` for f64 tests), and per-test options (`default_fp`, `fast_math`, etc.). See `tests/test_utils.py` for the full surface.
+
+Common helpers in `tests/test_utils.py`:
+
+- `test_utils.skip_if_f64_unsupported(dtype)` — skip the current test at runtime if `dtype == qd.f64` and the active backend can't carry f64 through buffer I/O (Metal, MoltenVK on Darwin). Use inside a parametrized test that sweeps both f32 and f64.
+- `test_utils.expected_archs()` — list of archs that the current `QD_WANTED_ARCHS` allows. Used to skip tests with no satisfiable arch.
+
+## Advanced
+
+Optional knobs and runtime details. The defaults work for most contributors.
+
+### Per-test timeout
+
+Per-test timeouts default to 600 s and are enforced by `pytest_hardtle`, a CFFI-compiled C watchdog that can kill tests hung in native GPU calls even when the GIL is held.
+
+### Kernel compilation cache
+
+During each test session the kernel compilation cache lives in a fresh, empty temp directory created by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) — typically `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`. Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache, and avoids recompiling identical kernels after each `qd.reset()` / `qd.init()` cycle within a session.
+
+### Per-file timing breakdown
+
+Set `QD_FILE_TIMING=1` to print a per-file duration summary at the end of the session:
+
+```
+QD_FILE_TIMING=1 python tests/run_tests.py
+```
+
+This is enabled by default in the Mac CI job; the results appear in the GitHub Actions job summary and are the primary tool for identifying slow test files.