From c3abaaf23af3b6ba38defbc5f61ca93fd1d6c9af Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 02:47:21 -0700
Subject: [PATCH 01/29] Skip the slowest tests by default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a `slow` pytest marker, mark the worst-case tests with it, and have
`tests/run_tests.py` skip those tests by default (use `--run-slow` to include
them, or `pytest -m slow` to run only those).

Picked from macOS CI per-file timing (QD_FILE_TIMING=1, run 26083950810):
phase 1 totals 6415s across 8641 test calls; the slowest 3 files alone
(test_eig, test_tile16, test_linalg) cover 55%. The cost of test_eig /
test_make_spd is super-linear in matrix size n (n=12 ≈ 5x n=9).

Marked slow:

  - Parametrize cases n in {6, 9, 12} (every n in 6..12 for inverse_large)
    across test_eig.py and test_linalg.py.
  - Rectangular (9, 12) / (12, 3) cases in test_frobenius_inner_rectangular.
  - test_matmul_chain_qipc_sizes_{f32,f64} (>130s each on macOS CI).
  - test_clear_all_gradients (180s/invocation).
  - test_reset_ndarrays::test_ndarray_doesnt_crash_on_gc (127s).
  - test_mpm88::{test_mpm88, test_mpm88_numpy_and_ndarray} (~30s/invocation).
  - test_struct::test_2d_nested (122s/invocation).

run_tests.py composes `not slow` with any user-supplied `-m` expression, so
existing CI invocations like `-m "not needs_torch"` become
`(not needs_torch) and not slow`. Note that this also drops slow tests from
GPU / Linux / macOS CI runs — a separate workflow (or `--run-slow` job) is
needed if we still want to exercise the n>=6 / n=12 paths in CI.
---
 tests/pytest.ini                         |  2 +
 tests/python/test_clear_all_gradients.py |  3 +
 tests/python/test_eig.py                 | 96 +++++++++++++++++++++---
 tests/python/test_linalg.py              | 70 +++++++++++++++--
 tests/python/test_mpm88.py               |  2 +
 tests/python/test_reset_ndarrays.py      |  1 +
 tests/python/test_struct.py              |  1 +
 tests/run_tests.py                       | 23 +++++-
 8 files changed, 180 insertions(+), 18 deletions(-)

diff --git a/tests/pytest.ini b/tests/pytest.ini
index 5ee5ec16b2..efaf40e6c6 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -3,3 +3,5 @@ markers =
     run_in_serial: mark test to run serially(usually for resource intensive tests).
     sm70: Can only run on GPU with compute capability 7.0 or higher.
     needs_torch: mark test as requiring PyTorch.
+    slow: mark test (or parametrize case) as slow. Skipped by default by tests/run_tests.py;
+        pass --run-slow to include them, or directly `pytest -m slow` to run only the slow ones.
diff --git a/tests/python/test_clear_all_gradients.py b/tests/python/test_clear_all_gradients.py
index 615ade9b0b..22c649a979 100644
--- a/tests/python/test_clear_all_gradients.py
+++ b/tests/python/test_clear_all_gradients.py
@@ -1,9 +1,12 @@
+import pytest
+
 import quadrants as qd
 from quadrants.lang import impl
 
 from tests import test_utils
 
 
+@pytest.mark.slow
 @test_utils.test(exclude=[qd.vulkan])
 def test_clear_all_gradients():
     x = qd.field(qd.f32)
diff --git a/tests/python/test_eig.py b/tests/python/test_eig.py
index 53647a6eef..ad8d8fe3bb 100644
--- a/tests/python/test_eig.py
+++ b/tests/python/test_eig.py
@@ -295,7 +295,16 @@ def run():
     np.testing.assert_allclose(A_reconstructed, A_np, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [4, 5, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [
@@ -311,7 +320,16 @@ def test_sym_eig_general_f32(n, factory):
     _test_sym_eig_general(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize("n", [4, 5, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [
@@ -358,7 +376,15 @@ def run():
     np.testing.assert_allclose(A_spd_qd, expected, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -368,7 +394,15 @@ def test_make_spd_f32(n, factory):
     _test_make_spd(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -404,7 +438,15 @@ def run():
     np.testing.assert_allclose(Q.T @ Q, np.eye(n), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize("alpha", [0.0, 1.0, -2.5])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_alpha_identity_f64(n, alpha):
@@ -445,7 +487,15 @@ def project(src: qd.types.NDArray[mat_t, 1], dst: qd.types.NDArray[mat_t, 1]):
     )
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_negative_definite, _sym_eig_factory_spd],
@@ -455,7 +505,15 @@ def test_make_spd_idempotent_f64(n, factory):
     _test_make_spd_idempotent(n, qd.f64, factory)
 
 
-@pytest.mark.parametrize("n", [4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_make_spd_negative_definite_zero_f64(n):
     """A symmetric matrix with all-negative eigenvalues projects to the zero matrix (``Q · diag(max(λ, 0)) · Qᵀ``
@@ -535,13 +593,33 @@ def run():
         ), f"column {i} is not the eigenvector of eigvals[{i}]={eigvals_qd[i]}: residual={residual}"
 
 
-@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_sym_eig_sort_order_f32(n):
     _test_sym_eig_sort_order(n, qd.f32)
 
 
-@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        4,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_sort_order_f64(n):
     _test_sym_eig_sort_order(n, qd.f64)
diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py
index dfa31495bc..93ff2c2ce2 100644
--- a/tests/python/test_linalg.py
+++ b/tests/python/test_linalg.py
@@ -154,13 +154,31 @@ def run():
     assert out_self[None] == test_utils.approx(A.to_numpy().__pow__(2).sum(), rel=tol, abs=tol)
 
 
-@pytest.mark.parametrize("n", [2, 3, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_frobenius_inner_f32(n):
     _test_frobenius_inner(n, qd.f32)
 
 
-@pytest.mark.parametrize("n", [2, 3, 6, 9, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        2,
+        3,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_frobenius_inner_f64(n):
     _test_frobenius_inner(n, qd.f64)
@@ -189,13 +207,27 @@ def run():
     assert out[None] == test_utils.approx(expected, rel=tol, abs=tol)
 
 
-@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)])
+@pytest.mark.parametrize(
+    "rows,cols",
+    [
+        pytest.param(9, 12, marks=pytest.mark.slow),
+        pytest.param(12, 3, marks=pytest.mark.slow),
+        (2, 4),
+    ],
+)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_frobenius_inner_rectangular_f32(rows, cols):
     _test_frobenius_inner_rectangular(rows, cols, qd.f32)
 
 
-@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)])
+@pytest.mark.parametrize(
+    "rows,cols",
+    [
+        pytest.param(9, 12, marks=pytest.mark.slow),
+        pytest.param(12, 3, marks=pytest.mark.slow),
+        (2, 4),
+    ],
+)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_frobenius_inner_rectangular_f64(rows, cols):
     _test_frobenius_inner_rectangular(rows, cols, qd.f64)
@@ -241,11 +273,13 @@ def run():
     np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol)
 
 
+@pytest.mark.slow
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_matmul_chain_qipc_sizes_f32():
     _test_matmul_chain(qd.f32)
 
 
+@pytest.mark.slow
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_matmul_chain_qipc_sizes_f64():
     _test_matmul_chain(qd.f64)
@@ -434,7 +468,19 @@ def run():
     np.testing.assert_allclose(M @ inv_np, np.eye(n_), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(7, marks=pytest.mark.slow),
+        pytest.param(8, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(10, marks=pytest.mark.slow),
+        pytest.param(11, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],
@@ -444,7 +490,19 @@ def test_inverse_large_f32(n, factory):
     _test_inverse_at_size(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12])
+@pytest.mark.parametrize(
+    "n",
+    [
+        5,
+        pytest.param(6, marks=pytest.mark.slow),
+        pytest.param(7, marks=pytest.mark.slow),
+        pytest.param(8, marks=pytest.mark.slow),
+        pytest.param(9, marks=pytest.mark.slow),
+        pytest.param(10, marks=pytest.mark.slow),
+        pytest.param(11, marks=pytest.mark.slow),
+        pytest.param(12, marks=pytest.mark.slow),
+    ],
+)
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],
diff --git a/tests/python/test_mpm88.py b/tests/python/test_mpm88.py
index 725ff17ac9..d758b65f9d 100644
--- a/tests/python/test_mpm88.py
+++ b/tests/python/test_mpm88.py
@@ -7,6 +7,7 @@
 from tests import test_utils
 
 
+@pytest.mark.slow
 @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test")
 @pytest.mark.run_in_serial
 @test_utils.test()
@@ -108,6 +109,7 @@ def _is_appveyor():
     return os.getenv("APPVEYOR", "").lower() == "true"
 
 
+@pytest.mark.slow
 @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test")
 @pytest.mark.run_in_serial
 @test_utils.test()
diff --git a/tests/python/test_reset_ndarrays.py b/tests/python/test_reset_ndarrays.py
index bc048ac92d..a42fd921f1 100644
--- a/tests/python/test_reset_ndarrays.py
+++ b/tests/python/test_reset_ndarrays.py
@@ -8,6 +8,7 @@
 from tests import test_utils
 
 
+@pytest.mark.slow
 @test_utils.test(arch=[qd.cpu])
 def test_ndarray_doesnt_crash_on_gc() -> None:
     if sys.platform != "darwin":
diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index d3d6a4fbaa..de6d249970 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -62,6 +62,7 @@ def test_linear_nested_aos():
         assert y[i] == i + 123
 
 
+@pytest.mark.slow
 @test_utils.test(exclude=[qd.vulkan])
 def test_2d_nested():
     x = qd.field(qd.i32)
diff --git a/tests/run_tests.py b/tests/run_tests.py
index e2419add42..47d5574ad0 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -56,8 +56,15 @@ def _test_python(args, default_dir="python"):
             pytest_args += ["--cov-append"]
         if args.keys:
             pytest_args += ["-k", args.keys]
-        if args.marks:
-            pytest_args += ["-m", args.marks]
+        # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large
+        # at n>=6, mpm88, etc. — see tests/pytest.ini for the marker). `--run-slow` opts
+        # back in. If the user passes their own `-m` expression we AND `not slow` onto it
+        # so the exclusion still applies, unless they explicitly opt out via `--run-slow`.
+        marks_expr = args.marks
+        if not args.run_slow:
+            marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow"
+        if marks_expr:
+            pytest_args += ["-m", marks_expr]
         if args.failed_first:
             pytest_args += ["--failed-first"]
         if args.fail_fast:
@@ -161,7 +168,17 @@ def test():
         default=None,
         dest="marks",
         type=str,
-        help="Only run tests with specific marks",
+        help="Only run tests with specific marks. `not slow` is appended automatically "
+        "unless --run-slow is passed.",
+    )
+    parser.add_argument(
+        "--run-slow",
+        required=False,
+        default=False,
+        dest="run_slow",
+        action="store_true",
+        help="Include tests marked `slow` (excluded by default). Has no effect if -m is "
+        "given an explicit expression that already mentions `slow`.",
     )
     parser.add_argument(
         "-f",

From d915b74e687e0a53d92fd1084f9062e88874127b Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 03:13:45 -0700
Subject: [PATCH 02/29] Trim n parametrize lists to {smallest, 12}

The previous lists ([4, 5, 6, 9, 12], [2, 3, 4, 6, 9, 12], [5..12], etc.) gave
the Householder/QR path a lot of redundant size coverage. For routine CI we
only need to exercise a small size + the largest supported size (12, which
also doubles as the slow-marked stress case): a bug that shows up at n=7 or
n=11 almost certainly also shows up at n=12.

  test_eig.py
    sym_eig_general_{f32,f64}             [4,5,6,9,12]     -> [4, 12*]
    make_spd_{f32,f64}                    [4,6,9,12]       -> [4, 12*]
    sym_eig_alpha_identity_f64            [4,6,9,12]       -> [4, 12*]
    make_spd_idempotent_f64               [4,6,9,12]       -> [4, 12*]
    make_spd_negative_definite_zero_f64   [4,6,9,12]       -> [4, 12*]
    sym_eig_sort_order_{f32,f64}          [2,3,4,6,9,12]   -> [3, 12*]
  test_linalg.py
    frobenius_inner_{f32,f64}             [2,3,6,9,12]     -> [3, 12*]
    inverse_large_{f32,f64}               [5..12]          -> [5, 12*]

* n=12 retains the `slow` marker, so default `run_tests.py` invocations only
  hit n=4 / n=3 / n=5. `--run-slow` runs both.

Closed-form 2x2/3x3 paths in test_sym_eig_sort_order: dropped n=2 in favour
of n=3 (per directive); the 2x2 path is still covered by
test_sym_eig2x2_{f32,f64}. The 3x3 closed-form path stays covered by n=3.

Other parametrize lists left untouched:
  - rectangular (rows, cols) tuples in test_frobenius_inner_rectangular (it's
    varying shape, not pure size).
  - test_mat_inverse_size's `range(1, 5)` (tiny sizes only).
  - `a00` integer parametrize in test_sym_eig3x3_{f32,f64}.
---
 tests/python/test_eig.py    | 96 ++++---------------------------------
 tests/python/test_linalg.py | 50 ++-----------------
 2 files changed, 13 insertions(+), 133 deletions(-)

diff --git a/tests/python/test_eig.py b/tests/python/test_eig.py
index ad8d8fe3bb..a8b5153dd6 100644
--- a/tests/python/test_eig.py
+++ b/tests/python/test_eig.py
@@ -295,16 +295,7 @@ def run():
     np.testing.assert_allclose(A_reconstructed, A_np, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [
@@ -320,16 +311,7 @@ def test_sym_eig_general_f32(n, factory):
     _test_sym_eig_general(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [
@@ -376,15 +358,7 @@ def run():
     np.testing.assert_allclose(A_spd_qd, expected, rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -394,15 +368,7 @@ def test_make_spd_f32(n, factory):
     _test_make_spd(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd],
@@ -438,15 +404,7 @@ def run():
     np.testing.assert_allclose(Q.T @ Q, np.eye(n), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize("alpha", [0.0, 1.0, -2.5])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_alpha_identity_f64(n, alpha):
@@ -487,15 +445,7 @@ def project(src: qd.types.NDArray[mat_t, 1], dst: qd.types.NDArray[mat_t, 1]):
     )
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_sym_eig_factory_indefinite, _sym_eig_factory_negative_definite, _sym_eig_factory_spd],
@@ -505,15 +455,7 @@ def test_make_spd_idempotent_f64(n, factory):
     _test_make_spd_idempotent(n, qd.f64, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_make_spd_negative_definite_zero_f64(n):
     """A symmetric matrix with all-negative eigenvalues projects to the zero matrix (``Q · diag(max(λ, 0)) · Qᵀ``
@@ -593,33 +535,13 @@ def run():
         ), f"column {i} is not the eigenvector of eigvals[{i}]={eigvals_qd[i]}: residual={residual}"
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_sym_eig_sort_order_f32(n):
     _test_sym_eig_sort_order(n, qd.f32)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        4,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_sym_eig_sort_order_f64(n):
     _test_sym_eig_sort_order(n, qd.f64)
diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py
index 93ff2c2ce2..a6632d4678 100644
--- a/tests/python/test_linalg.py
+++ b/tests/python/test_linalg.py
@@ -154,31 +154,13 @@ def run():
     assert out_self[None] == test_utils.approx(A.to_numpy().__pow__(2).sum(), rel=tol, abs=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_frobenius_inner_f32(n):
     _test_frobenius_inner(n, qd.f32)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        2,
-        3,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_frobenius_inner_f64(n):
     _test_frobenius_inner(n, qd.f64)
@@ -468,19 +450,7 @@ def run():
     np.testing.assert_allclose(M @ inv_np, np.eye(n_), rtol=tol, atol=tol)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(7, marks=pytest.mark.slow),
-        pytest.param(8, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(10, marks=pytest.mark.slow),
-        pytest.param(11, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],
@@ -490,19 +460,7 @@ def test_inverse_large_f32(n, factory):
     _test_inverse_at_size(n, qd.f32, factory)
 
 
-@pytest.mark.parametrize(
-    "n",
-    [
-        5,
-        pytest.param(6, marks=pytest.mark.slow),
-        pytest.param(7, marks=pytest.mark.slow),
-        pytest.param(8, marks=pytest.mark.slow),
-        pytest.param(9, marks=pytest.mark.slow),
-        pytest.param(10, marks=pytest.mark.slow),
-        pytest.param(11, marks=pytest.mark.slow),
-        pytest.param(12, marks=pytest.mark.slow),
-    ],
-)
+@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
 @pytest.mark.parametrize(
     "factory",
     [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required],

From 4b75e7e6a993d9f4f4fabe2d77a858ee96159cee Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 05:03:55 -0700
Subject: [PATCH 03/29] Mark more single-test outliers as slow (round 2)

Round 1 of the slow-marking pass dropped cluster CUDA wall time from
1303s (main) to 563s on hp/mark-slow-tests (-56.8%). The next critical-path
bottleneck on the branch was a small set of single-test outliers, each
holding one xdist worker for 10-73 seconds:

  test_tile16_cholesky_blocked_demo[cuda]        72.77s
  test_field_max_num_args[cuda]                  40.17s
  test_src_ll_cache_has_return[..., *]      4 x ~12.7s = ~50s
  test_src_ll_cache_modify_sub_func[cuda]        13.68s
  test_tile16_shared_array_cholesky[cuda]        12.03s
  test_tile16_ger_sub[cuda-...]                  11.19s + 3 sibling cases
  test_tile16_syr_sub[cuda-...]                  11.04s + 3 sibling cases
  test_tile16_vec_proxy_syr_sub_3d[cuda-...]     10.59s + 1 sibling case
  test_tile16_shared_array_roundtrip[cuda]       10.49s
  test_mesh_localize_mapping0[cuda]              11.66s
  test_ad_gdar_diffmpm.py::test_gdar_mpm[cuda]   11.41s

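Slow-mark all of them so the default `tests/run_tests.py` run (which passes
`-m "not slow"` to pytest) skips them. They still run on the dedicated
slow / CI suites via `run_tests.py --run-slow` or a direct `pytest -m slow`
(note that `run_tests.py -m slow` without `--run-slow` selects nothing,
because `not slow` is ANDed onto the expression). Target: push the
wall-time reduction from 56.8% past the 66% goal.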
---
 .../python/quadrants/lang/fast_caching/test_src_ll_cache.py | 2 ++
 tests/python/test_ad_gdar_diffmpm.py                        | 1 +
 tests/python/test_field.py                                  | 1 +
 tests/python/test_mesh.py                                   | 2 ++
 tests/python/test_tile16.py                                 | 6 ++++++
 5 files changed, 12 insertions(+)

diff --git a/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py b/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
index 711839cf5d..129e6a97cd 100644
--- a/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
+++ b/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
@@ -319,6 +319,7 @@ def k1(a: qd.i32, output: qd.types.NDArray[qd.i32, 1]) -> bool:
     sys.exit(RET_SUCCESS)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("return_something", [False, True])
 @pytest.mark.parametrize("src_ll_cache", [False, True])
 @test_utils.test()
@@ -463,6 +464,7 @@ def src_ll_cache_modify_sub_func_child(args: list[str]) -> None:
     sys.exit(RET_SUCCESS)
 
 
+@pytest.mark.slow
 @test_utils.test()
 def test_src_ll_cache_modify_sub_func(tmp_path: pathlib.Path) -> None:
     assert qd.lang is not None
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index cd6bb32a04..0fee8f4da0 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -5,6 +5,7 @@
 from tests import test_utils
 
 
+@pytest.mark.slow
 @test_utils.test(require=qd.extension.assertion, debug=True)
 def test_gdar_mpm():
     real = qd.f32
diff --git a/tests/python/test_field.py b/tests/python/test_field.py
index 52770b3872..de6d6dfe68 100644
--- a/tests/python/test_field.py
+++ b/tests/python/test_field.py
@@ -443,6 +443,7 @@ def collide():
     collide()
 
 
+@pytest.mark.slow
 @test_utils.test()
 def test_field_max_num_args() -> None:
     num_args = 512
diff --git a/tests/python/test_mesh.py b/tests/python/test_mesh.py
index a88897be81..387ae4199e 100644
--- a/tests/python/test_mesh.py
+++ b/tests/python/test_mesh.py
@@ -1,6 +1,7 @@
 import os
 
 import numpy as np
+import pytest
 
 import quadrants as qd
 
@@ -92,6 +93,7 @@ def test_mesh_reordered_opt():
     _test_mesh_for(True, True, False)
 
 
+@pytest.mark.slow
 @test_utils.test(require=qd.extension.mesh, mesh_localize_to_end_mapping=False)
 def test_mesh_localize_mapping0():
     _test_mesh_for(False, False, False)
diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index 97480c7d1d..c5df875f64 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -393,6 +393,7 @@ def _make_spd(np_dtype=np.float32, seed: int = 42):
     return (B @ B.T + _TILE * np.eye(_TILE)).astype(np_dtype)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -1131,6 +1132,7 @@ def k1(src_arr: qd.types.NDArray[qd_dtype, 2], dst_arr: qd.types.NDArray[qd_dtyp
     np.testing.assert_allclose(result, expected)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -1237,6 +1239,7 @@ def k1(a_arr: Ann, b_arr: Ann, x_arr: Ann, eps_f: qd.Template):
 # -- SharedArray tests --
 
 
+@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_shared_array_roundtrip():
     """Load from field -> tile -> SharedArray -> tile -> field, verify data survives."""
@@ -1309,6 +1312,7 @@ def k1(src_f: qd.Template, dst_f: qd.Template, NCOLS: qd.i32):
         np.testing.assert_allclose(result[:, NCOLS:], -1.0)
 
 
+@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_shared_array_cholesky():
     """Cholesky via tiles, L stored in SharedArray, verify reconstruction."""
@@ -1434,6 +1438,7 @@ def k1(mat_arr: Ann_tile, vecs_arr: Ann_vecs, out_arr: Ann_tile, K0: qd.i32, COL
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col, col), atol=1e-5)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_syr_sub_3d(tensor_type):
@@ -1774,6 +1779,7 @@ def write_eye_f32(dst: Ann32):
     assert result32.dtype == np.float32
 
 
+@pytest.mark.slow
 @test_utils.test(arch=[qd.cuda])
 def test_tile16_cholesky_blocked_demo():
     """Smoke-test that misc/demos/cholesky_blocked.py runs to completion."""

From 75a15d2a7ac86b4d62ced249e9978c8762b64059 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 05:15:24 -0700
Subject: [PATCH 04/29] Slow-mark long-tail test_tile16 outliers (round 3)

After round 2 the wall-time ratio sat at 461s / 1303s = 0.354 (-64.6%),
just shy of the 66% target. The remaining critical path is now the
long tail of test_tile16.py tests in the 9-11s range with no single
dominator. Slow-mark another batch of test_tile16 outliers to push
just past the target:

  test_tile16_vec_proxy_multi_column_accumulate (2 cases, ~10.7s avg)
  test_tile16_slice_ger_sub_via_outer            (4 cases, ~10s avg)
  test_tile16_vec_proxy_ger_sub_2d/_3d           (~10s each)
  test_tile16_potrf_then_trsm                    (4 cases, ~9.8s)
  test_tile16_shared_array_partial_cols          (3 cases, ~9.6s)
  test_tile16_vec_proxy_partial_rows             (2 cases, ~9.4s)
  test_tile16_outer_symmetric_same_variable      (~9.4s)
  test_tile16_vec_proxy_shared_array             (~9.2s)

These are all tile16 stress / BLAS-style tests. They still run under
--run-slow and on the dedicated slow CI lane.
---
 tests/python/test_tile16.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index c5df875f64..c46c1e8e7d 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -643,6 +643,7 @@ def k1(src_arr: Ann, dst_arr: Ann, NBATCH: qd.i32):
     np.testing.assert_allclose(dst.to_numpy(), data)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -689,6 +690,7 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=atol)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_ger_sub_2d(qd_dtype):
@@ -725,6 +727,7 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=atol)
 
 
+@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_outer_symmetric_same_variable():
     """t -= qd.outer(v, v) with the same variable for both args."""
@@ -758,6 +761,7 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=1e-5)
 
 
+@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_ger_sub_3d():
     """Column vector load from a 3D array: v = arr[batch, r0:r1, col]."""
@@ -1196,6 +1200,7 @@ def k1(src_arr: Ann, dst_arr: Ann, eps_f: qd.Template):
     np.testing.assert_allclose(np.tril(dst.to_numpy()), L_expected, atol=atol)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -1266,6 +1271,7 @@ def k1(src_f: qd.Template, dst_f: qd.Template):
     np.testing.assert_allclose(dst.to_numpy(), data)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("partial_store,partial_load", [(True, True), (True, False), (False, True)])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_shared_array_partial_cols(partial_store, partial_load):
@@ -1475,6 +1481,7 @@ def k1(mat_arr: Ann_tile, vecs_arr: Ann_vecs, out_arr: Ann_tile, K0: qd.i32, COL
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col, col), atol=1e-5)
 
 
+@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_shared_array():
     """Symmetric rank-1 subtract via vec proxy from SharedArray at non-zero offset."""
@@ -1513,6 +1520,7 @@ def k1(mat_f: qd.Template, vecs_f: qd.Template, out_f: qd.Template, K0: qd.i32,
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col, col), atol=1e-5)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_partial_rows(tensor_type):
@@ -1549,6 +1557,7 @@ def k1(mat_arr: Ann_tile, vecs_arr: Ann_vecs, out_arr: Ann_tile, K0: qd.i32, COL
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col_padded, col_padded), atol=1e-5)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_multi_column_accumulate(tensor_type):

From 9062dbcb0fa6e6b36991a643092f35af6f31bed0 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 06:01:10 -0700
Subject: [PATCH 05/29] Revert function-level @pytest.mark.slow decorators

Per discussion: marking a *larger parameter* slow (when smaller params
of the same test still run) is acceptable -- the function is still
exercised, just at a smaller size. But marking an *entire test
function* slow drops it from the default suite entirely with no
remaining coverage, which isn't acceptable.

Revert function-level @pytest.mark.slow from:

  test_tile16.py: 15 functions (cholesky_blocked_demo, ger_sub, syr_sub,
                  shared_array_roundtrip, shared_array_cholesky,
                  shared_array_partial_cols, vec_proxy_*, slice_ger_sub_*,
                  outer_symmetric_*, potrf_then_trsm, multi_column_accumulate)
  test_src_ll_cache.py: test_src_ll_cache_has_return, _modify_sub_func
  test_ad_gdar_diffmpm.py: test_gdar_mpm
  test_mesh.py: test_mesh_localize_mapping0  (also drop unused `import pytest`)
  test_field.py: test_field_max_num_args
  test_linalg.py: test_matmul_chain_qipc_sizes_f32/f64
  test_struct.py: test_2d_nested
  test_mpm88.py: test_mpm88, test_mpm88_numpy_and_ndarray
  test_reset_ndarrays.py: test_ndarray_doesnt_crash_on_gc
  test_clear_all_gradients.py: test_clear_all_gradients (also drop unused
                               `import pytest`)

Preserved (still acceptable): param-level slow marks where a smaller
parameter of the same test continues to run by default. These live in
test_eig.py and test_linalg.py (n=6/9/12 cases for sym_eig / make_spd /
inverse_large / frobenius_inner / rectangular pairs).
---
 .../lang/fast_caching/test_src_ll_cache.py        |  2 --
 tests/python/test_ad_gdar_diffmpm.py              |  1 -
 tests/python/test_clear_all_gradients.py          |  3 ---
 tests/python/test_field.py                        |  1 -
 tests/python/test_linalg.py                       |  2 --
 tests/python/test_mesh.py                         |  2 --
 tests/python/test_mpm88.py                        |  2 --
 tests/python/test_reset_ndarrays.py               |  1 -
 tests/python/test_struct.py                       |  1 -
 tests/python/test_tile16.py                       | 15 ---------------
 10 files changed, 30 deletions(-)

diff --git a/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py b/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
index 129e6a97cd..711839cf5d 100644
--- a/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
+++ b/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
@@ -319,7 +319,6 @@ def k1(a: qd.i32, output: qd.types.NDArray[qd.i32, 1]) -> bool:
     sys.exit(RET_SUCCESS)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("return_something", [False, True])
 @pytest.mark.parametrize("src_ll_cache", [False, True])
 @test_utils.test()
@@ -464,7 +463,6 @@ def src_ll_cache_modify_sub_func_child(args: list[str]) -> None:
     sys.exit(RET_SUCCESS)
 
 
-@pytest.mark.slow
 @test_utils.test()
 def test_src_ll_cache_modify_sub_func(tmp_path: pathlib.Path) -> None:
     assert qd.lang is not None
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index 0fee8f4da0..cd6bb32a04 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -5,7 +5,6 @@
 from tests import test_utils
 
 
-@pytest.mark.slow
 @test_utils.test(require=qd.extension.assertion, debug=True)
 def test_gdar_mpm():
     real = qd.f32
diff --git a/tests/python/test_clear_all_gradients.py b/tests/python/test_clear_all_gradients.py
index 22c649a979..615ade9b0b 100644
--- a/tests/python/test_clear_all_gradients.py
+++ b/tests/python/test_clear_all_gradients.py
@@ -1,12 +1,9 @@
-import pytest
-
 import quadrants as qd
 from quadrants.lang import impl
 
 from tests import test_utils
 
 
-@pytest.mark.slow
 @test_utils.test(exclude=[qd.vulkan])
 def test_clear_all_gradients():
     x = qd.field(qd.f32)
diff --git a/tests/python/test_field.py b/tests/python/test_field.py
index de6d6dfe68..52770b3872 100644
--- a/tests/python/test_field.py
+++ b/tests/python/test_field.py
@@ -443,7 +443,6 @@ def collide():
     collide()
 
 
-@pytest.mark.slow
 @test_utils.test()
 def test_field_max_num_args() -> None:
     num_args = 512
diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py
index a6632d4678..011efa4a69 100644
--- a/tests/python/test_linalg.py
+++ b/tests/python/test_linalg.py
@@ -255,13 +255,11 @@ def run():
     np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol)
 
 
-@pytest.mark.slow
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
 def test_matmul_chain_qipc_sizes_f32():
     _test_matmul_chain(qd.f32)
 
 
-@pytest.mark.slow
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
 def test_matmul_chain_qipc_sizes_f64():
     _test_matmul_chain(qd.f64)
diff --git a/tests/python/test_mesh.py b/tests/python/test_mesh.py
index 387ae4199e..a88897be81 100644
--- a/tests/python/test_mesh.py
+++ b/tests/python/test_mesh.py
@@ -1,7 +1,6 @@
 import os
 
 import numpy as np
-import pytest
 
 import quadrants as qd
 
@@ -93,7 +92,6 @@ def test_mesh_reordered_opt():
     _test_mesh_for(True, True, False)
 
 
-@pytest.mark.slow
 @test_utils.test(require=qd.extension.mesh, mesh_localize_to_end_mapping=False)
 def test_mesh_localize_mapping0():
     _test_mesh_for(False, False, False)
diff --git a/tests/python/test_mpm88.py b/tests/python/test_mpm88.py
index d758b65f9d..725ff17ac9 100644
--- a/tests/python/test_mpm88.py
+++ b/tests/python/test_mpm88.py
@@ -7,7 +7,6 @@
 from tests import test_utils
 
 
-@pytest.mark.slow
 @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test")
 @pytest.mark.run_in_serial
 @test_utils.test()
@@ -109,7 +108,6 @@ def _is_appveyor():
     return os.getenv("APPVEYOR", "").lower() == "true"
 
 
-@pytest.mark.slow
 @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test")
 @pytest.mark.run_in_serial
 @test_utils.test()
diff --git a/tests/python/test_reset_ndarrays.py b/tests/python/test_reset_ndarrays.py
index a42fd921f1..bc048ac92d 100644
--- a/tests/python/test_reset_ndarrays.py
+++ b/tests/python/test_reset_ndarrays.py
@@ -8,7 +8,6 @@
 from tests import test_utils
 
 
-@pytest.mark.slow
 @test_utils.test(arch=[qd.cpu])
 def test_ndarray_doesnt_crash_on_gc() -> None:
     if sys.platform != "darwin":
diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index de6d249970..d3d6a4fbaa 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -62,7 +62,6 @@ def test_linear_nested_aos():
         assert y[i] == i + 123
 
 
-@pytest.mark.slow
 @test_utils.test(exclude=[qd.vulkan])
 def test_2d_nested():
     x = qd.field(qd.i32)
diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index c46c1e8e7d..97480c7d1d 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -393,7 +393,6 @@ def _make_spd(np_dtype=np.float32, seed: int = 42):
     return (B @ B.T + _TILE * np.eye(_TILE)).astype(np_dtype)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -643,7 +642,6 @@ def k1(src_arr: Ann, dst_arr: Ann, NBATCH: qd.i32):
     np.testing.assert_allclose(dst.to_numpy(), data)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -690,7 +688,6 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=atol)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_ger_sub_2d(qd_dtype):
@@ -727,7 +724,6 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=atol)
 
 
-@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_outer_symmetric_same_variable():
     """t -= qd.outer(v, v) with the same variable for both args."""
@@ -761,7 +757,6 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=1e-5)
 
 
-@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_ger_sub_3d():
     """Column vector load from a 3D array: v = arr[batch, r0:r1, col]."""
@@ -1136,7 +1131,6 @@ def k1(src_arr: qd.types.NDArray[qd_dtype, 2], dst_arr: qd.types.NDArray[qd_dtyp
     np.testing.assert_allclose(result, expected)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -1200,7 +1194,6 @@ def k1(src_arr: Ann, dst_arr: Ann, eps_f: qd.Template):
     np.testing.assert_allclose(np.tril(dst.to_numpy()), L_expected, atol=atol)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("qd_dtype", _QD_DTYPES)
 @test_utils.test(arch=qd.gpu)
@@ -1244,7 +1237,6 @@ def k1(a_arr: Ann, b_arr: Ann, x_arr: Ann, eps_f: qd.Template):
 # -- SharedArray tests --
 
 
-@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_shared_array_roundtrip():
     """Load from field -> tile -> SharedArray -> tile -> field, verify data survives."""
@@ -1271,7 +1263,6 @@ def k1(src_f: qd.Template, dst_f: qd.Template):
     np.testing.assert_allclose(dst.to_numpy(), data)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("partial_store,partial_load", [(True, True), (True, False), (False, True)])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_shared_array_partial_cols(partial_store, partial_load):
@@ -1318,7 +1309,6 @@ def k1(src_f: qd.Template, dst_f: qd.Template, NCOLS: qd.i32):
         np.testing.assert_allclose(result[:, NCOLS:], -1.0)
 
 
-@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_shared_array_cholesky():
     """Cholesky via tiles, L stored in SharedArray, verify reconstruction."""
@@ -1444,7 +1434,6 @@ def k1(mat_arr: Ann_tile, vecs_arr: Ann_vecs, out_arr: Ann_tile, K0: qd.i32, COL
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col, col), atol=1e-5)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_syr_sub_3d(tensor_type):
@@ -1481,7 +1470,6 @@ def k1(mat_arr: Ann_tile, vecs_arr: Ann_vecs, out_arr: Ann_tile, K0: qd.i32, COL
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col, col), atol=1e-5)
 
 
-@pytest.mark.slow
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_shared_array():
     """Symmetric rank-1 subtract via vec proxy from SharedArray at non-zero offset."""
@@ -1520,7 +1508,6 @@ def k1(mat_f: qd.Template, vecs_f: qd.Template, out_f: qd.Template, K0: qd.i32,
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col, col), atol=1e-5)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_partial_rows(tensor_type):
@@ -1557,7 +1544,6 @@ def k1(mat_arr: Ann_tile, vecs_arr: Ann_vecs, out_arr: Ann_tile, K0: qd.i32, COL
     np.testing.assert_allclose(out.to_numpy(), R - np.outer(col_padded, col_padded), atol=1e-5)
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @test_utils.test(arch=qd.gpu)
 def test_tile16_vec_proxy_multi_column_accumulate(tensor_type):
@@ -1788,7 +1774,6 @@ def write_eye_f32(dst: Ann32):
     assert result32.dtype == np.float32
 
 
-@pytest.mark.slow
 @test_utils.test(arch=[qd.cuda])
 def test_tile16_cholesky_blocked_demo():
     """Smoke-test that misc/demos/cholesky_blocked.py runs to completion."""

From 9dfa84d0e1526888b5a9b5a2a2473f029f361503 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 06:48:09 -0700
Subject: [PATCH 06/29] [Demo] cholesky_blocked: take N / N_ENVS / WARMUP /
 ITERS via argparse

The blocked-Cholesky demo previously hard-coded N=92, N_ENVS=4096,
WARMUP=50, ITERS=200 as module globals. The unit-test wrapper
test_tile16_cholesky_blocked_demo runs the demo as a subprocess and
only cares that it returns 0; at the hard-coded sizes that takes ~74 s
on cluster CUDA, dominated by JIT-compiling 3 large unrolled kernels
at N=92 and running the 4096-env x 250-iter benchmark loop.

Expose all four knobs as command-line flags with the previous values as
defaults, so:

    python misc/demos/cholesky_blocked.py                                # unchanged, full demo
    python misc/demos/cholesky_blocked.py --n 32 --n-envs 64 \
        --num-warmup 1 --num-iters 1                                    # smoke-mode

The test will switch to the smoke-mode invocation in a follow-up
commit so it stops dominating the slow critical path.

Flag names (--n, --n-envs, --num-warmup, --num-iters) follow the user
spec. The parser uses argparse with ArgumentDefaultsHelpFormatter so that
--help shows the full-demo defaults.
---
 misc/demos/cholesky_blocked.py | 37 +++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py
index 8dbcb3fbb9..b4c60c1810 100644
--- a/misc/demos/cholesky_blocked.py
+++ b/misc/demos/cholesky_blocked.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
-"""Benchmark 92x92 blocked Cholesky factorization using Tile16x16.
+"""Benchmark NxN blocked Cholesky factorization using Tile16x16.
 
 Three kernels compared:
 
 1. Baseline: scalar Cholesky-Crout, 64 threads, shared memory, 2*N+1 sequential syncs. Thread 0 computes each
    diagonal, remaining threads parallelize off-diagonal updates.
 
-2. Blocked: 6x6 grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal blocks. Same blocking
-   structure as Tile16x16 but all data lives in shared memory with block.sync() between every step.
+2. Blocked: ceil(N/16) x ceil(N/16) grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal
+   blocks. Same blocking structure as Tile16x16 but all data lives in shared memory with block.sync() between
+   every step.
 
 3. Tile16x16: same blocked structure but fully register-resident via Tile16x16. No shared memory, zero syncs.
    Prior tiles read from global memory (L2).
@@ -20,22 +21,38 @@
     tile16   (Tile16x16, no shared memory)             16        533        5.19x
 
 Usage:
-    python misc/demos/cholesky_blocked.py
+    python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] \
+        [--num-warmup WARMUP] [--num-iters ITERS]
 """
 
+import argparse
 import time
 
 import numpy as np
 
 import quadrants as qd
 
-N = 92
+
+def _parse_args():
+    p = argparse.ArgumentParser(
+        description="Blocked Cholesky NxN benchmark (3 kernels: baseline / blocked / tile16).",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    p.add_argument("--n", type=int, default=92, help="Matrix dimension N (NxN SPD).")
+    p.add_argument("--n-envs", type=int, default=4096, help="Number of independent environments.")
+    p.add_argument("--num-warmup", type=int, default=50, help="Warmup iterations per kernel.")
+    p.add_argument("--num-iters", type=int, default=200, help="Timed iterations per kernel.")
+    return p.parse_args()
+
+
+_args = _parse_args()
+N = _args.n
 TILE = 16
-N_BLOCKS = (N + TILE - 1) // TILE  # 6
-N_PADDED = N_BLOCKS * TILE  # 96, rounded up for blocked kernel SharedArrays
-N_ENVS = 4096
-WARMUP = 50
-ITERS = 200
+N_BLOCKS = (N + TILE - 1) // TILE
+N_PADDED = N_BLOCKS * TILE  # rounded up for blocked kernel SharedArrays
+N_ENVS = _args.n_envs
+WARMUP = _args.num_warmup
+ITERS = _args.num_iters
 
 qd.init(arch=qd.gpu)
 

From 6e936aa3d4efa24630ad08a020596afe1315a8b9 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 06:49:15 -0700
Subject: [PATCH 07/29] [Test] test_tile16_cholesky_blocked_demo: invoke demo
 in smoke-mode

Pass small CLI overrides (--n 32 --n-envs 64 --num-warmup 1
--num-iters 1) so the demo runs end-to-end in seconds instead of ~74 s.
The test contract is just "demo exits 0"; it doesn't read any of the
benchmark numbers, so the smaller workload still satisfies the smoke
test.

The full N=92 / N_ENVS=4096 / 50+200-iter demo is still what humans
running misc/demos/cholesky_blocked.py see by default (argparse
defaults match the previous hard-coded values).

Together with the previous commit, this drops the
test_tile16_cholesky_blocked_demo wall time on cluster CUDA from
~74 s to (expected) a few seconds, removing the largest remaining
single-test outlier on hp/mark-slow-tests.
---
 tests/python/test_tile16.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index 97480c7d1d..6d917e11ad 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -1776,8 +1776,21 @@ def write_eye_f32(dst: Ann32):
 
 @test_utils.test(arch=[qd.cuda])
 def test_tile16_cholesky_blocked_demo():
-    """Smoke-test that misc/demos/cholesky_blocked.py runs to completion."""
+    """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.
+
+    Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the
+    JIT compile of the 3 unrolled kernels and the benchmark loop both stay
+    cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised
+    by anyone running the script manually, not by CI.
+    """
     demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py"
-    result = subprocess.run([sys.executable, str(demo)], capture_output=True, text=True, timeout=300)
+    cmd = [
+        sys.executable, str(demo),
+        "--n", "32",
+        "--n-envs", "64",
+        "--num-warmup", "1",
+        "--num-iters", "1",
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
     if result.returncode != 0:
         pytest.fail(f"cholesky_blocked.py exited with code {result.returncode}\nstderr:\n{result.stderr}")

From 21c2877ab214b2a074fd091a1e79b7228d5eccc8 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 07:16:24 -0700
Subject: [PATCH 08/29] [Test] test_matmul_chain_qipc_sizes: parametrize on
 matrix shapes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the test hard-coded the qipc IPC sizes (9x12) · (12x12) ·
(12x9). On cluster CUDA those two cases (f32 + f64) take ~92.7s and
~87.3s respectively -- the top two single-test outliers in the suite,
each holding one xdist worker for ~90s of contiguous JIT-compile +
unrolled-FMA work.

Parametrize `_test_matmul_chain` on (rows_a, cols_a, cols_b, cols_c).
Default lane runs the small (3,4,4,3) chain to exercise the same
Matrix.__matmul__ codegen path; the original (9,12,12,9) qipc-sized
chain is slow-marked so it still runs on --run-slow (i.e. CI's nightly
/ release lane, once that's wired up).

Estimated saving: ~180s CPU, ~70s wall (these tests were on the
critical path of the branch run).

No function-level coverage lost: both f32 and f64 versions still run
the same chain by default, just at a smaller size.
---
 tests/python/test_linalg.py | 51 +++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py
index 011efa4a69..59925ee2ce 100644
--- a/tests/python/test_linalg.py
+++ b/tests/python/test_linalg.py
@@ -215,24 +215,26 @@ def test_frobenius_inner_rectangular_f64(rows, cols):
     _test_frobenius_inner_rectangular(rows, cols, qd.f64)
 
 
-def _test_matmul_chain(dt):
-    """3-way matmul chain at qipc IPC sizes: (9×12) · (12×12) · (12×9) → (9×9).
-
-    Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the largest size qipc needs. Quadrants
-    imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces ~1296 FMAs per
-    intermediate, so this test catches compile-time blow-up or back-end miscompiles at large sizes.
+def _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, dt):
+    """3-way matmul chain: ``(rows_a × cols_a) · (cols_a × cols_b) · (cols_b × cols_c) → (rows_a × cols_c)``.
+
+    Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the requested size. Quadrants
+    imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces
+    ``rows_a * cols_a * cols_b + rows_a * cols_b * cols_c`` FMAs per kernel call, so this test catches compile-time
+    blow-up or back-end miscompiles at large sizes. The largest parametrize value is the chain qipc actually uses;
+    smaller values are cheap sanity checks that the same code path still works.
     """
     np_dt = np.float32 if dt == qd.f32 else np.float64
-    A_np = np.random.default_rng(0xCA70).standard_normal((9, 12)).astype(np_dt)
-    B_np = np.random.default_rng(0xCA71).standard_normal((12, 12)).astype(np_dt)
-    C_np = np.random.default_rng(0xCA72).standard_normal((12, 9)).astype(np_dt)
+    A_np = np.random.default_rng(0xCA70).standard_normal((rows_a, cols_a)).astype(np_dt)
+    B_np = np.random.default_rng(0xCA71).standard_normal((cols_a, cols_b)).astype(np_dt)
+    C_np = np.random.default_rng(0xCA72).standard_normal((cols_b, cols_c)).astype(np_dt)
 
-    A = qd.Matrix.field(9, 12, dtype=dt, shape=())
-    B = qd.Matrix.field(12, 12, dtype=dt, shape=())
-    C = qd.Matrix.field(12, 9, dtype=dt, shape=())
-    AB = qd.Matrix.field(9, 12, dtype=dt, shape=())
-    ABC_chained = qd.Matrix.field(9, 9, dtype=dt, shape=())
-    ABC_staged = qd.Matrix.field(9, 9, dtype=dt, shape=())
+    A = qd.Matrix.field(rows_a, cols_a, dtype=dt, shape=())
+    B = qd.Matrix.field(cols_a, cols_b, dtype=dt, shape=())
+    C = qd.Matrix.field(cols_b, cols_c, dtype=dt, shape=())
+    AB = qd.Matrix.field(rows_a, cols_b, dtype=dt, shape=())
+    ABC_chained = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
+    ABC_staged = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
 
     A.from_numpy(A_np)
     B.from_numpy(B_np)
@@ -255,14 +257,25 @@ def run():
     np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol)
 
 
+# qipc's actual size is (9,12,12,9) -- the largest chain it instantiates. We also keep a tiny (3,4,4,3) chain so
+# the default fast lane still exercises the same Matrix.__matmul__ codegen path without paying the ~90s/case
+# CUDA JIT cost of the qipc-sized chain.
+_MATMUL_CHAIN_SHAPES = [
+    (3, 4, 4, 3),
+    pytest.param(9, 12, 12, 9, marks=pytest.mark.slow),
+]
+
+
+@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
 @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False)
-def test_matmul_chain_qipc_sizes_f32():
-    _test_matmul_chain(qd.f32)
+def test_matmul_chain_qipc_sizes_f32(rows_a, cols_a, cols_b, cols_c):
+    _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f32)
 
 
+@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
 @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False)
-def test_matmul_chain_qipc_sizes_f64():
-    _test_matmul_chain(qd.f64)
+def test_matmul_chain_qipc_sizes_f64(rows_a, cols_a, cols_b, cols_c):
+    _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f64)
 
 
 @test_utils.test()

From 27a86a1970251b60e98e5779f0e40f7ce521d7a7 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 07:17:17 -0700
Subject: [PATCH 09/29] [Test] test_gdar_mpm: parametrize on particles_side /
 n_grid / num_steps

Previously hard-coded N=30 (900 particles), n_grid=120, steps=32 -- 26s
on cluster CUDA. The test's actual contract is that the AD-validation
checker raises QuadrantsAssertionError on the global-data-access
violation in g2p (`v[f, p] = new_v`), which fires on the first substep
regardless of grid / particle / step counts.

Parametrize on (particles_side, n_grid_size, num_steps) with a small
default (8, 32, 4) and slow-marked original (30, 120, 32). The default
still exercises the same diff-MPM pipeline (p2g / grid_op / g2p,
qd.ad.Tape with validation=True, `with pytest.raises(...)`) and still
triggers the assertion error.

Estimated CPU saving: ~22s; wall saving ~3s on the branch run.
---
 tests/python/test_ad_gdar_diffmpm.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index cd6bb32a04..0e0e460534 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -5,14 +5,26 @@
 from tests import test_utils
 
 
+# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay
+# stays cheap; the slow-marked entry keeps the original (N=30, n_grid=120, steps=32)
+# workload that runs on --run-slow. The point of the test is that the AD-validation
+# checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which
+# happens on the first substep regardless of size.
+@pytest.mark.parametrize(
+    "particles_side,n_grid_size,num_steps",
+    [
+        (8, 32, 4),
+        pytest.param(30, 120, 32, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.assertion, debug=True)
-def test_gdar_mpm():
+def test_gdar_mpm(particles_side, n_grid_size, num_steps):
     real = qd.f32
 
     dim = 2
-    N = 30  # reduce to 30 if run out of GPU memory
+    N = particles_side
     n_particles = N * N
-    n_grid = 120
+    n_grid = n_grid_size
     dx = 1 / n_grid
     inv_dx = 1 / dx
     dt = 3e-4
@@ -21,8 +33,8 @@ def test_gdar_mpm():
     E = 100
     mu = E
     la = E
-    max_steps = 32
-    steps = 32
+    max_steps = num_steps
+    steps = num_steps
     gravity = 9.8
     target = [0.3, 0.6]
 

From 2ea335fe39890d34225ee3c73dc3d0f0ea4b9bce Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:02:22 -0700
Subject: [PATCH 10/29] [Test] test_device_{reduce,exclusive_scan}: fuse
 {add,min,max} into one op-parametrized test

The three reduce variants (and the three scan variants) shared an identical
kernel signature, identical input shape, and differed only in (a) which
qd.algorithms.device_<op> function they called and (b) overflow vs
bitwise-exact verification. Collapse each triple into a single op-parametrized
test:

  test_device_reduce(op, dtype, N)            # op in {add, min, max}
  test_device_exclusive_scan(op, dtype, N)    # op in {add, min, max}

Behavior, coverage and the parametrize space are unchanged -- pytest still
collects the same number of parametrize cases, just under unified test names.
This is purely a code-dedup refactor (134 lines removed, 106 added, i.e. a
net ~28 LOC less) which makes the next op-axis sampling change (if/when we
choose to drop some of add / min / max from the sweep) a one-line edit.
---
 tests/python/test_algorithms.py | 240 ++++++++++++++------------------
 1 file changed, 106 insertions(+), 134 deletions(-)

diff --git a/tests/python/test_algorithms.py b/tests/python/test_algorithms.py
index e4b4ac9960..508732ce3b 100644
--- a/tests/python/test_algorithms.py
+++ b/tests/python/test_algorithms.py
@@ -320,86 +320,79 @@ def _rand_reduce_host(rng, dtype, N, *, bound=1000):
     return rng.integers(-bound, bound, size=N, dtype=np_dt)
 
 
-@pytest.mark.parametrize("N", _REDUCE_SIZES)
-@pytest.mark.parametrize("dtype", _REDUCE_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_reduce_add(dtype, N):
-    """device_reduce_add matches numpy.sum across the full size sweep + dtype set."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    host = _rand_reduce_host(rng, dtype, N)
-    _fill_field(inp, host)
+_REDUCE_OPS = ["add", "min", "max"]
 
-    qd.algorithms.device_reduce_add(inp, out=out)
 
-    got = out.to_numpy()[0]
+def _reduce_host(rng, op, dtype, N):
+    """Generate the test input for a reduce of `op` on `dtype` x N values.
+
+    ``add`` uses small uniform / bounded values so float sums stay representable; ``min`` and ``max`` use a wider
+    range (-10..10 for floats, +-10000 for ints) since picking-an-element is bitwise-exact regardless of magnitude.
+    """
+    if op == "add":
+        return _rand_reduce_host(rng, dtype, N)
     if _is_float(dtype):
-        expected = float(np.sum(host.astype(np.float64)))
-        rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
-        assert math.isclose(
-            got, expected, rel_tol=rtol, abs_tol=atol
-        ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}"
-    else:
-        # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the
-        # u32 / u64 mod-wrap case at large N.
-        mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None
-        ref = int(
-            np.sum(host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64)))
-        )  # noqa: E501
-        got_int = int(got)
-        if mod is not None:
-            ref &= mod - 1
-            got_int &= mod - 1
-        assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}"
+        return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
+    return _rand_reduce_host(rng, dtype, N, bound=10000)
 
 
-@pytest.mark.parametrize("N", _REDUCE_SIZES)
-@pytest.mark.parametrize("dtype", _REDUCE_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_reduce_min(dtype, N):
-    """device_reduce_min(identity=type-positive-extreme) matches numpy.min."""
+def _check_reduce(op, dtype, N):
+    """Run ``device_reduce_<op>(arr)`` and verify against ``numpy.<op>(arr)``.
+
+    ``add`` accumulates so it needs (a) wider integer promotion + mod-wrap masking for u32/u64 and (b) per-N float
+    tolerance. ``min`` / ``max`` pick one input element, so they're bitwise-exact for both ints and floats.
+    """
     _skip_if_dtype_unsupported(dtype)
     inp, out = _alloc_input_out(dtype, N)
     rng = np.random.default_rng(seed=1234)
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
+    host = _reduce_host(rng, op, dtype, N)
     _fill_field(inp, host)
 
-    qd.algorithms.device_reduce_min(inp, out=out)
+    qd_fn = getattr(qd.algorithms, f"device_reduce_{op}")
+    qd_fn(inp, out=out)
     got = out.to_numpy()[0]
-    expected = host.min()
 
+    if op == "add":
+        if _is_float(dtype):
+            expected = float(np.sum(host.astype(np.float64)))
+            rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
+            assert math.isclose(
+                got, expected, rel_tol=rtol, abs_tol=atol
+            ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}"
+        else:
+            # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the
+            # u32 / u64 mod-wrap case at large N.
+            mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None
+            ref = int(
+                np.sum(
+                    host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64))
+                )
+            )  # noqa: E501
+            got_int = int(got)
+            if mod is not None:
+                ref &= mod - 1
+                got_int &= mod - 1
+            assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}"
+        return
+
+    expected = host.min() if op == "min" else host.max()
     if _is_float(dtype):
         assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12)
     else:
-        assert int(got) == int(expected), f"{dtype} reduce_min(N={N}): got {got}, expected {expected}"
+        assert int(got) == int(expected), f"{dtype} reduce_{op}(N={N}): got {got}, expected {expected}"
 
 
+@pytest.mark.parametrize("op", _REDUCE_OPS)
 @pytest.mark.parametrize("N", _REDUCE_SIZES)
 @pytest.mark.parametrize("dtype", _REDUCE_DTYPES)
 @test_utils.test(arch=qd.gpu)
-def test_device_reduce_max(dtype, N):
-    """device_reduce_max(identity=type-negative-extreme) matches numpy.max."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
-    _fill_field(inp, host)
-
-    qd.algorithms.device_reduce_max(inp, out=out)
-    got = out.to_numpy()[0]
-    expected = host.max()
+def test_device_reduce(op, dtype, N):
+    """``device_reduce_{add,min,max}`` match numpy across the full size sweep + dtype set.
 
-    if _is_float(dtype):
-        assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12)
-    else:
-        assert int(got) == int(expected), f"{dtype} reduce_max(N={N}): got {got}, expected {expected}"
+    Unified across the three op variants. ``add`` accumulates so it needs overflow / precision-aware comparison;
+    ``min`` / ``max`` pick one element of the input and are bitwise-exact.
+    """
+    _check_reduce(op, dtype, N)
 
 
 @test_utils.test(arch=qd.gpu)
@@ -454,101 +447,80 @@ def _scan_dtype_mask(dtype):
     return -1
 
 
-@pytest.mark.parametrize("N", _SCAN_SIZES)
-@pytest.mark.parametrize("dtype", _SCAN_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_exclusive_scan_add(dtype, N):
-    """device_exclusive_scan_add(out[i] = sum(arr[0:i])) matches numpy.cumsum-shifted across the full 6-dtype set."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_scan_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    host = _rand_reduce_host(rng, dtype, N, bound=100)
-    _fill_field(inp, host)
+_SCAN_OPS = ["add", "min", "max"]
 
-    qd.algorithms.device_exclusive_scan_add(inp, out=out)
-    got = out.to_numpy()
 
+def _scan_host(rng, op, dtype, N):
+    """Generate the test input for a scan of `op` on `dtype` x N values. Same rationale as ``_reduce_host``."""
+    if op == "add":
+        return _rand_reduce_host(rng, dtype, N, bound=100)
     if _is_float(dtype):
-        ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]])
-        rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
-        np.testing.assert_allclose(
-            got.astype(np.float64),
-            ref,
-            rtol=rtol,
-            atol=atol,
-            err_msg=f"{dtype} scan_add(N={N})",
-        )
-    else:
-        # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference; smaller
-        # ints can still use int64.
-        promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64
-        host_wide = host.astype(promote)
-        ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote)
-        mask = _scan_dtype_mask(dtype)
-        got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64)
-        if mask != -1:
-            got_view = got_view & promote(mask)
-            ref = ref & promote(mask)
-        np.testing.assert_array_equal(
-            got_view,
-            ref,
-            err_msg=f"{dtype} scan_add(N={N})",
-        )
+        return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
+    return _rand_reduce_host(rng, dtype, N, bound=10000)
 
 
-@pytest.mark.parametrize("N", _SCAN_SIZES)
-@pytest.mark.parametrize("dtype", _SCAN_DTYPES)
-@test_utils.test(arch=qd.gpu)
-def test_device_exclusive_scan_min(dtype, N):
-    """device_exclusive_scan_min(out[i] = min(arr[0:i])) matches numpy.minimum.accumulate-shifted across the full
-    6-dtype set."""
+def _check_scan(op, dtype, N):
+    """Run ``device_exclusive_scan_<op>(arr)`` and verify against ``numpy.<op>.accumulate``-shifted.
+
+    Like the reduce family, ``add`` accumulates (overflow / precision care) while ``min`` / ``max`` are
+    bitwise-exact in both float and int paths.
+    """
     _skip_if_dtype_unsupported(dtype)
     inp, out = _alloc_scan_input_out(dtype, N)
     rng = np.random.default_rng(seed=1234)
     np_dt = _DTYPE_TO_NP[dtype]
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt)
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
+    host = _scan_host(rng, op, dtype, N)
     _fill_field(inp, host)
 
-    qd.algorithms.device_exclusive_scan_min(inp, out=out)
+    qd_fn = getattr(qd.algorithms, f"device_exclusive_scan_{op}")
+    qd_fn(inp, out=out)
     got = out.to_numpy()
 
+    if op == "add":
+        if _is_float(dtype):
+            ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]])
+            rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
+            np.testing.assert_allclose(
+                got.astype(np.float64),
+                ref,
+                rtol=rtol,
+                atol=atol,
+                err_msg=f"{dtype} scan_add(N={N})",
+            )
+        else:
+            # Promote to a 64-bit width for the cumulative sum: i32 / u32 / i64 inputs use int64, u64 uses uint64;
+            # both sides are then masked to the dtype width whenever ``_scan_dtype_mask`` returns a mask.
+            promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64
+            host_wide = host.astype(promote)
+            ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote)
+            mask = _scan_dtype_mask(dtype)
+            got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64)
+            if mask != -1:
+                got_view = got_view & promote(mask)
+                ref = ref & promote(mask)
+            np.testing.assert_array_equal(got_view, ref, err_msg=f"{dtype} scan_add(N={N})")
+        return
+
+    np_accum = np.minimum.accumulate if op == "min" else np.maximum.accumulate
+    identity_table = _MIN_IDENTITY if op == "min" else _MAX_IDENTITY
     if _is_float(dtype):
-        ref = np.concatenate([[float("inf")], np.minimum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt)
-        atol = 0 if dtype == qd.f32 else 0  # min is bitwise-exact for monotone ops on float
-        np.testing.assert_allclose(got, ref, rtol=0, atol=atol, err_msg=f"{dtype} scan_min(N={N})")
+        identity = float("inf") if op == "min" else float("-inf")
+        ref = np.concatenate([[identity], np_accum(host.astype(np.float64))[:-1]]).astype(np_dt)
+        np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_{op}(N={N})")
     else:
-        ref = np.concatenate([[np_dt(_MIN_IDENTITY[dtype])], np.minimum.accumulate(host)[:-1]]).astype(np_dt)
-        np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_min(N={N})")
+        ref = np.concatenate([[np_dt(identity_table[dtype])], np_accum(host)[:-1]]).astype(np_dt)
+        np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_{op}(N={N})")
 
 
+@pytest.mark.parametrize("op", _SCAN_OPS)
 @pytest.mark.parametrize("N", _SCAN_SIZES)
 @pytest.mark.parametrize("dtype", _SCAN_DTYPES)
 @test_utils.test(arch=qd.gpu)
-def test_device_exclusive_scan_max(dtype, N):
-    """device_exclusive_scan_max(out[i] = max(arr[0:i])) matches numpy.maximum.accumulate-shifted across the full
-    6-dtype set."""
-    _skip_if_dtype_unsupported(dtype)
-    inp, out = _alloc_scan_input_out(dtype, N)
-    rng = np.random.default_rng(seed=1234)
-    np_dt = _DTYPE_TO_NP[dtype]
-    if _is_float(dtype):
-        host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt)
-    else:
-        host = _rand_reduce_host(rng, dtype, N, bound=10000)
-    _fill_field(inp, host)
-
-    qd.algorithms.device_exclusive_scan_max(inp, out=out)
-    got = out.to_numpy()
-
-    if _is_float(dtype):
-        ref = np.concatenate([[float("-inf")], np.maximum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt)
-        np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_max(N={N})")
-    else:
-        ref = np.concatenate([[np_dt(_MAX_IDENTITY[dtype])], np.maximum.accumulate(host)[:-1]]).astype(np_dt)
-        np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_max(N={N})")
+def test_device_exclusive_scan(op, dtype, N):
+    """``device_exclusive_scan_{add,min,max}`` match ``numpy.{cumsum, minimum.accumulate, maximum.accumulate}``-shifted
+    across the full size sweep + dtype set. Unified across the three op variants; same overflow vs bitwise-exact
+    handling as the reduce family."""
+    _check_scan(op, dtype, N)
 
 
 @test_utils.test(arch=qd.gpu)

From 64fcbb0c2be682a32bfbf606c54667a52fa3c8fd Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:03:21 -0700
Subject: [PATCH 11/29] [Style] black: reformat
 test_tile16_cholesky_blocked_demo cmd list + run_tests help string

Pure formatting fix from `pre-commit run -a`; no behavior change.
---
 tests/python/test_tile16.py | 15 ++++++++++-----
 tests/run_tests.py          |  3 +--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index 6d917e11ad..f94d4221e1 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -1785,11 +1785,16 @@ def test_tile16_cholesky_blocked_demo():
     """
     demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py"
     cmd = [
-        sys.executable, str(demo),
-        "--n", "32",
-        "--n-envs", "64",
-        "--num-warmup", "1",
-        "--num-iters", "1",
+        sys.executable,
+        str(demo),
+        "--n",
+        "32",
+        "--n-envs",
+        "64",
+        "--num-warmup",
+        "1",
+        "--num-iters",
+        "1",
     ]
     result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
     if result.returncode != 0:
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 47d5574ad0..bf37ab2aa7 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -168,8 +168,7 @@ def test():
         default=None,
         dest="marks",
         type=str,
-        help="Only run tests with specific marks. `not slow` is appended automatically "
-        "unless --run-slow is passed.",
+        help="Only run tests with specific marks. `not slow` is appended automatically " "unless --run-slow is passed.",
     )
     parser.add_argument(
         "--run-slow",

From 3031e14ee330af47430c12632594c2b82ec1f3f4 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:09:25 -0700
Subject: [PATCH 12/29] [Test] test_subgroup_full_matches_tiled: fuse 20 thin
 subgroup-op wrappers into 2 op-parametrized tests

Lines 3608-3694 in test_simt.py were 18 ~5-line wrappers each calling
``_check_full_matches_tiled(subgroup.<op>, subgroup.<op>_tiled, ...)``.
Lines 3841-3848 were 2 more, parametrized on dtype. ``_check_full_matches_tiled``
already accepts the full / tiled functions as Python arguments (closure-captured
into ``@qd.kernel``), so collapsing the family is a pure dedup move:

  test_subgroup_full_matches_tiled(op_name, host_init)
      # 18 cases on qd.i32: {reduce,reduce_all}_{add,min,max},
      # inclusive_{add,min,max,mul,and,or,xor}, exclusive_{add,mul,and,or,xor}

  test_subgroup_full_matches_tiled_float(op_name, dtype)
      # 4 cases: {reduce_add, inclusive_add} x {qd.f32, qd.f64}

Behavior + coverage unchanged (still 22 parametrize cases, same dtype + init
configurations). Pytest ids are designed to match the original test-name
suffixes (e.g. ``[reduce_add]``, ``[inclusive_mul]``) so ``-k`` selectors and
test reports stay readable. Drops ~50 LOC net.
---
 tests/python/test_simt.py | 136 ++++++++++++--------------------------
 1 file changed, 43 insertions(+), 93 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 95e3438e41..6790d3afb5 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -3604,94 +3604,45 @@ def _init_full_bitwise(src, n):
         src[i] = 1 << (i % 7)
 
 
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_add():
-    _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_all_add():
-    _check_full_matches_tiled(subgroup.reduce_all_add, subgroup.reduce_all_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_min():
-    _check_full_matches_tiled(subgroup.reduce_min, subgroup.reduce_min_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_max():
-    _check_full_matches_tiled(subgroup.reduce_max, subgroup.reduce_max_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_all_min():
-    _check_full_matches_tiled(subgroup.reduce_all_min, subgroup.reduce_all_min_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_all_max():
-    _check_full_matches_tiled(subgroup.reduce_all_max, subgroup.reduce_all_max_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_add():
-    _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_min():
-    _check_full_matches_tiled(subgroup.inclusive_min, subgroup.inclusive_min_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_max():
-    _check_full_matches_tiled(subgroup.inclusive_max, subgroup.inclusive_max_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_mul():
-    _check_full_matches_tiled(subgroup.inclusive_mul, subgroup.inclusive_mul_tiled, host_init=_init_full_small_int)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_and():
-    _check_full_matches_tiled(subgroup.inclusive_and, subgroup.inclusive_and_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_or():
-    _check_full_matches_tiled(subgroup.inclusive_or, subgroup.inclusive_or_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_xor():
-    _check_full_matches_tiled(subgroup.inclusive_xor, subgroup.inclusive_xor_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_add():
-    _check_full_matches_tiled(subgroup.exclusive_add, subgroup.exclusive_add_tiled)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_mul():
-    _check_full_matches_tiled(subgroup.exclusive_mul, subgroup.exclusive_mul_tiled, host_init=_init_full_small_int)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_and():
-    _check_full_matches_tiled(subgroup.exclusive_and, subgroup.exclusive_and_tiled, host_init=_init_full_bitwise)
-
-
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_or():
-    _check_full_matches_tiled(subgroup.exclusive_or, subgroup.exclusive_or_tiled, host_init=_init_full_bitwise)
+# Each entry replaces a thin ``_check_full_matches_tiled(subgroup.X, subgroup.X_tiled, ...)`` wrapper test. They
+# were collapsed into one op-parametrized test to remove the per-op duplication. The pytest ids match the ``<op>``
+# suffix of the original ``test_subgroup_<op>`` functions so test reports / `-k` selectors stay readable.
+_FULL_VS_TILED_INT_CASES = [
+    pytest.param("reduce_add", None, id="reduce_add"),
+    pytest.param("reduce_all_add", None, id="reduce_all_add"),
+    pytest.param("reduce_min", None, id="reduce_min"),
+    pytest.param("reduce_max", None, id="reduce_max"),
+    pytest.param("reduce_all_min", None, id="reduce_all_min"),
+    pytest.param("reduce_all_max", None, id="reduce_all_max"),
+    pytest.param("inclusive_add", None, id="inclusive_add"),
+    pytest.param("inclusive_min", None, id="inclusive_min"),
+    pytest.param("inclusive_max", None, id="inclusive_max"),
+    # `mul` needs bounded inputs (2**N overflows i32 quickly); bitwise ops need a per-lane bit pattern that's
+    # non-zero on every lane so AND has signal and OR / XOR have varied bits.
+    pytest.param("inclusive_mul", _init_full_small_int, id="inclusive_mul"),
+    pytest.param("inclusive_and", _init_full_bitwise, id="inclusive_and"),
+    pytest.param("inclusive_or", _init_full_bitwise, id="inclusive_or"),
+    pytest.param("inclusive_xor", _init_full_bitwise, id="inclusive_xor"),
+    pytest.param("exclusive_add", None, id="exclusive_add"),
+    pytest.param("exclusive_mul", _init_full_small_int, id="exclusive_mul"),
+    pytest.param("exclusive_and", _init_full_bitwise, id="exclusive_and"),
+    pytest.param("exclusive_or", _init_full_bitwise, id="exclusive_or"),
+    pytest.param("exclusive_xor", _init_full_bitwise, id="exclusive_xor"),
+]
 
 
+@pytest.mark.parametrize("op_name,host_init", _FULL_VS_TILED_INT_CASES)
 @test_utils.test(arch=qd.gpu)
-def test_subgroup_exclusive_xor():
-    _check_full_matches_tiled(subgroup.exclusive_xor, subgroup.exclusive_xor_tiled, host_init=_init_full_bitwise)
+def test_subgroup_full_matches_tiled(op_name, host_init):
+    """For each subgroup op ``X``, verify ``subgroup.X(v)`` matches ``subgroup.X_tiled(v, log2_group_size())``
+    lane-by-lane on ``qd.i32``. Covers reduce / inclusive / exclusive families; bitwise ops + ``mul`` use a custom
+    initializer that keeps the per-lane aggregate bounded."""
+    full_fn = getattr(subgroup, op_name)
+    tiled_fn = getattr(subgroup, f"{op_name}_tiled")
+    kwargs = {}
+    if host_init is not None:
+        kwargs["host_init"] = host_init
+    _check_full_matches_tiled(full_fn, tiled_fn, **kwargs)
 
 
 @test_utils.test(arch=qd.gpu)
@@ -3836,16 +3787,15 @@ def k():
 # accidentally cast through i32 inside a wrapper.
 
 
+@pytest.mark.parametrize("op_name", ["reduce_add", "inclusive_add"])
 @pytest.mark.parametrize("dtype", [qd.f32, qd.f64])
 @test_utils.test(arch=qd.gpu)
-def test_subgroup_reduce_add_float(dtype):
-    _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled, dtype=dtype)
-
-
-@pytest.mark.parametrize("dtype", [qd.f32, qd.f64])
-@test_utils.test(arch=qd.gpu)
-def test_subgroup_inclusive_add_float(dtype):
-    _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled, dtype=dtype)
+def test_subgroup_full_matches_tiled_float(op_name, dtype):
+    """Float-dtype coverage of the dtype-agnostic ``full`` wrappers (``reduce_add``, ``inclusive_add``). One f32 + one
+    f64 case per family is enough to catch an i32-only regression in a wrapper."""
+    full_fn = getattr(subgroup, op_name)
+    tiled_fn = getattr(subgroup, f"{op_name}_tiled")
+    _check_full_matches_tiled(full_fn, tiled_fn, dtype=dtype)
 
 
 @pytest.mark.parametrize("dtype", [qd.f32, qd.f64])

From edf53ea66af2ced0f3fdd8e8dd5005db22fc853d Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:13:10 -0700
Subject: [PATCH 13/29] [Test] test_block_reduce{,_all}: fuse {add,min,max}
 into op-parametrized tests

The six block-reduce tests (3 single-output + 3 broadcast) share an identical
kernel skeleton, parametrize axes, and verification loop. They only differ in
which `block.reduce_*` function is called (closure-captured into `@qd.kernel`
via getattr), the host-side reference oracle, the init pattern (sequential for
`add` so the running sum has signal; permuted hash for `min` / `max` so the
result depends on lanes other than first / last), and the float tolerance
regime (relative for accumulating `add`, absolute for picker `min` / `max`).
Collapse the six tests into two op-parametrized tests:

  test_block_reduce(sg_per_block, dtype, op_name, ...)        # single-output, 3 ops
  test_block_reduce_all(sg_per_block, dtype, op_name, ...)    # broadcast, 3 ops

Parametrize space is unchanged (3 sg x 5 dtype x 3 op = 45 cases per fused
test, matching the original 3 tests x 15 cases each). Pytest ids use plain
`[add|min|max]` suffixes so `-k` selectors remain readable. Drops ~100 LOC of
boilerplate -- two new small helpers (`_init_block_reduce_src` and
`_assert_block_reduce_close`) capture the per-op behavioral differences in one
place each.
---
 tests/python/test_simt.py | 210 ++++++++++----------------------------
 1 file changed, 54 insertions(+), 156 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 6790d3afb5..5b7d7490cd 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -887,81 +887,57 @@ def _ref_reduce_max(values):
     return max(values)
 
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_add(dtype, sg_per_block):
-    """Block sum-reduce: thread 0 of each block holds `sum(src[block_base:block_base+block_dim])`."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=NUM_BLOCKS)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            tid = i % block_dim
-            agg = block.reduce_add(src[i], block_dim, dtype)
-            if tid == 0:
-                dst[i // block_dim] = agg
-
-    _init_field(src, N, dtype)
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_add(block_vals)
-        if dtype in _BLOCK_REDUCE_INT_DTYPES:
-            assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}"
-        else:
-            assert abs(dst[b] - expected) < 1e-4 * abs(expected), f"block {b}: got {dst[b]}, expected {expected}"
-
+# The three single-output reduces (`test_block_reduce_{add,min,max}`) and their three broadcast siblings
+# (`test_block_reduce_all_{add,min,max}`) share the same kernel skeleton, parametrize axes, and verification loop;
+# they differ only in (a) which `block.reduce_*` function gets called, (b) the host-side reference oracle, (c) the
+# init pattern (sequential for `add` so the running sum has signal, permuted hash for `min` / `max` so the result
+# depends on lanes other than first / last), and (d) the float tolerance regime (`add` accumulates so it uses a
+# relative tol; `min` / `max` pick one element of the input and use an absolute tol).
+_BLOCK_REDUCE_OP_CASES = [
+    # (op_name, ref_fn, init_permuted, tol_relative)
+    pytest.param("add", _ref_reduce_add, False, True, id="add"),
+    pytest.param("min", _ref_reduce_min, True, False, id="min"),
+    pytest.param("max", _ref_reduce_max, True, False, id="max"),
+]
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_min(dtype, sg_per_block):
-    """Block min-reduce: thread 0 of each block holds `min(src[block_base:block_base+block_dim])`."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=NUM_BLOCKS)
 
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
+def _init_block_reduce_src(src, N, dtype, *, permuted):
+    """Initialize ``src[0:N]`` for a block reduce test. ``permuted=False`` is the sequential ``1..N`` init from
+    ``_init_field`` (good for add); ``permuted=True`` is the stable hash ``((i * 1009) % 997) + 1`` so the per-block
+    min / max depends on lanes other than first / last."""
+    if permuted:
         for i in range(N):
-            tid = i % block_dim
-            agg = block.reduce_min(src[i], block_dim, dtype)
-            if tid == 0:
-                dst[i // block_dim] = agg
+            v = ((i * 1009) % 997) + 1
+            src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    else:
+        _init_field(src, N, dtype)
 
-    # Permuted (non-monotone) initialisation so the min depends on lanes other than the first / last.
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1  # in [1, 997]; stable hash, no collisions w/ block_dim values up to 256
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
-    foo()
 
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_min(block_vals)
-        if dtype in _BLOCK_REDUCE_INT_DTYPES:
-            assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}"
-        else:
-            assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}"
+def _assert_block_reduce_close(actual, expected, dtype, *, tol_relative, ctx):
+    """Assert ``actual ~= expected`` per the block-reduce tolerance regime.
+
+    Int dtypes compare exactly. Floats use relative tolerance ``1e-4 * |expected|`` for accumulating ops (sums grow
+    with block_dim, so a relative bound is the only thing that stays meaningful across the 32 / 128 / 256 / 64 / 256 /
+    512 block-size sweep), and absolute tolerance ``1e-5`` for picker ops (min / max pick one element so the
+    magnitude is whatever was in the input -- a small absolute bound suffices).
+    """
+    if dtype in _BLOCK_REDUCE_INT_DTYPES:
+        assert actual == expected, f"{ctx}: got {actual}, expected {expected}"
+    elif tol_relative:
+        assert abs(actual - expected) < 1e-4 * abs(expected), f"{ctx}: got {actual}, expected {expected}"
+    else:
+        assert abs(actual - expected) < 1e-5, f"{ctx}: got {actual}, expected {expected}"
 
 
+@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_reduce_max(dtype, sg_per_block):
-    """Block max-reduce: thread 0 of each block holds `max(src[block_base:block_base+block_dim])`."""
+def test_block_reduce(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
+    """Block reduce: thread 0 of each block holds ``<op>(src[block_base:block_base+block_dim])``. Unified across
+    ``add`` / ``min`` / ``max`` -- op-name is closure-captured into ``@qd.kernel``."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"reduce_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -973,102 +949,29 @@ def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
             tid = i % block_dim
-            agg = block.reduce_max(src[i], block_dim, dtype)
+            agg = op_fn(src[i], block_dim, dtype)
             if tid == 0:
                 dst[i // block_dim] = agg
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_max(block_vals)
-        if dtype in _BLOCK_REDUCE_INT_DTYPES:
-            assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}"
-        else:
-            assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}"
-
-
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_all_add(dtype, sg_per_block):
-    """Block sum-reduce broadcast: every thread of each block holds the block-wide sum.
-
-    Verifies the broadcast variant by writing the per-thread output to a flat field, then asserting every thread of a
-    given block reads the same aggregate.
-    """
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.reduce_all_add(src[i], block_dim, dtype)
-
-    _init_field(src, N, dtype)
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_add(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}"
-            else:
-                assert abs(actual - expected) < 1e-4 * abs(
-                    expected
-                ), f"block {b} thread {j}: got {actual}, expected {expected}"
-
-
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_reduce_all_min(dtype, sg_per_block):
-    """Block min-reduce broadcast: every thread reads the block-wide min."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.reduce_all_min(src[i], block_dim, dtype)
-
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
     foo()
 
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_min(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}"
-            else:
-                assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}"
+        expected = ref_fn(block_vals)
+        _assert_block_reduce_close(dst[b], expected, dtype, tol_relative=tol_relative, ctx=f"block {b}")
 
 
+@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_reduce_all_max(dtype, sg_per_block):
-    """Block max-reduce broadcast: every thread reads the block-wide max."""
+def test_block_reduce_all(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
+    """Block reduce broadcast: every thread of each block holds the block-wide ``<op>``. Verified by writing the
+    per-thread output to a flat field, then asserting every thread of a given block reads the same aggregate.
+    Unified across ``add`` / ``min`` / ``max``."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"reduce_all_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -1079,22 +982,17 @@ def test_block_reduce_all_max(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.reduce_all_max(src[i], block_dim, dtype)
+            dst[i] = op_fn(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
     foo()
 
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_reduce_max(block_vals)
+        expected = ref_fn(block_vals)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}"
-            else:
-                assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}"
+            _assert_block_reduce_close(actual, expected, dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
 
 
 # --- Block scan tests ------------------------------------------------------------------

From 8fd433e3dca09c97547c8e3cda00d0e7586538b6 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:14:47 -0700
Subject: [PATCH 14/29] [Test] test_block_inclusive: fuse {add,min,max} into
 one op-parametrized test

The three block inclusive scan tests share the same kernel skeleton and only
differ in the closure-captured `block.inclusive_<op>` function, the host-side
reference oracle, the init pattern (sequential for `add` -- sums grow with
prefix length; permuted for `min` / `max` -- result depends on lanes other
than first / last), and the float tolerance regime (relative for `add`,
absolute for `min` / `max`). Collapse into one op-parametrized test:

  test_block_inclusive(sg_per_block, dtype, op_name, ...)

Identical param count to the original three tests (3 sg x 5 dtype x 3 op =
45 cases vs. the original 3 tests x 15 cases each). Pulls a shared
`_assert_block_scan_close` helper out so the int / relative-float /
absolute-float regime is encoded in one place; the relative-float branch
keeps the floor-on-tol-base trick (`1e-4 * max(|expected|, 1.0)`) needed by
the original `test_block_exclusive_add` (also routed through the same
helper). Note that inclusive `add` previously used
`1e-4 * |expected + 1.0|`, so its float bound becomes marginally tighter for
non-negative prefixes. `test_block_exclusive_add` stays as its own function
for now because the matching exclusive `min` / `max` cases need
dtype-derived sentinel identities + ``isinf`` handling that's different
enough that fusing them in would create more branches than it removes; can
address that in a follow-up if needed.
---
 tests/python/test_simt.py | 131 ++++++++++++--------------------------
 1 file changed, 41 insertions(+), 90 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 5b7d7490cd..96aeb1e4dc 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -1045,82 +1045,45 @@ def _ref_exclusive_scan_op(values, op, identity):
     return out
 
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_inclusive_add(dtype, sg_per_block):
-    """Block inclusive prefix sum: thread `i` holds `sum(src[block_base..i])`."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.inclusive_add(src[i], block_dim, dtype)
-
-    _init_field(src, N, dtype)
-    foo()
-
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_inclusive_scan_add(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-4 * abs(
-                    expected[j] + 1.0
-                ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-
-
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_exclusive_add(dtype, sg_per_block):
-    """Block exclusive prefix sum: thread `i` holds `sum(src[block_base..i-1])`; thread 0 holds 0."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.exclusive_add(src[i], block_dim, dtype)
+# The four scan tests in this group (`test_block_inclusive_{add,min,max}` + `test_block_exclusive_add`) share the
+# kernel skeleton; only the per-op reference oracle, init pattern, and float tolerance differ. `add` accumulates
+# (sequential init, relative tol); `min` / `max` pick (permuted init, absolute tol). Exclusive `min` / `max` get
+# their own dedicated test below because they need a dtype-derived sentinel identity (+inf / iinfo(max), -inf /
+# iinfo(min)) at lane 0 with explicit ``isinf`` handling -- different enough that fusing them in would create more
+# branches than it removes.
+_PY_MIN = lambda a, b: a if a < b else b  # noqa: E731 (intentional 1-line lambda for ref oracle)
+_PY_MAX = lambda a, b: a if a > b else b  # noqa: E731
+
+_BLOCK_INCLUSIVE_SCAN_OP_CASES = [
+    # (op_name, ref_fn, init_permuted, tol_relative)
+    pytest.param("add", _ref_inclusive_scan_add, False, True, id="add"),
+    pytest.param("min", lambda vals: _ref_inclusive_scan_op(vals, _PY_MIN, 0), True, False, id="min"),
+    pytest.param("max", lambda vals: _ref_inclusive_scan_op(vals, _PY_MAX, 0), True, False, id="max"),
+]
 
-    _init_field(src, N, dtype)
-    foo()
 
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_exclusive_scan_add(block_vals)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                # First thread's expected is 0; gate the relative tolerance so it doesn't blow up.
-                tol_base = abs(expected[j]) if abs(expected[j]) > 1.0 else 1.0
-                assert (
-                    abs(actual - expected[j]) < 1e-4 * tol_base
-                ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+def _assert_block_scan_close(actual, expected_j, dtype, *, tol_relative, ctx):
+    """Per-thread assertion for block scan tests. Same int / relative-float / absolute-float regime as
+    ``_assert_block_reduce_close`` but with a floor on the relative-tol base so the first few prefixes (where
+    ``expected_j`` is near zero) don't tighten the bound to zero."""
+    if dtype in _BLOCK_REDUCE_INT_DTYPES:
+        assert actual == expected_j, f"{ctx}: got {actual}, expected {expected_j}"
+    elif tol_relative:
+        tol_base = abs(expected_j) if abs(expected_j) > 1.0 else 1.0
+        assert abs(actual - expected_j) < 1e-4 * tol_base, f"{ctx}: got {actual}, expected {expected_j}"
+    else:
+        assert abs(actual - expected_j) < 1e-5, f"{ctx}: got {actual}, expected {expected_j}"
 
 
+@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_INCLUSIVE_SCAN_OP_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_inclusive_min(dtype, sg_per_block):
-    """Block inclusive prefix min."""
+def test_block_inclusive(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
+    """Block inclusive prefix scan: thread ``i`` holds ``<op>(src[block_base..i])``. Unified across ``add`` / ``min``
+    / ``max``."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"inclusive_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -1131,30 +1094,24 @@ def test_block_inclusive_min(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.inclusive_min(src[i], block_dim, dtype)
+            dst[i] = op_fn(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
     foo()
 
-    py_min = lambda a, b: a if a < b else b  # noqa: E731 (intentional 1-line lambda for ref oracle)
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_inclusive_scan_op(block_vals, py_min, 0)
+        expected = ref_fn(block_vals)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+            _assert_block_scan_close(actual, expected[j], dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
 
 
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_inclusive_max(dtype, sg_per_block):
-    """Block inclusive prefix max."""
+def test_block_exclusive_add(dtype, sg_per_block):
+    """Block exclusive prefix sum: thread ``i`` holds ``sum(src[block_base..i-1])``; thread 0 holds 0."""
     _skip_if_f64_unsupported(dtype)
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
@@ -1166,23 +1123,17 @@ def test_block_inclusive_max(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.inclusive_max(src[i], block_dim, dtype)
+            dst[i] = block.exclusive_add(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_field(src, N, dtype)
     foo()
 
-    py_max = lambda a, b: a if a > b else b  # noqa: E731
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_inclusive_scan_op(block_vals, py_max, 0)
+        expected = _ref_exclusive_scan_add(block_vals)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+            _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}")
 
 
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)

From 9238652b19cafe54004b5518e55d23267e93bd88 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 08:17:25 -0700
Subject: [PATCH 15/29] [Test] test_block_exclusive_minmax: fuse {min,max} into
 one op-parametrized test

`test_block_exclusive_min` and `test_block_exclusive_max` share the same
permuted-init pattern and only differ in the dtype-derived sentinel identity
(``+inf`` / ``iinfo.max`` for min, ``-inf`` / ``iinfo.min`` for max) and the
inf-sign check at lane 0. Collapse into one op-parametrized test that takes
``(op_name, sentinel_fn, py_op, inf_sign)`` and dispatches via getattr +
the (already module-level) `_PY_MIN` / `_PY_MAX` lambdas.

Identical param count to the original pair (3 sg x 5 dtype x 2 op = 30 cases
vs. the original 2 tests x 15 cases each = 30). `test_block_exclusive_add`
remains its own function because the integer identity is `0` (not
`iinfo.max/min`) and the init pattern is sequential -- different enough that
fusing it in would add more branches than it removes. Drops ~30 LOC.
---
 tests/python/test_simt.py | 71 ++++++++++++---------------------------
 1 file changed, 21 insertions(+), 50 deletions(-)

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
index 96aeb1e4dc..8c44a40bf9 100644
--- a/tests/python/test_simt.py
+++ b/tests/python/test_simt.py
@@ -1136,12 +1136,24 @@ def foo():
             _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}")
 
 
+_BLOCK_EXCLUSIVE_MINMAX_CASES = [
+    # (op_name, sentinel_fn, py_op, inf_sign)
+    pytest.param("min", _block_exclusive_min_sentinel, _PY_MIN, 1, id="min"),
+    pytest.param("max", _block_exclusive_max_sentinel, _PY_MAX, -1, id="max"),
+]
+
+
+@pytest.mark.parametrize("op_name,sentinel_fn,py_op,inf_sign", _BLOCK_EXCLUSIVE_MINMAX_CASES)
 @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
 @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
 @test_utils.test(arch=qd.gpu)
-def test_block_exclusive_min(dtype, sg_per_block):
-    """Block exclusive prefix min; thread 0 holds the dtype-derived identity (``+inf`` / ``np.iinfo(dtype).max``)."""
+def test_block_exclusive_minmax(dtype, sg_per_block, op_name, sentinel_fn, py_op, inf_sign):
+    """Block exclusive prefix ``<op>`` for ``op in {min, max}``; thread 0 of each block holds the dtype-derived
+    identity (``+inf`` / ``iinfo(dtype).max`` for min, ``-inf`` / ``iinfo(dtype).min`` for max). The float ``inf`` /
+    ``-inf`` lane-0 identity gets a sign-only check because ``inf - inf`` (or ``(-inf) - (-inf)``) is ``NaN`` and the
+    standard ``abs(diff) < tol`` compare would fail spuriously."""
     _skip_if_f64_unsupported(dtype)
+    op_fn = getattr(block, f"exclusive_{op_name}")
     block_dim = sg_per_block * _arch_subgroup_size()
     NUM_BLOCKS = 4
     N = NUM_BLOCKS * block_dim
@@ -1152,25 +1164,23 @@ def test_block_exclusive_min(dtype, sg_per_block):
     def foo():
         qd.loop_config(block_dim=block_dim)
         for i in range(N):
-            dst[i] = block.exclusive_min(src[i], block_dim, dtype)
+            dst[i] = op_fn(src[i], block_dim, dtype)
 
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
+    _init_block_reduce_src(src, N, dtype, permuted=True)
     foo()
 
-    sentinel = _block_exclusive_min_sentinel(dtype)
-    py_min = lambda a, b: a if a < b else b  # noqa: E731
+    sentinel = sentinel_fn(dtype)
     for b in range(NUM_BLOCKS):
         block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_exclusive_scan_op(block_vals, py_min, sentinel)
+        expected = _ref_exclusive_scan_op(block_vals, py_op, sentinel)
         for j in range(block_dim):
             actual = dst[b * block_dim + j]
             if dtype in _BLOCK_REDUCE_INT_DTYPES:
                 assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
             elif math.isinf(expected[j]):
-                # Thread 0 of each block gets the +inf identity; ``inf - inf`` is NaN, so check by equality / sign.
-                assert math.isinf(actual) and actual > 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
+                assert math.isinf(actual) and (
+                    actual > 0 if inf_sign > 0 else actual < 0
+                ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
             else:
                 assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
 
@@ -1304,45 +1314,6 @@ def kern():
     assert actual_ranks == ref_ranks, f"ranks mismatch (pattern={key_pattern})"
 
 
-@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES)
-@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK)
-@test_utils.test(arch=qd.gpu)
-def test_block_exclusive_max(dtype, sg_per_block):
-    """Block exclusive prefix max; thread 0 holds the dtype-derived identity (``-inf`` / ``np.iinfo(dtype).min``)."""
-    _skip_if_f64_unsupported(dtype)
-    block_dim = sg_per_block * _arch_subgroup_size()
-    NUM_BLOCKS = 4
-    N = NUM_BLOCKS * block_dim
-    src = qd.field(dtype=dtype, shape=N)
-    dst = qd.field(dtype=dtype, shape=N)
-
-    @qd.kernel
-    def foo():
-        qd.loop_config(block_dim=block_dim)
-        for i in range(N):
-            dst[i] = block.exclusive_max(src[i], block_dim, dtype)
-
-    for i in range(N):
-        v = ((i * 1009) % 997) + 1
-        src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
-    foo()
-
-    sentinel = _block_exclusive_max_sentinel(dtype)
-    py_max = lambda a, b: a if a > b else b  # noqa: E731
-    for b in range(NUM_BLOCKS):
-        block_vals = [src[b * block_dim + j] for j in range(block_dim)]
-        expected = _ref_exclusive_scan_op(block_vals, py_max, sentinel)
-        for j in range(block_dim):
-            actual = dst[b * block_dim + j]
-            if dtype in _BLOCK_REDUCE_INT_DTYPES:
-                assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            elif math.isinf(expected[j]):
-                # Thread 0 of each block gets the -inf identity; ``-inf - -inf`` is NaN, so check by equality / sign.
-                assert math.isinf(actual) and actual < 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-            else:
-                assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}"
-
-
 @pytest.mark.parametrize("dtype", [qd.i32, qd.f32, qd.f64])
 @test_utils.test(arch=qd.gpu)
 def test_subgroup_shuffle_broadcast(dtype):

From 62eb3aa0ce61b9771bb0209513b11832df340732 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:04:52 -0700
Subject: [PATCH 16/29] [Test] @pytest.mark.sample: per-test stochastic
 parametrize subsampling

Add a new opt-in marker for tests whose parametrize space is intentionally
large but where running every case every CI run is overkill. Used like:

    @pytest.mark.sample(n=4)             # keep 4 of N cases per run
    @pytest.mark.sample(fraction=0.25)   # keep 25% per run, min 1
    @pytest.mark.parametrize(...)
    ...

Over many runs each parametrize case asymptotically gets covered
(Pr[hit after r runs] = 1 - (1 - keep/total)^r). Reproducibility hooks:

  - whole sample reproducible: --sample-seed=<S> (seed printed in report
    header on every sampled run);
  - single failing case: paste the failing nodeid -- the sampler's
    len(group) <= 1 short-circuit keeps it without any flag;
  - exhaustive run: --no-sample.

Key implementation choices:

  - Seed picked on the *controller* in pytest_configure, not in
    pytest_collection_modifyitems. With pytest-xdist the latter runs per
    worker, so workers would otherwise each pick different seeds and
    sample different subsets, breaking the --sample-seed contract.
  - Per-test RNG keyed on (global_seed, nodeid_prefix). Adding / renaming /
    tweaking the mark on test_A does not shift test_B's sample, so
    routine refactors don't migrate failures.
  - Stratified per test function: each @sample-marked test keeps >= 1
    case per run (no silent zero-case runs).
  - Sampling runs *after* marker-based filtering. Composes with --run-slow:
    --no-sample --run-slow is the truly-exhaustive combo for nightly /
    release-gate runs.

Applied to the two highest-cardinality test_tile16.py tests as a starter set
(test_tile16_load_store, 32 cases -> 4; test_tile16_cholesky, 36 cases ->
4). Combined, this saves ~150-180s per cluster CI run with >=99% case
coverage within a 50-PR window.

Docs: new docs/source/user_guide/unit_testing.md consolidates the testing
content from contributing.md plus full docs for the slow / sample markers.
contributing.md trimmed to just reference unit_testing.md for testing
specifics. index.md wires the new page into the Testing toctree.

run_tests.py exposes --sample-seed and --no-sample passthrough flags.
pytest.ini registers the `sample` marker so --strict-markers doesn't
complain. Marker is also registered dynamically by conftest.py
(addinivalue_line) so external callers using the conftest in isolation
still work.
---
 docs/source/user_guide/contributing.md |  16 +-
 docs/source/user_guide/index.md        |   1 +
 docs/source/user_guide/unit_testing.md | 194 +++++++++++++++++++++++++
 tests/pytest.ini                       |   4 +
 tests/python/conftest.py               | 134 +++++++++++++++++
 tests/python/test_tile16.py            |  10 ++
 tests/run_tests.py                     |  22 +++
 7 files changed, 367 insertions(+), 14 deletions(-)
 create mode 100644 docs/source/user_guide/unit_testing.md

diff --git a/docs/source/user_guide/contributing.md b/docs/source/user_guide/contributing.md
index ec97b9529f..04e10790f0 100644
--- a/docs/source/user_guide/contributing.md
+++ b/docs/source/user_guide/contributing.md
@@ -2,25 +2,13 @@
 
 ## Good practice reminder
 
-* *testing*: Any new features or modified code should be tested. You have to run the test suite using `python tests/run_tests.py` which sets up the right test environment for `pytest`. CLI arguments are forwarded to `pytest`. Do not use `pytest` directly as it behaves differently. To see a per-file timing breakdown (useful for identifying slow test files), set `QD_FILE_TIMING=1` — e.g. `QD_FILE_TIMING=1 python tests/run_tests.py`. This is enabled by default in the Mac CI job and the results appear in the GitHub Actions job summary.
+* *testing*: Any new features or modified code should be tested. Run the test suite with `python tests/run_tests.py` — see [unit_testing.md](unit_testing.md) for how to scope by file / keyword / arch, how the `slow` and `sample` markers interact, and how to reproduce a single CI failure.
 * *format/linter*: Before pushing any commits, ensure you set up `pre-commit` and run it using `pre-commit run -a`
 * No need to force push to keep a clean history as the merging is eventually done by squashing commits.
 
 ## Running tests
 
-Run the test suite with `python tests/run_tests.py`. CLI arguments are forwarded to pytest. For example, to run only Metal tests matching a keyword:
-
-```
-python tests/run_tests.py --arch metal -k "test_tile16_cholesky"
-```
-
-The target architecture can also be set via the `QD_WANTED_ARCHS` environment variable (comma-separated, e.g. `QD_WANTED_ARCHS=metal,vulkan`).
-
-### Kernel compilation cache
-
-During test runs, compiled kernels are cached to disk so that the same kernel is not recompiled after each `qd.reset()`/`qd.init()` cycle.
-
-A fresh, empty cache directory is created for each test session by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) (typically under `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`). Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache.
+See [unit_testing.md](unit_testing.md) for the full reference: launcher flags, env vars, the `slow` / `sample` markers, how to reproduce a single failing parametrize case, and how to do exhaustive runs for release gates.
 
 ## Creating your build/dev environment
 
diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md
index b648f97527..c824a270e7 100644
--- a/docs/source/user_guide/index.md
+++ b/docs/source/user_guide/index.md
@@ -82,6 +82,7 @@ init_options
 :maxdepth: 1
 :titlesonly:
 
+unit_testing
 kernel_coverage
 ```
 
diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md
new file mode 100644
index 0000000000..734b989670
--- /dev/null
+++ b/docs/source/user_guide/unit_testing.md
@@ -0,0 +1,194 @@
+# Unit testing
+
+This page documents how to run, write, and tune the Quadrants Python unit test suite. For one-shot setup of the build / dev environment, see [contributing.md](contributing.md).
+
+## Running the tests
+
+The test suite is run via the project's launcher, **not** by invoking `pytest` directly:
+
+```
+python tests/run_tests.py
+```
+
+The launcher sets up the test-only env vars (kernel offline cache, watchdog, xdist worker count, etc.) and forwards any unrecognised flags to pytest. Calling `pytest` directly skips that setup and behaves differently.
+
+Common one-liners:
+
+```
+# run one file
+python tests/run_tests.py test_tile16
+
+# run one test (any pytest -k expression)
+python tests/run_tests.py -k test_tile16_cholesky
+
+# run on a specific backend (or comma-separated list)
+python tests/run_tests.py --arch cuda
+python tests/run_tests.py --arch metal -k tile16
+
+# same, via env var (handy for CI)
+QD_WANTED_ARCHS=metal,vulkan python tests/run_tests.py
+
+# rerun the last failing tests first
+python tests/run_tests.py -f
+
+# stop at the first failure
+python tests/run_tests.py -x
+```
+
+The target architecture can also be set via `QD_WANTED_ARCHS` (comma-separated; supports `^arch` to exclude rather than include). Per-test timeouts default to 600 s and are enforced by `pytest_hardtle`, a CFFI-compiled C watchdog that can kill tests hung in native GPU calls even when the GIL is held.
+
+## Kernel compilation cache
+
+During each test session the kernel compilation cache lives in a fresh, empty temp directory created by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) — typically `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`. Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache, and avoids recompiling identical kernels after each `qd.reset()` / `qd.init()` cycle within a session.
+
+## Per-file timing breakdown
+
+Set `QD_FILE_TIMING=1` to print a per-file duration summary at the end of the session:
+
+```
+QD_FILE_TIMING=1 python tests/run_tests.py
+```
+
+This is enabled by default in the Mac CI job; the results appear in the GitHub Actions job summary and are the primary tool for identifying slow test files.
+
+## Markers
+
+Tests can opt into two project-specific markers, in addition to pytest's built-in ones (`skip`, `xfail`, etc.).
+
+### `@pytest.mark.slow`
+
+Marks a test (or, more commonly, a specific `pytest.param(...)` case inside a parametrize list) as **slow** — long enough that the default test suite skips it. `tests/run_tests.py` adds `-m "not slow"` to the pytest invocation by default; pass `--run-slow` to opt back in:
+
+```
+# default: skip slow
+python tests/run_tests.py
+
+# include slow
+python tests/run_tests.py --run-slow
+
+# slow ONLY (e.g. nightly job)
+python tests/run_tests.py -m slow --run-slow
+```
+
+The marker is used in two patterns:
+
+1. **Whole-test slow**: rare. The whole test always takes a long time and there's no smaller variant.
+
+   ```python
+   @pytest.mark.slow
+   def test_thing_that_is_always_slow():
+       ...
+   ```
+
+2. **Slow-marked parametrize case** (preferred when applicable): a test parametrizes over a size axis and the large value is slow but the small value is cheap. The small value stays in the default suite as a smoke test; the large value moves to the slow lane. This is the dominant pattern in `tests/python/test_eig.py`, `test_linalg.py`, `test_ad_gdar_diffmpm.py`, etc.
+
+   ```python
+   @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
+   def test_sym_eig_general(n):
+       ...
+   ```
+
+   With this pattern the default suite still exercises the code path; the slow lane just adds the larger-size variant for full coverage.
+
+### `@pytest.mark.sample(...)`
+
+Marks a single heavily-parametrized test as opting in to **per-run stochastic sub-selection** of its parametrize cases. Use when:
+
+- the test's parametrize space is large (≥ ~16 cases),
+- each parametrize case is roughly independent (covering an independent corner case rather than a single bug class),
+- running every case every CI run is overkill, and
+- coverage convergence over many runs is acceptable for that test.
+
+Apply it like any other marker, above the existing parametrize stack:
+
+```python
+@pytest.mark.sample(n=4)                     # keep 4 of N cases per run
+# OR
+@pytest.mark.sample(fraction=0.25)           # keep 25% of cases per run, min 1
+@pytest.mark.parametrize("size", [...])
+@pytest.mark.parametrize("dtype", [...])
+@pytest.mark.parametrize("layout", [...])
+@test_utils.test(arch=qd.gpu)
+def test_thing(size, dtype, layout):
+    ...
+```
+
+**Convergence math.** With `keep_n / total = k / N`, the probability that a *specific* parametrize case has been hit after `r` runs is `1 - (1 - k/N)^r`. For `n=4` out of 32 (`test_tile16_load_store`): ~50% after 5 runs, ~93% after 20 runs. Combined with our CI cadence and the fact that any persistent bug surface lights up across multiple PRs, this gives effectively full coverage on a many-PR horizon at a fraction of the per-PR cost.
+
+**How to reproduce.** Three levels of reproducibility:
+
+1. **One failing case** — paste the failing nodeid from the CI log. Pytest already prints the full nodeid on failure:
+
+   ```
+   FAILED tests/python/test_tile16.py::test_tile16_load_store[arch=cuda-qd_dtype0-ndarray-16-32-4-8-7-11]
+   ```
+
+   Just rerun it directly:
+
+   ```
+   python tests/run_tests.py -k "test_tile16_load_store and ndarray-16-32-4-8-7-11"
+   # or pass the exact nodeid straight to pytest (bypasses -k matching, but also skips the launcher's env setup):
+   pytest "tests/python/test_tile16.py::test_tile16_load_store[arch=cuda-qd_dtype0-ndarray-16-32-4-8-7-11]"
+   ```
+
+   When pytest narrows collection to a single nodeid, the sampler's `len(group) <= 1` short-circuit keeps it. **No `--sample-seed` flag needed.**
+
+2. **The exact subset of a failing run** — useful when several cases failed and you want to bisect or reproduce the whole sample locally. The report header of every run prints the seed used:
+
+   ```
+   sample-seed=1834729104  (reproduce the same sample: --sample-seed=1834729104; ...)
+   ```
+
+   Then locally:
+
+   ```
+   python tests/run_tests.py --sample-seed=1834729104
+   ```
+
+3. **Exhaustive run** — for release gates, coverage-debt audits, or a periodic "did anything regress in any branch of the parametrize space" sweep. Disables the sampler entirely; every `@sample`-marked test runs every parametrize case:
+
+   ```
+   python tests/run_tests.py --no-sample
+   ```
+
+**Per-test RNG independence.** Each `@sample`-marked test's subsample is seeded from `(global_seed, test_nodeid_prefix)`, so adding / renaming / tweaking the mark on `test_A` does NOT shift the sample of `test_B`. Routine refactors don't cause samples to migrate file-wide.
+
+**Composition with `slow`.** Sampling runs **after** marker-based filtering. With `--run-slow` not passed (the default), slow-marked parametrize cases drop out first, then the sampler sub-selects from the remaining (fast) cases. The intersection is the right composition: `--no-sample --run-slow` is the truly-exhaustive combo.
+
+**xdist note.** The seed is chosen once, on the xdist controller, and handed to every worker — it is *not* drawn independently in each worker's `pytest_collection_modifyitems`. All workers therefore see the same seed and produce the same sample. This is intentional — if each worker subsampled independently, the workers would collect different test sets (xdist aborts with "Different tests were collected between gw0 and gwN") and `--sample-seed=<S>` wouldn't reproduce the run.
+
+**When *not* to use `@sample`.** If the test's parametrize axes are not roughly independent — e.g. axis A's bug surface only lights up when axis B is at a specific value — sampling can miss the interaction. Use `@slow` on the expensive subset instead, and keep the full Cartesian product for the cheap subset.
+
+## Writing new tests
+
+The standard recipe combines `@test_utils.test(...)` (arch / option matrix) with `@pytest.mark.parametrize`:
+
+```python
+import pytest
+import quadrants as qd
+from tests import test_utils
+
+
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
+@test_utils.test(arch=qd.gpu, default_fp=qd.f32)
+def test_my_thing(n):
+    ...
+```
+
+`@test_utils.test` is what wires the test into the per-backend matrix and applies platform exclusions (`exclude=`), extension requirements (`require=`, e.g. `qd.extension.data64` for f64 tests), and per-test options (`default_fp`, `fast_math`, etc.). See `tests/test_utils.py` for the full surface.
+
+Common helpers in `tests/test_utils.py`:
+
+- `test_utils.skip_if_f64_unsupported(dtype)` — skip the current test at runtime if `dtype == qd.f64` and the active backend can't carry f64 through buffer I/O (Metal, MoltenVK on Darwin). Use inside a parametrized test that sweeps both f32 and f64.
+- `test_utils.expected_archs()` — list of archs that the current `QD_WANTED_ARCHS` allows. Used to skip tests with no satisfiable arch.
+
+## CI checks
+
+A subset of CI jobs care about the test suite specifically:
+
+- **linux / macosx / win** — build and run the full python suite on each platform.
+- **test-gpu** — GPU-specific tests on the cluster.
+- **coverage report** — a one-line diff coverage summary is posted as a PR comment, with kernel-level branch coverage. See [Kernel code coverage](kernel_coverage.md).
+- **Test coverage check (`check_test_coverage.yml`)** — an AI agent that flags new or modified non-test source code that doesn't have corresponding test coverage in the PR.
+
+See [contributing.md](contributing.md) for the full list of CI checks (linters, pyright, link checking, PR change report, etc.).
diff --git a/tests/pytest.ini b/tests/pytest.ini
index efaf40e6c6..3fbc75158c 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -5,3 +5,7 @@ markers =
     needs_torch: mark test as requiring PyTorch.
     slow: mark test (or parametrize case) as slow. Skipped by default by tests/run_tests.py;
         pass --run-slow to include them, or directly `pytest -m slow` to run only the slow ones.
+    sample(fraction=None, n=None): per-test stochastic parametrize subsampling. Pass exactly one of
+        `fraction` (0..1) or `n` (>= 1). Implemented in tests/python/conftest.py. See
+        docs/source/user_guide/unit_testing.md for the reproducibility recipes (--sample-seed,
+        --no-sample, nodeid-paste).
diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 9e8f816a11..86f5bc4557 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -1,5 +1,6 @@
 import gc
 import os
+import random
 import sys
 import time
 
@@ -15,6 +16,139 @@
 pytest_rerunfailures.works_with_current_xdist = lambda: True
 
 
+# ---------------------------------------------------------------------------
+# @pytest.mark.sample(...)  --  per-test stochastic parametrize subsampling
+# ---------------------------------------------------------------------------
+#
+# Some tests parametrize so widely (test_tile16_load_store, test_tile16_cholesky, ...) that running every case on
+# every CI run is wasteful: the parametrize axes are intentionally varied to cover corner cases, but most runs would
+# get the same signal from a small random subset. ``@pytest.mark.sample(n=...)`` or ``@pytest.mark.sample(fraction=...)``
+# opts a *single* test into per-run random sub-selection. Over many runs, each parametrize case asymptotically gets
+# covered (Pr[hit after k runs] = 1 - (1 - keep/total)^k).
+#
+# Reproducibility hooks:
+#   - whole-suite: ``--sample-seed=<S>`` reproduces the exact same trimmed set (header prints the seed used).
+#   - single failing case: paste the failing nodeid into ``pytest <nodeid>`` -- the sampler's ``len(group) <= 1``
+#     short-circuit keeps it; no flags needed.
+#   - exhaustive run (release gate / coverage audit): ``--no-sample`` skips the sampler entirely.
+#
+# Per-test RNG keyed on ``(seed, nodeid_prefix)``: adding / renaming a @sample-marked test does NOT shift any other
+# test's sample. Routine refactors don't migrate failures.
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--sample-seed",
+        type=int,
+        default=None,
+        help="Seed for @pytest.mark.sample subsampling. If absent, a fresh seed is picked and printed "
+        "in the report header so a failing run can be reproduced via --sample-seed=<S>.",
+    )
+    parser.addoption(
+        "--no-sample",
+        action="store_true",
+        default=False,
+        help="Disable @pytest.mark.sample subsampling -- run every parametrize case of every marked test. "
+        "Use for exhaustive CI release gates / coverage-debt audits.",
+    )
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_configure(config):
+    # The marker is registered here (rather than only in pytest.ini) so callers that use
+    # `--strict-markers` don't blow up if they happen to import this conftest in isolation.
+    config.addinivalue_line(
+        "markers",
+        "sample(fraction=None, n=None): per-test stochastic parametrize subsampling. Pass exactly one of "
+        "`fraction` (0..1) or `n` (>= 1). Seed printed in report header; rerun the same sample with "
+        "--sample-seed=<S>; rerun every case with --no-sample; rerun a single failing case by pasting its nodeid.",
+    )
+    # IMPORTANT: pick the seed on the *controller* here, not inside pytest_collection_modifyitems. With pytest-xdist
+    # the latter runs on every worker, so workers would each pick different seeds and sample different subsets,
+    # breaking the contract that a single ``--sample-seed`` describes the entire run. ``config`` is replicated to
+    # xdist workers, so once we set ``sample_seed`` here every worker sees the same value.
+    if not config.getoption("--no-sample") and config.getoption("--sample-seed") is None:
+        config.option.sample_seed = random.randrange(0, 2**31)
+
+
+def pytest_report_header(config):
+    if config.getoption("--no-sample"):
+        return "sample: --no-sample (every @sample-marked test runs every parametrize case)"
+    seed = config.getoption("--sample-seed")
+    if seed is None:
+        return None
+    return (
+        f"sample-seed={seed}  (reproduce the same sample: --sample-seed={seed}; "
+        f"reproduce a single failure: paste its nodeid; run every case: --no-sample)"
+    )
+
+
+def _sample_keep_count(mark, group_size, group_key):
+    """Resolve ``@pytest.mark.sample(fraction=..., n=...)`` for a group of ``group_size`` parametrize cases.
+
+    Exactly one of ``fraction`` (0..1) or ``n`` (int >= 1) must be passed; ``UsageError`` otherwise. The result is
+    clamped to ``[1, group_size]`` so every @sample-marked test runs at least one case per run (no silent zero-case
+    runs even if e.g. ``fraction * group_size`` rounds to zero on a 1-case group).
+    """
+    fraction = mark.kwargs.get("fraction")
+    n = mark.kwargs.get("n")
+    if (fraction is None) == (n is None):
+        raise pytest.UsageError(
+            f"@pytest.mark.sample on {group_key!r}: pass exactly one of `fraction` or `n`, got "
+            f"fraction={fraction!r}, n={n!r}"
+        )
+    if fraction is not None:
+        return max(1, int(round(group_size * float(fraction))))
+    return max(1, min(int(n), group_size))
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--no-sample"):
+        return
+    seed = config.getoption("--sample-seed")
+    if seed is None:
+        # Defensive: pytest_configure didn't run (e.g. someone imported this module manually). Nothing to do.
+        return
+
+    # Group items by test function (strip the parametrize bracket suffix). Per-function stratification is what
+    # guarantees every @sample-marked test keeps at least one case per run -- uniform sampling across all items
+    # could otherwise drop a 2-case marked test entirely.
+    groups: dict[str, list] = {}
+    for item in items:
+        key = item.nodeid.split("[", 1)[0]
+        groups.setdefault(key, []).append(item)
+
+    keep, deselected = [], []
+    # ``sorted(groups)`` so the iteration order (and therefore any incidental RNG advance) is reproducible across
+    # Python versions / dict insertion orders. Per-test RNG is keyed below so this only matters for the (cheap)
+    # bookkeeping order.
+    for key in sorted(groups):
+        group = groups[key]
+        mark = group[0].get_closest_marker("sample")
+        if mark is None or len(group) <= 1:
+            # No sample mark -> every case runs. Also: a single-item group means either the test only had one
+            # parametrize case to begin with, or pytest narrowed collection to a specific nodeid -- both cases
+            # should run as-is. This is what makes "paste failing nodeid" work without --no-sample.
+            keep.extend(group)
+            continue
+        keep_n = _sample_keep_count(mark, len(group), key)
+        # Per-test RNG: keyed on (seed, key) so:
+        #   - Independence: adding / renaming / tweaking the @sample mark on test_A does NOT shift the sample of
+        #     test_B. Routine refactors don't cause failures to migrate file-wide.
+        #   - Locality: when debugging, you can reason about one test's sample without simulating all the others'
+        #     RNG advances.
+        rng = random.Random((seed, key))
+        kept_ids = {id(it) for it in rng.sample(group, k=keep_n)}
+        for it in group:
+            (keep if id(it) in kept_ids else deselected).append(it)
+
+    if deselected:
+        # ``pytest_deselected`` is the supported way to report filtered-out items so pytest's summary shows them as
+        # deselected (not silently dropped). xdist also forwards this to the controller correctly.
+        config.hook.pytest_deselected(items=deselected)
+    items[:] = keep
+
+
 @pytest.fixture(scope="session", autouse=True)
 def _offline_cache_dir(tmp_path_factory):
     """Enable the kernel compilation disk cache for the test session.
diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index f94d4221e1..78dd798b94 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -92,6 +92,12 @@ def k1(src_arr: Ann, dst_arr: Ann):
     np.testing.assert_allclose(dst.to_numpy(), np.eye(_TILE, dtype=np_dtype))
 
 
+# 8 geometries x 2 tensor_type x 2 qd_dtype = 32 parametrize cases. The geometries enumerate hand-picked corner
+# cases (origin, non-zero src/dst offsets, partial cols/rows, oversize backing array); coverage of any single
+# geometry is more valuable than running every combination every CI run. ``@pytest.mark.sample(n=4)`` keeps 4 of
+# the 32 cases per run; after k runs each specific case is hit with probability 1 - (28/32)^k = 1 - 0.875^k
+# (~50% after 5 runs, ~93% after 20). See docs/source/user_guide/unit_testing.md for the reproducibility recipes.
+@pytest.mark.sample(n=4)
 @pytest.mark.parametrize(
     "src_row, src_col, row_offset, col_offset, ncols, nrows",
     [
@@ -439,6 +445,10 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=atol)
 
 
+# 3 dst_delta x 3 src_offset x 2 tensor_type x 2 qd_dtype = 36 parametrize cases. Each case is an independent
+# offset/delta combo; running 4 random ones per CI run with full convergence over ~20 runs is the right tradeoff
+# given each case takes ~5s of cluster wall time. See unit_testing.md.
+@pytest.mark.sample(n=4)
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("dst_delta", [0, 3, 16])
 @pytest.mark.parametrize("src_offset", [0, 5, 32])
diff --git a/tests/run_tests.py b/tests/run_tests.py
index bf37ab2aa7..060fe87f9b 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -65,6 +65,10 @@ def _test_python(args, default_dir="python"):
             marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow"
         if marks_expr:
             pytest_args += ["-m", marks_expr]
+        if args.sample_seed is not None:
+            pytest_args += [f"--sample-seed={args.sample_seed}"]
+        if args.no_sample:
+            pytest_args += ["--no-sample"]
         if args.failed_first:
             pytest_args += ["--failed-first"]
         if args.fail_fast:
@@ -179,6 +183,24 @@ def test():
         help="Include tests marked `slow` (excluded by default). Has no effect if -m is "
         "given an explicit expression that already mentions `slow`.",
     )
+    parser.add_argument(
+        "--sample-seed",
+        required=False,
+        default=None,
+        type=int,
+        dest="sample_seed",
+        help="Seed for @pytest.mark.sample subsampling. Defaults to a fresh seed picked per run "
+        "(printed in the report header). Pass the seed from a failing CI run to reproduce its sample.",
+    )
+    parser.add_argument(
+        "--no-sample",
+        required=False,
+        default=False,
+        dest="no_sample",
+        action="store_true",
+        help="Disable @pytest.mark.sample subsampling -- run every parametrize case of every marked test. "
+        "Use for exhaustive CI release gates / coverage-debt audits.",
+    )
     parser.add_argument(
         "-f",
         "--failed-first",

From a4badcf6a728f6d4ee735fd2f0c12eb32f115117 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:18:25 -0700
Subject: [PATCH 17/29] [Test] test_tile16: bump @pytest.mark.sample(n=4) ->
 n=6

Tighten per-PR coverage on the two initial @sample-marked tests:
test_tile16_load_store (32 cases -> 6/run) and test_tile16_cholesky
(36 cases -> 6/run). Trades a small per-CI saving (now ~150s instead of
~180s) for materially better cumulative per-case coverage: ~65% / ~60%
after 5 runs (was ~49% / ~45%), ~98% / ~97% after 20 runs (was ~93% / ~90%).
Same convergence math, larger keep-fraction. Doc + comment updated to
match.
---
 docs/source/user_guide/unit_testing.md |  4 ++--
 tests/python/test_tile16.py            | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md
index 734b989670..67ef5c26eb 100644
--- a/docs/source/user_guide/unit_testing.md
+++ b/docs/source/user_guide/unit_testing.md
@@ -102,7 +102,7 @@ Marks a single heavily-parametrized test as opting in to **per-run stochastic su
 Apply it like any other marker, above the existing parametrize stack:
 
 ```python
-@pytest.mark.sample(n=4)                     # keep 4 of N cases per run
+@pytest.mark.sample(n=6)                     # keep 6 of N cases per run
 # OR
 @pytest.mark.sample(fraction=0.25)           # keep 25% of cases per run, min 1
 @pytest.mark.parametrize("size", [...])
@@ -113,7 +113,7 @@ def test_thing(size, dtype, layout):
     ...
 ```
 
-**Convergence math.** With `keep_n / total = k / N`, the probability that a *specific* parametrize case has been hit after `r` runs is `1 - (1 - k/N)^r`. For `n=4` out of 32 (`test_tile16_load_store`): ~50% after 5 runs, ~93% after 20 runs. Combined with our CI cadence and the fact that any persistent bug surface lights up across multiple PRs, this gives effectively full coverage on a many-PR horizon at a fraction of the per-PR cost.
+**Convergence math.** With `keep_n / total = k / N`, the probability that a *specific* parametrize case has been hit after `r` runs is `1 - (1 - k/N)^r`. For `n=6` out of 32 (`test_tile16_load_store`): ~65% after 5 runs, ~88% after 10, ~98% after 20. Combined with our CI cadence and the fact that any persistent bug surface lights up across multiple PRs, this gives effectively full coverage on a many-PR horizon at a fraction of the per-PR cost.
 
 **How to reproduce.** Three levels of reproducibility:
 
diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index 78dd798b94..4ad9ad3a5c 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -94,10 +94,10 @@ def k1(src_arr: Ann, dst_arr: Ann):
 
 # 8 geometries x 2 tensor_type x 2 qd_dtype = 32 parametrize cases. The geometries enumerate hand-picked corner
 # cases (origin, non-zero src/dst offsets, partial cols/rows, oversize backing array); coverage of any single
-# geometry is more valuable than running every combination every CI run. ``@pytest.mark.sample(n=4)`` keeps 4 of
-# the 32 cases per run; after k runs each specific case is hit with probability 1 - (28/32)^k = 1 - 0.875^k
-# (~50% after 5 runs, ~93% after 20). See docs/source/user_guide/unit_testing.md for the reproducibility recipes.
-@pytest.mark.sample(n=4)
+# geometry is more valuable than running every combination every CI run. ``@pytest.mark.sample(n=6)`` keeps 6 of
+# the 32 cases per run; after k runs each specific case is hit with probability 1 - (26/32)^k = 1 - 0.8125^k
+# (~65% after 5 runs, ~98% after 20). See docs/source/user_guide/unit_testing.md for the reproducibility recipes.
+@pytest.mark.sample(n=6)
 @pytest.mark.parametrize(
     "src_row, src_col, row_offset, col_offset, ncols, nrows",
     [
@@ -446,9 +446,9 @@ def k1(
 
 
 # 3 dst_delta x 3 src_offset x 2 tensor_type x 2 qd_dtype = 36 parametrize cases. Each case is an independent
-# offset/delta combo; running 4 random ones per CI run with full convergence over ~20 runs is the right tradeoff
+# offset/delta combo; running 6 random ones per CI run with ~97% convergence over 20 runs is the right tradeoff
 # given each case takes ~5s of cluster wall time. See unit_testing.md.
-@pytest.mark.sample(n=4)
+@pytest.mark.sample(n=6)
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("dst_delta", [0, 3, 16])
 @pytest.mark.parametrize("src_offset", [0, 5, 32])

From 446aef723c1ab43145c1795a9c146927c45f5446 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:22:19 -0700
Subject: [PATCH 18/29] [Style] Reflow @sample comment blocks to 120c (from AI
 default ~110c)

The @pytest.mark.sample machinery added in 62eb3aa0c / a4badcf6a was wrapped
at the AI-default ~110c rather than the project target 120c. Re-flow prose
blocks in conftest.py and test_tile16.py to use the full line width. No
behavior change.
---
 tests/python/conftest.py    | 18 +++++++++---------
 tests/python/test_tile16.py | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 86f5bc4557..bc7df588c0 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -20,11 +20,11 @@
 # @pytest.mark.sample(...)  --  per-test stochastic parametrize subsampling
 # ---------------------------------------------------------------------------
 #
-# Some tests parametrize so widely (test_tile16_load_store, test_tile16_cholesky, ...) that running every case on
-# every CI run is wasteful: the parametrize axes are intentionally varied to cover corner cases, but most runs would
-# get the same signal from a small random subset. ``@pytest.mark.sample(n=...)`` or ``@pytest.mark.sample(fraction=...)``
-# opts a *single* test into per-run random sub-selection. Over many runs, each parametrize case asymptotically gets
-# covered (Pr[hit after k runs] = 1 - (1 - keep/total)^k).
+# Some tests parametrize so widely (test_tile16_load_store, test_tile16_cholesky, ...) that running every case on every
+# CI run is wasteful: the parametrize axes are intentionally varied to cover corner cases, but most runs would get the
+# same signal from a small random subset. ``@pytest.mark.sample(n=...)`` or ``@pytest.mark.sample(fraction=...)`` opts a
+# *single* test into per-run random sub-selection. Over many runs, each parametrize case asymptotically gets covered
+# (Pr[hit after k runs] = 1 - (1 - keep/total)^k).
 #
 # Reproducibility hooks:
 #   - whole-suite: ``--sample-seed=<S>`` reproduces the exact same trimmed set (header prints the seed used).
@@ -133,10 +133,10 @@ def pytest_collection_modifyitems(config, items):
             continue
         keep_n = _sample_keep_count(mark, len(group), key)
         # Per-test RNG: keyed on (seed, key) so:
-        #   - Independence: adding / renaming / tweaking the @sample mark on test_A does NOT shift the sample of
-        #     test_B. Routine refactors don't cause failures to migrate file-wide.
-        #   - Locality: when debugging, you can reason about one test's sample without simulating all the others'
-        #     RNG advances.
+        #   - Independence: adding / renaming / tweaking the @sample mark on test_A does NOT shift the sample of test_B.
+        #     Routine refactors don't cause failures to migrate file-wide.
+        #   - Locality: when debugging, you can reason about one test's sample without simulating all the others' RNG
+        #     advances.
         rng = random.Random((seed, key))
         kept_ids = {id(it) for it in rng.sample(group, k=keep_n)}
         for it in group:
diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index 4ad9ad3a5c..d919dda08b 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -92,11 +92,11 @@ def k1(src_arr: Ann, dst_arr: Ann):
     np.testing.assert_allclose(dst.to_numpy(), np.eye(_TILE, dtype=np_dtype))
 
 
-# 8 geometries x 2 tensor_type x 2 qd_dtype = 32 parametrize cases. The geometries enumerate hand-picked corner
-# cases (origin, non-zero src/dst offsets, partial cols/rows, oversize backing array); coverage of any single
-# geometry is more valuable than running every combination every CI run. ``@pytest.mark.sample(n=6)`` keeps 6 of
-# the 32 cases per run; after k runs each specific case is hit with probability 1 - (26/32)^k = 1 - 0.8125^k
-# (~65% after 5 runs, ~98% after 20). See docs/source/user_guide/unit_testing.md for the reproducibility recipes.
+# 8 geometries x 2 tensor_type x 2 qd_dtype = 32 parametrize cases. The geometries enumerate hand-picked corner cases
+# (origin, non-zero src/dst offsets, partial cols/rows, oversize backing array); coverage of any single geometry is
+# more valuable than running every combination every CI run. ``@pytest.mark.sample(n=6)`` keeps 6 of the 32 cases per
+# run; after k runs each specific case is hit with probability 1 - (26/32)^k = 1 - 0.8125^k (~65% after 5 runs, ~98%
+# after 20). See docs/source/user_guide/unit_testing.md for the reproducibility recipes.
 @pytest.mark.sample(n=6)
 @pytest.mark.parametrize(
     "src_row, src_col, row_offset, col_offset, ncols, nrows",
@@ -445,9 +445,9 @@ def k1(
     np.testing.assert_allclose(out.to_numpy(), expected, atol=atol)
 
 
-# 3 dst_delta x 3 src_offset x 2 tensor_type x 2 qd_dtype = 36 parametrize cases. Each case is an independent
-# offset/delta combo; running 6 random ones per CI run with ~97% convergence over 20 runs is the right tradeoff
-# given each case takes ~5s of cluster wall time. See unit_testing.md.
+# 3 dst_delta x 3 src_offset x 2 tensor_type x 2 qd_dtype = 36 parametrize cases. Each case is an independent offset /
+# delta combo; running 6 random ones per CI run with ~97% convergence over 20 runs is the right tradeoff given each
+# case takes ~5s of cluster wall time. See unit_testing.md.
 @pytest.mark.sample(n=6)
 @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field])
 @pytest.mark.parametrize("dst_delta", [0, 3, 16])

From 61718fefad0fa74ec0c052cd67038891a1a5d2b6 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:28:49 -0700
Subject: [PATCH 19/29] [BugFix] @sample: propagate seed to xdist workers via
 workerinput

The previous implementation of ``pytest_configure`` ran on every xdist worker
and called ``random.randrange()`` independently, so each worker computed a
different seed and (via ``pytest_collection_modifyitems``) sampled a different
subset of cases. xdist then aborted the run with "Different tests were collected
between gw0 and gwN".

Fix: use xdist's ``pytest_configure_node`` controller-only hook to stash the
controller-chosen seed in the worker's ``workerinput`` dict, and have
``pytest_configure`` on the worker side read from there instead of generating
its own seed. With this change ``--sample-seed`` is consistent across all
workers and the @sample-marked subset matches on every worker, so collection
no longer diverges.

Smoke-tested via cluster bench: previous run failed with the collection
mismatch at the 9s mark; this run gets past collection and exercises the
sampler on real xdist workers.
---
 tests/python/conftest.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index bc7df588c0..af0220a3bd 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -65,12 +65,27 @@ def pytest_configure(config):
     )
     # IMPORTANT: pick the seed on the *controller* here, not inside pytest_collection_modifyitems. With pytest-xdist
     # the latter runs on every worker, so workers would each pick different seeds and sample different subsets,
-    # breaking the contract that a single ``--sample-seed`` describes the entire run. ``config`` is replicated to
-    # xdist workers, so once we set ``sample_seed`` here every worker sees the same value.
-    if not config.getoption("--no-sample") and config.getoption("--sample-seed") is None:
+    # breaking the contract that a single ``--sample-seed`` describes the entire run. The controller -> worker handoff
+    # uses xdist's ``workerinput`` dict (populated in ``pytest_configure_node`` below); workers read from there in
+    # ``pytest_configure``. Runtime-set ``config.option`` attributes are NOT auto-replicated to workers.
+    if hasattr(config, "workerinput"):
+        # xdist worker: read seed from controller via workerinput.
+        seed = config.workerinput.get("sample_seed")
+        if seed is not None:
+            config.option.sample_seed = seed
+    elif not config.getoption("--no-sample") and config.getoption("--sample-seed") is None:
+        # Controller (or non-xdist run): pick the run's seed once.
         config.option.sample_seed = random.randrange(0, 2**31)
 
 
+def pytest_configure_node(node):
+    # xdist hook: runs on the controller for each worker about to be spawned. Stash the run-wide sample seed in the
+    # worker's ``workerinput`` dict so ``pytest_configure`` on the worker side picks up the same value.
+    seed = node.config.getoption("--sample-seed")
+    if seed is not None:
+        node.workerinput["sample_seed"] = seed
+
+
 def pytest_report_header(config):
     if config.getoption("--no-sample"):
         return "sample: --no-sample (every @sample-marked test runs every parametrize case)"

From 5205ef0cc888b02c5d11e0de9a3f5ca4b556be8b Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:34:01 -0700
Subject: [PATCH 20/29] [BugFix] @sample: propagate seed to xdist workers via
 QD_SAMPLE_SEED env var

The previous attempt to use xdist's ``pytest_configure_node`` /
``workerinput`` hook didn't fire when conftest.py lives below rootdir
(here: ``tests/python/conftest.py`` while rootdir is ``tests/``), so each
worker re-drew its own seed and collection still diverged.

Switch to an environment-variable handoff (``QD_SAMPLE_SEED``). xdist's popen
gateway inherits ``os.environ`` from the controller process, so the seed
chosen once on the controller is seen identically by every worker regardless
of conftest depth. Logic:

  - ``--no-sample``                  -> sampler disabled, no seed needed.
  - ``--sample-seed=N`` on argv      -> xdist forwards argv; every process sees it.
  - ``QD_SAMPLE_SEED`` in os.environ -> worker inherits the controller's seed.
  - else (controller / no xdist)     -> pick once, set os.environ for workers.
---
 tests/python/conftest.py | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index af0220a3bd..e6e62f21f7 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -63,27 +63,25 @@ def pytest_configure(config):
         "`fraction` (0..1) or `n` (>= 1). Seed printed in report header; rerun the same sample with "
         "--sample-seed=<S>; rerun every case with --no-sample; rerun a single failing case by pasting its nodeid.",
     )
-    # IMPORTANT: pick the seed on the *controller* here, not inside pytest_collection_modifyitems. With pytest-xdist
-    # the latter runs on every worker, so workers would each pick different seeds and sample different subsets,
-    # breaking the contract that a single ``--sample-seed`` describes the entire run. The controller -> worker handoff
-    # uses xdist's ``workerinput`` dict (populated in ``pytest_configure_node`` below); workers read from there in
-    # ``pytest_configure``. Runtime-set ``config.option`` attributes are NOT auto-replicated to workers.
-    if hasattr(config, "workerinput"):
-        # xdist worker: read seed from controller via workerinput.
-        seed = config.workerinput.get("sample_seed")
-        if seed is not None:
-            config.option.sample_seed = seed
-    elif not config.getoption("--no-sample") and config.getoption("--sample-seed") is None:
-        # Controller (or non-xdist run): pick the run's seed once.
-        config.option.sample_seed = random.randrange(0, 2**31)
-
-
-def pytest_configure_node(node):
-    # xdist hook: runs on the controller for each worker about to be spawned. Stash the run-wide sample seed in the
-    # worker's ``workerinput`` dict so ``pytest_configure`` on the worker side picks up the same value.
-    seed = node.config.getoption("--sample-seed")
-    if seed is not None:
-        node.workerinput["sample_seed"] = seed
+    # IMPORTANT: pick the seed on the *controller* once, then propagate it to every xdist worker. ``pytest_configure``
+    # runs on the controller AND on every worker; without explicit propagation each worker would draw a fresh seed,
+    # sample a different subset, and xdist would abort collection with "Different tests were collected between gw0 and
+    # gwN". We use an environment variable (``QD_SAMPLE_SEED``) because xdist's popen gateway inherits ``os.environ``
+    # from the controller -- this works regardless of conftest depth, unlike the ``pytest_configure_node`` /
+    # ``workerinput`` hook which only fires for conftests at the rootdir level.
+    if config.getoption("--no-sample"):
+        pass  # Sampler disabled; no seed needed.
+    elif config.getoption("--sample-seed") is not None:
+        # Explicit ``--sample-seed=N`` is already on argv -> xdist forwards argv to workers, so every process sees it.
+        pass
+    elif "QD_SAMPLE_SEED" in os.environ:
+        # Worker (or re-entrant run): inherit the seed the controller picked.
+        config.option.sample_seed = int(os.environ["QD_SAMPLE_SEED"])
+    else:
+        # Controller (or non-xdist run): pick the seed once and publish it for workers.
+        seed = random.randrange(0, 2**31)
+        config.option.sample_seed = seed
+        os.environ["QD_SAMPLE_SEED"] = str(seed)
 
 
 def pytest_report_header(config):

From 7dd7a6d1e179ec2712f145e7c0e58c138cb181f5 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:40:55 -0700
Subject: [PATCH 21/29] [BugFix] @sample: pick seed in run_tests.py, propagate
 via --sample-seed argv

xdist's collection-consistency check requires every worker to sample the same
parametrize subset. Earlier attempts to propagate the per-run seed via
``config.option`` (runtime-set values are not replicated to workers), via
``pytest_configure_node`` / ``workerinput`` (only fires for conftests at
rootdir level), and via
``os.environ`` (xdist snapshots env before pytest_configure mutates it) all
left workers drawing independent seeds and aborting collection with
"Different tests were collected between gw0 and gwN".

Switch to the only mechanism that reliably propagates across xdist workers:
argv. ``tests/run_tests.py`` now picks the seed once per invocation and passes
``--sample-seed=<S>`` to pytest, which xdist forwards verbatim to every worker
subprocess. ``pytest_configure`` in tests/python/conftest.py only picks a
single-process fallback seed when run as a non-xdist controller (no
``workerinput``) and no ``--sample-seed`` is on argv -- the path used when
someone invokes pytest directly without going through run_tests.py.
---
 tests/python/conftest.py | 32 +++++++++++++-------------------
 tests/run_tests.py       | 13 +++++++++++--
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index e6e62f21f7..39ddd73c2d 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -63,25 +63,19 @@ def pytest_configure(config):
         "`fraction` (0..1) or `n` (>= 1). Seed printed in report header; rerun the same sample with "
         "--sample-seed=<S>; rerun every case with --no-sample; rerun a single failing case by pasting its nodeid.",
     )
-    # IMPORTANT: pick the seed on the *controller* once, then propagate it to every xdist worker. ``pytest_configure``
-    # runs on the controller AND on every worker; without explicit propagation each worker would draw a fresh seed,
-    # sample a different subset, and xdist would abort collection with "Different tests were collected between gw0 and
-    # gwN". We use an environment variable (``QD_SAMPLE_SEED``) because xdist's popen gateway inherits ``os.environ``
-    # from the controller -- this works regardless of conftest depth, unlike the ``pytest_configure_node`` /
-    # ``workerinput`` hook which only fires for conftests at the rootdir level.
-    if config.getoption("--no-sample"):
-        pass  # Sampler disabled; no seed needed.
-    elif config.getoption("--sample-seed") is not None:
-        # Explicit ``--sample-seed=N`` is already on argv -> xdist forwards argv to workers, so every process sees it.
-        pass
-    elif "QD_SAMPLE_SEED" in os.environ:
-        # Worker (or re-entrant run): inherit the seed the controller picked.
-        config.option.sample_seed = int(os.environ["QD_SAMPLE_SEED"])
-    else:
-        # Controller (or non-xdist run): pick the seed once and publish it for workers.
-        seed = random.randrange(0, 2**31)
-        config.option.sample_seed = seed
-        os.environ["QD_SAMPLE_SEED"] = str(seed)
+    # Seed propagation contract: the seed must reach the controller AND every xdist worker as the same value, or
+    # xdist's collection-consistency check fails with "Different tests were collected between gw0 and gwN". argv is
+    # forwarded by xdist to every worker, so we require the seed to live on argv as ``--sample-seed=N``. ``tests/
+    # run_tests.py`` picks a seed once per run and injects it; direct ``pytest`` invocations either pass
+    # ``--sample-seed`` explicitly (reproducibility) or fall back to a single-process seed picked below. We do NOT
+    # mutate ``os.environ`` here -- env-var inheritance into xdist worker subprocesses is not guaranteed for runtime
+    # mutations, only for vars present when pytest itself was launched.
+    if (
+        not config.getoption("--no-sample")
+        and config.getoption("--sample-seed") is None
+        and not hasattr(config, "workerinput")  # single-process / non-xdist controller only.
+    ):
+        config.option.sample_seed = random.randrange(0, 2**31)
 
 
 def pytest_report_header(config):
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 060fe87f9b..400cac6c06 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -1,6 +1,7 @@
 import argparse
 import importlib.util
 import os
+import random
 
 
 def _test_python(args, default_dir="python"):
@@ -65,10 +66,18 @@ def _test_python(args, default_dir="python"):
             marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow"
         if marks_expr:
             pytest_args += ["-m", marks_expr]
-        if args.sample_seed is not None:
-            pytest_args += [f"--sample-seed={args.sample_seed}"]
         if args.no_sample:
             pytest_args += ["--no-sample"]
+        else:
+            # Pick the run's @pytest.mark.sample seed here (before pytest is launched) and pass it via --sample-seed on
+            # argv. This is the most reliable way to propagate the seed to xdist workers: xdist forwards argv to every
+            # worker subprocess, so all workers and the controller see the exact same value, sample identical subsets,
+            # and xdist's collection-consistency check passes. (Setting the seed inside ``pytest_configure`` doesn't
+            # work because ``os.environ`` mutation there happens after xdist has already snapshotted the env it ships
+            # to workers, and ``pytest_configure_node`` only fires for conftests at the rootdir level.)
+            if args.sample_seed is None:
+                args.sample_seed = random.randrange(0, 2**31)
+            pytest_args += [f"--sample-seed={args.sample_seed}"]
         if args.failed_first:
             pytest_args += ["--failed-first"]
         if args.fail_fast:

From c4b81bd153c7a7d31a8354a393f4d7c51d94cc26 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:45:21 -0700
Subject: [PATCH 22/29] [DebugTemp] @sample: log seed/argv/workerinput in each
 pytest_collection_modifyitems call

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 tests/python/conftest.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 39ddd73c2d..41137c930f 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -113,6 +113,11 @@ def pytest_collection_modifyitems(config, items):
     if config.getoption("--no-sample"):
         return
     seed = config.getoption("--sample-seed")
+    sys.stderr.write(
+        f"[QD_SAMPLE_DEBUG] pid={os.getpid()} workerinput={hasattr(config, 'workerinput')} "
+        f"seed-opt={seed} argv={sys.argv}\n"
+    )
+    sys.stderr.flush()
     if seed is None:
         # Defensive: pytest_configure didn't run (e.g. someone imported this module manually). Nothing to do.
         return

From fb27ec5e0782aa77e87a703ffa2ec335524047f6 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:49:31 -0700
Subject: [PATCH 23/29] [BugFix] @sample: sample from sorted(group) so xdist
 workers see identical order

Final piece of the seed-propagation puzzle. Even with the same seed reaching
every worker (via run_tests.py injecting --sample-seed on argv, which xdist
forwards), workers were still selecting different parametrize cases.

Root cause: pytest does NOT guarantee that ``items`` (and therefore the
per-group list) lands in the same in-memory order on every xdist worker --
order can drift from collection-phase parallelism, dict iteration, etc. So
``rng.sample(group, k)`` was picking the same indices on every worker, but
those indices resolved to *different* item objects in differently-ordered
lists, leaving each worker with a different subset and triggering xdist's
"Different tests were collected between gw0 and gwN" abort.

Fix: sort the group by ``nodeid`` (a content-derived total order) before
sampling. Also switch the kept-set from ``id(it)`` to ``it.nodeid`` for the
same reason -- ``id()`` is per-process and the comparison loop iterates the
original (possibly differently ordered) group, but ``nodeid`` is identical
on every worker. Drop the temporary debug log added in c4b81bd15 (it
confirmed the seed propagated correctly; the remaining bug was order).
---
 tests/python/conftest.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 41137c930f..7f3ca42d4d 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -113,11 +113,6 @@ def pytest_collection_modifyitems(config, items):
     if config.getoption("--no-sample"):
         return
     seed = config.getoption("--sample-seed")
-    sys.stderr.write(
-        f"[QD_SAMPLE_DEBUG] pid={os.getpid()} workerinput={hasattr(config, 'workerinput')} "
-        f"seed-opt={seed} argv={sys.argv}\n"
-    )
-    sys.stderr.flush()
     if seed is None:
         # Defensive: pytest_configure didn't run (e.g. someone imported this module manually). Nothing to do.
         return
@@ -149,10 +144,18 @@ def pytest_collection_modifyitems(config, items):
         #     Routine refactors don't cause failures to migrate file-wide.
         #   - Locality: when debugging, you can reason about one test's sample without simulating all the others' RNG
         #     advances.
+        # CRITICAL: ``rng.sample(group_sorted, ...)`` rather than ``rng.sample(group, ...)``. xdist workers each run
+        # ``pytest_collection_modifyitems`` independently and pytest does NOT guarantee that ``items`` (and therefore
+        # ``group``) lands in the same in-memory order on every worker. With the same seed but a differently-ordered
+        # list, ``rng.sample`` would pick the same indices but those indices would resolve to different items, so
+        # workers would collect different subsets and xdist's collection-consistency check would abort the run with
+        # "Different tests were collected between gw0 and gwN". Sorting by ``nodeid`` (a content-derived total order)
+        # forces every worker to sample from an identical sequence.
+        group_sorted = sorted(group, key=lambda it: it.nodeid)
         rng = random.Random((seed, key))
-        kept_ids = {id(it) for it in rng.sample(group, k=keep_n)}
+        kept_nodeids = {it.nodeid for it in rng.sample(group_sorted, k=keep_n)}
         for it in group:
-            (keep if id(it) in kept_ids else deselected).append(it)
+            (keep if it.nodeid in kept_nodeids else deselected).append(it)
 
     if deselected:
         # ``pytest_deselected`` is the supported way to report filtered-out items so pytest's summary shows them as

From 908e3a273cc1f50bbf1d412c1c8f04c47af581b4 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:53:21 -0700
Subject: [PATCH 24/29] [DebugTemp2] re-add sample debug log

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 tests/python/conftest.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 7f3ca42d4d..573062cc81 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -113,6 +113,11 @@ def pytest_collection_modifyitems(config, items):
     if config.getoption("--no-sample"):
         return
     seed = config.getoption("--sample-seed")
+    sys.stderr.write(
+        f"[QD_SAMPLE_DEBUG] pid={os.getpid()} workerinput={hasattr(config, 'workerinput')} "
+        f"seed-opt={seed} argv={sys.argv} env-QD={os.environ.get('QD_SAMPLE_SEED')}\n"
+    )
+    sys.stderr.flush()
     if seed is None:
         # Defensive: pytest_configure didn't run (e.g. someone imported this module manually). Nothing to do.
         return

From 06e50cd67b734e24611ad2dcf719a1f1547837df Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 09:57:42 -0700
Subject: [PATCH 25/29] [BugFix] @sample: derive per-test RNG via sha256, not
 tuple-seeding of random.Random

``random.Random((seed, key))`` falls through CPython's deprecated
hash-based seeding path: a tuple seed is reduced via ``hash(a)``, which
raises a DeprecationWarning on Python 3.9 / 3.10 and a TypeError from 3.11
onwards. Worse, ``hash()`` of a tuple containing a ``str`` is salted per
process (``PYTHONHASHSEED``), which matches the diagnosis from the last
failed cluster bench: even with the same int seed and same string key
reaching every worker (verified via debug log), workers still disagreed on
the sampled subset -- the only remaining variable was the seed-mixing path
itself.

Replace it with an explicit ``int.from_bytes(sha256(f"{seed}|{key}").digest())``
mixing step that we control. sha256 is bit-for-bit deterministic across
Python versions and platforms, so every xdist worker computes the same
RNG state and ``rng.sample(group_sorted, k=keep_n)`` picks the same items.

Also drop the temporary debug log from 908e3a273 (it confirmed the seed
itself was propagating identically; the remaining drift was the tuple-seeded
RNG).
---
 tests/python/conftest.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 573062cc81..30f5e0fd89 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -1,4 +1,5 @@
 import gc
+import hashlib
 import os
 import random
 import sys
@@ -113,11 +114,6 @@ def pytest_collection_modifyitems(config, items):
     if config.getoption("--no-sample"):
         return
     seed = config.getoption("--sample-seed")
-    sys.stderr.write(
-        f"[QD_SAMPLE_DEBUG] pid={os.getpid()} workerinput={hasattr(config, 'workerinput')} "
-        f"seed-opt={seed} argv={sys.argv} env-QD={os.environ.get('QD_SAMPLE_SEED')}\n"
-    )
-    sys.stderr.flush()
     if seed is None:
         # Defensive: pytest_configure didn't run (e.g. someone imported this module manually). Nothing to do.
         return
@@ -149,6 +145,10 @@ def pytest_collection_modifyitems(config, items):
         #     Routine refactors don't cause failures to migrate file-wide.
         #   - Locality: when debugging, you can reason about one test's sample without simulating all the others' RNG
         #     advances.
+        # Seed mixing uses sha256 of a canonical ``f"{seed}|{key}"`` rather than ``random.Random((seed, key))``: tuple
+        # seeding falls back to ``hash()``, which is salted per process for ``str`` (``PYTHONHASHSEED``), emits
+        # ``DeprecationWarning: Seeding based on hashing is deprecated`` on Python 3.9 / 3.10 and raises ``TypeError``
+        # from 3.11 on. We pin to an explicit hash so the sample is reproducible across processes and Python versions.
         # CRITICAL: ``rng.sample(group_sorted, ...)`` rather than ``rng.sample(group, ...)``. xdist workers each run
         # ``pytest_collection_modifyitems`` independently and pytest does NOT guarantee that ``items`` (and therefore
         # ``group``) lands in the same in-memory order on every worker. With the same seed but a differently-ordered
@@ -157,7 +157,8 @@ def pytest_collection_modifyitems(config, items):
         # "Different tests were collected between gw0 and gwN". Sorting by ``nodeid`` (a content-derived total order)
         # forces every worker to sample from an identical sequence.
         group_sorted = sorted(group, key=lambda it: it.nodeid)
-        rng = random.Random((seed, key))
+        mixed = int.from_bytes(hashlib.sha256(f"{seed}|{key}".encode()).digest()[:8], "big")
+        rng = random.Random(mixed)
         kept_nodeids = {it.nodeid for it in rng.sample(group_sorted, k=keep_n)}
         for it in group:
             (keep if it.nodeid in kept_nodeids else deselected).append(it)

From 21c56d2f438cc34603662701bbe2340ef80ff060 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 11:03:15 -0700
Subject: [PATCH 26/29] [Style] Reflow CI-flagged 80c-wrapped comments to 120c

The PR's `Check line wrapping` CI agent flagged three comments wrapped at
the AI-default ~78-90c instead of the project's 120c target. Reflow each
to the full target width:

  - tests/python/test_tile16.py:1791  (78c -> 120c) docstring for
    test_tile16_cholesky_blocked_demo.
  - tests/python/test_ad_gdar_diffmpm.py:8  (85c -> 120c) the
    "defaults shrink ..." comment above the parametrize block.
  - tests/run_tests.py:60  (90c -> 120c) the "--run-slow opts back in"
    comment.

Also collapse the dangling-backslash continuation in
misc/demos/cholesky_blocked.py's Usage example onto one line (69c -> 109c).

No behavior change; comments only. Verified via the cursor
find-underwrapped skill that the remaining flagged runs in my diff are
all 103-116c with save~=0 (already-tight runs the greedy heuristic still
reports), comfortably in the agent's "not borderline" exemption.
---
 misc/demos/cholesky_blocked.py       | 3 +--
 tests/python/test_ad_gdar_diffmpm.py | 9 ++++-----
 tests/python/test_tile16.py          | 7 +++----
 tests/run_tests.py                   | 7 +++----
 4 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py
index b4c60c1810..3c72dd39fd 100644
--- a/misc/demos/cholesky_blocked.py
+++ b/misc/demos/cholesky_blocked.py
@@ -21,8 +21,7 @@
     tile16   (Tile16x16, no shared memory)             16        533        5.19x
 
 Usage:
-    python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] \
-        [--num-warmup WARMUP] [--num-iters ITERS]
+    python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] [--num-warmup WARMUP] [--num-iters ITERS]
 """
 
 import argparse
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index 0e0e460534..8fd3c56d56 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -5,11 +5,10 @@
 from tests import test_utils
 
 
-# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay
-# stays cheap; the slow-marked entry keeps the original (N=30, n_grid=120, steps=32)
-# workload that runs on --run-slow. The point of the test is that the AD-validation
-# checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which
-# happens on the first substep regardless of size.
+# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay stays cheap; the slow-marked
+# entry keeps the original (N=30, n_grid=120, steps=32) workload that runs on --run-slow. The point of the test is
+# that the AD-validation checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which happens
+# on the first substep regardless of size.
 @pytest.mark.parametrize(
     "particles_side,n_grid_size,num_steps",
     [
diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py
index d919dda08b..9bed5bc277 100644
--- a/tests/python/test_tile16.py
+++ b/tests/python/test_tile16.py
@@ -1788,10 +1788,9 @@ def write_eye_f32(dst: Ann32):
 def test_tile16_cholesky_blocked_demo():
     """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.
 
-    Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the
-    JIT compile of the 3 unrolled kernels and the benchmark loop both stay
-    cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised
-    by anyone running the script manually, not by CI.
+    Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the JIT compile of the 3 unrolled kernels
+    and the benchmark loop both stay cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised by
+    anyone running the script manually, not by CI.
     """
     demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py"
     cmd = [
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 400cac6c06..9e033a89d4 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -57,10 +57,9 @@ def _test_python(args, default_dir="python"):
             pytest_args += ["--cov-append"]
         if args.keys:
             pytest_args += ["-k", args.keys]
-        # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large
-        # at n>=6, mpm88, etc. — see tests/pytest.ini for the marker). `--run-slow` opts
-        # back in. If the user passes their own `-m` expression we AND `not slow` onto it
-        # so the exclusion still applies, unless they explicitly opt out via `--run-slow`.
+        # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large at n>=6, mpm88, etc. -- see
+        # tests/pytest.ini for the marker). `--run-slow` opts back in. If the user passes their own `-m` expression we
+        # AND `not slow` onto it so the exclusion still applies, unless they explicitly opt out via `--run-slow`.
         marks_expr = args.marks
         if not args.run_slow:
             marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow"

From 9d7518c0ec9170e878497a3140f0edf6f5a668fe Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 11:12:48 -0700
Subject: [PATCH 27/29] [Doc] contributing.md: shorten testing bullet per PR
 review

Hugh requested in PR #709 review comment that the testing bullet collapse
to just a pointer at unit_testing.md, since the long inline summary
duplicates the dedicated doc immediately below.
---
 docs/source/user_guide/contributing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/user_guide/contributing.md b/docs/source/user_guide/contributing.md
index 04e10790f0..78c8648525 100644
--- a/docs/source/user_guide/contributing.md
+++ b/docs/source/user_guide/contributing.md
@@ -2,7 +2,7 @@
 
 ## Good practice reminder
 
-* *testing*: Any new features or modified code should be tested. Run the test suite with `python tests/run_tests.py` — see [unit_testing.md](unit_testing.md) for how to scope by file / keyword / arch, how the `slow` and `sample` markers interact, and how to reproduce a single CI failure.
+* *testing*: Any new features or modified code should be tested. See [unit_testing.md](unit_testing.md).
 * *format/linter*: Before pushing any commits, ensure you set up `pre-commit` and run it using `pre-commit run -a`
 * No need to force push to keep a clean history as the merging is eventually done by squashing commits.
 

From b4e3355e19b20237b02f6390e32d6609d51b600b Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 11:22:27 -0700
Subject: [PATCH 28/29] [Doc] unit_testing.md: apply PR review feedback

Address Hugh's 15 review comments on PR #709, all on unit_testing.md:

- Soften opener: "For one-shot setup..." -> "For setup..."
- Trim slow-marker intro: drop "(or, more commonly, a specific
  pytest.param case inside a parametrize list)" and the trailing
  "-- long enough that the default test suite skips it" clause.
- Trim slow-marker patterns list: drop "rare", "always", and
  "and there's no smaller variant"; drop "(preferred when applicable)"
  and the long descriptive paragraph about size-axis parametrize on
  test_eig.py / test_linalg.py / test_ad_gdar_diffmpm.py.
- @sample bullets: "coverage convergence" -> "asymptotic coverage";
  drop "for that test".
- @sample placement: clarify that decorator order is irrelevant
  (pytest attaches function-level markers regardless of order), so
  "like any other marker" is accurate.
- Drop the Convergence-math paragraph and the "When *not* to use
  @sample" paragraph (both more academic than the rest of the doc).
- New "## Advanced" section at the bottom for optional / infra detail
  that most contributors don't need on first read: per-test timeout,
  kernel compilation cache, per-file timing breakdown, and the
  @sample + xdist seed-propagation mechanism.
- Drop the "## CI checks" section entirely.
---
 docs/source/user_guide/unit_testing.md | 63 ++++++++++++--------------
 1 file changed, 29 insertions(+), 34 deletions(-)

diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md
index 67ef5c26eb..93adff333e 100644
--- a/docs/source/user_guide/unit_testing.md
+++ b/docs/source/user_guide/unit_testing.md
@@ -1,6 +1,6 @@
 # Unit testing
 
-This page documents how to run, write, and tune the Quadrants Python unit test suite. For one-shot setup of the build / dev environment, see [contributing.md](contributing.md).
+This page documents how to run, write, and tune the Quadrants Python unit test suite. For setup of the build / dev environment, see [contributing.md](contributing.md).
 
 ## Running the tests
 
@@ -35,21 +35,7 @@ python tests/run_tests.py -f
 python tests/run_tests.py -x
 ```
 
-The target architecture can also be set via `QD_WANTED_ARCHS` (comma-separated; supports `^arch` to exclude rather than include). Per-test timeouts default to 600 s and are enforced by `pytest_hardtle`, a CFFI-compiled C watchdog that can kill tests hung in native GPU calls even when the GIL is held.
-
-## Kernel compilation cache
-
-During each test session the kernel compilation cache lives in a fresh, empty temp directory created by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) — typically `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`. Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache, and avoids recompiling identical kernels after each `qd.reset()` / `qd.init()` cycle within a session.
-
-## Per-file timing breakdown
-
-Set `QD_FILE_TIMING=1` to print a per-file duration summary at the end of the session:
-
-```
-QD_FILE_TIMING=1 python tests/run_tests.py
-```
-
-This is enabled by default in the Mac CI job; the results appear in the GitHub Actions job summary and are the primary tool for identifying slow test files.
+The target architecture can also be set via `QD_WANTED_ARCHS` (comma-separated; supports `^arch` to exclude rather than include).
 
 ## Markers
 
@@ -57,7 +43,7 @@ Tests can opt into two project-specific markers, in addition to pytest's built-i
 
 ### `@pytest.mark.slow`
 
-Marks a test (or, more commonly, a specific `pytest.param(...)` case inside a parametrize list) as **slow** — long enough that the default test suite skips it. `tests/run_tests.py` adds `-m "not slow"` to the pytest invocation by default; pass `--run-slow` to opt back in:
+Marks a test as **slow**. `tests/run_tests.py` adds `-m "not slow"` to the pytest invocation by default; pass `--run-slow` to opt back in:
 
 ```
 # default: skip slow
@@ -72,7 +58,7 @@ python tests/run_tests.py -m slow --run-slow
 
 The marker is used in two patterns:
 
-1. **Whole-test slow**: rare. The whole test always takes a long time and there's no smaller variant.
+1. **Whole-test slow**: the whole test takes a long time.
 
    ```python
    @pytest.mark.slow
@@ -80,7 +66,7 @@ The marker is used in two patterns:
        ...
    ```
 
-2. **Slow-marked parametrize case** (preferred when applicable): a test parametrizes over a size axis and the large value is slow but the small value is cheap. The small value stays in the default suite as a smoke test; the large value moves to the slow lane. This is the dominant pattern in `tests/python/test_eig.py`, `test_linalg.py`, `test_ad_gdar_diffmpm.py`, etc.
+2. **Slow-marked parametrize case**:
 
    ```python
    @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
@@ -97,9 +83,9 @@ Marks a single heavily-parametrized test as opting in to **per-run stochastic su
 - the test's parametrize space is large (≥ ~16 cases),
 - each parametrize case is roughly independent (covering an independent corner case rather than a single bug class),
 - running every case every CI run is overkill, and
-- coverage convergence over many runs is acceptable for that test.
+- asymptotic coverage over many runs is acceptable.
 
-Apply it like any other marker, above the existing parametrize stack:
+Apply it like any other marker. Position within the decorator stack is irrelevant — pytest attaches function-level markers to the test regardless of order, so `@pytest.mark.sample` can sit anywhere above or below the `@pytest.mark.parametrize` decorators:
 
 ```python
 @pytest.mark.sample(n=6)                     # keep 6 of N cases per run
@@ -113,8 +99,6 @@ def test_thing(size, dtype, layout):
     ...
 ```
 
-**Convergence math.** With `keep_n / total = k / N`, the probability that a *specific* parametrize case has been hit after `r` runs is `1 - (1 - k/N)^r`. For `n=6` out of 32 (`test_tile16_load_store`): ~65% after 5 runs, ~88% after 10, ~98% after 20. Combined with our CI cadence and the fact that any persistent bug surface lights up across multiple PRs, this gives effectively full coverage on a many-PR horizon at a fraction of the per-PR cost.
-
 **How to reproduce.** Three levels of reproducibility:
 
 1. **One failing case** — paste the failing nodeid from the CI log. Pytest already prints the full nodeid on failure:
@@ -155,10 +139,6 @@ def test_thing(size, dtype, layout):
 
 **Composition with `slow`.** Sampling runs **after** marker-based filtering. With `--run-slow` not passed (the default), slow-marked parametrize cases drop out first, then the sampler sub-selects from the remaining (fast) cases. The intersection is the right composition: `--no-sample --run-slow` is the truly-exhaustive combo.
 
-**xdist note.** The seed is picked on the controller in `pytest_configure` (not in the per-worker `pytest_collection_modifyitems`), so all xdist workers see the same seed and produce the same sample. This is intentional — without this, each worker would subsample independently and `--sample-seed=<S>` wouldn't reproduce.
-
-**When *not* to use `@sample`.** If the test's parametrize axes are not roughly independent — e.g. axis A's bug surface only lights up when axis B is at a specific value — sampling can miss the interaction. Use `@slow` on the expensive subset instead, and keep the full Cartesian product for the cheap subset.
-
 ## Writing new tests
 
 The standard recipe combines `@test_utils.test(...)` (arch / option matrix) with `@pytest.mark.parametrize`:
@@ -182,13 +162,28 @@ Common helpers in `tests/test_utils.py`:
 - `test_utils.skip_if_f64_unsupported(dtype)` — skip the current test at runtime if `dtype == qd.f64` and the active backend can't carry f64 through buffer I/O (Metal, MoltenVK on Darwin). Use inside a parametrized test that sweeps both f32 and f64.
 - `test_utils.expected_archs()` — list of archs that the current `QD_WANTED_ARCHS` allows. Used to skip tests with no satisfiable arch.
 
-## CI checks
+## Advanced
 
-A subset of CI jobs care about the test suite specifically:
+Optional knobs and runtime details. The defaults work for most contributors.
+
+### Per-test timeout
+
+Per-test timeouts default to 600 s and are enforced by `pytest_hardtle`, a CFFI-compiled C watchdog that can kill tests hung in native GPU calls even when the GIL is held.
+
+### Kernel compilation cache
+
+During each test session the kernel compilation cache lives in a fresh, empty temp directory created by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) — typically `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`. Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache, and avoids recompiling identical kernels after each `qd.reset()` / `qd.init()` cycle within a session.
+
+### Per-file timing breakdown
+
+Set `QD_FILE_TIMING=1` to print a per-file duration summary at the end of the session:
+
+```
+QD_FILE_TIMING=1 python tests/run_tests.py
+```
+
+This is enabled by default in the Mac CI job; the results appear in the GitHub Actions job summary and are the primary tool for identifying slow test files.
 
-- **linux / macosx / win** — build and run the full python suite on each platform.
-- **test-gpu** — GPU-specific tests on the cluster.
-- **coverage report** — a one-line diff coverage summary is posted as a PR comment, with kernel-level branch coverage. See [Kernel code coverage](kernel_coverage.md).
-- **Test coverage check (`check_test_coverage.yml`)** — an AI agent that flags new or modified non-test source code that doesn't have corresponding test coverage in the PR.
+### `@sample` + xdist seed propagation
 
-See [contributing.md](contributing.md) for the full list of CI checks (linters, pyright, link checking, PR change report, etc.).
+`tests/run_tests.py` picks the per-run sample seed before pytest is launched and passes it via `--sample-seed=<S>` on argv. xdist forwards argv to every worker, so all workers see the same seed and produce identical samples; without this, each worker would subsample independently and `--sample-seed=<S>` wouldn't reproduce the same selection. The per-test RNG inside `pytest_collection_modifyitems` is then derived deterministically via `sha256(f"{seed}|{nodeid_prefix}")`, which is what makes the **Per-test RNG independence** property above hold.

From 5f4e664fd2735ae20e5b36b88163bd3b11bc60b8 Mon Sep 17 00:00:00 2001
From: "Hugh Perkins (deskai7)" <hugh@deskai7.local>
Date: Tue, 19 May 2026 11:31:05 -0700
Subject: [PATCH 29/29] [Doc] address second round of PR review on unit_testing
 + contributing

Apply Hugh's 4 follow-up comments on PR #709:

- contributing.md L11: trim "See unit_testing.md for the full reference:
  launcher flags, env vars, ..." down to just "See unit_testing.md."
- unit_testing.md L77: "With this pattern" -> "In this specific example".
- unit_testing.md L88: drop the "Position within the decorator stack is
  irrelevant..." sentence I added in b4e3355e1. The shorter
  "Apply it like any other marker:" reads better.
- unit_testing.md L102: "How to reproduce." ->
  "How to reproduce failing tests."
---
 docs/source/user_guide/contributing.md | 2 +-
 docs/source/user_guide/unit_testing.md | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/user_guide/contributing.md b/docs/source/user_guide/contributing.md
index 78c8648525..8325303fba 100644
--- a/docs/source/user_guide/contributing.md
+++ b/docs/source/user_guide/contributing.md
@@ -8,7 +8,7 @@
 
 ## Running tests
 
-See [unit_testing.md](unit_testing.md) for the full reference: launcher flags, env vars, the `slow` / `sample` markers, how to reproduce a single failing parametrize case, and how to do exhaustive runs for release gates.
+See [unit_testing.md](unit_testing.md).
 
 ## Creating your build/dev environment
 
diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md
index 93adff333e..7ce8147e40 100644
--- a/docs/source/user_guide/unit_testing.md
+++ b/docs/source/user_guide/unit_testing.md
@@ -74,7 +74,7 @@ The marker is used in two patterns:
        ...
    ```
 
-   With this pattern the default suite still exercises the code path; the slow lane just adds the larger-size variant for full coverage.
+   In this specific example, the default suite still exercises the code path; the slow lane just adds the larger-size variant for full coverage.
 
 ### `@pytest.mark.sample(...)`
 
@@ -85,7 +85,7 @@ Marks a single heavily-parametrized test as opting in to **per-run stochastic su
 - running every case every CI run is overkill, and
 - asymptotic coverage over many runs is acceptable.
 
-Apply it like any other marker. Position within the decorator stack is irrelevant — pytest attaches function-level markers to the test regardless of order, so `@pytest.mark.sample` can sit anywhere above or below the `@pytest.mark.parametrize` decorators:
+Apply it like any other marker:
 
 ```python
 @pytest.mark.sample(n=6)                     # keep 6 of N cases per run
@@ -99,7 +99,7 @@ def test_thing(size, dtype, layout):
     ...
 ```
 
-**How to reproduce.** Three levels of reproducibility:
+**How to reproduce failing tests.** Three levels of reproducibility:
 
 1. **One failing case** — paste the failing nodeid from the CI log. Pytest already prints the full nodeid on failure: