Genesis-Embodied-AI · hughperkins · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/docs/source/user_guide/contributing.md b/docs/source/user_guide/contributing.md
@@ -2,7 +2,7 @@
 
 ## Good practice reminder
 
-* *testing*: Any new features or modified code should be tested. You have to run the test suite using `python tests/run_tests.py` which sets up the right test environment for `pytest`. CLI arguments are forwarded to `pytest`. Do not use `pytest` directly as it behaves differently. To see a per-file timing breakdown (useful for identifying slow test files), set `QD_FILE_TIMING=1` — e.g. `QD_FILE_TIMING=1 python tests/run_tests.py`. This is enabled by default in the Mac CI job and the results appear in the GitHub Actions job summary.
+* *testing*: Any new features or modified code should be tested. See [unit_testing.md](unit_testing.md) for how to run and write tests.
 * *format/linter*: Before pushing any commits, ensure you set up `pre-commit` and run it using `pre-commit run -a`
 * No need to force push to keep a clean history as the merging is eventually done by squashing commits.
 

diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md
@@ -82,6 +82,7 @@ init_options
 :maxdepth: 1
 :titlesonly:
 
+unit_testing
 kernel_coverage
 ```
 

diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md
@@ -0,0 +1,120 @@
+# Unit testing
+
+This page documents how to run, write, and tune the Quadrants Python unit test suite. For setup of the build / dev environment, see [contributing.md](contributing.md).
+
+## Running the tests
+
+The test suite is run via the project's launcher, **not** by invoking `pytest` directly:
+
+```
+python tests/run_tests.py
+```
+
+The launcher sets up the test-only env vars (kernel offline cache, watchdog, xdist worker count, etc.) and forwards any unrecognised flags to pytest. Calling `pytest` directly skips that setup and behaves differently.
+
+Common one-liners:
+
+```
+# run one file
+python tests/run_tests.py test_tile16
+
+# run one test (any pytest -k expression)
+python tests/run_tests.py -k test_tile16_cholesky
+
+# run on a specific backend (or comma-separated list)
+python tests/run_tests.py --arch cuda
+python tests/run_tests.py --arch metal -k tile16
+
+# same, via env var (handy for CI)
+QD_WANTED_ARCHS=metal,vulkan python tests/run_tests.py
+
+# rerun the last failing tests first
+python tests/run_tests.py -f
+
+# stop at the first failure
+python tests/run_tests.py -x
+```
+
+The target architecture can also be set via `QD_WANTED_ARCHS` (comma-separated; supports `^arch` to exclude rather than include).
+
+## Markers
+
+### `@pytest.mark.slow`
+
+Marks a test as **slow**. `tests/run_tests.py` adds `-m "not slow"` to the pytest invocation by default; pass `--run-slow` to opt back in:
+
+```
+# default: skip slow
+python tests/run_tests.py
+
+# include slow
+python tests/run_tests.py --run-slow
+
+# slow ONLY (e.g. nightly job)
+python tests/run_tests.py -m slow --run-slow
+```
+
+The marker is used in two patterns:
+
+1. **Whole-test slow**: the whole test takes a long time.
+
+   ```python
+   @pytest.mark.slow
+   def test_thing_that_is_always_slow():
+       ...
+   ```
+
+2. **Slow-marked parametrize case**: only the expensive parameter value is marked slow, so the cheaper cases still run by default.
+
+   ```python
+   @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
+   def test_sym_eig_general(n):
+       ...
+   ```
+
+   In this specific example the default suite still exercises the code path; the slow lane just adds the larger-size variant for full coverage.
+
+## Writing new tests
+
+The standard recipe combines `@test_utils.test(...)` (arch / option matrix) with `@pytest.mark.parametrize`:
+
+```python
+import pytest
+import quadrants as qd
+from tests import test_utils
+
+
+@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
+@test_utils.test(arch=qd.gpu, default_fp=qd.f32)
+def test_my_thing(n):
+    ...
+```
+
+`@test_utils.test` is what wires the test into the per-backend matrix and applies platform exclusions (`exclude=`), extension requirements (`require=`, e.g. `qd.extension.data64` for f64 tests), and per-test options (`default_fp`, `fast_math`, etc.). See `tests/test_utils.py` for the full surface.
+
+Common helpers in `tests/test_utils.py`:
+
+- `test_utils.skip_if_f64_unsupported(dtype)` — skip the current test at runtime if `dtype == qd.f64` and the active backend can't carry f64 through buffer I/O (Metal, MoltenVK on Darwin). Use inside a parametrized test that sweeps both f32 and f64.
+- `test_utils.expected_archs()` — list of archs that the current `QD_WANTED_ARCHS` allows. Used to skip tests with no satisfiable arch.
+
+## Advanced
+
+Optional knobs and runtime details. The defaults work for most contributors.
+
+### Per-test timeout
+
+Per-test timeouts default to 600 s and are enforced by `pytest_hardtle`, a CFFI-compiled C watchdog that can kill tests hung in native GPU calls even when the GIL is held.
+
+### Kernel compilation cache
+
+During each test session the kernel compilation cache lives in a fresh, empty temp directory created by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) — typically `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`. Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache, and avoids recompiling identical kernels after each `qd.reset()` / `qd.init()` cycle within a session.
+
+### Per-file timing breakdown
+
+Set `QD_FILE_TIMING=1` to print a per-file duration summary at the end of the session:
+
+```
+QD_FILE_TIMING=1 python tests/run_tests.py
+```
+
+This is enabled by default in the Mac CI job; the results appear in the GitHub Actions job summary and are the primary tool for identifying slow test files.
diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
-"""Benchmark 92x92 blocked Cholesky factorization using Tile16x16.
+"""Benchmark NxN blocked Cholesky factorization using Tile16x16.
 
 Three kernels compared:
 
 1. Baseline: scalar Cholesky-Crout, 64 threads, shared memory, 2*N+1 sequential syncs. Thread 0 computes each
    diagonal, remaining threads parallelize off-diagonal updates.
 
-2. Blocked: 6x6 grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal blocks. Same blocking
-   structure as Tile16x16 but all data lives in shared memory with block.sync() between every step.
+2. Blocked: ceil(N/16) x ceil(N/16) grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal
+   blocks. Same blocking structure as Tile16x16 but all data lives in shared memory with block.sync() between
+   every step.
 
 3. Tile16x16: same blocked structure but fully register-resident via Tile16x16. No shared memory, zero syncs.
    Prior tiles read from global memory (L2).
@@ -20,22 +21,37 @@
     tile16   (Tile16x16, no shared memory)             16        533        5.19x
 
 Usage:
-    python misc/demos/cholesky_blocked.py
+    python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] [--num-warmup WARMUP] [--num-iters ITERS]
 """
 
+import argparse
 import time
 
 import numpy as np
 
 import quadrants as qd
 
-N = 92
+
+def _parse_args():  # Parse this benchmark's command-line options and return the argparse namespace.
+    p = argparse.ArgumentParser(
+        description="Blocked Cholesky NxN benchmark (3 kernels: baseline / blocked / tile16).",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,  # show each option's default in --help
+    )
+    p.add_argument("--n", type=int, default=92, help="Matrix dimension N (NxN SPD).")
+    p.add_argument("--n-envs", type=int, default=4096, help="Number of independent environments.")
+    p.add_argument("--num-warmup", type=int, default=50, help="Warmup iterations per kernel.")
+    p.add_argument("--num-iters", type=int, default=200, help="Timed iterations per kernel.")
+    return p.parse_args()  # no explicit argv, so this reads the process command line
+
+
+_args = _parse_args()  # parsed at import time; the module-level constants below are derived from it
+N = _args.n
 TILE = 16
-N_BLOCKS = (N + TILE - 1) // TILE  # 6
-N_PADDED = N_BLOCKS * TILE  # 96, rounded up for blocked kernel SharedArrays
-N_ENVS = 4096
-WARMUP = 50
-ITERS = 200
+N_BLOCKS = (N + TILE - 1) // TILE  # ceil(N / TILE): number of TILE-wide blocks per matrix side
+N_PADDED = N_BLOCKS * TILE  # rounded up for blocked kernel SharedArrays
+N_ENVS = _args.n_envs
+WARMUP = _args.num_warmup
+ITERS = _args.num_iters
 
 qd.init(arch=qd.gpu)
 

diff --git a/tests/pytest.ini b/tests/pytest.ini
@@ -3,3 +3,5 @@ markers =
     run_in_serial: mark test to run serially(usually for resource intensive tests).
     sm70: Can only run on GPU with compute capability 7.0 or higher.
     needs_torch: mark test as requiring PyTorch.
+    slow: mark test (or parametrize case) as slow. Skipped by default by tests/run_tests.py;
+        pass --run-slow to include them, or `--run-slow -m slow` to run only the slow ones.
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
@@ -5,14 +5,25 @@
 from tests import test_utils
 
 
+# The default case shrinks the particle / grid / step counts so the JIT compile + AD-tape replay stays cheap; the
+# slow-marked case keeps the original (N=30, n_grid=120, steps=32) workload and only runs with --run-slow. The point
+# of the test is that the AD-validation checker fires on the global-data-access violation in g2p
+# (`v[f, p] = new_v`), which happens on the first substep regardless of size.
+@pytest.mark.parametrize(
+    "particles_side,n_grid_size,num_steps",
+    [
+        (8, 32, 4),
+        pytest.param(30, 120, 32, marks=pytest.mark.slow),
+    ],
+)
 @test_utils.test(require=qd.extension.assertion, debug=True)
-def test_gdar_mpm():
+def test_gdar_mpm(particles_side, n_grid_size, num_steps):
     real = qd.f32
 
     dim = 2
-    N = 30  # reduce to 30 if run out of GPU memory
+    N = particles_side
     n_particles = N * N
-    n_grid = 120
+    n_grid = n_grid_size
     dx = 1 / n_grid
     inv_dx = 1 / dx
     dt = 3e-4
@@ -21,8 +32,8 @@ def test_gdar_mpm():
     E = 100
     mu = E
     la = E
-    max_steps = 32
-    steps = 32
+    max_steps = num_steps
+    steps = num_steps
     gravity = 9.8
     target = [0.3, 0.6]