From be5610942b44c61328cddc9126b854cddafd90ad Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 02:47:21 -0700 Subject: [PATCH 01/15] Skip the slowest tests by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `slow` pytest marker, mark the worst-case tests with it, and have `tests/run_tests.py` skip those tests by default (use `--run-slow` to include them, or `pytest -m slow` to run only those). Picked from macOS CI per-file timing (QD_FILE_TIMING=1, run 26083950810): phase 1 totals 6415s across 8641 test calls; the slowest 3 files alone (test_eig, test_tile16, test_linalg) cover 55%. The cost of test_eig / test_make_spd is super-linear in matrix size n (n=12 ≈ 5x n=9). Marked slow: - Parametrize cases n in {6, 9, 12} (and 7..11 for inverse_large) across test_eig.py and test_linalg.py. - Rectangular (9, 12) / (12, 3) cases in test_frobenius_inner_rectangular. - test_matmul_chain_qipc_sizes_{f32,f64} (>130s each on macOS CI). - test_clear_all_gradients (180s/invocation). - test_reset_ndarrays::test_ndarray_doesnt_crash_on_gc (127s). - test_mpm88::{test_mpm88, test_mpm88_numpy_and_ndarray} (~30s/invocation). - test_struct::test_2d_nested (122s/invocation). run_tests.py composes `not slow` with any user-supplied `-m` expression, so existing CI invocations like `-m "not needs_torch"` become `(not needs_torch) and not slow`. Note that this also drops slow tests from GPU / Linux / macOS CI runs — a separate workflow (or `--run-slow` job) is needed if we still want to exercise the n>=6 / n=12 paths in CI. 
--- tests/pytest.ini | 2 + tests/python/test_clear_all_gradients.py | 3 + tests/python/test_eig.py | 96 +++++++++++++++++++++--- tests/python/test_linalg.py | 70 +++++++++++++++-- tests/python/test_mpm88.py | 2 + tests/python/test_reset_ndarrays.py | 1 + tests/python/test_struct.py | 1 + tests/run_tests.py | 23 +++++- 8 files changed, 180 insertions(+), 18 deletions(-) diff --git a/tests/pytest.ini b/tests/pytest.ini index 5ee5ec16b2..efaf40e6c6 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -3,3 +3,5 @@ markers = run_in_serial: mark test to run serially(usually for resource intensive tests). sm70: Can only run on GPU with compute capability 7.0 or higher. needs_torch: mark test as requiring PyTorch. + slow: mark test (or parametrize case) as slow. Skipped by default by tests/run_tests.py; + pass --run-slow to include them, or directly `pytest -m slow` to run only the slow ones. diff --git a/tests/python/test_clear_all_gradients.py b/tests/python/test_clear_all_gradients.py index 615ade9b0b..22c649a979 100644 --- a/tests/python/test_clear_all_gradients.py +++ b/tests/python/test_clear_all_gradients.py @@ -1,9 +1,12 @@ +import pytest + import quadrants as qd from quadrants.lang import impl from tests import test_utils +@pytest.mark.slow @test_utils.test(exclude=[qd.vulkan]) def test_clear_all_gradients(): x = qd.field(qd.f32) diff --git a/tests/python/test_eig.py b/tests/python/test_eig.py index 53647a6eef..ad8d8fe3bb 100644 --- a/tests/python/test_eig.py +++ b/tests/python/test_eig.py @@ -295,7 +295,16 @@ def run(): np.testing.assert_allclose(A_reconstructed, A_np, rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [4, 5, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 4, + 5, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @pytest.mark.parametrize( "factory", [ @@ -311,7 +320,16 @@ def test_sym_eig_general_f32(n, factory): _test_sym_eig_general(n, qd.f32, factory) 
-@pytest.mark.parametrize("n", [4, 5, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 4, + 5, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @pytest.mark.parametrize( "factory", [ @@ -358,7 +376,15 @@ def run(): np.testing.assert_allclose(A_spd_qd, expected, rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 4, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd], @@ -368,7 +394,15 @@ def test_make_spd_f32(n, factory): _test_make_spd(n, qd.f32, factory) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 4, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd], @@ -404,7 +438,15 @@ def run(): np.testing.assert_allclose(Q.T @ Q, np.eye(n), rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 4, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @pytest.mark.parametrize("alpha", [0.0, 1.0, -2.5]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_sym_eig_alpha_identity_f64(n, alpha): @@ -445,7 +487,15 @@ def project(src: qd.types.NDArray[mat_t, 1], dst: qd.types.NDArray[mat_t, 1]): ) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 4, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) 
@pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_negative_definite, _sym_eig_factory_spd], @@ -455,7 +505,15 @@ def test_make_spd_idempotent_f64(n, factory): _test_make_spd_idempotent(n, qd.f64, factory) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 4, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_make_spd_negative_definite_zero_f64(n): """A symmetric matrix with all-negative eigenvalues projects to the zero matrix (``Q · diag(max(λ, 0)) · Qᵀ`` @@ -535,13 +593,33 @@ def run(): ), f"column {i} is not the eigenvector of eigvals[{i}]={eigvals_qd[i]}: residual={residual}" -@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 2, + 3, + 4, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_sym_eig_sort_order_f32(n): _test_sym_eig_sort_order(n, qd.f32) -@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 2, + 3, + 4, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_sym_eig_sort_order_f64(n): _test_sym_eig_sort_order(n, qd.f64) diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py index dfa31495bc..93ff2c2ce2 100644 --- a/tests/python/test_linalg.py +++ b/tests/python/test_linalg.py @@ -154,13 +154,31 @@ def run(): assert out_self[None] == test_utils.approx(A.to_numpy().__pow__(2).sum(), rel=tol, abs=tol) -@pytest.mark.parametrize("n", [2, 3, 6, 9, 12]) 
+@pytest.mark.parametrize( + "n", + [ + 2, + 3, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_frobenius_inner_f32(n): _test_frobenius_inner(n, qd.f32) -@pytest.mark.parametrize("n", [2, 3, 6, 9, 12]) +@pytest.mark.parametrize( + "n", + [ + 2, + 3, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_frobenius_inner_f64(n): _test_frobenius_inner(n, qd.f64) @@ -189,13 +207,27 @@ def run(): assert out[None] == test_utils.approx(expected, rel=tol, abs=tol) -@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)]) +@pytest.mark.parametrize( + "rows,cols", + [ + pytest.param(9, 12, marks=pytest.mark.slow), + pytest.param(12, 3, marks=pytest.mark.slow), + (2, 4), + ], +) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_frobenius_inner_rectangular_f32(rows, cols): _test_frobenius_inner_rectangular(rows, cols, qd.f32) -@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)]) +@pytest.mark.parametrize( + "rows,cols", + [ + pytest.param(9, 12, marks=pytest.mark.slow), + pytest.param(12, 3, marks=pytest.mark.slow), + (2, 4), + ], +) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_frobenius_inner_rectangular_f64(rows, cols): _test_frobenius_inner_rectangular(rows, cols, qd.f64) @@ -241,11 +273,13 @@ def run(): np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol) +@pytest.mark.slow @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_matmul_chain_qipc_sizes_f32(): _test_matmul_chain(qd.f32) +@pytest.mark.slow @test_utils.test(require=qd.extension.data64, arch=qd.gpu, 
default_fp=qd.f64, fast_math=False) def test_matmul_chain_qipc_sizes_f64(): _test_matmul_chain(qd.f64) @@ -434,7 +468,19 @@ def run(): np.testing.assert_allclose(M @ inv_np, np.eye(n_), rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12]) +@pytest.mark.parametrize( + "n", + [ + 5, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(7, marks=pytest.mark.slow), + pytest.param(8, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(10, marks=pytest.mark.slow), + pytest.param(11, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @pytest.mark.parametrize( "factory", [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required], @@ -444,7 +490,19 @@ def test_inverse_large_f32(n, factory): _test_inverse_at_size(n, qd.f32, factory) -@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12]) +@pytest.mark.parametrize( + "n", + [ + 5, + pytest.param(6, marks=pytest.mark.slow), + pytest.param(7, marks=pytest.mark.slow), + pytest.param(8, marks=pytest.mark.slow), + pytest.param(9, marks=pytest.mark.slow), + pytest.param(10, marks=pytest.mark.slow), + pytest.param(11, marks=pytest.mark.slow), + pytest.param(12, marks=pytest.mark.slow), + ], +) @pytest.mark.parametrize( "factory", [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required], diff --git a/tests/python/test_mpm88.py b/tests/python/test_mpm88.py index 725ff17ac9..d758b65f9d 100644 --- a/tests/python/test_mpm88.py +++ b/tests/python/test_mpm88.py @@ -7,6 +7,7 @@ from tests import test_utils +@pytest.mark.slow @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test") @pytest.mark.run_in_serial @test_utils.test() @@ -108,6 +109,7 @@ def _is_appveyor(): return os.getenv("APPVEYOR", "").lower() == "true" +@pytest.mark.slow @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test") @pytest.mark.run_in_serial @test_utils.test() diff --git 
a/tests/python/test_reset_ndarrays.py b/tests/python/test_reset_ndarrays.py index bc048ac92d..a42fd921f1 100644 --- a/tests/python/test_reset_ndarrays.py +++ b/tests/python/test_reset_ndarrays.py @@ -8,6 +8,7 @@ from tests import test_utils +@pytest.mark.slow @test_utils.test(arch=[qd.cpu]) def test_ndarray_doesnt_crash_on_gc() -> None: if sys.platform != "darwin": diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py index d3d6a4fbaa..de6d249970 100644 --- a/tests/python/test_struct.py +++ b/tests/python/test_struct.py @@ -62,6 +62,7 @@ def test_linear_nested_aos(): assert y[i] == i + 123 +@pytest.mark.slow @test_utils.test(exclude=[qd.vulkan]) def test_2d_nested(): x = qd.field(qd.i32) diff --git a/tests/run_tests.py b/tests/run_tests.py index e2419add42..47d5574ad0 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -56,8 +56,15 @@ def _test_python(args, default_dir="python"): pytest_args += ["--cov-append"] if args.keys: pytest_args += ["-k", args.keys] - if args.marks: - pytest_args += ["-m", args.marks] + # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large + # at n>=6, mpm88, etc. — see tests/pytest.ini for the marker). `--run-slow` opts + # back in. If the user passes their own `-m` expression we AND `not slow` onto it + # so the exclusion still applies, unless they explicitly opt out via `--run-slow`. + marks_expr = args.marks + if not args.run_slow: + marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow" + if marks_expr: + pytest_args += ["-m", marks_expr] if args.failed_first: pytest_args += ["--failed-first"] if args.fail_fast: @@ -161,7 +168,17 @@ def test(): default=None, dest="marks", type=str, - help="Only run tests with specific marks", + help="Only run tests with specific marks. 
`not slow` is appended automatically " + "unless --run-slow is passed.", + ) + parser.add_argument( + "--run-slow", + required=False, + default=False, + dest="run_slow", + action="store_true", + help="Include tests marked `slow` (excluded by default). Also needed with -m: without it, " + "`-m slow` becomes `(slow) and not slow` and selects no tests.", ) parser.add_argument( "-f", From f58248a9e2c2ed69266f69168f9f52c199d96325 Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 03:13:45 -0700 Subject: [PATCH 02/15] Trim n parametrize lists to {smallest, 12} The previous lists ([4, 5, 6, 9, 12], [2, 3, 4, 6, 9, 12], [5..12], etc.) gave the Householder/QR path a lot of redundant size coverage. For routine CI we only need to exercise a small size + the largest supported size (12, which also doubles as the slow-marked stress case): if a bug shows up only at n=7 or n=11 it almost certainly also shows up at n=12. test_eig.py sym_eig_general_{f32,f64} [4,5,6,9,12] -> [4, 12*] make_spd_{f32,f64} [4,6,9,12] -> [4, 12*] sym_eig_alpha_identity_f64 [4,6,9,12] -> [4, 12*] make_spd_idempotent_f64 [4,6,9,12] -> [4, 12*] make_spd_negative_definite_zero_f64 [4,6,9,12] -> [4, 12*] sym_eig_sort_order_{f32,f64} [2,3,4,6,9,12] -> [3, 12*] test_linalg.py frobenius_inner_{f32,f64} [2,3,6,9,12] -> [3, 12*] inverse_large_{f32,f64} [5..12] -> [5, 12*] * n=12 retains the `slow` marker, so default `run_tests.py` invocations only hit n=4 / n=3 / n=5. `--run-slow` runs both. Closed-form 2x2/3x3 paths in test_sym_eig_sort_order: dropped n=2 in favour of n=3 (per directive); the 2x2 path is still covered by test_sym_eig2x2_{f32,f64}. The 3x3 closed-form path stays covered by n=3. Other parametrize lists left untouched: - rectangular (rows, cols) tuples in test_frobenius_inner_rectangular (it's varying shape, not pure size). - test_mat_inverse_size's `range(1, 5)` (tiny sizes only). - `a00` integer parametrize in test_sym_eig3x3_{f32,f64}. 
--- tests/python/test_eig.py | 96 ++++--------------------------------- tests/python/test_linalg.py | 50 ++----------------- 2 files changed, 13 insertions(+), 133 deletions(-) diff --git a/tests/python/test_eig.py b/tests/python/test_eig.py index ad8d8fe3bb..a8b5153dd6 100644 --- a/tests/python/test_eig.py +++ b/tests/python/test_eig.py @@ -295,16 +295,7 @@ def run(): np.testing.assert_allclose(A_reconstructed, A_np, rtol=tol, atol=tol) -@pytest.mark.parametrize( - "n", - [ - 4, - 5, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [ @@ -320,16 +311,7 @@ def test_sym_eig_general_f32(n, factory): _test_sym_eig_general(n, qd.f32, factory) -@pytest.mark.parametrize( - "n", - [ - 4, - 5, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [ @@ -376,15 +358,7 @@ def run(): np.testing.assert_allclose(A_spd_qd, expected, rtol=tol, atol=tol) -@pytest.mark.parametrize( - "n", - [ - 4, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd], @@ -394,15 +368,7 @@ def test_make_spd_f32(n, factory): _test_make_spd(n, qd.f32, factory) -@pytest.mark.parametrize( - "n", - [ - 4, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) 
@pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd], @@ -438,15 +404,7 @@ def run(): np.testing.assert_allclose(Q.T @ Q, np.eye(n), rtol=tol, atol=tol) -@pytest.mark.parametrize( - "n", - [ - 4, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize("alpha", [0.0, 1.0, -2.5]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_sym_eig_alpha_identity_f64(n, alpha): @@ -487,15 +445,7 @@ def project(src: qd.types.NDArray[mat_t, 1], dst: qd.types.NDArray[mat_t, 1]): ) -@pytest.mark.parametrize( - "n", - [ - 4, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_negative_definite, _sym_eig_factory_spd], @@ -505,15 +455,7 @@ def test_make_spd_idempotent_f64(n, factory): _test_make_spd_idempotent(n, qd.f64, factory) -@pytest.mark.parametrize( - "n", - [ - 4, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_make_spd_negative_definite_zero_f64(n): """A symmetric matrix with all-negative eigenvalues projects to the zero matrix (``Q · diag(max(λ, 0)) · Qᵀ`` @@ -593,33 +535,13 @@ def run(): ), f"column {i} is not the eigenvector of eigvals[{i}]={eigvals_qd[i]}: residual={residual}" -@pytest.mark.parametrize( - "n", - [ - 2, - 3, - 4, - pytest.param(6, 
marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_sym_eig_sort_order_f32(n): _test_sym_eig_sort_order(n, qd.f32) -@pytest.mark.parametrize( - "n", - [ - 2, - 3, - 4, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_sym_eig_sort_order_f64(n): _test_sym_eig_sort_order(n, qd.f64) diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py index 93ff2c2ce2..a6632d4678 100644 --- a/tests/python/test_linalg.py +++ b/tests/python/test_linalg.py @@ -154,31 +154,13 @@ def run(): assert out_self[None] == test_utils.approx(A.to_numpy().__pow__(2).sum(), rel=tol, abs=tol) -@pytest.mark.parametrize( - "n", - [ - 2, - 3, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_frobenius_inner_f32(n): _test_frobenius_inner(n, qd.f32) -@pytest.mark.parametrize( - "n", - [ - 2, - 3, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_frobenius_inner_f64(n): _test_frobenius_inner(n, qd.f64) @@ -468,19 +450,7 @@ def run(): np.testing.assert_allclose(M @ inv_np, np.eye(n_), rtol=tol, atol=tol) 
-@pytest.mark.parametrize( - "n", - [ - 5, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(7, marks=pytest.mark.slow), - pytest.param(8, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(10, marks=pytest.mark.slow), - pytest.param(11, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required], @@ -490,19 +460,7 @@ def test_inverse_large_f32(n, factory): _test_inverse_at_size(n, qd.f32, factory) -@pytest.mark.parametrize( - "n", - [ - 5, - pytest.param(6, marks=pytest.mark.slow), - pytest.param(7, marks=pytest.mark.slow), - pytest.param(8, marks=pytest.mark.slow), - pytest.param(9, marks=pytest.mark.slow), - pytest.param(10, marks=pytest.mark.slow), - pytest.param(11, marks=pytest.mark.slow), - pytest.param(12, marks=pytest.mark.slow), - ], -) +@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required], From a2f4f91b4f18c467d209729255f2e7d257487d08 Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 06:48:09 -0700 Subject: [PATCH 03/15] [Demo] cholesky_blocked: take N / N_ENVS / WARMUP / ITERS via argparse The blocked-Cholesky demo previously hard-coded N=92, N_ENVS=4096, WARMUP=50, ITERS=200 as module globals. The unit-test wrapper test_tile16_cholesky_blocked_demo runs the demo as a subprocess and only cares that it returns 0; at the hard-coded sizes that takes ~74 s on cluster CUDA, dominated by JIT-compiling 3 large unrolled kernels at N=92 and running the 4096-env x 250-iter benchmark loop. 
Expose all four knobs as command-line flags with the previous values as defaults, so: python misc/demos/cholesky_blocked.py # unchanged, full demo python misc/demos/cholesky_blocked.py --n 32 --n-envs 64 \ --num-warmup 1 --num-iters 1 # smoke-mode The test will switch to the smoke-mode invocation in a follow-up commit so it stops dominating the slow critical path. Flag names (--n, --n-envs, --num-warmup, --num-iters) follow the user spec; using argparse + ArgumentDefaultsHelpFormatter so --help shows the full demo defaults. --- misc/demos/cholesky_blocked.py | 37 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py index 8dbcb3fbb9..b4c60c1810 100644 --- a/misc/demos/cholesky_blocked.py +++ b/misc/demos/cholesky_blocked.py @@ -1,13 +1,14 @@ #!/usr/bin/env python3 -"""Benchmark 92x92 blocked Cholesky factorization using Tile16x16. +"""Benchmark NxN blocked Cholesky factorization using Tile16x16. Three kernels compared: 1. Baseline: scalar Cholesky-Crout, 64 threads, shared memory, 2*N+1 sequential syncs. Thread 0 computes each diagonal, remaining threads parallelize off-diagonal updates. -2. Blocked: 6x6 grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal blocks. Same blocking - structure as Tile16x16 but all data lives in shared memory with block.sync() between every step. +2. Blocked: ceil(N/16) x ceil(N/16) grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal + blocks. Same blocking structure as Tile16x16 but all data lives in shared memory with block.sync() between + every step. 3. Tile16x16: same blocked structure but fully register-resident via Tile16x16. No shared memory, zero syncs. Prior tiles read from global memory (L2). 
@@ -20,22 +21,38 @@ tile16 (Tile16x16, no shared memory) 16 533 5.19x Usage: - python misc/demos/cholesky_blocked.py + python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] \ + [--num-warmup WARMUP] [--num-iters ITERS] """ +import argparse import time import numpy as np import quadrants as qd -N = 92 + +def _parse_args(): + p = argparse.ArgumentParser( + description="Blocked Cholesky NxN benchmark (3 kernels: baseline / blocked / tile16).", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + p.add_argument("--n", type=int, default=92, help="Matrix dimension N (NxN SPD).") + p.add_argument("--n-envs", type=int, default=4096, help="Number of independent environments.") + p.add_argument("--num-warmup", type=int, default=50, help="Warmup iterations per kernel.") + p.add_argument("--num-iters", type=int, default=200, help="Timed iterations per kernel.") + return p.parse_args() + + +_args = _parse_args() +N = _args.n TILE = 16 -N_BLOCKS = (N + TILE - 1) // TILE # 6 -N_PADDED = N_BLOCKS * TILE # 96, rounded up for blocked kernel SharedArrays -N_ENVS = 4096 -WARMUP = 50 -ITERS = 200 +N_BLOCKS = (N + TILE - 1) // TILE +N_PADDED = N_BLOCKS * TILE # rounded up for blocked kernel SharedArrays +N_ENVS = _args.n_envs +WARMUP = _args.num_warmup +ITERS = _args.num_iters qd.init(arch=qd.gpu) From eae1a36377e708a28c3fe840253ab7b71f39e3a7 Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 06:49:15 -0700 Subject: [PATCH 04/15] [Test] test_tile16_cholesky_blocked_demo: invoke demo in smoke-mode Pass small CLI overrides (--n 32 --n-envs 64 --num-warmup 1 --num-iters 1) so the demo runs end-to-end in seconds instead of ~74 s. The test contract is just "demo exits 0"; it doesn't read any of the benchmark numbers, so the smaller workload still satisfies the smoke test. 
The full N=92 / N_ENVS=4096 / 50+200-iter demo is still what humans running misc/demos/cholesky_blocked.py see by default (argparse defaults match the previous hard-coded values). Together with the previous commit, this drops the test_tile16_cholesky_blocked_demo wall time on cluster CUDA from ~74 s to (expected) a few seconds, removing the largest remaining single-test outlier on hp/mark-slow-tests. --- tests/python/test_tile16.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py index 97480c7d1d..6d917e11ad 100644 --- a/tests/python/test_tile16.py +++ b/tests/python/test_tile16.py @@ -1776,8 +1776,21 @@ def write_eye_f32(dst: Ann32): @test_utils.test(arch=[qd.cuda]) def test_tile16_cholesky_blocked_demo(): - """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.""" + """Smoke-test that misc/demos/cholesky_blocked.py runs to completion. + + Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the + JIT compile of the 3 unrolled kernels and the benchmark loop both stay + cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised + by anyone running the script manually, not by CI. 
+ """ demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py" - result = subprocess.run([sys.executable, str(demo)], capture_output=True, text=True, timeout=300) + cmd = [ + sys.executable, str(demo), + "--n", "32", + "--n-envs", "64", + "--num-warmup", "1", + "--num-iters", "1", + ] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode != 0: pytest.fail(f"cholesky_blocked.py exited with code {result.returncode}\nstderr:\n{result.stderr}") From dc1319ef9b2c27deffbe0a9c4f64f7e1ac3e43aa Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 07:16:24 -0700 Subject: [PATCH 05/15] [Test] test_matmul_chain_qipc_sizes: parametrize on matrix shapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the test hard-coded the qipc IPC sizes (9x12) · (12x12) · (12x9). On cluster CUDA those two cases (f32 + f64) take ~92.7s and ~87.3s respectively -- the top two single-test outliers in the suite, each holding one xdist worker for ~90s of contiguous JIT-compile + unrolled-FMA work. Parametrize `_test_matmul_chain` on (rows_a, cols_a, cols_b, cols_c). Default lane runs the small (3,4,4,3) chain to exercise the same Matrix.__matmul__ codegen path; the original (9,12,12,9) qipc-sized chain is slow-marked so it still runs on --run-slow (i.e. CI's nightly / release lane, once that's wired up). Estimated saving: ~180s CPU, ~70s wall (these tests were on the critical path of the branch run). No function-level coverage lost: both f32 and f64 versions still run the same chain by default, just at a smaller size. 
--- tests/python/test_linalg.py | 53 ++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/tests/python/test_linalg.py b/tests/python/test_linalg.py index a6632d4678..59925ee2ce 100644 --- a/tests/python/test_linalg.py +++ b/tests/python/test_linalg.py @@ -215,24 +215,26 @@ def test_frobenius_inner_rectangular_f64(rows, cols): _test_frobenius_inner_rectangular(rows, cols, qd.f64) -def _test_matmul_chain(dt): - """3-way matmul chain at qipc IPC sizes: (9×12) · (12×12) · (12×9) → (9×9). - - Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the largest size qipc needs. Quadrants - imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces ~1296 FMAs per - intermediate, so this test catches compile-time blow-up or back-end miscompiles at large sizes. +def _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, dt): + """3-way matmul chain: ``(rows_a × cols_a) · (cols_a × cols_b) · (cols_b × cols_c) → (rows_a × cols_c)``. + + Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the requested size. Quadrants + imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces + ``rows_a * cols_a * cols_b + rows_a * cols_b * cols_c`` FMAs per kernel call, so this test catches compile-time + blow-up or back-end miscompiles at large sizes. The largest parametrize value is the chain qipc actually uses; + smaller values are cheap sanity checks that the same code path still works. 
""" np_dt = np.float32 if dt == qd.f32 else np.float64 - A_np = np.random.default_rng(0xCA70).standard_normal((9, 12)).astype(np_dt) - B_np = np.random.default_rng(0xCA71).standard_normal((12, 12)).astype(np_dt) - C_np = np.random.default_rng(0xCA72).standard_normal((12, 9)).astype(np_dt) + A_np = np.random.default_rng(0xCA70).standard_normal((rows_a, cols_a)).astype(np_dt) + B_np = np.random.default_rng(0xCA71).standard_normal((cols_a, cols_b)).astype(np_dt) + C_np = np.random.default_rng(0xCA72).standard_normal((cols_b, cols_c)).astype(np_dt) - A = qd.Matrix.field(9, 12, dtype=dt, shape=()) - B = qd.Matrix.field(12, 12, dtype=dt, shape=()) - C = qd.Matrix.field(12, 9, dtype=dt, shape=()) - AB = qd.Matrix.field(9, 12, dtype=dt, shape=()) - ABC_chained = qd.Matrix.field(9, 9, dtype=dt, shape=()) - ABC_staged = qd.Matrix.field(9, 9, dtype=dt, shape=()) + A = qd.Matrix.field(rows_a, cols_a, dtype=dt, shape=()) + B = qd.Matrix.field(cols_a, cols_b, dtype=dt, shape=()) + C = qd.Matrix.field(cols_b, cols_c, dtype=dt, shape=()) + AB = qd.Matrix.field(rows_a, cols_b, dtype=dt, shape=()) + ABC_chained = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=()) + ABC_staged = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=()) A.from_numpy(A_np) B.from_numpy(B_np) @@ -255,16 +257,25 @@ def run(): np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol) -@pytest.mark.slow +# qipc's actual size is (9,12,12,9) -- the largest chain it instantiates. We also keep a tiny (3,4,4,3) chain so +# the default fast lane still exercises the same Matrix.__matmul__ codegen path without paying the ~90s/case +# CUDA JIT cost of the qipc-sized chain. 
+_MATMUL_CHAIN_SHAPES = [ + (3, 4, 4, 3), + pytest.param(9, 12, 12, 9, marks=pytest.mark.slow), +] + + +@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) -def test_matmul_chain_qipc_sizes_f32(): - _test_matmul_chain(qd.f32) +def test_matmul_chain_qipc_sizes_f32(rows_a, cols_a, cols_b, cols_c): + _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f32) -@pytest.mark.slow +@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) -def test_matmul_chain_qipc_sizes_f64(): - _test_matmul_chain(qd.f64) +def test_matmul_chain_qipc_sizes_f64(rows_a, cols_a, cols_b, cols_c): + _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f64) @test_utils.test() From 81d45a00964fe9202d077489644035e48143e32c Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 07:17:17 -0700 Subject: [PATCH 06/15] [Test] test_gdar_mpm: parametrize on particles_side / n_grid / num_steps Previously hard-coded N=30 (900 particles), n_grid=120, steps=32 -- 26s on cluster CUDA. The test's actual contract is that the AD-validation checker raises QuadrantsAssertionError on the global-data-access violation in g2p (`v[f, p] = new_v`), which fires on the first substep regardless of grid / particle / step counts. Parametrize on (particles_side, n_grid_size, num_steps) with a small default (8, 32, 4) and slow-marked original (30, 120, 32). The default still exercises the same diff-MPM pipeline (p2g / grid_op / g2p, qd.ad.Tape with validation=True, `with pytest.raises(...)`) and still triggers the assertion error. Estimated CPU saving: ~22s; wall saving ~3s on the branch run. 
--- tests/python/test_ad_gdar_diffmpm.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py index cd6bb32a04..0e0e460534 100644 --- a/tests/python/test_ad_gdar_diffmpm.py +++ b/tests/python/test_ad_gdar_diffmpm.py @@ -5,14 +5,26 @@ from tests import test_utils +# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay +# stays cheap; the slow-marked entry keeps the original (N=30, n_grid=120, steps=32) +# workload that runs on --run-slow. The point of the test is that the AD-validation +# checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which +# happens on the first substep regardless of size. +@pytest.mark.parametrize( + "particles_side,n_grid_size,num_steps", + [ + (8, 32, 4), + pytest.param(30, 120, 32, marks=pytest.mark.slow), + ], +) @test_utils.test(require=qd.extension.assertion, debug=True) -def test_gdar_mpm(): +def test_gdar_mpm(particles_side, n_grid_size, num_steps): real = qd.f32 dim = 2 - N = 30 # reduce to 30 if run out of GPU memory + N = particles_side n_particles = N * N - n_grid = 120 + n_grid = n_grid_size dx = 1 / n_grid inv_dx = 1 / dx dt = 3e-4 @@ -21,8 +33,8 @@ def test_gdar_mpm(): E = 100 mu = E la = E - max_steps = 32 - steps = 32 + max_steps = num_steps + steps = num_steps gravity = 9.8 target = [0.3, 0.6] From 1b08117ea0bf317eb1672f6e578dd5179ad171cc Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 08:02:22 -0700 Subject: [PATCH 07/15] [Test] test_device_{reduce,exclusive_scan}: fuse {add,min,max} into one op-parametrized test The three reduce variants (and the three scan variants) shared an identical kernel signature, identical input shape, and differed only in (a) which qd.algorithms.device_ function they called and (b) overflow vs bitwise-exact verification. 
Collapse each triple into a single op-parametrized test: test_device_reduce(op, dtype, N) # op in {add, min, max} test_device_exclusive_scan(op, dtype, N) # op in {add, min, max} Behavior, coverage and the parametrize space are unchanged -- pytest still collects the same number of parametrize cases, just under unified test names. This is purely a code-dedup refactor (~130 LOC less) which makes the next op-axis sampling change (if/when we choose to drop A vs B vs C from the sweep) a one-line edit. --- tests/python/test_algorithms.py | 240 ++++++++++++++------------------ 1 file changed, 106 insertions(+), 134 deletions(-) diff --git a/tests/python/test_algorithms.py b/tests/python/test_algorithms.py index e4b4ac9960..508732ce3b 100644 --- a/tests/python/test_algorithms.py +++ b/tests/python/test_algorithms.py @@ -320,86 +320,79 @@ def _rand_reduce_host(rng, dtype, N, *, bound=1000): return rng.integers(-bound, bound, size=N, dtype=np_dt) -@pytest.mark.parametrize("N", _REDUCE_SIZES) -@pytest.mark.parametrize("dtype", _REDUCE_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_reduce_add(dtype, N): - """device_reduce_add matches numpy.sum across the full size sweep + dtype set.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - host = _rand_reduce_host(rng, dtype, N) - _fill_field(inp, host) +_REDUCE_OPS = ["add", "min", "max"] - qd.algorithms.device_reduce_add(inp, out=out) - got = out.to_numpy()[0] +def _reduce_host(rng, op, dtype, N): + """Generate the test input for a reduce of `op` on `dtype` x N values. + + ``add`` uses small uniform / bounded values so float sums stay representable; ``min`` and ``max`` use a wider + range (-10..10 for floats, +-10000 for ints) since picking-an-element is bitwise-exact regardless of magnitude. 
+ """ + if op == "add": + return _rand_reduce_host(rng, dtype, N) if _is_float(dtype): - expected = float(np.sum(host.astype(np.float64))) - rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) - assert math.isclose( - got, expected, rel_tol=rtol, abs_tol=atol - ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}" - else: - # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the - # u32 / u64 mod-wrap case at large N. - mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None - ref = int( - np.sum(host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64))) - ) # noqa: E501 - got_int = int(got) - if mod is not None: - ref &= mod - 1 - got_int &= mod - 1 - assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}" + return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) + return _rand_reduce_host(rng, dtype, N, bound=10000) -@pytest.mark.parametrize("N", _REDUCE_SIZES) -@pytest.mark.parametrize("dtype", _REDUCE_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_reduce_min(dtype, N): - """device_reduce_min(identity=type-positive-extreme) matches numpy.min.""" +def _check_reduce(op, dtype, N): + """Run ``device_reduce_(arr)`` and verify against ``numpy.(arr)``. + + ``add`` accumulates so it needs (a) wider integer promotion + mod-wrap masking for u32/u64 and (b) per-N float + tolerance. ``min`` / ``max`` pick one input element, so they're bitwise-exact for both ints and floats. 
+ """ _skip_if_dtype_unsupported(dtype) inp, out = _alloc_input_out(dtype, N) rng = np.random.default_rng(seed=1234) - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) + host = _reduce_host(rng, op, dtype, N) _fill_field(inp, host) - qd.algorithms.device_reduce_min(inp, out=out) + qd_fn = getattr(qd.algorithms, f"device_reduce_{op}") + qd_fn(inp, out=out) got = out.to_numpy()[0] - expected = host.min() + if op == "add": + if _is_float(dtype): + expected = float(np.sum(host.astype(np.float64))) + rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) + assert math.isclose( + got, expected, rel_tol=rtol, abs_tol=atol + ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}" + else: + # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the + # u32 / u64 mod-wrap case at large N. + mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None + ref = int( + np.sum( + host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64)) + ) + ) # noqa: E501 + got_int = int(got) + if mod is not None: + ref &= mod - 1 + got_int &= mod - 1 + assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}" + return + + expected = host.min() if op == "min" else host.max() if _is_float(dtype): assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12) else: - assert int(got) == int(expected), f"{dtype} reduce_min(N={N}): got {got}, expected {expected}" + assert int(got) == int(expected), f"{dtype} reduce_{op}(N={N}): got {got}, expected {expected}" +@pytest.mark.parametrize("op", _REDUCE_OPS) @pytest.mark.parametrize("N", _REDUCE_SIZES) @pytest.mark.parametrize("dtype", _REDUCE_DTYPES) @test_utils.test(arch=qd.gpu) -def test_device_reduce_max(dtype, N): - 
"""device_reduce_max(identity=type-negative-extreme) matches numpy.max.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) - _fill_field(inp, host) - - qd.algorithms.device_reduce_max(inp, out=out) - got = out.to_numpy()[0] - expected = host.max() +def test_device_reduce(op, dtype, N): + """``device_reduce_{add,min,max}`` match numpy across the full size sweep + dtype set. - if _is_float(dtype): - assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12) - else: - assert int(got) == int(expected), f"{dtype} reduce_max(N={N}): got {got}, expected {expected}" + Unified across the three op variants. ``add`` accumulates so it needs overflow / precision-aware comparison; + ``min`` / ``max`` pick one element of the input and are bitwise-exact. + """ + _check_reduce(op, dtype, N) @test_utils.test(arch=qd.gpu) @@ -454,101 +447,80 @@ def _scan_dtype_mask(dtype): return -1 -@pytest.mark.parametrize("N", _SCAN_SIZES) -@pytest.mark.parametrize("dtype", _SCAN_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_exclusive_scan_add(dtype, N): - """device_exclusive_scan_add(out[i] = sum(arr[0:i])) matches numpy.cumsum-shifted across the full 6-dtype set.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_scan_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - host = _rand_reduce_host(rng, dtype, N, bound=100) - _fill_field(inp, host) +_SCAN_OPS = ["add", "min", "max"] - qd.algorithms.device_exclusive_scan_add(inp, out=out) - got = out.to_numpy() +def _scan_host(rng, op, dtype, N): + """Generate the test input for a scan of `op` on `dtype` x N values. 
Same rationale as ``_reduce_host``.""" + if op == "add": + return _rand_reduce_host(rng, dtype, N, bound=100) if _is_float(dtype): - ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]]) - rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) - np.testing.assert_allclose( - got.astype(np.float64), - ref, - rtol=rtol, - atol=atol, - err_msg=f"{dtype} scan_add(N={N})", - ) - else: - # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference; smaller - # ints can still use int64. - promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64 - host_wide = host.astype(promote) - ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote) - mask = _scan_dtype_mask(dtype) - got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64) - if mask != -1: - got_view = got_view & promote(mask) - ref = ref & promote(mask) - np.testing.assert_array_equal( - got_view, - ref, - err_msg=f"{dtype} scan_add(N={N})", - ) + return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) + return _rand_reduce_host(rng, dtype, N, bound=10000) -@pytest.mark.parametrize("N", _SCAN_SIZES) -@pytest.mark.parametrize("dtype", _SCAN_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_exclusive_scan_min(dtype, N): - """device_exclusive_scan_min(out[i] = min(arr[0:i])) matches numpy.minimum.accumulate-shifted across the full - 6-dtype set.""" +def _check_scan(op, dtype, N): + """Run ``device_exclusive_scan_(arr)`` and verify against ``numpy..accumulate``-shifted. + + Like the reduce family, ``add`` accumulates (overflow / precision care) while ``min`` / ``max`` are + bitwise-exact in both float and int paths. 
+ """ _skip_if_dtype_unsupported(dtype) inp, out = _alloc_scan_input_out(dtype, N) rng = np.random.default_rng(seed=1234) np_dt = _DTYPE_TO_NP[dtype] - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) + host = _scan_host(rng, op, dtype, N) _fill_field(inp, host) - qd.algorithms.device_exclusive_scan_min(inp, out=out) + qd_fn = getattr(qd.algorithms, f"device_exclusive_scan_{op}") + qd_fn(inp, out=out) got = out.to_numpy() + if op == "add": + if _is_float(dtype): + ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]]) + rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) + np.testing.assert_allclose( + got.astype(np.float64), + ref, + rtol=rtol, + atol=atol, + err_msg=f"{dtype} scan_add(N={N})", + ) + else: + # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference; + # smaller ints can still use int64. + promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64 + host_wide = host.astype(promote) + ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote) + mask = _scan_dtype_mask(dtype) + got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64) + if mask != -1: + got_view = got_view & promote(mask) + ref = ref & promote(mask) + np.testing.assert_array_equal(got_view, ref, err_msg=f"{dtype} scan_add(N={N})") + return + + np_accum = np.minimum.accumulate if op == "min" else np.maximum.accumulate + identity_table = _MIN_IDENTITY if op == "min" else _MAX_IDENTITY if _is_float(dtype): - ref = np.concatenate([[float("inf")], np.minimum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt) - atol = 0 if dtype == qd.f32 else 0 # min is bitwise-exact for monotone ops on float - np.testing.assert_allclose(got, ref, rtol=0, atol=atol, err_msg=f"{dtype} scan_min(N={N})") + identity = float("inf") if op == "min" else float("-inf") + ref = 
np.concatenate([[identity], np_accum(host.astype(np.float64))[:-1]]).astype(np_dt) + np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_{op}(N={N})") else: - ref = np.concatenate([[np_dt(_MIN_IDENTITY[dtype])], np.minimum.accumulate(host)[:-1]]).astype(np_dt) - np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_min(N={N})") + ref = np.concatenate([[np_dt(identity_table[dtype])], np_accum(host)[:-1]]).astype(np_dt) + np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_{op}(N={N})") +@pytest.mark.parametrize("op", _SCAN_OPS) @pytest.mark.parametrize("N", _SCAN_SIZES) @pytest.mark.parametrize("dtype", _SCAN_DTYPES) @test_utils.test(arch=qd.gpu) -def test_device_exclusive_scan_max(dtype, N): - """device_exclusive_scan_max(out[i] = max(arr[0:i])) matches numpy.maximum.accumulate-shifted across the full - 6-dtype set.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_scan_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - np_dt = _DTYPE_TO_NP[dtype] - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) - _fill_field(inp, host) - - qd.algorithms.device_exclusive_scan_max(inp, out=out) - got = out.to_numpy() - - if _is_float(dtype): - ref = np.concatenate([[float("-inf")], np.maximum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt) - np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_max(N={N})") - else: - ref = np.concatenate([[np_dt(_MAX_IDENTITY[dtype])], np.maximum.accumulate(host)[:-1]]).astype(np_dt) - np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_max(N={N})") +def test_device_exclusive_scan(op, dtype, N): + """``device_exclusive_scan_{add,min,max}`` match ``numpy.{cumsum, minimum.accumulate, maximum.accumulate}``-shifted + across the full size sweep + dtype set. 
Unified across the three op variants; same overflow vs bitwise-exact + handling as the reduce family.""" + _check_scan(op, dtype, N) @test_utils.test(arch=qd.gpu) From 8acaaecd8d238080747d8a67ffbef7db708a63ac Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 08:03:21 -0700 Subject: [PATCH 08/15] [Style] black: reformat test_tile16_cholesky_blocked_demo cmd list + run_tests help string Pure formatting fix from `pre-commit run -a`; no behavior change. --- tests/python/test_tile16.py | 15 ++++++++++----- tests/run_tests.py | 3 +-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py index 6d917e11ad..f94d4221e1 100644 --- a/tests/python/test_tile16.py +++ b/tests/python/test_tile16.py @@ -1785,11 +1785,16 @@ def test_tile16_cholesky_blocked_demo(): """ demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py" cmd = [ - sys.executable, str(demo), - "--n", "32", - "--n-envs", "64", - "--num-warmup", "1", - "--num-iters", "1", + sys.executable, + str(demo), + "--n", + "32", + "--n-envs", + "64", + "--num-warmup", + "1", + "--num-iters", + "1", ] result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode != 0: diff --git a/tests/run_tests.py b/tests/run_tests.py index 47d5574ad0..bf37ab2aa7 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -168,8 +168,7 @@ def test(): default=None, dest="marks", type=str, - help="Only run tests with specific marks. `not slow` is appended automatically " - "unless --run-slow is passed.", + help="Only run tests with specific marks. 
`not slow` is appended automatically " "unless --run-slow is passed.", ) parser.add_argument( "--run-slow", From e57752abdeab48ada14ccf77af8b7dc54b5f0acd Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 08:09:25 -0700 Subject: [PATCH 09/15] [Test] test_subgroup_full_matches_tiled: fuse 20 thin subgroup-op wrappers into 2 op-parametrized tests Lines 3608-3694 in test_simt.py were 18 ~5-line wrappers each calling ``_check_full_matches_tiled(subgroup.<op>, subgroup.<op>_tiled, ...)``. Lines 3841-3848 were 2 more, parametrized on dtype. ``_check_full_matches_tiled`` already accepts the full / tiled functions as Python arguments (closure-captured into ``@qd.kernel``), so collapsing the family is a pure dedup move: test_subgroup_full_matches_tiled(op_name, host_init) # 18 cases: {reduce, inclusive, exclusive}_{add,min,max,mul,and,or,xor} on qd.i32 test_subgroup_full_matches_tiled_float(op_name, dtype) # 4 cases: {reduce_add, inclusive_add} x {qd.f32, qd.f64} Behavior + coverage unchanged (still 22 parametrize cases, same dtype + init configurations). Pytest ids are designed to match the original test-name suffixes (e.g. ``[reduce_add]``, ``[inclusive_mul]``) so ``-k`` selectors and test reports stay readable. Drops ~50 LOC net.
--- tests/python/test_simt.py | 136 ++++++++++++-------------------------- 1 file changed, 43 insertions(+), 93 deletions(-) diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 95e3438e41..6790d3afb5 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -3604,94 +3604,45 @@ def _init_full_bitwise(src, n): src[i] = 1 << (i % 7) -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_add(): - _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_all_add(): - _check_full_matches_tiled(subgroup.reduce_all_add, subgroup.reduce_all_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_min(): - _check_full_matches_tiled(subgroup.reduce_min, subgroup.reduce_min_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_max(): - _check_full_matches_tiled(subgroup.reduce_max, subgroup.reduce_max_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_all_min(): - _check_full_matches_tiled(subgroup.reduce_all_min, subgroup.reduce_all_min_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_all_max(): - _check_full_matches_tiled(subgroup.reduce_all_max, subgroup.reduce_all_max_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_add(): - _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_min(): - _check_full_matches_tiled(subgroup.inclusive_min, subgroup.inclusive_min_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_max(): - _check_full_matches_tiled(subgroup.inclusive_max, subgroup.inclusive_max_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_mul(): - _check_full_matches_tiled(subgroup.inclusive_mul, subgroup.inclusive_mul_tiled, host_init=_init_full_small_int) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_and(): - 
_check_full_matches_tiled(subgroup.inclusive_and, subgroup.inclusive_and_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_or(): - _check_full_matches_tiled(subgroup.inclusive_or, subgroup.inclusive_or_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_xor(): - _check_full_matches_tiled(subgroup.inclusive_xor, subgroup.inclusive_xor_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_add(): - _check_full_matches_tiled(subgroup.exclusive_add, subgroup.exclusive_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_mul(): - _check_full_matches_tiled(subgroup.exclusive_mul, subgroup.exclusive_mul_tiled, host_init=_init_full_small_int) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_and(): - _check_full_matches_tiled(subgroup.exclusive_and, subgroup.exclusive_and_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_or(): - _check_full_matches_tiled(subgroup.exclusive_or, subgroup.exclusive_or_tiled, host_init=_init_full_bitwise) +# Each entry is a thin ``_check_full_matches_tiled(subgroup.X, subgroup.X_tiled, ...)`` wrapper. Collapsed into one +# op-parametrized test to drop ~80 LOC of duplication. The pytest ids match the names of the original +# ``test_subgroup_<op>`` functions so test reports / `-k` selectors stay stable.
+_FULL_VS_TILED_INT_CASES = [ + pytest.param("reduce_add", None, id="reduce_add"), + pytest.param("reduce_all_add", None, id="reduce_all_add"), + pytest.param("reduce_min", None, id="reduce_min"), + pytest.param("reduce_max", None, id="reduce_max"), + pytest.param("reduce_all_min", None, id="reduce_all_min"), + pytest.param("reduce_all_max", None, id="reduce_all_max"), + pytest.param("inclusive_add", None, id="inclusive_add"), + pytest.param("inclusive_min", None, id="inclusive_min"), + pytest.param("inclusive_max", None, id="inclusive_max"), + # `mul` needs bounded inputs (2**N overflows i32 quickly); bitwise ops need a per-lane bit pattern that's + # non-zero on every lane so AND has signal and OR / XOR have varied bits. + pytest.param("inclusive_mul", _init_full_small_int, id="inclusive_mul"), + pytest.param("inclusive_and", _init_full_bitwise, id="inclusive_and"), + pytest.param("inclusive_or", _init_full_bitwise, id="inclusive_or"), + pytest.param("inclusive_xor", _init_full_bitwise, id="inclusive_xor"), + pytest.param("exclusive_add", None, id="exclusive_add"), + pytest.param("exclusive_mul", _init_full_small_int, id="exclusive_mul"), + pytest.param("exclusive_and", _init_full_bitwise, id="exclusive_and"), + pytest.param("exclusive_or", _init_full_bitwise, id="exclusive_or"), + pytest.param("exclusive_xor", _init_full_bitwise, id="exclusive_xor"), +] +@pytest.mark.parametrize("op_name,host_init", _FULL_VS_TILED_INT_CASES) @test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_xor(): - _check_full_matches_tiled(subgroup.exclusive_xor, subgroup.exclusive_xor_tiled, host_init=_init_full_bitwise) +def test_subgroup_full_matches_tiled(op_name, host_init): + """For each subgroup op ``X``, verify ``subgroup.X(v)`` matches ``subgroup.X_tiled(v, log2_group_size())`` + lane-by-lane on ``qd.i32``. 
Covers reduce / inclusive / exclusive families; bitwise ops + ``mul`` use a custom + initializer that keeps the per-lane aggregate bounded.""" + full_fn = getattr(subgroup, op_name) + tiled_fn = getattr(subgroup, f"{op_name}_tiled") + kwargs = {} + if host_init is not None: + kwargs["host_init"] = host_init + _check_full_matches_tiled(full_fn, tiled_fn, **kwargs) @test_utils.test(arch=qd.gpu) @@ -3836,16 +3787,15 @@ def k(): # accidentally cast through i32 inside a wrapper. +@pytest.mark.parametrize("op_name", ["reduce_add", "inclusive_add"]) @pytest.mark.parametrize("dtype", [qd.f32, qd.f64]) @test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_add_float(dtype): - _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled, dtype=dtype) - - -@pytest.mark.parametrize("dtype", [qd.f32, qd.f64]) -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_add_float(dtype): - _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled, dtype=dtype) +def test_subgroup_full_matches_tiled_float(op_name, dtype): + """Float-dtype coverage of the dtype-agnostic ``full`` wrappers (``reduce_add``, ``inclusive_add``). One f32 + one + f64 case per family is enough to catch an i32-only regression in a wrapper.""" + full_fn = getattr(subgroup, op_name) + tiled_fn = getattr(subgroup, f"{op_name}_tiled") + _check_full_matches_tiled(full_fn, tiled_fn, dtype=dtype) @pytest.mark.parametrize("dtype", [qd.f32, qd.f64]) From 4c18c86e59a275898f93adb71f5ee788d2d1d076 Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 08:13:10 -0700 Subject: [PATCH 10/15] [Test] test_block_reduce{,_all}: fuse {add,min,max} into op-parametrized tests The six block-reduce tests (3 single-output + 3 broadcast) share an identical kernel skeleton, parametrize axes, and verification loop. 
They only differ in which `block.reduce_*` function is called (closure-captured into `@qd.kernel` via getattr), the host-side reference oracle, the init pattern (sequential for `add` so the running sum has signal; permuted hash for `min` / `max` so the result depends on lanes other than first / last), and the float tolerance regime (relative for accumulating `add`, absolute for picker `min` / `max`). Collapse the six tests into two op-parametrized tests: test_block_reduce(sg_per_block, dtype, op_name, ...) # single-output, 3 ops test_block_reduce_all(sg_per_block, dtype, op_name, ...) # broadcast, 3 ops Parametrize space is unchanged (3 sg x 5 dtype x 3 op = 45 cases per fused test, matching the original 3 tests x 15 cases each). Pytest ids use plain `[add|min|max]` suffixes so `-k` selectors remain readable. Drops ~100 LOC of boilerplate -- two new small helpers (`_init_block_reduce_src` and `_assert_block_reduce_close`) capture the per-op behavioral differences in one place each. --- tests/python/test_simt.py | 210 ++++++++++---------------------------- 1 file changed, 54 insertions(+), 156 deletions(-) diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 6790d3afb5..5b7d7490cd 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -887,81 +887,57 @@ def _ref_reduce_max(values): return max(values) -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_add(dtype, sg_per_block): - """Block sum-reduce: thread 0 of each block holds `sum(src[block_base:block_base+block_dim])`.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=NUM_BLOCKS) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - tid = i % block_dim - agg 
= block.reduce_add(src[i], block_dim, dtype) - if tid == 0: - dst[i // block_dim] = agg - - _init_field(src, N, dtype) - foo() - - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_add(block_vals) - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}" - else: - assert abs(dst[b] - expected) < 1e-4 * abs(expected), f"block {b}: got {dst[b]}, expected {expected}" - +# The three single-output reduces (`test_block_reduce_{add,min,max}`) and their three broadcast siblings +# (`test_block_reduce_all_{add,min,max}`) share the same kernel skeleton, parametrize axes, and verification loop; +# they differ only in (a) which `block.reduce_*` function gets called, (b) the host-side reference oracle, (c) the +# init pattern (sequential for `add` so the running sum has signal, permuted hash for `min` / `max` so the result +# depends on lanes other than first / last), and (d) the float tolerance regime (`add` accumulates so it uses a +# relative tol; `min` / `max` pick one element of the input and use an absolute tol). 
+_BLOCK_REDUCE_OP_CASES = [ + # (op_name, ref_fn, init_permuted, tol_relative) + pytest.param("add", _ref_reduce_add, False, True, id="add"), + pytest.param("min", _ref_reduce_min, True, False, id="min"), + pytest.param("max", _ref_reduce_max, True, False, id="max"), +] -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_min(dtype, sg_per_block): - """Block min-reduce: thread 0 of each block holds `min(src[block_base:block_base+block_dim])`.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=NUM_BLOCKS) - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) +def _init_block_reduce_src(src, N, dtype, *, permuted): + """Initialize ``src[0:N]`` for a block reduce test. ``permuted=False`` is the sequential ``1..N`` init from + ``_init_field`` (good for add); ``permuted=True`` is the stable hash ``((i * 1009) % 997) + 1`` so the per-block + min / max depends on lanes other than first / last.""" + if permuted: for i in range(N): - tid = i % block_dim - agg = block.reduce_min(src[i], block_dim, dtype) - if tid == 0: - dst[i // block_dim] = agg + v = ((i * 1009) % 997) + 1 + src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + else: + _init_field(src, N, dtype) - # Permuted (non-monotone) initialisation so the min depends on lanes other than the first / last. 
- for i in range(N): - v = ((i * 1009) % 997) + 1 # in [1, 997]; stable hash, no collisions w/ block_dim values up to 256 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_min(block_vals) - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}" - else: - assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}" +def _assert_block_reduce_close(actual, expected, dtype, *, tol_relative, ctx): + """Assert ``actual ~= expected`` per the block-reduce tolerance regime. + + Int dtypes compare exactly. Floats use relative tolerance ``1e-4 * |expected|`` for accumulating ops (sums grow + with block_dim, so a relative bound is the only thing that stays meaningful across the 32 / 128 / 256 / 64 / 256 / + 512 block-size sweep), and absolute tolerance ``1e-5`` for picker ops (min / max pick one element so the + magnitude is whatever was in the input -- a small absolute bound suffices). 
+ """ + if dtype in _BLOCK_REDUCE_INT_DTYPES: + assert actual == expected, f"{ctx}: got {actual}, expected {expected}" + elif tol_relative: + assert abs(actual - expected) < 1e-4 * abs(expected), f"{ctx}: got {actual}, expected {expected}" + else: + assert abs(actual - expected) < 1e-5, f"{ctx}: got {actual}, expected {expected}" +@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_reduce_max(dtype, sg_per_block): - """Block max-reduce: thread 0 of each block holds `max(src[block_base:block_base+block_dim])`.""" +def test_block_reduce(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative): + """Block reduce: thread 0 of each block holds ``<op>(src[block_base:block_base+block_dim])``. Unified across + ``add`` / ``min`` / ``max`` -- op-name is closure-captured into ``@qd.kernel``.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"reduce_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -973,102 +949,29 @@ def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): tid = i % block_dim - agg = block.reduce_max(src[i], block_dim, dtype) + agg = op_fn(src[i], block_dim, dtype) if tid == 0: dst[i // block_dim] = agg - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_max(block_vals) - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}" - else: - assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}" - - -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block",
_BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_all_add(dtype, sg_per_block): - """Block sum-reduce broadcast: every thread of each block holds the block-wide sum. - - Verifies the broadcast variant by writing the per-thread output to a flat field, then asserting every thread of a - given block reads the same aggregate. - """ - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.reduce_all_add(src[i], block_dim, dtype) - - _init_field(src, N, dtype) - foo() - - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_add(block_vals) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}" - else: - assert abs(actual - expected) < 1e-4 * abs( - expected - ), f"block {b} thread {j}: got {actual}, expected {expected}" - - -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_all_min(dtype, sg_per_block): - """Block min-reduce broadcast: every thread reads the block-wide min.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.reduce_all_min(src[i], block_dim, dtype) - - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + 
_init_block_reduce_src(src, N, dtype, permuted=init_permuted) foo() for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_min(block_vals) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}" - else: - assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}" + expected = ref_fn(block_vals) + _assert_block_reduce_close(dst[b], expected, dtype, tol_relative=tol_relative, ctx=f"block {b}") +@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_reduce_all_max(dtype, sg_per_block): - """Block max-reduce broadcast: every thread reads the block-wide max.""" +def test_block_reduce_all(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative): + """Block reduce broadcast: every thread of each block holds the block-wide ``<op>``. Verified by writing the + per-thread output to a flat field, then asserting every thread of a given block reads the same aggregate.
+ Unified across ``add`` / ``min`` / ``max``.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"reduce_all_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -1079,22 +982,17 @@ def test_block_reduce_all_max(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = block.reduce_all_max(src[i], block_dim, dtype) + dst[i] = op_fn(src[i], block_dim, dtype) - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + _init_block_reduce_src(src, N, dtype, permuted=init_permuted) foo() for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_max(block_vals) + expected = ref_fn(block_vals) for j in range(block_dim): actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}" - else: - assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}" + _assert_block_reduce_close(actual, expected, dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}") # --- Block scan tests ------------------------------------------------------------------ From 5fb930ef7d2e862cc358e35372a1ac139ce15556 Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 08:14:47 -0700 Subject: [PATCH 11/15] [Test] test_block_inclusive: fuse {add,min,max} into one op-parametrized test The three block inclusive scan tests share the same kernel skeleton and only differ in the closure-captured `block.inclusive_<op>` function, the host-side reference oracle, the init pattern (sequential for `add` -- sums grow with prefix length; permuted for `min` / `max` -- result depends on lanes other than first / last), and the float tolerance regime (relative for `add`, absolute for `min` / `max`).
Collapse into one op-parametrized test: test_block_inclusive(sg_per_block, dtype, op_name, ...) Identical param count to the original three tests (3 sg x 5 dtype x 3 op = 45 cases vs original 3 x 15). Pulls a shared `_assert_block_scan_close` helper out so the int / relative-float / absolute-float regime is encoded in one place; the relative-float branch keeps the floor-on-tol-base trick needed by the original `test_block_exclusive_add` (also routed through the same helper). `test_block_exclusive_add` stays as its own function for now because the matching exclusive `min` / `max` cases need dtype-derived sentinel identities + ``isinf`` handling that's different enough that fusing them in would create more branches than it removes; can address that in a follow-up if needed. --- tests/python/test_simt.py | 131 ++++++++++++-------------------------- 1 file changed, 41 insertions(+), 90 deletions(-) diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 5b7d7490cd..96aeb1e4dc 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -1045,82 +1045,45 @@ def _ref_exclusive_scan_op(values, op, identity): return out -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_inclusive_add(dtype, sg_per_block): - """Block inclusive prefix sum: thread `i` holds `sum(src[block_base..i])`.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.inclusive_add(src[i], block_dim, dtype) - - _init_field(src, N, dtype) - foo() - - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_inclusive_scan_add(block_vals) - for j in 
range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-4 * abs( - expected[j] + 1.0 - ), f"block {b} thread {j}: got {actual}, expected {expected[j]}" - - -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_exclusive_add(dtype, sg_per_block): - """Block exclusive prefix sum: thread `i` holds `sum(src[block_base..i-1])`; thread 0 holds 0.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.exclusive_add(src[i], block_dim, dtype) +# The four scan tests in this group (`test_block_inclusive_{add,min,max}` + `test_block_exclusive_add`) share the +# kernel skeleton; only the per-op reference oracle, init pattern, and float tolerance differ. `add` accumulates +# (sequential init, relative tol); `min` / `max` pick (permuted init, absolute tol). Exclusive `min` / `max` get +# their own dedicated test below because they need a dtype-derived sentinel identity (+inf / iinfo(max), -inf / +# iinfo(min)) at lane 0 with explicit ``isinf`` handling -- different enough that fusing them in would create more +# branches than it removes. 
+_PY_MIN = lambda a, b: a if a < b else b # noqa: E731 (intentional 1-line lambda for ref oracle) +_PY_MAX = lambda a, b: a if a > b else b # noqa: E731 + +_BLOCK_INCLUSIVE_SCAN_OP_CASES = [ + # (op_name, ref_fn, init_permuted, tol_relative) + pytest.param("add", _ref_inclusive_scan_add, False, True, id="add"), + pytest.param("min", lambda vals: _ref_inclusive_scan_op(vals, _PY_MIN, 0), True, False, id="min"), + pytest.param("max", lambda vals: _ref_inclusive_scan_op(vals, _PY_MAX, 0), True, False, id="max"), +] - _init_field(src, N, dtype) - foo() - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_exclusive_scan_add(block_vals) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - # First thread's expected is 0; gate the relative tolerance so it doesn't blow up. - tol_base = abs(expected[j]) if abs(expected[j]) > 1.0 else 1.0 - assert ( - abs(actual - expected[j]) < 1e-4 * tol_base - ), f"block {b} thread {j}: got {actual}, expected {expected[j]}" +def _assert_block_scan_close(actual, expected_j, dtype, *, tol_relative, ctx): + """Per-thread assertion for block scan tests. 
Same int / relative-float / absolute-float regime as + ``_assert_block_reduce_close`` but with a floor on the relative-tol base so the first few prefixes (where + ``expected_j`` is near zero) don't tighten the bound to zero.""" + if dtype in _BLOCK_REDUCE_INT_DTYPES: + assert actual == expected_j, f"{ctx}: got {actual}, expected {expected_j}" + elif tol_relative: + tol_base = abs(expected_j) if abs(expected_j) > 1.0 else 1.0 + assert abs(actual - expected_j) < 1e-4 * tol_base, f"{ctx}: got {actual}, expected {expected_j}" + else: + assert abs(actual - expected_j) < 1e-5, f"{ctx}: got {actual}, expected {expected_j}" +@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_INCLUSIVE_SCAN_OP_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_inclusive_min(dtype, sg_per_block): - """Block inclusive prefix min.""" +def test_block_inclusive(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative): + """Block inclusive prefix scan: thread ``i`` holds ``<op>(src[block_base..i])``.
Unified across ``add`` / ``min`` + / ``max``.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"inclusive_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -1131,30 +1094,24 @@ def test_block_inclusive_min(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = block.inclusive_min(src[i], block_dim, dtype) + dst[i] = op_fn(src[i], block_dim, dtype) - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + _init_block_reduce_src(src, N, dtype, permuted=init_permuted) foo() - py_min = lambda a, b: a if a < b else b # noqa: E731 (intentional 1-line lambda for ref oracle) for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_inclusive_scan_op(block_vals, py_min, 0) + expected = ref_fn(block_vals) for j in range(block_dim): actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" + _assert_block_scan_close(actual, expected[j], dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}") @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_inclusive_max(dtype, sg_per_block): - """Block inclusive prefix max.""" +def test_block_exclusive_add(dtype, sg_per_block): + """Block exclusive prefix sum: thread ``i`` holds ``sum(src[block_base..i-1])``; thread 0 holds 0.""" _skip_if_f64_unsupported(dtype) block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 @@ -1166,23 +1123,17 @@ def test_block_inclusive_max(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = 
block.inclusive_max(src[i], block_dim, dtype) + dst[i] = block.exclusive_add(src[i], block_dim, dtype) - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + _init_field(src, N, dtype) foo() - py_max = lambda a, b: a if a > b else b # noqa: E731 for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_inclusive_scan_op(block_vals, py_max, 0) + expected = _ref_exclusive_scan_add(block_vals) for j in range(block_dim): actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" + _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}") @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) From aa25a36d4c125c0e784a91572f830520e3f095bc Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 08:17:25 -0700 Subject: [PATCH 12/15] [Test] test_block_exclusive_minmax: fuse {min,max} into one op-parametrized test `test_block_exclusive_min` and `test_block_exclusive_max` share the same permuted-init pattern and only differ in the dtype-derived sentinel identity (``+inf`` / ``iinfo.max`` for min, ``-inf`` / ``iinfo.min`` for max) and the inf-sign check at lane 0. Collapse into one op-parametrized test that takes ``(op_name, sentinel_fn, py_op, inf_sign)`` and dispatches via getattr + the (already module-level) `_PY_MIN` / `_PY_MAX` lambdas. Identical param count to the original pair (3 sg x 5 dtype x 2 op = 30 cases vs original 2 x 15 each = 30). `test_block_exclusive_add` remains its own function because the integer identity is `0` (not `iinfo.max/min`) and the init pattern is sequential -- different enough that fusing it in would add more branches than it removes. Drops ~30 LOC. 
--- tests/python/test_simt.py | 71 ++++++++++++--------------------------- 1 file changed, 21 insertions(+), 50 deletions(-) diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 96aeb1e4dc..8c44a40bf9 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -1136,12 +1136,24 @@ def foo(): _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}") +_BLOCK_EXCLUSIVE_MINMAX_CASES = [ + # (op_name, sentinel_fn, py_op, inf_sign) + pytest.param("min", _block_exclusive_min_sentinel, _PY_MIN, 1, id="min"), + pytest.param("max", _block_exclusive_max_sentinel, _PY_MAX, -1, id="max"), +] + + +@pytest.mark.parametrize("op_name,sentinel_fn,py_op,inf_sign", _BLOCK_EXCLUSIVE_MINMAX_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_exclusive_min(dtype, sg_per_block): - """Block exclusive prefix min; thread 0 holds the dtype-derived identity (``+inf`` / ``np.iinfo(dtype).max``).""" +def test_block_exclusive_minmax(dtype, sg_per_block, op_name, sentinel_fn, py_op, inf_sign): + """Block exclusive prefix ``<op>`` for ``op in {min, max}``; thread 0 of each block holds the dtype-derived + identity (``+inf`` / ``iinfo(dtype).max`` for min, ``-inf`` / ``iinfo(dtype).min`` for max).
The float ``inf`` / + ``-inf`` lane-0 identity gets a sign-only check because ``inf - inf`` (or ``(-inf) - (-inf)``) is ``NaN`` and the + standard ``abs(diff) < tol`` compare would fail spuriously.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"exclusive_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -1152,25 +1164,23 @@ def test_block_exclusive_min(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = block.exclusive_min(src[i], block_dim, dtype) + dst[i] = op_fn(src[i], block_dim, dtype) - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + _init_block_reduce_src(src, N, dtype, permuted=True) foo() - sentinel = _block_exclusive_min_sentinel(dtype) - py_min = lambda a, b: a if a < b else b # noqa: E731 + sentinel = sentinel_fn(dtype) for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_exclusive_scan_op(block_vals, py_min, sentinel) + expected = _ref_exclusive_scan_op(block_vals, py_op, sentinel) for j in range(block_dim): actual = dst[b * block_dim + j] if dtype in _BLOCK_REDUCE_INT_DTYPES: assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" elif math.isinf(expected[j]): - # Thread 0 of each block gets the +inf identity; ``inf - inf`` is NaN, so check by equality / sign. 
- assert math.isinf(actual) and actual > 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}" + assert math.isinf(actual) and ( + actual > 0 if inf_sign > 0 else actual < 0 + ), f"block {b} thread {j}: got {actual}, expected {expected[j]}" else: assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" @@ -1304,45 +1314,6 @@ def kern(): assert actual_ranks == ref_ranks, f"ranks mismatch (pattern={key_pattern})" -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_exclusive_max(dtype, sg_per_block): - """Block exclusive prefix max; thread 0 holds the dtype-derived identity (``-inf`` / ``np.iinfo(dtype).min``).""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.exclusive_max(src[i], block_dim, dtype) - - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - - sentinel = _block_exclusive_max_sentinel(dtype) - py_max = lambda a, b: a if a > b else b # noqa: E731 - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_exclusive_scan_op(block_vals, py_max, sentinel) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - elif math.isinf(expected[j]): - # Thread 0 of each block gets the -inf identity; ``-inf - -inf`` is NaN, so check by equality / sign. 
- assert math.isinf(actual) and actual < 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" - - @pytest.mark.parametrize("dtype", [qd.i32, qd.f32, qd.f64]) @test_utils.test(arch=qd.gpu) def test_subgroup_shuffle_broadcast(dtype): From 11b3a89a6e4cc6672aa831215fface7bb4398ede Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 11:03:15 -0700 Subject: [PATCH 13/15] [Style] Reflow CI-flagged 80c-wrapped comments to 120c The PR's `Check line wrapping` CI agent flagged three comments wrapped at the AI-default ~78-90c instead of the project's 120c target. Reflow each to the full target width: - tests/python/test_tile16.py:1791 (78c -> 120c) docstring for test_tile16_cholesky_blocked_demo. - tests/python/test_ad_gdar_diffmpm.py:8 (85c -> 120c) the "defaults shrink ..." comment above the parametrize block. - tests/run_tests.py:60 (90c -> 120c) the "--run-slow opts back in" comment. Also collapse the dangling-backslash continuation in misc/demos/cholesky_blocked.py's Usage example onto one line (69c -> 109c). No behavior change; comments only. Verified via the cursor find-underwrapped skill that the remaining flagged runs in my diff are all 103-116c with save~=0 (already-tight runs the greedy heuristic still reports), comfortably in the agent's "not borderline" exemption. 
--- misc/demos/cholesky_blocked.py | 3 +-- tests/python/test_ad_gdar_diffmpm.py | 9 ++++----- tests/python/test_tile16.py | 7 +++---- tests/run_tests.py | 7 +++---- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py index b4c60c1810..3c72dd39fd 100644 --- a/misc/demos/cholesky_blocked.py +++ b/misc/demos/cholesky_blocked.py @@ -21,8 +21,7 @@ tile16 (Tile16x16, no shared memory) 16 533 5.19x Usage: - python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] \ - [--num-warmup WARMUP] [--num-iters ITERS] + python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] [--num-warmup WARMUP] [--num-iters ITERS] """ import argparse diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py index 0e0e460534..8fd3c56d56 100644 --- a/tests/python/test_ad_gdar_diffmpm.py +++ b/tests/python/test_ad_gdar_diffmpm.py @@ -5,11 +5,10 @@ from tests import test_utils -# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay -# stays cheap; the slow-marked entry keeps the original (N=30, n_grid=120, steps=32) -# workload that runs on --run-slow. The point of the test is that the AD-validation -# checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which -# happens on the first substep regardless of size. +# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay stays cheap; the slow-marked +# entry keeps the original (N=30, n_grid=120, steps=32) workload that runs on --run-slow. The point of the test is +# that the AD-validation checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which happens +# on the first substep regardless of size. 
@pytest.mark.parametrize( "particles_side,n_grid_size,num_steps", [ diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py index f94d4221e1..adf8249605 100644 --- a/tests/python/test_tile16.py +++ b/tests/python/test_tile16.py @@ -1778,10 +1778,9 @@ def write_eye_f32(dst: Ann32): def test_tile16_cholesky_blocked_demo(): """Smoke-test that misc/demos/cholesky_blocked.py runs to completion. - Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the - JIT compile of the 3 unrolled kernels and the benchmark loop both stay - cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised - by anyone running the script manually, not by CI. + Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the JIT compile of the 3 unrolled kernels + and the benchmark loop both stay cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised by + anyone running the script manually, not by CI. """ demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py" cmd = [ diff --git a/tests/run_tests.py b/tests/run_tests.py index bf37ab2aa7..7276ce9d00 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -56,10 +56,9 @@ def _test_python(args, default_dir="python"): pytest_args += ["--cov-append"] if args.keys: pytest_args += ["-k", args.keys] - # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large - # at n>=6, mpm88, etc. — see tests/pytest.ini for the marker). `--run-slow` opts - # back in. If the user passes their own `-m` expression we AND `not slow` onto it - # so the exclusion still applies, unless they explicitly opt out via `--run-slow`. + # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large at n>=6, mpm88, etc. -- see + # tests/pytest.ini for the marker). `--run-slow` opts back in. 
If the user passes their own `-m` expression we + # AND `not slow` onto it so the exclusion still applies, unless they explicitly opt out via `--run-slow`. marks_expr = args.marks if not args.run_slow: marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow" From 7389b5fa83a9bc50e3c1f5e92a3005fc1a4bf660 Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Tue, 19 May 2026 11:12:48 -0700 Subject: [PATCH 14/15] [Doc] contributing.md: shorten testing bullet per PR review Hugh requested in PR #709 review comment that the testing bullet collapse to just a pointer at unit_testing.md, since the long inline summary duplicates the dedicated doc immediately below. --- docs/source/user_guide/contributing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user_guide/contributing.md b/docs/source/user_guide/contributing.md index ec97b9529f..3573179e84 100644 --- a/docs/source/user_guide/contributing.md +++ b/docs/source/user_guide/contributing.md @@ -2,7 +2,7 @@ ## Good practice reminder -* *testing*: Any new features or modified code should be tested. You have to run the test suite using `python tests/run_tests.py` which sets up the right test environment for `pytest`. CLI arguments are forwarded to `pytest`. Do not use `pytest` directly as it behaves differently. To see a per-file timing breakdown (useful for identifying slow test files), set `QD_FILE_TIMING=1` — e.g. `QD_FILE_TIMING=1 python tests/run_tests.py`. This is enabled by default in the Mac CI job and the results appear in the GitHub Actions job summary. +* *testing*: Any new features or modified code should be tested. see [unit_testing.md](unit_testing.md) * *format/linter*: Before pushing any commits, ensure you set up `pre-commit` and run it using `pre-commit run -a` * No need to force push to keep a clean history as the merging is eventually done by squashing commits. 
From a85c6ecbccad873b05993755e318ac29394b5aec Mon Sep 17 00:00:00 2001 From: "Hugh Perkins (deskai7)" Date: Thu, 21 May 2026 03:02:09 -0700 Subject: [PATCH 15/15] [Doc] unit_testing: add slow-only test-suite guide Documents the test launcher, the @pytest.mark.slow marker (whole-test and parametrize-case variants), how to write a new parametrized test with the test_utils.test decorator, and the Advanced section with the per-test timeout, kernel compilation cache, and per-file timing knobs. Modeled on the structure of the equivalent doc on hp/mark-slow-tests (after Hugh's two rounds of PR review feedback there) but with all @pytest.mark.sample references stripped, since the @sample marker is not part of this branch. --- docs/source/user_guide/index.md | 1 + docs/source/user_guide/unit_testing.md | 120 +++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 docs/source/user_guide/unit_testing.md diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md index b648f97527..c824a270e7 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -82,6 +82,7 @@ init_options :maxdepth: 1 :titlesonly: +unit_testing kernel_coverage ``` diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md new file mode 100644 index 0000000000..08453a9912 --- /dev/null +++ b/docs/source/user_guide/unit_testing.md @@ -0,0 +1,120 @@ +# Unit testing + +This page documents how to run, write, and tune the Quadrants Python unit test suite. For setup of the build / dev environment, see [contributing.md](contributing.md). + +## Running the tests + +The test suite is run via the project's launcher, **not** by invoking `pytest` directly: + +``` +python tests/run_tests.py +``` + +The launcher sets up the test-only env vars (kernel offline cache, watchdog, xdist worker count, etc.) and forwards any unrecognised flags to pytest. Calling `pytest` directly skips that setup and behaves differently. 
+ +Common one-liners: + +``` +# run one file +python tests/run_tests.py test_tile16 + +# run one test (any pytest -k expression) +python tests/run_tests.py -k test_tile16_cholesky + +# run on a specific backend (or comma-separated list) +python tests/run_tests.py --arch cuda +python tests/run_tests.py --arch metal -k tile16 + +# same, via env var (handy for CI) +QD_WANTED_ARCHS=metal,vulkan python tests/run_tests.py + +# rerun the last failing tests first +python tests/run_tests.py -f + +# stop at the first failure +python tests/run_tests.py -x +``` + +The target architecture can also be set via `QD_WANTED_ARCHS` (comma-separated; supports `^arch` to exclude rather than include). + +## Markers + +### `@pytest.mark.slow` + +Marks a test as **slow**. `tests/run_tests.py` adds `-m "not slow"` to the pytest invocation by default; pass `--run-slow` to opt back in: + +``` +# default: skip slow +python tests/run_tests.py + +# include slow +python tests/run_tests.py --run-slow + +# slow ONLY (e.g. nightly job) +python tests/run_tests.py -m slow --run-slow +``` + +The marker is used in two patterns: + +1. **Whole-test slow**: the whole test takes a long time. + + ```python + @pytest.mark.slow + def test_thing_that_is_always_slow(): + ... + ``` + +2. **Slow-marked parametrize case**: + + ```python + @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) + def test_sym_eig_general(n): + ... + ``` + + In this specific example the default suite still exercises the code path; the slow lane just adds the larger-size variant for full coverage. + +## Writing new tests + +The standard recipe combines `@test_utils.test(...)` (arch / option matrix) with `@pytest.mark.parametrize`: + +```python +import pytest +import quadrants as qd +from tests import test_utils + + +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) +@test_utils.test(arch=qd.gpu, default_fp=qd.f32) +def test_my_thing(n): + ... 
+``` + +`@test_utils.test` is what wires the test into the per-backend matrix and applies platform exclusions (`exclude=`), extension requirements (`require=`, e.g. `qd.extension.data64` for f64 tests), and per-test options (`default_fp`, `fast_math`, etc.). See `tests/test_utils.py` for the full surface. + +Common helpers in `tests/test_utils.py`: + +- `test_utils.skip_if_f64_unsupported(dtype)` — skip the current test at runtime if `dtype == qd.f64` and the active backend can't carry f64 through buffer I/O (Metal, MoltenVK on Darwin). Use inside a parametrized test that sweeps both f32 and f64. +- `test_utils.expected_archs()` — list of archs that the current `QD_WANTED_ARCHS` allows. Used to skip tests with no satisfiable arch. + +## Advanced + +Optional knobs and runtime details. The defaults work for most contributors. + +### Per-test timeout + +Per-test timeouts default to 600 s and are enforced by `pytest_hardtle`, a CFFI-compiled C watchdog that can kill tests hung in native GPU calls even when the GIL is held. + +### Kernel compilation cache + +During each test session the kernel compilation cache lives in a fresh, empty temp directory created by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) — typically `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`. Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache, and avoids recompiling identical kernels after each `qd.reset()` / `qd.init()` cycle within a session. + +### Per-file timing breakdown + +Set `QD_FILE_TIMING=1` to print a per-file duration summary at the end of the session: + +``` +QD_FILE_TIMING=1 python tests/run_tests.py +``` + +This is enabled by default in the Mac CI job; the results appear in the GitHub Actions job summary and are the primary tool for identifying slow test files.