diff --git a/docs/source/user_guide/contributing.md b/docs/source/user_guide/contributing.md index ec97b9529f..3573179e84 100644 --- a/docs/source/user_guide/contributing.md +++ b/docs/source/user_guide/contributing.md @@ -2,7 +2,7 @@ ## Good practice reminder -* *testing*: Any new features or modified code should be tested. You have to run the test suite using `python tests/run_tests.py` which sets up the right test environment for `pytest`. CLI arguments are forwarded to `pytest`. Do not use `pytest` directly as it behaves differently. To see a per-file timing breakdown (useful for identifying slow test files), set `QD_FILE_TIMING=1` — e.g. `QD_FILE_TIMING=1 python tests/run_tests.py`. This is enabled by default in the Mac CI job and the results appear in the GitHub Actions job summary. +* *testing*: Any new features or modified code should be tested. See [unit_testing.md](unit_testing.md). * *format/linter*: Before pushing any commits, ensure you set up `pre-commit` and run it using `pre-commit run -a` * No need to force push to keep a clean history as the merging is eventually done by squashing commits. diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md index b648f97527..c824a270e7 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -82,6 +82,7 @@ init_options :maxdepth: 1 :titlesonly: +unit_testing kernel_coverage ``` diff --git a/docs/source/user_guide/unit_testing.md b/docs/source/user_guide/unit_testing.md new file mode 100644 index 0000000000..08453a9912 --- /dev/null +++ b/docs/source/user_guide/unit_testing.md @@ -0,0 +1,120 @@ +# Unit testing + +This page documents how to run, write, and tune the Quadrants Python unit test suite. For setup of the build / dev environment, see [contributing.md](contributing.md). 
+ +## Running the tests + +The test suite is run via the project's launcher, **not** by invoking `pytest` directly: + +``` +python tests/run_tests.py +``` + +The launcher sets up the test-only env vars (kernel offline cache, watchdog, xdist worker count, etc.) and forwards any unrecognised flags to pytest. Calling `pytest` directly skips that setup and behaves differently. + +Common one-liners: + +``` +# run one file +python tests/run_tests.py test_tile16 + +# run one test (any pytest -k expression) +python tests/run_tests.py -k test_tile16_cholesky + +# run on a specific backend (or comma-separated list) +python tests/run_tests.py --arch cuda +python tests/run_tests.py --arch metal -k tile16 + +# same, via env var (handy for CI) +QD_WANTED_ARCHS=metal,vulkan python tests/run_tests.py + +# rerun the last failing tests first +python tests/run_tests.py -f + +# stop at the first failure +python tests/run_tests.py -x +``` + +The target architecture can also be set via `QD_WANTED_ARCHS` (comma-separated; supports `^arch` to exclude rather than include). + +## Markers + +### `@pytest.mark.slow` + +Marks a test as **slow**. `tests/run_tests.py` adds `-m "not slow"` to the pytest invocation by default; pass `--run-slow` to opt back in: + +``` +# default: skip slow +python tests/run_tests.py + +# include slow +python tests/run_tests.py --run-slow + +# slow ONLY (e.g. nightly job) +python tests/run_tests.py -m slow --run-slow +``` + +The marker is used in two patterns: + +1. **Whole-test slow**: the whole test takes a long time. + + ```python + @pytest.mark.slow + def test_thing_that_is_always_slow(): + ... + ``` + +2. **Slow-marked parametrize case**: + + ```python + @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) + def test_sym_eig_general(n): + ... + ``` + + In this specific example the default suite still exercises the code path; the slow lane just adds the larger-size variant for full coverage. 
+ +## Writing new tests + +The standard recipe combines `@test_utils.test(...)` (arch / option matrix) with `@pytest.mark.parametrize`: + +```python +import pytest +import quadrants as qd +from tests import test_utils + + +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) +@test_utils.test(arch=qd.gpu, default_fp=qd.f32) +def test_my_thing(n): + ... +``` + +`@test_utils.test` is what wires the test into the per-backend matrix and applies platform exclusions (`exclude=`), extension requirements (`require=`, e.g. `qd.extension.data64` for f64 tests), and per-test options (`default_fp`, `fast_math`, etc.). See `tests/test_utils.py` for the full surface. + +Common helpers in `tests/test_utils.py`: + +- `test_utils.skip_if_f64_unsupported(dtype)` — skip the current test at runtime if `dtype == qd.f64` and the active backend can't carry f64 through buffer I/O (Metal, MoltenVK on Darwin). Use inside a parametrized test that sweeps both f32 and f64. +- `test_utils.expected_archs()` — list of archs that the current `QD_WANTED_ARCHS` allows. Used to skip tests with no satisfiable arch. + +## Advanced + +Optional knobs and runtime details. The defaults work for most contributors. + +### Per-test timeout + +Per-test timeouts default to 600 s and are enforced by `pytest_hardtle`, a CFFI-compiled C watchdog that can kill tests hung in native GPU calls even when the GIL is held. + +### Kernel compilation cache + +During each test session the kernel compilation cache lives in a fresh, empty temp directory created by pytest's [`tmp_path_factory`](https://docs.pytest.org/en/stable/how-to/tmp_path.html) — typically `/tmp/pytest-of-<user>/pytest-<N>/qdcache0/`. Old session directories are cleaned up automatically by pytest's retention policy. This cache is separate from the user-facing `~/.cache/quadrants/` cache, and avoids recompiling identical kernels after each `qd.reset()` / `qd.init()` cycle within a session. 
+ +### Per-file timing breakdown + +Set `QD_FILE_TIMING=1` to print a per-file duration summary at the end of the session: + +``` +QD_FILE_TIMING=1 python tests/run_tests.py +``` + +This is enabled by default in the Mac CI job; the results appear in the GitHub Actions job summary and are the primary tool for identifying slow test files. diff --git a/misc/demos/cholesky_blocked.py b/misc/demos/cholesky_blocked.py index 8dbcb3fbb9..3c72dd39fd 100644 --- a/misc/demos/cholesky_blocked.py +++ b/misc/demos/cholesky_blocked.py @@ -1,13 +1,14 @@ #!/usr/bin/env python3 -"""Benchmark 92x92 blocked Cholesky factorization using Tile16x16. +"""Benchmark NxN blocked Cholesky factorization using Tile16x16. Three kernels compared: 1. Baseline: scalar Cholesky-Crout, 64 threads, shared memory, 2*N+1 sequential syncs. Thread 0 computes each diagonal, remaining threads parallelize off-diagonal updates. -2. Blocked: 6x6 grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal blocks. Same blocking - structure as Tile16x16 but all data lives in shared memory with block.sync() between every step. +2. Blocked: ceil(N/16) x ceil(N/16) grid of 16x16 tiles, 16 threads, shared memory, scalar Crout for diagonal + blocks. Same blocking structure as Tile16x16 but all data lives in shared memory with block.sync() between + every step. 3. Tile16x16: same blocked structure but fully register-resident via Tile16x16. No shared memory, zero syncs. Prior tiles read from global memory (L2). 
@@ -20,22 +21,37 @@ tile16 (Tile16x16, no shared memory) 16 533 5.19x Usage: - python misc/demos/cholesky_blocked.py + python misc/demos/cholesky_blocked.py [--n N] [--n-envs N_ENVS] [--num-warmup WARMUP] [--num-iters ITERS] """ +import argparse import time import numpy as np import quadrants as qd -N = 92 + +def _parse_args(): + p = argparse.ArgumentParser( + description="Blocked Cholesky NxN benchmark (3 kernels: baseline / blocked / tile16).", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + p.add_argument("--n", type=int, default=92, help="Matrix dimension N (NxN SPD).") + p.add_argument("--n-envs", type=int, default=4096, help="Number of independent environments.") + p.add_argument("--num-warmup", type=int, default=50, help="Warmup iterations per kernel.") + p.add_argument("--num-iters", type=int, default=200, help="Timed iterations per kernel.") + return p.parse_args() + + +_args = _parse_args() +N = _args.n TILE = 16 -N_BLOCKS = (N + TILE - 1) // TILE # 6 -N_PADDED = N_BLOCKS * TILE # 96, rounded up for blocked kernel SharedArrays -N_ENVS = 4096 -WARMUP = 50 -ITERS = 200 +N_BLOCKS = (N + TILE - 1) // TILE +N_PADDED = N_BLOCKS * TILE # rounded up for blocked kernel SharedArrays +N_ENVS = _args.n_envs +WARMUP = _args.num_warmup +ITERS = _args.num_iters qd.init(arch=qd.gpu) diff --git a/tests/pytest.ini b/tests/pytest.ini index 5ee5ec16b2..efaf40e6c6 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -3,3 +3,5 @@ markers = run_in_serial: mark test to run serially(usually for resource intensive tests). sm70: Can only run on GPU with compute capability 7.0 or higher. needs_torch: mark test as requiring PyTorch. + slow: mark test (or parametrize case) as slow. Skipped by default by tests/run_tests.py; + pass --run-slow to include them, or directly `pytest -m slow` to run only the slow ones. 
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py index cd6bb32a04..8fd3c56d56 100644 --- a/tests/python/test_ad_gdar_diffmpm.py +++ b/tests/python/test_ad_gdar_diffmpm.py @@ -5,14 +5,25 @@ from tests import test_utils +# Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay stays cheap; the slow-marked +# entry keeps the original (N=30, n_grid=120, steps=32) workload that runs on --run-slow. The point of the test is +# that the AD-validation checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which happens +# on the first substep regardless of size. +@pytest.mark.parametrize( + "particles_side,n_grid_size,num_steps", + [ + (8, 32, 4), + pytest.param(30, 120, 32, marks=pytest.mark.slow), + ], +) @test_utils.test(require=qd.extension.assertion, debug=True) -def test_gdar_mpm(): +def test_gdar_mpm(particles_side, n_grid_size, num_steps): real = qd.f32 dim = 2 - N = 30 # reduce to 30 if run out of GPU memory + N = particles_side n_particles = N * N - n_grid = 120 + n_grid = n_grid_size dx = 1 / n_grid inv_dx = 1 / dx dt = 3e-4 @@ -21,8 +32,8 @@ def test_gdar_mpm(): E = 100 mu = E la = E - max_steps = 32 - steps = 32 + max_steps = num_steps + steps = num_steps gravity = 9.8 target = [0.3, 0.6] diff --git a/tests/python/test_algorithms.py b/tests/python/test_algorithms.py index e4b4ac9960..508732ce3b 100644 --- a/tests/python/test_algorithms.py +++ b/tests/python/test_algorithms.py @@ -320,86 +320,79 @@ def _rand_reduce_host(rng, dtype, N, *, bound=1000): return rng.integers(-bound, bound, size=N, dtype=np_dt) -@pytest.mark.parametrize("N", _REDUCE_SIZES) -@pytest.mark.parametrize("dtype", _REDUCE_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_reduce_add(dtype, N): - """device_reduce_add matches numpy.sum across the full size sweep + dtype set.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - host 
= _rand_reduce_host(rng, dtype, N) - _fill_field(inp, host) +_REDUCE_OPS = ["add", "min", "max"] - qd.algorithms.device_reduce_add(inp, out=out) - got = out.to_numpy()[0] +def _reduce_host(rng, op, dtype, N): + """Generate the test input for a reduce of `op` on `dtype` x N values. + + ``add`` uses small uniform / bounded values so float sums stay representable; ``min`` and ``max`` use a wider + range (-10..10 for floats, +-10000 for ints) since picking-an-element is bitwise-exact regardless of magnitude. + """ + if op == "add": + return _rand_reduce_host(rng, dtype, N) if _is_float(dtype): - expected = float(np.sum(host.astype(np.float64))) - rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) - assert math.isclose( - got, expected, rel_tol=rtol, abs_tol=atol - ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}" - else: - # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the - # u32 / u64 mod-wrap case at large N. - mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None - ref = int( - np.sum(host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64))) - ) # noqa: E501 - got_int = int(got) - if mod is not None: - ref &= mod - 1 - got_int &= mod - 1 - assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}" + return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) + return _rand_reduce_host(rng, dtype, N, bound=10000) -@pytest.mark.parametrize("N", _REDUCE_SIZES) -@pytest.mark.parametrize("dtype", _REDUCE_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_reduce_min(dtype, N): - """device_reduce_min(identity=type-positive-extreme) matches numpy.min.""" +def _check_reduce(op, dtype, N): + """Run ``device_reduce_(arr)`` and verify against ``numpy.(arr)``. 
+ + ``add`` accumulates so it needs (a) wider integer promotion + mod-wrap masking for u32/u64 and (b) per-N float + tolerance. ``min`` / ``max`` pick one input element, so they're bitwise-exact for both ints and floats. + """ _skip_if_dtype_unsupported(dtype) inp, out = _alloc_input_out(dtype, N) rng = np.random.default_rng(seed=1234) - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) + host = _reduce_host(rng, op, dtype, N) _fill_field(inp, host) - qd.algorithms.device_reduce_min(inp, out=out) + qd_fn = getattr(qd.algorithms, f"device_reduce_{op}") + qd_fn(inp, out=out) got = out.to_numpy()[0] - expected = host.min() + if op == "add": + if _is_float(dtype): + expected = float(np.sum(host.astype(np.float64))) + rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) + assert math.isclose( + got, expected, rel_tol=rtol, abs_tol=atol + ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}" + else: + # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the + # u32 / u64 mod-wrap case at large N. 
+ mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None + ref = int( + np.sum( + host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64)) + ) + ) # noqa: E501 + got_int = int(got) + if mod is not None: + ref &= mod - 1 + got_int &= mod - 1 + assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}" + return + + expected = host.min() if op == "min" else host.max() if _is_float(dtype): assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12) else: - assert int(got) == int(expected), f"{dtype} reduce_min(N={N}): got {got}, expected {expected}" + assert int(got) == int(expected), f"{dtype} reduce_{op}(N={N}): got {got}, expected {expected}" +@pytest.mark.parametrize("op", _REDUCE_OPS) @pytest.mark.parametrize("N", _REDUCE_SIZES) @pytest.mark.parametrize("dtype", _REDUCE_DTYPES) @test_utils.test(arch=qd.gpu) -def test_device_reduce_max(dtype, N): - """device_reduce_max(identity=type-negative-extreme) matches numpy.max.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) - _fill_field(inp, host) - - qd.algorithms.device_reduce_max(inp, out=out) - got = out.to_numpy()[0] - expected = host.max() +def test_device_reduce(op, dtype, N): + """``device_reduce_{add,min,max}`` match numpy across the full size sweep + dtype set. - if _is_float(dtype): - assert got == pytest.approx(expected, abs=1e-6 if dtype == qd.f32 else 1e-12) - else: - assert int(got) == int(expected), f"{dtype} reduce_max(N={N}): got {got}, expected {expected}" + Unified across the three op variants. ``add`` accumulates so it needs overflow / precision-aware comparison; + ``min`` / ``max`` pick one element of the input and are bitwise-exact. 
+ """ + _check_reduce(op, dtype, N) @test_utils.test(arch=qd.gpu) @@ -454,101 +447,80 @@ def _scan_dtype_mask(dtype): return -1 -@pytest.mark.parametrize("N", _SCAN_SIZES) -@pytest.mark.parametrize("dtype", _SCAN_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_exclusive_scan_add(dtype, N): - """device_exclusive_scan_add(out[i] = sum(arr[0:i])) matches numpy.cumsum-shifted across the full 6-dtype set.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_scan_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - host = _rand_reduce_host(rng, dtype, N, bound=100) - _fill_field(inp, host) +_SCAN_OPS = ["add", "min", "max"] - qd.algorithms.device_exclusive_scan_add(inp, out=out) - got = out.to_numpy() +def _scan_host(rng, op, dtype, N): + """Generate the test input for a scan of `op` on `dtype` x N values. Same rationale as ``_reduce_host``.""" + if op == "add": + return _rand_reduce_host(rng, dtype, N, bound=100) if _is_float(dtype): - ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]]) - rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) - np.testing.assert_allclose( - got.astype(np.float64), - ref, - rtol=rtol, - atol=atol, - err_msg=f"{dtype} scan_add(N={N})", - ) - else: - # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference; smaller - # ints can still use int64. 
- promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64 - host_wide = host.astype(promote) - ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote) - mask = _scan_dtype_mask(dtype) - got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64) - if mask != -1: - got_view = got_view & promote(mask) - ref = ref & promote(mask) - np.testing.assert_array_equal( - got_view, - ref, - err_msg=f"{dtype} scan_add(N={N})", - ) + return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype]) + return _rand_reduce_host(rng, dtype, N, bound=10000) -@pytest.mark.parametrize("N", _SCAN_SIZES) -@pytest.mark.parametrize("dtype", _SCAN_DTYPES) -@test_utils.test(arch=qd.gpu) -def test_device_exclusive_scan_min(dtype, N): - """device_exclusive_scan_min(out[i] = min(arr[0:i])) matches numpy.minimum.accumulate-shifted across the full - 6-dtype set.""" +def _check_scan(op, dtype, N): + """Run ``device_exclusive_scan_(arr)`` and verify against ``numpy..accumulate``-shifted. + + Like the reduce family, ``add`` accumulates (overflow / precision care) while ``min`` / ``max`` are + bitwise-exact in both float and int paths. 
+ """ _skip_if_dtype_unsupported(dtype) inp, out = _alloc_scan_input_out(dtype, N) rng = np.random.default_rng(seed=1234) np_dt = _DTYPE_TO_NP[dtype] - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) + host = _scan_host(rng, op, dtype, N) _fill_field(inp, host) - qd.algorithms.device_exclusive_scan_min(inp, out=out) + qd_fn = getattr(qd.algorithms, f"device_exclusive_scan_{op}") + qd_fn(inp, out=out) got = out.to_numpy() + if op == "add": + if _is_float(dtype): + ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]]) + rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL) + np.testing.assert_allclose( + got.astype(np.float64), + ref, + rtol=rtol, + atol=atol, + err_msg=f"{dtype} scan_add(N={N})", + ) + else: + # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference; + # smaller ints can still use int64. + promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64 + host_wide = host.astype(promote) + ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote) + mask = _scan_dtype_mask(dtype) + got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64) + if mask != -1: + got_view = got_view & promote(mask) + ref = ref & promote(mask) + np.testing.assert_array_equal(got_view, ref, err_msg=f"{dtype} scan_add(N={N})") + return + + np_accum = np.minimum.accumulate if op == "min" else np.maximum.accumulate + identity_table = _MIN_IDENTITY if op == "min" else _MAX_IDENTITY if _is_float(dtype): - ref = np.concatenate([[float("inf")], np.minimum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt) - atol = 0 if dtype == qd.f32 else 0 # min is bitwise-exact for monotone ops on float - np.testing.assert_allclose(got, ref, rtol=0, atol=atol, err_msg=f"{dtype} scan_min(N={N})") + identity = float("inf") if op == "min" else float("-inf") + ref = 
np.concatenate([[identity], np_accum(host.astype(np.float64))[:-1]]).astype(np_dt) + np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_{op}(N={N})") else: - ref = np.concatenate([[np_dt(_MIN_IDENTITY[dtype])], np.minimum.accumulate(host)[:-1]]).astype(np_dt) - np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_min(N={N})") + ref = np.concatenate([[np_dt(identity_table[dtype])], np_accum(host)[:-1]]).astype(np_dt) + np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_{op}(N={N})") +@pytest.mark.parametrize("op", _SCAN_OPS) @pytest.mark.parametrize("N", _SCAN_SIZES) @pytest.mark.parametrize("dtype", _SCAN_DTYPES) @test_utils.test(arch=qd.gpu) -def test_device_exclusive_scan_max(dtype, N): - """device_exclusive_scan_max(out[i] = max(arr[0:i])) matches numpy.maximum.accumulate-shifted across the full - 6-dtype set.""" - _skip_if_dtype_unsupported(dtype) - inp, out = _alloc_scan_input_out(dtype, N) - rng = np.random.default_rng(seed=1234) - np_dt = _DTYPE_TO_NP[dtype] - if _is_float(dtype): - host = rng.uniform(-10.0, 10.0, size=N).astype(np_dt) - else: - host = _rand_reduce_host(rng, dtype, N, bound=10000) - _fill_field(inp, host) - - qd.algorithms.device_exclusive_scan_max(inp, out=out) - got = out.to_numpy() - - if _is_float(dtype): - ref = np.concatenate([[float("-inf")], np.maximum.accumulate(host.astype(np.float64))[:-1]]).astype(np_dt) - np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_max(N={N})") - else: - ref = np.concatenate([[np_dt(_MAX_IDENTITY[dtype])], np.maximum.accumulate(host)[:-1]]).astype(np_dt) - np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_max(N={N})") +def test_device_exclusive_scan(op, dtype, N): + """``device_exclusive_scan_{add,min,max}`` match ``numpy.{cumsum, minimum.accumulate, maximum.accumulate}``-shifted + across the full size sweep + dtype set. 
Unified across the three op variants; same overflow vs bitwise-exact + handling as the reduce family.""" + _check_scan(op, dtype, N) @test_utils.test(arch=qd.gpu) diff --git a/tests/python/test_clear_all_gradients.py b/tests/python/test_clear_all_gradients.py index 615ade9b0b..22c649a979 100644 --- a/tests/python/test_clear_all_gradients.py +++ b/tests/python/test_clear_all_gradients.py @@ -1,9 +1,12 @@ +import pytest + import quadrants as qd from quadrants.lang import impl from tests import test_utils +@pytest.mark.slow @test_utils.test(exclude=[qd.vulkan]) def test_clear_all_gradients(): x = qd.field(qd.f32) diff --git a/tests/python/test_eig.py b/tests/python/test_eig.py index 53647a6eef..a8b5153dd6 100644 --- a/tests/python/test_eig.py +++ b/tests/python/test_eig.py @@ -295,7 +295,7 @@ def run(): np.testing.assert_allclose(A_reconstructed, A_np, rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [4, 5, 6, 9, 12]) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [ @@ -311,7 +311,7 @@ def test_sym_eig_general_f32(n, factory): _test_sym_eig_general(n, qd.f32, factory) -@pytest.mark.parametrize("n", [4, 5, 6, 9, 12]) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [ @@ -358,7 +358,7 @@ def run(): np.testing.assert_allclose(A_spd_qd, expected, rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd], @@ -368,7 +368,7 @@ def test_make_spd_f32(n, factory): _test_make_spd(n, qd.f32, factory) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_random, _sym_eig_factory_spd], @@ -404,7 
+404,7 @@ def run(): np.testing.assert_allclose(Q.T @ Q, np.eye(n), rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize("alpha", [0.0, 1.0, -2.5]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_sym_eig_alpha_identity_f64(n, alpha): @@ -445,7 +445,7 @@ def project(src: qd.types.NDArray[mat_t, 1], dst: qd.types.NDArray[mat_t, 1]): ) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_sym_eig_factory_indefinite, _sym_eig_factory_negative_definite, _sym_eig_factory_spd], @@ -455,7 +455,7 @@ def test_make_spd_idempotent_f64(n, factory): _test_make_spd_idempotent(n, qd.f64, factory) -@pytest.mark.parametrize("n", [4, 6, 9, 12]) +@pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_make_spd_negative_definite_zero_f64(n): """A symmetric matrix with all-negative eigenvalues projects to the zero matrix (``Q · diag(max(λ, 0)) · Qᵀ`` @@ -535,13 +535,13 @@ def run(): ), f"column {i} is not the eigenvector of eigvals[{i}]={eigvals_qd[i]}: residual={residual}" -@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12]) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_sym_eig_sort_order_f32(n): _test_sym_eig_sort_order(n, qd.f32) -@pytest.mark.parametrize("n", [2, 3, 4, 6, 9, 12]) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_sym_eig_sort_order_f64(n): _test_sym_eig_sort_order(n, qd.f64) diff --git a/tests/python/test_linalg.py 
b/tests/python/test_linalg.py index dfa31495bc..59925ee2ce 100644 --- a/tests/python/test_linalg.py +++ b/tests/python/test_linalg.py @@ -154,13 +154,13 @@ def run(): assert out_self[None] == test_utils.approx(A.to_numpy().__pow__(2).sum(), rel=tol, abs=tol) -@pytest.mark.parametrize("n", [2, 3, 6, 9, 12]) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_frobenius_inner_f32(n): _test_frobenius_inner(n, qd.f32) -@pytest.mark.parametrize("n", [2, 3, 6, 9, 12]) +@pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)]) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_frobenius_inner_f64(n): _test_frobenius_inner(n, qd.f64) @@ -189,36 +189,52 @@ def run(): assert out[None] == test_utils.approx(expected, rel=tol, abs=tol) -@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)]) +@pytest.mark.parametrize( + "rows,cols", + [ + pytest.param(9, 12, marks=pytest.mark.slow), + pytest.param(12, 3, marks=pytest.mark.slow), + (2, 4), + ], +) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) def test_frobenius_inner_rectangular_f32(rows, cols): _test_frobenius_inner_rectangular(rows, cols, qd.f32) -@pytest.mark.parametrize("rows,cols", [(9, 12), (12, 3), (2, 4)]) +@pytest.mark.parametrize( + "rows,cols", + [ + pytest.param(9, 12, marks=pytest.mark.slow), + pytest.param(12, 3, marks=pytest.mark.slow), + (2, 4), + ], +) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) def test_frobenius_inner_rectangular_f64(rows, cols): _test_frobenius_inner_rectangular(rows, cols, qd.f64) -def _test_matmul_chain(dt): - """3-way matmul chain at qipc IPC sizes: (9×12) · (12×12) · (12×9) → (9×9). 
+def _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, dt): + """3-way matmul chain: ``(rows_a × cols_a) · (cols_a × cols_b) · (cols_b × cols_c) → (rows_a × cols_c)``. - Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the largest size qipc needs. Quadrants - imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces ~1296 FMAs per - intermediate, so this test catches compile-time blow-up or back-end miscompiles at large sizes. + Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the requested size. Quadrants + imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces + ``rows_a * cols_a * cols_b + rows_a * cols_b * cols_c`` FMAs per kernel call, so this test catches compile-time + blow-up or back-end miscompiles at large sizes. The largest parametrize value is the chain qipc actually uses; + smaller values are cheap sanity checks that the same code path still works. 
""" np_dt = np.float32 if dt == qd.f32 else np.float64 - A_np = np.random.default_rng(0xCA70).standard_normal((9, 12)).astype(np_dt) - B_np = np.random.default_rng(0xCA71).standard_normal((12, 12)).astype(np_dt) - C_np = np.random.default_rng(0xCA72).standard_normal((12, 9)).astype(np_dt) + A_np = np.random.default_rng(0xCA70).standard_normal((rows_a, cols_a)).astype(np_dt) + B_np = np.random.default_rng(0xCA71).standard_normal((cols_a, cols_b)).astype(np_dt) + C_np = np.random.default_rng(0xCA72).standard_normal((cols_b, cols_c)).astype(np_dt) - A = qd.Matrix.field(9, 12, dtype=dt, shape=()) - B = qd.Matrix.field(12, 12, dtype=dt, shape=()) - C = qd.Matrix.field(12, 9, dtype=dt, shape=()) - AB = qd.Matrix.field(9, 12, dtype=dt, shape=()) - ABC_chained = qd.Matrix.field(9, 9, dtype=dt, shape=()) - ABC_staged = qd.Matrix.field(9, 9, dtype=dt, shape=()) + A = qd.Matrix.field(rows_a, cols_a, dtype=dt, shape=()) + B = qd.Matrix.field(cols_a, cols_b, dtype=dt, shape=()) + C = qd.Matrix.field(cols_b, cols_c, dtype=dt, shape=()) + AB = qd.Matrix.field(rows_a, cols_b, dtype=dt, shape=()) + ABC_chained = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=()) + ABC_staged = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=()) A.from_numpy(A_np) B.from_numpy(B_np) @@ -241,14 +257,25 @@ def run(): np.testing.assert_allclose(ABC_chained.to_numpy(), ABC_staged.to_numpy(), rtol=tol, atol=tol) +# qipc's actual size is (9,12,12,9) -- the largest chain it instantiates. We also keep a tiny (3,4,4,3) chain so +# the default fast lane still exercises the same Matrix.__matmul__ codegen path without paying the ~90s/case +# CUDA JIT cost of the qipc-sized chain. 
+_MATMUL_CHAIN_SHAPES = [ + (3, 4, 4, 3), + pytest.param(9, 12, 12, 9, marks=pytest.mark.slow), +] + + +@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES) @test_utils.test(arch=qd.gpu, default_fp=qd.f32, fast_math=False) -def test_matmul_chain_qipc_sizes_f32(): - _test_matmul_chain(qd.f32) +def test_matmul_chain_qipc_sizes_f32(rows_a, cols_a, cols_b, cols_c): + _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f32) +@pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES) @test_utils.test(require=qd.extension.data64, arch=qd.gpu, default_fp=qd.f64, fast_math=False) -def test_matmul_chain_qipc_sizes_f64(): - _test_matmul_chain(qd.f64) +def test_matmul_chain_qipc_sizes_f64(rows_a, cols_a, cols_b, cols_c): + _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f64) @test_utils.test() @@ -434,7 +461,7 @@ def run(): np.testing.assert_allclose(M @ inv_np, np.eye(n_), rtol=tol, atol=tol) -@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12]) +@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required], @@ -444,7 +471,7 @@ def test_inverse_large_f32(n, factory): _test_inverse_at_size(n, qd.f32, factory) -@pytest.mark.parametrize("n", [5, 6, 7, 8, 9, 10, 11, 12]) +@pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)]) @pytest.mark.parametrize( "factory", [_inverse_diagonally_dominant, _inverse_spd, _inverse_pivoting_required], diff --git a/tests/python/test_mpm88.py b/tests/python/test_mpm88.py index 725ff17ac9..d758b65f9d 100644 --- a/tests/python/test_mpm88.py +++ b/tests/python/test_mpm88.py @@ -7,6 +7,7 @@ from tests import test_utils +@pytest.mark.slow @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test") @pytest.mark.run_in_serial @test_utils.test() @@ -108,6 +109,7 @@ def _is_appveyor(): return os.getenv("APPVEYOR", "").lower() == "true" 
+@pytest.mark.slow @pytest.mark.skipif(os.environ.get("QD_LITE_TEST") or "0", reason="Lite test") @pytest.mark.run_in_serial @test_utils.test() diff --git a/tests/python/test_reset_ndarrays.py b/tests/python/test_reset_ndarrays.py index bc048ac92d..a42fd921f1 100644 --- a/tests/python/test_reset_ndarrays.py +++ b/tests/python/test_reset_ndarrays.py @@ -8,6 +8,7 @@ from tests import test_utils +@pytest.mark.slow @test_utils.test(arch=[qd.cpu]) def test_ndarray_doesnt_crash_on_gc() -> None: if sys.platform != "darwin": diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 95e3438e41..8c44a40bf9 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -887,81 +887,57 @@ def _ref_reduce_max(values): return max(values) -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_add(dtype, sg_per_block): - """Block sum-reduce: thread 0 of each block holds `sum(src[block_base:block_base+block_dim])`.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=NUM_BLOCKS) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - tid = i % block_dim - agg = block.reduce_add(src[i], block_dim, dtype) - if tid == 0: - dst[i // block_dim] = agg - - _init_field(src, N, dtype) - foo() - - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_add(block_vals) - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}" - else: - assert abs(dst[b] - expected) < 1e-4 * abs(expected), f"block {b}: got {dst[b]}, expected {expected}" +# The three single-output reduces (`test_block_reduce_{add,min,max}`) and their three broadcast 
siblings +# (`test_block_reduce_all_{add,min,max}`) share the same kernel skeleton, parametrize axes, and verification loop; +# they differ only in (a) which `block.reduce_*` function gets called, (b) the host-side reference oracle, (c) the +# init pattern (sequential for `add` so the running sum has signal, permuted hash for `min` / `max` so the result +# depends on lanes other than first / last), and (d) the float tolerance regime (`add` accumulates so it uses a +# relative tol; `min` / `max` pick one element of the input and use an absolute tol). +_BLOCK_REDUCE_OP_CASES = [ + # (op_name, ref_fn, init_permuted, tol_relative) + pytest.param("add", _ref_reduce_add, False, True, id="add"), + pytest.param("min", _ref_reduce_min, True, False, id="min"), + pytest.param("max", _ref_reduce_max, True, False, id="max"), +] -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_min(dtype, sg_per_block): - """Block min-reduce: thread 0 of each block holds `min(src[block_base:block_base+block_dim])`.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=NUM_BLOCKS) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) +def _init_block_reduce_src(src, N, dtype, *, permuted): + """Initialize ``src[0:N]`` for a block reduce test. 
``permuted=False`` is the sequential ``1..N`` init from + ``_init_field`` (good for add); ``permuted=True`` is the stable hash ``((i * 1009) % 997) + 1`` so the per-block + min / max depends on lanes other than first / last.""" + if permuted: for i in range(N): - tid = i % block_dim - agg = block.reduce_min(src[i], block_dim, dtype) - if tid == 0: - dst[i // block_dim] = agg + v = ((i * 1009) % 997) + 1 + src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + else: + _init_field(src, N, dtype) - # Permuted (non-monotone) initialisation so the min depends on lanes other than the first / last. - for i in range(N): - v = ((i * 1009) % 997) + 1 # in [1, 997]; stable hash, no collisions w/ block_dim values up to 256 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_min(block_vals) - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}" - else: - assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}" +def _assert_block_reduce_close(actual, expected, dtype, *, tol_relative, ctx): + """Assert ``actual ~= expected`` per the block-reduce tolerance regime. + Int dtypes compare exactly. Floats use relative tolerance ``1e-4 * |expected|`` for accumulating ops (sums grow + with block_dim, so a relative bound is the only thing that stays meaningful across the 32 / 128 / 256 / 64 / 256 / + 512 block-size sweep), and absolute tolerance ``1e-5`` for picker ops (min / max pick one element so the + magnitude is whatever was in the input -- a small absolute bound suffices). 
+ if dtype in _BLOCK_REDUCE_INT_DTYPES: + assert actual == expected, f"{ctx}: got {actual}, expected {expected}" + elif tol_relative: + assert abs(actual - expected) < 1e-4 * abs(expected), f"{ctx}: got {actual}, expected {expected}" + else: + assert abs(actual - expected) < 1e-5, f"{ctx}: got {actual}, expected {expected}" + +@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_reduce_max(dtype, sg_per_block): - """Block max-reduce: thread 0 of each block holds `max(src[block_base:block_base+block_dim])`.""" +def test_block_reduce(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative): + """Block reduce: thread 0 of each block holds ``<op>(src[block_base:block_base+block_dim])``. Unified across + ``add`` / ``min`` / ``max`` -- op-name is closure-captured into ``@qd.kernel``.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"reduce_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -973,34 +949,29 @@ def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): tid = i % block_dim - agg = block.reduce_max(src[i], block_dim, dtype) + agg = op_fn(src[i], block_dim, dtype) if tid == 0: dst[i // block_dim] = agg - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + _init_block_reduce_src(src, N, dtype, permuted=init_permuted) foo() for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_max(block_vals) - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert dst[b] == expected, f"block {b}: got {dst[b]}, expected {expected}" - else: - assert abs(dst[b] - expected) < 1e-5, f"block {b}: got {dst[b]}, expected {expected}" + expected = ref_fn(block_vals) +
_assert_block_reduce_close(dst[b], expected, dtype, tol_relative=tol_relative, ctx=f"block {b}") +@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_reduce_all_add(dtype, sg_per_block): - """Block sum-reduce broadcast: every thread of each block holds the block-wide sum. - - Verifies the broadcast variant by writing the per-thread output to a flat field, then asserting every thread of a - given block reads the same aggregate. - """ +def test_block_reduce_all(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative): + """Block reduce broadcast: every thread of each block holds the block-wide ``<op>``. Verified by writing the + per-thread output to a flat field, then asserting every thread of a given block reads the same aggregate. + Unified across ``add`` / ``min`` / ``max``.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"reduce_all_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -1011,90 +982,17 @@ def test_block_reduce_all_add(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = block.reduce_all_add(src[i], block_dim, dtype) + dst[i] = op_fn(src[i], block_dim, dtype) - _init_field(src, N, dtype) + _init_block_reduce_src(src, N, dtype, permuted=init_permuted) foo() for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_add(block_vals) + expected = ref_fn(block_vals) for j in range(block_dim): actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}" - else: - assert abs(actual - expected) < 1e-4 * abs( - expected - ), f"block {b} thread {j}: got {actual}, expected
{expected}" - - -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_all_min(dtype, sg_per_block): - """Block min-reduce broadcast: every thread reads the block-wide min.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.reduce_all_min(src[i], block_dim, dtype) - - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_min(block_vals) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}" - else: - assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}" - - -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_reduce_all_max(dtype, sg_per_block): - """Block max-reduce broadcast: every thread reads the block-wide max.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.reduce_all_max(src[i], block_dim, dtype) - - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - - for b in 
range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_reduce_max(block_vals) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected, f"block {b} thread {j}: got {actual}, expected {expected}" - else: - assert abs(actual - expected) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected}" + _assert_block_reduce_close(actual, expected, dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}") # --- Block scan tests ------------------------------------------------------------------ @@ -1147,46 +1045,45 @@ def _ref_exclusive_scan_op(values, op, identity): return out -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_inclusive_add(dtype, sg_per_block): - """Block inclusive prefix sum: thread `i` holds `sum(src[block_base..i])`.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.inclusive_add(src[i], block_dim, dtype) +# The four scan tests in this group (`test_block_inclusive_{add,min,max}` + `test_block_exclusive_add`) share the +# kernel skeleton; only the per-op reference oracle, init pattern, and float tolerance differ. `add` accumulates +# (sequential init, relative tol); `min` / `max` pick (permuted init, absolute tol). Exclusive `min` / `max` get +# their own dedicated test below because they need a dtype-derived sentinel identity (+inf / iinfo(max), -inf / +# iinfo(min)) at lane 0 with explicit ``isinf`` handling -- different enough that fusing them in would create more +# branches than it removes. 
+_PY_MIN = lambda a, b: a if a < b else b # noqa: E731 (intentional 1-line lambda for ref oracle) +_PY_MAX = lambda a, b: a if a > b else b # noqa: E731 + +_BLOCK_INCLUSIVE_SCAN_OP_CASES = [ + # (op_name, ref_fn, init_permuted, tol_relative) + pytest.param("add", _ref_inclusive_scan_add, False, True, id="add"), + pytest.param("min", lambda vals: _ref_inclusive_scan_op(vals, _PY_MIN, 0), True, False, id="min"), + pytest.param("max", lambda vals: _ref_inclusive_scan_op(vals, _PY_MAX, 0), True, False, id="max"), +] - _init_field(src, N, dtype) - foo() - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_inclusive_scan_add(block_vals) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-4 * abs( - expected[j] + 1.0 - ), f"block {b} thread {j}: got {actual}, expected {expected[j]}" +def _assert_block_scan_close(actual, expected_j, dtype, *, tol_relative, ctx): + """Per-thread assertion for block scan tests. 
Same int / relative-float / absolute-float regime as + ``_assert_block_reduce_close`` but with a floor on the relative-tol base so the first few prefixes (where + ``expected_j`` is near zero) don't tighten the bound to zero.""" + if dtype in _BLOCK_REDUCE_INT_DTYPES: + assert actual == expected_j, f"{ctx}: got {actual}, expected {expected_j}" + elif tol_relative: + tol_base = abs(expected_j) if abs(expected_j) > 1.0 else 1.0 + assert abs(actual - expected_j) < 1e-4 * tol_base, f"{ctx}: got {actual}, expected {expected_j}" + else: + assert abs(actual - expected_j) < 1e-5, f"{ctx}: got {actual}, expected {expected_j}" +@pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_INCLUSIVE_SCAN_OP_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_exclusive_add(dtype, sg_per_block): - """Block exclusive prefix sum: thread `i` holds `sum(src[block_base..i-1])`; thread 0 holds 0.""" +def test_block_inclusive(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative): + """Block inclusive prefix scan: thread ``i`` holds ``(src[block_base..i])``. 
Unified across ``add`` / ``min`` + / ``max``.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"inclusive_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -1197,31 +1094,24 @@ def test_block_exclusive_add(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = block.exclusive_add(src[i], block_dim, dtype) + dst[i] = op_fn(src[i], block_dim, dtype) - _init_field(src, N, dtype) + _init_block_reduce_src(src, N, dtype, permuted=init_permuted) foo() for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_exclusive_scan_add(block_vals) + expected = ref_fn(block_vals) for j in range(block_dim): actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - # First thread's expected is 0; gate the relative tolerance so it doesn't blow up. 
- tol_base = abs(expected[j]) if abs(expected[j]) > 1.0 else 1.0 - assert ( - abs(actual - expected[j]) < 1e-4 * tol_base - ), f"block {b} thread {j}: got {actual}, expected {expected[j]}" + _assert_block_scan_close(actual, expected[j], dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}") @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_inclusive_min(dtype, sg_per_block): - """Block inclusive prefix min.""" +def test_block_exclusive_add(dtype, sg_per_block): + """Block exclusive prefix sum: thread ``i`` holds ``sum(src[block_base..i-1])``; thread 0 holds 0.""" _skip_if_f64_unsupported(dtype) block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 @@ -1233,66 +1123,37 @@ def test_block_inclusive_min(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = block.inclusive_min(src[i], block_dim, dtype) + dst[i] = block.exclusive_add(src[i], block_dim, dtype) - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + _init_field(src, N, dtype) foo() - py_min = lambda a, b: a if a < b else b # noqa: E731 (intentional 1-line lambda for ref oracle) for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_inclusive_scan_op(block_vals, py_min, 0) + expected = _ref_exclusive_scan_add(block_vals) for j in range(block_dim): actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" - - -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def 
test_block_inclusive_max(dtype, sg_per_block): - """Block inclusive prefix max.""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.inclusive_max(src[i], block_dim, dtype) + _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}") - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - py_max = lambda a, b: a if a > b else b # noqa: E731 - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_inclusive_scan_op(block_vals, py_max, 0) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" +_BLOCK_EXCLUSIVE_MINMAX_CASES = [ + # (op_name, sentinel_fn, py_op, inf_sign) + pytest.param("min", _block_exclusive_min_sentinel, _PY_MIN, 1, id="min"), + pytest.param("max", _block_exclusive_max_sentinel, _PY_MAX, -1, id="max"), +] +@pytest.mark.parametrize("op_name,sentinel_fn,py_op,inf_sign", _BLOCK_EXCLUSIVE_MINMAX_CASES) @pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) @pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) @test_utils.test(arch=qd.gpu) -def test_block_exclusive_min(dtype, sg_per_block): - """Block exclusive prefix min; thread 0 holds the dtype-derived identity (``+inf`` / ``np.iinfo(dtype).max``).""" +def test_block_exclusive_minmax(dtype, sg_per_block, op_name, sentinel_fn, py_op, inf_sign): + """Block exclusive prefix ``<op>`` for ``op in {min, max}``; thread 0 of
each block holds the dtype-derived + identity (``+inf`` / ``iinfo(dtype).max`` for min, ``-inf`` / ``iinfo(dtype).min`` for max). The float ``inf`` / + ``-inf`` lane-0 identity gets a sign-only check because ``inf - inf`` (or ``(-inf) - (-inf)``) is ``NaN`` and the + standard ``abs(diff) < tol`` compare would fail spuriously.""" _skip_if_f64_unsupported(dtype) + op_fn = getattr(block, f"exclusive_{op_name}") block_dim = sg_per_block * _arch_subgroup_size() NUM_BLOCKS = 4 N = NUM_BLOCKS * block_dim @@ -1303,25 +1164,23 @@ def test_block_exclusive_min(dtype, sg_per_block): def foo(): qd.loop_config(block_dim=block_dim) for i in range(N): - dst[i] = block.exclusive_min(src[i], block_dim, dtype) + dst[i] = op_fn(src[i], block_dim, dtype) - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v + _init_block_reduce_src(src, N, dtype, permuted=True) foo() - sentinel = _block_exclusive_min_sentinel(dtype) - py_min = lambda a, b: a if a < b else b # noqa: E731 + sentinel = sentinel_fn(dtype) for b in range(NUM_BLOCKS): block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_exclusive_scan_op(block_vals, py_min, sentinel) + expected = _ref_exclusive_scan_op(block_vals, py_op, sentinel) for j in range(block_dim): actual = dst[b * block_dim + j] if dtype in _BLOCK_REDUCE_INT_DTYPES: assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" elif math.isinf(expected[j]): - # Thread 0 of each block gets the +inf identity; ``inf - inf`` is NaN, so check by equality / sign. 
- assert math.isinf(actual) and actual > 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}" + assert math.isinf(actual) and ( + actual > 0 if inf_sign > 0 else actual < 0 + ), f"block {b} thread {j}: got {actual}, expected {expected[j]}" else: assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" @@ -1455,45 +1314,6 @@ def kern(): assert actual_ranks == ref_ranks, f"ranks mismatch (pattern={key_pattern})" -@pytest.mark.parametrize("dtype", _BLOCK_REDUCE_DTYPES) -@pytest.mark.parametrize("sg_per_block", _BLOCK_REDUCE_SG_PER_BLOCK) -@test_utils.test(arch=qd.gpu) -def test_block_exclusive_max(dtype, sg_per_block): - """Block exclusive prefix max; thread 0 holds the dtype-derived identity (``-inf`` / ``np.iinfo(dtype).min``).""" - _skip_if_f64_unsupported(dtype) - block_dim = sg_per_block * _arch_subgroup_size() - NUM_BLOCKS = 4 - N = NUM_BLOCKS * block_dim - src = qd.field(dtype=dtype, shape=N) - dst = qd.field(dtype=dtype, shape=N) - - @qd.kernel - def foo(): - qd.loop_config(block_dim=block_dim) - for i in range(N): - dst[i] = block.exclusive_max(src[i], block_dim, dtype) - - for i in range(N): - v = ((i * 1009) % 997) + 1 - src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v - foo() - - sentinel = _block_exclusive_max_sentinel(dtype) - py_max = lambda a, b: a if a > b else b # noqa: E731 - for b in range(NUM_BLOCKS): - block_vals = [src[b * block_dim + j] for j in range(block_dim)] - expected = _ref_exclusive_scan_op(block_vals, py_max, sentinel) - for j in range(block_dim): - actual = dst[b * block_dim + j] - if dtype in _BLOCK_REDUCE_INT_DTYPES: - assert actual == expected[j], f"block {b} thread {j}: got {actual}, expected {expected[j]}" - elif math.isinf(expected[j]): - # Thread 0 of each block gets the -inf identity; ``-inf - -inf`` is NaN, so check by equality / sign. 
- assert math.isinf(actual) and actual < 0, f"block {b} thread {j}: got {actual}, expected {expected[j]}" - else: - assert abs(actual - expected[j]) < 1e-5, f"block {b} thread {j}: got {actual}, expected {expected[j]}" - - @pytest.mark.parametrize("dtype", [qd.i32, qd.f32, qd.f64]) @test_utils.test(arch=qd.gpu) def test_subgroup_shuffle_broadcast(dtype): @@ -3604,94 +3424,45 @@ def _init_full_bitwise(src, n): src[i] = 1 << (i % 7) -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_add(): - _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_all_add(): - _check_full_matches_tiled(subgroup.reduce_all_add, subgroup.reduce_all_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_min(): - _check_full_matches_tiled(subgroup.reduce_min, subgroup.reduce_min_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_max(): - _check_full_matches_tiled(subgroup.reduce_max, subgroup.reduce_max_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_all_min(): - _check_full_matches_tiled(subgroup.reduce_all_min, subgroup.reduce_all_min_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_all_max(): - _check_full_matches_tiled(subgroup.reduce_all_max, subgroup.reduce_all_max_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_add(): - _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_min(): - _check_full_matches_tiled(subgroup.inclusive_min, subgroup.inclusive_min_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_max(): - _check_full_matches_tiled(subgroup.inclusive_max, subgroup.inclusive_max_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_mul(): - _check_full_matches_tiled(subgroup.inclusive_mul, subgroup.inclusive_mul_tiled, host_init=_init_full_small_int) - - 
-@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_and(): - _check_full_matches_tiled(subgroup.inclusive_and, subgroup.inclusive_and_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_or(): - _check_full_matches_tiled(subgroup.inclusive_or, subgroup.inclusive_or_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_xor(): - _check_full_matches_tiled(subgroup.inclusive_xor, subgroup.inclusive_xor_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_add(): - _check_full_matches_tiled(subgroup.exclusive_add, subgroup.exclusive_add_tiled) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_mul(): - _check_full_matches_tiled(subgroup.exclusive_mul, subgroup.exclusive_mul_tiled, host_init=_init_full_small_int) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_and(): - _check_full_matches_tiled(subgroup.exclusive_and, subgroup.exclusive_and_tiled, host_init=_init_full_bitwise) - - -@test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_or(): - _check_full_matches_tiled(subgroup.exclusive_or, subgroup.exclusive_or_tiled, host_init=_init_full_bitwise) +# Each entry is a thin ``_check_full_matches_tiled(subgroup.X, subgroup.X_tiled, ...)`` wrapper. Collapsed into one +# op-parametrized test to drop ~80 LOC of duplication. The pytest ids match the names of the original +# ``test_subgroup_<op>`` functions so test reports / `-k` selectors stay stable.
+_FULL_VS_TILED_INT_CASES = [ + pytest.param("reduce_add", None, id="reduce_add"), + pytest.param("reduce_all_add", None, id="reduce_all_add"), + pytest.param("reduce_min", None, id="reduce_min"), + pytest.param("reduce_max", None, id="reduce_max"), + pytest.param("reduce_all_min", None, id="reduce_all_min"), + pytest.param("reduce_all_max", None, id="reduce_all_max"), + pytest.param("inclusive_add", None, id="inclusive_add"), + pytest.param("inclusive_min", None, id="inclusive_min"), + pytest.param("inclusive_max", None, id="inclusive_max"), + # `mul` needs bounded inputs (2**N overflows i32 quickly); bitwise ops need a per-lane bit pattern that's + # non-zero on every lane so AND has signal and OR / XOR have varied bits. + pytest.param("inclusive_mul", _init_full_small_int, id="inclusive_mul"), + pytest.param("inclusive_and", _init_full_bitwise, id="inclusive_and"), + pytest.param("inclusive_or", _init_full_bitwise, id="inclusive_or"), + pytest.param("inclusive_xor", _init_full_bitwise, id="inclusive_xor"), + pytest.param("exclusive_add", None, id="exclusive_add"), + pytest.param("exclusive_mul", _init_full_small_int, id="exclusive_mul"), + pytest.param("exclusive_and", _init_full_bitwise, id="exclusive_and"), + pytest.param("exclusive_or", _init_full_bitwise, id="exclusive_or"), + pytest.param("exclusive_xor", _init_full_bitwise, id="exclusive_xor"), +] +@pytest.mark.parametrize("op_name,host_init", _FULL_VS_TILED_INT_CASES) @test_utils.test(arch=qd.gpu) -def test_subgroup_exclusive_xor(): - _check_full_matches_tiled(subgroup.exclusive_xor, subgroup.exclusive_xor_tiled, host_init=_init_full_bitwise) +def test_subgroup_full_matches_tiled(op_name, host_init): + """For each subgroup op ``X``, verify ``subgroup.X(v)`` matches ``subgroup.X_tiled(v, log2_group_size())`` + lane-by-lane on ``qd.i32``. 
Covers reduce / inclusive / exclusive families; bitwise ops + ``mul`` use a custom + initializer that keeps the per-lane aggregate bounded.""" + full_fn = getattr(subgroup, op_name) + tiled_fn = getattr(subgroup, f"{op_name}_tiled") + kwargs = {} + if host_init is not None: + kwargs["host_init"] = host_init + _check_full_matches_tiled(full_fn, tiled_fn, **kwargs) @test_utils.test(arch=qd.gpu) @@ -3836,16 +3607,15 @@ def k(): # accidentally cast through i32 inside a wrapper. +@pytest.mark.parametrize("op_name", ["reduce_add", "inclusive_add"]) @pytest.mark.parametrize("dtype", [qd.f32, qd.f64]) @test_utils.test(arch=qd.gpu) -def test_subgroup_reduce_add_float(dtype): - _check_full_matches_tiled(subgroup.reduce_add, subgroup.reduce_add_tiled, dtype=dtype) - - -@pytest.mark.parametrize("dtype", [qd.f32, qd.f64]) -@test_utils.test(arch=qd.gpu) -def test_subgroup_inclusive_add_float(dtype): - _check_full_matches_tiled(subgroup.inclusive_add, subgroup.inclusive_add_tiled, dtype=dtype) +def test_subgroup_full_matches_tiled_float(op_name, dtype): + """Float-dtype coverage of the dtype-agnostic ``full`` wrappers (``reduce_add``, ``inclusive_add``). 
One f32 + one + f64 case per family is enough to catch an i32-only regression in a wrapper.""" + full_fn = getattr(subgroup, op_name) + tiled_fn = getattr(subgroup, f"{op_name}_tiled") + _check_full_matches_tiled(full_fn, tiled_fn, dtype=dtype) @pytest.mark.parametrize("dtype", [qd.f32, qd.f64]) diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py index d3d6a4fbaa..de6d249970 100644 --- a/tests/python/test_struct.py +++ b/tests/python/test_struct.py @@ -62,6 +62,7 @@ def test_linear_nested_aos(): assert y[i] == i + 123 +@pytest.mark.slow @test_utils.test(exclude=[qd.vulkan]) def test_2d_nested(): x = qd.field(qd.i32) diff --git a/tests/python/test_tile16.py b/tests/python/test_tile16.py index 97480c7d1d..adf8249605 100644 --- a/tests/python/test_tile16.py +++ b/tests/python/test_tile16.py @@ -1776,8 +1776,25 @@ def write_eye_f32(dst: Ann32): @test_utils.test(arch=[qd.cuda]) def test_tile16_cholesky_blocked_demo(): - """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.""" + """Smoke-test that misc/demos/cholesky_blocked.py runs to completion. + + Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the JIT compile of the 3 unrolled kernels + and the benchmark loop both stay cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised by + anyone running the script manually, not by CI. 
+ """ demo = Path(__file__).resolve().parents[2] / "misc" / "demos" / "cholesky_blocked.py" - result = subprocess.run([sys.executable, str(demo)], capture_output=True, text=True, timeout=300) + cmd = [ + sys.executable, + str(demo), + "--n", + "32", + "--n-envs", + "64", + "--num-warmup", + "1", + "--num-iters", + "1", + ] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode != 0: pytest.fail(f"cholesky_blocked.py exited with code {result.returncode}\nstderr:\n{result.stderr}") diff --git a/tests/run_tests.py b/tests/run_tests.py index e2419add42..7276ce9d00 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -56,8 +56,14 @@ def _test_python(args, default_dir="python"): pytest_args += ["--cov-append"] if args.keys: pytest_args += ["-k", args.keys] - if args.marks: - pytest_args += ["-m", args.marks] + # By default we exclude tests marked `slow` (eig / make_spd at n>=6, inverse_large at n>=6, mpm88, etc. -- see + # tests/pytest.ini for the marker). `--run-slow` opts back in. If the user passes their own `-m` expression we + # AND `not slow` onto it so the exclusion still applies, unless they explicitly opt out via `--run-slow`. + marks_expr = args.marks + if not args.run_slow: + marks_expr = f"({marks_expr}) and not slow" if marks_expr else "not slow" + if marks_expr: + pytest_args += ["-m", marks_expr] if args.failed_first: pytest_args += ["--failed-first"] if args.fail_fast: @@ -161,7 +167,16 @@ def test(): default=None, dest="marks", type=str, - help="Only run tests with specific marks", + help="Only run tests with specific marks. `not slow` is appended automatically " "unless --run-slow is passed.", + ) + parser.add_argument( + "--run-slow", + required=False, + default=False, + dest="run_slow", + action="store_true", + help="Include tests marked `slow` (excluded by default). Has no effect if -m is " + "given an explicit expression that already mentions `slow`.", ) parser.add_argument( "-f",