From 9484be30ab0bfa4d5c83c5bf6ce74bb332f5dfbc Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 05:33:45 -0700 Subject: [PATCH 01/10] ndrange: add layout= keyword qd.ndrange(*args, layout=None) now accepts a permutation of range(N) describing the iteration nesting order, outermost (slowest-varying) first. Mirrors layout= on qd.tensor exactly: canonical-preserving (loop variables stay bound to canonical axes; only visit order changes), identity / None is the no-op default. Implementation is pure-Python and confined to two files: * python/quadrants/lang/_ndrange.py: validate layout up front (QuadrantsSyntaxError on wrong length / non-permutation), normalize identity to None, and store bounds / dimensions / acc_dimensions in *physical* order (with _physical_to_canonical capturing the per-level remap). __iter__ keeps yielding canonical-ordered tuples in the new physical visit order. * python/quadrants/lang/ast/ast_transformer.py: build_ndrange_for and build_grouped_ndrange_for assign each decomposed flat-thread index to targets[physical_to_canonical[p]] instead of targets[p]. For the identity case physical_to_canonical is range(n) and the emitted IR matches the pre-layout codegen byte-for-byte (no perf regression on existing kernels). Primary motivation: pairing qd.ndrange(layout=L) with qd.tensor(..., layout=L) lines adjacent flat threads up with adjacent physical memory slots, restoring coalesced / cache-friendly access for layout-tagged tensors while keeping canonical indexing in the body. Design doc: perso_hugh/doc/ndrange_layout.md. 
--- python/quadrants/lang/_ndrange.py | 103 +++++++++++++++---- python/quadrants/lang/ast/ast_transformer.py | 40 ++++--- 2 files changed, 111 insertions(+), 32 deletions(-) diff --git a/python/quadrants/lang/_ndrange.py b/python/quadrants/lang/_ndrange.py index 3766c4a0b0..8c96c9cefb 100644 --- a/python/quadrants/lang/_ndrange.py +++ b/python/quadrants/lang/_ndrange.py @@ -31,7 +31,7 @@ def _coerce_to_int(v): class _Ndrange: - def __init__(self, *args): + def __init__(self, *args, layout=None): args = list(args) for i, arg in enumerate(args): if not isinstance(arg, collections.abc.Sequence): @@ -49,33 +49,78 @@ def __init__(self, *args): raise QuadrantsTypeError( "Every argument of ndrange should be an integer scalar or a tuple/list of (int, int)" ) - self.bounds = args - self.dimensions = [None] * len(args) - for i, bound in enumerate(self.bounds): - self.dimensions[i] = bound[1] - bound[0] + n = len(args) + + # Validate and normalize ``layout``. Stored as ``self.layout`` (``None`` for the identity + # permutation, else the user-supplied tuple) for introspection / tests, and as + # ``self._physical_to_canonical`` (a Python list of int of length ``n``) for the AST + # builder to use when remapping per-physical-level decomposed indices to canonical loop + # targets. The identity case is kept as ``None`` so the AST-builder fast-path matches + # the pre-layout codegen byte-for-byte. 
+ if layout is None: + self.layout = None + physical_to_canonical = list(range(n)) + else: + layout_t = tuple(layout) + if len(layout_t) != n: + raise QuadrantsSyntaxError( + f"qd.ndrange(layout={layout_t!r}) has {len(layout_t)} entries " + f"but ndrange was called with {n} dimension argument(s); they must match" + ) + if sorted(layout_t) != list(range(n)): + raise QuadrantsSyntaxError( + f"qd.ndrange(layout={layout_t!r}) is not a permutation of range({n})" + ) + if layout_t == tuple(range(n)): + self.layout = None + physical_to_canonical = list(range(n)) + else: + self.layout = layout_t + physical_to_canonical = list(layout_t) - self.acc_dimensions = self.dimensions.copy() - for i in reversed(range(len(self.bounds) - 1)): - self.acc_dimensions[i] = self.acc_dimensions[i] * self.acc_dimensions[i + 1] - if len(self.acc_dimensions) == 0: # for the empty case, e.g. qd.ndrange() - self.acc_dimensions = [1] + self._physical_to_canonical = physical_to_canonical - def __iter__(self): - def gen(d, prefix): - if d == len(self.bounds): - yield prefix - else: - for t in range(self.bounds[d][0], self.bounds[d][1]): - yield from gen(d + 1, prefix + (t,)) + canonical_bounds = args + canonical_dimensions = [bound[1] - bound[0] for bound in canonical_bounds] - yield from gen(0, ()) + physical_bounds = [canonical_bounds[c] for c in physical_to_canonical] + physical_dimensions = [canonical_dimensions[c] for c in physical_to_canonical] + + acc_dimensions = physical_dimensions.copy() + for i in reversed(range(n - 1)): + acc_dimensions[i] = acc_dimensions[i] * acc_dimensions[i + 1] + if not acc_dimensions: # for the empty case, e.g. 
qd.ndrange() + acc_dimensions = [1] + + self._canonical_bounds = canonical_bounds + self._canonical_dimensions = canonical_dimensions + self.bounds = physical_bounds + self.dimensions = physical_dimensions + self.acc_dimensions = acc_dimensions + + def __iter__(self): + p2c = self._physical_to_canonical + cbounds = self._canonical_bounds + n = len(p2c) + + def gen(level, current): + if level == n: + yield tuple(current) + return + ax = p2c[level] + b, e = cbounds[ax] + for t in range(b, e): + current[ax] = t + yield from gen(level + 1, current) + + yield from gen(0, [0] * n) def grouped(self): return GroupedNDRange(self) -def ndrange(*args) -> Iterable: +def ndrange(*args, layout=None) -> Iterable: """Return an immutable iterator object for looping over multi-dimensional indices. This returned set of multi-dimensional indices is the direct product (in the set-theory sense) @@ -91,6 +136,14 @@ def ndrange(*args) -> Iterable: Args: entries: (int, tuple): Must be either an integer, or a tuple/list of two integers. + layout (tuple of int, optional): Permutation of canonical axes describing the iteration + nesting order, outermost (slowest-varying) first. For an N-argument ndrange, must be + a permutation of ``range(N)``. ``None`` (default) and the identity permutation are + equivalent and reproduce the default order in which the **last argument is the + innermost / fastest-varying axis**. The values yielded by the loop are unchanged; + only the visit order is. This is the matching iteration-order knob for the + ``layout=`` keyword on :func:`quadrants.tensor` — pair them with the same + permutation to align parallel iteration with the tensor's physical memory order. Returns: An immutable iterator object. @@ -154,8 +207,18 @@ def ndrange(*args) -> Iterable: >>> def loop_tensor(): >>> for row, col, channel in qd.ndrange(image_height, image_width, channels): >>> image[row, col, channel] = ... 
+ + Aligning iteration order with a non-default tensor layout via ``layout=``: + + >>> A = qd.tensor(qd.f32, shape=(M, N), layout=(1, 0)) # axis 1 outer, axis 0 inner + >>> @qd.kernel + >>> def fill(): + >>> # adjacent flat threads now step along axis 0 (the inner physical axis of A), + >>> # i.e. touch physically adjacent memory in A + >>> for i, j in qd.ndrange(M, N, layout=(1, 0)): + >>> A[i, j] = i + j """ - return _Ndrange(*args) + return _Ndrange(*args, layout=layout) class GroupedNDRange: diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 263a4a11a3..e59e189c0a 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -1052,24 +1052,32 @@ def build_ndrange_for(ctx: ASTTransformerFuncContext, node: ast.For) -> None: "Please check if the number of arguments of qd.ndrange() is equal to " "the number of the loop variables." ) - for i, target in enumerate(targets): - if i + 1 < len(targets): - target_tmp = impl.expr_init(I // ndrange_var.acc_dimensions[i + 1]) + # ``physical_to_canonical[p]`` is the canonical (user-visible) axis index that receives + # the decomposed index for physical nesting level ``p``. For the identity / ``layout=None`` + # case this is ``[0, 1, ..., n-1]`` and the emitted IR matches the pre-layout codegen + # byte-for-byte. 
+ physical_to_canonical = ndrange_var._physical_to_canonical + n_levels = len(ndrange_var.dimensions) + for p in range(n_levels): + if p + 1 < n_levels: + target_tmp = impl.expr_init(I // ndrange_var.acc_dimensions[p + 1]) else: target_tmp = impl.expr_init(I) + canonical_idx = physical_to_canonical[p] + target = targets[canonical_idx] ctx.create_variable( target, impl.expr_init( target_tmp + impl.subscript( ctx.ast_builder, - impl.subscript(ctx.ast_builder, ndrange_var.bounds, i), + impl.subscript(ctx.ast_builder, ndrange_var.bounds, p), 0, ) ), ) - if i + 1 < len(targets): - I._assign(I - target_tmp * ndrange_var.acc_dimensions[i + 1]) + if p + 1 < n_levels: + I._assign(I - target_tmp * ndrange_var.acc_dimensions[p + 1]) ctx.loop_depth += 1 build_stmts(ctx, node.body) ctx.loop_depth -= 1 @@ -1098,14 +1106,22 @@ def build_grouped_ndrange_for(ctx: ASTTransformerFuncContext, node: ast.For) -> ctx.create_variable(target, target_var) I = impl.expr_init(ndrange_loop_var) - for i in range(len(ndrange_var.dimensions)): - if i + 1 < len(ndrange_var.dimensions): - target_tmp = I // ndrange_var.acc_dimensions[i + 1] + # See ``build_ndrange_for`` above for the layout semantics. The grouped target_var is a + # vector indexed by canonical axis, so element ``physical_to_canonical[p]`` (not ``p``) + # receives the decomposition of physical level ``p``. 
+ physical_to_canonical = ndrange_var._physical_to_canonical + n_levels = len(ndrange_var.dimensions) + for p in range(n_levels): + if p + 1 < n_levels: + target_tmp = I // ndrange_var.acc_dimensions[p + 1] else: target_tmp = I - impl.subscript(ctx.ast_builder, target_var, i)._assign(target_tmp + ndrange_var.bounds[i][0]) - if i + 1 < len(ndrange_var.dimensions): - I._assign(I - target_tmp * ndrange_var.acc_dimensions[i + 1]) + canonical_idx = physical_to_canonical[p] + impl.subscript(ctx.ast_builder, target_var, canonical_idx)._assign( + target_tmp + ndrange_var.bounds[p][0] + ) + if p + 1 < n_levels: + I._assign(I - target_tmp * ndrange_var.acc_dimensions[p + 1]) ctx.loop_depth += 1 build_stmts(ctx, node.body) ctx.loop_depth -= 1 From 263bbd1728f48760136ee6432456e659865c962f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 05:33:57 -0700 Subject: [PATCH 02/10] docs: document qd.ndrange(layout=...) New subsection in parallelization.md after the qd.grouped section, with a worked example pairing qd.tensor(layout=(1, 0)) with qd.ndrange(layout=(1, 0)) to align iteration order with physical memory order. Notes that the layout permutation has the exact same meaning as on qd.tensor (canonical axis index at each successive nesting level, outermost first), and that loop variables remain bound to canonical axes regardless of layout. Cross-reference added from the "Controlling physical layout" section of tensor.md so users discovering qd.tensor(layout=...) find the matching iteration-order knob. 
--- docs/source/user_guide/parallelization.md | 32 +++++++++++++++++++++++ docs/source/user_guide/tensor.md | 2 ++ 2 files changed, 34 insertions(+) diff --git a/docs/source/user_guide/parallelization.md b/docs/source/user_guide/parallelization.md index 9d7ebb3845..be8b58f1cb 100644 --- a/docs/source/user_guide/parallelization.md +++ b/docs/source/user_guide/parallelization.md @@ -48,6 +48,38 @@ def fill(a: qd.Template) -> None: `I` is a `qd.Vector` with one element per dimension. +### Controlling iteration order with `layout=` + +By default, `qd.ndrange(d0, d1, ..., dN-1)` makes the **last argument the innermost (fastest-varying) axis** in the flat parallel loop: adjacent flat threads differ in the last index. This pairs naturally with a default-layout tensor. + +For a tensor allocated with a non-default `layout=` (see [`tensor`](tensor.md#controlling-physical-layout)), the matching iteration order is different — the inner physical axis is no longer the last canonical axis. The `layout=` keyword on `qd.ndrange` lets you align the iteration order with the tensor's physical layout, while keeping canonical indexing in the loop body: + +```python +A = qd.tensor(qd.f32, shape=(M, N), layout=(1, 0)) # axis 1 outer, axis 0 inner + +@qd.kernel +def fill(): + # iterate with axis 1 outer, axis 0 inner — adjacent flat threads now step along axis 0 + # in canonical space, which is the inner physical axis of A, so they touch physically + # adjacent memory in A. + for i, j in qd.ndrange(M, N, layout=(1, 0)): + A[i, j] = i + j +``` + +`layout` works exactly like `layout=` on `qd.tensor`: a tuple of `int` listing the **canonical axis index at each successive iteration-nesting level, outermost first**. It must be a permutation of `range(N)` where `N` is the number of arguments to `qd.ndrange`. The yielded loop variables (`i`, `j`, ...) are still bound to canonical axes 0, 1, ... — only the visit order changes. 
`layout=None` (the default) and the identity permutation `(0, 1, ..., N-1)` are equivalent and reproduce the default last-argument-innermost order. + +`layout=` is supported by both the plain and `qd.grouped` forms: + +```python +for i, j in qd.ndrange(M, N, layout=(1, 0)): + ... +for I in qd.grouped(qd.ndrange(M, N, layout=(1, 0))): + # I[0] is still the canonical axis-0 index, regardless of layout + ... +``` + +Mismatched length and non-permutation values are rejected up front with `qd.QuadrantsSyntaxError`. + ## Does GPU kernel launch latency matter? Kernel launch can be done in parallel whilst the previously launched kernel is still running. This means that if the previously launched kernel takes longer to run than the launch time for the new kernel, then the kernel launch latency will be perfectly hidden. diff --git a/docs/source/user_guide/tensor.md b/docs/source/user_guide/tensor.md index f4b0ab25c3..8063ff524f 100644 --- a/docs/source/user_guide/tensor.md +++ b/docs/source/user_guide/tensor.md @@ -113,6 +113,8 @@ b[i, j] = ... # canonical indexing in kernels still works Any permutation is supported, up to Quadrants' `quadrants_max_num_indices` (currently 12). `layout=None` and the identity permutation (`(0, 1, ..., N-1)`) are equivalent and forward no permutation to the underlying allocator. +For best performance, pair `qd.tensor(..., layout=...)` with a matching iteration order via `qd.ndrange(..., layout=...)` (see [`parallelization`](parallelization.md#controlling-iteration-order-with-layout)): the permutation has the same meaning on both APIs, and using the same value on both aligns adjacent flat threads with adjacent physical memory slots. 
+ Quadrants rejects mismatched / invalid layouts up front: ```python From 83ab414b46643dd23b5606e581f3fd25290ea04b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 05:33:57 -0700 Subject: [PATCH 03/10] test: add test_ndrange_layout Covers: * layout=None / identity equivalence with the no-keyword form (in-kernel, field backend) * non-identity layouts (rank 2 transposed, rank 3 with (2, 0, 1)): canonical loop targets, full coverage of the index space, preserved (begin, end) offsets on tuple bounds * flat-index decomposition matches the physical iteration order * qd.grouped(...) returns canonical indices regardless of layout * qd.static(qd.grouped(...)) (unrolled path) also sees canonical indices in the layout-induced order * pairing with qd.tensor(layout=...) (the documented primary use case) * Python-side iteration outside @qd.kernel, including a helper that reconstructs the expected canonical-tuple sequence from (dims, layout) * introspection: layout=None and identity both normalize self.layout to None; non-identity preserves the user-supplied tuple * degenerate 1-D and 0-D ndranges * error cases: wrong length, not a permutation, out-of-range entries --- tests/python/test_ndrange_layout.py | 377 ++++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) create mode 100644 tests/python/test_ndrange_layout.py diff --git a/tests/python/test_ndrange_layout.py b/tests/python/test_ndrange_layout.py new file mode 100644 index 0000000000..339fc86183 --- /dev/null +++ b/tests/python/test_ndrange_layout.py @@ -0,0 +1,377 @@ +"""Tests for the ``layout=`` keyword on :func:`quadrants.ndrange`. + +``layout=`` is canonical-preserving: the loop variables stay bound to canonical axes regardless of layout; +only the visit order (which canonical axis is the outermost / innermost iteration nesting level) changes. +``layout=None`` and the identity permutation are equivalent and produce the default last-arg-innermost +behaviour. 
+ +See ``perso_hugh/doc/ndrange_layout.md`` for design notes. +""" + +import itertools + +import numpy as np +import pytest + +import quadrants as qd + +from tests import test_utils + + +def _expected_flat_to_canonical(dims, layout): + """Build the expected sequence of canonical multi-indices yielded by an ``ndrange`` of the given + dimensions and layout. + + Iteration nests with physical level 0 outermost; physical level ``p`` indexes canonical axis ``layout[p]``. + """ + layout = tuple(range(len(dims))) if layout is None else tuple(layout) + ranges = [range(dims[axis]) for axis in layout] + out = [] + for physical_tuple in itertools.product(*ranges): + canonical = [0] * len(dims) + for p, ax in enumerate(layout): + canonical[ax] = physical_tuple[p] + out.append(tuple(canonical)) + return out + + +def _expected_flat_index(canonical, dims, layout): + """Return the flat thread index that visits ``canonical`` under (``dims``, ``layout``). + + Mirrors the AST-builder's decomposition: flat = sum_{p} canonical[layout[p]] * prod(dims[layout[p+1:]]). 
+ """ + layout = tuple(range(len(dims))) if layout is None else tuple(layout) + n = len(dims) + flat = 0 + for p in range(n): + ax = layout[p] + inner = 1 + for q in range(p + 1, n): + inner *= dims[layout[q]] + flat += canonical[ax] * inner + return flat + + +# ---------------------------------------------------------------------------- +# Identity / default equivalence +# ---------------------------------------------------------------------------- + + +@test_utils.test() +def test_layout_none_matches_default(): + M, N = 5, 7 + x = qd.field(qd.i32, shape=(M, N)) + y = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill_default(): + for i, j in qd.ndrange(M, N): + x[i, j] = i * 100 + j + + @qd.kernel + def fill_layout_none(): + for i, j in qd.ndrange(M, N, layout=None): + y[i, j] = i * 100 + j + + fill_default() + fill_layout_none() + np.testing.assert_array_equal(x.to_numpy(), y.to_numpy()) + + +@test_utils.test() +def test_layout_identity_matches_default(): + M, N = 5, 7 + x = qd.field(qd.i32, shape=(M, N)) + y = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill_default(): + for i, j in qd.ndrange(M, N): + x[i, j] = i * 100 + j + + @qd.kernel + def fill_layout_identity(): + for i, j in qd.ndrange(M, N, layout=(0, 1)): + y[i, j] = i * 100 + j + + fill_default() + fill_layout_identity() + np.testing.assert_array_equal(x.to_numpy(), y.to_numpy()) + + +# ---------------------------------------------------------------------------- +# Non-identity layouts: canonical loop targets, full coverage +# ---------------------------------------------------------------------------- + + +@test_utils.test() +def test_layout_2d_transposed_canonical_targets(): + """With ``layout=(1, 0)``, the loop variables (i, j) are still canonical axes 0, 1.""" + M, N = 4, 6 + x = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill(): + for i, j in qd.ndrange(M, N, layout=(1, 0)): + x[i, j] = i * 100 + j + + fill() + expected = np.array([[i * 100 + j for j in range(N)] for i in 
range(M)], dtype=np.int32) + np.testing.assert_array_equal(x.to_numpy(), expected) + + +@test_utils.test() +def test_layout_3d_arbitrary_permutation_canonical_targets(): + """Rank-3 with a 3-cycle permutation: unlike a 2-D swap, it differs from its own inverse.""" + D0, D1, D2 = 3, 4, 5 + x = qd.field(qd.i32, shape=(D0, D1, D2)) + + @qd.kernel + def fill(): + for i, j, k in qd.ndrange(D0, D1, D2, layout=(2, 0, 1)): + x[i, j, k] = i * 10000 + j * 100 + k + + fill() + expected = np.array( + [[[i * 10000 + j * 100 + k for k in range(D2)] for j in range(D1)] for i in range(D0)], + dtype=np.int32, + ) + np.testing.assert_array_equal(x.to_numpy(), expected) + + +@test_utils.test() +def test_layout_with_tuple_bounds_preserves_offsets(): + """Layout doesn't disturb (begin, end) tuples — each canonical axis keeps its own bounds.""" + M, N = 16, 16 + x = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill(): + for i, j in qd.ndrange((2, 10), (3, 7), layout=(1, 0)): + x[i, j] = i * 100 + j + + fill() + expected = np.zeros((M, N), dtype=np.int32) + for i in range(2, 10): + for j in range(3, 7): + expected[i, j] = i * 100 + j + np.testing.assert_array_equal(x.to_numpy(), expected) + + +@test_utils.test() +def test_layout_full_coverage_via_atomic_count(): + """Every canonical slot is visited exactly once.""" + M, N = 5, 7 + counts = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill(): + for i, j in qd.ndrange(M, N, layout=(1, 0)): + counts[i, j] += 1 + + fill() + np.testing.assert_array_equal(counts.to_numpy(), np.ones((M, N), dtype=np.int32)) + + +@test_utils.test() +def test_layout_flat_index_matches_decomposition(): + """The flat thread index reconstructed from the canonical loop variables under the requested + layout permutation matches what a sequential range-loop would assign — i.e. the AST decomposition + is the inverse of the canonical-from-physical mapping. 
+ """ + M, N = 4, 6 + flat = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill(): + for i, j in qd.ndrange(M, N, layout=(1, 0)): + # If physical level 0 = axis 1 (outer) and level 1 = axis 0 (inner), then the flat index + # is j * M + i. Writing it into a per-canonical-slot grid lets us check coverage and the + # bijection in one pass. + flat[i, j] = j * M + i + + fill() + expected = np.array([[j * M + i for j in range(N)] for i in range(M)], dtype=np.int32) + np.testing.assert_array_equal(flat.to_numpy(), expected) + + +# ---------------------------------------------------------------------------- +# qd.grouped + layout +# ---------------------------------------------------------------------------- + + +@test_utils.test() +def test_layout_grouped_indices_are_canonical(): + """``I[0]`` is the canonical axis-0 index regardless of layout.""" + M, N = 4, 5 + x = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill(): + for I in qd.grouped(qd.ndrange(M, N, layout=(1, 0))): + x[I] = I[0] * 100 + I[1] + + fill() + expected = np.array([[i * 100 + j for j in range(N)] for i in range(M)], dtype=np.int32) + np.testing.assert_array_equal(x.to_numpy(), expected) + + +@test_utils.test() +def test_layout_static_grouped(): + """Unrolled (qd.static) grouped path also sees canonical indices in physical iteration order.""" + M, N = 3, 4 + x = qd.field(qd.i32, shape=(M, N)) + + @qd.kernel + def fill(): + for I in qd.static(qd.grouped(qd.ndrange(M, N, layout=(1, 0)))): + x[I] = I[0] * 100 + I[1] + + fill() + expected = np.array([[i * 100 + j for j in range(N)] for i in range(M)], dtype=np.int32) + np.testing.assert_array_equal(x.to_numpy(), expected) + + +# ---------------------------------------------------------------------------- +# Pairing with qd.tensor(..., layout=...) 
+# ---------------------------------------------------------------------------- + + +@test_utils.test(arch=qd.cpu) +def test_layout_pairs_with_tensor_layout_field(): + """The documented pairing use case: matching ``layout=`` on both tensor and ndrange. The kernel + body uses canonical indexing throughout; correctness must hold (this exercises the + canonical->physical AST rewrite on the tensor side and the layout-aware decomposition on the + ndrange side together). + """ + M, N = 4, 6 + A = qd.tensor(qd.i32, shape=(M, N), backend=qd.Backend.FIELD, layout=(1, 0)) + + @qd.kernel + def fill(a: qd.template()): + for i, j in qd.ndrange(M, N, layout=(1, 0)): + a[i, j] = i * 100 + j + + fill(A) + expected = np.array([[i * 100 + j for j in range(N)] for i in range(M)], dtype=np.int32) + np.testing.assert_array_equal(A.to_numpy(), expected) + + +# ---------------------------------------------------------------------------- +# Python-side iteration (outside @qd.kernel) +# ---------------------------------------------------------------------------- + + +def test_layout_python_iteration_2d(): + qd.init(arch=qd.cpu) + M, N = 3, 4 + got = list(qd.ndrange(M, N, layout=(1, 0))) + assert got == _expected_flat_to_canonical((M, N), (1, 0)) + + +def test_layout_python_iteration_3d(): + qd.init(arch=qd.cpu) + dims = (2, 3, 4) + got = list(qd.ndrange(*dims, layout=(2, 0, 1))) + assert got == _expected_flat_to_canonical(dims, (2, 0, 1)) + + +def test_layout_python_iteration_identity_matches_default(): + qd.init(arch=qd.cpu) + M, N = 3, 4 + assert list(qd.ndrange(M, N, layout=(0, 1))) == list(qd.ndrange(M, N)) + assert list(qd.ndrange(M, N, layout=None)) == list(qd.ndrange(M, N)) + + +def test_layout_grouped_python_iteration_2d(): + qd.init(arch=qd.cpu) + M, N = 3, 4 + got = [] + for vec in qd.grouped(qd.ndrange(M, N, layout=(1, 0))): + # ``vec`` is a qd.Matrix; convert to a plain tuple via to_list() for comparison + got.append(tuple(vec.to_list())) + assert got == 
_expected_flat_to_canonical((M, N), (1, 0)) + + +# ---------------------------------------------------------------------------- +# Introspection +# ---------------------------------------------------------------------------- + + +def test_layout_attribute_identity_normalizes_to_none(): + qd.init(arch=qd.cpu) + # ``layout=None`` and identity layout both expose ``layout = None`` for introspection + # (so user code can treat "no layout" symmetrically). + from quadrants.lang._ndrange import _Ndrange + + a = _Ndrange(3, 4) + b = _Ndrange(3, 4, layout=None) + c = _Ndrange(3, 4, layout=(0, 1)) + assert a.layout is None + assert b.layout is None + assert c.layout is None + + +def test_layout_attribute_non_identity_preserved(): + qd.init(arch=qd.cpu) + from quadrants.lang._ndrange import _Ndrange + + a = _Ndrange(3, 4, layout=(1, 0)) + assert a.layout == (1, 0) + + +# ---------------------------------------------------------------------------- +# Degenerate ranks +# ---------------------------------------------------------------------------- + + +@test_utils.test() +def test_layout_1d_degenerate(): + """Layout (0,) on a 1-D ndrange is the only permutation and must match the default.""" + M = 7 + x = qd.field(qd.i32, shape=(M,)) + y = qd.field(qd.i32, shape=(M,)) + + @qd.kernel + def fill_default(): + for i in qd.ndrange(M): + x[i] = i + + @qd.kernel + def fill_layout(): + for i in qd.ndrange(M, layout=(0,)): + y[i] = i + + fill_default() + fill_layout() + np.testing.assert_array_equal(x.to_numpy(), y.to_numpy()) + + +def test_layout_zero_dim_degenerate(): + qd.init(arch=qd.cpu) + # Empty ndrange yields exactly one (empty) tuple. 
+ assert list(qd.ndrange()) == [()] + assert list(qd.ndrange(layout=())) == [()] + + +# ---------------------------------------------------------------------------- +# Error cases +# ---------------------------------------------------------------------------- + + +def test_layout_wrong_length_raises(): + qd.init(arch=qd.cpu) + with pytest.raises(qd.QuadrantsSyntaxError, match=r"qd\.ndrange\(layout=.*\) has 3 entries but ndrange"): + qd.ndrange(4, 5, layout=(0, 1, 2)) + + +def test_layout_not_a_permutation_raises(): + qd.init(arch=qd.cpu) + with pytest.raises(qd.QuadrantsSyntaxError, match=r"qd\.ndrange\(layout=.*\) is not a permutation"): + qd.ndrange(4, 5, layout=(0, 0)) + + +def test_layout_out_of_range_raises(): + qd.init(arch=qd.cpu) + with pytest.raises(qd.QuadrantsSyntaxError, match=r"qd\.ndrange\(layout=.*\) is not a permutation"): + qd.ndrange(4, 5, layout=(0, 2)) From 24778270d092fbaf9a036408e80476a598537c99 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 05:42:35 -0700 Subject: [PATCH 04/10] test: fix Python-scope grouped iteration test ``qd.grouped`` is decorated ``@quadrants_scope`` and raises when called outside a kernel, so test the underlying ``_Ndrange.grouped()`` method directly. Equivalent coverage, no qd.grouped-from-Python-scope assertion. --- tests/python/test_ndrange_layout.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/python/test_ndrange_layout.py b/tests/python/test_ndrange_layout.py index 339fc86183..396469c4d8 100644 --- a/tests/python/test_ndrange_layout.py +++ b/tests/python/test_ndrange_layout.py @@ -283,12 +283,17 @@ def test_layout_python_iteration_identity_matches_default(): assert list(qd.ndrange(M, N, layout=None)) == list(qd.ndrange(M, N)) -def test_layout_grouped_python_iteration_2d(): +def test_layout_grouped_python_iteration_via_method(): + """``_Ndrange.grouped()`` (Python-scope method, not ``qd.grouped``) preserves the layout-induced + iteration order. 
``qd.grouped`` itself is decorated ``@quadrants_scope`` and cannot be invoked + outside a kernel, so test the underlying method directly here. + """ qd.init(arch=qd.cpu) + from quadrants.lang._ndrange import _Ndrange + M, N = 3, 4 got = [] - for vec in qd.grouped(qd.ndrange(M, N, layout=(1, 0))): - # ``vec`` is a qd.Matrix; convert to a plain tuple via to_list() for comparison + for vec in _Ndrange(M, N, layout=(1, 0)).grouped(): got.append(tuple(vec.to_list())) assert got == _expected_flat_to_canonical((M, N), (1, 0)) From fa19333d61d8c3f45839ff434175bfc86af817d3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 06:16:03 -0700 Subject: [PATCH 05/10] docs: don't predicate qd.ndrange(layout=) on qd.Tensor Reframe the parallelization.md subsection so that ``layout=`` is presented as an iteration-order control in its own right, not "the matching keyword for qd.Tensor". Add an explicit "independent of what's in the loop body" note and move the qd.tensor pairing into a "When is layout= useful?" subsection as the motivating but not gating use case. Also update the ``ndrange`` docstring to match: layout= works with field, ndarray, tensor, vector/matrix variants, or no tensor at all; the tensor pairing is the motivating use case, not the only one. --- docs/source/user_guide/parallelization.md | 35 +++++++++++++++-------- python/quadrants/lang/_ndrange.py | 12 +++++--- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/docs/source/user_guide/parallelization.md b/docs/source/user_guide/parallelization.md index be8b58f1cb..05b39408e7 100644 --- a/docs/source/user_guide/parallelization.md +++ b/docs/source/user_guide/parallelization.md @@ -50,23 +50,19 @@ def fill(a: qd.Template) -> None: ### Controlling iteration order with `layout=` -By default, `qd.ndrange(d0, d1, ..., dN-1)` makes the **last argument the innermost (fastest-varying) axis** in the flat parallel loop: adjacent flat threads differ in the last index. 
This pairs naturally with a default-layout tensor. - -For a tensor allocated with a non-default `layout=` (see [`tensor`](tensor.md#controlling-physical-layout)), the matching iteration order is different — the inner physical axis is no longer the last canonical axis. The `layout=` keyword on `qd.ndrange` lets you align the iteration order with the tensor's physical layout, while keeping canonical indexing in the loop body: +By default, `qd.ndrange(d0, d1, ..., dN-1)` makes the **last argument the innermost (fastest-varying) axis** in the flat parallel loop: adjacent flat threads differ in the last index. The `layout=` keyword lets you choose a different iteration-nesting order. It's a tuple of `int` listing the **canonical axis index at each successive iteration-nesting level, outermost first**, and must be a permutation of `range(N)` where `N` is the number of arguments to `qd.ndrange`: ```python -A = qd.tensor(qd.f32, shape=(M, N), layout=(1, 0)) # axis 1 outer, axis 0 inner - @qd.kernel -def fill(): - # iterate with axis 1 outer, axis 0 inner — adjacent flat threads now step along axis 0 - # in canonical space, which is the inner physical axis of A, so they touch physically - # adjacent memory in A. +def k(): + # axis 1 is outermost (slowest-varying), axis 0 is innermost (fastest-varying) for i, j in qd.ndrange(M, N, layout=(1, 0)): - A[i, j] = i + j + ... ``` -`layout` works exactly like `layout=` on `qd.tensor`: a tuple of `int` listing the **canonical axis index at each successive iteration-nesting level, outermost first**. It must be a permutation of `range(N)` where `N` is the number of arguments to `qd.ndrange`. The yielded loop variables (`i`, `j`, ...) are still bound to canonical axes 0, 1, ... — only the visit order changes. `layout=None` (the default) and the identity permutation `(0, 1, ..., N-1)` are equivalent and reproduce the default last-argument-innermost order. +The yielded loop variables (`i`, `j`, ...) 
are still bound to canonical axes 0, 1, ... — only the visit order changes. `layout=None` (the default) and the identity permutation `(0, 1, ..., N-1)` are equivalent and reproduce the default last-argument-innermost order. Mismatched length and non-permutation values are rejected up front with `qd.QuadrantsSyntaxError`. + +`layout=` is independent of what's in the loop body: it controls the iteration order regardless of whether the body touches a `qd.field`, a `qd.ndarray`, a `qd.tensor`, a `qd.Vector` / `qd.Matrix` variant, or no tensor at all. `layout=` is supported by both the plain and `qd.grouped` forms: @@ -78,7 +74,22 @@ for I in qd.grouped(qd.ndrange(M, N, layout=(1, 0))): ... ``` -Mismatched length and non-permutation values are rejected up front with `qd.QuadrantsSyntaxError`. +#### When is `layout=` useful? + +The motivating use case is aligning iteration with a non-default physical memory layout — most often a tensor allocated via `qd.tensor(..., layout=...)` or a field allocated via `qd.field(..., order=...)`. Using the matching permutation makes adjacent flat threads step through physically adjacent memory, which restores coalesced / cache-friendly access: + +```python +A = qd.tensor(qd.f32, shape=(M, N), layout=(1, 0)) # axis 1 outer, axis 0 inner in memory + +@qd.kernel +def fill(): + # Same permutation on the ndrange: adjacent flat threads step along axis 0 in canonical + # space, which is the inner physical axis of A, so they touch physically adjacent memory. + for i, j in qd.ndrange(M, N, layout=(1, 0)): + A[i, j] = i + j +``` + +The same applies to bare `qd.field(..., order='ji')` or any other layout-tagged tensor — `qd.ndrange` doesn't inspect the body, so any data structure with a known memory order can be paired with a matching iteration order this way. See [`tensor`](tensor.md#controlling-physical-layout) for the tensor-side `layout=` keyword. ## Does GPU kernel launch latency matter? 
diff --git a/python/quadrants/lang/_ndrange.py b/python/quadrants/lang/_ndrange.py index 8c96c9cefb..29074013e8 100644 --- a/python/quadrants/lang/_ndrange.py +++ b/python/quadrants/lang/_ndrange.py @@ -140,10 +140,14 @@ def ndrange(*args, layout=None) -> Iterable: nesting order, outermost (slowest-varying) first. For an N-argument ndrange, must be a permutation of ``range(N)``. ``None`` (default) and the identity permutation are equivalent and reproduce the default order in which the **last argument is the - innermost / fastest-varying axis**. The values yielded by the loop are unchanged; - only the visit order is. This is the matching iteration-order knob for the - ``layout=`` keyword on :func:`quadrants.tensor` — pair them with the same - permutation to align parallel iteration with the tensor's physical memory order. + innermost / fastest-varying axis**. The yielded loop variables stay bound to + canonical axes 0, 1, ..., N-1 regardless of layout — only the visit order changes. + ``layout=`` is independent of the loop body; it controls iteration order whether + the body touches a field, ndarray, tensor, vector/matrix variant, or no tensor at + all. The motivating use case is aligning iteration with a non-default physical + memory layout (e.g. ``qd.tensor(..., layout=...)`` or ``qd.field(..., order=...)``): + using the matching permutation makes adjacent flat threads step through physically + adjacent memory. Returns: An immutable iterator object. From f890f34faa44b8b9bdb4817f65c47dbcfd5daa18 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 06:18:02 -0700 Subject: [PATCH 06/10] docs: drop "non-default physical memory layout" framing The motivating use case framing is just "align iteration with physical memory layout", not specifically with non-default layouts; the example that follows speaks for itself. 
--- docs/source/user_guide/parallelization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user_guide/parallelization.md b/docs/source/user_guide/parallelization.md index 05b39408e7..94e02ddbf0 100644 --- a/docs/source/user_guide/parallelization.md +++ b/docs/source/user_guide/parallelization.md @@ -76,7 +76,7 @@ for I in qd.grouped(qd.ndrange(M, N, layout=(1, 0))): #### When is `layout=` useful? -The motivating use case is aligning iteration with a non-default physical memory layout — most often a tensor allocated via `qd.tensor(..., layout=...)` or a field allocated via `qd.field(..., order=...)`. Using the matching permutation makes adjacent flat threads step through physically adjacent memory, which restores coalesced / cache-friendly access: +The motivating use case is aligning the iteration order with the physical memory layout of the data accessed in the loop body. Using the matching permutation makes adjacent flat threads step through physically adjacent memory, which gives coalesced / cache-friendly access: ```python A = qd.tensor(qd.f32, shape=(M, N), layout=(1, 0)) # axis 1 outer, axis 0 inner in memory From bfd49e7c97d0b110b006d85a3cd13e131371ad37 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 06:20:14 -0700 Subject: [PATCH 07/10] docs: drop "When is layout= useful?" subsection from parallelization.md The subsection's example (qd.tensor + matching ndrange layout) belongs in the tensor user guide, not the parallelization one, where it muddies the "layout= is independent of the body" message. --- docs/source/user_guide/parallelization.md | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/docs/source/user_guide/parallelization.md b/docs/source/user_guide/parallelization.md index 94e02ddbf0..8f910927b6 100644 --- a/docs/source/user_guide/parallelization.md +++ b/docs/source/user_guide/parallelization.md @@ -74,23 +74,6 @@ for I in qd.grouped(qd.ndrange(M, N, layout=(1, 0))): ... 
``` -#### When is `layout=` useful? - -The motivating use case is aligning the iteration order with the physical memory layout of the data accessed in the loop body. Using the matching permutation makes adjacent flat threads step through physically adjacent memory, which gives coalesced / cache-friendly access: - -```python -A = qd.tensor(qd.f32, shape=(M, N), layout=(1, 0)) # axis 1 outer, axis 0 inner in memory - -@qd.kernel -def fill(): - # Same permutation on the ndrange: adjacent flat threads step along axis 0 in canonical - # space, which is the inner physical axis of A, so they touch physically adjacent memory. - for i, j in qd.ndrange(M, N, layout=(1, 0)): - A[i, j] = i + j -``` - -The same applies to bare `qd.field(..., order='ji')` or any other layout-tagged tensor — `qd.ndrange` doesn't inspect the body, so any data structure with a known memory order can be paired with a matching iteration order this way. See [`tensor`](tensor.md#controlling-physical-layout) for the tensor-side `layout=` keyword. - ## Does GPU kernel launch latency matter? Kernel launch can be done in parallel whilst the previously launched kernel is still running. This means that if the previously launched kernel takes longer to run than the launch time for the new kernel, then the kernel launch latency will be perfectly hidden. From 370a86bc2da8d18fcc7116fb641618c2cbeb6cf3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 06:21:17 -0700 Subject: [PATCH 08/10] test: drop reference to private design-notes location Quadrants is the public repo; design notes live in a private one and shouldn't be cited from here. 
--- tests/python/test_ndrange_layout.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/test_ndrange_layout.py b/tests/python/test_ndrange_layout.py index 396469c4d8..d7ee1d9c76 100644 --- a/tests/python/test_ndrange_layout.py +++ b/tests/python/test_ndrange_layout.py @@ -4,8 +4,6 @@ only the visit order (which canonical axis is the outermost / innermost iteration nesting level) changes. ``layout=None`` and the identity permutation are equivalent and produce the default last-arg-innermost behaviour. - -See ``perso_hugh/doc/ndrange_layout.md`` for design notes. """ import itertools From 8c1b45909defded2e4ccff3907437ee91db14bde Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 06:23:55 -0700 Subject: [PATCH 09/10] ndrange: type-check layout entries before permutation check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses codex review on PR #710: ``sorted(layout_t)`` raises a raw Python ``TypeError`` on mixed-type / non-integer entries (e.g. ``layout=(0, "1")``), exposing an internal exception instead of the documented Quadrants error type. Add an explicit per-element check that raises ``QuadrantsTypeError`` (matching the convention used for bounds validation above). ``bool`` is rejected explicitly even though it is an ``int`` subclass — accepting ``True`` / ``False`` as axis indices would be a foot-gun. Zero perf impact: ``_Ndrange.__init__`` runs at AST-build time, once per kernel compile (cached thereafter). The new ``isinstance`` loop only runs when ``layout is not None``; default ``layout=None`` short-circuits before any new check. Adds three tests: string entry, float entry, bool entry. 
--- python/quadrants/lang/_ndrange.py | 11 +++++++++++ tests/python/test_ndrange_layout.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/python/quadrants/lang/_ndrange.py b/python/quadrants/lang/_ndrange.py index 29074013e8..2bfb23877a 100644 --- a/python/quadrants/lang/_ndrange.py +++ b/python/quadrants/lang/_ndrange.py @@ -68,6 +68,17 @@ def __init__(self, *args, layout=None): f"qd.ndrange(layout={layout_t!r}) has {len(layout_t)} entries " f"but ndrange was called with {n} dimension argument(s); they must match" ) + # Type-check each entry before sorting / permutation checks, so mixed-type or + # non-integer entries surface a Quadrants error instead of Python's raw + # ``TypeError`` from ``sorted``. ``bool`` is rejected explicitly even though it is + # an ``int`` subclass — accepting ``True`` / ``False`` as axis indices would be a + # foot-gun. + for e in layout_t: + if isinstance(e, bool) or not isinstance(e, (int, np.integer)): + raise QuadrantsTypeError( + f"qd.ndrange(layout={layout_t!r}) entries must be Python ints; " + f"got {type(e).__name__} ({e!r})" + ) if sorted(layout_t) != list(range(n)): raise QuadrantsSyntaxError( f"qd.ndrange(layout={layout_t!r}) is not a permutation of range({n})" diff --git a/tests/python/test_ndrange_layout.py b/tests/python/test_ndrange_layout.py index d7ee1d9c76..a1456fce25 100644 --- a/tests/python/test_ndrange_layout.py +++ b/tests/python/test_ndrange_layout.py @@ -378,3 +378,23 @@ def test_layout_out_of_range_raises(): qd.init(arch=qd.cpu) with pytest.raises(qd.QuadrantsSyntaxError, match=r"qd\.ndrange\(layout=.*\) is not a permutation"): qd.ndrange(4, 5, layout=(0, 2)) + + +def test_layout_non_integer_entry_raises(): + """Non-integer entries (string, float, mixed) surface a QuadrantsTypeError instead of the raw + Python ``TypeError`` ``sorted`` would emit on mixed-type sequences. 
+ """ + qd.init(arch=qd.cpu) + with pytest.raises(qd.QuadrantsTypeError, match=r"entries must be Python ints"): + qd.ndrange(4, 5, layout=(0, "1")) + with pytest.raises(qd.QuadrantsTypeError, match=r"entries must be Python ints"): + qd.ndrange(4, 5, layout=(0.0, 1.0)) + + +def test_layout_bool_entry_rejected(): + """``bool`` is an ``int`` subclass but rejecting ``True`` / ``False`` as axis indices avoids a + foot-gun. + """ + qd.init(arch=qd.cpu) + with pytest.raises(qd.QuadrantsTypeError, match=r"entries must be Python ints"): + qd.ndrange(4, 5, layout=(True, False)) From 3fa8d2903b08e46f66d7f218ffae954cb1559be2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 May 2026 06:28:21 -0700 Subject: [PATCH 10/10] ndrange: black reformat Reflow the "not a permutation" QuadrantsSyntaxError onto one line per black's default config in this repo (length still under the project limit). No semantic change. --- python/quadrants/lang/_ndrange.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/quadrants/lang/_ndrange.py b/python/quadrants/lang/_ndrange.py index 2bfb23877a..049d657cf3 100644 --- a/python/quadrants/lang/_ndrange.py +++ b/python/quadrants/lang/_ndrange.py @@ -80,9 +80,7 @@ def __init__(self, *args, layout=None): f"got {type(e).__name__} ({e!r})" ) if sorted(layout_t) != list(range(n)): - raise QuadrantsSyntaxError( - f"qd.ndrange(layout={layout_t!r}) is not a permutation of range({n})" - ) + raise QuadrantsSyntaxError(f"qd.ndrange(layout={layout_t!r}) is not a permutation of range({n})") if layout_t == tuple(range(n)): self.layout = None physical_to_canonical = list(range(n))