Genesis-Embodied-AI · hughperkins · May 20, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/docs/source/user_guide/parallelization.md b/docs/source/user_guide/parallelization.md
@@ -48,6 +48,32 @@ def fill(a: qd.Template) -> None:
 
 `I` is a `qd.Vector` with one element per dimension.
 
+### Controlling iteration order with `layout=`
+
+By default, `qd.ndrange(d0, d1, ..., dN-1)` makes the **last argument the innermost (fastest-varying) axis** in the flat parallel loop: adjacent flat threads differ in the last index. The `layout=` keyword lets you choose a different iteration-nesting order. It's a tuple of `int` listing the **canonical axis index at each successive iteration-nesting level, outermost first**, and must be a permutation of `range(N)` where `N` is the number of arguments to `qd.ndrange`:
+
+```python
+@qd.kernel
+def k():
+    # axis 1 is outermost (slowest-varying), axis 0 is innermost (fastest-varying)
+    for i, j in qd.ndrange(M, N, layout=(1, 0)):
+        ...
+```
+
+The yielded loop variables (`i`, `j`, ...) are still bound to canonical axes 0, 1, ... — only the visit order changes. `layout=None` (the default) and the identity permutation `(0, 1, ..., N-1)` are equivalent and reproduce the default last-argument-innermost order. Mismatched length and non-permutation values are rejected up front with `qd.QuadrantsSyntaxError`.
+
+`layout=` is independent of what's in the loop body: it controls the iteration order regardless of whether the body touches a `qd.field`, a `qd.ndarray`, a `qd.tensor`, a `qd.Vector` / `qd.Matrix` variant, or no tensor at all.
+
+`layout=` is supported by both the plain and `qd.grouped` forms:
+
+```python
+for i, j in qd.ndrange(M, N, layout=(1, 0)):
+    ...
+for I in qd.grouped(qd.ndrange(M, N, layout=(1, 0))):
+    # I[0] is still the canonical axis-0 index, regardless of layout
+    ...
+```
+
 ## Does GPU kernel launch latency matter?
 
 Kernel launch can be done in parallel whilst the previously launched kernel is still running. This means that if the previously launched kernel takes longer to run than the launch time for the new kernel, then the kernel launch latency will be perfectly hidden.

diff --git a/docs/source/user_guide/tensor.md b/docs/source/user_guide/tensor.md
@@ -113,6 +113,8 @@ b[i, j] = ...                   # canonical indexing in kernels still works
 
 Any permutation is supported, up to Quadrants' `quadrants_max_num_indices` (currently 12). `layout=None` and the identity permutation (`(0, 1, ..., N-1)`) are equivalent and forward no permutation to the underlying allocator.
 
+For best performance, pair `qd.tensor(..., layout=...)` with a matching iteration order via `qd.ndrange(..., layout=...)` (see [`parallelization`](parallelization.md#controlling-iteration-order-with-layout)): the permutation has the same meaning on both APIs, and using the same value on both lines adjacent flat threads up with adjacent physical memory slots.
+
 Quadrants rejects mismatched / invalid layouts up front:
 
 ```python

diff --git a/python/quadrants/lang/_ndrange.py b/python/quadrants/lang/_ndrange.py
@@ -31,7 +31,7 @@ def _coerce_to_int(v):
 
 
 class _Ndrange:
-    def __init__(self, *args):
+    def __init__(self, *args, layout=None):
         args = list(args)
         for i, arg in enumerate(args):
             if not isinstance(arg, collections.abc.Sequence):
@@ -49,33 +49,87 @@ def __init__(self, *args):
                     raise QuadrantsTypeError(
                         "Every argument of ndrange should be an integer scalar or a tuple/list of (int, int)"
                     )
-        self.bounds = args
 
-        self.dimensions = [None] * len(args)
-        for i, bound in enumerate(self.bounds):
-            self.dimensions[i] = bound[1] - bound[0]
+        n = len(args)
+
+        # Validate and normalize ``layout``. Stored as ``self.layout`` (``None`` for the identity
+        # permutation, else the user-supplied tuple) for introspection / tests, and as
+        # ``self._physical_to_canonical`` (a Python list of int of length ``n``) for the AST
+        # builder to use when remapping per-physical-level decomposed indices to canonical loop
+        # targets. The identity case is kept as ``None`` so the AST-builder fast-path matches
+        # the pre-layout codegen byte-for-byte.
+        if layout is None:
+            self.layout = None
+            physical_to_canonical = list(range(n))
+        else:
+            layout_t = tuple(layout)
+            if len(layout_t) != n:
+                raise QuadrantsSyntaxError(
+                    f"qd.ndrange(layout={layout_t!r}) has {len(layout_t)} entries "
+                    f"but ndrange was called with {n} dimension argument(s); they must match"
+                )
+            # Type-check each entry before sorting / permutation checks, so mixed-type or
+            # non-integer entries surface a Quadrants error instead of Python's raw
+            # ``TypeError`` from ``sorted``. ``bool`` is rejected explicitly even though it is
+            # an ``int`` subclass — accepting ``True`` / ``False`` as axis indices would be a
+            # foot-gun.
+            for e in layout_t:
+                if isinstance(e, bool) or not isinstance(e, (int, np.integer)):
+                    raise QuadrantsTypeError(
+                        f"qd.ndrange(layout={layout_t!r}) entries must be Python ints; "
+                        f"got {type(e).__name__} ({e!r})"
+                    )
+            if sorted(layout_t) != list(range(n)):
+                raise QuadrantsSyntaxError(f"qd.ndrange(layout={layout_t!r}) is not a permutation of range({n})")
+            if layout_t == tuple(range(n)):
+                self.layout = None
+                physical_to_canonical = list(range(n))
+            else:
+                self.layout = layout_t
+                physical_to_canonical = list(layout_t)
 
-        self.acc_dimensions = self.dimensions.copy()
-        for i in reversed(range(len(self.bounds) - 1)):
-            self.acc_dimensions[i] = self.acc_dimensions[i] * self.acc_dimensions[i + 1]
-        if len(self.acc_dimensions) == 0:  # for the empty case, e.g. qd.ndrange()
-            self.acc_dimensions = [1]
+        self._physical_to_canonical = physical_to_canonical
 
-    def __iter__(self):
-        def gen(d, prefix):
-            if d == len(self.bounds):
-                yield prefix
-            else:
-                for t in range(self.bounds[d][0], self.bounds[d][1]):
-                    yield from gen(d + 1, prefix + (t,))
+        canonical_bounds = args
+        canonical_dimensions = [bound[1] - bound[0] for bound in canonical_bounds]
 
-        yield from gen(0, ())
+        physical_bounds = [canonical_bounds[c] for c in physical_to_canonical]
+        physical_dimensions = [canonical_dimensions[c] for c in physical_to_canonical]
+
+        acc_dimensions = physical_dimensions.copy()
+        for i in reversed(range(n - 1)):
+            acc_dimensions[i] = acc_dimensions[i] * acc_dimensions[i + 1]
+        if not acc_dimensions:  # for the empty case, e.g. qd.ndrange()
+            acc_dimensions = [1]
+
+        self._canonical_bounds = canonical_bounds
+        self._canonical_dimensions = canonical_dimensions
+        self.bounds = physical_bounds
+        self.dimensions = physical_dimensions
+        self.acc_dimensions = acc_dimensions
+
+    def __iter__(self):
+        p2c = self._physical_to_canonical
+        cbounds = self._canonical_bounds
+        n = len(p2c)
+
+        def gen(level, current):
+            if level == n:
+                yield tuple(current)
+                return
+            ax = p2c[level]
+            b, e = cbounds[ax]
+            for t in range(b, e):
+                current[ax] = t
+                yield from gen(level + 1, current)
+
+        yield from gen(0, [0] * n)
 
     def grouped(self):
         return GroupedNDRange(self)
 
 
-def ndrange(*args) -> Iterable:
+def ndrange(*args, layout=None) -> Iterable:
     """Return an immutable iterator object for looping over multi-dimensional indices.
 
     This returned set of multi-dimensional indices is the direct product (in the set-theory sense)
@@ -91,6 +145,18 @@ def ndrange(*args) -> Iterable:
 
     Args:
         entries: (int, tuple): Must be either an integer, or a tuple/list of two integers.
+        layout (tuple of int, optional): Permutation of canonical axes describing the iteration
+            nesting order, outermost (slowest-varying) first. For an N-argument ndrange, must be
+            a permutation of ``range(N)``. ``None`` (default) and the identity permutation are
+            equivalent and reproduce the default order in which the **last argument is the
+            innermost / fastest-varying axis**. The yielded loop variables stay bound to
+            canonical axes 0, 1, ..., N-1 regardless of layout — only the visit order changes.
+            ``layout=`` is independent of the loop body; it controls iteration order whether
+            the body touches a field, ndarray, tensor, vector/matrix variant, or no tensor at
+            all. The motivating use case is aligning iteration with a non-default physical
+            memory layout (e.g. ``qd.tensor(..., layout=...)`` or ``qd.field(..., order=...)``):
+            using the matching permutation makes adjacent flat threads step through physically
+            adjacent memory.
 
     Returns:
         An immutable iterator object.
@@ -154,8 +220,18 @@ def ndrange(*args) -> Iterable:
             >>> def loop_tensor():
             >>>     for row, col, channel in qd.ndrange(image_height, image_width, channels):
             >>>         image[row, col, channel] = ...
+
+        Aligning iteration order with a non-default tensor layout via ``layout=``:
+
+            >>> A = qd.tensor(qd.f32, shape=(M, N), layout=(1, 0))    # axis 1 outer, axis 0 inner
+            >>> @qd.kernel
+            >>> def fill():
+            >>>     # adjacent flat threads now step along axis 0 (the inner physical axis of A),
+            >>>     # i.e. touch physically adjacent memory in A
+            >>>     for i, j in qd.ndrange(M, N, layout=(1, 0)):
+            >>>         A[i, j] = i + j
     """
-    return _Ndrange(*args)
+    return _Ndrange(*args, layout=layout)
 
 
 class GroupedNDRange:

diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py
@@ -1052,24 +1052,32 @@ def build_ndrange_for(ctx: ASTTransformerFuncContext, node: ast.For) -> None:
                     "Please check if the number of arguments of qd.ndrange() is equal to "
                     "the number of the loop variables."
                 )
-            for i, target in enumerate(targets):
-                if i + 1 < len(targets):
-                    target_tmp = impl.expr_init(I // ndrange_var.acc_dimensions[i + 1])
+            # ``physical_to_canonical[p]`` is the canonical (user-visible) axis index that receives
+            # the decomposed index for physical nesting level ``p``. For the identity / ``layout=None``
+            # case this is ``[0, 1, ..., n-1]`` and the emitted IR matches the pre-layout codegen
+            # byte-for-byte.
+            physical_to_canonical = ndrange_var._physical_to_canonical
+            n_levels = len(ndrange_var.dimensions)
+            for p in range(n_levels):
+                if p + 1 < n_levels:
+                    target_tmp = impl.expr_init(I // ndrange_var.acc_dimensions[p + 1])
                 else:
                     target_tmp = impl.expr_init(I)
+                canonical_idx = physical_to_canonical[p]
+                target = targets[canonical_idx]
                 ctx.create_variable(
                     target,
                     impl.expr_init(
                         target_tmp
                         + impl.subscript(
                             ctx.ast_builder,
-                            impl.subscript(ctx.ast_builder, ndrange_var.bounds, i),
+                            impl.subscript(ctx.ast_builder, ndrange_var.bounds, p),
                             0,
                         )
                     ),
                 )
-                if i + 1 < len(targets):
-                    I._assign(I - target_tmp * ndrange_var.acc_dimensions[i + 1])
+                if p + 1 < n_levels:
+                    I._assign(I - target_tmp * ndrange_var.acc_dimensions[p + 1])
             ctx.loop_depth += 1
             build_stmts(ctx, node.body)
             ctx.loop_depth -= 1
@@ -1098,14 +1106,22 @@ def build_grouped_ndrange_for(ctx: ASTTransformerFuncContext, node: ast.For) ->
 
             ctx.create_variable(target, target_var)
             I = impl.expr_init(ndrange_loop_var)
-            for i in range(len(ndrange_var.dimensions)):
-                if i + 1 < len(ndrange_var.dimensions):
-                    target_tmp = I // ndrange_var.acc_dimensions[i + 1]
+            # See ``build_ndrange_for`` above for the layout semantics. The grouped target_var is a
+            # vector indexed by canonical axis, so element ``physical_to_canonical[p]`` (not ``p``)
+            # receives the decomposition of physical level ``p``.
+            physical_to_canonical = ndrange_var._physical_to_canonical
+            n_levels = len(ndrange_var.dimensions)
+            for p in range(n_levels):
+                if p + 1 < n_levels:
+                    target_tmp = I // ndrange_var.acc_dimensions[p + 1]
                 else:
                     target_tmp = I
-                impl.subscript(ctx.ast_builder, target_var, i)._assign(target_tmp + ndrange_var.bounds[i][0])
-                if i + 1 < len(ndrange_var.dimensions):
-                    I._assign(I - target_tmp * ndrange_var.acc_dimensions[i + 1])
+                canonical_idx = physical_to_canonical[p]
+                impl.subscript(ctx.ast_builder, target_var, canonical_idx)._assign(
+                    target_tmp + ndrange_var.bounds[p][0]
+                )
+                if p + 1 < n_levels:
+                    I._assign(I - target_tmp * ndrange_var.acc_dimensions[p + 1])
             ctx.loop_depth += 1
             build_stmts(ctx, node.body)
             ctx.loop_depth -= 1