
Commit 79dd8bf

[Perf] CUDA graph 2: graph_do_while (#406)
1 parent 9c8f4dd commit 79dd8bf

15 files changed

Lines changed: 640 additions & 9 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -96,3 +96,4 @@ imgui.ini
 stubs/
 CHANGELOG.md
 python/quadrants/_version.py
+env.sh

docs/source/user_guide/cuda_graph.md

Lines changed: 78 additions & 1 deletion
@@ -2,7 +2,7 @@

CUDA graphs reduce kernel launch overhead by capturing a sequence of GPU operations into a graph, then replaying it in a single launch. On non-CUDA platforms, the CUDA graph annotation is simply ignored, and code runs normally.

-## Usage
+## Basic usage

Add `cuda_graph=True` to a `@qd.kernel` decorator:

@@ -52,3 +52,80 @@ my_kernel(x2, y2) # replays graph with new array pointers
### Fields as arguments

When different fields are passed as template arguments, each unique combination of fields produces a separately compiled kernel with its own graph cache entry. There is no interference between them, as the sketch below illustrates.
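A minimal sketch of this caching behavior. The `qd.template()` annotation for field parameters is an assumption here (by analogy with Taichi-style APIs); it is not shown elsewhere in this commit.

```python
N = 1024
f1 = qd.field(qd.f32, shape=(N,))
f2 = qd.field(qd.f32, shape=(N,))

@qd.kernel(cuda_graph=True)
def double(f: qd.template()):  # qd.template() is assumed, not confirmed by this commit
    for i in range(N):
        f[i] = f[i] * 2.0

double(f1)  # first call with f1: compiles the kernel and captures its graph
double(f2)  # new field combination: separately compiled kernel + graph cache entry
double(f1)  # replays the graph cached for f1; f2's entry is untouched
```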
## GPU-side iteration with `graph_do_while`

For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. Use `while qd.graph_do_while(flag):` inside a `cuda_graph=True` kernel:

```python
@qd.kernel(cuda_graph=True)
def solve(x: qd.types.ndarray(qd.f32, ndim=1),
          counter: qd.types.ndarray(qd.i32, ndim=0)):
    while qd.graph_do_while(counter):
        for i in range(x.shape[0]):
            x[i] = x[i] + 1.0
        for i in range(1):
            counter[()] = counter[()] - 1

x = qd.ndarray(qd.f32, shape=(N,))
counter = qd.ndarray(qd.i32, shape=())
counter.from_numpy(np.array(10, dtype=np.int32))
solve(x, counter)
# x is now incremented 10 times; counter is 0
```

The argument to `qd.graph_do_while()` must be the name of a scalar `qd.i32` ndarray parameter. The loop body repeats while this value is non-zero.

- On SM 9.0+ (Hopper), this uses CUDA conditional while nodes: the entire iteration runs on the GPU with no host involvement.
- Older CUDA GPUs and non-CUDA backends are not currently supported.

### Patterns

**Counter-based**: set the counter to N, decrement each iteration. The body runs exactly N times.

```python
@qd.kernel(cuda_graph=True)
def iterate(x: qd.types.ndarray(qd.f32, ndim=1),
            counter: qd.types.ndarray(qd.i32, ndim=0)):
    while qd.graph_do_while(counter):
        for i in range(x.shape[0]):
            x[i] = x[i] + 1.0
        for i in range(1):
            counter[()] = counter[()] - 1
```

**Boolean flag**: set a `keep_going` flag to 1, and have the kernel set it to 0 when a convergence criterion is met.

```python
@qd.kernel(cuda_graph=True)
def converge(x: qd.types.ndarray(qd.f32, ndim=1),
             keep_going: qd.types.ndarray(qd.i32, ndim=0)):
    while qd.graph_do_while(keep_going):
        for i in range(x.shape[0]):
            # ... do work ...
            pass
        for i in range(1):
            if some_condition(x):  # placeholder for a real convergence check
                keep_going[()] = 0
```

### Do-while semantics

`graph_do_while` has **do-while** semantics: the kernel body always executes at least once before the condition is checked. This matches the behavior of CUDA conditional while nodes. The flag value must be >= 1 at launch time; passing 0 to a kernel that decrements the counter causes an effectively infinite loop, because the body still runs once and drives the counter to a non-zero negative value.
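To make the at-least-once behavior concrete, here is a short follow-on to the `solve` example from the basic usage section above (no new APIs involved):

```python
# Do-while: the body runs once before the first condition check.
counter.from_numpy(np.array(1, dtype=np.int32))
solve(x, counter)  # body executes exactly once; counter ends at 0

# Launching with counter == 0 is a bug for a decrementing kernel:
# the body still runs once, the counter becomes -1 (non-zero),
# and the loop keeps iterating.
```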

### ndarray vs field

The parameter used by `graph_do_while` MUST be an ndarray. Other parameters, however, can be any supported Quadrants kernel parameter type, as in the sketch below.
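For example, a sketch combining parameter kinds. As in the fields sketch above, `qd.template()` and passing a plain scalar (`qd.f32`) are assumptions about supported parameter types, not guarantees from this commit:

```python
f = qd.field(qd.f32, shape=(N,))
steps = qd.ndarray(qd.i32, shape=())  # the loop flag itself must be an ndarray

@qd.kernel(cuda_graph=True)
def relax(grid: qd.template(),  # field parameter (assumed supported)
          scale: qd.f32,        # scalar parameter (assumed supported)
          steps: qd.types.ndarray(qd.i32, ndim=0)):
    while qd.graph_do_while(steps):
        for i in range(N):
            grid[i] = grid[i] * scale
        for i in range(1):
            steps[()] = steps[()] - 1
```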
### Restrictions

- The same physical ndarray must be used for the counter parameter on every call. Passing a different ndarray raises an error, because the counter's device pointer is baked into the CUDA graph at creation time. The snippet below shows the failure mode.
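Concretely, reusing the `solve` kernel from the basic example:

```python
c1 = qd.ndarray(qd.i32, shape=())
c2 = qd.ndarray(qd.i32, shape=())

c1.from_numpy(np.array(5, dtype=np.int32))
solve(x, c1)  # first launch: graph captured with c1's device pointer

c1.from_numpy(np.array(5, dtype=np.int32))
solve(x, c1)  # OK: same physical ndarray, the cached graph replays

c2.from_numpy(np.array(5, dtype=np.int32))
solve(x, c2)  # raises: c2 is not the ndarray the graph was captured with
```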
### Caveats

`graph_do_while` only runs on CUDA; there is currently no fallback on non-CUDA platforms.

python/quadrants/lang/ast/ast_transformer.py

Lines changed: 30 additions & 0 deletions
@@ -1193,11 +1193,41 @@ def build_For(ctx: ASTTransformerFuncContext, node: ast.For) -> None:
         # Struct for
         return ASTTransformer.build_struct_for(ctx, node, is_grouped=False)

+    @staticmethod
+    def _is_graph_do_while_call(node: ast.expr) -> str | None:
+        """If *node* is ``qd.graph_do_while(var)`` return the arg name, else None."""
+        if not isinstance(node, ast.Call):
+            return None
+        func = node.func
+        if isinstance(func, ast.Attribute) and func.attr == "graph_do_while":
+            if len(node.args) == 1 and isinstance(node.args[0], ast.Name):
+                return node.args[0].id
+        if isinstance(func, ast.Name) and func.id == "graph_do_while":
+            if len(node.args) == 1 and isinstance(node.args[0], ast.Name):
+                return node.args[0].id
+        return None
+
     @staticmethod
     def build_While(ctx: ASTTransformerFuncContext, node: ast.While) -> None:
         if node.orelse:
             raise QuadrantsSyntaxError("'else' clause for 'while' not supported in Quadrants kernels")

+        graph_do_while_arg = ASTTransformer._is_graph_do_while_call(node.test)
+        if graph_do_while_arg is not None:
+            kernel = ctx.global_context.current_kernel
+            arg_names = [m.name for m in kernel.arg_metas]
+            if graph_do_while_arg not in arg_names:
+                raise QuadrantsSyntaxError(
+                    f"qd.graph_do_while({graph_do_while_arg!r}) does not match any "
+                    f"parameter of kernel {kernel.func.__name__!r}. "
+                    f"Available parameters: {arg_names}"
+                )
+            if not kernel.use_cuda_graph:
+                raise QuadrantsSyntaxError("qd.graph_do_while() requires @qd.kernel(cuda_graph=True)")
+            kernel.graph_do_while_arg = graph_do_while_arg
+            build_stmts(ctx, node.body)
+            return None
+
         with ctx.loop_scope_guard():
             stmt_dbg_info = _qd_core.DebugInfo(ctx.get_pos_info(node))
             ctx.ast_builder.begin_frontend_while(expr.Expr(1, dtype=primitive_types.i32).ptr, stmt_dbg_info)
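
For illustration, the matching logic above can be exercised standalone with the stdlib `ast` module; the helper body is copied into a plain function here so the snippet runs without Quadrants. Both the `qd.`-prefixed form and the bare-name form are recognized:

```python
import ast

def is_graph_do_while_call(node):
    """Mirror of ASTTransformer._is_graph_do_while_call, for illustration."""
    if not isinstance(node, ast.Call):
        return None
    func = node.func
    if isinstance(func, ast.Attribute) and func.attr == "graph_do_while":
        if len(node.args) == 1 and isinstance(node.args[0], ast.Name):
            return node.args[0].id
    if isinstance(func, ast.Name) and func.id == "graph_do_while":
        if len(node.args) == 1 and isinstance(node.args[0], ast.Name):
            return node.args[0].id
    return None

test = ast.parse("while qd.graph_do_while(counter):\n    pass").body[0].test
print(is_graph_do_while_call(test))  # -> "counter" (attribute form)

test = ast.parse("while graph_do_while(flag):\n    pass").body[0].test
print(is_graph_do_while_call(test))  # -> "flag" (bare-name form)

test = ast.parse("while qd.graph_do_while(f()):\n    pass").body[0].test
print(is_graph_do_while_call(test))  # -> None (argument is not a bare name)
```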

python/quadrants/lang/kernel.py

Lines changed: 5 additions & 0 deletions
@@ -292,6 +292,7 @@ def __init__(self, _func: Callable, autodiff_mode: AutodiffMode, _is_classkernel
         self.materialized_kernels: dict[CompiledKernelKeyType, KernelCxx] = {}
         self.has_print = False
         self.use_cuda_graph: bool = False
+        self.graph_do_while_arg: str | None = None
         self.quadrants_callable: QuadrantsCallable | None = None
         self.visited_functions: set[FunctionSourceInfo] = set()
         self.kernel_function_info: FunctionSourceInfo | None = None

@@ -444,6 +445,8 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled
                 template_num += 1
                 i_out += 1
                 continue
+            if self.graph_do_while_arg is not None and self.arg_metas[i_in].name == self.graph_do_while_arg:
+                self._graph_do_while_cpp_arg_id = i_out - template_num
             num_args_, is_launch_ctx_cacheable_ = self._recursive_set_args(
                 self.used_py_dataclass_parameters_by_key_enforcing[key],
                 self.arg_metas[i_in].name,

@@ -505,6 +508,8 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled
                 self.src_ll_cache_observations.cache_stored = True
             self._last_compiled_kernel_data = compiled_kernel_data
             launch_ctx.use_cuda_graph = self.use_cuda_graph
+            if self.graph_do_while_arg is not None and hasattr(self, "_graph_do_while_cpp_arg_id"):
+                launch_ctx.graph_do_while_arg_id = self._graph_do_while_cpp_arg_id
             prog.launch_kernel(compiled_kernel_data, launch_ctx)
         except Exception as e:
             e = handle_exception_from_cpp(e)

python/quadrants/lang/kernel_impl.py

Lines changed: 10 additions & 1 deletion
@@ -124,7 +124,10 @@ def _inside_class(level_of_class_stackframe: int) -> bool:


 def _kernel_impl(
-    _func: Callable, level_of_class_stackframe: int, verbose: bool = False, cuda_graph: bool = False
+    _func: Callable,
+    level_of_class_stackframe: int,
+    verbose: bool = False,
+    cuda_graph: bool = False,
 ) -> QuadrantsCallable:
     # Can decorators determine if a function is being defined inside a class?
     # https://stackoverflow.com/a/8793684/12003165

@@ -206,6 +209,12 @@ def kernel(

     Kernel's gradient kernel would be generated automatically by the AutoDiff system.

+    Args:
+        cuda_graph: If True, kernels with 2+ top-level for loops are captured
+            into a CUDA graph on first launch and replayed on subsequent
+            launches, reducing per-kernel launch overhead. Non-CUDA backends
+            are not supported currently.
+
     Example::

         >>> x = qd.field(qd.i32, shape=(4, 8))

python/quadrants/lang/misc.py

Lines changed: 19 additions & 0 deletions
@@ -701,6 +701,24 @@ def copy():
     _bit_vectorize()


+def graph_do_while(condition) -> bool:
+    """Marks a while loop as a CUDA graph do-while conditional node.
+
+    Used as ``while qd.graph_do_while(flag):`` inside a
+    ``@qd.kernel(cuda_graph=True)`` kernel. The loop body repeats while
+    ``flag`` (a scalar ``qd.i32`` ndarray) is non-zero.
+
+    On SM 9.0+ (Hopper) GPUs this compiles to a native CUDA graph
+    conditional while node. Older CUDA GPUs and non-CUDA backends
+    are not currently supported.
+
+    This function should not be called directly at runtime; it is
+    recognised and transformed during AST compilation.
+    Requires ``@qd.kernel(cuda_graph=True)``.
+    """
+    return bool(condition)
+
+
 def global_thread_idx():
     """Returns the global thread id of this running thread,
     only available for cpu and cuda backends.

@@ -837,6 +855,7 @@ def dump_compile_config() -> None:
     "python",
     "vulkan",
     "extension",
+    "graph_do_while",
     "loop_config",
     "global_thread_idx",
     "assume_in_range",

quadrants/program/launch_context_builder.h

Lines changed: 2 additions & 0 deletions
@@ -151,6 +151,8 @@ class LaunchContextBuilder {
   const StructType *args_type{nullptr};
   size_t result_buffer_size{0};
   bool use_cuda_graph{false};
+  int graph_do_while_arg_id{-1};
+  void *graph_do_while_flag_dev_ptr{nullptr};

   // Note that I've tried to group `array_runtime_size` and
   // `is_device_allocations` into a small struct. However, it caused some test

quadrants/python/export_lang.cpp

Lines changed: 3 additions & 1 deletion
@@ -667,7 +667,9 @@ void export_lang(py::module &m) {
       .def("get_struct_ret_int", &LaunchContextBuilder::get_struct_ret_int)
       .def("get_struct_ret_uint", &LaunchContextBuilder::get_struct_ret_uint)
       .def("get_struct_ret_float", &LaunchContextBuilder::get_struct_ret_float)
-      .def_readwrite("use_cuda_graph", &LaunchContextBuilder::use_cuda_graph);
+      .def_readwrite("use_cuda_graph", &LaunchContextBuilder::use_cuda_graph)
+      .def_readwrite("graph_do_while_arg_id",
+                     &LaunchContextBuilder::graph_do_while_arg_id);

   py::class_<Function>(m, "Function")
       .def("insert_scalar_param", &Function::insert_scalar_param)

quadrants/rhi/cuda/cuda_driver_functions.inc.h

Lines changed: 10 additions & 0 deletions
@@ -73,8 +73,18 @@ PER_CUDA_FUNCTION(import_external_semaphore, cuImportExternalSemaphore,CUexterna
 // Graph management
 PER_CUDA_FUNCTION(graph_create, cuGraphCreate, void **, uint32);
 PER_CUDA_FUNCTION(graph_add_kernel_node, cuGraphAddKernelNode, void **, void *, const void *, std::size_t, const void *);
+PER_CUDA_FUNCTION(graph_add_node, cuGraphAddNode, void **, void *, const void *, std::size_t, void *);
 PER_CUDA_FUNCTION(graph_instantiate, cuGraphInstantiate, void **, void *, void *, char *, std::size_t);
 PER_CUDA_FUNCTION(graph_launch, cuGraphLaunch, void *, void *);
 PER_CUDA_FUNCTION(graph_destroy, cuGraphDestroy, void *);
 PER_CUDA_FUNCTION(graph_exec_destroy, cuGraphExecDestroy, void *);
+PER_CUDA_FUNCTION(graph_conditional_handle_create, cuGraphConditionalHandleCreate, void *, void *, void *, uint32, uint32);
+
+// JIT linker (for loading condition kernel with cudadevrt)
+PER_CUDA_FUNCTION(link_create, cuLinkCreate_v2, uint32, void *, void *, void **);
+PER_CUDA_FUNCTION(link_add_data, cuLinkAddData_v2, void *, uint32, void *, std::size_t, const char *, uint32, void *, void *);
+PER_CUDA_FUNCTION(link_add_file, cuLinkAddFile_v2, void *, uint32, const char *, uint32, void *, void *);
+PER_CUDA_FUNCTION(link_complete, cuLinkComplete, void *, void **, std::size_t *);
+PER_CUDA_FUNCTION(link_destroy, cuLinkDestroy, void *);
+PER_CUDA_FUNCTION(module_load_data, cuModuleLoadData, void **, const void *);
 // clang-format on

quadrants/runtime/amdgpu/kernel_launcher.cpp

Lines changed: 3 additions & 0 deletions
@@ -110,6 +110,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,

   AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer);

+  QD_ERROR_IF(ctx.graph_do_while_arg_id >= 0,
+              "graph_do_while is only supported on the CUDA backend");
+
   for (auto &task : offloaded_tasks) {
     QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
              task.block_dim);
