From 61b5b360f751ec8d692f5111124f641b8f0ea90e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 28 Feb 2026 20:28:07 -0500 Subject: [PATCH 001/128] Add CUDA graph MVP for multi-task kernels When QD_CUDA_GRAPH=1, kernels with 2+ top-level for loops (offloaded tasks) are captured into a CUDA graph on first launch and replayed on subsequent launches, eliminating per-kernel launch overhead. Uses the explicit graph node API (cuGraphAddKernelNode) with persistent device arg/result buffers. Assumes stable ndarray device pointers. Made-with: Cursor --- .../rhi/cuda/cuda_driver_functions.inc.h | 8 + quadrants/runtime/cuda/kernel_launcher.cpp | 218 ++++++++++++++++-- quadrants/runtime/cuda/kernel_launcher.h | 36 +++ tests/python/test_cuda_graph.py | 89 +++++++ 4 files changed, 328 insertions(+), 23 deletions(-) create mode 100644 tests/python/test_cuda_graph.py diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 25b3c7958e..2da4799b96 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -69,4 +69,12 @@ PER_CUDA_FUNCTION(surf_object_create,cuSurfObjectCreate,CUsurfObject *, const CU PER_CUDA_FUNCTION(signal_external_semaphore_async,cuSignalExternalSemaphoresAsync,const CUexternalSemaphore * , const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS * , unsigned int , CUstream) PER_CUDA_FUNCTION(wait_external_semaphore_async,cuWaitExternalSemaphoresAsync,const CUexternalSemaphore * , const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS * , unsigned int , CUstream) PER_CUDA_FUNCTION(import_external_semaphore, cuImportExternalSemaphore,CUexternalSemaphore * , const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *) + +// Graph management +PER_CUDA_FUNCTION(graph_create, cuGraphCreate, void **, uint32); +PER_CUDA_FUNCTION(graph_add_kernel_node, cuGraphAddKernelNode, void **, void *, const void *, std::size_t, const void *); +PER_CUDA_FUNCTION(graph_instantiate, cuGraphInstantiate, void 
**, void *, void *, char *, std::size_t); +PER_CUDA_FUNCTION(graph_launch, cuGraphLaunch, void *, void *); +PER_CUDA_FUNCTION(graph_destroy, cuGraphDestroy, void *); +PER_CUDA_FUNCTION(graph_exec_destroy, cuGraphExecDestroy, void *); // clang-format on diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 5eae5e747d..75a3c909f5 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,9 +1,59 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" +#include +#include + namespace quadrants::lang { namespace cuda { +CachedCudaGraph::~CachedCudaGraph() { + if (graph_exec) { + CUDADriver::get_instance().graph_exec_destroy(graph_exec); + } + if (persistent_device_arg_buffer) { + CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); + } + if (persistent_device_result_buffer) { + CUDADriver::get_instance().mem_free(persistent_device_result_buffer); + } +} + +CachedCudaGraph::CachedCudaGraph(CachedCudaGraph &&other) noexcept + : graph_exec(other.graph_exec), + persistent_device_arg_buffer(other.persistent_device_arg_buffer), + persistent_device_result_buffer(other.persistent_device_result_buffer), + persistent_ctx(other.persistent_ctx), + arg_buffer_size(other.arg_buffer_size), + result_buffer_size(other.result_buffer_size) { + other.graph_exec = nullptr; + other.persistent_device_arg_buffer = nullptr; + other.persistent_device_result_buffer = nullptr; +} + +CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { + if (this != &other) { + if (graph_exec) + CUDADriver::get_instance().graph_exec_destroy(graph_exec); + if (persistent_device_arg_buffer) + CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); + if (persistent_device_result_buffer) + CUDADriver::get_instance().mem_free(persistent_device_result_buffer); + + graph_exec = other.graph_exec; + 
persistent_device_arg_buffer = other.persistent_device_arg_buffer; + persistent_device_result_buffer = other.persistent_device_result_buffer; + persistent_ctx = other.persistent_ctx; + arg_buffer_size = other.arg_buffer_size; + result_buffer_size = other.result_buffer_size; + + other.graph_exec = nullptr; + other.persistent_device_arg_buffer = nullptr; + other.persistent_device_result_buffer = nullptr; + } + return *this; +} + bool KernelLauncher::on_cuda_device(void *ptr) { unsigned int attr_val = 0; uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( @@ -12,9 +62,154 @@ bool KernelLauncher::on_cuda_device(void *ptr) { return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; } +void KernelLauncher::launch_llvm_kernel_graph(Handle handle, + LaunchContextBuilder &ctx) { + int launch_id = handle.get_launch_id(); + auto it = cuda_graph_cache_.find(launch_id); + + if (it != cuda_graph_cache_.end()) { + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(it->second.graph_exec, stream); + return; + } + + auto &launcher_ctx = contexts_[launch_id]; + auto *executor = get_runtime_executor(); + auto *cuda_module = launcher_ctx.jit_module; + const auto ¶meters = *launcher_ctx.parameters; + const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; + + if (offloaded_tasks.size() < 2) { + // Not worth graphing a single kernel — fall through to normal launch. + // We signal this by setting it = end and letting the caller handle it. + // Actually, just do the normal launch inline for simplicity. + // This path should not be reached because launch_llvm_kernel checks. 
+ QD_WARN("CUDA graph requested for single-task kernel; falling back."); + } + + CUDAContext::get_instance().make_current(); + + CachedCudaGraph cached; + + // --- Resolve ndarray device pointers (same as normal path) --- + for (int i = 0; i < (int)parameters.size(); i++) { + const auto &kv = parameters[i]; + const auto &arg_id = kv.first; + const auto ¶meter = kv.second; + if (parameter.is_array) { + const auto arr_sz = ctx.array_runtime_sizes[arg_id]; + if (arr_sz == 0) + continue; + + ArgArrayPtrKey data_ptr_idx{arg_id, TypeFactory::DATA_PTR_POS_IN_NDARRAY}; + ArgArrayPtrKey grad_ptr_idx{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}; + auto data_ptr = ctx.array_ptrs[data_ptr_idx]; + auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; + + if (ctx.device_allocation_type[arg_id] == + LaunchContextBuilder::DevAllocType::kNone) { + QD_ERROR_IF(!on_cuda_device(data_ptr), + "CUDA graph mode does not support host external arrays"); + ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); + } else if (arr_sz > 0) { + DeviceAllocation *ptr = static_cast(data_ptr); + void *dev_data = executor->get_device_alloc_info_ptr(*ptr); + void *dev_grad = nullptr; + if (grad_ptr) { + dev_grad = executor->get_device_alloc_info_ptr( + *static_cast(grad_ptr)); + } + ctx.set_ndarray_ptrs(arg_id, (uint64)dev_data, (uint64)dev_grad); + } + } + } + + // --- Allocate persistent buffers --- + cached.result_buffer_size = std::max(ctx.result_buffer_size, sizeof(uint64)); + CUDADriver::get_instance().malloc( + (void **)&cached.persistent_device_result_buffer, + cached.result_buffer_size); + + cached.arg_buffer_size = ctx.arg_buffer_size; + if (cached.arg_buffer_size > 0) { + CUDADriver::get_instance().malloc( + (void **)&cached.persistent_device_arg_buffer, cached.arg_buffer_size); + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); + } + + // --- Build persistent RuntimeContext --- + 
cached.persistent_ctx.runtime = executor->get_llvm_runtime(); + cached.persistent_ctx.arg_buffer = cached.persistent_device_arg_buffer; + cached.persistent_ctx.result_buffer = + (uint64 *)cached.persistent_device_result_buffer; + cached.persistent_ctx.cpu_thread_id = 0; + + // --- Build CUDA graph --- + void *graph = nullptr; + CUDADriver::get_instance().graph_create(&graph, 0); + + void *prev_node = nullptr; + for (const auto &task : offloaded_tasks) { + void *func = cuda_module->lookup_function(task.name); + + void *ctx_ptr = &cached.persistent_ctx; + CudaKernelNodeParams node_params{}; + node_params.func = func; + node_params.gridDimX = (unsigned int)task.grid_dim; + node_params.gridDimY = 1; + node_params.gridDimZ = 1; + node_params.blockDimX = (unsigned int)task.block_dim; + node_params.blockDimY = 1; + node_params.blockDimZ = 1; + node_params.sharedMemBytes = + (unsigned int)task.dynamic_shared_array_bytes; + node_params.kernelParams = &ctx_ptr; + node_params.extra = nullptr; + + void *node = nullptr; + const void *deps = prev_node; + std::size_t num_deps = prev_node ? 1 : 0; + CUDADriver::get_instance().graph_add_kernel_node( + &node, graph, prev_node ? 
&deps : nullptr, num_deps, &node_params); + prev_node = node; + } + + // --- Instantiate and launch --- + CUDADriver::get_instance().graph_instantiate( + &cached.graph_exec, graph, nullptr, nullptr, 0); + + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + + CUDADriver::get_instance().graph_destroy(graph); + + QD_TRACE("CUDA graph created with {} kernel nodes for launch_id={}", + offloaded_tasks.size(), launch_id); + + cuda_graph_cache_.emplace(launch_id, std::move(cached)); +} + void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) { QD_ASSERT(handle.get_launch_id() < contexts_.size()); + + if (!use_cuda_graph_checked_) { + const char *env = std::getenv("QD_CUDA_GRAPH"); + use_cuda_graph_ = env != nullptr && std::string(env) == "1"; + use_cuda_graph_checked_ = true; + } + + if (use_cuda_graph_) { + auto &offloaded_tasks = + contexts_[handle.get_launch_id()].offloaded_tasks; + if (offloaded_tasks.size() >= 2) { + launch_llvm_kernel_graph(handle, ctx); + return; + } + } + auto launcher_ctx = contexts_[handle.get_launch_id()]; auto *executor = get_runtime_executor(); auto *cuda_module = launcher_ctx.jit_module; @@ -23,24 +218,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, CUDAContext::get_instance().make_current(); - // |transfers| is only used for external arrays whose data is originally on - // host. They are first transferred onto device and that device pointer is - // stored in |device_ptrs| below. |transfers| saves its original pointer so - // that we can copy the data back once kernel finishes. as well as the - // temporary device allocations, which can be freed after kernel finishes. Key - // is [arg_id, ptr_pos], where ptr_pos is TypeFactory::DATA_PTR_POS_IN_NDARRAY - // for data_ptr and TypeFactory::GRAD_PTR_POS_IN_NDARRAY for grad_ptr. Value - // is [host_ptr, temporary_device_alloc]. 
Invariant: temp_devallocs.size() != - // 0 <==> transfer happened. std::unordered_map, ArgArrayPtrKeyHasher> transfers; - // |device_ptrs| stores pointers on device for all arrays args, including - // external arrays and ndarrays, no matter whether the data is originally on - // device or host. - // This is the source of truth for us to look for device pointers used in CUDA - // kernels. std::unordered_map device_ptrs; char *device_result_buffer{nullptr}; @@ -55,9 +236,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, const auto ¶meter = kv.second; if (parameter.is_array) { const auto arr_sz = ctx.array_runtime_sizes[arg_id]; - // Note: both numpy and PyTorch support arrays/tensors with zeros - // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes - // `arr_sz` zero. if (arr_sz == 0) { continue; } @@ -69,10 +247,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { - // External array - // Note: assuming both data & grad are on the same device if (on_cuda_device(data_ptr)) { - // data_ptr is a raw ptr on CUDA device device_ptrs[data_ptr_idx] = data_ptr; device_ptrs[grad_ptr_idx] = grad_ptr; } else { @@ -102,9 +277,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); } else if (arr_sz > 0) { - // Ndarray DeviceAllocation *ptr = static_cast(data_ptr); - // Unwrapped raw ptr on device device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); if (grad_ptr != nullptr) { @@ -152,7 +325,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, nullptr); } CUDADriver::get_instance().mem_free_async(device_result_buffer, nullptr); - // copy data back to host if (transfers.size() > 0) { CUDADriver::get_instance().stream_synchronize(nullptr); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { diff --git 
a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 439408ed55..9665c32014 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -1,11 +1,43 @@ #pragma once +#include + #include "quadrants/codegen/llvm/compiled_kernel_data.h" +#include "quadrants/program/context.h" #include "quadrants/runtime/llvm/kernel_launcher.h" namespace quadrants::lang { namespace cuda { +struct CudaKernelNodeParams { + void *func; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + void **kernelParams; + void **extra; +}; + +struct CachedCudaGraph { + void *graph_exec{nullptr}; + char *persistent_device_arg_buffer{nullptr}; + char *persistent_device_result_buffer{nullptr}; + RuntimeContext persistent_ctx{}; + std::size_t arg_buffer_size{0}; + std::size_t result_buffer_size{0}; + + CachedCudaGraph() = default; + ~CachedCudaGraph(); + CachedCudaGraph(const CachedCudaGraph &) = delete; + CachedCudaGraph &operator=(const CachedCudaGraph &) = delete; + CachedCudaGraph(CachedCudaGraph &&other) noexcept; + CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; +}; + class KernelLauncher : public LLVM::KernelLauncher { using Base = LLVM::KernelLauncher; @@ -24,7 +56,11 @@ class KernelLauncher : public LLVM::KernelLauncher { private: bool on_cuda_device(void *ptr); + void launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); std::vector contexts_; + std::unordered_map cuda_graph_cache_; + bool use_cuda_graph_{false}; + bool use_cuda_graph_checked_{false}; }; } // namespace cuda diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py new file mode 100644 index 0000000000..27591e2f38 --- /dev/null +++ b/tests/python/test_cuda_graph.py @@ -0,0 +1,89 @@ +import os + +os.environ["QD_CUDA_GRAPH"] = "1" + +import numpy as np + +import quadrants 
as qd +from tests import test_utils + + +@test_utils.test(arch=[qd.cuda]) +def test_cuda_graph_two_loops(): + """A kernel with two top-level for loops should be fused into a CUDA graph.""" + n = 1024 + x = qd.ndarray(qd.f32, shape=(n,)) + y = qd.ndarray(qd.f32, shape=(n,)) + + @qd.kernel + def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + + two_loops(x, y) + two_loops(x, y) + two_loops(x, y) + + x_np = x.to_numpy() + y_np = y.to_numpy() + assert np.allclose(x_np, 3.0), f"Expected 3.0, got {x_np[:5]}" + assert np.allclose(y_np, 6.0), f"Expected 6.0, got {y_np[:5]}" + + +@test_utils.test(arch=[qd.cuda]) +def test_cuda_graph_three_loops(): + """A kernel with three top-level for loops.""" + n = 512 + a = qd.ndarray(qd.f32, shape=(n,)) + b = qd.ndarray(qd.f32, shape=(n,)) + c = qd.ndarray(qd.f32, shape=(n,)) + + @qd.kernel + def three_loops(a: qd.types.ndarray(qd.f32, ndim=1), + b: qd.types.ndarray(qd.f32, ndim=1), + c: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(a.shape[0]): + a[i] = a[i] + 1.0 + for i in range(b.shape[0]): + b[i] = b[i] + 10.0 + for i in range(c.shape[0]): + c[i] = a[i] + b[i] + + three_loops(a, b, c) + + a_np = a.to_numpy() + b_np = b.to_numpy() + c_np = c.to_numpy() + assert np.allclose(a_np, 1.0) + assert np.allclose(b_np, 10.0) + assert np.allclose(c_np, 11.0) + + three_loops(a, b, c) + + a_np = a.to_numpy() + b_np = b.to_numpy() + c_np = c.to_numpy() + assert np.allclose(a_np, 2.0) + assert np.allclose(b_np, 20.0) + assert np.allclose(c_np, 22.0) + + +@test_utils.test(arch=[qd.cuda]) +def test_cuda_graph_single_loop_no_graph(): + """A kernel with a single for loop should NOT use the graph path.""" + n = 256 + x = qd.ndarray(qd.f32, shape=(n,)) + + @qd.kernel + def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 5.0 + + single_loop(x) + single_loop(x) 
+ + x_np = x.to_numpy() + assert np.allclose(x_np, 10.0) From 49ce3c1ea2fe27f02ff78fb16351d9fcc209a72f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 28 Feb 2026 22:14:09 -0500 Subject: [PATCH 002/128] bug fixes for cuda graph --- quadrants/runtime/cuda/kernel_launcher.cpp | 40 +++++++++++++++------- quadrants/runtime/cuda/kernel_launcher.h | 3 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 75a3c909f5..9b7cca8751 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -62,7 +62,7 @@ bool KernelLauncher::on_cuda_device(void *ptr) { return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; } -void KernelLauncher::launch_llvm_kernel_graph(Handle handle, +bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx) { int launch_id = handle.get_launch_id(); auto it = cuda_graph_cache_.find(launch_id); @@ -70,7 +70,7 @@ void KernelLauncher::launch_llvm_kernel_graph(Handle handle, if (it != cuda_graph_cache_.end()) { auto *stream = CUDAContext::get_instance().get_stream(); CUDADriver::get_instance().graph_launch(it->second.graph_exec, stream); - return; + return true; } auto &launcher_ctx = contexts_[launch_id]; @@ -80,11 +80,29 @@ void KernelLauncher::launch_llvm_kernel_graph(Handle handle, const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; if (offloaded_tasks.size() < 2) { - // Not worth graphing a single kernel — fall through to normal launch. - // We signal this by setting it = end and letting the caller handle it. - // Actually, just do the normal launch inline for simplicity. - // This path should not be reached because launch_llvm_kernel checks. 
- QD_WARN("CUDA graph requested for single-task kernel; falling back."); + return false; + } + + // Pre-check: bail out if any array argument is a host external array, + // since CUDA graphs require stable device pointers. + for (int i = 0; i < (int)parameters.size(); i++) { + const auto &kv = parameters[i]; + const auto &arg_id = kv.first; + const auto ¶meter = kv.second; + if (parameter.is_array) { + const auto arr_sz = ctx.array_runtime_sizes[arg_id]; + if (arr_sz == 0) + continue; + if (ctx.device_allocation_type[arg_id] == + LaunchContextBuilder::DevAllocType::kNone) { + ArgArrayPtrKey data_ptr_idx{arg_id, + TypeFactory::DATA_PTR_POS_IN_NDARRAY}; + auto data_ptr = ctx.array_ptrs[data_ptr_idx]; + if (!on_cuda_device(data_ptr)) { + return false; + } + } + } } CUDAContext::get_instance().make_current(); @@ -108,8 +126,6 @@ void KernelLauncher::launch_llvm_kernel_graph(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { - QD_ERROR_IF(!on_cuda_device(data_ptr), - "CUDA graph mode does not support host external arrays"); ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); @@ -189,6 +205,7 @@ void KernelLauncher::launch_llvm_kernel_graph(Handle handle, offloaded_tasks.size(), launch_id); cuda_graph_cache_.emplace(launch_id, std::move(cached)); + return true; } void KernelLauncher::launch_llvm_kernel(Handle handle, @@ -202,10 +219,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } if (use_cuda_graph_) { - auto &offloaded_tasks = - contexts_[handle.get_launch_id()].offloaded_tasks; - if (offloaded_tasks.size() >= 2) { - launch_llvm_kernel_graph(handle, ctx); + if (launch_llvm_kernel_graph(handle, ctx)) { return; } } diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 9665c32014..6a7e674582 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ 
b/quadrants/runtime/cuda/kernel_launcher.h @@ -3,7 +3,6 @@ #include #include "quadrants/codegen/llvm/compiled_kernel_data.h" -#include "quadrants/program/context.h" #include "quadrants/runtime/llvm/kernel_launcher.h" namespace quadrants::lang { @@ -56,7 +55,7 @@ class KernelLauncher : public LLVM::KernelLauncher { private: bool on_cuda_device(void *ptr); - void launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); + bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); std::vector contexts_; std::unordered_map cuda_graph_cache_; bool use_cuda_graph_{false}; From 9c32a280423ca14001d849f81e81c074a507b479 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 28 Feb 2026 22:45:45 -0500 Subject: [PATCH 003/128] Add per-kernel @qd.kernel(cuda_graph=True) API Replace the global QD_CUDA_GRAPH=1 env var with a per-kernel opt-in. The flag flows from the Python decorator through LaunchContextBuilder to the CUDA kernel launcher, avoiding interference with internal kernels like ndarray_to_ext_arr. Made-with: Cursor --- python/quadrants/lang/kernel.py | 2 ++ python/quadrants/lang/kernel_impl.py | 20 +++++++++--- quadrants/program/launch_context_builder.h | 1 + quadrants/python/export_lang.cpp | 3 +- quadrants/runtime/cuda/kernel_launcher.cpp | 9 +----- quadrants/runtime/cuda/kernel_launcher.h | 2 -- tests/python/test_cuda_graph.py | 37 +++++++++++++++++----- 7 files changed, 50 insertions(+), 24 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index af6dbdacb5..2c95dc4741 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -291,6 +291,7 @@ def __init__(self, _func: Callable, autodiff_mode: AutodiffMode, _is_classkernel # and front-end IR, but not necessarily any further. 
self.materialized_kernels: dict[CompiledKernelKeyType, KernelCxx] = {} self.has_print = False + self.use_cuda_graph: bool = False self.quadrants_callable: QuadrantsCallable | None = None self.visited_functions: set[FunctionSourceInfo] = set() self.kernel_function_info: FunctionSourceInfo | None = None @@ -503,6 +504,7 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled ) self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data + launch_ctx.use_cuda_graph = self.use_cuda_graph prog.launch_kernel(compiled_kernel_data, launch_ctx) except Exception as e: e = handle_exception_from_cpp(e) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index a52df1f262..0e511578cf 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -123,7 +123,9 @@ def _inside_class(level_of_class_stackframe: int) -> bool: return False -def _kernel_impl(_func: Callable, level_of_class_stackframe: int, verbose: bool = False) -> QuadrantsCallable: +def _kernel_impl( + _func: Callable, level_of_class_stackframe: int, verbose: bool = False, cuda_graph: bool = False +) -> QuadrantsCallable: # Can decorators determine if a function is being defined inside a class? # https://stackoverflow.com/a/8793684/12003165 is_classkernel = _inside_class(level_of_class_stackframe + 1) @@ -132,6 +134,8 @@ def _kernel_impl(_func: Callable, level_of_class_stackframe: int, verbose: bool print(f"kernel={_func.__name__} is_classkernel={is_classkernel}") primal = Kernel(_func, autodiff_mode=_NONE, _is_classkernel=is_classkernel) adjoint = Kernel(_func, autodiff_mode=_REVERSE, _is_classkernel=is_classkernel) + primal.use_cuda_graph = cuda_graph + adjoint.use_cuda_graph = cuda_graph # Having |primal| contains |grad| makes the tape work. 
primal.grad = adjoint @@ -173,7 +177,7 @@ def wrapped_classkernel(*args, **kwargs): @overload # TODO: This callable should be Callable[[F], F]. # See comments below. -def kernel(_fn: None = None, *, pure: bool = False) -> Callable[[Any], Any]: ... +def kernel(_fn: None = None, *, pure: bool = False, cuda_graph: bool = False) -> Callable[[Any], Any]: ... # TODO: This next overload should return F, but currently that will cause issues @@ -183,10 +187,16 @@ def kernel(_fn: None = None, *, pure: bool = False) -> Callable[[Any], Any]: ... # However, by making it return Any, we can make the pure parameter # change now, without breaking pyright. @overload -def kernel(_fn: Any, *, pure: bool = False) -> Any: ... +def kernel(_fn: Any, *, pure: bool = False, cuda_graph: bool = False) -> Any: ... -def kernel(_fn: Callable[..., typing.Any] | None = None, *, pure: bool | None = None, fastcache: bool = False): +def kernel( + _fn: Callable[..., typing.Any] | None = None, + *, + pure: bool | None = None, + fastcache: bool = False, + cuda_graph: bool = False, +): """ Marks a function as a Quadrants kernel. @@ -215,7 +225,7 @@ def decorator(fn: F, has_kernel_params: bool = True) -> F: else: level = 4 - wrapped = _kernel_impl(fn, level_of_class_stackframe=level) + wrapped = _kernel_impl(fn, level_of_class_stackframe=level, cuda_graph=cuda_graph) wrapped.is_pure = pure is not None and pure or fastcache if pure is not None: warnings_helper.warn_once( diff --git a/quadrants/program/launch_context_builder.h b/quadrants/program/launch_context_builder.h index 84528a8439..91a2590b0b 100644 --- a/quadrants/program/launch_context_builder.h +++ b/quadrants/program/launch_context_builder.h @@ -150,6 +150,7 @@ class LaunchContextBuilder { size_t arg_buffer_size{0}; const StructType *args_type{nullptr}; size_t result_buffer_size{0}; + bool use_cuda_graph{false}; // Note that I've tried to group `array_runtime_size` and // `is_device_allocations` into a small struct. 
However, it caused some test diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index b3d23c0037..d155162667 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -659,7 +659,8 @@ void export_lang(py::module &m) { &LaunchContextBuilder::set_args_ndarray_with_grad) .def("get_struct_ret_int", &LaunchContextBuilder::get_struct_ret_int) .def("get_struct_ret_uint", &LaunchContextBuilder::get_struct_ret_uint) - .def("get_struct_ret_float", &LaunchContextBuilder::get_struct_ret_float); + .def("get_struct_ret_float", &LaunchContextBuilder::get_struct_ret_float) + .def_readwrite("use_cuda_graph", &LaunchContextBuilder::use_cuda_graph); py::class_(m, "Function") .def("insert_scalar_param", &Function::insert_scalar_param) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 9b7cca8751..923b8edc84 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,7 +1,6 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" -#include #include namespace quadrants::lang { @@ -212,13 +211,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) { QD_ASSERT(handle.get_launch_id() < contexts_.size()); - if (!use_cuda_graph_checked_) { - const char *env = std::getenv("QD_CUDA_GRAPH"); - use_cuda_graph_ = env != nullptr && std::string(env) == "1"; - use_cuda_graph_checked_ = true; - } - - if (use_cuda_graph_) { + if (ctx.use_cuda_graph) { if (launch_llvm_kernel_graph(handle, ctx)) { return; } diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 6a7e674582..3142c1d441 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -58,8 +58,6 @@ class KernelLauncher : public LLVM::KernelLauncher { bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder 
&ctx); std::vector contexts_; std::unordered_map cuda_graph_cache_; - bool use_cuda_graph_{false}; - bool use_cuda_graph_checked_{false}; }; } // namespace cuda diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 27591e2f38..f7295349cc 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -1,7 +1,3 @@ -import os - -os.environ["QD_CUDA_GRAPH"] = "1" - import numpy as np import quadrants as qd @@ -15,7 +11,7 @@ def test_cuda_graph_two_loops(): x = qd.ndarray(qd.f32, shape=(n,)) y = qd.ndarray(qd.f32, shape=(n,)) - @qd.kernel + @qd.kernel(cuda_graph=True) def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): for i in range(x.shape[0]): @@ -41,7 +37,7 @@ def test_cuda_graph_three_loops(): b = qd.ndarray(qd.f32, shape=(n,)) c = qd.ndarray(qd.f32, shape=(n,)) - @qd.kernel + @qd.kernel(cuda_graph=True) def three_loops(a: qd.types.ndarray(qd.f32, ndim=1), b: qd.types.ndarray(qd.f32, ndim=1), c: qd.types.ndarray(qd.f32, ndim=1)): @@ -73,11 +69,12 @@ def three_loops(a: qd.types.ndarray(qd.f32, ndim=1), @test_utils.test(arch=[qd.cuda]) def test_cuda_graph_single_loop_no_graph(): - """A kernel with a single for loop should NOT use the graph path.""" + """A kernel with a single for loop should NOT use the graph path, + even with cuda_graph=True (falls back since < 2 tasks).""" n = 256 x = qd.ndarray(qd.f32, shape=(n,)) - @qd.kernel + @qd.kernel(cuda_graph=True) def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): for i in range(x.shape[0]): x[i] = x[i] + 5.0 @@ -87,3 +84,27 @@ def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): x_np = x.to_numpy() assert np.allclose(x_np, 10.0) + + +@test_utils.test(arch=[qd.cuda]) +def test_no_cuda_graph_annotation(): + """A kernel WITHOUT cuda_graph=True should never use the graph path.""" + n = 256 + x = qd.ndarray(qd.f32, shape=(n,)) + y = qd.ndarray(qd.f32, shape=(n,)) + + @qd.kernel + def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), + 
y: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + + two_loops(x, y) + two_loops(x, y) + + x_np = x.to_numpy() + y_np = y.to_numpy() + assert np.allclose(x_np, 2.0) + assert np.allclose(y_np, 4.0) From cffb9ae568fc7b2471f8f59b72bd08b6072ca8f7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 28 Feb 2026 22:50:09 -0500 Subject: [PATCH 004/128] Add cross-platform test for cuda_graph=True annotation Verify that cuda_graph=True is a harmless no-op on non-CUDA backends (tested on x64/CPU). Passes on both x64 and CUDA. Made-with: Cursor --- tests/python/test_cuda_graph.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index f7295349cc..7c43d24c4e 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -108,3 +108,27 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y_np = y.to_numpy() assert np.allclose(x_np, 2.0) assert np.allclose(y_np, 4.0) + + +@test_utils.test() +def test_cuda_graph_annotation_cross_platform(): + """cuda_graph=True should be a harmless no-op on non-CUDA backends.""" + n = 256 + x = qd.ndarray(qd.f32, shape=(n,)) + y = qd.ndarray(qd.f32, shape=(n,)) + + @qd.kernel(cuda_graph=True) + def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + + two_loops(x, y) + two_loops(x, y) + + x_np = x.to_numpy() + y_np = y.to_numpy() + assert np.allclose(x_np, 2.0), f"Expected 2.0, got {x_np[:5]}" + assert np.allclose(y_np, 4.0), f"Expected 4.0, got {y_np[:5]}" From ed1cff94c1a2a8a27ef2dd8bc7fa06adf1f2f3c7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 28 Feb 2026 23:03:56 -0500 Subject: [PATCH 005/128] Handle argument changes in CUDA graph replay On each graph replay, re-resolve ndarray device pointers 
and re-upload the arg buffer to the persistent device buffer. This ensures correct results when the kernel is called with different ndarrays after the graph was first captured. Refactored ndarray pointer resolution into resolve_ctx_ndarray_ptrs(). Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 91 +++++++++++----------- quadrants/runtime/cuda/kernel_launcher.h | 3 + tests/python/test_cuda_graph.py | 40 ++++++++++ 3 files changed, 87 insertions(+), 47 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 923b8edc84..70f5b17805 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -61,54 +61,10 @@ bool KernelLauncher::on_cuda_device(void *ptr) { return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; } -bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, - LaunchContextBuilder &ctx) { - int launch_id = handle.get_launch_id(); - auto it = cuda_graph_cache_.find(launch_id); - - if (it != cuda_graph_cache_.end()) { - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(it->second.graph_exec, stream); - return true; - } - - auto &launcher_ctx = contexts_[launch_id]; +bool KernelLauncher::resolve_ctx_ndarray_ptrs( + LaunchContextBuilder &ctx, + const std::vector> ¶meters) { auto *executor = get_runtime_executor(); - auto *cuda_module = launcher_ctx.jit_module; - const auto ¶meters = *launcher_ctx.parameters; - const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; - - if (offloaded_tasks.size() < 2) { - return false; - } - - // Pre-check: bail out if any array argument is a host external array, - // since CUDA graphs require stable device pointers. 
- for (int i = 0; i < (int)parameters.size(); i++) { - const auto &kv = parameters[i]; - const auto &arg_id = kv.first; - const auto ¶meter = kv.second; - if (parameter.is_array) { - const auto arr_sz = ctx.array_runtime_sizes[arg_id]; - if (arr_sz == 0) - continue; - if (ctx.device_allocation_type[arg_id] == - LaunchContextBuilder::DevAllocType::kNone) { - ArgArrayPtrKey data_ptr_idx{arg_id, - TypeFactory::DATA_PTR_POS_IN_NDARRAY}; - auto data_ptr = ctx.array_ptrs[data_ptr_idx]; - if (!on_cuda_device(data_ptr)) { - return false; - } - } - } - } - - CUDAContext::get_instance().make_current(); - - CachedCudaGraph cached; - - // --- Resolve ndarray device pointers (same as normal path) --- for (int i = 0; i < (int)parameters.size(); i++) { const auto &kv = parameters[i]; const auto &arg_id = kv.first; @@ -125,6 +81,9 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { + if (!on_cuda_device(data_ptr)) { + return false; + } ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); @@ -138,6 +97,44 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, } } } + return true; +} + +bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, + LaunchContextBuilder &ctx) { + int launch_id = handle.get_launch_id(); + + auto &launcher_ctx = contexts_[launch_id]; + const auto ¶meters = *launcher_ctx.parameters; + const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; + + if (offloaded_tasks.size() < 2) { + return false; + } + + if (!resolve_ctx_ndarray_ptrs(ctx, parameters)) { + return false; + } + + auto it = cuda_graph_cache_.find(launch_id); + if (it != cuda_graph_cache_.end()) { + auto &cached = it->second; + if (ctx.arg_buffer_size > 0) { + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + 
cached.arg_buffer_size); + } + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + return true; + } + + CUDAContext::get_instance().make_current(); + + auto *executor = get_runtime_executor(); + auto *cuda_module = launcher_ctx.jit_module; + + CachedCudaGraph cached; // --- Allocate persistent buffers --- cached.result_buffer_size = std::max(ctx.result_buffer_size, sizeof(uint64)); diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 3142c1d441..4e063bd1c6 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -55,6 +55,9 @@ class KernelLauncher : public LLVM::KernelLauncher { private: bool on_cuda_device(void *ptr); + bool resolve_ctx_ndarray_ptrs( + LaunchContextBuilder &ctx, + const std::vector> ¶meters); bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); std::vector contexts_; std::unordered_map cuda_graph_cache_; diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 7c43d24c4e..7b9bd93e06 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -110,6 +110,46 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), assert np.allclose(y_np, 4.0) +@test_utils.test(arch=[qd.cuda]) +def test_cuda_graph_changed_args(): + """Graph should produce correct results when called with different ndarrays.""" + n = 256 + + @qd.kernel(cuda_graph=True) + def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + + x1 = qd.ndarray(qd.f32, shape=(n,)) + y1 = qd.ndarray(qd.f32, shape=(n,)) + two_loops(x1, y1) + two_loops(x1, y1) + + x1_np = x1.to_numpy() + y1_np = y1.to_numpy() + assert np.allclose(x1_np, 2.0), f"Expected 2.0, got {x1_np[:5]}" + assert np.allclose(y1_np, 4.0), f"Expected 4.0, got 
{y1_np[:5]}" + + x2 = qd.ndarray(qd.f32, shape=(n,)) + y2 = qd.ndarray(qd.f32, shape=(n,)) + x2.from_numpy(np.full(n, 10.0, dtype=np.float32)) + y2.from_numpy(np.full(n, 20.0, dtype=np.float32)) + two_loops(x2, y2) + + x2_np = x2.to_numpy() + y2_np = y2.to_numpy() + assert np.allclose(x2_np, 11.0), f"Expected 11.0, got {x2_np[:5]}" + assert np.allclose(y2_np, 22.0), f"Expected 22.0, got {y2_np[:5]}" + + x1_np = x1.to_numpy() + y1_np = y1.to_numpy() + assert np.allclose(x1_np, 2.0), f"x1 should be unchanged, got {x1_np[:5]}" + assert np.allclose(y1_np, 4.0), f"y1 should be unchanged, got {y1_np[:5]}" + + @test_utils.test() def test_cuda_graph_annotation_cross_platform(): """cuda_graph=True should be a harmless no-op on non-CUDA backends.""" From 85dc8dbeea1bc22a288a033a2e29096cb6aea804 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 10:28:36 -0700 Subject: [PATCH 006/128] Fix formatting and disable cuda_graph on adjoint kernels Apply lint formatting fixes (clang-format, ruff) and remove cuda_graph flag from autodiff adjoint kernel until the interaction with reverse-mode AD is validated. --- python/quadrants/lang/kernel_impl.py | 1 - quadrants/runtime/cuda/kernel_launcher.cpp | 3 +-- tests/python/test_cuda_graph.py | 19 ++++++++----------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index 0e511578cf..8002a5a341 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -135,7 +135,6 @@ def _kernel_impl( primal = Kernel(_func, autodiff_mode=_NONE, _is_classkernel=is_classkernel) adjoint = Kernel(_func, autodiff_mode=_REVERSE, _is_classkernel=is_classkernel) primal.use_cuda_graph = cuda_graph - adjoint.use_cuda_graph = cuda_graph # Having |primal| contains |grad| makes the tape work. 
primal.grad = adjoint diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 70f5b17805..1de6d3e529 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -175,8 +175,7 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, node_params.blockDimX = (unsigned int)task.block_dim; node_params.blockDimY = 1; node_params.blockDimZ = 1; - node_params.sharedMemBytes = - (unsigned int)task.dynamic_shared_array_bytes; + node_params.sharedMemBytes = (unsigned int)task.dynamic_shared_array_bytes; node_params.kernelParams = &ctx_ptr; node_params.extra = nullptr; diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 7b9bd93e06..3bdef991ef 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -1,6 +1,7 @@ import numpy as np import quadrants as qd + from tests import test_utils @@ -12,8 +13,7 @@ def test_cuda_graph_two_loops(): y = qd.ndarray(qd.f32, shape=(n,)) @qd.kernel(cuda_graph=True) - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), - y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): @@ -38,9 +38,9 @@ def test_cuda_graph_three_loops(): c = qd.ndarray(qd.f32, shape=(n,)) @qd.kernel(cuda_graph=True) - def three_loops(a: qd.types.ndarray(qd.f32, ndim=1), - b: qd.types.ndarray(qd.f32, ndim=1), - c: qd.types.ndarray(qd.f32, ndim=1)): + def three_loops( + a: qd.types.ndarray(qd.f32, ndim=1), b: qd.types.ndarray(qd.f32, ndim=1), c: qd.types.ndarray(qd.f32, ndim=1) + ): for i in range(a.shape[0]): a[i] = a[i] + 1.0 for i in range(b.shape[0]): @@ -94,8 +94,7 @@ def test_no_cuda_graph_annotation(): y = qd.ndarray(qd.f32, shape=(n,)) @qd.kernel - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), - y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: 
qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): @@ -116,8 +115,7 @@ def test_cuda_graph_changed_args(): n = 256 @qd.kernel(cuda_graph=True) - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), - y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): @@ -158,8 +156,7 @@ def test_cuda_graph_annotation_cross_platform(): y = qd.ndarray(qd.f32, shape=(n,)) @qd.kernel(cuda_graph=True) - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), - y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): From d9ca32aadc275b428e34e1611eda4c528dc5478b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 28 Feb 2026 23:40:07 -0500 Subject: [PATCH 007/128] Add graph_while conditional nodes for GPU-side iteration loops Implements @qd.kernel(graph_while='flag_arg') which wraps the kernel offloaded tasks in a CUDA conditional while node (requires SM 9.0+). The named argument is a scalar i32 ndarray on device; the loop continues while its value is non-zero. Key implementation details: - Condition kernel compiled as PTX and JIT-linked with libcudadevrt.a at runtime to access cudaGraphSetConditional device function - CU_GRAPH_COND_ASSIGN_DEFAULT flag ensures handle is reset each launch - Works with both counter-based (decrement to 0) and boolean flag (set to 0 when done) patterns - graph_while implicitly enables cuda_graph=True Tests: counter, boolean done flag, multiple loops, graph replay. 
--- python/quadrants/lang/kernel.py | 10 + python/quadrants/lang/kernel_impl.py | 14 +- quadrants/program/launch_context_builder.h | 2 + quadrants/python/export_lang.cpp | 3 +- .../rhi/cuda/cuda_driver_functions.inc.h | 10 + quadrants/runtime/cuda/kernel_launcher.cpp | 186 +++++++++++++++++- quadrants/runtime/cuda/kernel_launcher.h | 21 ++ tests/python/test_cuda_graph_while.py | 121 ++++++++++++ 8 files changed, 353 insertions(+), 14 deletions(-) create mode 100644 tests/python/test_cuda_graph_while.py diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 2c95dc4741..c0c8d40732 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -292,6 +292,7 @@ def __init__(self, _func: Callable, autodiff_mode: AutodiffMode, _is_classkernel self.materialized_kernels: dict[CompiledKernelKeyType, KernelCxx] = {} self.has_print = False self.use_cuda_graph: bool = False + self.graph_while_arg: str | None = None self.quadrants_callable: QuadrantsCallable | None = None self.visited_functions: set[FunctionSourceInfo] = set() self.kernel_function_info: FunctionSourceInfo | None = None @@ -505,6 +506,15 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_cuda_graph = self.use_cuda_graph + if self.graph_while_arg is not None: + non_template_idx = 0 + for meta in self.arg_metas: + if meta.annotation is template or isinstance(meta.annotation, template): + continue + if meta.name == self.graph_while_arg: + launch_ctx.graph_while_arg_id = non_template_idx + break + non_template_idx += 1 prog.launch_kernel(compiled_kernel_data, launch_ctx) except Exception as e: e = handle_exception_from_cpp(e) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index 8002a5a341..d3e7976b9d 100644 --- a/python/quadrants/lang/kernel_impl.py +++ 
b/python/quadrants/lang/kernel_impl.py @@ -124,17 +124,22 @@ def _inside_class(level_of_class_stackframe: int) -> bool: def _kernel_impl( - _func: Callable, level_of_class_stackframe: int, verbose: bool = False, cuda_graph: bool = False + _func: Callable, level_of_class_stackframe: int, verbose: bool = False, + cuda_graph: bool = False, graph_while: str | None = None, ) -> QuadrantsCallable: # Can decorators determine if a function is being defined inside a class? # https://stackoverflow.com/a/8793684/12003165 is_classkernel = _inside_class(level_of_class_stackframe + 1) + if graph_while is not None: + cuda_graph = True + if verbose: print(f"kernel={_func.__name__} is_classkernel={is_classkernel}") primal = Kernel(_func, autodiff_mode=_NONE, _is_classkernel=is_classkernel) adjoint = Kernel(_func, autodiff_mode=_REVERSE, _is_classkernel=is_classkernel) primal.use_cuda_graph = cuda_graph + primal.graph_while_arg = graph_while # Having |primal| contains |grad| makes the tape work. primal.grad = adjoint @@ -176,7 +181,7 @@ def wrapped_classkernel(*args, **kwargs): @overload # TODO: This callable should be Callable[[F], F]. # See comments below. -def kernel(_fn: None = None, *, pure: bool = False, cuda_graph: bool = False) -> Callable[[Any], Any]: ... +def kernel(_fn: None = None, *, pure: bool = False, cuda_graph: bool = False, graph_while: str | None = None) -> Callable[[Any], Any]: ... # TODO: This next overload should return F, but currently that will cause issues @@ -186,7 +191,7 @@ def kernel(_fn: None = None, *, pure: bool = False, cuda_graph: bool = False) -> # However, by making it return Any, we can make the pure parameter # change now, without breaking pyright. @overload -def kernel(_fn: Any, *, pure: bool = False, cuda_graph: bool = False) -> Any: ... +def kernel(_fn: Any, *, pure: bool = False, cuda_graph: bool = False, graph_while: str | None = None) -> Any: ... 
def kernel( @@ -195,6 +200,7 @@ def kernel( pure: bool | None = None, fastcache: bool = False, cuda_graph: bool = False, + graph_while: str | None = None, ): """ Marks a function as a Quadrants kernel. @@ -224,7 +230,7 @@ def decorator(fn: F, has_kernel_params: bool = True) -> F: else: level = 4 - wrapped = _kernel_impl(fn, level_of_class_stackframe=level, cuda_graph=cuda_graph) + wrapped = _kernel_impl(fn, level_of_class_stackframe=level, cuda_graph=cuda_graph, graph_while=graph_while) wrapped.is_pure = pure is not None and pure or fastcache if pure is not None: warnings_helper.warn_once( diff --git a/quadrants/program/launch_context_builder.h b/quadrants/program/launch_context_builder.h index 91a2590b0b..9bcc6310bd 100644 --- a/quadrants/program/launch_context_builder.h +++ b/quadrants/program/launch_context_builder.h @@ -151,6 +151,8 @@ class LaunchContextBuilder { const StructType *args_type{nullptr}; size_t result_buffer_size{0}; bool use_cuda_graph{false}; + int graph_while_arg_id{-1}; + void *graph_while_flag_dev_ptr{nullptr}; // Note that I've tried to group `array_runtime_size` and // `is_device_allocations` into a small struct. 
However, it caused some test diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index d155162667..6583157f12 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -660,7 +660,8 @@ void export_lang(py::module &m) { .def("get_struct_ret_int", &LaunchContextBuilder::get_struct_ret_int) .def("get_struct_ret_uint", &LaunchContextBuilder::get_struct_ret_uint) .def("get_struct_ret_float", &LaunchContextBuilder::get_struct_ret_float) - .def_readwrite("use_cuda_graph", &LaunchContextBuilder::use_cuda_graph); + .def_readwrite("use_cuda_graph", &LaunchContextBuilder::use_cuda_graph) + .def_readwrite("graph_while_arg_id", &LaunchContextBuilder::graph_while_arg_id); py::class_(m, "Function") .def("insert_scalar_param", &Function::insert_scalar_param) diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 2da4799b96..9fe0e543d5 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -73,8 +73,18 @@ PER_CUDA_FUNCTION(import_external_semaphore, cuImportExternalSemaphore,CUexterna // Graph management PER_CUDA_FUNCTION(graph_create, cuGraphCreate, void **, uint32); PER_CUDA_FUNCTION(graph_add_kernel_node, cuGraphAddKernelNode, void **, void *, const void *, std::size_t, const void *); +PER_CUDA_FUNCTION(graph_add_node, cuGraphAddNode, void **, void *, const void *, std::size_t, void *); PER_CUDA_FUNCTION(graph_instantiate, cuGraphInstantiate, void **, void *, void *, char *, std::size_t); PER_CUDA_FUNCTION(graph_launch, cuGraphLaunch, void *, void *); PER_CUDA_FUNCTION(graph_destroy, cuGraphDestroy, void *); PER_CUDA_FUNCTION(graph_exec_destroy, cuGraphExecDestroy, void *); +PER_CUDA_FUNCTION(graph_conditional_handle_create, cuGraphConditionalHandleCreate, void *, void *, void *, uint32, uint32); + +// JIT linker (for loading condition kernel with cudadevrt) +PER_CUDA_FUNCTION(link_create, 
cuLinkCreate_v2, uint32, void *, void *, void **); +PER_CUDA_FUNCTION(link_add_data, cuLinkAddData_v2, void *, uint32, void *, std::size_t, const char *, uint32, void *, void *); +PER_CUDA_FUNCTION(link_add_file, cuLinkAddFile_v2, void *, uint32, const char *, uint32, void *, void *); +PER_CUDA_FUNCTION(link_complete, cuLinkComplete, void *, void **, std::size_t *); +PER_CUDA_FUNCTION(link_destroy, cuLinkDestroy, void *); +PER_CUDA_FUNCTION(module_load_data, cuModuleLoadData, void **, const void *); // clang-format on diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 1de6d3e529..61b4e468ec 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -2,10 +2,51 @@ #include "quadrants/rhi/cuda/cuda_context.h" #include +#include namespace quadrants::lang { namespace cuda { +// PTX for a tiny condition kernel that reads a device-side int32 flag and +// calls cudaGraphSetConditional(handle, flag != 0 ? 1 : 0). +// Compiled from CUDA C with: nvcc -ptx -arch=sm_90 -rdc=true +// Requires JIT linking with libcudadevrt.a at runtime. 
+static const char *kConditionKernelPTX = R"PTX( +.version 8.8 +.target sm_90 +.address_size 64 +.extern .func cudaGraphSetConditional +( + .param .b64 cudaGraphSetConditional_param_0, + .param .b32 cudaGraphSetConditional_param_1 +) +; +.visible .entry _qd_graph_while_cond( + .param .u64 _qd_graph_while_cond_param_0, + .param .u64 _qd_graph_while_cond_param_1 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<3>; + .reg .b64 %rd<4>; + ld.param.u64 %rd1, [_qd_graph_while_cond_param_0]; + ld.param.u64 %rd2, [_qd_graph_while_cond_param_1]; + cvta.to.global.u64 %rd3, %rd2; + ld.global.u32 %r1, [%rd3]; + setp.ne.s32 %p1, %r1, 0; + selp.u32 %r2, 1, 0, %p1; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b32 param1; + st.param.b32 [param1+0], %r2; + call.uni cudaGraphSetConditional, (param0, param1); + } // callseq 0 + ret; +} +)PTX"; + CachedCudaGraph::~CachedCudaGraph() { if (graph_exec) { CUDADriver::get_instance().graph_exec_destroy(graph_exec); @@ -79,27 +120,82 @@ bool KernelLauncher::resolve_ctx_ndarray_ptrs( auto data_ptr = ctx.array_ptrs[data_ptr_idx]; auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; + void *resolved_data = nullptr; + void *resolved_grad = nullptr; + if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { if (!on_cuda_device(data_ptr)) { return false; } - ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); + resolved_data = data_ptr; + resolved_grad = grad_ptr; } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); - void *dev_data = executor->get_device_alloc_info_ptr(*ptr); - void *dev_grad = nullptr; + resolved_data = executor->get_device_alloc_info_ptr(*ptr); if (grad_ptr) { - dev_grad = executor->get_device_alloc_info_ptr( + resolved_grad = executor->get_device_alloc_info_ptr( *static_cast(grad_ptr)); } - ctx.set_ndarray_ptrs(arg_id, (uint64)dev_data, (uint64)dev_grad); + } + + if (resolved_data) { + ctx.set_ndarray_ptrs(arg_id, 
(uint64)resolved_data, + (uint64)resolved_grad); + if (arg_id == ctx.graph_while_arg_id) { + ctx.graph_while_flag_dev_ptr = resolved_data; + } } } } return true; } +void KernelLauncher::ensure_condition_kernel_loaded() { + if (cond_kernel_func_) + return; + + auto &driver = CUDADriver::get_instance(); + + // Find libcudadevrt.a — required for cudaGraphSetConditional in device code + std::string cudadevrt_path; + for (const auto &candidate : { + std::string("/usr/local/cuda/lib64/libcudadevrt.a"), + std::string("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"), + }) { + if (std::filesystem::exists(candidate)) { + cudadevrt_path = candidate; + break; + } + } + if (cudadevrt_path.empty()) { + QD_WARN("Cannot find libcudadevrt.a — graph_while will not work"); + return; + } + + void *link_state = nullptr; + driver.link_create(0, nullptr, nullptr, &link_state); + + std::size_t ptx_len = std::strlen(kConditionKernelPTX) + 1; + driver.link_add_data(link_state, /*CU_JIT_INPUT_PTX=*/1, + const_cast(kConditionKernelPTX), ptx_len, + "qd_cond", 0, nullptr, nullptr); + + driver.link_add_file(link_state, /*CU_JIT_INPUT_LIBRARY=*/4, + cudadevrt_path.c_str(), 0, nullptr, nullptr); + + void *cubin = nullptr; + std::size_t cubin_size = 0; + driver.link_complete(link_state, &cubin, &cubin_size); + + driver.module_load_data(&cond_kernel_module_, cubin); + driver.module_get_function(&cond_kernel_func_, cond_kernel_module_, + "_qd_graph_while_cond"); + driver.link_destroy(link_state); + + QD_TRACE("Loaded graph_while condition kernel ({} bytes cubin)", cubin_size); +} + bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx) { int launch_id = handle.get_launch_id(); @@ -108,7 +204,7 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, const auto ¶meters = *launcher_ctx.parameters; const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; - if (offloaded_tasks.size() < 2) { + if (offloaded_tasks.size() < 2 && ctx.graph_while_arg_id < 0) { return 
false; } @@ -116,6 +212,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, return false; } + const bool use_graph_while = ctx.graph_while_arg_id >= 0; + auto it = cuda_graph_cache_.find(launch_id); if (it != cuda_graph_cache_.end()) { auto &cached = it->second; @@ -162,6 +260,48 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, void *graph = nullptr; CUDADriver::get_instance().graph_create(&graph, 0); + // Determine the target graph for kernel nodes. + // With graph_while, kernels go into the conditional while body graph. + void *kernel_target_graph = graph; + unsigned long long cond_handle = 0; + + if (use_graph_while) { + ensure_condition_kernel_loaded(); + if (!cond_kernel_func_) { + QD_WARN("Condition kernel not available, falling back to non-graph"); + CUDADriver::get_instance().graph_destroy(graph); + return false; + } + + void *cu_ctx = CUDAContext::get_instance().get_context(); + + CUDADriver::get_instance().graph_conditional_handle_create( + &cond_handle, graph, cu_ctx, + /*defaultLaunchValue=*/1, + /*flags=CU_GRAPH_COND_ASSIGN_DEFAULT=*/1); + + CudaGraphNodeParams cond_node_params{}; + cond_node_params.type = 13; // CU_GRAPH_NODE_TYPE_CONDITIONAL + cond_node_params.handle = cond_handle; + cond_node_params.condType = 1; // CU_GRAPH_COND_TYPE_WHILE + cond_node_params.size = 1; + cond_node_params.phGraph_out = nullptr; // CUDA will populate this + cond_node_params.ctx = cu_ctx; + + void *cond_node = nullptr; + CUDADriver::get_instance().graph_add_node(&cond_node, graph, nullptr, 0, + &cond_node_params); + + // CUDA replaces phGraph_out with a pointer to its owned array + void **body_graphs = (void **)cond_node_params.phGraph_out; + QD_ASSERT(body_graphs && body_graphs[0]); + kernel_target_graph = body_graphs[0]; + + QD_TRACE("CUDA graph_while: conditional node created, body graph={}", + kernel_target_graph); + } + + // Add work kernel nodes to the target graph void *prev_node = nullptr; for (const auto &task : offloaded_tasks) { 
void *func = cuda_module->lookup_function(task.name); @@ -183,10 +323,36 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, const void *deps = prev_node; std::size_t num_deps = prev_node ? 1 : 0; CUDADriver::get_instance().graph_add_kernel_node( - &node, graph, prev_node ? &deps : nullptr, num_deps, &node_params); + &node, kernel_target_graph, prev_node ? &deps : nullptr, num_deps, + &node_params); prev_node = node; } + // For graph_while: add condition kernel as the last node in the body graph + if (use_graph_while) { + QD_ASSERT(ctx.graph_while_flag_dev_ptr); + + void *flag_ptr = ctx.graph_while_flag_dev_ptr; + void *cond_args[2] = {&cond_handle, &flag_ptr}; + + CudaKernelNodeParams cond_kp{}; + cond_kp.func = cond_kernel_func_; + cond_kp.gridDimX = 1; + cond_kp.gridDimY = 1; + cond_kp.gridDimZ = 1; + cond_kp.blockDimX = 1; + cond_kp.blockDimY = 1; + cond_kp.blockDimZ = 1; + cond_kp.sharedMemBytes = 0; + cond_kp.kernelParams = cond_args; + cond_kp.extra = nullptr; + + void *cond_kernel_node = nullptr; + CUDADriver::get_instance().graph_add_kernel_node( + &cond_kernel_node, kernel_target_graph, + prev_node ? &prev_node : nullptr, prev_node ? 1 : 0, &cond_kp); + } + // --- Instantiate and launch --- CUDADriver::get_instance().graph_instantiate( &cached.graph_exec, graph, nullptr, nullptr, 0); @@ -196,8 +362,10 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, CUDADriver::get_instance().graph_destroy(graph); - QD_TRACE("CUDA graph created with {} kernel nodes for launch_id={}", - offloaded_tasks.size(), launch_id); + QD_TRACE("CUDA graph created with {} kernel nodes for launch_id={}" + "{}", + offloaded_tasks.size(), launch_id, + use_graph_while ? 
" (with graph_while)" : ""); cuda_graph_cache_.emplace(launch_id, std::move(cached)); return true; diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 4e063bd1c6..10e16f1355 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include "quadrants/codegen/llvm/compiled_kernel_data.h" @@ -21,6 +22,21 @@ struct CudaKernelNodeParams { void **extra; }; +// Mirrors CUgraphNodeParams layout for conditional while nodes. +// See CUDA driver API: CUgraphNodeParams / CUDA_CONDITIONAL_NODE_PARAMS. +struct CudaGraphNodeParams { + unsigned int type; // CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 + int reserved0[3]; + // Union starts at offset 16 (232 bytes total) + unsigned long long handle; // CUgraphConditionalHandle + unsigned int condType; // CU_GRAPH_COND_TYPE_WHILE = 1 + unsigned int size; // 1 for while + void *phGraph_out; // CUgraph* output array + void *ctx; // CUcontext + char _pad[232 - 8 - 4 - 4 - 8 - 8]; + long long reserved2; +}; + struct CachedCudaGraph { void *graph_exec{nullptr}; char *persistent_device_arg_buffer{nullptr}; @@ -59,8 +75,13 @@ class KernelLauncher : public LLVM::KernelLauncher { LaunchContextBuilder &ctx, const std::vector> ¶meters); bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); + void ensure_condition_kernel_loaded(); std::vector contexts_; std::unordered_map cuda_graph_cache_; + + // JIT-compiled condition kernel for graph_while conditional nodes + void *cond_kernel_module_{nullptr}; // CUmodule + void *cond_kernel_func_{nullptr}; // CUfunction }; } // namespace cuda diff --git a/tests/python/test_cuda_graph_while.py b/tests/python/test_cuda_graph_while.py new file mode 100644 index 0000000000..233e65dba8 --- /dev/null +++ b/tests/python/test_cuda_graph_while.py @@ -0,0 +1,121 @@ +import numpy as np +import pytest +import quadrants as qd +from tests import test_utils + + 
+@test_utils.test(arch=[qd.cuda]) +def test_graph_while_counter(): + """Test graph_while with a counter that decrements each iteration.""" + N = 64 + + @qd.kernel(graph_while='counter') + def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + counter = qd.ndarray(qd.i32, shape=()) + + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(5, dtype=np.int32)) + + increment_loop(x, counter) + + qd.sync() + assert counter.to_numpy() == 0 + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 5, dtype=np.int32)) + + +@test_utils.test(arch=[qd.cuda]) +def test_graph_while_boolean_done(): + """Test graph_while with a boolean 'continue' flag (non-zero = keep going).""" + N = 64 + threshold = 7 + + @qd.kernel(graph_while='keep_going') + def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), + keep_going: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + if x[0] >= threshold: + keep_going[None] = 0 + + x = qd.ndarray(qd.i32, shape=(N,)) + keep_going = qd.ndarray(qd.i32, shape=()) + + x.from_numpy(np.zeros(N, dtype=np.int32)) + keep_going.from_numpy(np.array(1, dtype=np.int32)) + + increment_until_threshold(x, keep_going) + + qd.sync() + assert keep_going.to_numpy() == 0 + np.testing.assert_array_equal(x.to_numpy(), np.full(N, threshold, dtype=np.int32)) + + +@test_utils.test(arch=[qd.cuda]) +def test_graph_while_multiple_loops(): + """Test graph_while with multiple top-level loops in the kernel body.""" + N = 32 + + @qd.kernel(graph_while='counter') + def multi_loop(x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + for i in 
range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.f32, shape=(N,)) + y = qd.ndarray(qd.f32, shape=(N,)) + counter = qd.ndarray(qd.i32, shape=()) + + x.from_numpy(np.zeros(N, dtype=np.float32)) + y.from_numpy(np.zeros(N, dtype=np.float32)) + counter.from_numpy(np.array(10, dtype=np.int32)) + + multi_loop(x, y, counter) + + qd.sync() + assert counter.to_numpy() == 0 + np.testing.assert_allclose(x.to_numpy(), np.full(N, 10.0)) + np.testing.assert_allclose(y.to_numpy(), np.full(N, 20.0)) + + +@test_utils.test(arch=[qd.cuda]) +def test_graph_while_replay(): + """Test that graph_while works correctly on subsequent calls (graph replay).""" + N = 16 + + @qd.kernel(graph_while='counter') + def inc(x: qd.types.ndarray(qd.i32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + counter = qd.ndarray(qd.i32, shape=()) + + # First call: 3 iterations + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(3, dtype=np.int32)) + inc(x, counter) + qd.sync() + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) + + # Second call: 7 iterations (graph replay with new counter value) + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(7, dtype=np.int32)) + inc(x, counter) + qd.sync() + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) From d6cbd15d685bb8ba8ececba15c693e0ba881374e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 1 Mar 2026 09:34:07 -0500 Subject: [PATCH 008/128] Fix graph_while arg_id for struct parameters and add cross-platform fallback The graph_while_arg_id was computed using Python-level parameter indices, which is wrong when struct parameters are flattened into many C++ args (e.g. Genesis solver has 40 C++ params from 6 Python params). 
Now tracks the flattened C++ arg index during launch context setup and caches it. Also adds C++ do-while fallback loops for CPU, CUDA (non-graph path), and AMDGPU backends so graph_while works identically on all platforms. --- python/quadrants/lang/kernel.py | 13 +- quadrants/runtime/amdgpu/kernel_launcher.cpp | 30 ++- quadrants/runtime/cpu/kernel_launcher.cpp | 16 +- quadrants/runtime/cuda/kernel_launcher.cpp | 31 +++- .../python/test_graph_while_cross_backend.py | 171 ++++++++++++++++++ 5 files changed, 235 insertions(+), 26 deletions(-) create mode 100644 tests/python/test_graph_while_cross_backend.py diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index c0c8d40732..daf862b418 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -445,6 +445,8 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled template_num += 1 i_out += 1 continue + if self.graph_while_arg is not None and self.arg_metas[i_in].name == self.graph_while_arg: + self._graph_while_cpp_arg_id = i_out - template_num num_args_, is_launch_ctx_cacheable_ = self._recursive_set_args( self.used_py_dataclass_parameters_by_key_enforcing[key], self.arg_metas[i_in].name, @@ -506,15 +508,8 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_cuda_graph = self.use_cuda_graph - if self.graph_while_arg is not None: - non_template_idx = 0 - for meta in self.arg_metas: - if meta.annotation is template or isinstance(meta.annotation, template): - continue - if meta.name == self.graph_while_arg: - launch_ctx.graph_while_arg_id = non_template_idx - break - non_template_idx += 1 + if self.graph_while_arg is not None and hasattr(self, '_graph_while_cpp_arg_id'): + launch_ctx.graph_while_arg_id = self._graph_while_cpp_arg_id prog.launch_kernel(compiled_kernel_data, launch_ctx) 
except Exception as e: e = handle_exception_from_cpp(e) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 6ef0b0e0e5..f993855a9d 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -74,6 +74,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); + if (arg_id == ctx.graph_while_arg_id) { + ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; + } } else if (arr_sz > 0) { // why use arr_sz constrain? // Ndarray DeviceAllocation *ptr = static_cast(data_ptr); @@ -82,6 +85,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); + if (arg_id == ctx.graph_while_arg_id) { + ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; + } } } } @@ -110,13 +116,23 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); - for (auto &task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); - } + do { + for (auto &task : offloaded_tasks) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + } + if (ctx.graph_while_arg_id >= 0 && ctx.graph_while_flag_dev_ptr) { + int32_t counter_val = 0; + AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().memcpy_device_to_host( + &counter_val, ctx.graph_while_flag_dev_ptr, sizeof(int32_t)); + if (counter_val == 0) + 
break; + } + } while (ctx.graph_while_arg_id >= 0); QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { AMDGPUDriver::get_instance().mem_free(device_arg_buffer); diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp index d7dd8df259..f54ae7f26c 100644 --- a/quadrants/runtime/cpu/kernel_launcher.cpp +++ b/quadrants/runtime/cpu/kernel_launcher.cpp @@ -27,6 +27,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); + if (arg_id == ctx.graph_while_arg_id) { + ctx.graph_while_flag_dev_ptr = data_ptr; + } } else if (ctx.array_runtime_sizes[arg_id] > 0) { uint64 host_ptr = (uint64)executor->get_device_alloc_info_ptr( *static_cast(data_ptr)); @@ -38,12 +41,19 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, : (uint64)executor->get_device_alloc_info_ptr( *static_cast(grad_ptr)); ctx.set_ndarray_ptrs(arg_id, host_ptr, host_ptr_grad); + if (arg_id == ctx.graph_while_arg_id) { + ctx.graph_while_flag_dev_ptr = (void *)host_ptr; + } } } } - for (auto task : launcher_ctx.task_funcs) { - task(&ctx.get_context()); - } + do { + for (auto task : launcher_ctx.task_funcs) { + task(&ctx.get_context()); + } + } while (ctx.graph_while_arg_id >= 0 && + ctx.graph_while_flag_dev_ptr && + *static_cast(ctx.graph_while_flag_dev_ptr) != 0); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 61b4e468ec..b1347628fb 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -447,6 +447,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); + if (arg_id == ctx.graph_while_arg_id) { + ctx.graph_while_flag_dev_ptr = 
device_ptrs[data_ptr_idx]; + } } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); @@ -460,6 +463,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); + if (arg_id == ctx.graph_while_arg_id) { + ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; + } } } } @@ -480,13 +486,24 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.get_context().arg_buffer = device_arg_buffer; } - for (auto task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - cuda_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, {&ctx.get_context()}, - {}); - } + do { + for (auto task : offloaded_tasks) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + cuda_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); + } + if (ctx.graph_while_arg_id >= 0 && ctx.graph_while_flag_dev_ptr) { + int32_t counter_val = 0; + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().stream_synchronize(stream); + CUDADriver::get_instance().memcpy_device_to_host( + &counter_val, ctx.graph_while_flag_dev_ptr, sizeof(int32_t)); + if (counter_val == 0) + break; + } + } while (ctx.graph_while_arg_id >= 0); if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, nullptr); } diff --git a/tests/python/test_graph_while_cross_backend.py b/tests/python/test_graph_while_cross_backend.py new file mode 100644 index 0000000000..0d1b64be3d --- /dev/null +++ b/tests/python/test_graph_while_cross_backend.py @@ -0,0 +1,171 @@ +import numpy as np +import pytest +import quadrants as qd +from tests import test_utils + + +@test_utils.test(arch=[qd.cpu, 
qd.cuda]) +def test_graph_while_counter_cross_backend(): + """graph_while with a counter: must work identically on CPU and CUDA.""" + N = 64 + ITERS = 5 + + @qd.kernel(graph_while='counter') + def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + counter = qd.ndarray(qd.i32, shape=()) + + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(ITERS, dtype=np.int32)) + + increment_loop(x, counter) + qd.sync() + + assert counter.to_numpy() == 0 + np.testing.assert_array_equal(x.to_numpy(), np.full(N, ITERS, dtype=np.int32)) + + +@test_utils.test(arch=[qd.cpu, qd.cuda]) +def test_graph_while_boolean_reduction_cross_backend(): + """graph_while with per-thread conditions reduced into a single flag. + + Each element has a different threshold. The loop continues while ANY element + hasn't reached its threshold. A reduction kernel (reset flag to 0, then + any-not-done sets it to 1) combines per-element state into the scalar flag. + """ + N = 32 + + @qd.kernel(graph_while='keep_going') + def increment_until_all_done( + x: qd.types.ndarray(qd.i32, ndim=1), + thresholds: qd.types.ndarray(qd.i32, ndim=1), + keep_going: qd.types.ndarray(qd.i32, ndim=0)): + # Work: increment elements that haven't reached their threshold + for i in range(x.shape[0]): + if x[i] < thresholds[i]: + x[i] = x[i] + 1 + + # Reduction: reset flag, then OR-reduce per-element conditions + for i in range(1): + keep_going[None] = 0 + for i in range(x.shape[0]): + if x[i] < thresholds[i]: + keep_going[None] = 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + thresholds = qd.ndarray(qd.i32, shape=(N,)) + keep_going = qd.ndarray(qd.i32, shape=()) + + # Thresholds vary: 1, 2, 3, ..., N. Loop must run N times (max threshold). 
+ thresh_np = np.arange(1, N + 1, dtype=np.int32) + x.from_numpy(np.zeros(N, dtype=np.int32)) + thresholds.from_numpy(thresh_np) + keep_going.from_numpy(np.array(1, dtype=np.int32)) + + increment_until_all_done(x, thresholds, keep_going) + qd.sync() + + assert keep_going.to_numpy() == 0 + np.testing.assert_array_equal(x.to_numpy(), thresh_np) + + +@test_utils.test(arch=[qd.cpu, qd.cuda]) +def test_graph_while_multi_loop_cross_backend(): + """graph_while with multiple top-level for loops in the body.""" + N = 16 + ITERS = 8 + + @qd.kernel(graph_while='counter') + def multi_loop(a: qd.types.ndarray(qd.f32, ndim=1), + b: qd.types.ndarray(qd.f32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(a.shape[0]): + a[i] = a[i] + 1.0 + for i in range(b.shape[0]): + b[i] = b[i] + 3.0 + for i in range(1): + counter[None] = counter[None] - 1 + + a = qd.ndarray(qd.f32, shape=(N,)) + b = qd.ndarray(qd.f32, shape=(N,)) + counter = qd.ndarray(qd.i32, shape=()) + + a.from_numpy(np.zeros(N, dtype=np.float32)) + b.from_numpy(np.zeros(N, dtype=np.float32)) + counter.from_numpy(np.array(ITERS, dtype=np.int32)) + + multi_loop(a, b, counter) + qd.sync() + + assert counter.to_numpy() == 0 + np.testing.assert_allclose(a.to_numpy(), np.full(N, float(ITERS))) + np.testing.assert_allclose(b.to_numpy(), np.full(N, float(ITERS * 3))) + + +@test_utils.test(arch=[qd.cpu, qd.cuda]) +def test_graph_while_replay_cross_backend(): + """graph_while replay: second call with different counter value.""" + N = 16 + + @qd.kernel(graph_while='counter') + def inc(x: qd.types.ndarray(qd.i32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + counter = qd.ndarray(qd.i32, shape=()) + + # First call: 3 iterations + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(3, dtype=np.int32)) + inc(x, counter) + qd.sync() + 
np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) + assert counter.to_numpy() == 0 + + # Second call: 7 iterations + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(7, dtype=np.int32)) + inc(x, counter) + qd.sync() + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) + assert counter.to_numpy() == 0 + + +@test_utils.test(arch=[qd.cpu, qd.cuda]) +def test_graph_while_single_iteration(): + """graph_while with counter=1 executes the body exactly once. + + Note: graph_while has do-while semantics (body executes at least once, + matching CUDA conditional while node behavior). Counter must be >= 1. + """ + N = 8 + + @qd.kernel(graph_while='counter') + def inc(x: qd.types.ndarray(qd.i32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + counter = qd.ndarray(qd.i32, shape=()) + + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(1, dtype=np.int32)) + + inc(x, counter) + qd.sync() + + assert counter.to_numpy() == 0 + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 1, dtype=np.int32)) From 0573c12ee4d0c9e7606ad15986460aae4cf79095 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 5 Mar 2026 14:40:31 -0500 Subject: [PATCH 009/128] Add static_assert on CudaGraphNodeParams size to catch ABI drift --- quadrants/runtime/cuda/kernel_launcher.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 10e16f1355..eaf83fb961 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -36,6 +36,8 @@ struct CudaGraphNodeParams { char _pad[232 - 8 - 4 - 4 - 8 - 8]; long long reserved2; }; +static_assert(sizeof(CudaGraphNodeParams) == 256, + "CudaGraphNodeParams layout must match CUgraphNodeParams (256 
bytes)"); struct CachedCudaGraph { void *graph_exec{nullptr}; From 7fd81d3137dbd37c4e3bc49a11b1d7cfe8c66c36 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 5 Mar 2026 14:41:27 -0500 Subject: [PATCH 010/128] Add compute capability check for graph_while (requires SM 9.0+) Falls back to non-graph path with a warning on pre-Hopper GPUs, instead of failing with an unhelpful JIT link error. --- quadrants/runtime/cuda/kernel_launcher.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index b1347628fb..9ecaf8c830 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -155,6 +155,15 @@ void KernelLauncher::ensure_condition_kernel_loaded() { if (cond_kernel_func_) return; + int cc = CUDAContext::get_instance().get_compute_capability(); + if (cc < 90) { + QD_WARN( + "graph_while requires SM 9.0+ (Hopper), but this device is SM {}. " + "Falling back to non-graph path.", + cc); + return; + } + auto &driver = CUDADriver::get_instance(); // Find libcudadevrt.a — required for cudaGraphSetConditional in device code From 9c75cee9cbde224b781c8f64eda0f2f157812534 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 5 Mar 2026 14:42:56 -0500 Subject: [PATCH 011/128] Use CUDA_HOME/CUDA_PATH env vars to find libcudadevrt.a Checks env-var-derived paths before the hardcoded fallbacks, so custom toolkit installs (e.g. conda, non-default prefix) are found. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 9ecaf8c830..079307cf36 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,8 +1,10 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" +#include #include #include +#include namespace quadrants::lang { namespace cuda { @@ -166,12 +168,17 @@ void KernelLauncher::ensure_condition_kernel_loaded() { auto &driver = CUDADriver::get_instance(); - // Find libcudadevrt.a — required for cudaGraphSetConditional in device code std::string cudadevrt_path; - for (const auto &candidate : { - std::string("/usr/local/cuda/lib64/libcudadevrt.a"), - std::string("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"), - }) { + std::vector candidates; + for (const char *env_name : {"CUDA_HOME", "CUDA_PATH"}) { + if (const char *env_val = std::getenv(env_name)) { + candidates.push_back(std::string(env_val) + "/lib64/libcudadevrt.a"); + candidates.push_back(std::string(env_val) + "/lib/libcudadevrt.a"); + } + } + candidates.push_back("/usr/local/cuda/lib64/libcudadevrt.a"); + candidates.push_back("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"); + for (const auto &candidate : candidates) { if (std::filesystem::exists(candidate)) { cudadevrt_path = candidate; break; From 7f80b72041d80b98dc6badc9cde73bb7e99e0674 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 5 Mar 2026 14:46:26 -0500 Subject: [PATCH 012/128] Restore documentation comments removed during cuda-graph refactor --- quadrants/runtime/cuda/kernel_launcher.cpp | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 079307cf36..d213afcb8a 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ 
b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -405,10 +405,24 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, CUDAContext::get_instance().make_current(); + // |transfers| is only used for external arrays whose data is originally on + // host. They are first transferred onto device and that device pointer is + // stored in |device_ptrs| below. |transfers| saves its original pointer so + // that we can copy the data back once kernel finishes. as well as the + // temporary device allocations, which can be freed after kernel finishes. Key + // is [arg_id, ptr_pos], where ptr_pos is TypeFactory::DATA_PTR_POS_IN_NDARRAY + // for data_ptr and TypeFactory::GRAD_PTR_POS_IN_NDARRAY for grad_ptr. Value + // is [host_ptr, temporary_device_alloc]. Invariant: temp_devallocs.size() != + // 0 <==> transfer happened. std::unordered_map, ArgArrayPtrKeyHasher> transfers; + // |device_ptrs| stores pointers on device for all arrays args, including + // external arrays and ndarrays, no matter whether the data is originally on + // device or host. + // This is the source of truth for us to look for device pointers used in CUDA + // kernels. std::unordered_map device_ptrs; char *device_result_buffer{nullptr}; @@ -423,6 +437,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, const auto ¶meter = kv.second; if (parameter.is_array) { const auto arr_sz = ctx.array_runtime_sizes[arg_id]; + // Note: both numpy and PyTorch support arrays/tensors with zeros + // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes + // `arr_sz` zero. 
if (arr_sz == 0) { continue; } @@ -434,7 +451,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { + // External array + // Note: assuming both data & grad are on the same device if (on_cuda_device(data_ptr)) { + // data_ptr is a raw ptr on CUDA device device_ptrs[data_ptr_idx] = data_ptr; device_ptrs[grad_ptr_idx] = grad_ptr; } else { @@ -467,7 +487,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; } } else if (arr_sz > 0) { + // Ndarray DeviceAllocation *ptr = static_cast(data_ptr); + // Unwrapped raw ptr on device device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); if (grad_ptr != nullptr) { @@ -529,6 +551,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, nullptr); } CUDADriver::get_instance().mem_free_async(device_result_buffer, nullptr); + // copy data back to host if (transfers.size() > 0) { CUDADriver::get_instance().stream_synchronize(nullptr); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From 7762fd980dfb2a47b97a4ebd32956e5fb76c09ac Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 5 Mar 2026 14:50:43 -0500 Subject: [PATCH 013/128] Add CUDA graph documentation and do-while semantics warning Document cuda_graph=True and graph_while API in kernel() docstring, and add a user guide page covering usage patterns, cross-platform behavior, and the do-while semantics constraint. 
--- docs/source/user_guide/index.md | 1 + .../user_guide/user_guide/cuda_graphs.md | 96 +++++++++++++++++++ python/quadrants/lang/kernel_impl.py | 17 ++++ 3 files changed, 114 insertions(+) create mode 100644 docs/source/user_guide/user_guide/cuda_graphs.md diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md index 931b2e30c3..57a9293962 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -11,6 +11,7 @@ user_guide/static user_guide/sub_functions user_guide/scalar_tensors user_guide/synchronization +user_guide/cuda_graphs user_guide/compound_types user_guide/python_backend user_guide/quirks diff --git a/docs/source/user_guide/user_guide/cuda_graphs.md b/docs/source/user_guide/user_guide/cuda_graphs.md new file mode 100644 index 0000000000..3e0ddfac77 --- /dev/null +++ b/docs/source/user_guide/user_guide/cuda_graphs.md @@ -0,0 +1,96 @@ +# CUDA Graphs + +When a Quadrants kernel has multiple top-level `for` loops, each loop is launched as a separate GPU kernel. The per-kernel launch overhead can become significant when kernels are small and numerous. CUDA graphs let you capture these launches once and replay them as a single unit, eliminating the repeated launch overhead. + +## Per-kernel opt-in with `cuda_graph=True` + +Annotate a kernel with `cuda_graph=True` to enable graph capture: + +```python +@qd.kernel(cuda_graph=True) +def step(x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + +step(x, y) # first call: captures the graph +step(x, y) # subsequent calls: replays the cached graph +``` + +On the first call, the kernel's offloaded tasks are captured into a CUDA graph using the explicit node API. Subsequent calls replay the cached graph. The arg buffer is re-uploaded on each replay, so calling the kernel with different ndarrays works correctly. 
+ +**When it applies**: graph capture only activates when there are 2 or more top-level `for` loops (offloaded tasks). A single-loop kernel with `cuda_graph=True` falls back silently to the normal launch path. + +**Cross-platform**: `cuda_graph=True` is a harmless no-op on non-CUDA backends (CPU, Metal, etc.). You can annotate kernels unconditionally without breaking portability. + +## GPU-side iteration with `graph_while` + +For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. The `graph_while` parameter enables this: + +```python +@qd.kernel(graph_while="counter") +def solve(x: qd.types.ndarray(qd.f32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(1): + counter[None] = counter[None] - 1 + +x = qd.ndarray(qd.f32, shape=(N,)) +counter = qd.ndarray(qd.i32, shape=()) +counter.from_numpy(np.array(10, dtype=np.int32)) +solve(x, counter) +# x is now incremented 10 times; counter is 0 +``` + +The `graph_while` value is the name of a scalar `qd.i32` ndarray parameter. The kernel body repeats while this value is non-zero. + +- On SM 9.0+ (Hopper), this uses CUDA conditional while nodes — the entire iteration runs on the GPU with no host involvement. +- On older CUDA GPUs and non-CUDA backends, it falls back to a host-side do-while loop. +- `graph_while` implicitly enables `cuda_graph=True`. + +### Patterns + +**Counter-based**: set the counter to N, decrement each iteration. The body runs exactly N times. + +```python +@qd.kernel(graph_while="counter") +def iterate(x: qd.types.ndarray(qd.f32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(1): + counter[None] = counter[None] - 1 +``` + +**Boolean flag**: set a `keep_going` flag to 1, have the kernel set it to 0 when a convergence criterion is met. 
+ +```python +@qd.kernel(graph_while="keep_going") +def converge(x: qd.types.ndarray(qd.f32, ndim=1), + keep_going: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + # ... do work ... + pass + for i in range(1): + if some_condition(x): + keep_going[None] = 0 +``` + +### Do-while semantics + +`graph_while` has **do-while** semantics: the kernel body always executes at least once before the condition is checked. This matches the behavior of CUDA conditional while nodes. The flag value must be >= 1 at launch time. Passing 0 with a kernel that decrements the counter will cause an infinite loop. + +## When to use CUDA graphs + +CUDA graphs are most beneficial when: + +- A kernel has many small top-level `for` loops where launch overhead dominates runtime. +- An iterative algorithm needs to repeat the kernel body many times without host round-trips (`graph_while`). + +They are less useful when: + +- Kernels have only a single top-level loop (no graph is created). +- Individual kernel runtimes are large enough to fully hide launch latency. diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index d3e7976b9d..8ce3e30b45 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -212,6 +212,23 @@ def kernel( Kernel's gradient kernel would be generated automatically by the AutoDiff system. + Args: + cuda_graph: If True, kernels with 2+ top-level for loops are captured + into a CUDA graph on first launch and replayed on subsequent + launches, reducing per-kernel launch overhead. On non-CUDA backends + this flag is a harmless no-op. + graph_while: Name of a scalar ``qd.i32`` ndarray parameter that + controls GPU-side iteration. The kernel body repeats while the + named argument is non-zero. Uses CUDA conditional while nodes + on SM 9.0+ (Hopper); falls back to a host-side do-while loop + on older GPUs and non-CUDA backends. Implicitly enables + ``cuda_graph=True``. 
+ + **Do-while semantics**: the kernel body always executes at least + once before the condition is checked. The flag value must be >= 1 + at launch time. Passing 0 with a kernel that decrements the + counter will result in an infinite loop. + Example:: >>> x = qd.field(qd.i32, shape=(4, 8)) From 47d59dc83ef395002a8f866fab23881e456c0972 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 5 Mar 2026 16:07:36 -0500 Subject: [PATCH 014/128] Apply clang-format to kernel_launcher.h static_assert --- quadrants/runtime/cuda/kernel_launcher.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index eaf83fb961..1945c71395 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -36,8 +36,9 @@ struct CudaGraphNodeParams { char _pad[232 - 8 - 4 - 4 - 8 - 8]; long long reserved2; }; -static_assert(sizeof(CudaGraphNodeParams) == 256, - "CudaGraphNodeParams layout must match CUgraphNodeParams (256 bytes)"); +static_assert( + sizeof(CudaGraphNodeParams) == 256, + "CudaGraphNodeParams layout must match CUgraphNodeParams (256 bytes)"); struct CachedCudaGraph { void *graph_exec{nullptr}; From ad4eab6c39819d697f809edf9e419a9211f98e33 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 10:32:26 -0700 Subject: [PATCH 015/128] Fix lint: formatting (black, clang-format, ruff) --- python/quadrants/lang/kernel.py | 2 +- python/quadrants/lang/kernel_impl.py | 11 ++++-- quadrants/python/export_lang.cpp | 3 +- quadrants/runtime/cpu/kernel_launcher.cpp | 3 +- quadrants/runtime/cuda/kernel_launcher.cpp | 13 ++++--- quadrants/runtime/cuda/kernel_launcher.h | 14 +++---- tests/python/test_cuda_graph_while.py | 28 +++++++------- .../python/test_graph_while_cross_backend.py | 37 ++++++++++--------- 8 files changed, 59 insertions(+), 52 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 
daf862b418..e2cbd00076 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -508,7 +508,7 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_cuda_graph = self.use_cuda_graph - if self.graph_while_arg is not None and hasattr(self, '_graph_while_cpp_arg_id'): + if self.graph_while_arg is not None and hasattr(self, "_graph_while_cpp_arg_id"): launch_ctx.graph_while_arg_id = self._graph_while_cpp_arg_id prog.launch_kernel(compiled_kernel_data, launch_ctx) except Exception as e: diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index 8ce3e30b45..5e4fc68b29 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -124,8 +124,11 @@ def _inside_class(level_of_class_stackframe: int) -> bool: def _kernel_impl( - _func: Callable, level_of_class_stackframe: int, verbose: bool = False, - cuda_graph: bool = False, graph_while: str | None = None, + _func: Callable, + level_of_class_stackframe: int, + verbose: bool = False, + cuda_graph: bool = False, + graph_while: str | None = None, ) -> QuadrantsCallable: # Can decorators determine if a function is being defined inside a class? # https://stackoverflow.com/a/8793684/12003165 @@ -181,7 +184,9 @@ def wrapped_classkernel(*args, **kwargs): @overload # TODO: This callable should be Callable[[F], F]. # See comments below. -def kernel(_fn: None = None, *, pure: bool = False, cuda_graph: bool = False, graph_while: str | None = None) -> Callable[[Any], Any]: ... +def kernel( + _fn: None = None, *, pure: bool = False, cuda_graph: bool = False, graph_while: str | None = None +) -> Callable[[Any], Any]: ... 
# TODO: This next overload should return F, but currently that will cause issues diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index 6583157f12..09ce3abdd8 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -661,7 +661,8 @@ void export_lang(py::module &m) { .def("get_struct_ret_uint", &LaunchContextBuilder::get_struct_ret_uint) .def("get_struct_ret_float", &LaunchContextBuilder::get_struct_ret_float) .def_readwrite("use_cuda_graph", &LaunchContextBuilder::use_cuda_graph) - .def_readwrite("graph_while_arg_id", &LaunchContextBuilder::graph_while_arg_id); + .def_readwrite("graph_while_arg_id", + &LaunchContextBuilder::graph_while_arg_id); py::class_(m, "Function") .def("insert_scalar_param", &Function::insert_scalar_param) diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp index f54ae7f26c..b08bab551a 100644 --- a/quadrants/runtime/cpu/kernel_launcher.cpp +++ b/quadrants/runtime/cpu/kernel_launcher.cpp @@ -51,8 +51,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, for (auto task : launcher_ctx.task_funcs) { task(&ctx.get_context()); } - } while (ctx.graph_while_arg_id >= 0 && - ctx.graph_while_flag_dev_ptr && + } while (ctx.graph_while_arg_id >= 0 && ctx.graph_while_flag_dev_ptr && *static_cast(ctx.graph_while_flag_dev_ptr) != 0); } diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index d213afcb8a..91f06ae1ce 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -370,18 +370,19 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, } // --- Instantiate and launch --- - CUDADriver::get_instance().graph_instantiate( - &cached.graph_exec, graph, nullptr, nullptr, 0); + CUDADriver::get_instance().graph_instantiate(&cached.graph_exec, graph, + nullptr, nullptr, 0); auto *stream = CUDAContext::get_instance().get_stream(); 
CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); CUDADriver::get_instance().graph_destroy(graph); - QD_TRACE("CUDA graph created with {} kernel nodes for launch_id={}" - "{}", - offloaded_tasks.size(), launch_id, - use_graph_while ? " (with graph_while)" : ""); + QD_TRACE( + "CUDA graph created with {} kernel nodes for launch_id={}" + "{}", + offloaded_tasks.size(), launch_id, + use_graph_while ? " (with graph_while)" : ""); cuda_graph_cache_.emplace(launch_id, std::move(cached)); return true; diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 1945c71395..630a5bdb85 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -28,11 +28,11 @@ struct CudaGraphNodeParams { unsigned int type; // CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 int reserved0[3]; // Union starts at offset 16 (232 bytes total) - unsigned long long handle; // CUgraphConditionalHandle - unsigned int condType; // CU_GRAPH_COND_TYPE_WHILE = 1 - unsigned int size; // 1 for while - void *phGraph_out; // CUgraph* output array - void *ctx; // CUcontext + unsigned long long handle; // CUgraphConditionalHandle + unsigned int condType; // CU_GRAPH_COND_TYPE_WHILE = 1 + unsigned int size; // 1 for while + void *phGraph_out; // CUgraph* output array + void *ctx; // CUcontext char _pad[232 - 8 - 4 - 4 - 8 - 8]; long long reserved2; }; @@ -83,8 +83,8 @@ class KernelLauncher : public LLVM::KernelLauncher { std::unordered_map cuda_graph_cache_; // JIT-compiled condition kernel for graph_while conditional nodes - void *cond_kernel_module_{nullptr}; // CUmodule - void *cond_kernel_func_{nullptr}; // CUfunction + void *cond_kernel_module_{nullptr}; // CUmodule + void *cond_kernel_func_{nullptr}; // CUfunction }; } // namespace cuda diff --git a/tests/python/test_cuda_graph_while.py b/tests/python/test_cuda_graph_while.py index 233e65dba8..45ef7e445f 100644 --- a/tests/python/test_cuda_graph_while.py +++ 
b/tests/python/test_cuda_graph_while.py @@ -1,6 +1,7 @@ import numpy as np -import pytest + import quadrants as qd + from tests import test_utils @@ -9,9 +10,8 @@ def test_graph_while_counter(): """Test graph_while with a counter that decrements each iteration.""" N = 64 - @qd.kernel(graph_while='counter') - def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="counter") + def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): @@ -36,9 +36,8 @@ def test_graph_while_boolean_done(): N = 64 threshold = 7 - @qd.kernel(graph_while='keep_going') - def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), - keep_going: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="keep_going") + def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): @@ -63,10 +62,12 @@ def test_graph_while_multiple_loops(): """Test graph_while with multiple top-level loops in the kernel body.""" N = 32 - @qd.kernel(graph_while='counter') - def multi_loop(x: qd.types.ndarray(qd.f32, ndim=1), - y: qd.types.ndarray(qd.f32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="counter") + def multi_loop( + x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0), + ): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): @@ -95,9 +96,8 @@ def test_graph_while_replay(): """Test that graph_while works correctly on subsequent calls (graph replay).""" N = 16 - @qd.kernel(graph_while='counter') - def inc(x: qd.types.ndarray(qd.i32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="counter") + def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: 
qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): diff --git a/tests/python/test_graph_while_cross_backend.py b/tests/python/test_graph_while_cross_backend.py index 0d1b64be3d..9c5b6df639 100644 --- a/tests/python/test_graph_while_cross_backend.py +++ b/tests/python/test_graph_while_cross_backend.py @@ -1,6 +1,7 @@ import numpy as np -import pytest + import quadrants as qd + from tests import test_utils @@ -10,9 +11,8 @@ def test_graph_while_counter_cross_backend(): N = 64 ITERS = 5 - @qd.kernel(graph_while='counter') - def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="counter") + def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): @@ -41,11 +41,12 @@ def test_graph_while_boolean_reduction_cross_backend(): """ N = 32 - @qd.kernel(graph_while='keep_going') + @qd.kernel(graph_while="keep_going") def increment_until_all_done( - x: qd.types.ndarray(qd.i32, ndim=1), - thresholds: qd.types.ndarray(qd.i32, ndim=1), - keep_going: qd.types.ndarray(qd.i32, ndim=0)): + x: qd.types.ndarray(qd.i32, ndim=1), + thresholds: qd.types.ndarray(qd.i32, ndim=1), + keep_going: qd.types.ndarray(qd.i32, ndim=0), + ): # Work: increment elements that haven't reached their threshold for i in range(x.shape[0]): if x[i] < thresholds[i]: @@ -81,10 +82,12 @@ def test_graph_while_multi_loop_cross_backend(): N = 16 ITERS = 8 - @qd.kernel(graph_while='counter') - def multi_loop(a: qd.types.ndarray(qd.f32, ndim=1), - b: qd.types.ndarray(qd.f32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="counter") + def multi_loop( + a: qd.types.ndarray(qd.f32, ndim=1), + b: qd.types.ndarray(qd.f32, ndim=1), + counter: qd.types.ndarray(qd.i32, ndim=0), + ): for i in range(a.shape[0]): a[i] = a[i] + 1.0 for i in range(b.shape[0]): @@ -113,9 +116,8 
@@ def test_graph_while_replay_cross_backend(): """graph_while replay: second call with different counter value.""" N = 16 - @qd.kernel(graph_while='counter') - def inc(x: qd.types.ndarray(qd.i32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="counter") + def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): @@ -150,9 +152,8 @@ def test_graph_while_single_iteration(): """ N = 8 - @qd.kernel(graph_while='counter') - def inc(x: qd.types.ndarray(qd.i32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_while="counter") + def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): From e00fc15340002f5ae20aa804e2a866e2364960d1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 12:16:53 -0700 Subject: [PATCH 016/128] Fix clang-format whitespace in kernel_launcher.cpp --- quadrants/runtime/cuda/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 1de6d3e529..e98f8b8f9e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -188,8 +188,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, } // --- Instantiate and launch --- - CUDADriver::get_instance().graph_instantiate( - &cached.graph_exec, graph, nullptr, nullptr, 0); + CUDADriver::get_instance().graph_instantiate(&cached.graph_exec, graph, + nullptr, nullptr, 0); auto *stream = CUDAContext::get_instance().get_stream(); CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); From 0031619ade14f0c58cbe1fca181d1f1bc485d4ad Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 12:29:12 -0700 Subject: [PATCH 017/128] Reject cuda_graph=True on 
kernels with struct return values The graph path doesn't copy the result buffer back to the host, so struct returns would silently return stale data. Error early instead of producing wrong results. --- quadrants/runtime/cuda/kernel_launcher.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index e98f8b8f9e..ce930cf78c 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -112,6 +112,10 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, return false; } + QD_ERROR_IF(ctx.result_buffer_size > 0, + "cuda_graph=True is not supported for kernels with struct return " + "values; remove cuda_graph=True or avoid returning values"); + if (!resolve_ctx_ndarray_ptrs(ctx, parameters)) { return false; } From 792ff3478537ea6dfec6455392dfffa51390a701 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 12:32:21 -0700 Subject: [PATCH 018/128] Add test for cuda_graph with different-sized arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verifies that calling a cuda_graph=True kernel first with small arrays then with larger ones produces correct results for all elements — catches stale grid dims if the graph were incorrectly replayed from the first capture. 
--- tests/python/test_cuda_graph.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 3bdef991ef..2b2d064e06 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -148,6 +148,35 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert np.allclose(y1_np, 4.0), f"y1 should be unchanged, got {y1_np[:5]}" +@test_utils.test(arch=[qd.cuda]) +def test_cuda_graph_different_sizes(): + """Graph must produce correct results when called with different-sized arrays. + + Catches stale grid dims: if the graph cached from the small call is + replayed for the large call, elements beyond the original size stay zero. + """ + + @qd.kernel(cuda_graph=True) + def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + + x1 = qd.ndarray(qd.f32, shape=(256,)) + y1 = qd.ndarray(qd.f32, shape=(256,)) + add_one(x1, y1) + + x2 = qd.ndarray(qd.f32, shape=(1024,)) + y2 = qd.ndarray(qd.f32, shape=(1024,)) + add_one(x2, y2) + + x2_np = x2.to_numpy() + y2_np = y2.to_numpy() + assert np.allclose(x2_np, 1.0), f"Expected all 1.0, got {x2_np[250:260]}" + assert np.allclose(y2_np, 2.0), f"Expected all 2.0, got {y2_np[250:260]}" + + @test_utils.test() def test_cuda_graph_annotation_cross_platform(): """cuda_graph=True should be a harmless no-op on non-CUDA backends.""" From 334c2e8d197de29c31e796b1c4cc0100b01e2c17 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 12:34:03 -0700 Subject: [PATCH 019/128] Restore comments removed during cuda graph refactor Re-add documentation comments for |transfers|, |device_ptrs|, zero-sized array handling, external array logic, and the host copy-back section in the non-graph launch path. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index ce930cf78c..3ecc147ebe 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -225,10 +225,24 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, CUDAContext::get_instance().make_current(); + // |transfers| is only used for external arrays whose data is originally on + // host. They are first transferred onto device and that device pointer is + // stored in |device_ptrs| below. |transfers| saves its original pointer so + // that we can copy the data back once kernel finishes. as well as the + // temporary device allocations, which can be freed after kernel finishes. Key + // is [arg_id, ptr_pos], where ptr_pos is TypeFactory::DATA_PTR_POS_IN_NDARRAY + // for data_ptr and TypeFactory::GRAD_PTR_POS_IN_NDARRAY for grad_ptr. Value + // is [host_ptr, temporary_device_alloc]. Invariant: temp_devallocs.size() != + // 0 <==> transfer happened. std::unordered_map, ArgArrayPtrKeyHasher> transfers; + // |device_ptrs| stores pointers on device for all arrays args, including + // external arrays and ndarrays, no matter whether the data is originally on + // device or host. + // This is the source of truth for us to look for device pointers used in CUDA + // kernels. std::unordered_map device_ptrs; char *device_result_buffer{nullptr}; @@ -243,6 +257,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, const auto ¶meter = kv.second; if (parameter.is_array) { const auto arr_sz = ctx.array_runtime_sizes[arg_id]; + // Note: both numpy and PyTorch support arrays/tensors with zeros + // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes + // `arr_sz` zero. 
if (arr_sz == 0) { continue; } @@ -254,7 +271,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { + // External array + // Note: assuming both data & grad are on the same device if (on_cuda_device(data_ptr)) { + // data_ptr is a raw ptr on CUDA device device_ptrs[data_ptr_idx] = data_ptr; device_ptrs[grad_ptr_idx] = grad_ptr; } else { @@ -284,7 +304,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); } else if (arr_sz > 0) { + // Ndarray DeviceAllocation *ptr = static_cast(data_ptr); + // Unwrapped raw ptr on device device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); if (grad_ptr != nullptr) { @@ -332,6 +354,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, nullptr); } CUDADriver::get_instance().mem_free_async(device_result_buffer, nullptr); + // copy data back to host if (transfers.size() > 0) { CUDADriver::get_instance().stream_synchronize(nullptr); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From 8f56ffd0fe2889d87852da0667f818e2e248e27f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 12:37:34 -0700 Subject: [PATCH 020/128] Add test for cuda_graph after qd.reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verify that a cuda_graph=True kernel works correctly after a reset/reinit cycle — exercises the full teardown and rebuild of the KernelLauncher and its graph cache. 
--- tests/python/test_cuda_graph.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 2b2d064e06..40e397e03e 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -177,6 +177,37 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi assert np.allclose(y2_np, 2.0), f"Expected all 2.0, got {y2_np[250:260]}" +@test_utils.test(arch=[qd.cuda]) +def test_cuda_graph_after_reset(): + """cuda_graph=True kernel must work correctly after qd.reset().""" + + @qd.kernel(cuda_graph=True) + def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + + n = 256 + x = qd.ndarray(qd.f32, shape=(n,)) + y = qd.ndarray(qd.f32, shape=(n,)) + add_one(x, y) + add_one(x, y) + + assert np.allclose(x.to_numpy(), 2.0) + assert np.allclose(y.to_numpy(), 4.0) + + qd.reset() + qd.init(arch=qd.cuda) + + x2 = qd.ndarray(qd.f32, shape=(n,)) + y2 = qd.ndarray(qd.f32, shape=(n,)) + add_one(x2, y2) + + assert np.allclose(x2.to_numpy(), 1.0) + assert np.allclose(y2.to_numpy(), 2.0) + + @test_utils.test() def test_cuda_graph_annotation_cross_platform(): """cuda_graph=True should be a harmless no-op on non-CUDA backends.""" From f8ff3ee878be66c36b68204cfb8d39d2a9bc5c14 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 15:49:33 -0700 Subject: [PATCH 021/128] Fix graph_while cache staleness when counter ndarray changes The condition kernel's flag pointer was baked into the CUDA graph at creation time. Passing a different ndarray on replay would cause the condition kernel to read from a stale device address. Invalidate the cached graph when the flag pointer changes so it gets rebuilt. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 25 ++++++++---- quadrants/runtime/cuda/kernel_launcher.h | 1 + tests/python/test_cuda_graph_while.py | 39 ++++++++++++++++++ .../python/test_graph_while_cross_backend.py | 40 +++++++++++++++++++ 4 files changed, 98 insertions(+), 7 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 5467c6c050..5f5aedbdde 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -237,14 +237,22 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, auto it = cuda_graph_cache_.find(launch_id); if (it != cuda_graph_cache_.end()) { auto &cached = it->second; - if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().memcpy_host_to_device( - cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, - cached.arg_buffer_size); + if (use_graph_while && + cached.graph_while_flag_dev_ptr != ctx.graph_while_flag_dev_ptr) { + QD_TRACE( + "graph_while flag pointer changed ({} -> {}), rebuilding CUDA graph", + cached.graph_while_flag_dev_ptr, ctx.graph_while_flag_dev_ptr); + cuda_graph_cache_.erase(it); + } else { + if (ctx.arg_buffer_size > 0) { + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); + } + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + return true; } - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); - return true; } CUDAContext::get_instance().make_current(); @@ -388,6 +396,9 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, offloaded_tasks.size(), launch_id, use_graph_while ? 
" (with graph_while)" : ""); + if (use_graph_while) { + cached.graph_while_flag_dev_ptr = ctx.graph_while_flag_dev_ptr; + } cuda_graph_cache_.emplace(launch_id, std::move(cached)); return true; } diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 630a5bdb85..2038e5e9d4 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -47,6 +47,7 @@ struct CachedCudaGraph { RuntimeContext persistent_ctx{}; std::size_t arg_buffer_size{0}; std::size_t result_buffer_size{0}; + void *graph_while_flag_dev_ptr{nullptr}; CachedCudaGraph() = default; ~CachedCudaGraph(); diff --git a/tests/python/test_cuda_graph_while.py b/tests/python/test_cuda_graph_while.py index 45ef7e445f..c47104452b 100644 --- a/tests/python/test_cuda_graph_while.py +++ b/tests/python/test_cuda_graph_while.py @@ -119,3 +119,42 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n inc(x, counter) qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) + + +@test_utils.test(arch=[qd.cuda]) +def test_graph_while_replay_new_ndarray(): + """Test graph_while replay when the counter ndarray is a different allocation. + + Regression test: the condition kernel's flag pointer was baked into the + CUDA graph at creation time. Passing a new ndarray (different device + address) on replay would cause the condition kernel to read stale memory. + The fix invalidates the cached graph when the flag pointer changes. 
+ """ + N = 16 + + @qd.kernel(graph_while="counter") + def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + + # First call with one counter ndarray + counter1 = qd.ndarray(qd.i32, shape=()) + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter1.from_numpy(np.array(3, dtype=np.int32)) + inc(x, counter1) + qd.sync() + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) + assert counter1.to_numpy() == 0 + + # Second call with a NEW counter ndarray (different device allocation) + counter2 = qd.ndarray(qd.i32, shape=()) + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter2.from_numpy(np.array(5, dtype=np.int32)) + inc(x, counter2) + qd.sync() + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 5, dtype=np.int32)) + assert counter2.to_numpy() == 0 diff --git a/tests/python/test_graph_while_cross_backend.py b/tests/python/test_graph_while_cross_backend.py index 9c5b6df639..e49eb0729c 100644 --- a/tests/python/test_graph_while_cross_backend.py +++ b/tests/python/test_graph_while_cross_backend.py @@ -143,6 +143,46 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n assert counter.to_numpy() == 0 +@test_utils.test(arch=[qd.cpu, qd.cuda]) +def test_graph_while_replay_new_ndarray_cross_backend(): + """graph_while replay with a different ndarray allocation for the counter. + + Regression test: on CUDA, the condition kernel's flag pointer was baked + into the graph at creation time. Passing a new ndarray on replay would + read stale memory. The fix invalidates the cached graph when the flag + pointer changes. On CPU the host-side fallback always reads from the + current pointer, so this verifies both paths produce correct results. 
+ """ + N = 16 + + @qd.kernel(graph_while="counter") + def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[None] = counter[None] - 1 + + x = qd.ndarray(qd.i32, shape=(N,)) + + # First call with one counter ndarray + counter1 = qd.ndarray(qd.i32, shape=()) + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter1.from_numpy(np.array(4, dtype=np.int32)) + inc(x, counter1) + qd.sync() + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 4, dtype=np.int32)) + assert counter1.to_numpy() == 0 + + # Second call with a NEW counter ndarray (different device allocation) + counter2 = qd.ndarray(qd.i32, shape=()) + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter2.from_numpy(np.array(6, dtype=np.int32)) + inc(x, counter2) + qd.sync() + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 6, dtype=np.int32)) + assert counter2.to_numpy() == 0 + + @test_utils.test(arch=[qd.cpu, qd.cuda]) def test_graph_while_single_iteration(): """graph_while with counter=1 executes the body exactly once. From 96b43def98f3814784886c1962f03e68c8136439 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:05:02 -0700 Subject: [PATCH 022/128] Validate graph_while parameter name at decoration time Raise ValueError immediately if the graph_while name doesn't match any kernel parameter, instead of silently running the kernel once without looping. Also document the CUDA API version for CudaGraphNodeParams. 
--- python/quadrants/lang/kernel_impl.py | 7 +++++++ quadrants/runtime/cuda/kernel_launcher.h | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index 5e4fc68b29..b5ab435e41 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -143,6 +143,13 @@ def _kernel_impl( adjoint = Kernel(_func, autodiff_mode=_REVERSE, _is_classkernel=is_classkernel) primal.use_cuda_graph = cuda_graph primal.graph_while_arg = graph_while + if graph_while is not None: + arg_names = [m.name for m in primal.arg_metas] + if graph_while not in arg_names: + raise ValueError( + f"graph_while={graph_while!r} does not match any parameter of " + f"kernel {_func.__name__!r}. Available parameters: {arg_names}" + ) # Having |primal| contains |grad| makes the tape work. primal.grad = adjoint diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 2038e5e9d4..ce7578721d 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -22,8 +22,9 @@ struct CudaKernelNodeParams { void **extra; }; -// Mirrors CUgraphNodeParams layout for conditional while nodes. -// See CUDA driver API: CUgraphNodeParams / CUDA_CONDITIONAL_NODE_PARAMS. +// Mirrors CUDA driver API CUgraphNodeParams / CUDA_CONDITIONAL_NODE_PARAMS. +// Field order verified against cuda-python bindings (handle, type, size, +// phGraph_out, ctx). Introduced in CUDA 12.4; layout stable through 13.2+. 
struct CudaGraphNodeParams { unsigned int type; // CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 int reserved0[3]; From 501362f1a3626b1c5d1dbbe8cc7ca45454ffe212 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 17:18:58 -0400 Subject: [PATCH 023/128] Add CUDA graph documentation page Made-with: Cursor --- docs/source/user_guide/cuda_graph.md | 109 +++++++++++++++++++++++++++ docs/source/user_guide/index.md | 10 ++- 2 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 docs/source/user_guide/cuda_graph.md diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md new file mode 100644 index 0000000000..abbf87aaf4 --- /dev/null +++ b/docs/source/user_guide/cuda_graph.md @@ -0,0 +1,109 @@ +# CUDA Graph + +CUDA graphs reduce kernel launch overhead by capturing a sequence of GPU operations into a graph, then replaying it in a single launch. This is most beneficial for kernels that compile into multiple GPU tasks (e.g. kernels with multiple top-level `for` loops), where the per-task launch overhead would otherwise dominate. + +## Usage + +Add `cuda_graph=True` to the `@qd.kernel` decorator: + +```python +@qd.kernel(cuda_graph=True) +def my_kernel( + x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1), +): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 +``` + +The kernel is used normally — no other API changes are needed: + +```python +x = qd.ndarray(qd.f32, shape=(1024,)) +y = qd.ndarray(qd.f32, shape=(1024,)) + +my_kernel(x, y) # first call: builds and caches the graph +my_kernel(x, y) # subsequent calls: replays the cached graph +``` + +### When to use + +Use `cuda_graph=True` on kernels that: + +- Run on CUDA (`arch=qd.cuda`) +- Contain **two or more** top-level `for` loops (i.e. 
compile into multiple offloaded tasks) +- Are called repeatedly with arguments of the same shape + +Kernels with a single `for` loop compile into a single GPU task, so there is no multi-launch overhead to eliminate. The graph path will gracefully fall back to the normal launch path in this case. + +### Restrictions + +- **No struct return values.** Kernels that return values (e.g. `-> qd.i32`) cannot use CUDA graphs. An error is raised if `cuda_graph=True` is set on such a kernel. +- **Primal kernels only.** The `cuda_graph=True` flag is applied to the primal (forward) kernel only, not its adjoint. Autodiff kernels use the normal launch path. +- **Non-CUDA backends.** On non-CUDA backends (CPU, Vulkan, Metal), `cuda_graph=True` is silently ignored. This means you can annotate a kernel unconditionally and it will work on all platforms. + +### Passing different arguments + +You can pass different ndarrays to the same kernel on subsequent calls. The cached graph is replayed with the updated arguments — no graph rebuild occurs: + +```python +x1 = qd.ndarray(qd.f32, shape=(1024,)) +y1 = qd.ndarray(qd.f32, shape=(1024,)) +my_kernel(x1, y1) # builds graph + +x2 = qd.ndarray(qd.f32, shape=(1024,)) +y2 = qd.ndarray(qd.f32, shape=(1024,)) +my_kernel(x2, y2) # replays graph with new array pointers +``` + +### Fields as arguments + +Fields (SNode-backed data created with `qd.field`) are accessed through the global runtime pointer, not through the kernel argument buffer. The graph captures this pointer at build time, so fields work transparently with CUDA graphs. + +When different fields are passed as template arguments, each unique combination of fields produces a separately compiled kernel with its own graph cache entry. There is no interference between them. + +--- + +## Advanced: Implementation Details + +### Graph build and replay + +On the first call to a `cuda_graph=True` kernel, the runtime: + +1. 
**Allocates persistent device buffers** for the kernel's argument buffer and result buffer. These live for the lifetime of the runtime (until `qd.reset()`). +2. **Copies the host argument buffer** (containing scalar values and resolved device pointers for ndarrays) into the persistent device argument buffer. +3. **Builds a `RuntimeContext`** whose `arg_buffer` and `result_buffer` point at the persistent device buffers, and whose `runtime` pointer points at the `LLVMRuntime`. This `RuntimeContext` is stored inside the cache entry at a stable address. +4. **Constructs a CUDA graph** by iterating over the kernel's offloaded tasks and adding each as a kernel node. Each node receives a pointer to the persistent `RuntimeContext` as its sole kernel parameter. Nodes are chained with sequential dependencies. +5. **Instantiates** the graph into an executable (`cuGraphInstantiate`) and launches it. +6. **Caches** the graph executable, persistent buffers, and `RuntimeContext` in a map keyed by `launch_id`. + +On subsequent calls (cache hit), the runtime: + +1. **Copies the updated host argument buffer** into the persistent device argument buffer via `cuMemcpyHtoD`. This is the only operation needed — the graph's kernel nodes already point at the persistent `RuntimeContext`, which already points at the persistent argument buffer. +2. **Replays** the cached graph via `cuGraphLaunch`. + +### How arguments reach the GPU kernels + +Each compiled GPU kernel takes a single parameter: a pointer to `RuntimeContext`. The `RuntimeContext` contains: + +- `arg_buffer`: a device-side buffer holding serialized scalar arguments and resolved ndarray device pointers +- `result_buffer`: a device-side buffer for return values +- `runtime`: a pointer to `LLVMRuntime`, which holds field/SNode tree data + +For CUDA graphs, these pointers are baked into the graph at capture time. 
On replay, the *contents* of the argument buffer are updated (via a host-to-device memcpy), but the *pointers* themselves remain stable. This is what allows the graph to be replayed without rebuilding. + +Before the argument buffer is copied to the device, `resolve_ctx_ndarray_ptrs` walks all array parameters and resolves `DeviceAllocation` handles into raw device pointers, writing them into the argument buffer. This ensures that even when different ndarrays are passed on subsequent calls, the argument buffer contains the correct device addresses. + +### Cache keying and template specialization + +The graph cache is keyed by `launch_id`, an integer assigned by `register_llvm_kernel` when a `CompiledKernelData` is first seen. Each unique combination of template arguments (including field arguments) produces a different compiled kernel with a different `launch_id`. This means: + +- Calling the same kernel with field A and field B results in two independent compiled kernels, two independent `launch_id` values, and two independent graph cache entries. +- Each cached graph contains kernel nodes compiled specifically for that field combination's SNode layout. +- There is no risk of one template specialization's graph being replayed for a different specialization. + +### Lifetime and cleanup + +The `CachedCudaGraph` struct owns the graph executable and persistent device buffers via RAII. When the `KernelLauncher` is destroyed (which happens on `qd.reset()`), all cached graphs and their device allocations are freed. After a reset, the next kernel call triggers a fresh graph build against the new runtime. 
diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md index 87c0c78fc4..e7f8c46b30 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -32,11 +32,19 @@ interop ``` ```{toctree} -:caption: Reference +:caption: Performance :maxdepth: 1 :titlesonly: +cuda_graph perf_dispatch +``` + +```{toctree} +:caption: Reference +:maxdepth: 1 +:titlesonly: + unsupported_python python_backend debug From 517d3db9fb85e9cf557f7b33b7d818b67a190ebf Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 18:03:43 -0400 Subject: [PATCH 024/128] Expose CUDA graph cache size for test observability Add get_cuda_graph_cache_size() through the KernelLauncher -> Program -> pybind chain so tests can verify that graphs are actually being created (or not) rather than only checking output correctness. Made-with: Cursor --- quadrants/program/kernel_launcher.h | 4 ++++ quadrants/program/program.h | 4 ++++ quadrants/python/export_lang.cpp | 3 ++- quadrants/runtime/cuda/kernel_launcher.h | 3 +++ tests/python/test_cuda_graph.py | 10 ++++++++++ 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/quadrants/program/kernel_launcher.h b/quadrants/program/kernel_launcher.h index 2adce9d2da..ce294c475b 100644 --- a/quadrants/program/kernel_launcher.h +++ b/quadrants/program/kernel_launcher.h @@ -12,6 +12,10 @@ class KernelLauncher { virtual void launch_kernel(const CompiledKernelData &compiled_kernel_data, LaunchContextBuilder &ctx) = 0; + virtual std::size_t get_cuda_graph_cache_size() const { + return 0; + } + virtual ~KernelLauncher() = default; }; diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 1fa2c2ac57..783703c115 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -133,6 +133,10 @@ class QD_DLL_EXPORT Program { void launch_kernel(const CompiledKernelData &compiled_kernel_data, LaunchContextBuilder &ctx); + std::size_t get_cuda_graph_cache_size() { + return 
program_impl_->get_kernel_launcher().get_cuda_graph_cache_size(); + } + DeviceCapabilityConfig get_device_caps() { return program_impl_->get_device_caps(); } diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index d155162667..f8d94e24fe 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -495,7 +495,8 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) - .def("get_device_caps", &Program::get_device_caps); + .def("get_device_caps", &Program::get_device_caps) + .def("get_cuda_graph_cache_size", &Program::get_cuda_graph_cache_size); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 4e063bd1c6..050b7c6d3a 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -52,6 +52,9 @@ class KernelLauncher : public LLVM::KernelLauncher { void launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) override; Handle register_llvm_kernel( const LLVM::CompiledKernelData &compiled) override; + std::size_t get_cuda_graph_cache_size() const override { + return cuda_graph_cache_.size(); + } private: bool on_cuda_device(void *ptr); diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 40e397e03e..a70fc88cdf 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -1,10 +1,15 @@ import numpy as np import quadrants as qd +from quadrants.lang import impl from tests import test_utils +def _cuda_graph_cache_size(): + return impl.get_runtime().prog.get_cuda_graph_cache_size() + + @test_utils.test(arch=[qd.cuda]) def test_cuda_graph_two_loops(): """A kernel with two top-level for loops should be fused into a CUDA graph.""" @@ -19,9 +24,12 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: 
qd.types.ndarray(qd.f32, n for i in range(y.shape[0]): y[i] = y[i] + 2.0 + assert _cuda_graph_cache_size() == 0 two_loops(x, y) + assert _cuda_graph_cache_size() == 1 two_loops(x, y) two_loops(x, y) + assert _cuda_graph_cache_size() == 1 x_np = x.to_numpy() y_np = y.to_numpy() @@ -81,6 +89,7 @@ def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): single_loop(x) single_loop(x) + assert _cuda_graph_cache_size() == 0 x_np = x.to_numpy() assert np.allclose(x_np, 10.0) @@ -102,6 +111,7 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n two_loops(x, y) two_loops(x, y) + assert _cuda_graph_cache_size() == 0 x_np = x.to_numpy() y_np = y.to_numpy() From da3ff27bbc440c81f786850c2906007e0ccd31a2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 18:48:03 -0400 Subject: [PATCH 025/128] Add get_cuda_graph_cache_used_on_last_call() for test observability Tracks whether the CUDA graph cache was used on the most recent kernel launch, exposed through KernelLauncher -> Program -> pybind so tests can assert the graph path was (or was not) taken. 
Made-with: Cursor --- quadrants/program/kernel_launcher.h | 4 ++++ quadrants/program/program.h | 5 +++++ quadrants/python/export_lang.cpp | 4 +++- quadrants/runtime/cuda/kernel_launcher.cpp | 2 ++ quadrants/runtime/cuda/kernel_launcher.h | 4 ++++ tests/python/test_cuda_graph.py | 10 ++++++++++ 6 files changed, 28 insertions(+), 1 deletion(-) diff --git a/quadrants/program/kernel_launcher.h b/quadrants/program/kernel_launcher.h index ce294c475b..f800768269 100644 --- a/quadrants/program/kernel_launcher.h +++ b/quadrants/program/kernel_launcher.h @@ -16,6 +16,10 @@ class KernelLauncher { return 0; } + virtual bool get_cuda_graph_cache_used_on_last_call() const { + return false; + } + virtual ~KernelLauncher() = default; }; diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 783703c115..7ceee2730e 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -137,6 +137,11 @@ class QD_DLL_EXPORT Program { return program_impl_->get_kernel_launcher().get_cuda_graph_cache_size(); } + bool get_cuda_graph_cache_used_on_last_call() { + return program_impl_->get_kernel_launcher() + .get_cuda_graph_cache_used_on_last_call(); + } + DeviceCapabilityConfig get_device_caps() { return program_impl_->get_device_caps(); } diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index f8d94e24fe..fb425991bc 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -496,7 +496,9 @@ void export_lang(py::module &m) { py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) .def("get_device_caps", &Program::get_device_caps) - .def("get_cuda_graph_cache_size", &Program::get_cuda_graph_cache_size); + .def("get_cuda_graph_cache_size", &Program::get_cuda_graph_cache_size) + .def("get_cuda_graph_cache_used_on_last_call", + &Program::get_cuda_graph_cache_used_on_last_call); py::class_(m, "CompileResult") .def_property_readonly( diff --git 
a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 3ecc147ebe..de0b63c81f 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -213,9 +213,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.use_cuda_graph) { if (launch_llvm_kernel_graph(handle, ctx)) { + cuda_graph_cache_used_on_last_call_ = true; return; } } + cuda_graph_cache_used_on_last_call_ = false; auto launcher_ctx = contexts_[handle.get_launch_id()]; auto *executor = get_runtime_executor(); diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 050b7c6d3a..6bbaca5e49 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -55,6 +55,9 @@ class KernelLauncher : public LLVM::KernelLauncher { std::size_t get_cuda_graph_cache_size() const override { return cuda_graph_cache_.size(); } + bool get_cuda_graph_cache_used_on_last_call() const override { + return cuda_graph_cache_used_on_last_call_; + } private: bool on_cuda_device(void *ptr); @@ -64,6 +67,7 @@ class KernelLauncher : public LLVM::KernelLauncher { bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); std::vector contexts_; std::unordered_map cuda_graph_cache_; + bool cuda_graph_cache_used_on_last_call_{false}; }; } // namespace cuda diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index a70fc88cdf..51717d0185 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -10,6 +10,10 @@ def _cuda_graph_cache_size(): return impl.get_runtime().prog.get_cuda_graph_cache_size() +def _cuda_graph_used(): + return impl.get_runtime().prog.get_cuda_graph_cache_used_on_last_call() + + @test_utils.test(arch=[qd.cuda]) def test_cuda_graph_two_loops(): """A kernel with two top-level for loops should be fused into a CUDA graph.""" @@ -27,7 +31,9 @@ def two_loops(x: 
qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert _cuda_graph_cache_size() == 0 two_loops(x, y) assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() two_loops(x, y) + assert _cuda_graph_used() two_loops(x, y) assert _cuda_graph_cache_size() == 1 @@ -88,7 +94,9 @@ def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): x[i] = x[i] + 5.0 single_loop(x) + assert not _cuda_graph_used() single_loop(x) + assert not _cuda_graph_used() assert _cuda_graph_cache_size() == 0 x_np = x.to_numpy() @@ -110,7 +118,9 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n y[i] = y[i] + 2.0 two_loops(x, y) + assert not _cuda_graph_used() two_loops(x, y) + assert not _cuda_graph_used() assert _cuda_graph_cache_size() == 0 x_np = x.to_numpy() From a2abceb58d7b43dbd3562ffd243dd3d15742acc9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 18:50:47 -0400 Subject: [PATCH 026/128] Add cache size and cache used assertions to all CUDA graph tests Every test now verifies graph caching behavior, not just output correctness. Cross-platform test uses platform_supports_graph to make assertions conditional on the backend. 
Made-with: Cursor --- tests/python/test_cuda_graph.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 51717d0185..429e0fe78d 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -14,6 +14,10 @@ def _cuda_graph_used(): return impl.get_runtime().prog.get_cuda_graph_cache_used_on_last_call() +def _on_cuda(): + return impl.current_cfg().arch == qd.cuda + + @test_utils.test(arch=[qd.cuda]) def test_cuda_graph_two_loops(): """A kernel with two top-level for loops should be fused into a CUDA graph.""" @@ -62,7 +66,10 @@ def three_loops( for i in range(c.shape[0]): c[i] = a[i] + b[i] + assert _cuda_graph_cache_size() == 0 three_loops(a, b, c) + assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() a_np = a.to_numpy() b_np = b.to_numpy() @@ -72,6 +79,8 @@ def three_loops( assert np.allclose(c_np, 11.0) three_loops(a, b, c) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 a_np = a.to_numpy() b_np = b.to_numpy() @@ -143,8 +152,12 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n x1 = qd.ndarray(qd.f32, shape=(n,)) y1 = qd.ndarray(qd.f32, shape=(n,)) + assert _cuda_graph_cache_size() == 0 two_loops(x1, y1) + assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() two_loops(x1, y1) + assert _cuda_graph_used() x1_np = x1.to_numpy() y1_np = y1.to_numpy() @@ -156,6 +169,8 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n x2.from_numpy(np.full(n, 10.0, dtype=np.float32)) y2.from_numpy(np.full(n, 20.0, dtype=np.float32)) two_loops(x2, y2) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 x2_np = x2.to_numpy() y2_np = y2.to_numpy() @@ -185,11 +200,15 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi x1 = qd.ndarray(qd.f32, shape=(256,)) y1 = qd.ndarray(qd.f32, shape=(256,)) + assert 
_cuda_graph_cache_size() == 0 add_one(x1, y1) + assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() x2 = qd.ndarray(qd.f32, shape=(1024,)) y2 = qd.ndarray(qd.f32, shape=(1024,)) add_one(x2, y2) + assert _cuda_graph_used() x2_np = x2.to_numpy() y2_np = y2.to_numpy() @@ -212,7 +231,10 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi x = qd.ndarray(qd.f32, shape=(n,)) y = qd.ndarray(qd.f32, shape=(n,)) add_one(x, y) + assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() add_one(x, y) + assert _cuda_graph_used() assert np.allclose(x.to_numpy(), 2.0) assert np.allclose(y.to_numpy(), 4.0) @@ -222,7 +244,10 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi x2 = qd.ndarray(qd.f32, shape=(n,)) y2 = qd.ndarray(qd.f32, shape=(n,)) + assert _cuda_graph_cache_size() == 0 add_one(x2, y2) + assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() assert np.allclose(x2.to_numpy(), 1.0) assert np.allclose(y2.to_numpy(), 2.0) @@ -231,6 +256,7 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi @test_utils.test() def test_cuda_graph_annotation_cross_platform(): """cuda_graph=True should be a harmless no-op on non-CUDA backends.""" + platform_supports_graph = _on_cuda() n = 256 x = qd.ndarray(qd.f32, shape=(n,)) y = qd.ndarray(qd.f32, shape=(n,)) @@ -242,8 +268,14 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n for i in range(y.shape[0]): y[i] = y[i] + 2.0 + assert _cuda_graph_cache_size() == 0 two_loops(x, y) + expected_cache_size = 1 if platform_supports_graph else 0 + assert _cuda_graph_cache_size() == expected_cache_size + assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) + assert _cuda_graph_used() == platform_supports_graph + assert _cuda_graph_cache_size() == expected_cache_size x_np = x.to_numpy() y_np = y.to_numpy() From 720f5d8661a57900e8e8d38d3777bad26b3f5005 Mon Sep 17 00:00:00 2001 
From: Hugh Perkins Date: Fri, 13 Mar 2026 18:54:10 -0400 Subject: [PATCH 027/128] Inline expected cache size in cross-platform test assertion Made-with: Cursor --- tests/python/test_cuda_graph.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 429e0fe78d..6257668cb6 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -270,12 +270,11 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert _cuda_graph_cache_size() == 0 two_loops(x, y) - expected_cache_size = 1 if platform_supports_graph else 0 - assert _cuda_graph_cache_size() == expected_cache_size + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) assert _cuda_graph_used() == platform_supports_graph - assert _cuda_graph_cache_size() == expected_cache_size + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) x_np = x.to_numpy() y_np = y.to_numpy() From f158fd48e27e0c9238ffedf6ecc8b0b14a6ce331 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 18:57:20 -0400 Subject: [PATCH 028/128] Run all CUDA graph tests on all platforms Graph assertions are conditional on platform_supports_graph so they verify correct behavior on both CUDA (graphs used) and non-CUDA (graceful no-op) backends. 
Made-with: Cursor --- tests/python/test_cuda_graph.py | 64 ++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 6257668cb6..497ba14053 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -18,9 +18,10 @@ def _on_cuda(): return impl.current_cfg().arch == qd.cuda -@test_utils.test(arch=[qd.cuda]) +@test_utils.test() def test_cuda_graph_two_loops(): """A kernel with two top-level for loops should be fused into a CUDA graph.""" + platform_supports_graph = _on_cuda() n = 1024 x = qd.ndarray(qd.f32, shape=(n,)) y = qd.ndarray(qd.f32, shape=(n,)) @@ -34,12 +35,12 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert _cuda_graph_cache_size() == 0 two_loops(x, y) - assert _cuda_graph_cache_size() == 1 - assert _cuda_graph_used() + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) - assert _cuda_graph_used() + assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) - assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) x_np = x.to_numpy() y_np = y.to_numpy() @@ -47,9 +48,10 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert np.allclose(y_np, 6.0), f"Expected 6.0, got {y_np[:5]}" -@test_utils.test(arch=[qd.cuda]) +@test_utils.test() def test_cuda_graph_three_loops(): """A kernel with three top-level for loops.""" + platform_supports_graph = _on_cuda() n = 512 a = qd.ndarray(qd.f32, shape=(n,)) b = qd.ndarray(qd.f32, shape=(n,)) @@ -68,8 +70,8 @@ def three_loops( assert _cuda_graph_cache_size() == 0 three_loops(a, b, c) - assert _cuda_graph_cache_size() == 1 - assert _cuda_graph_used() + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + assert _cuda_graph_used() == 
platform_supports_graph a_np = a.to_numpy() b_np = b.to_numpy() @@ -79,8 +81,8 @@ def three_loops( assert np.allclose(c_np, 11.0) three_loops(a, b, c) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() == platform_supports_graph + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) a_np = a.to_numpy() b_np = b.to_numpy() @@ -90,7 +92,7 @@ def three_loops( assert np.allclose(c_np, 22.0) -@test_utils.test(arch=[qd.cuda]) +@test_utils.test() def test_cuda_graph_single_loop_no_graph(): """A kernel with a single for loop should NOT use the graph path, even with cuda_graph=True (falls back since < 2 tasks).""" @@ -112,7 +114,7 @@ def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): assert np.allclose(x_np, 10.0) -@test_utils.test(arch=[qd.cuda]) +@test_utils.test() def test_no_cuda_graph_annotation(): """A kernel WITHOUT cuda_graph=True should never use the graph path.""" n = 256 @@ -138,9 +140,10 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert np.allclose(y_np, 4.0) -@test_utils.test(arch=[qd.cuda]) +@test_utils.test() def test_cuda_graph_changed_args(): """Graph should produce correct results when called with different ndarrays.""" + platform_supports_graph = _on_cuda() n = 256 @qd.kernel(cuda_graph=True) @@ -154,10 +157,10 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n y1 = qd.ndarray(qd.f32, shape=(n,)) assert _cuda_graph_cache_size() == 0 two_loops(x1, y1) - assert _cuda_graph_cache_size() == 1 - assert _cuda_graph_used() + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + assert _cuda_graph_used() == platform_supports_graph two_loops(x1, y1) - assert _cuda_graph_used() + assert _cuda_graph_used() == platform_supports_graph x1_np = x1.to_numpy() y1_np = y1.to_numpy() @@ -169,8 +172,8 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n x2.from_numpy(np.full(n, 10.0, 
dtype=np.float32)) y2.from_numpy(np.full(n, 20.0, dtype=np.float32)) two_loops(x2, y2) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 + assert _cuda_graph_used() == platform_supports_graph + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) x2_np = x2.to_numpy() y2_np = y2.to_numpy() @@ -183,13 +186,14 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert np.allclose(y1_np, 4.0), f"y1 should be unchanged, got {y1_np[:5]}" -@test_utils.test(arch=[qd.cuda]) +@test_utils.test() def test_cuda_graph_different_sizes(): """Graph must produce correct results when called with different-sized arrays. Catches stale grid dims: if the graph cached from the small call is replayed for the large call, elements beyond the original size stay zero. """ + platform_supports_graph = _on_cuda() @qd.kernel(cuda_graph=True) def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): @@ -202,13 +206,13 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi y1 = qd.ndarray(qd.f32, shape=(256,)) assert _cuda_graph_cache_size() == 0 add_one(x1, y1) - assert _cuda_graph_cache_size() == 1 - assert _cuda_graph_used() + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + assert _cuda_graph_used() == platform_supports_graph x2 = qd.ndarray(qd.f32, shape=(1024,)) y2 = qd.ndarray(qd.f32, shape=(1024,)) add_one(x2, y2) - assert _cuda_graph_used() + assert _cuda_graph_used() == platform_supports_graph x2_np = x2.to_numpy() y2_np = y2.to_numpy() @@ -216,9 +220,10 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi assert np.allclose(y2_np, 2.0), f"Expected all 2.0, got {y2_np[250:260]}" -@test_utils.test(arch=[qd.cuda]) +@test_utils.test() def test_cuda_graph_after_reset(): """cuda_graph=True kernel must work correctly after qd.reset().""" + platform_supports_graph = _on_cuda() @qd.kernel(cuda_graph=True) def 
add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): @@ -231,23 +236,24 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi x = qd.ndarray(qd.f32, shape=(n,)) y = qd.ndarray(qd.f32, shape=(n,)) add_one(x, y) - assert _cuda_graph_cache_size() == 1 - assert _cuda_graph_used() + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + assert _cuda_graph_used() == platform_supports_graph add_one(x, y) - assert _cuda_graph_used() + assert _cuda_graph_used() == platform_supports_graph assert np.allclose(x.to_numpy(), 2.0) assert np.allclose(y.to_numpy(), 4.0) + arch = impl.current_cfg().arch qd.reset() - qd.init(arch=qd.cuda) + qd.init(arch=arch) x2 = qd.ndarray(qd.f32, shape=(n,)) y2 = qd.ndarray(qd.f32, shape=(n,)) assert _cuda_graph_cache_size() == 0 add_one(x2, y2) - assert _cuda_graph_cache_size() == 1 - assert _cuda_graph_used() + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + assert _cuda_graph_used() == platform_supports_graph assert np.allclose(x2.to_numpy(), 1.0) assert np.allclose(y2.to_numpy(), 2.0) From a8e6b8f172d6d1245a99a87767a65fbef2cdcbf4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:02:21 -0400 Subject: [PATCH 029/128] update doc --- docs/source/user_guide/cuda_graph.md | 63 ++-------------------------- 1 file changed, 4 insertions(+), 59 deletions(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index abbf87aaf4..4855d70909 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -1,10 +1,10 @@ # CUDA Graph -CUDA graphs reduce kernel launch overhead by capturing a sequence of GPU operations into a graph, then replaying it in a single launch. This is most beneficial for kernels that compile into multiple GPU tasks (e.g. kernels with multiple top-level `for` loops), where the per-task launch overhead would otherwise dominate. 
+CUDA graphs reduce kernel launch overhead by capturing a sequence of GPU operations into a graph, then replaying it in a single launch. On non-CUDA platforms, the CUDA graph annotation is simply ignored, and code runs normally. ## Usage -Add `cuda_graph=True` to the `@qd.kernel` decorator: +Add `cuda_graph=True` to a `@qd.kernel` decorator: ```python @qd.kernel(cuda_graph=True) def my_kernel( y[i] = y[i] + 2.0 ``` +The top-level for-loops will be compiled into a single CUDA graph. The parallelism is the same as before, but the launch latency is much reduced. + The kernel is used normally — no other API changes are needed: ```python @@ -28,21 +30,10 @@ my_kernel(x, y) # first call: builds and caches the graph my_kernel(x, y) # subsequent calls: replays the cached graph ``` -### When to use - -Use `cuda_graph=True` on kernels that: - -- Run on CUDA (`arch=qd.cuda`) -- Contain **two or more** top-level `for` loops (i.e. compile into multiple offloaded tasks) -- Are called repeatedly with arguments of the same shape - -Kernels with a single `for` loop compile into a single GPU task, so there is no multi-launch overhead to eliminate. The graph path will gracefully fall back to the normal launch path in this case. - ### Restrictions - **No struct return values.** Kernels that return values (e.g. `-> qd.i32`) cannot use CUDA graphs. An error is raised if `cuda_graph=True` is set on such a kernel. - **Primal kernels only.** The `cuda_graph=True` flag is applied to the primal (forward) kernel only, not its adjoint. Autodiff kernels use the normal launch path. -- **Non-CUDA backends.** On non-CUDA backends (CPU, Vulkan, Metal), `cuda_graph=True` is silently ignored. This means you can annotate a kernel unconditionally and it will work on all platforms.
### Passing different arguments @@ -60,50 +51,4 @@ my_kernel(x2, y2) # replays graph with new array pointers ### Fields as arguments -Fields (SNode-backed data created with `qd.field`) are accessed through the global runtime pointer, not through the kernel argument buffer. The graph captures this pointer at build time, so fields work transparently with CUDA graphs. - When different fields are passed as template arguments, each unique combination of fields produces a separately compiled kernel with its own graph cache entry. There is no interference between them. - ---- - -## Advanced: Implementation Details - -### Graph build and replay - -On the first call to a `cuda_graph=True` kernel, the runtime: - -1. **Allocates persistent device buffers** for the kernel's argument buffer and result buffer. These live for the lifetime of the runtime (until `qd.reset()`). -2. **Copies the host argument buffer** (containing scalar values and resolved device pointers for ndarrays) into the persistent device argument buffer. -3. **Builds a `RuntimeContext`** whose `arg_buffer` and `result_buffer` point at the persistent device buffers, and whose `runtime` pointer points at the `LLVMRuntime`. This `RuntimeContext` is stored inside the cache entry at a stable address. -4. **Constructs a CUDA graph** by iterating over the kernel's offloaded tasks and adding each as a kernel node. Each node receives a pointer to the persistent `RuntimeContext` as its sole kernel parameter. Nodes are chained with sequential dependencies. -5. **Instantiates** the graph into an executable (`cuGraphInstantiate`) and launches it. -6. **Caches** the graph executable, persistent buffers, and `RuntimeContext` in a map keyed by `launch_id`. - -On subsequent calls (cache hit), the runtime: - -1. **Copies the updated host argument buffer** into the persistent device argument buffer via `cuMemcpyHtoD`. 
This is the only operation needed — the graph's kernel nodes already point at the persistent `RuntimeContext`, which already points at the persistent argument buffer. -2. **Replays** the cached graph via `cuGraphLaunch`. - -### How arguments reach the GPU kernels - -Each compiled GPU kernel takes a single parameter: a pointer to `RuntimeContext`. The `RuntimeContext` contains: - -- `arg_buffer`: a device-side buffer holding serialized scalar arguments and resolved ndarray device pointers -- `result_buffer`: a device-side buffer for return values -- `runtime`: a pointer to `LLVMRuntime`, which holds field/SNode tree data - -For CUDA graphs, these pointers are baked into the graph at capture time. On replay, the *contents* of the argument buffer are updated (via a host-to-device memcpy), but the *pointers* themselves remain stable. This is what allows the graph to be replayed without rebuilding. - -Before the argument buffer is copied to the device, `resolve_ctx_ndarray_ptrs` walks all array parameters and resolves `DeviceAllocation` handles into raw device pointers, writing them into the argument buffer. This ensures that even when different ndarrays are passed on subsequent calls, the argument buffer contains the correct device addresses. - -### Cache keying and template specialization - -The graph cache is keyed by `launch_id`, an integer assigned by `register_llvm_kernel` when a `CompiledKernelData` is first seen. Each unique combination of template arguments (including field arguments) produces a different compiled kernel with a different `launch_id`. This means: - -- Calling the same kernel with field A and field B results in two independent compiled kernels, two independent `launch_id` values, and two independent graph cache entries. -- Each cached graph contains kernel nodes compiled specifically for that field combination's SNode layout. -- There is no risk of one template specialization's graph being replayed for a different specialization. 
- -### Lifetime and cleanup - -The `CachedCudaGraph` struct owns the graph executable and persistent device buffers via RAII. When the `KernelLauncher` is destroyed (which happens on `qd.reset()`), all cached graphs and their device allocations are freed. After a reset, the next kernel call triggers a fresh graph build against the new runtime. From dd4f48b4ea4414a276a92761ca40208885d84193 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:10:29 -0400 Subject: [PATCH 030/128] Add comment documenting resolve_ctx_ndarray_ptrs contract Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index de0b63c81f..7ca1b69afa 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -61,6 +61,13 @@ bool KernelLauncher::on_cuda_device(void *ptr) { return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; } +// Resolves ndarray parameter handles in the launch context to raw device +// pointers, writing them into the arg buffer via set_ndarray_ptrs. +// +// Unlike the normal launch path, this does not handle host-resident arrays +// (no temporary device allocation or host-to-device transfer). Returns false +// if any external array is on the host, signaling the caller to fall back +// to the non-graph launch path. 
bool KernelLauncher::resolve_ctx_ndarray_ptrs( LaunchContextBuilder &ctx, const std::vector> ¶meters) { From aa0844267c25d9f0774b1bc462cdabc5b4acc3ce Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:13:03 -0400 Subject: [PATCH 031/128] Add comment explaining contexts_ population in graph path Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 7ca1b69afa..f562703cbc 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -111,6 +111,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx) { int launch_id = handle.get_launch_id(); + // Populated by register_llvm_kernel, which runs before launch_llvm_kernel + // for all LLVM kernels regardless of whether the graph path is used. auto &launcher_ctx = contexts_[launch_id]; const auto ¶meters = *launcher_ctx.parameters; const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; From 98bf081471b41806ad8a1a3d627a8c937a08b000 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:13:53 -0400 Subject: [PATCH 032/128] Add comment explaining single-task graph fallback guard Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index f562703cbc..2e6d91051b 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -117,6 +117,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, const auto ¶meters = *launcher_ctx.parameters; const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; + // A single-task kernel has no multi-launch overhead to eliminate, so + // graphing it provides no benefit. Return false to use the normal path. 
if (offloaded_tasks.size() < 2) { return false; } From 7b18674500eedd2fa91dbd10cef1db2f904f0885 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:14:49 -0400 Subject: [PATCH 033/128] Add comment explaining resolve_ctx_ndarray_ptrs fallback check Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 2e6d91051b..0c33882599 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -127,6 +127,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, "cuda_graph=True is not supported for kernels with struct return " "values; remove cuda_graph=True or avoid returning values"); + // Falls back to the normal path if any external array is host-resident, + // since the graph path cannot perform host-to-device transfers. if (!resolve_ctx_ndarray_ptrs(ctx, parameters)) { return false; } From 9907333c0190f45c3ff95a744811b3bee4c993f9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:15:25 -0400 Subject: [PATCH 034/128] Add comment explaining kernelParams vs extra in graph node params Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 0c33882599..80b4131f44 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -194,6 +194,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, node_params.blockDimZ = 1; node_params.sharedMemBytes = (unsigned int)task.dynamic_shared_array_bytes; node_params.kernelParams = &ctx_ptr; + // kernelParams and extra are two mutually exclusive ways of passing + // arguments to a CUDA kernel; we use kernelParams, so extra is null. 
node_params.extra = nullptr; void *node = nullptr; From 6ff327eea0a8bf98e4768927c8d295cb8e5e3653 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:15:54 -0400 Subject: [PATCH 035/128] Add comment explaining graph_exec field in CachedCudaGraph Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 6bbaca5e49..ffcd08b059 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -22,6 +22,8 @@ struct CudaKernelNodeParams { }; struct CachedCudaGraph { + // CUgraphExec handle (typed as void* since driver API is loaded dynamically). + // This is the instantiated, launchable form of the captured CUDA graph. void *graph_exec{nullptr}; char *persistent_device_arg_buffer{nullptr}; char *persistent_device_result_buffer{nullptr}; From 6796baf82b1fa8f0ec3fb5a3ba1a11128cc2244e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:18:13 -0400 Subject: [PATCH 036/128] Add comment explaining cuda_graph_cache_ key Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index ffcd08b059..72e2a18ddb 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -68,6 +68,8 @@ class KernelLauncher : public LLVM::KernelLauncher { const std::vector> ¶meters); bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); std::vector contexts_; + // Keyed by launch_id, which uniquely identifies a compiled kernel variant + // (each template specialization gets its own launch_id). 
std::unordered_map cuda_graph_cache_; bool cuda_graph_cache_used_on_last_call_{false}; }; From 1d4ebefcd52d7df4588f40e33bb750090fe75f92 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:24:01 -0400 Subject: [PATCH 037/128] Parametrize test_cuda_graph_changed_args over ndarray and field Uses qd.types.NDArray vs qd.Template annotation based on tensor_type. Fields produce separate template specializations so the cache size assertion accounts for the extra graph cache entry. Made-with: Cursor --- tests/python/test_cuda_graph.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 497ba14053..01048d2fcd 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import quadrants as qd from quadrants.lang import impl @@ -140,21 +141,24 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert np.allclose(y_np, 4.0) +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_changed_args(): - """Graph should produce correct results when called with different ndarrays.""" +def test_cuda_graph_changed_args(tensor_type): + """Graph should produce correct results when called with different tensors.""" platform_supports_graph = _on_cuda() n = 256 + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template + @qd.kernel(cuda_graph=True) - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: Annotation, y: Annotation): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): y[i] = y[i] + 2.0 - x1 = qd.ndarray(qd.f32, shape=(n,)) - y1 = qd.ndarray(qd.f32, shape=(n,)) + x1 = tensor_type(qd.f32, (n,)) + y1 = tensor_type(qd.f32, (n,)) assert _cuda_graph_cache_size() == 0 two_loops(x1, y1) assert 
_cuda_graph_cache_size() == (1 if platform_supports_graph else 0) @@ -167,13 +171,18 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert np.allclose(x1_np, 2.0), f"Expected 2.0, got {x1_np[:5]}" assert np.allclose(y1_np, 4.0), f"Expected 4.0, got {y1_np[:5]}" - x2 = qd.ndarray(qd.f32, shape=(n,)) - y2 = qd.ndarray(qd.f32, shape=(n,)) + x2 = tensor_type(qd.f32, (n,)) + y2 = tensor_type(qd.f32, (n,)) x2.from_numpy(np.full(n, 10.0, dtype=np.float32)) y2.from_numpy(np.full(n, 20.0, dtype=np.float32)) two_loops(x2, y2) assert _cuda_graph_used() == platform_supports_graph - assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + # Fields are template args, so different field objects produce a second + # compiled kernel and a second graph cache entry. + if tensor_type == qd.field: + assert _cuda_graph_cache_size() == (2 if platform_supports_graph else 0) + else: + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) x2_np = x2.to_numpy() y2_np = y2.to_numpy() From a4cfdc393e750a12ac5988c38f252f2144acbb69 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:26:15 -0400 Subject: [PATCH 038/128] Parametrize all CUDA graph tests over ndarray and field All tests except test_cuda_graph_different_sizes now run with both qd.ndarray (NDArray annotation) and qd.field (Template annotation). different_sizes is ndarray-only since different-sized fields produce separate template specializations rather than reusing the same graph. 
Made-with: Cursor --- tests/python/test_cuda_graph.py | 82 +++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 01048d2fcd..4529a960be 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -19,21 +19,25 @@ def _on_cuda(): return impl.current_cfg().arch == qd.cuda +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_two_loops(): +def test_cuda_graph_two_loops(tensor_type): """A kernel with two top-level for loops should be fused into a CUDA graph.""" platform_supports_graph = _on_cuda() n = 1024 - x = qd.ndarray(qd.f32, shape=(n,)) - y = qd.ndarray(qd.f32, shape=(n,)) + + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template @qd.kernel(cuda_graph=True) - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: Annotation, y: Annotation): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): y[i] = y[i] + 2.0 + x = tensor_type(qd.f32, (n,)) + y = tensor_type(qd.f32, (n,)) + assert _cuda_graph_cache_size() == 0 two_loops(x, y) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) @@ -49,19 +53,17 @@ def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, n assert np.allclose(y_np, 6.0), f"Expected 6.0, got {y_np[:5]}" +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_three_loops(): +def test_cuda_graph_three_loops(tensor_type): """A kernel with three top-level for loops.""" platform_supports_graph = _on_cuda() n = 512 - a = qd.ndarray(qd.f32, shape=(n,)) - b = qd.ndarray(qd.f32, shape=(n,)) - c = qd.ndarray(qd.f32, shape=(n,)) + + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template @qd.kernel(cuda_graph=True) - def three_loops( - a: 
qd.types.ndarray(qd.f32, ndim=1), b: qd.types.ndarray(qd.f32, ndim=1), c: qd.types.ndarray(qd.f32, ndim=1) - ): + def three_loops(a: Annotation, b: Annotation, c: Annotation): for i in range(a.shape[0]): a[i] = a[i] + 1.0 for i in range(b.shape[0]): @@ -69,6 +71,10 @@ def three_loops( for i in range(c.shape[0]): c[i] = a[i] + b[i] + a = tensor_type(qd.f32, (n,)) + b = tensor_type(qd.f32, (n,)) + c = tensor_type(qd.f32, (n,)) + assert _cuda_graph_cache_size() == 0 three_loops(a, b, c) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) @@ -93,18 +99,22 @@ def three_loops( assert np.allclose(c_np, 22.0) +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_single_loop_no_graph(): +def test_cuda_graph_single_loop_no_graph(tensor_type): """A kernel with a single for loop should NOT use the graph path, even with cuda_graph=True (falls back since < 2 tasks).""" n = 256 - x = qd.ndarray(qd.f32, shape=(n,)) + + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template @qd.kernel(cuda_graph=True) - def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): + def single_loop(x: Annotation): for i in range(x.shape[0]): x[i] = x[i] + 5.0 + x = tensor_type(qd.f32, (n,)) + single_loop(x) assert not _cuda_graph_used() single_loop(x) @@ -115,20 +125,24 @@ def single_loop(x: qd.types.ndarray(qd.f32, ndim=1)): assert np.allclose(x_np, 10.0) +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_no_cuda_graph_annotation(): +def test_no_cuda_graph_annotation(tensor_type): """A kernel WITHOUT cuda_graph=True should never use the graph path.""" n = 256 - x = qd.ndarray(qd.f32, shape=(n,)) - y = qd.ndarray(qd.f32, shape=(n,)) + + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template @qd.kernel - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: Annotation, y: 
Annotation): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): y[i] = y[i] + 2.0 + x = tensor_type(qd.f32, (n,)) + y = tensor_type(qd.f32, (n,)) + two_loops(x, y) assert not _cuda_graph_used() two_loops(x, y) @@ -201,11 +215,14 @@ def test_cuda_graph_different_sizes(): Catches stale grid dims: if the graph cached from the small call is replayed for the large call, elements beyond the original size stay zero. + + ndarray-only: fields are template args so different-sized fields would + produce separate compiled kernels, not exercise the same graph. """ platform_supports_graph = _on_cuda() @qd.kernel(cuda_graph=True) - def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + def add_one(x: qd.types.NDArray[qd.f32, 1], y: qd.types.NDArray[qd.f32, 1]): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): @@ -229,21 +246,24 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi assert np.allclose(y2_np, 2.0), f"Expected all 2.0, got {y2_np[250:260]}" +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_after_reset(): +def test_cuda_graph_after_reset(tensor_type): """cuda_graph=True kernel must work correctly after qd.reset().""" platform_supports_graph = _on_cuda() + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template + @qd.kernel(cuda_graph=True) - def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + def add_one(x: Annotation, y: Annotation): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): y[i] = y[i] + 2.0 n = 256 - x = qd.ndarray(qd.f32, shape=(n,)) - y = qd.ndarray(qd.f32, shape=(n,)) + x = tensor_type(qd.f32, (n,)) + y = tensor_type(qd.f32, (n,)) add_one(x, y) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -257,8 +277,8 @@ def 
add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi qd.reset() qd.init(arch=arch) - x2 = qd.ndarray(qd.f32, shape=(n,)) - y2 = qd.ndarray(qd.f32, shape=(n,)) + x2 = tensor_type(qd.f32, (n,)) + y2 = tensor_type(qd.f32, (n,)) assert _cuda_graph_cache_size() == 0 add_one(x2, y2) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) @@ -268,21 +288,25 @@ def add_one(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndi assert np.allclose(y2.to_numpy(), 2.0) +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_annotation_cross_platform(): +def test_cuda_graph_annotation_cross_platform(tensor_type): """cuda_graph=True should be a harmless no-op on non-CUDA backends.""" platform_supports_graph = _on_cuda() n = 256 - x = qd.ndarray(qd.f32, shape=(n,)) - y = qd.ndarray(qd.f32, shape=(n,)) + + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template @qd.kernel(cuda_graph=True) - def two_loops(x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1)): + def two_loops(x: Annotation, y: Annotation): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): y[i] = y[i] + 2.0 + x = tensor_type(qd.f32, (n,)) + y = tensor_type(qd.f32, (n,)) + assert _cuda_graph_cache_size() == 0 two_loops(x, y) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) From 2db1b055b06274d741defe188cf8bf913befc9c8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:30:34 -0400 Subject: [PATCH 039/128] Parametrize test_cuda_graph_different_sizes over ndarray and field Adds field coverage and cache size assertions after each call. Fields produce separate template specializations so the cache grows to 2 entries for different-sized fields. 
Made-with: Cursor --- tests/python/test_cuda_graph.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 4529a960be..c70408a6f3 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -209,36 +209,45 @@ def two_loops(x: Annotation, y: Annotation): assert np.allclose(y1_np, 4.0), f"y1 should be unchanged, got {y1_np[:5]}" +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_different_sizes(): +def test_cuda_graph_different_sizes(tensor_type): """Graph must produce correct results when called with different-sized arrays. Catches stale grid dims: if the graph cached from the small call is replayed for the large call, elements beyond the original size stay zero. - ndarray-only: fields are template args so different-sized fields would - produce separate compiled kernels, not exercise the same graph. + For fields, different-sized fields are separate template specializations, + so each gets its own graph cache entry. 
""" platform_supports_graph = _on_cuda() + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template + @qd.kernel(cuda_graph=True) - def add_one(x: qd.types.NDArray[qd.f32, 1], y: qd.types.NDArray[qd.f32, 1]): + def add_one(x: Annotation, y: Annotation): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): y[i] = y[i] + 2.0 - x1 = qd.ndarray(qd.f32, shape=(256,)) - y1 = qd.ndarray(qd.f32, shape=(256,)) + x1 = tensor_type(qd.f32, (256,)) + y1 = tensor_type(qd.f32, (256,)) assert _cuda_graph_cache_size() == 0 add_one(x1, y1) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph - x2 = qd.ndarray(qd.f32, shape=(1024,)) - y2 = qd.ndarray(qd.f32, shape=(1024,)) + x2 = tensor_type(qd.f32, (1024,)) + y2 = tensor_type(qd.f32, (1024,)) add_one(x2, y2) assert _cuda_graph_used() == platform_supports_graph + # Ndarrays reuse the same compiled kernel; fields produce a second + # template specialization with its own graph cache entry. 
+ if tensor_type == qd.field: + assert _cuda_graph_cache_size() == (2 if platform_supports_graph else 0) + else: + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) x2_np = x2.to_numpy() y2_np = y2.to_numpy() From f5ff0afc1c91ab6196349b4172ade67fe6edadb6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:51:17 -0400 Subject: [PATCH 040/128] Rename cuda_graphs.md to cuda_graph.md Made-with: Cursor --- docs/source/user_guide/{cuda_graphs.md => cuda_graph.md} | 0 docs/source/user_guide/index.md | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/source/user_guide/{cuda_graphs.md => cuda_graph.md} (100%) diff --git a/docs/source/user_guide/cuda_graphs.md b/docs/source/user_guide/cuda_graph.md similarity index 100% rename from docs/source/user_guide/cuda_graphs.md rename to docs/source/user_guide/cuda_graph.md diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md index baeaee30de..c47e8afc82 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -12,7 +12,7 @@ user_guide/static user_guide/sub_functions user_guide/scalar_tensors user_guide/synchronization -user_guide/cuda_graphs +user_guide/cuda_graph user_guide/compound_types user_guide/python_backend user_guide/quirks From a91878be8d8bb38b1df0b12c71e5713c901b60a1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:53:55 -0400 Subject: [PATCH 041/128] merge doc from pr 1 --- docs/source/user_guide/cuda_graph.md | 64 +++++++++++++++++----------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index 3e0ddfac77..1f6739e73b 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -1,31 +1,59 @@ -# CUDA Graphs +# CUDA Graph -When a Quadrants kernel has multiple top-level `for` loops, each loop is launched as a separate GPU kernel. 
The per-kernel launch overhead can become significant when kernels are small and numerous. CUDA graphs let you capture these launches once and replay them as a single unit, eliminating the repeated launch overhead. +CUDA graphs reduce kernel launch overhead by capturing a sequence of GPU operations into a graph, then replaying it in a single launch. On non-CUDA platforms, the cuda graph annotation is simply ignored, and code runs normally. -## Per-kernel opt-in with `cuda_graph=True` +## Basic usage -Annotate a kernel with `cuda_graph=True` to enable graph capture: +Add `cuda_graph=True` to a `@qd.kernel` decorator: ```python @qd.kernel(cuda_graph=True) -def step(x: qd.types.ndarray(qd.f32, ndim=1), - y: qd.types.ndarray(qd.f32, ndim=1)): +def my_kernel( + x: qd.types.ndarray(qd.f32, ndim=1), + y: qd.types.ndarray(qd.f32, ndim=1), +): for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(y.shape[0]): y[i] = y[i] + 2.0 +``` + +The top level for-loops will be compiled into a single CUDA graph. The parallelism is the same as before, but the launch latency much reduced. + +The kernel is used normally — no other API changes are needed: + +```python +x = qd.ndarray(qd.f32, shape=(1024,)) +y = qd.ndarray(qd.f32, shape=(1024,)) -step(x, y) # first call: captures the graph -step(x, y) # subsequent calls: replays the cached graph +my_kernel(x, y) # first call: builds and caches the graph +my_kernel(x, y) # subsequent calls: replays the cached graph ``` -On the first call, the kernel's offloaded tasks are captured into a CUDA graph using the explicit node API. Subsequent calls replay the cached graph. The arg buffer is re-uploaded on each replay, so calling the kernel with different ndarrays works correctly. +### Restrictions -**When it applies**: graph capture only activates when there are 2 or more top-level `for` loops (offloaded tasks). A single-loop kernel with `cuda_graph=True` falls back silently to the normal launch path. 
+- **No struct return values.** Kernels that return values (e.g. `-> qd.i32`) cannot use CUDA graphs. An error is raised if `cuda_graph=True` is set on such a kernel. +- **Primal kernels only.** The `cuda_graph=True` flag is applied to the primal (forward) kernel only, not its adjoint. Autodiff kernels use the normal launch path. -**Cross-platform**: `cuda_graph=True` is a harmless no-op on non-CUDA backends (CPU, Metal, etc.). You can annotate kernels unconditionally without breaking portability. +### Passing different arguments -## GPU-side iteration with `graph_while` +You can pass different ndarrays to the same kernel on subsequent calls. The cached graph is replayed with the updated arguments — no graph rebuild occurs: + +```python +x1 = qd.ndarray(qd.f32, shape=(1024,)) +y1 = qd.ndarray(qd.f32, shape=(1024,)) +my_kernel(x1, y1) # builds graph + +x2 = qd.ndarray(qd.f32, shape=(1024,)) +y2 = qd.ndarray(qd.f32, shape=(1024,)) +my_kernel(x2, y2) # replays graph with new array pointers +``` + +### Fields as arguments + +When different fields are passed as template arguments, each unique combination of fields produces a separately compiled kernel with its own graph cache entry. There is no interference between them.## GPU-side iteration with `graph_while` + +## `graph_while` For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. The `graph_while` parameter enables this: @@ -82,15 +110,3 @@ def converge(x: qd.types.ndarray(qd.f32, ndim=1), ### Do-while semantics `graph_while` has **do-while** semantics: the kernel body always executes at least once before the condition is checked. This matches the behavior of CUDA conditional while nodes. The flag value must be >= 1 at launch time. Passing 0 with a kernel that decrements the counter will cause an infinite loop. 
- -## When to use CUDA graphs - -CUDA graphs are most beneficial when: - -- A kernel has many small top-level `for` loops where launch overhead dominates runtime. -- An iterative algorithm needs to repeat the kernel body many times without host round-trips (`graph_while`). - -They are less useful when: - -- Kernels have only a single top-level loop (no graph is created). -- Individual kernel runtimes are large enough to fully hide launch latency. From cd23e79a45efc59727cc3bca0ff826e78c5961c5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:54:51 -0400 Subject: [PATCH 042/128] Use index.md from cuda-graph-mvp-1 branch Made-with: Cursor --- docs/source/user_guide/index.md | 57 ++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md index c47e8afc82..e7f8c46b30 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -5,16 +5,49 @@ :maxdepth: 1 :titlesonly: -user_guide/getting_started -user_guide/supported_systems -user_guide/tensor_types -user_guide/static -user_guide/sub_functions -user_guide/scalar_tensors -user_guide/synchronization -user_guide/cuda_graph -user_guide/compound_types -user_guide/python_backend -user_guide/quirks -user_guide/troubleshooting +getting_started +supported_systems +``` + +```{toctree} +:caption: Core concepts +:maxdepth: 1 +:titlesonly: + +tensor_types +scalar_tensors +matrix_vector +compound_types +static +sub_functions +parallelization +``` + +```{toctree} +:caption: Integration +:maxdepth: 1 +:titlesonly: + +interop +``` + +```{toctree} +:caption: Performance +:maxdepth: 1 +:titlesonly: + +cuda_graph +perf_dispatch +``` + +```{toctree} +:caption: Reference +:maxdepth: 1 +:titlesonly: + +unsupported_python +python_backend +debug +quirks +troubleshooting ``` From aad0dd9ed3b5db255de60d823938a5d06206b7ef Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:55:27 -0400 
Subject: [PATCH 043/128] fix up merge --- docs/source/user_guide/cuda_graph.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index 1f6739e73b..b6bc87260d 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -51,9 +51,9 @@ my_kernel(x2, y2) # replays graph with new array pointers ### Fields as arguments -When different fields are passed as template arguments, each unique combination of fields produces a separately compiled kernel with its own graph cache entry. There is no interference between them.## GPU-side iteration with `graph_while` +When different fields are passed as template arguments, each unique combination of fields produces a separately compiled kernel with its own graph cache entry. There is no interference between them. -## `graph_while` +## GPU-side iteration with `graph_while` For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. 
The `graph_while` parameter enables this: From d559a92e0ffebace29a72e0cd20a8a84ba1cb8a6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 19:56:46 -0400 Subject: [PATCH 044/128] Use [()] instead of [None] in CUDA graph docs Made-with: Cursor --- docs/source/user_guide/cuda_graph.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index b6bc87260d..3ae72553e7 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -64,7 +64,7 @@ def solve(x: qd.types.ndarray(qd.f32, ndim=1), for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(1): - counter[None] = counter[None] - 1 + counter[()] = counter[()] - 1 x = qd.ndarray(qd.f32, shape=(N,)) counter = qd.ndarray(qd.i32, shape=()) @@ -90,7 +90,7 @@ def iterate(x: qd.types.ndarray(qd.f32, ndim=1), for i in range(x.shape[0]): x[i] = x[i] + 1.0 for i in range(1): - counter[None] = counter[None] - 1 + counter[()] = counter[()] - 1 ``` **Boolean flag**: set a `keep_going` flag to 1, have the kernel set it to 0 when a convergence criterion is met. 
@@ -104,7 +104,7 @@ def converge(x: qd.types.ndarray(qd.f32, ndim=1), pass for i in range(1): if some_condition(x): - keep_going[None] = 0 + keep_going[()] = 0 ``` ### Do-while semantics From 712846f01042d5331c7f2b650de168d586716222 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:01:55 -0400 Subject: [PATCH 045/128] ndarray vs field --- docs/source/user_guide/cuda_graph.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index 3ae72553e7..43feda69d5 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -110,3 +110,9 @@ def converge(x: qd.types.ndarray(qd.f32, ndim=1), ### Do-while semantics `graph_while` has **do-while** semantics: the kernel body always executes at least once before the condition is checked. This matches the behavior of CUDA conditional while nodes. The flag value must be >= 1 at launch time. Passing 0 with a kernel that decrements the counter will cause an infinite loop. + +### ndarray vs field + +The parameter used by `graph_while` MUST be an ndarray. + +However, other parameters can be any supported Quadrants kernel parameter type. From a14a0723c68273fcaf04f0fd3bd91566809e4a26 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:06:11 -0400 Subject: [PATCH 046/128] Rename graph_while to graph_do_while Reflects the do-while semantics: the kernel body always executes at least once before the condition is checked. 
Made-with: Cursor --- docs/source/user_guide/cuda_graph.md | 18 ++--- python/quadrants/lang/kernel.py | 10 +-- python/quadrants/lang/kernel_impl.py | 22 +++--- quadrants/program/launch_context_builder.h | 4 +- quadrants/python/export_lang.cpp | 4 +- quadrants/runtime/amdgpu/kernel_launcher.cpp | 14 ++-- quadrants/runtime/cpu/kernel_launcher.cpp | 12 ++-- quadrants/runtime/cuda/kernel_launcher.cpp | 70 +++++++++---------- quadrants/runtime/cuda/kernel_launcher.h | 4 +- ...h_while.py => test_cuda_graph_do_while.py} | 30 ++++---- ...y => test_graph_do_while_cross_backend.py} | 38 +++++----- 11 files changed, 113 insertions(+), 113 deletions(-) rename tests/python/{test_cuda_graph_while.py => test_cuda_graph_do_while.py} (84%) rename tests/python/{test_graph_while_cross_backend.py => test_graph_do_while_cross_backend.py} (85%) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index 43feda69d5..ef9fe761a4 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -53,12 +53,12 @@ my_kernel(x2, y2) # replays graph with new array pointers When different fields are passed as template arguments, each unique combination of fields produces a separately compiled kernel with its own graph cache entry. There is no interference between them. -## GPU-side iteration with `graph_while` +## GPU-side iteration with `graph_do_while` -For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. The `graph_while` parameter enables this: +For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. 
The `graph_do_while` parameter enables this: ```python -@qd.kernel(graph_while="counter") +@qd.kernel(graph_do_while="counter") def solve(x: qd.types.ndarray(qd.f32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): @@ -73,18 +73,18 @@ solve(x, counter) # x is now incremented 10 times; counter is 0 ``` -The `graph_while` value is the name of a scalar `qd.i32` ndarray parameter. The kernel body repeats while this value is non-zero. +The `graph_do_while` value is the name of a scalar `qd.i32` ndarray parameter. The kernel body repeats while this value is non-zero. - On SM 9.0+ (Hopper), this uses CUDA conditional while nodes — the entire iteration runs on the GPU with no host involvement. - On older CUDA GPUs and non-CUDA backends, it falls back to a host-side do-while loop. -- `graph_while` implicitly enables `cuda_graph=True`. +- `graph_do_while` implicitly enables `cuda_graph=True`. ### Patterns **Counter-based**: set the counter to N, decrement each iteration. The body runs exactly N times. ```python -@qd.kernel(graph_while="counter") +@qd.kernel(graph_do_while="counter") def iterate(x: qd.types.ndarray(qd.f32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): @@ -96,7 +96,7 @@ def iterate(x: qd.types.ndarray(qd.f32, ndim=1), **Boolean flag**: set a `keep_going` flag to 1, have the kernel set it to 0 when a convergence criterion is met. ```python -@qd.kernel(graph_while="keep_going") +@qd.kernel(graph_do_while="keep_going") def converge(x: qd.types.ndarray(qd.f32, ndim=1), keep_going: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): @@ -109,10 +109,10 @@ def converge(x: qd.types.ndarray(qd.f32, ndim=1), ### Do-while semantics -`graph_while` has **do-while** semantics: the kernel body always executes at least once before the condition is checked. This matches the behavior of CUDA conditional while nodes. The flag value must be >= 1 at launch time. 
Passing 0 with a kernel that decrements the counter will cause an infinite loop. +`graph_do_while` has **do-while** semantics: the kernel body always executes at least once before the condition is checked. This matches the behavior of CUDA conditional while nodes. The flag value must be >= 1 at launch time. Passing 0 with a kernel that decrements the counter will cause an infinite loop. ### ndarray vs field -The parameter used by `graph_while` MUST be an ndarray. +The parameter used by `graph_do_while` MUST be an ndarray. However, other parameters can be any supported Quadrants kernel parameter type. diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index e2cbd00076..d2102c354a 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -292,7 +292,7 @@ def __init__(self, _func: Callable, autodiff_mode: AutodiffMode, _is_classkernel self.materialized_kernels: dict[CompiledKernelKeyType, KernelCxx] = {} self.has_print = False self.use_cuda_graph: bool = False - self.graph_while_arg: str | None = None + self.graph_do_while_arg: str | None = None self.quadrants_callable: QuadrantsCallable | None = None self.visited_functions: set[FunctionSourceInfo] = set() self.kernel_function_info: FunctionSourceInfo | None = None @@ -445,8 +445,8 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled template_num += 1 i_out += 1 continue - if self.graph_while_arg is not None and self.arg_metas[i_in].name == self.graph_while_arg: - self._graph_while_cpp_arg_id = i_out - template_num + if self.graph_do_while_arg is not None and self.arg_metas[i_in].name == self.graph_do_while_arg: + self._graph_do_while_cpp_arg_id = i_out - template_num num_args_, is_launch_ctx_cacheable_ = self._recursive_set_args( self.used_py_dataclass_parameters_by_key_enforcing[key], self.arg_metas[i_in].name, @@ -508,8 +508,8 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled 
self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_cuda_graph = self.use_cuda_graph - if self.graph_while_arg is not None and hasattr(self, "_graph_while_cpp_arg_id"): - launch_ctx.graph_while_arg_id = self._graph_while_cpp_arg_id + if self.graph_do_while_arg is not None and hasattr(self, "_graph_do_while_cpp_arg_id"): + launch_ctx.graph_do_while_arg_id = self._graph_do_while_cpp_arg_id prog.launch_kernel(compiled_kernel_data, launch_ctx) except Exception as e: e = handle_exception_from_cpp(e) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index b5ab435e41..a9103b69fc 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -128,13 +128,13 @@ def _kernel_impl( level_of_class_stackframe: int, verbose: bool = False, cuda_graph: bool = False, - graph_while: str | None = None, + graph_do_while: str | None = None, ) -> QuadrantsCallable: # Can decorators determine if a function is being defined inside a class? # https://stackoverflow.com/a/8793684/12003165 is_classkernel = _inside_class(level_of_class_stackframe + 1) - if graph_while is not None: + if graph_do_while is not None: cuda_graph = True if verbose: @@ -142,12 +142,12 @@ def _kernel_impl( primal = Kernel(_func, autodiff_mode=_NONE, _is_classkernel=is_classkernel) adjoint = Kernel(_func, autodiff_mode=_REVERSE, _is_classkernel=is_classkernel) primal.use_cuda_graph = cuda_graph - primal.graph_while_arg = graph_while - if graph_while is not None: + primal.graph_do_while_arg = graph_do_while + if graph_do_while is not None: arg_names = [m.name for m in primal.arg_metas] - if graph_while not in arg_names: + if graph_do_while not in arg_names: raise ValueError( - f"graph_while={graph_while!r} does not match any parameter of " + f"graph_do_while={graph_do_while!r} does not match any parameter of " f"kernel {_func.__name__!r}. 
Available parameters: {arg_names}" ) # Having |primal| contains |grad| makes the tape work. @@ -192,7 +192,7 @@ def wrapped_classkernel(*args, **kwargs): # TODO: This callable should be Callable[[F], F]. # See comments below. def kernel( - _fn: None = None, *, pure: bool = False, cuda_graph: bool = False, graph_while: str | None = None + _fn: None = None, *, pure: bool = False, cuda_graph: bool = False, graph_do_while: str | None = None ) -> Callable[[Any], Any]: ... @@ -203,7 +203,7 @@ def kernel( # However, by making it return Any, we can make the pure parameter # change now, without breaking pyright. @overload -def kernel(_fn: Any, *, pure: bool = False, cuda_graph: bool = False, graph_while: str | None = None) -> Any: ... +def kernel(_fn: Any, *, pure: bool = False, cuda_graph: bool = False, graph_do_while: str | None = None) -> Any: ... def kernel( @@ -212,7 +212,7 @@ def kernel( pure: bool | None = None, fastcache: bool = False, cuda_graph: bool = False, - graph_while: str | None = None, + graph_do_while: str | None = None, ): """ Marks a function as a Quadrants kernel. @@ -229,7 +229,7 @@ def kernel( into a CUDA graph on first launch and replayed on subsequent launches, reducing per-kernel launch overhead. On non-CUDA backends this flag is a harmless no-op. - graph_while: Name of a scalar ``qd.i32`` ndarray parameter that + graph_do_while: Name of a scalar ``qd.i32`` ndarray parameter that controls GPU-side iteration. The kernel body repeats while the named argument is non-zero. 
Uses CUDA conditional while nodes on SM 9.0+ (Hopper); falls back to a host-side do-while loop @@ -259,7 +259,7 @@ def decorator(fn: F, has_kernel_params: bool = True) -> F: else: level = 4 - wrapped = _kernel_impl(fn, level_of_class_stackframe=level, cuda_graph=cuda_graph, graph_while=graph_while) + wrapped = _kernel_impl(fn, level_of_class_stackframe=level, cuda_graph=cuda_graph, graph_do_while=graph_do_while) wrapped.is_pure = pure is not None and pure or fastcache if pure is not None: warnings_helper.warn_once( diff --git a/quadrants/program/launch_context_builder.h b/quadrants/program/launch_context_builder.h index 9bcc6310bd..d0abd782f8 100644 --- a/quadrants/program/launch_context_builder.h +++ b/quadrants/program/launch_context_builder.h @@ -151,8 +151,8 @@ class LaunchContextBuilder { const StructType *args_type{nullptr}; size_t result_buffer_size{0}; bool use_cuda_graph{false}; - int graph_while_arg_id{-1}; - void *graph_while_flag_dev_ptr{nullptr}; + int graph_do_while_arg_id{-1}; + void *graph_do_while_flag_dev_ptr{nullptr}; // Note that I've tried to group `array_runtime_size` and // `is_device_allocations` into a small struct. 
However, it caused some test diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index c82a26e091..baec110535 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -664,8 +664,8 @@ void export_lang(py::module &m) { .def("get_struct_ret_uint", &LaunchContextBuilder::get_struct_ret_uint) .def("get_struct_ret_float", &LaunchContextBuilder::get_struct_ret_float) .def_readwrite("use_cuda_graph", &LaunchContextBuilder::use_cuda_graph) - .def_readwrite("graph_while_arg_id", - &LaunchContextBuilder::graph_while_arg_id); + .def_readwrite("graph_do_while_arg_id", + &LaunchContextBuilder::graph_do_while_arg_id); py::class_(m, "Function") .def("insert_scalar_param", &Function::insert_scalar_param) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index f993855a9d..e35edff10d 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -74,8 +74,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); - if (arg_id == ctx.graph_while_arg_id) { - ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; } } else if (arr_sz > 0) { // why use arr_sz constrain? 
// Ndarray @@ -85,8 +85,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); - if (arg_id == ctx.graph_while_arg_id) { - ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; } } } @@ -124,15 +124,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, task.dynamic_shared_array_bytes, {(void *)&context_pointer}, {arg_size}); } - if (ctx.graph_while_arg_id >= 0 && ctx.graph_while_flag_dev_ptr) { + if (ctx.graph_do_while_arg_id >= 0 && ctx.graph_do_while_flag_dev_ptr) { int32_t counter_val = 0; AMDGPUDriver::get_instance().stream_synchronize(nullptr); AMDGPUDriver::get_instance().memcpy_device_to_host( - &counter_val, ctx.graph_while_flag_dev_ptr, sizeof(int32_t)); + &counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); if (counter_val == 0) break; } - } while (ctx.graph_while_arg_id >= 0); + } while (ctx.graph_do_while_arg_id >= 0); QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { AMDGPUDriver::get_instance().mem_free(device_arg_buffer); diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp index b08bab551a..6b67a2ea5b 100644 --- a/quadrants/runtime/cpu/kernel_launcher.cpp +++ b/quadrants/runtime/cpu/kernel_launcher.cpp @@ -27,8 +27,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); - if (arg_id == ctx.graph_while_arg_id) { - ctx.graph_while_flag_dev_ptr = data_ptr; + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = data_ptr; } } else if (ctx.array_runtime_sizes[arg_id] > 0) { uint64 host_ptr = (uint64)executor->get_device_alloc_info_ptr( @@ -41,8 +41,8 @@ void 
KernelLauncher::launch_llvm_kernel(Handle handle, : (uint64)executor->get_device_alloc_info_ptr( *static_cast(grad_ptr)); ctx.set_ndarray_ptrs(arg_id, host_ptr, host_ptr_grad); - if (arg_id == ctx.graph_while_arg_id) { - ctx.graph_while_flag_dev_ptr = (void *)host_ptr; + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = (void *)host_ptr; } } } @@ -51,8 +51,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, for (auto task : launcher_ctx.task_funcs) { task(&ctx.get_context()); } - } while (ctx.graph_while_arg_id >= 0 && ctx.graph_while_flag_dev_ptr && - *static_cast(ctx.graph_while_flag_dev_ptr) != 0); + } while (ctx.graph_do_while_arg_id >= 0 && ctx.graph_do_while_flag_dev_ptr && + *static_cast(ctx.graph_do_while_flag_dev_ptr) != 0); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 9f3265a759..b9c72c0a6e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -23,16 +23,16 @@ static const char *kConditionKernelPTX = R"PTX( .param .b32 cudaGraphSetConditional_param_1 ) ; -.visible .entry _qd_graph_while_cond( - .param .u64 _qd_graph_while_cond_param_0, - .param .u64 _qd_graph_while_cond_param_1 +.visible .entry _qd_graph_do_while_cond( + .param .u64 _qd_graph_do_while_cond_param_0, + .param .u64 _qd_graph_do_while_cond_param_1 ) { .reg .pred %p<2>; .reg .b32 %r<3>; .reg .b64 %rd<4>; - ld.param.u64 %rd1, [_qd_graph_while_cond_param_0]; - ld.param.u64 %rd2, [_qd_graph_while_cond_param_1]; + ld.param.u64 %rd1, [_qd_graph_do_while_cond_param_0]; + ld.param.u64 %rd2, [_qd_graph_do_while_cond_param_1]; cvta.to.global.u64 %rd3, %rd2; ld.global.u32 %r1, [%rd3]; setp.ne.s32 %p1, %r1, 0; @@ -151,8 +151,8 @@ bool KernelLauncher::resolve_ctx_ndarray_ptrs( if (resolved_data) { ctx.set_ndarray_ptrs(arg_id, (uint64)resolved_data, (uint64)resolved_grad); - if (arg_id == 
ctx.graph_while_arg_id) { - ctx.graph_while_flag_dev_ptr = resolved_data; + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = resolved_data; } } } @@ -167,7 +167,7 @@ void KernelLauncher::ensure_condition_kernel_loaded() { int cc = CUDAContext::get_instance().get_compute_capability(); if (cc < 90) { QD_WARN( - "graph_while requires SM 9.0+ (Hopper), but this device is SM {}. " + "graph_do_while requires SM 9.0+ (Hopper), but this device is SM {}. " "Falling back to non-graph path.", cc); return; @@ -192,7 +192,7 @@ void KernelLauncher::ensure_condition_kernel_loaded() { } } if (cudadevrt_path.empty()) { - QD_WARN("Cannot find libcudadevrt.a — graph_while will not work"); + QD_WARN("Cannot find libcudadevrt.a — graph_do_while will not work"); return; } @@ -213,10 +213,10 @@ void KernelLauncher::ensure_condition_kernel_loaded() { driver.module_load_data(&cond_kernel_module_, cubin); driver.module_get_function(&cond_kernel_func_, cond_kernel_module_, - "_qd_graph_while_cond"); + "_qd_graph_do_while_cond"); driver.link_destroy(link_state); - QD_TRACE("Loaded graph_while condition kernel ({} bytes cubin)", cubin_size); + QD_TRACE("Loaded graph_do_while condition kernel ({} bytes cubin)", cubin_size); } bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, @@ -230,9 +230,9 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; // A single-task kernel has no multi-launch overhead to eliminate, so - // graphing it provides no benefit — unless graph_while is active, in which + // graphing it provides no benefit — unless graph_do_while is active, in which // case the graph is needed for the conditional-while loop structure. 
- if (offloaded_tasks.size() < 2 && ctx.graph_while_arg_id < 0) { + if (offloaded_tasks.size() < 2 && ctx.graph_do_while_arg_id < 0) { return false; } @@ -246,16 +246,16 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, return false; } - const bool use_graph_while = ctx.graph_while_arg_id >= 0; + const bool use_graph_do_while = ctx.graph_do_while_arg_id >= 0; auto it = cuda_graph_cache_.find(launch_id); if (it != cuda_graph_cache_.end()) { auto &cached = it->second; - if (use_graph_while && - cached.graph_while_flag_dev_ptr != ctx.graph_while_flag_dev_ptr) { + if (use_graph_do_while && + cached.graph_do_while_flag_dev_ptr != ctx.graph_do_while_flag_dev_ptr) { QD_TRACE( - "graph_while flag pointer changed ({} -> {}), rebuilding CUDA graph", - cached.graph_while_flag_dev_ptr, ctx.graph_while_flag_dev_ptr); + "graph_do_while flag pointer changed ({} -> {}), rebuilding CUDA graph", + cached.graph_do_while_flag_dev_ptr, ctx.graph_do_while_flag_dev_ptr); cuda_graph_cache_.erase(it); } else { if (ctx.arg_buffer_size > 0) { @@ -303,11 +303,11 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, CUDADriver::get_instance().graph_create(&graph, 0); // Determine the target graph for kernel nodes. - // With graph_while, kernels go into the conditional while body graph. + // With graph_do_while, kernels go into the conditional while body graph. 
void *kernel_target_graph = graph; unsigned long long cond_handle = 0; - if (use_graph_while) { + if (use_graph_do_while) { ensure_condition_kernel_loaded(); if (!cond_kernel_func_) { QD_WARN("Condition kernel not available, falling back to non-graph"); @@ -339,7 +339,7 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, QD_ASSERT(body_graphs && body_graphs[0]); kernel_target_graph = body_graphs[0]; - QD_TRACE("CUDA graph_while: conditional node created, body graph={}", + QD_TRACE("CUDA graph_do_while: conditional node created, body graph={}", kernel_target_graph); } @@ -372,11 +372,11 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, prev_node = node; } - // For graph_while: add condition kernel as the last node in the body graph - if (use_graph_while) { - QD_ASSERT(ctx.graph_while_flag_dev_ptr); + // For graph_do_while: add condition kernel as the last node in the body graph + if (use_graph_do_while) { + QD_ASSERT(ctx.graph_do_while_flag_dev_ptr); - void *flag_ptr = ctx.graph_while_flag_dev_ptr; + void *flag_ptr = ctx.graph_do_while_flag_dev_ptr; void *cond_args[2] = {&cond_handle, &flag_ptr}; CudaKernelNodeParams cond_kp{}; @@ -410,10 +410,10 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, "CUDA graph created with {} kernel nodes for launch_id={}" "{}", offloaded_tasks.size(), launch_id, - use_graph_while ? " (with graph_while)" : ""); + use_graph_do_while ? 
" (with graph_do_while)" : ""); - if (use_graph_while) { - cached.graph_while_flag_dev_ptr = ctx.graph_while_flag_dev_ptr; + if (use_graph_do_while) { + cached.graph_do_while_flag_dev_ptr = ctx.graph_do_while_flag_dev_ptr; } cuda_graph_cache_.emplace(launch_id, std::move(cached)); return true; @@ -517,8 +517,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); - if (arg_id == ctx.graph_while_arg_id) { - ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; } } else if (arr_sz > 0) { // Ndarray @@ -535,8 +535,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); - if (arg_id == ctx.graph_while_arg_id) { - ctx.graph_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; } } } @@ -566,16 +566,16 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, task.dynamic_shared_array_bytes, {&ctx.get_context()}, {}); } - if (ctx.graph_while_arg_id >= 0 && ctx.graph_while_flag_dev_ptr) { + if (ctx.graph_do_while_arg_id >= 0 && ctx.graph_do_while_flag_dev_ptr) { int32_t counter_val = 0; auto *stream = CUDAContext::get_instance().get_stream(); CUDADriver::get_instance().stream_synchronize(stream); CUDADriver::get_instance().memcpy_device_to_host( - &counter_val, ctx.graph_while_flag_dev_ptr, sizeof(int32_t)); + &counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); if (counter_val == 0) break; } - } while (ctx.graph_while_arg_id >= 0); + } while (ctx.graph_do_while_arg_id >= 0); if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, nullptr); } diff --git a/quadrants/runtime/cuda/kernel_launcher.h 
b/quadrants/runtime/cuda/kernel_launcher.h index ecaa935207..2605cbbb62 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -50,7 +50,7 @@ struct CachedCudaGraph { RuntimeContext persistent_ctx{}; std::size_t arg_buffer_size{0}; std::size_t result_buffer_size{0}; - void *graph_while_flag_dev_ptr{nullptr}; + void *graph_do_while_flag_dev_ptr{nullptr}; CachedCudaGraph() = default; ~CachedCudaGraph(); @@ -95,7 +95,7 @@ class KernelLauncher : public LLVM::KernelLauncher { std::unordered_map cuda_graph_cache_; bool cuda_graph_cache_used_on_last_call_{false}; - // JIT-compiled condition kernel for graph_while conditional nodes + // JIT-compiled condition kernel for graph_do_while conditional nodes void *cond_kernel_module_{nullptr}; // CUmodule void *cond_kernel_func_{nullptr}; // CUfunction }; diff --git a/tests/python/test_cuda_graph_while.py b/tests/python/test_cuda_graph_do_while.py similarity index 84% rename from tests/python/test_cuda_graph_while.py rename to tests/python/test_cuda_graph_do_while.py index c47104452b..e1638592bd 100644 --- a/tests/python/test_cuda_graph_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -6,11 +6,11 @@ @test_utils.test(arch=[qd.cuda]) -def test_graph_while_counter(): - """Test graph_while with a counter that decrements each iteration.""" +def test_graph_do_while_counter(): + """Test graph_do_while with a counter that decrements each iteration.""" N = 64 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 @@ -31,12 +31,12 @@ def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarra @test_utils.test(arch=[qd.cuda]) -def test_graph_while_boolean_done(): - """Test graph_while with a boolean 'continue' flag (non-zero = keep going).""" +def test_graph_do_while_boolean_done(): + """Test 
graph_do_while with a boolean 'continue' flag (non-zero = keep going).""" N = 64 threshold = 7 - @qd.kernel(graph_while="keep_going") + @qd.kernel(graph_do_while="keep_going") def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 @@ -58,11 +58,11 @@ def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: q @test_utils.test(arch=[qd.cuda]) -def test_graph_while_multiple_loops(): - """Test graph_while with multiple top-level loops in the kernel body.""" +def test_graph_do_while_multiple_loops(): + """Test graph_do_while with multiple top-level loops in the kernel body.""" N = 32 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def multi_loop( x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1), @@ -92,11 +92,11 @@ def multi_loop( @test_utils.test(arch=[qd.cuda]) -def test_graph_while_replay(): - """Test that graph_while works correctly on subsequent calls (graph replay).""" +def test_graph_do_while_replay(): + """Test that graph_do_while works correctly on subsequent calls (graph replay).""" N = 16 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 @@ -122,8 +122,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n @test_utils.test(arch=[qd.cuda]) -def test_graph_while_replay_new_ndarray(): - """Test graph_while replay when the counter ndarray is a different allocation. +def test_graph_do_while_replay_new_ndarray(): + """Test graph_do_while replay when the counter ndarray is a different allocation. Regression test: the condition kernel's flag pointer was baked into the CUDA graph at creation time. 
Passing a new ndarray (different device @@ -132,7 +132,7 @@ def test_graph_while_replay_new_ndarray(): """ N = 16 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 diff --git a/tests/python/test_graph_while_cross_backend.py b/tests/python/test_graph_do_while_cross_backend.py similarity index 85% rename from tests/python/test_graph_while_cross_backend.py rename to tests/python/test_graph_do_while_cross_backend.py index e49eb0729c..0fb0317f60 100644 --- a/tests/python/test_graph_while_cross_backend.py +++ b/tests/python/test_graph_do_while_cross_backend.py @@ -6,12 +6,12 @@ @test_utils.test(arch=[qd.cpu, qd.cuda]) -def test_graph_while_counter_cross_backend(): - """graph_while with a counter: must work identically on CPU and CUDA.""" +def test_graph_do_while_counter_cross_backend(): + """graph_do_while with a counter: must work identically on CPU and CUDA.""" N = 64 ITERS = 5 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 @@ -32,8 +32,8 @@ def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarra @test_utils.test(arch=[qd.cpu, qd.cuda]) -def test_graph_while_boolean_reduction_cross_backend(): - """graph_while with per-thread conditions reduced into a single flag. +def test_graph_do_while_boolean_reduction_cross_backend(): + """graph_do_while with per-thread conditions reduced into a single flag. Each element has a different threshold. The loop continues while ANY element hasn't reached its threshold. 
A reduction kernel (reset flag to 0, then @@ -41,7 +41,7 @@ def test_graph_while_boolean_reduction_cross_backend(): """ N = 32 - @qd.kernel(graph_while="keep_going") + @qd.kernel(graph_do_while="keep_going") def increment_until_all_done( x: qd.types.ndarray(qd.i32, ndim=1), thresholds: qd.types.ndarray(qd.i32, ndim=1), @@ -77,12 +77,12 @@ def increment_until_all_done( @test_utils.test(arch=[qd.cpu, qd.cuda]) -def test_graph_while_multi_loop_cross_backend(): - """graph_while with multiple top-level for loops in the body.""" +def test_graph_do_while_multi_loop_cross_backend(): + """graph_do_while with multiple top-level for loops in the body.""" N = 16 ITERS = 8 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def multi_loop( a: qd.types.ndarray(qd.f32, ndim=1), b: qd.types.ndarray(qd.f32, ndim=1), @@ -112,11 +112,11 @@ def multi_loop( @test_utils.test(arch=[qd.cpu, qd.cuda]) -def test_graph_while_replay_cross_backend(): - """graph_while replay: second call with different counter value.""" +def test_graph_do_while_replay_cross_backend(): + """graph_do_while replay: second call with different counter value.""" N = 16 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 @@ -144,8 +144,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n @test_utils.test(arch=[qd.cpu, qd.cuda]) -def test_graph_while_replay_new_ndarray_cross_backend(): - """graph_while replay with a different ndarray allocation for the counter. +def test_graph_do_while_replay_new_ndarray_cross_backend(): + """graph_do_while replay with a different ndarray allocation for the counter. Regression test: on CUDA, the condition kernel's flag pointer was baked into the graph at creation time. 
Passing a new ndarray on replay would @@ -155,7 +155,7 @@ def test_graph_while_replay_new_ndarray_cross_backend(): """ N = 16 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 @@ -184,15 +184,15 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n @test_utils.test(arch=[qd.cpu, qd.cuda]) -def test_graph_while_single_iteration(): - """graph_while with counter=1 executes the body exactly once. +def test_graph_do_while_single_iteration(): + """graph_do_while with counter=1 executes the body exactly once. - Note: graph_while has do-while semantics (body executes at least once, + Note: graph_do_while has do-while semantics (body executes at least once, matching CUDA conditional while node behavior). Counter must be >= 1. """ N = 8 - @qd.kernel(graph_while="counter") + @qd.kernel(graph_do_while="counter") def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 From aa4dd52f7992ac9203d1b590f6a22b4fe587b4db Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:15:05 -0400 Subject: [PATCH 047/128] add caveats to doc --- docs/source/user_guide/cuda_graph.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index ef9fe761a4..71cca1d647 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -116,3 +116,15 @@ def converge(x: qd.types.ndarray(qd.f32, ndim=1), The parameter used by `graph_do_while` MUST be an ndarray. However, other parameters can be any supported Quadrants kernel parameter type. 
+ +### Caveats + +On currently unsupported GPU platforms, such as AMDGPU at the time of writing, the value of the `graph_do_while` parameter will be copied from the GPU to the host each iteration, in order to check whether we should continue iterating. This causes a GPU pipeline stall. At the end of each loop iteration: +- wait for GPU async queue to finish processing +- copy condition value to hostside +- evaluate condition value on hostside +- launch new kernels for next loop iteration, if not finished yet + +Therefore on unsupported platforms, you might consider creating a second implementation, which works differently. e.g.: +- fixed number of loop iterations, so no dependency on gpu data for kernel launch; combined perhaps with: +- make each kernel 'short-circuit', exit quickly, if the task has already been completed; to avoid running the GPU more than necessary From 44781b861116713f1af4416778a318e8dc4e1965 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:23:48 -0400 Subject: [PATCH 048/128] Add comments to AMDGPU graph_do_while fallback code Made-with: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index e35edff10d..a7baa749d2 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -74,6 +74,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); + // Record the device pointer for the graph_do_while flag so the + // host-side fallback loop can read it back after each iteration. 
if (arg_id == ctx.graph_do_while_arg_id) { ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; } @@ -116,6 +118,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); + // Host-side do-while fallback for graph_do_while. AMDGPU has no conditional + // graph nodes, so we sync and read the flag back to the host each iteration. + // Without graph_do_while the loop body executes exactly once. do { for (auto &task : offloaded_tasks) { QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, From c63e20188add538ed4062b6b38fd36353932c020 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 22:27:01 -0400 Subject: [PATCH 049/128] Remove graph_do_while fallback, require CUDA with SM 9.0+ graph_do_while now throws on CPU and AMDGPU instead of falling back to a host-side loop. On CUDA, ensure_condition_kernel_loaded throws if SM < 9.0 or libcudadevrt.a is missing. Add libcudart-dev-12-8 to CI for libcudadevrt.a. All graph_do_while tests are CUDA-only with cache size and usage assertions. 
Made-with: Cursor --- .github/workflows/test_gpu.yml | 6 +- quadrants/runtime/amdgpu/kernel_launcher.cpp | 38 ++++--------- quadrants/runtime/cpu/kernel_launcher.cpp | 18 ++---- quadrants/runtime/cuda/kernel_launcher.cpp | 55 +++++-------------- tests/python/test_cuda_graph_do_while.py | 23 ++++++++ .../test_graph_do_while_cross_backend.py | 39 ++++++++++--- 6 files changed, 88 insertions(+), 91 deletions(-) diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml index 9eaeb5f679..7765d6f832 100644 --- a/.github/workflows/test_gpu.yml +++ b/.github/workflows/test_gpu.yml @@ -71,8 +71,7 @@ jobs: python-version: '3.10' - name: install cuda stuff run: | - sudo apt-get install -y libcusolver-dev-12-8 - sudo apt-get install -y libcusparse-dev-12-8 + sudo apt-get install -y libcusolver-dev-12-8 libcusparse-dev-12-8 libcudart-dev-12-8 ls -lhd /usr/local/cuda* ls -l /usr/local/cuda/lib64/libcusolver* ls -l /usr/local/cuda/lib64/libcusparse* @@ -105,8 +104,7 @@ jobs: python-version: '3.10' - name: install cuda stuff run: | - sudo apt-get install -y libcusolver-dev-12-8 - sudo apt-get install -y libcusparse-dev-12-8 + sudo apt-get install -y libcusolver-dev-12-8 libcusparse-dev-12-8 libcudart-dev-12-8 ls -lhd /usr/local/cuda* ls -l /usr/local/cuda/lib64/libcusolver* ls -l /usr/local/cuda/lib64/libcusparse* diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index a7baa749d2..62a64a65f9 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -74,11 +74,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); - // Record the device pointer for the graph_do_while flag so the - // host-side fallback loop can read it back after each iteration. 
- if (arg_id == ctx.graph_do_while_arg_id) { - ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; - } } else if (arr_sz > 0) { // why use arr_sz constrain? // Ndarray DeviceAllocation *ptr = static_cast(data_ptr); @@ -87,9 +82,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); - if (arg_id == ctx.graph_do_while_arg_id) { - ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; - } } } } @@ -118,26 +110,16 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); - // Host-side do-while fallback for graph_do_while. AMDGPU has no conditional - // graph nodes, so we sync and read the flag back to the host each iteration. - // Without graph_do_while the loop body executes exactly once. - do { - for (auto &task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); - } - if (ctx.graph_do_while_arg_id >= 0 && ctx.graph_do_while_flag_dev_ptr) { - int32_t counter_val = 0; - AMDGPUDriver::get_instance().stream_synchronize(nullptr); - AMDGPUDriver::get_instance().memcpy_device_to_host( - &counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); - if (counter_val == 0) - break; - } - } while (ctx.graph_do_while_arg_id >= 0); + QD_ERROR_IF(ctx.graph_do_while_arg_id >= 0, + "graph_do_while is only supported on the CUDA backend"); + + for (auto &task : offloaded_tasks) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { 
AMDGPUDriver::get_instance().mem_free(device_arg_buffer); diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp index 6b67a2ea5b..1f34dced14 100644 --- a/quadrants/runtime/cpu/kernel_launcher.cpp +++ b/quadrants/runtime/cpu/kernel_launcher.cpp @@ -27,9 +27,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); - if (arg_id == ctx.graph_do_while_arg_id) { - ctx.graph_do_while_flag_dev_ptr = data_ptr; - } } else if (ctx.array_runtime_sizes[arg_id] > 0) { uint64 host_ptr = (uint64)executor->get_device_alloc_info_ptr( *static_cast(data_ptr)); @@ -41,18 +38,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, : (uint64)executor->get_device_alloc_info_ptr( *static_cast(grad_ptr)); ctx.set_ndarray_ptrs(arg_id, host_ptr, host_ptr_grad); - if (arg_id == ctx.graph_do_while_arg_id) { - ctx.graph_do_while_flag_dev_ptr = (void *)host_ptr; - } } } } - do { - for (auto task : launcher_ctx.task_funcs) { - task(&ctx.get_context()); - } - } while (ctx.graph_do_while_arg_id >= 0 && ctx.graph_do_while_flag_dev_ptr && - *static_cast(ctx.graph_do_while_flag_dev_ptr) != 0); + QD_ERROR_IF(ctx.graph_do_while_arg_id >= 0, + "graph_do_while is only supported on the CUDA backend"); + + for (auto task : launcher_ctx.task_funcs) { + task(&ctx.get_context()); + } } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index b9c72c0a6e..5dbfa1d66d 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -165,13 +165,10 @@ void KernelLauncher::ensure_condition_kernel_loaded() { return; int cc = CUDAContext::get_instance().get_compute_capability(); - if (cc < 90) { - QD_WARN( - "graph_do_while requires SM 9.0+ (Hopper), but this device 
is SM {}. " - "Falling back to non-graph path.", - cc); - return; - } + QD_ERROR_IF(cc < 90, + "graph_do_while requires SM 9.0+ (Hopper), but this device is " + "SM {}.", + cc); auto &driver = CUDADriver::get_instance(); @@ -191,10 +188,9 @@ void KernelLauncher::ensure_condition_kernel_loaded() { break; } } - if (cudadevrt_path.empty()) { - QD_WARN("Cannot find libcudadevrt.a — graph_do_while will not work"); - return; - } + QD_ERROR_IF(cudadevrt_path.empty(), + "graph_do_while requires libcudadevrt.a but it was not found. " + "Install the CUDA toolkit and/or set CUDA_HOME."); void *link_state = nullptr; driver.link_create(0, nullptr, nullptr, &link_state); @@ -309,11 +305,7 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, if (use_graph_do_while) { ensure_condition_kernel_loaded(); - if (!cond_kernel_func_) { - QD_WARN("Condition kernel not available, falling back to non-graph"); - CUDADriver::get_instance().graph_destroy(graph); - return false; - } + QD_ASSERT(cond_kernel_func_); void *cu_ctx = CUDAContext::get_instance().get_context(); @@ -517,9 +509,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); - if (arg_id == ctx.graph_do_while_arg_id) { - ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; - } } else if (arr_sz > 0) { // Ndarray DeviceAllocation *ptr = static_cast(data_ptr); @@ -535,9 +524,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); - if (arg_id == ctx.graph_do_while_arg_id) { - ctx.graph_do_while_flag_dev_ptr = device_ptrs[data_ptr_idx]; - } } } } @@ -558,24 +544,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.get_context().arg_buffer = device_arg_buffer; } - do { - for (auto task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - 
cuda_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, {&ctx.get_context()}, - {}); - } - if (ctx.graph_do_while_arg_id >= 0 && ctx.graph_do_while_flag_dev_ptr) { - int32_t counter_val = 0; - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().stream_synchronize(stream); - CUDADriver::get_instance().memcpy_device_to_host( - &counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); - if (counter_val == 0) - break; - } - } while (ctx.graph_do_while_arg_id >= 0); + for (auto task : offloaded_tasks) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + cuda_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); + } if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, nullptr); } diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index e1638592bd..6bdeb7ac92 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -1,10 +1,19 @@ import numpy as np import quadrants as qd +from quadrants.lang import impl from tests import test_utils +def _cuda_graph_cache_size(): + return impl.get_runtime().prog.get_cuda_graph_cache_size() + + +def _cuda_graph_used(): + return impl.get_runtime().prog.get_cuda_graph_cache_used_on_last_call() + + @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_counter(): """Test graph_do_while with a counter that decrements each iteration.""" @@ -24,6 +33,8 @@ def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarra counter.from_numpy(np.array(5, dtype=np.int32)) increment_loop(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() assert counter.to_numpy() == 0 @@ -51,6 +62,8 @@ def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: q 
keep_going.from_numpy(np.array(1, dtype=np.int32)) increment_until_threshold(x, keep_going) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() assert keep_going.to_numpy() == 0 @@ -84,6 +97,8 @@ def multi_loop( counter.from_numpy(np.array(10, dtype=np.int32)) multi_loop(x, y, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() assert counter.to_numpy() == 0 @@ -110,6 +125,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter.from_numpy(np.array(3, dtype=np.int32)) inc(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) @@ -117,6 +134,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter.from_numpy(np.array(7, dtype=np.int32)) inc(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) @@ -146,6 +165,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter1.from_numpy(np.array(3, dtype=np.int32)) inc(x, counter1) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) assert counter1.to_numpy() == 0 @@ -155,6 +176,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter2.from_numpy(np.array(5, dtype=np.int32)) inc(x, counter2) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 5, dtype=np.int32)) assert counter2.to_numpy() == 0 diff --git a/tests/python/test_graph_do_while_cross_backend.py 
b/tests/python/test_graph_do_while_cross_backend.py index 0fb0317f60..0193f6d3ee 100644 --- a/tests/python/test_graph_do_while_cross_backend.py +++ b/tests/python/test_graph_do_while_cross_backend.py @@ -1,13 +1,22 @@ import numpy as np import quadrants as qd +from quadrants.lang import impl from tests import test_utils -@test_utils.test(arch=[qd.cpu, qd.cuda]) +def _cuda_graph_cache_size(): + return impl.get_runtime().prog.get_cuda_graph_cache_size() + + +def _cuda_graph_used(): + return impl.get_runtime().prog.get_cuda_graph_cache_used_on_last_call() + + +@test_utils.test(arch=[qd.cuda]) def test_graph_do_while_counter_cross_backend(): - """graph_do_while with a counter: must work identically on CPU and CUDA.""" + """graph_do_while with a counter.""" N = 64 ITERS = 5 @@ -25,13 +34,15 @@ def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarra counter.from_numpy(np.array(ITERS, dtype=np.int32)) increment_loop(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() assert counter.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), np.full(N, ITERS, dtype=np.int32)) -@test_utils.test(arch=[qd.cpu, qd.cuda]) +@test_utils.test(arch=[qd.cuda]) def test_graph_do_while_boolean_reduction_cross_backend(): """graph_do_while with per-thread conditions reduced into a single flag. 
@@ -70,13 +81,15 @@ def increment_until_all_done( keep_going.from_numpy(np.array(1, dtype=np.int32)) increment_until_all_done(x, thresholds, keep_going) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() assert keep_going.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), thresh_np) -@test_utils.test(arch=[qd.cpu, qd.cuda]) +@test_utils.test(arch=[qd.cuda]) def test_graph_do_while_multi_loop_cross_backend(): """graph_do_while with multiple top-level for loops in the body.""" N = 16 @@ -104,6 +117,8 @@ def multi_loop( counter.from_numpy(np.array(ITERS, dtype=np.int32)) multi_loop(a, b, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() assert counter.to_numpy() == 0 @@ -111,7 +126,7 @@ def multi_loop( np.testing.assert_allclose(b.to_numpy(), np.full(N, float(ITERS * 3))) -@test_utils.test(arch=[qd.cpu, qd.cuda]) +@test_utils.test(arch=[qd.cuda]) def test_graph_do_while_replay_cross_backend(): """graph_do_while replay: second call with different counter value.""" N = 16 @@ -130,6 +145,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter.from_numpy(np.array(3, dtype=np.int32)) inc(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) assert counter.to_numpy() == 0 @@ -138,12 +155,14 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter.from_numpy(np.array(7, dtype=np.int32)) inc(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) assert counter.to_numpy() == 0 -@test_utils.test(arch=[qd.cpu, qd.cuda]) +@test_utils.test(arch=[qd.cuda]) def test_graph_do_while_replay_new_ndarray_cross_backend(): """graph_do_while replay 
with a different ndarray allocation for the counter. @@ -169,6 +188,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter1.from_numpy(np.array(4, dtype=np.int32)) inc(x, counter1) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 4, dtype=np.int32)) assert counter1.to_numpy() == 0 @@ -178,12 +199,14 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x.from_numpy(np.zeros(N, dtype=np.int32)) counter2.from_numpy(np.array(6, dtype=np.int32)) inc(x, counter2) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 6, dtype=np.int32)) assert counter2.to_numpy() == 0 -@test_utils.test(arch=[qd.cpu, qd.cuda]) +@test_utils.test(arch=[qd.cuda]) def test_graph_do_while_single_iteration(): """graph_do_while with counter=1 executes the body exactly once. @@ -206,6 +229,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n counter.from_numpy(np.array(1, dtype=np.int32)) inc(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 qd.sync() assert counter.to_numpy() == 0 From 16051887db5f9b5a4396abc7a6bf3563499a3899 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 22:30:17 -0400 Subject: [PATCH 050/128] Remove cross-backend graph_do_while tests graph_do_while is CUDA-only; cross-backend tests are redundant with test_cuda_graph_do_while.py. 
Made-with: Cursor --- .../test_graph_do_while_cross_backend.py | 237 ------------------ 1 file changed, 237 deletions(-) delete mode 100644 tests/python/test_graph_do_while_cross_backend.py diff --git a/tests/python/test_graph_do_while_cross_backend.py b/tests/python/test_graph_do_while_cross_backend.py deleted file mode 100644 index 0193f6d3ee..0000000000 --- a/tests/python/test_graph_do_while_cross_backend.py +++ /dev/null @@ -1,237 +0,0 @@ -import numpy as np - -import quadrants as qd -from quadrants.lang import impl - -from tests import test_utils - - -def _cuda_graph_cache_size(): - return impl.get_runtime().prog.get_cuda_graph_cache_size() - - -def _cuda_graph_used(): - return impl.get_runtime().prog.get_cuda_graph_cache_used_on_last_call() - - -@test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_counter_cross_backend(): - """graph_do_while with a counter.""" - N = 64 - ITERS = 5 - - @qd.kernel(graph_do_while="counter") - def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - counter[None] = counter[None] - 1 - - x = qd.ndarray(qd.i32, shape=(N,)) - counter = qd.ndarray(qd.i32, shape=()) - - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter.from_numpy(np.array(ITERS, dtype=np.int32)) - - increment_loop(x, counter) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - - assert counter.to_numpy() == 0 - np.testing.assert_array_equal(x.to_numpy(), np.full(N, ITERS, dtype=np.int32)) - - -@test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_boolean_reduction_cross_backend(): - """graph_do_while with per-thread conditions reduced into a single flag. - - Each element has a different threshold. The loop continues while ANY element - hasn't reached its threshold. A reduction kernel (reset flag to 0, then - any-not-done sets it to 1) combines per-element state into the scalar flag. 
- """ - N = 32 - - @qd.kernel(graph_do_while="keep_going") - def increment_until_all_done( - x: qd.types.ndarray(qd.i32, ndim=1), - thresholds: qd.types.ndarray(qd.i32, ndim=1), - keep_going: qd.types.ndarray(qd.i32, ndim=0), - ): - # Work: increment elements that haven't reached their threshold - for i in range(x.shape[0]): - if x[i] < thresholds[i]: - x[i] = x[i] + 1 - - # Reduction: reset flag, then OR-reduce per-element conditions - for i in range(1): - keep_going[None] = 0 - for i in range(x.shape[0]): - if x[i] < thresholds[i]: - keep_going[None] = 1 - - x = qd.ndarray(qd.i32, shape=(N,)) - thresholds = qd.ndarray(qd.i32, shape=(N,)) - keep_going = qd.ndarray(qd.i32, shape=()) - - # Thresholds vary: 1, 2, 3, ..., N. Loop must run N times (max threshold). - thresh_np = np.arange(1, N + 1, dtype=np.int32) - x.from_numpy(np.zeros(N, dtype=np.int32)) - thresholds.from_numpy(thresh_np) - keep_going.from_numpy(np.array(1, dtype=np.int32)) - - increment_until_all_done(x, thresholds, keep_going) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - - assert keep_going.to_numpy() == 0 - np.testing.assert_array_equal(x.to_numpy(), thresh_np) - - -@test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_multi_loop_cross_backend(): - """graph_do_while with multiple top-level for loops in the body.""" - N = 16 - ITERS = 8 - - @qd.kernel(graph_do_while="counter") - def multi_loop( - a: qd.types.ndarray(qd.f32, ndim=1), - b: qd.types.ndarray(qd.f32, ndim=1), - counter: qd.types.ndarray(qd.i32, ndim=0), - ): - for i in range(a.shape[0]): - a[i] = a[i] + 1.0 - for i in range(b.shape[0]): - b[i] = b[i] + 3.0 - for i in range(1): - counter[None] = counter[None] - 1 - - a = qd.ndarray(qd.f32, shape=(N,)) - b = qd.ndarray(qd.f32, shape=(N,)) - counter = qd.ndarray(qd.i32, shape=()) - - a.from_numpy(np.zeros(N, dtype=np.float32)) - b.from_numpy(np.zeros(N, dtype=np.float32)) - counter.from_numpy(np.array(ITERS, dtype=np.int32)) - - multi_loop(a, b, 
counter) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - - assert counter.to_numpy() == 0 - np.testing.assert_allclose(a.to_numpy(), np.full(N, float(ITERS))) - np.testing.assert_allclose(b.to_numpy(), np.full(N, float(ITERS * 3))) - - -@test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_replay_cross_backend(): - """graph_do_while replay: second call with different counter value.""" - N = 16 - - @qd.kernel(graph_do_while="counter") - def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - counter[None] = counter[None] - 1 - - x = qd.ndarray(qd.i32, shape=(N,)) - counter = qd.ndarray(qd.i32, shape=()) - - # First call: 3 iterations - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter.from_numpy(np.array(3, dtype=np.int32)) - inc(x, counter) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) - assert counter.to_numpy() == 0 - - # Second call: 7 iterations - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter.from_numpy(np.array(7, dtype=np.int32)) - inc(x, counter) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) - assert counter.to_numpy() == 0 - - -@test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_replay_new_ndarray_cross_backend(): - """graph_do_while replay with a different ndarray allocation for the counter. - - Regression test: on CUDA, the condition kernel's flag pointer was baked - into the graph at creation time. Passing a new ndarray on replay would - read stale memory. The fix invalidates the cached graph when the flag - pointer changes. On CPU the host-side fallback always reads from the - current pointer, so this verifies both paths produce correct results. 
- """ - N = 16 - - @qd.kernel(graph_do_while="counter") - def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - counter[None] = counter[None] - 1 - - x = qd.ndarray(qd.i32, shape=(N,)) - - # First call with one counter ndarray - counter1 = qd.ndarray(qd.i32, shape=()) - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter1.from_numpy(np.array(4, dtype=np.int32)) - inc(x, counter1) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 4, dtype=np.int32)) - assert counter1.to_numpy() == 0 - - # Second call with a NEW counter ndarray (different device allocation) - counter2 = qd.ndarray(qd.i32, shape=()) - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter2.from_numpy(np.array(6, dtype=np.int32)) - inc(x, counter2) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 6, dtype=np.int32)) - assert counter2.to_numpy() == 0 - - -@test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_single_iteration(): - """graph_do_while with counter=1 executes the body exactly once. - - Note: graph_do_while has do-while semantics (body executes at least once, - matching CUDA conditional while node behavior). Counter must be >= 1. 
- """ - N = 8 - - @qd.kernel(graph_do_while="counter") - def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - counter[None] = counter[None] - 1 - - x = qd.ndarray(qd.i32, shape=(N,)) - counter = qd.ndarray(qd.i32, shape=()) - - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter.from_numpy(np.array(1, dtype=np.int32)) - - inc(x, counter) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - - assert counter.to_numpy() == 0 - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 1, dtype=np.int32)) From 9fbb433c90502da7039cb1ce6f34b61952c56f4d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:21:27 -0700 Subject: [PATCH 051/128] Allow cuda_graph=True for single-task kernels The < 2 tasks guard was rejecting single-loop kernels from the graph path even when the user explicitly requested cuda_graph=True. Relax the check to only skip empty task lists, respecting the user's intent. --- quadrants/runtime/cuda/kernel_launcher.cpp | 4 +--- tests/python/test_cuda_graph.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 80b4131f44..ce579999dd 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -117,9 +117,7 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, const auto ¶meters = *launcher_ctx.parameters; const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; - // A single-task kernel has no multi-launch overhead to eliminate, so - // graphing it provides no benefit. Return false to use the normal path. 
- if (offloaded_tasks.size() < 2) { + if (offloaded_tasks.empty()) { return false; } diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index c70408a6f3..a98bce7059 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -101,9 +101,10 @@ def three_loops(a: Annotation, b: Annotation, c: Annotation): @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() -def test_cuda_graph_single_loop_no_graph(tensor_type): - """A kernel with a single for loop should NOT use the graph path, - even with cuda_graph=True (falls back since < 2 tasks).""" +def test_cuda_graph_single_loop(tensor_type): + """A kernel with a single for loop should still use the graph path + when cuda_graph=True is explicitly requested.""" + platform_supports_graph = _on_cuda() n = 256 Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template @@ -116,10 +117,11 @@ def single_loop(x: Annotation): x = tensor_type(qd.f32, (n,)) single_loop(x) - assert not _cuda_graph_used() + assert _cuda_graph_used() == platform_supports_graph + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) single_loop(x) - assert not _cuda_graph_used() - assert _cuda_graph_cache_size() == 0 + assert _cuda_graph_used() == platform_supports_graph + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) x_np = x.to_numpy() assert np.allclose(x_np, 10.0) From ae8d5a9520eafcacfa3a23b9e6068447213a0c1b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:22:03 -0700 Subject: [PATCH 052/128] Remove test_cuda_graph_single_loop test --- tests/python/test_cuda_graph.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index a98bce7059..b3775ca606 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -99,34 +99,6 @@ def three_loops(a: Annotation, b: 
Annotation, c: Annotation): assert np.allclose(c_np, 22.0) -@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) -@test_utils.test() -def test_cuda_graph_single_loop(tensor_type): - """A kernel with a single for loop should still use the graph path - when cuda_graph=True is explicitly requested.""" - platform_supports_graph = _on_cuda() - n = 256 - - Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template - - @qd.kernel(cuda_graph=True) - def single_loop(x: Annotation): - for i in range(x.shape[0]): - x[i] = x[i] + 5.0 - - x = tensor_type(qd.f32, (n,)) - - single_loop(x) - assert _cuda_graph_used() == platform_supports_graph - assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) - single_loop(x) - assert _cuda_graph_used() == platform_supports_graph - assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) - - x_np = x.to_numpy() - assert np.allclose(x_np, 10.0) - - @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() def test_no_cuda_graph_annotation(tensor_type): From aa82bcbfe9f8b6a6aee6dab1d2cf46150f88152b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:27:42 -0700 Subject: [PATCH 053/128] Fix cuda-cudart-dev package name in GPU workflow The package is cuda-cudart-dev-12-8, not libcudart-dev-12-8. 
--- .github/workflows/test_gpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml index 7765d6f832..d9262a1a35 100644 --- a/.github/workflows/test_gpu.yml +++ b/.github/workflows/test_gpu.yml @@ -71,7 +71,7 @@ jobs: python-version: '3.10' - name: install cuda stuff run: | - sudo apt-get install -y libcusolver-dev-12-8 libcusparse-dev-12-8 libcudart-dev-12-8 + sudo apt-get install -y libcusolver-dev-12-8 libcusparse-dev-12-8 cuda-cudart-dev-12-8 ls -lhd /usr/local/cuda* ls -l /usr/local/cuda/lib64/libcusolver* ls -l /usr/local/cuda/lib64/libcusparse* @@ -104,7 +104,7 @@ jobs: python-version: '3.10' - name: install cuda stuff run: | - sudo apt-get install -y libcusolver-dev-12-8 libcusparse-dev-12-8 libcudart-dev-12-8 + sudo apt-get install -y libcusolver-dev-12-8 libcusparse-dev-12-8 cuda-cudart-dev-12-8 ls -lhd /usr/local/cuda* ls -l /usr/local/cuda/lib64/libcusolver* ls -l /usr/local/cuda/lib64/libcusparse* From 4de7a649683a54e466d5ea4a721ce9ab73d3fccd Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:29:45 -0700 Subject: [PATCH 054/128] Fix formatting (black + clang-format) --- python/quadrants/lang/kernel_impl.py | 4 +++- quadrants/runtime/cuda/kernel_launcher.cpp | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index a9103b69fc..4275a57014 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -259,7 +259,9 @@ def decorator(fn: F, has_kernel_params: bool = True) -> F: else: level = 4 - wrapped = _kernel_impl(fn, level_of_class_stackframe=level, cuda_graph=cuda_graph, graph_do_while=graph_do_while) + wrapped = _kernel_impl( + fn, level_of_class_stackframe=level, cuda_graph=cuda_graph, graph_do_while=graph_do_while + ) wrapped.is_pure = pure is not None and pure or fastcache if pure is not None: 
warnings_helper.warn_once( diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index bd9dfa3356..3dfcc388e1 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -212,7 +212,8 @@ void KernelLauncher::ensure_condition_kernel_loaded() { "_qd_graph_do_while_cond"); driver.link_destroy(link_state); - QD_TRACE("Loaded graph_do_while condition kernel ({} bytes cubin)", cubin_size); + QD_TRACE("Loaded graph_do_while condition kernel ({} bytes cubin)", + cubin_size); } bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, @@ -247,7 +248,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, if (use_graph_do_while && cached.graph_do_while_flag_dev_ptr != ctx.graph_do_while_flag_dev_ptr) { QD_TRACE( - "graph_do_while flag pointer changed ({} -> {}), rebuilding CUDA graph", + "graph_do_while flag pointer changed ({} -> {}), rebuilding CUDA " + "graph", cached.graph_do_while_flag_dev_ptr, ctx.graph_do_while_flag_dev_ptr); cuda_graph_cache_.erase(it); } else { From b737cce194e43652ebfcd0250866f940b7a47240 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:34:44 -0700 Subject: [PATCH 055/128] Simplify cuda_graph doc caveats section --- docs/source/user_guide/cuda_graph.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index 71cca1d647..e8047b869b 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -76,7 +76,7 @@ solve(x, counter) The `graph_do_while` value is the name of a scalar `qd.i32` ndarray parameter. The kernel body repeats while this value is non-zero. - On SM 9.0+ (Hopper), this uses CUDA conditional while nodes — the entire iteration runs on the GPU with no host involvement. -- On older CUDA GPUs and non-CUDA backends, it falls back to a host-side do-while loop. 
+- Old CUDA GPUs, and non-CUDA backends not currently supported. - `graph_do_while` implicitly enables `cuda_graph=True`. ### Patterns @@ -119,12 +119,4 @@ However, other parameters can be any supported Quadrants kernel parameter type. ### Caveats -On currently unsupported GPU platforms, such as AMDGPU at the time of writing, the value of the `graph_do_while` parameter will be copied from the GPU to the host each iteration, in order to check whether we should continue iterating. This causes a GPU pipeline stall. At the end of each loop iteration: -- wait for GPU async queue to finish processing -- copy condition value to hostside -- evaluate condition value on hostside -- launch new kernels for next loop iteration, if not finished yet - -Therefore on unsupported platforms, you might consider creating a second implementation, which works differently. e.g.: -- fixed number of loop iterations, so no dependency on gpu data for kernel launch; combined perhaps with: -- make each kernel 'short-circuit', exit quickly, if the task has already been completed; to avoid running the GPU more than necessary +Only runs on CUDA. No fallback on non-CUDA platforms currently. From 81f580f4f6185fd7438a7e6a50b24d4687ce8571 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:35:24 -0700 Subject: [PATCH 056/128] Fix wording: "Older CUDA GPUs" --- docs/source/user_guide/cuda_graph.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index e8047b869b..23b489fce2 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -76,7 +76,7 @@ solve(x, counter) The `graph_do_while` value is the name of a scalar `qd.i32` ndarray parameter. The kernel body repeats while this value is non-zero. - On SM 9.0+ (Hopper), this uses CUDA conditional while nodes — the entire iteration runs on the GPU with no host involvement. 
-- Old CUDA GPUs, and non-CUDA backends not currently supported. +- Older CUDA GPUs, and non-CUDA backends not currently supported. - `graph_do_while` implicitly enables `cuda_graph=True`. ### Patterns From 8333f142880fb2d0a4bebd04b72a4c79e80650b9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:37:20 -0700 Subject: [PATCH 057/128] Update kernel_impl.py docstring: non-CUDA not supported --- python/quadrants/lang/kernel_impl.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index 4275a57014..395d6b1150 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -227,13 +227,11 @@ def kernel( Args: cuda_graph: If True, kernels with 2+ top-level for loops are captured into a CUDA graph on first launch and replayed on subsequent - launches, reducing per-kernel launch overhead. On non-CUDA backends - this flag is a harmless no-op. + launches, reducing per-kernel launch overhead. Non-CUDA backends are not supported currently. graph_do_while: Name of a scalar ``qd.i32`` ndarray parameter that controls GPU-side iteration. The kernel body repeats while the named argument is non-zero. Uses CUDA conditional while nodes - on SM 9.0+ (Hopper); falls back to a host-side do-while loop - on older GPUs and non-CUDA backends. Implicitly enables + on SM 9.0+ (Hopper). Implicitly enables ``cuda_graph=True``. 
**Do-while semantics**: the kernel body always executes at least From aedac2860b1096a7f5054bb98e7f3d0d2532fd86 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:38:59 -0700 Subject: [PATCH 058/128] Add comments to JIT linker function declarations --- quadrants/rhi/cuda/cuda_driver_functions.inc.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 9fe0e543d5..89f56ada94 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -81,10 +81,15 @@ PER_CUDA_FUNCTION(graph_exec_destroy, cuGraphExecDestroy, void *); PER_CUDA_FUNCTION(graph_conditional_handle_create, cuGraphConditionalHandleCreate, void *, void *, void *, uint32, uint32); // JIT linker (for loading condition kernel with cudadevrt) +// Creates a new JIT linker session with options (e.g. optimization level, target arch). PER_CUDA_FUNCTION(link_create, cuLinkCreate_v2, uint32, void *, void *, void **); +// Adds code from memory (PTX source or cubin bytes) to the link session. PER_CUDA_FUNCTION(link_add_data, cuLinkAddData_v2, void *, uint32, void *, std::size_t, const char *, uint32, void *, void *); +// Adds code from a file on disk (PTX or cubin) to the link session. PER_CUDA_FUNCTION(link_add_file, cuLinkAddFile_v2, void *, uint32, const char *, uint32, void *, void *); +// Finalizes linking, producing a cubin image in memory (returns pointer and size). PER_CUDA_FUNCTION(link_complete, cuLinkComplete, void *, void **, std::size_t *); +// Destroys the link state and frees resources. 
PER_CUDA_FUNCTION(link_destroy, cuLinkDestroy, void *); PER_CUDA_FUNCTION(module_load_data, cuModuleLoadData, void **, const void *); // clang-format on From a7a3ad9f7f472d192ecd1ab536840cad7015f130 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:41:32 -0700 Subject: [PATCH 059/128] Add comments to graph_do_while condition kernel PTX --- quadrants/runtime/cuda/kernel_launcher.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 3dfcc388e1..da7691b214 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -10,8 +10,16 @@ namespace quadrants::lang { namespace cuda { // PTX for a tiny condition kernel that reads a device-side int32 flag and -// calls cudaGraphSetConditional(handle, flag != 0 ? 1 : 0). +// Condition kernel for graph_do_while. Reads the user's i32 loop-control flag +// from GPU memory and tells the CUDA graph's conditional while node whether to +// run another iteration — all without returning to the host. +// +// Parameters: +// param_0: conditional node handle (passed to cudaGraphSetConditional) +// param_1: pointer to the user's qd.i32 flag ndarray on the GPU +// // Compiled from CUDA C with: nvcc -ptx -arch=sm_90 -rdc=true +// Requires SM 9.0+ (Hopper) for cudaGraphSetConditional / conditional nodes. // Requires JIT linking with libcudadevrt.a at runtime. 
static const char *kConditionKernelPTX = R"PTX( .version 8.8 @@ -31,12 +39,16 @@ static const char *kConditionKernelPTX = R"PTX( .reg .pred %p<2>; .reg .b32 %r<3>; .reg .b64 %rd<4>; + // Load conditional node handle and flag pointer ld.param.u64 %rd1, [_qd_graph_do_while_cond_param_0]; ld.param.u64 %rd2, [_qd_graph_do_while_cond_param_1]; + // Read flag value from GPU global memory cvta.to.global.u64 %rd3, %rd2; ld.global.u32 %r1, [%rd3]; + // Convert to boolean: 1 if flag != 0, else 0 setp.ne.s32 %p1, %r1, 0; selp.u32 %r2, 1, 0, %p1; + // Call cudaGraphSetConditional(handle, should_continue) { // callseq 0, 0 .reg .b32 temp_param_reg; .param .b64 param0; From cc6415713756d52251752807c4f81634d4539cc7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:43:49 -0700 Subject: [PATCH 060/128] Improve comments in graph_do_while condition kernel PTX --- quadrants/runtime/cuda/kernel_launcher.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index da7691b214..bf74795c21 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -25,12 +25,19 @@ static const char *kConditionKernelPTX = R"PTX( .version 8.8 .target sm_90 .address_size 64 + +// Declare the device-side cudaGraphSetConditional function (from libcudadevrt). +// Takes a conditional node handle (u64) and a boolean (u32: 1=continue, 0=stop). .extern .func cudaGraphSetConditional ( .param .b64 cudaGraphSetConditional_param_0, .param .b32 cudaGraphSetConditional_param_1 ) ; + +// Entry point: called by the CUDA graph's conditional while node each iteration. 
+// param_0 (u64): conditional node handle +// param_1 (u64): pointer to the user's qd.i32 flag in GPU global memory .visible .entry _qd_graph_do_while_cond( .param .u64 _qd_graph_do_while_cond_param_0, .param .u64 _qd_graph_do_while_cond_param_1 @@ -39,16 +46,23 @@ static const char *kConditionKernelPTX = R"PTX( .reg .pred %p<2>; .reg .b32 %r<3>; .reg .b64 %rd<4>; - // Load conditional node handle and flag pointer + + // Load the two kernel parameters into registers: + // %rd1 = conditional node handle + // %rd2 = pointer to user's i32 flag ld.param.u64 %rd1, [_qd_graph_do_while_cond_param_0]; ld.param.u64 %rd2, [_qd_graph_do_while_cond_param_1]; - // Read flag value from GPU global memory + + // Convert generic pointer to global address space, then read the flag value cvta.to.global.u64 %rd3, %rd2; ld.global.u32 %r1, [%rd3]; - // Convert to boolean: 1 if flag != 0, else 0 + + // Convert flag to boolean: %r2 = (flag != 0) ? 1 : 0 setp.ne.s32 %p1, %r1, 0; selp.u32 %r2, 1, 0, %p1; - // Call cudaGraphSetConditional(handle, should_continue) + + // Tell the conditional while node whether to loop again or stop. + // cudaGraphSetConditional(handle, should_continue) { // callseq 0, 0 .reg .b32 temp_param_reg; .param .b64 param0; From e235b23a9f8828d86593df9387300190033e794a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:48:26 -0700 Subject: [PATCH 061/128] Assert no gradient pointers in cuda_graph path cuda_graph does not support autograd, so grad_ptr should always be null. Replace the defensive gradient resolution with an explicit error check. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index bf74795c21..bd4c03d7ee 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -155,8 +155,13 @@ bool KernelLauncher::resolve_ctx_ndarray_ptrs( auto data_ptr = ctx.array_ptrs[data_ptr_idx]; auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; + QD_ERROR_IF(grad_ptr != nullptr, + "cuda_graph does not support autograd; " + "ndarray arg {} has a non-null gradient pointer", arg_id); + + // Raw device pointer to the array data, resolved from either an + // external array (raw pointer) or a DeviceAllocation handle. void *resolved_data = nullptr; - void *resolved_grad = nullptr; if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { @@ -164,19 +169,13 @@ bool KernelLauncher::resolve_ctx_ndarray_ptrs( return false; } resolved_data = data_ptr; - resolved_grad = grad_ptr; } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); resolved_data = executor->get_device_alloc_info_ptr(*ptr); - if (grad_ptr) { - resolved_grad = executor->get_device_alloc_info_ptr( - *static_cast(grad_ptr)); - } } if (resolved_data) { - ctx.set_ndarray_ptrs(arg_id, (uint64)resolved_data, - (uint64)resolved_grad); + ctx.set_ndarray_ptrs(arg_id, (uint64)resolved_data, (uint64) nullptr); if (arg_id == ctx.graph_do_while_arg_id) { ctx.graph_do_while_flag_dev_ptr = resolved_data; } From ce09e5ee73a84c4572728eee887d9d7c351a2212 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:52:05 -0700 Subject: [PATCH 062/128] Add /*name=*/ comment to link_add_data call --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 
bd4c03d7ee..af93d18657 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -223,7 +223,7 @@ void KernelLauncher::ensure_condition_kernel_loaded() { std::size_t ptx_len = std::strlen(kConditionKernelPTX) + 1; driver.link_add_data(link_state, /*CU_JIT_INPUT_PTX=*/1, const_cast(kConditionKernelPTX), ptx_len, - "qd_cond", 0, nullptr, nullptr); + /*name=*/"qd_cond", 0, nullptr, nullptr); driver.link_add_file(link_state, /*CU_JIT_INPUT_LIBRARY=*/4, cudadevrt_path.c_str(), 0, nullptr, nullptr); From ea835196ec061e9f4d6b88b90cf27eead152a12f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 13 Mar 2026 20:54:07 -0700 Subject: [PATCH 063/128] Add description comment to ensure_condition_kernel_loaded --- quadrants/runtime/cuda/kernel_launcher.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index af93d18657..2801bffdb7 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -185,6 +185,10 @@ bool KernelLauncher::resolve_ctx_ndarray_ptrs( return true; } +// Lazily JIT-compiles and loads the graph_do_while condition kernel. +// Links the PTX (kConditionKernelPTX) with libcudadevrt.a to produce a cubin, +// then loads the _qd_graph_do_while_cond function for use in conditional +// while nodes. Only called once; subsequent calls are no-ops. void KernelLauncher::ensure_condition_kernel_loaded() { if (cond_kernel_func_) return; From 0301b76a28b13c70b6f8c9aa37870aeb72b6d7a7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 01:18:21 -0400 Subject: [PATCH 064/128] Throw error if graph_do_while condition ndarray changes between calls The condition kernel flag pointer is baked into the CUDA graph at creation time. Passing a different ndarray would require a graph rebuild, which is not yet supported. Raise a clear error instead of silently rebuilding. 
Made-with: Cursor --- docs/source/user_guide/cuda_graph.md | 6 +++++ quadrants/runtime/cuda/kernel_launcher.cpp | 8 +++---- tests/python/test_cuda_graph_do_while.py | 26 +++++----------------- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index 23b489fce2..d978a05c73 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -117,6 +117,12 @@ The parameter used by `graph_do_while` MUST be an ndarray. However, other parameters can be any supported Quadrants kernel parameter type. +### Restrictions + +- The same physical ndarray must be used for the counter parameter on every + call. Passing a different ndarray raises an error, because the counter's + device pointer is baked into the CUDA graph at creation time. + ### Caveats Only runs on CUDA. No fallback on non-CUDA platforms currently. diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 2801bffdb7..dd10ba030f 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -276,11 +276,9 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, auto &cached = it->second; if (use_graph_do_while && cached.graph_do_while_flag_dev_ptr != ctx.graph_do_while_flag_dev_ptr) { - QD_TRACE( - "graph_do_while flag pointer changed ({} -> {}), rebuilding CUDA " - "graph", - cached.graph_do_while_flag_dev_ptr, ctx.graph_do_while_flag_dev_ptr); - cuda_graph_cache_.erase(it); + QD_ERROR( + "graph_do_while condition ndarray changed between calls. 
" + "Reuse the same ndarray for the condition parameter across calls."); } else { if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().memcpy_host_to_device( diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 6bdeb7ac92..285ea27af3 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import quadrants as qd from quadrants.lang import impl @@ -141,14 +142,8 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n @test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_replay_new_ndarray(): - """Test graph_do_while replay when the counter ndarray is a different allocation. - - Regression test: the condition kernel's flag pointer was baked into the - CUDA graph at creation time. Passing a new ndarray (different device - address) on replay would cause the condition kernel to read stale memory. - The fix invalidates the cached graph when the flag pointer changes. 
- """ +def test_graph_do_while_replay_new_ndarray_raises(): + """Passing a different ndarray for the condition parameter should raise.""" N = 16 @qd.kernel(graph_do_while="counter") @@ -160,24 +155,13 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n x = qd.ndarray(qd.i32, shape=(N,)) - # First call with one counter ndarray counter1 = qd.ndarray(qd.i32, shape=()) x.from_numpy(np.zeros(N, dtype=np.int32)) counter1.from_numpy(np.array(3, dtype=np.int32)) inc(x, counter1) assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) - assert counter1.to_numpy() == 0 - # Second call with a NEW counter ndarray (different device allocation) counter2 = qd.ndarray(qd.i32, shape=()) - x.from_numpy(np.zeros(N, dtype=np.int32)) counter2.from_numpy(np.array(5, dtype=np.int32)) - inc(x, counter2) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - qd.sync() - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 5, dtype=np.int32)) - assert counter2.to_numpy() == 0 + with pytest.raises(RuntimeError, match="condition ndarray changed"): + inc(x, counter2) From 3402fae26c0ee5795dd61575838ee67a67a98842 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 08:38:14 -0400 Subject: [PATCH 065/128] Extract add_conditional_while_node from launch_llvm_kernel_graph Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 91 ++++++++++++---------- quadrants/runtime/cuda/kernel_launcher.h | 2 + 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index dd10ba030f..b1b2ef1d58 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -245,6 +245,39 @@ void KernelLauncher::ensure_condition_kernel_loaded() { cubin_size); } +void *KernelLauncher::add_conditional_while_node( + void 
*graph, unsigned long long *cond_handle_out) { + ensure_condition_kernel_loaded(); + QD_ASSERT(cond_kernel_func_); + + void *cu_ctx = CUDAContext::get_instance().get_context(); + + CUDADriver::get_instance().graph_conditional_handle_create( + cond_handle_out, graph, cu_ctx, + /*defaultLaunchValue=*/1, + /*flags=CU_GRAPH_COND_ASSIGN_DEFAULT=*/1); + + CudaGraphNodeParams cond_node_params{}; + cond_node_params.type = 13; // CU_GRAPH_NODE_TYPE_CONDITIONAL + cond_node_params.handle = *cond_handle_out; + cond_node_params.condType = 1; // CU_GRAPH_COND_TYPE_WHILE + cond_node_params.size = 1; + cond_node_params.phGraph_out = nullptr; // CUDA will populate this + cond_node_params.ctx = cu_ctx; + + void *cond_node = nullptr; + CUDADriver::get_instance().graph_add_node(&cond_node, graph, nullptr, 0, + &cond_node_params); + + // CUDA replaces phGraph_out with a pointer to its owned array + void **body_graphs = (void **)cond_node_params.phGraph_out; + QD_ASSERT(body_graphs && body_graphs[0]); + + QD_TRACE("CUDA graph_do_while: conditional node created, body graph={}", + body_graphs[0]); + return body_graphs[0]; +} + bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx) { int launch_id = handle.get_launch_id(); @@ -274,21 +307,21 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, auto it = cuda_graph_cache_.find(launch_id); if (it != cuda_graph_cache_.end()) { auto &cached = it->second; - if (use_graph_do_while && - cached.graph_do_while_flag_dev_ptr != ctx.graph_do_while_flag_dev_ptr) { - QD_ERROR( - "graph_do_while condition ndarray changed between calls. 
" - "Reuse the same ndarray for the condition parameter across calls."); - } else { - if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().memcpy_host_to_device( - cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, - cached.arg_buffer_size); - } - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); - return true; + QD_ERROR_IF( + use_graph_do_while && + cached.graph_do_while_flag_dev_ptr != + ctx.graph_do_while_flag_dev_ptr, + "graph_do_while condition ndarray changed between calls. " + "Reuse the same ndarray for the condition parameter across calls."); + + if (ctx.arg_buffer_size > 0) { + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); } + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + return true; } CUDAContext::get_instance().make_current(); @@ -330,35 +363,7 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, unsigned long long cond_handle = 0; if (use_graph_do_while) { - ensure_condition_kernel_loaded(); - QD_ASSERT(cond_kernel_func_); - - void *cu_ctx = CUDAContext::get_instance().get_context(); - - CUDADriver::get_instance().graph_conditional_handle_create( - &cond_handle, graph, cu_ctx, - /*defaultLaunchValue=*/1, - /*flags=CU_GRAPH_COND_ASSIGN_DEFAULT=*/1); - - CudaGraphNodeParams cond_node_params{}; - cond_node_params.type = 13; // CU_GRAPH_NODE_TYPE_CONDITIONAL - cond_node_params.handle = cond_handle; - cond_node_params.condType = 1; // CU_GRAPH_COND_TYPE_WHILE - cond_node_params.size = 1; - cond_node_params.phGraph_out = nullptr; // CUDA will populate this - cond_node_params.ctx = cu_ctx; - - void *cond_node = nullptr; - CUDADriver::get_instance().graph_add_node(&cond_node, graph, nullptr, 0, - &cond_node_params); - - // CUDA replaces phGraph_out with a pointer to its 
owned array - void **body_graphs = (void **)cond_node_params.phGraph_out; - QD_ASSERT(body_graphs && body_graphs[0]); - kernel_target_graph = body_graphs[0]; - - QD_TRACE("CUDA graph_do_while: conditional node created, body graph={}", - kernel_target_graph); + kernel_target_graph = add_conditional_while_node(graph, &cond_handle); } // Add work kernel nodes to the target graph diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 2605cbbb62..ebf60e7109 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -89,6 +89,8 @@ class KernelLauncher : public LLVM::KernelLauncher { const std::vector> ¶meters); bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); void ensure_condition_kernel_loaded(); + void *add_conditional_while_node(void *graph, + unsigned long long *cond_handle_out); std::vector contexts_; // Keyed by launch_id, which uniquely identifies a compiled kernel variant // (each template specialization gets its own launch_id). From 0603fd5a3c3b35207d316023e85e8e6f5eb7ba7d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 08:38:51 -0400 Subject: [PATCH 066/128] Add comments to link_state and conditional graph structure Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index b1b2ef1d58..03015baa57 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -221,6 +221,8 @@ void KernelLauncher::ensure_condition_kernel_loaded() { "graph_do_while requires libcudadevrt.a but it was not found. " "Install the CUDA toolkit and/or set CUDA_HOME."); + // CUlinkState handle for the JIT linker session that combines our PTX + // with libcudadevrt.a to resolve the cudaGraphSetConditional extern. 
void *link_state = nullptr; driver.link_create(0, nullptr, nullptr, &link_state); @@ -357,8 +359,16 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, void *graph = nullptr; CUDADriver::get_instance().graph_create(&graph, 0); - // Determine the target graph for kernel nodes. - // With graph_do_while, kernels go into the conditional while body graph. + // Target graph for kernel nodes. Without graph_do_while, work kernels go + // directly into the top-level graph. With graph_do_while, they go into + // a body graph inside a conditional while node: + // + // Top-level graph + // └── Conditional while node (repeats while flag != 0) + // └── Body graph + // ├── Work kernel 1 + // ├── Work kernel 2 + // └── Condition kernel (reads flag, calls cudaGraphSetConditional) void *kernel_target_graph = graph; unsigned long long cond_handle = 0; From 03e8142593f1a47c0ead31779c1e109e3a85e8b0 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 08:40:57 -0400 Subject: [PATCH 067/128] Error instead of fallback when graph_do_while has host-resident ndarrays Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 03015baa57..f4cd63017d 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -298,14 +298,16 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, "cuda_graph=True is not supported for kernels with struct return " "values; remove cuda_graph=True or avoid returning values"); + const bool use_graph_do_while = ctx.graph_do_while_arg_id >= 0; + // Falls back to the normal path if any external array is host-resident, // since the graph path cannot perform host-to-device transfers. 
if (!resolve_ctx_ndarray_ptrs(ctx, parameters)) { + QD_ERROR_IF(use_graph_do_while, + "graph_do_while requires all ndarrays to be device-resident"); return false; } - const bool use_graph_do_while = ctx.graph_do_while_arg_id >= 0; - auto it = cuda_graph_cache_.find(launch_id); if (it != cuda_graph_cache_.end()) { auto &cached = it->second; From 0c481e8465d1c917d41a6a74a079d6c2bb860d22 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 08:43:00 -0400 Subject: [PATCH 068/128] Extract add_kernel_node helper to deduplicate graph kernel node creation Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 70 ++++++++++------------ quadrants/runtime/cuda/kernel_launcher.h | 3 + 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index f4cd63017d..4c0c5061c8 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -247,6 +247,30 @@ void KernelLauncher::ensure_condition_kernel_loaded() { cubin_size); } +void *KernelLauncher::add_kernel_node(void *graph, void *prev_node, void *func, + unsigned int grid_dim, + unsigned int block_dim, + unsigned int shared_mem, + void **kernel_params) { + CudaKernelNodeParams params{}; + params.func = func; + params.gridDimX = grid_dim; + params.gridDimY = 1; + params.gridDimZ = 1; + params.blockDimX = block_dim; + params.blockDimY = 1; + params.blockDimZ = 1; + params.sharedMemBytes = shared_mem; + params.kernelParams = kernel_params; + params.extra = nullptr; + + void *node = nullptr; + CUDADriver::get_instance().graph_add_kernel_node( + &node, graph, prev_node ? &prev_node : nullptr, prev_node ? 
1 : 0, + ¶ms); + return node; +} + void *KernelLauncher::add_conditional_while_node( void *graph, unsigned long long *cond_handle_out) { ensure_condition_kernel_loaded(); @@ -381,30 +405,12 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, // Add work kernel nodes to the target graph void *prev_node = nullptr; for (const auto &task : offloaded_tasks) { - void *func = cuda_module->lookup_function(task.name); - void *ctx_ptr = &cached.persistent_ctx; - CudaKernelNodeParams node_params{}; - node_params.func = func; - node_params.gridDimX = (unsigned int)task.grid_dim; - node_params.gridDimY = 1; - node_params.gridDimZ = 1; - node_params.blockDimX = (unsigned int)task.block_dim; - node_params.blockDimY = 1; - node_params.blockDimZ = 1; - node_params.sharedMemBytes = (unsigned int)task.dynamic_shared_array_bytes; - node_params.kernelParams = &ctx_ptr; - // kernelParams and extra are two mutually exclusive ways of passing - // arguments to a CUDA kernel; we use kernelParams, so extra is null. - node_params.extra = nullptr; - - void *node = nullptr; - const void *deps = prev_node; - std::size_t num_deps = prev_node ? 1 : 0; - CUDADriver::get_instance().graph_add_kernel_node( - &node, kernel_target_graph, prev_node ? 
&deps : nullptr, num_deps, - &node_params); - prev_node = node; + prev_node = add_kernel_node( + kernel_target_graph, prev_node, + cuda_module->lookup_function(task.name), + (unsigned int)task.grid_dim, (unsigned int)task.block_dim, + (unsigned int)task.dynamic_shared_array_bytes, &ctx_ptr); } // For graph_do_while: add condition kernel as the last node in the body graph @@ -414,22 +420,8 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, void *flag_ptr = ctx.graph_do_while_flag_dev_ptr; void *cond_args[2] = {&cond_handle, &flag_ptr}; - CudaKernelNodeParams cond_kp{}; - cond_kp.func = cond_kernel_func_; - cond_kp.gridDimX = 1; - cond_kp.gridDimY = 1; - cond_kp.gridDimZ = 1; - cond_kp.blockDimX = 1; - cond_kp.blockDimY = 1; - cond_kp.blockDimZ = 1; - cond_kp.sharedMemBytes = 0; - cond_kp.kernelParams = cond_args; - cond_kp.extra = nullptr; - - void *cond_kernel_node = nullptr; - CUDADriver::get_instance().graph_add_kernel_node( - &cond_kernel_node, kernel_target_graph, - prev_node ? &prev_node : nullptr, prev_node ? 1 : 0, &cond_kp); + add_kernel_node(kernel_target_graph, prev_node, cond_kernel_func_, + 1, 1, 0, cond_args); } // --- Instantiate and launch --- diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index ebf60e7109..250b00fdc0 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -91,6 +91,9 @@ class KernelLauncher : public LLVM::KernelLauncher { void ensure_condition_kernel_loaded(); void *add_conditional_while_node(void *graph, unsigned long long *cond_handle_out); + void *add_kernel_node(void *graph, void *prev_node, void *func, + unsigned int grid_dim, unsigned int block_dim, + unsigned int shared_mem, void **kernel_params); std::vector contexts_; // Keyed by launch_id, which uniquely identifies a compiled kernel variant // (each template specialization gets its own launch_id). 
From e223689509cf3cfe4e05a83a9d66d2231a5ac2cc Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 08:49:44 -0400 Subject: [PATCH 069/128] Add comment explaining why condition kernel must be last in body graph Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 4c0c5061c8..e1aab41c40 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -395,6 +395,11 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, // ├── Work kernel 1 // ├── Work kernel 2 // └── Condition kernel (reads flag, calls cudaGraphSetConditional) + // + // The condition kernel must be the last node in the body graph. It reads the + // flag after the work kernels have updated it, so the loop-continue decision + // reflects this iteration's result. Putting it first would cause an extra + // iteration: the condition would see the flag from before the work ran. 
void *kernel_target_graph = graph; unsigned long long cond_handle = 0; @@ -413,7 +418,6 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, (unsigned int)task.dynamic_shared_array_bytes, &ctx_ptr); } - // For graph_do_while: add condition kernel as the last node in the body graph if (use_graph_do_while) { QD_ASSERT(ctx.graph_do_while_flag_dev_ptr); From ff2d2abb93aff918967d757e56dca0c954b9c0a5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 08:51:34 -0400 Subject: [PATCH 070/128] Add comment for conditional node in body graph Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index e1aab41c40..c3f8cba9a6 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -419,6 +419,7 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, } if (use_graph_do_while) { + // add conditional node into the body graph QD_ASSERT(ctx.graph_do_while_flag_dev_ptr); void *flag_ptr = ctx.graph_do_while_flag_dev_ptr; From b11fe2698f493eb5aee78b30ea58ed7e5b5320aa Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 08:53:21 -0400 Subject: [PATCH 071/128] Add comment explaining cached graph_do_while_flag_dev_ptr Made-with: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index c3f8cba9a6..e8ec5138b1 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -445,6 +445,11 @@ bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, use_graph_do_while ? " (with graph_do_while)" : ""); if (use_graph_do_while) { + // Save the flag pointer so we can detect if the user passes a different + // ndarray on a later call. 
The flag's device pointer is baked into the + // CUDA graph as a condition kernel argument; if the user later calls with + // a different ndarray, the graph would still read from the old pointer, + // so we error out instead of silently producing wrong results. cached.graph_do_while_flag_dev_ptr = ctx.graph_do_while_flag_dev_ptr; } cuda_graph_cache_.emplace(launch_id, std::move(cached)); From ee5b0b8a3949c568c3fcfaa5bed9ea7ee0a7c2f2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:01:31 -0400 Subject: [PATCH 072/128] Extract CudaGraphManager from KernelLauncher into separate class Made-with: Cursor --- quadrants/runtime/cuda/CMakeLists.txt | 1 + quadrants/runtime/cuda/cuda_graph_manager.cpp | 453 +++++++++++++++++ quadrants/runtime/cuda/cuda_graph_manager.h | 105 ++++ quadrants/runtime/cuda/kernel_launcher.cpp | 464 +----------------- quadrants/runtime/cuda/kernel_launcher.h | 78 +-- 5 files changed, 574 insertions(+), 527 deletions(-) create mode 100644 quadrants/runtime/cuda/cuda_graph_manager.cpp create mode 100644 quadrants/runtime/cuda/cuda_graph_manager.h diff --git a/quadrants/runtime/cuda/CMakeLists.txt b/quadrants/runtime/cuda/CMakeLists.txt index 961b895b88..a5c4eeac6e 100644 --- a/quadrants/runtime/cuda/CMakeLists.txt +++ b/quadrants/runtime/cuda/CMakeLists.txt @@ -3,6 +3,7 @@ add_library(cuda_runtime) target_sources(cuda_runtime PRIVATE + cuda_graph_manager.cpp jit_cuda.cpp kernel_launcher.cpp ptx_cache.cpp diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp new file mode 100644 index 0000000000..0ca2ea6896 --- /dev/null +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -0,0 +1,453 @@ +#include "quadrants/runtime/cuda/cuda_graph_manager.h" +#include "quadrants/rhi/cuda/cuda_context.h" + +#include +#include +#include +#include + +namespace quadrants::lang { +namespace cuda { + +// Condition kernel for graph_do_while. 
Reads the user's i32 loop-control flag +// from GPU memory and tells the CUDA graph's conditional while node whether to +// run another iteration — all without returning to the host. +// +// Parameters: +// param_0: conditional node handle (passed to cudaGraphSetConditional) +// param_1: pointer to the user's qd.i32 flag ndarray on the GPU +// +// Compiled from CUDA C with: nvcc -ptx -arch=sm_90 -rdc=true +// Requires SM 9.0+ (Hopper) for cudaGraphSetConditional / conditional nodes. +// Requires JIT linking with libcudadevrt.a at runtime. +static const char *kConditionKernelPTX = R"PTX( +.version 8.8 +.target sm_90 +.address_size 64 + +// Declare the device-side cudaGraphSetConditional function (from libcudadevrt). +// Takes a conditional node handle (u64) and a boolean (u32: 1=continue, 0=stop). +.extern .func cudaGraphSetConditional +( + .param .b64 cudaGraphSetConditional_param_0, + .param .b32 cudaGraphSetConditional_param_1 +) +; + +// Entry point: called by the CUDA graph's conditional while node each iteration. +// param_0 (u64): conditional node handle +// param_1 (u64): pointer to the user's qd.i32 flag in GPU global memory +.visible .entry _qd_graph_do_while_cond( + .param .u64 _qd_graph_do_while_cond_param_0, + .param .u64 _qd_graph_do_while_cond_param_1 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<3>; + .reg .b64 %rd<4>; + + // Load the two kernel parameters into registers: + // %rd1 = conditional node handle + // %rd2 = pointer to user's i32 flag + ld.param.u64 %rd1, [_qd_graph_do_while_cond_param_0]; + ld.param.u64 %rd2, [_qd_graph_do_while_cond_param_1]; + + // Convert generic pointer to global address space, then read the flag value + cvta.to.global.u64 %rd3, %rd2; + ld.global.u32 %r1, [%rd3]; + + // Convert flag to boolean: %r2 = (flag != 0) ? 1 : 0 + setp.ne.s32 %p1, %r1, 0; + selp.u32 %r2, 1, 0, %p1; + + // Tell the conditional while node whether to loop again or stop. 
+ // cudaGraphSetConditional(handle, should_continue) + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b32 param1; + st.param.b32 [param1+0], %r2; + call.uni cudaGraphSetConditional, (param0, param1); + } // callseq 0 + ret; +} +)PTX"; + +CachedCudaGraph::~CachedCudaGraph() { + if (graph_exec) { + CUDADriver::get_instance().graph_exec_destroy(graph_exec); + } + if (persistent_device_arg_buffer) { + CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); + } + if (persistent_device_result_buffer) { + CUDADriver::get_instance().mem_free(persistent_device_result_buffer); + } +} + +CachedCudaGraph::CachedCudaGraph(CachedCudaGraph &&other) noexcept + : graph_exec(other.graph_exec), + persistent_device_arg_buffer(other.persistent_device_arg_buffer), + persistent_device_result_buffer(other.persistent_device_result_buffer), + persistent_ctx(other.persistent_ctx), + arg_buffer_size(other.arg_buffer_size), + result_buffer_size(other.result_buffer_size) { + other.graph_exec = nullptr; + other.persistent_device_arg_buffer = nullptr; + other.persistent_device_result_buffer = nullptr; +} + +CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { + if (this != &other) { + if (graph_exec) + CUDADriver::get_instance().graph_exec_destroy(graph_exec); + if (persistent_device_arg_buffer) + CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); + if (persistent_device_result_buffer) + CUDADriver::get_instance().mem_free(persistent_device_result_buffer); + + graph_exec = other.graph_exec; + persistent_device_arg_buffer = other.persistent_device_arg_buffer; + persistent_device_result_buffer = other.persistent_device_result_buffer; + persistent_ctx = other.persistent_ctx; + arg_buffer_size = other.arg_buffer_size; + result_buffer_size = other.result_buffer_size; + + other.graph_exec = nullptr; + other.persistent_device_arg_buffer = nullptr; + other.persistent_device_result_buffer 
= nullptr; + } + return *this; +} + +bool CudaGraphManager::on_cuda_device(void *ptr) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); + + return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; +} + +// Resolves ndarray parameter handles in the launch context to raw device +// pointers, writing them into the arg buffer via set_ndarray_ptrs. +// +// Unlike the normal launch path, this does not handle host-resident arrays +// (no temporary device allocation or host-to-device transfer). Returns false +// if any external array is on the host, signaling the caller to fall back +// to the non-graph launch path. +bool CudaGraphManager::resolve_ctx_ndarray_ptrs( + LaunchContextBuilder &ctx, + const std::vector> ¶meters, + LlvmRuntimeExecutor *executor) { + for (int i = 0; i < (int)parameters.size(); i++) { + const auto &kv = parameters[i]; + const auto &arg_id = kv.first; + const auto ¶meter = kv.second; + if (parameter.is_array) { + const auto arr_sz = ctx.array_runtime_sizes[arg_id]; + if (arr_sz == 0) + continue; + + ArgArrayPtrKey data_ptr_idx{arg_id, TypeFactory::DATA_PTR_POS_IN_NDARRAY}; + ArgArrayPtrKey grad_ptr_idx{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}; + auto data_ptr = ctx.array_ptrs[data_ptr_idx]; + auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; + + QD_ERROR_IF(grad_ptr != nullptr, + "cuda_graph does not support autograd; " + "ndarray arg {} has a non-null gradient pointer", arg_id); + + // Raw device pointer to the array data, resolved from either an + // external array (raw pointer) or a DeviceAllocation handle. 
+ void *resolved_data = nullptr; + + if (ctx.device_allocation_type[arg_id] == + LaunchContextBuilder::DevAllocType::kNone) { + if (!on_cuda_device(data_ptr)) { + return false; + } + resolved_data = data_ptr; + } else if (arr_sz > 0) { + DeviceAllocation *ptr = static_cast(data_ptr); + resolved_data = executor->get_device_alloc_info_ptr(*ptr); + } + + if (resolved_data) { + ctx.set_ndarray_ptrs(arg_id, (uint64)resolved_data, (uint64) nullptr); + if (arg_id == ctx.graph_do_while_arg_id) { + ctx.graph_do_while_flag_dev_ptr = resolved_data; + } + } + } + } + return true; +} + +// Lazily JIT-compiles and loads the graph_do_while condition kernel. +// Links the PTX (kConditionKernelPTX) with libcudadevrt.a to produce a cubin, +// then loads the _qd_graph_do_while_cond function for use in conditional +// while nodes. Only called once; subsequent calls are no-ops. +void CudaGraphManager::ensure_condition_kernel_loaded() { + if (cond_kernel_func_) + return; + + int cc = CUDAContext::get_instance().get_compute_capability(); + QD_ERROR_IF(cc < 90, + "graph_do_while requires SM 9.0+ (Hopper), but this device is " + "SM {}.", + cc); + + auto &driver = CUDADriver::get_instance(); + + std::string cudadevrt_path; + std::vector candidates; + for (const char *env_name : {"CUDA_HOME", "CUDA_PATH"}) { + if (const char *env_val = std::getenv(env_name)) { + candidates.push_back(std::string(env_val) + "/lib64/libcudadevrt.a"); + candidates.push_back(std::string(env_val) + "/lib/libcudadevrt.a"); + } + } + candidates.push_back("/usr/local/cuda/lib64/libcudadevrt.a"); + candidates.push_back("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"); + for (const auto &candidate : candidates) { + if (std::filesystem::exists(candidate)) { + cudadevrt_path = candidate; + break; + } + } + QD_ERROR_IF(cudadevrt_path.empty(), + "graph_do_while requires libcudadevrt.a but it was not found. 
" + "Install the CUDA toolkit and/or set CUDA_HOME."); + + // CUlinkState handle for the JIT linker session that combines our PTX + // with libcudadevrt.a to resolve the cudaGraphSetConditional extern. + void *link_state = nullptr; + driver.link_create(0, nullptr, nullptr, &link_state); + + std::size_t ptx_len = std::strlen(kConditionKernelPTX) + 1; + driver.link_add_data(link_state, /*CU_JIT_INPUT_PTX=*/1, + const_cast(kConditionKernelPTX), ptx_len, + /*name=*/"qd_cond", 0, nullptr, nullptr); + + driver.link_add_file(link_state, /*CU_JIT_INPUT_LIBRARY=*/4, + cudadevrt_path.c_str(), 0, nullptr, nullptr); + + void *cubin = nullptr; + std::size_t cubin_size = 0; + driver.link_complete(link_state, &cubin, &cubin_size); + + driver.module_load_data(&cond_kernel_module_, cubin); + driver.module_get_function(&cond_kernel_func_, cond_kernel_module_, + "_qd_graph_do_while_cond"); + driver.link_destroy(link_state); + + QD_TRACE("Loaded graph_do_while condition kernel ({} bytes cubin)", + cubin_size); +} + +void *CudaGraphManager::add_kernel_node(void *graph, void *prev_node, + void *func, unsigned int grid_dim, + unsigned int block_dim, + unsigned int shared_mem, + void **kernel_params) { + CudaKernelNodeParams params{}; + params.func = func; + params.gridDimX = grid_dim; + params.gridDimY = 1; + params.gridDimZ = 1; + params.blockDimX = block_dim; + params.blockDimY = 1; + params.blockDimZ = 1; + params.sharedMemBytes = shared_mem; + params.kernelParams = kernel_params; + params.extra = nullptr; + + void *node = nullptr; + CUDADriver::get_instance().graph_add_kernel_node( + &node, graph, prev_node ? &prev_node : nullptr, prev_node ? 
1 : 0, + ¶ms); + return node; +} + +void *CudaGraphManager::add_conditional_while_node( + void *graph, unsigned long long *cond_handle_out) { + ensure_condition_kernel_loaded(); + QD_ASSERT(cond_kernel_func_); + + void *cu_ctx = CUDAContext::get_instance().get_context(); + + CUDADriver::get_instance().graph_conditional_handle_create( + cond_handle_out, graph, cu_ctx, + /*defaultLaunchValue=*/1, + /*flags=CU_GRAPH_COND_ASSIGN_DEFAULT=*/1); + + CudaGraphNodeParams cond_node_params{}; + cond_node_params.type = 13; // CU_GRAPH_NODE_TYPE_CONDITIONAL + cond_node_params.handle = *cond_handle_out; + cond_node_params.condType = 1; // CU_GRAPH_COND_TYPE_WHILE + cond_node_params.size = 1; + cond_node_params.phGraph_out = nullptr; // CUDA will populate this + cond_node_params.ctx = cu_ctx; + + void *cond_node = nullptr; + CUDADriver::get_instance().graph_add_node(&cond_node, graph, nullptr, 0, + &cond_node_params); + + // CUDA replaces phGraph_out with a pointer to its owned array + void **body_graphs = (void **)cond_node_params.phGraph_out; + QD_ASSERT(body_graphs && body_graphs[0]); + + QD_TRACE("CUDA graph_do_while: conditional node created, body graph={}", + body_graphs[0]); + return body_graphs[0]; +} + +bool CudaGraphManager::try_launch( + int launch_id, LaunchContextBuilder &ctx, JITModule *cuda_module, + const std::vector> ¶meters, + const std::vector &offloaded_tasks, + LlvmRuntimeExecutor *executor) { + if (offloaded_tasks.empty()) { + return false; + } + + QD_ERROR_IF(ctx.result_buffer_size > 0, + "cuda_graph=True is not supported for kernels with struct return " + "values; remove cuda_graph=True or avoid returning values"); + + const bool use_graph_do_while = ctx.graph_do_while_arg_id >= 0; + + // Falls back to the normal path if any external array is host-resident, + // since the graph path cannot perform host-to-device transfers. 
+ if (!resolve_ctx_ndarray_ptrs(ctx, parameters, executor)) { + QD_ERROR_IF(use_graph_do_while, + "graph_do_while requires all ndarrays to be device-resident"); + return false; + } + + auto it = cache_.find(launch_id); + if (it != cache_.end()) { + auto &cached = it->second; + QD_ERROR_IF( + use_graph_do_while && + cached.graph_do_while_flag_dev_ptr != + ctx.graph_do_while_flag_dev_ptr, + "graph_do_while condition ndarray changed between calls. " + "Reuse the same ndarray for the condition parameter across calls."); + + if (ctx.arg_buffer_size > 0) { + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); + } + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + used_on_last_call_ = true; + return true; + } + + CUDAContext::get_instance().make_current(); + + CachedCudaGraph cached; + + // --- Allocate persistent buffers --- + cached.result_buffer_size = std::max(ctx.result_buffer_size, sizeof(uint64)); + CUDADriver::get_instance().malloc( + (void **)&cached.persistent_device_result_buffer, + cached.result_buffer_size); + + cached.arg_buffer_size = ctx.arg_buffer_size; + if (cached.arg_buffer_size > 0) { + CUDADriver::get_instance().malloc( + (void **)&cached.persistent_device_arg_buffer, cached.arg_buffer_size); + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); + } + + // --- Build persistent RuntimeContext --- + cached.persistent_ctx.runtime = executor->get_llvm_runtime(); + cached.persistent_ctx.arg_buffer = cached.persistent_device_arg_buffer; + cached.persistent_ctx.result_buffer = + (uint64 *)cached.persistent_device_result_buffer; + cached.persistent_ctx.cpu_thread_id = 0; + + // --- Build CUDA graph --- + void *graph = nullptr; + CUDADriver::get_instance().graph_create(&graph, 0); + + // Target 
graph for kernel nodes. Without graph_do_while, work kernels go + // directly into the top-level graph. With graph_do_while, they go into + // a body graph inside a conditional while node: + // + // Top-level graph + // └── Conditional while node (repeats while flag != 0) + // └── Body graph + // ├── Work kernel 1 + // ├── Work kernel 2 + // └── Condition kernel (reads flag, calls cudaGraphSetConditional) + // + // The condition kernel must be the last node in the body graph. It reads the + // flag after the work kernels have updated it, so the loop-continue decision + // reflects this iteration's result. Putting it first would cause an extra + // iteration: the condition would see the flag from before the work ran. + void *kernel_target_graph = graph; + unsigned long long cond_handle = 0; + + if (use_graph_do_while) { + kernel_target_graph = add_conditional_while_node(graph, &cond_handle); + } + + // Add work kernel nodes to the target graph + void *prev_node = nullptr; + for (const auto &task : offloaded_tasks) { + void *ctx_ptr = &cached.persistent_ctx; + prev_node = add_kernel_node( + kernel_target_graph, prev_node, + cuda_module->lookup_function(task.name), + (unsigned int)task.grid_dim, (unsigned int)task.block_dim, + (unsigned int)task.dynamic_shared_array_bytes, &ctx_ptr); + } + + if (use_graph_do_while) { + // add conditional node into the body graph + QD_ASSERT(ctx.graph_do_while_flag_dev_ptr); + + void *flag_ptr = ctx.graph_do_while_flag_dev_ptr; + void *cond_args[2] = {&cond_handle, &flag_ptr}; + + add_kernel_node(kernel_target_graph, prev_node, cond_kernel_func_, + 1, 1, 0, cond_args); + } + + // --- Instantiate and launch --- + CUDADriver::get_instance().graph_instantiate(&cached.graph_exec, graph, + nullptr, nullptr, 0); + + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + + CUDADriver::get_instance().graph_destroy(graph); + + QD_TRACE( + "CUDA graph created with {} 
kernel nodes for launch_id={}" + "{}", + offloaded_tasks.size(), launch_id, + use_graph_do_while ? " (with graph_do_while)" : ""); + + if (use_graph_do_while) { + // Save the flag pointer so we can detect if the user passes a different + // ndarray on a later call. The flag's device pointer is baked into the + // CUDA graph as a condition kernel argument; if the user later calls with + // a different ndarray, the graph would still read from the old pointer, + // so we error out instead of silently producing wrong results. + cached.graph_do_while_flag_dev_ptr = ctx.graph_do_while_flag_dev_ptr; + } + cache_.emplace(launch_id, std::move(cached)); + used_on_last_call_ = true; + return true; +} + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h new file mode 100644 index 0000000000..999dbc9828 --- /dev/null +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -0,0 +1,105 @@ +#pragma once + +#include +#include +#include +#include + +#include "quadrants/codegen/llvm/compiled_kernel_data.h" +#include "quadrants/runtime/llvm/llvm_runtime_executor.h" + +namespace quadrants::lang { +namespace cuda { + +struct CudaKernelNodeParams { + void *func; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + void **kernelParams; + void **extra; +}; + +// Mirrors CUDA driver API CUgraphNodeParams / CUDA_CONDITIONAL_NODE_PARAMS. +// Field order verified against cuda-python bindings (handle, type, size, +// phGraph_out, ctx). Introduced in CUDA 12.4; layout stable through 13.2+. 
+struct CudaGraphNodeParams { + unsigned int type; // CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 + int reserved0[3]; + // Union starts at offset 16 (232 bytes total) + unsigned long long handle; // CUgraphConditionalHandle + unsigned int condType; // CU_GRAPH_COND_TYPE_WHILE = 1 + unsigned int size; // 1 for while + void *phGraph_out; // CUgraph* output array + void *ctx; // CUcontext + char _pad[232 - 8 - 4 - 4 - 8 - 8]; + long long reserved2; +}; +static_assert( + sizeof(CudaGraphNodeParams) == 256, + "CudaGraphNodeParams layout must match CUgraphNodeParams (256 bytes)"); + +struct CachedCudaGraph { + // CUgraphExec handle (typed as void* since driver API is loaded dynamically). + // This is the instantiated, launchable form of the captured CUDA graph. + void *graph_exec{nullptr}; + char *persistent_device_arg_buffer{nullptr}; + char *persistent_device_result_buffer{nullptr}; + RuntimeContext persistent_ctx{}; + std::size_t arg_buffer_size{0}; + std::size_t result_buffer_size{0}; + void *graph_do_while_flag_dev_ptr{nullptr}; + + CachedCudaGraph() = default; + ~CachedCudaGraph(); + CachedCudaGraph(const CachedCudaGraph &) = delete; + CachedCudaGraph &operator=(const CachedCudaGraph &) = delete; + CachedCudaGraph(CachedCudaGraph &&other) noexcept; + CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; +}; + +class CudaGraphManager { + public: + // Attempts to launch the kernel via a cached or newly built CUDA graph. + // Returns true on success; false if the graph path can't be used (e.g. + // host-resident ndarrays) and the caller should fall back to normal launch. + // Internally tracks whether the graph was used, queryable via used_on_last_call(). 
+ bool try_launch(int launch_id, LaunchContextBuilder &ctx, + JITModule *cuda_module, + const std::vector> ¶meters, + const std::vector &offloaded_tasks, + LlvmRuntimeExecutor *executor); + + void mark_not_used() { used_on_last_call_ = false; } + std::size_t cache_size() const { return cache_.size(); } + bool used_on_last_call() const { return used_on_last_call_; } + + private: + bool on_cuda_device(void *ptr); + bool resolve_ctx_ndarray_ptrs( + LaunchContextBuilder &ctx, + const std::vector> ¶meters, + LlvmRuntimeExecutor *executor); + void ensure_condition_kernel_loaded(); + void *add_conditional_while_node(void *graph, + unsigned long long *cond_handle_out); + void *add_kernel_node(void *graph, void *prev_node, void *func, + unsigned int grid_dim, unsigned int block_dim, + unsigned int shared_mem, void **kernel_params); + + // Keyed by launch_id, which uniquely identifies a compiled kernel variant + // (each template specialization gets its own launch_id). + std::unordered_map cache_; + bool used_on_last_call_{false}; + + // JIT-compiled condition kernel for graph_do_while conditional nodes + void *cond_kernel_module_{nullptr}; // CUmodule + void *cond_kernel_func_{nullptr}; // CUfunction +}; + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index e8ec5138b1..da497f37b0 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,472 +1,25 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" -#include -#include -#include #include namespace quadrants::lang { namespace cuda { -// PTX for a tiny condition kernel that reads a device-side int32 flag and -// Condition kernel for graph_do_while. 
Reads the user's i32 loop-control flag -// from GPU memory and tells the CUDA graph's conditional while node whether to -// run another iteration — all without returning to the host. -// -// Parameters: -// param_0: conditional node handle (passed to cudaGraphSetConditional) -// param_1: pointer to the user's qd.i32 flag ndarray on the GPU -// -// Compiled from CUDA C with: nvcc -ptx -arch=sm_90 -rdc=true -// Requires SM 9.0+ (Hopper) for cudaGraphSetConditional / conditional nodes. -// Requires JIT linking with libcudadevrt.a at runtime. -static const char *kConditionKernelPTX = R"PTX( -.version 8.8 -.target sm_90 -.address_size 64 - -// Declare the device-side cudaGraphSetConditional function (from libcudadevrt). -// Takes a conditional node handle (u64) and a boolean (u32: 1=continue, 0=stop). -.extern .func cudaGraphSetConditional -( - .param .b64 cudaGraphSetConditional_param_0, - .param .b32 cudaGraphSetConditional_param_1 -) -; - -// Entry point: called by the CUDA graph's conditional while node each iteration. -// param_0 (u64): conditional node handle -// param_1 (u64): pointer to the user's qd.i32 flag in GPU global memory -.visible .entry _qd_graph_do_while_cond( - .param .u64 _qd_graph_do_while_cond_param_0, - .param .u64 _qd_graph_do_while_cond_param_1 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<3>; - .reg .b64 %rd<4>; - - // Load the two kernel parameters into registers: - // %rd1 = conditional node handle - // %rd2 = pointer to user's i32 flag - ld.param.u64 %rd1, [_qd_graph_do_while_cond_param_0]; - ld.param.u64 %rd2, [_qd_graph_do_while_cond_param_1]; - - // Convert generic pointer to global address space, then read the flag value - cvta.to.global.u64 %rd3, %rd2; - ld.global.u32 %r1, [%rd3]; - - // Convert flag to boolean: %r2 = (flag != 0) ? 1 : 0 - setp.ne.s32 %p1, %r1, 0; - selp.u32 %r2, 1, 0, %p1; - - // Tell the conditional while node whether to loop again or stop. 
- // cudaGraphSetConditional(handle, should_continue) - { // callseq 0, 0 - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd1; - .param .b32 param1; - st.param.b32 [param1+0], %r2; - call.uni cudaGraphSetConditional, (param0, param1); - } // callseq 0 - ret; -} -)PTX"; - -CachedCudaGraph::~CachedCudaGraph() { - if (graph_exec) { - CUDADriver::get_instance().graph_exec_destroy(graph_exec); - } - if (persistent_device_arg_buffer) { - CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); - } - if (persistent_device_result_buffer) { - CUDADriver::get_instance().mem_free(persistent_device_result_buffer); - } -} - -CachedCudaGraph::CachedCudaGraph(CachedCudaGraph &&other) noexcept - : graph_exec(other.graph_exec), - persistent_device_arg_buffer(other.persistent_device_arg_buffer), - persistent_device_result_buffer(other.persistent_device_result_buffer), - persistent_ctx(other.persistent_ctx), - arg_buffer_size(other.arg_buffer_size), - result_buffer_size(other.result_buffer_size) { - other.graph_exec = nullptr; - other.persistent_device_arg_buffer = nullptr; - other.persistent_device_result_buffer = nullptr; -} - -CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { - if (this != &other) { - if (graph_exec) - CUDADriver::get_instance().graph_exec_destroy(graph_exec); - if (persistent_device_arg_buffer) - CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); - if (persistent_device_result_buffer) - CUDADriver::get_instance().mem_free(persistent_device_result_buffer); - - graph_exec = other.graph_exec; - persistent_device_arg_buffer = other.persistent_device_arg_buffer; - persistent_device_result_buffer = other.persistent_device_result_buffer; - persistent_ctx = other.persistent_ctx; - arg_buffer_size = other.arg_buffer_size; - result_buffer_size = other.result_buffer_size; - - other.graph_exec = nullptr; - other.persistent_device_arg_buffer = nullptr; - other.persistent_device_result_buffer 
= nullptr; - } - return *this; -} - -bool KernelLauncher::on_cuda_device(void *ptr) { - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); - - return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; -} - -// Resolves ndarray parameter handles in the launch context to raw device -// pointers, writing them into the arg buffer via set_ndarray_ptrs. -// -// Unlike the normal launch path, this does not handle host-resident arrays -// (no temporary device allocation or host-to-device transfer). Returns false -// if any external array is on the host, signaling the caller to fall back -// to the non-graph launch path. -bool KernelLauncher::resolve_ctx_ndarray_ptrs( - LaunchContextBuilder &ctx, - const std::vector> ¶meters) { - auto *executor = get_runtime_executor(); - for (int i = 0; i < (int)parameters.size(); i++) { - const auto &kv = parameters[i]; - const auto &arg_id = kv.first; - const auto ¶meter = kv.second; - if (parameter.is_array) { - const auto arr_sz = ctx.array_runtime_sizes[arg_id]; - if (arr_sz == 0) - continue; - - ArgArrayPtrKey data_ptr_idx{arg_id, TypeFactory::DATA_PTR_POS_IN_NDARRAY}; - ArgArrayPtrKey grad_ptr_idx{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}; - auto data_ptr = ctx.array_ptrs[data_ptr_idx]; - auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; - - QD_ERROR_IF(grad_ptr != nullptr, - "cuda_graph does not support autograd; " - "ndarray arg {} has a non-null gradient pointer", arg_id); - - // Raw device pointer to the array data, resolved from either an - // external array (raw pointer) or a DeviceAllocation handle. 
- void *resolved_data = nullptr; - - if (ctx.device_allocation_type[arg_id] == - LaunchContextBuilder::DevAllocType::kNone) { - if (!on_cuda_device(data_ptr)) { - return false; - } - resolved_data = data_ptr; - } else if (arr_sz > 0) { - DeviceAllocation *ptr = static_cast(data_ptr); - resolved_data = executor->get_device_alloc_info_ptr(*ptr); - } - - if (resolved_data) { - ctx.set_ndarray_ptrs(arg_id, (uint64)resolved_data, (uint64) nullptr); - if (arg_id == ctx.graph_do_while_arg_id) { - ctx.graph_do_while_flag_dev_ptr = resolved_data; - } - } - } - } - return true; -} - -// Lazily JIT-compiles and loads the graph_do_while condition kernel. -// Links the PTX (kConditionKernelPTX) with libcudadevrt.a to produce a cubin, -// then loads the _qd_graph_do_while_cond function for use in conditional -// while nodes. Only called once; subsequent calls are no-ops. -void KernelLauncher::ensure_condition_kernel_loaded() { - if (cond_kernel_func_) - return; - - int cc = CUDAContext::get_instance().get_compute_capability(); - QD_ERROR_IF(cc < 90, - "graph_do_while requires SM 9.0+ (Hopper), but this device is " - "SM {}.", - cc); - - auto &driver = CUDADriver::get_instance(); - - std::string cudadevrt_path; - std::vector candidates; - for (const char *env_name : {"CUDA_HOME", "CUDA_PATH"}) { - if (const char *env_val = std::getenv(env_name)) { - candidates.push_back(std::string(env_val) + "/lib64/libcudadevrt.a"); - candidates.push_back(std::string(env_val) + "/lib/libcudadevrt.a"); - } - } - candidates.push_back("/usr/local/cuda/lib64/libcudadevrt.a"); - candidates.push_back("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"); - for (const auto &candidate : candidates) { - if (std::filesystem::exists(candidate)) { - cudadevrt_path = candidate; - break; - } - } - QD_ERROR_IF(cudadevrt_path.empty(), - "graph_do_while requires libcudadevrt.a but it was not found. 
" - "Install the CUDA toolkit and/or set CUDA_HOME."); - - // CUlinkState handle for the JIT linker session that combines our PTX - // with libcudadevrt.a to resolve the cudaGraphSetConditional extern. - void *link_state = nullptr; - driver.link_create(0, nullptr, nullptr, &link_state); - - std::size_t ptx_len = std::strlen(kConditionKernelPTX) + 1; - driver.link_add_data(link_state, /*CU_JIT_INPUT_PTX=*/1, - const_cast(kConditionKernelPTX), ptx_len, - /*name=*/"qd_cond", 0, nullptr, nullptr); - - driver.link_add_file(link_state, /*CU_JIT_INPUT_LIBRARY=*/4, - cudadevrt_path.c_str(), 0, nullptr, nullptr); - - void *cubin = nullptr; - std::size_t cubin_size = 0; - driver.link_complete(link_state, &cubin, &cubin_size); - - driver.module_load_data(&cond_kernel_module_, cubin); - driver.module_get_function(&cond_kernel_func_, cond_kernel_module_, - "_qd_graph_do_while_cond"); - driver.link_destroy(link_state); - - QD_TRACE("Loaded graph_do_while condition kernel ({} bytes cubin)", - cubin_size); -} - -void *KernelLauncher::add_kernel_node(void *graph, void *prev_node, void *func, - unsigned int grid_dim, - unsigned int block_dim, - unsigned int shared_mem, - void **kernel_params) { - CudaKernelNodeParams params{}; - params.func = func; - params.gridDimX = grid_dim; - params.gridDimY = 1; - params.gridDimZ = 1; - params.blockDimX = block_dim; - params.blockDimY = 1; - params.blockDimZ = 1; - params.sharedMemBytes = shared_mem; - params.kernelParams = kernel_params; - params.extra = nullptr; - - void *node = nullptr; - CUDADriver::get_instance().graph_add_kernel_node( - &node, graph, prev_node ? &prev_node : nullptr, prev_node ? 
1 : 0, - ¶ms); - return node; -} - -void *KernelLauncher::add_conditional_while_node( - void *graph, unsigned long long *cond_handle_out) { - ensure_condition_kernel_loaded(); - QD_ASSERT(cond_kernel_func_); - - void *cu_ctx = CUDAContext::get_instance().get_context(); - - CUDADriver::get_instance().graph_conditional_handle_create( - cond_handle_out, graph, cu_ctx, - /*defaultLaunchValue=*/1, - /*flags=CU_GRAPH_COND_ASSIGN_DEFAULT=*/1); - - CudaGraphNodeParams cond_node_params{}; - cond_node_params.type = 13; // CU_GRAPH_NODE_TYPE_CONDITIONAL - cond_node_params.handle = *cond_handle_out; - cond_node_params.condType = 1; // CU_GRAPH_COND_TYPE_WHILE - cond_node_params.size = 1; - cond_node_params.phGraph_out = nullptr; // CUDA will populate this - cond_node_params.ctx = cu_ctx; - - void *cond_node = nullptr; - CUDADriver::get_instance().graph_add_node(&cond_node, graph, nullptr, 0, - &cond_node_params); - - // CUDA replaces phGraph_out with a pointer to its owned array - void **body_graphs = (void **)cond_node_params.phGraph_out; - QD_ASSERT(body_graphs && body_graphs[0]); - - QD_TRACE("CUDA graph_do_while: conditional node created, body graph={}", - body_graphs[0]); - return body_graphs[0]; -} - -bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, - LaunchContextBuilder &ctx) { - int launch_id = handle.get_launch_id(); - - // Populated by register_llvm_kernel, which runs before launch_llvm_kernel - // for all LLVM kernels regardless of whether the graph path is used. 
- auto &launcher_ctx = contexts_[launch_id]; - const auto ¶meters = *launcher_ctx.parameters; - const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; - - if (offloaded_tasks.empty()) { - return false; - } - - QD_ERROR_IF(ctx.result_buffer_size > 0, - "cuda_graph=True is not supported for kernels with struct return " - "values; remove cuda_graph=True or avoid returning values"); - - const bool use_graph_do_while = ctx.graph_do_while_arg_id >= 0; - - // Falls back to the normal path if any external array is host-resident, - // since the graph path cannot perform host-to-device transfers. - if (!resolve_ctx_ndarray_ptrs(ctx, parameters)) { - QD_ERROR_IF(use_graph_do_while, - "graph_do_while requires all ndarrays to be device-resident"); - return false; - } - - auto it = cuda_graph_cache_.find(launch_id); - if (it != cuda_graph_cache_.end()) { - auto &cached = it->second; - QD_ERROR_IF( - use_graph_do_while && - cached.graph_do_while_flag_dev_ptr != - ctx.graph_do_while_flag_dev_ptr, - "graph_do_while condition ndarray changed between calls. 
" - "Reuse the same ndarray for the condition parameter across calls."); - - if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().memcpy_host_to_device( - cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, - cached.arg_buffer_size); - } - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); - return true; - } - - CUDAContext::get_instance().make_current(); - - auto *executor = get_runtime_executor(); - auto *cuda_module = launcher_ctx.jit_module; - - CachedCudaGraph cached; - - // --- Allocate persistent buffers --- - cached.result_buffer_size = std::max(ctx.result_buffer_size, sizeof(uint64)); - CUDADriver::get_instance().malloc( - (void **)&cached.persistent_device_result_buffer, - cached.result_buffer_size); - - cached.arg_buffer_size = ctx.arg_buffer_size; - if (cached.arg_buffer_size > 0) { - CUDADriver::get_instance().malloc( - (void **)&cached.persistent_device_arg_buffer, cached.arg_buffer_size); - CUDADriver::get_instance().memcpy_host_to_device( - cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, - cached.arg_buffer_size); - } - - // --- Build persistent RuntimeContext --- - cached.persistent_ctx.runtime = executor->get_llvm_runtime(); - cached.persistent_ctx.arg_buffer = cached.persistent_device_arg_buffer; - cached.persistent_ctx.result_buffer = - (uint64 *)cached.persistent_device_result_buffer; - cached.persistent_ctx.cpu_thread_id = 0; - - // --- Build CUDA graph --- - void *graph = nullptr; - CUDADriver::get_instance().graph_create(&graph, 0); - - // Target graph for kernel nodes. Without graph_do_while, work kernels go - // directly into the top-level graph. 
With graph_do_while, they go into - // a body graph inside a conditional while node: - // - // Top-level graph - // └── Conditional while node (repeats while flag != 0) - // └── Body graph - // ├── Work kernel 1 - // ├── Work kernel 2 - // └── Condition kernel (reads flag, calls cudaGraphSetConditional) - // - // The condition kernel must be the last node in the body graph. It reads the - // flag after the work kernels have updated it, so the loop-continue decision - // reflects this iteration's result. Putting it first would cause an extra - // iteration: the condition would see the flag from before the work ran. - void *kernel_target_graph = graph; - unsigned long long cond_handle = 0; - - if (use_graph_do_while) { - kernel_target_graph = add_conditional_while_node(graph, &cond_handle); - } - - // Add work kernel nodes to the target graph - void *prev_node = nullptr; - for (const auto &task : offloaded_tasks) { - void *ctx_ptr = &cached.persistent_ctx; - prev_node = add_kernel_node( - kernel_target_graph, prev_node, - cuda_module->lookup_function(task.name), - (unsigned int)task.grid_dim, (unsigned int)task.block_dim, - (unsigned int)task.dynamic_shared_array_bytes, &ctx_ptr); - } - - if (use_graph_do_while) { - // add conditional node into the body graph - QD_ASSERT(ctx.graph_do_while_flag_dev_ptr); - - void *flag_ptr = ctx.graph_do_while_flag_dev_ptr; - void *cond_args[2] = {&cond_handle, &flag_ptr}; - - add_kernel_node(kernel_target_graph, prev_node, cond_kernel_func_, - 1, 1, 0, cond_args); - } - - // --- Instantiate and launch --- - CUDADriver::get_instance().graph_instantiate(&cached.graph_exec, graph, - nullptr, nullptr, 0); - - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); - - CUDADriver::get_instance().graph_destroy(graph); - - QD_TRACE( - "CUDA graph created with {} kernel nodes for launch_id={}" - "{}", - offloaded_tasks.size(), launch_id, - use_graph_do_while ? 
" (with graph_do_while)" : ""); - - if (use_graph_do_while) { - // Save the flag pointer so we can detect if the user passes a different - // ndarray on a later call. The flag's device pointer is baked into the - // CUDA graph as a condition kernel argument; if the user later calls with - // a different ndarray, the graph would still read from the old pointer, - // so we error out instead of silently producing wrong results. - cached.graph_do_while_flag_dev_ptr = ctx.graph_do_while_flag_dev_ptr; - } - cuda_graph_cache_.emplace(launch_id, std::move(cached)); - return true; -} - void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) { QD_ASSERT(handle.get_launch_id() < contexts_.size()); if (ctx.use_cuda_graph) { - if (launch_llvm_kernel_graph(handle, ctx)) { - cuda_graph_cache_used_on_last_call_ = true; + auto &lctx = contexts_[handle.get_launch_id()]; + if (graph_manager_.try_launch(handle.get_launch_id(), ctx, + lctx.jit_module, *lctx.parameters, + lctx.offloaded_tasks, + get_runtime_executor())) { return; } } - cuda_graph_cache_used_on_last_call_ = false; + graph_manager_.mark_not_used(); auto launcher_ctx = contexts_[handle.get_launch_id()]; auto *executor = get_runtime_executor(); @@ -524,7 +77,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder::DevAllocType::kNone) { // External array // Note: assuming both data & grad are on the same device - if (on_cuda_device(data_ptr)) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)data_ptr); + if (ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE) { // data_ptr is a raw ptr on CUDA device device_ptrs[data_ptr_idx] = data_ptr; device_ptrs[grad_ptr_idx] = grad_ptr; diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 250b00fdc0..0027941c42 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ 
b/quadrants/runtime/cuda/kernel_launcher.h @@ -1,65 +1,15 @@ #pragma once #include -#include +#include #include "quadrants/codegen/llvm/compiled_kernel_data.h" +#include "quadrants/runtime/cuda/cuda_graph_manager.h" #include "quadrants/runtime/llvm/kernel_launcher.h" namespace quadrants::lang { namespace cuda { -struct CudaKernelNodeParams { - void *func; - unsigned int gridDimX; - unsigned int gridDimY; - unsigned int gridDimZ; - unsigned int blockDimX; - unsigned int blockDimY; - unsigned int blockDimZ; - unsigned int sharedMemBytes; - void **kernelParams; - void **extra; -}; - -// Mirrors CUDA driver API CUgraphNodeParams / CUDA_CONDITIONAL_NODE_PARAMS. -// Field order verified against cuda-python bindings (handle, type, size, -// phGraph_out, ctx). Introduced in CUDA 12.4; layout stable through 13.2+. -struct CudaGraphNodeParams { - unsigned int type; // CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 - int reserved0[3]; - // Union starts at offset 16 (232 bytes total) - unsigned long long handle; // CUgraphConditionalHandle - unsigned int condType; // CU_GRAPH_COND_TYPE_WHILE = 1 - unsigned int size; // 1 for while - void *phGraph_out; // CUgraph* output array - void *ctx; // CUcontext - char _pad[232 - 8 - 4 - 4 - 8 - 8]; - long long reserved2; -}; -static_assert( - sizeof(CudaGraphNodeParams) == 256, - "CudaGraphNodeParams layout must match CUgraphNodeParams (256 bytes)"); - -struct CachedCudaGraph { - // CUgraphExec handle (typed as void* since driver API is loaded dynamically). - // This is the instantiated, launchable form of the captured CUDA graph. 
- void *graph_exec{nullptr}; - char *persistent_device_arg_buffer{nullptr}; - char *persistent_device_result_buffer{nullptr}; - RuntimeContext persistent_ctx{}; - std::size_t arg_buffer_size{0}; - std::size_t result_buffer_size{0}; - void *graph_do_while_flag_dev_ptr{nullptr}; - - CachedCudaGraph() = default; - ~CachedCudaGraph(); - CachedCudaGraph(const CachedCudaGraph &) = delete; - CachedCudaGraph &operator=(const CachedCudaGraph &) = delete; - CachedCudaGraph(CachedCudaGraph &&other) noexcept; - CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; -}; - class KernelLauncher : public LLVM::KernelLauncher { using Base = LLVM::KernelLauncher; @@ -76,33 +26,15 @@ class KernelLauncher : public LLVM::KernelLauncher { Handle register_llvm_kernel( const LLVM::CompiledKernelData &compiled) override; std::size_t get_cuda_graph_cache_size() const override { - return cuda_graph_cache_.size(); + return graph_manager_.cache_size(); } bool get_cuda_graph_cache_used_on_last_call() const override { - return cuda_graph_cache_used_on_last_call_; + return graph_manager_.used_on_last_call(); } private: - bool on_cuda_device(void *ptr); - bool resolve_ctx_ndarray_ptrs( - LaunchContextBuilder &ctx, - const std::vector> ¶meters); - bool launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); - void ensure_condition_kernel_loaded(); - void *add_conditional_while_node(void *graph, - unsigned long long *cond_handle_out); - void *add_kernel_node(void *graph, void *prev_node, void *func, - unsigned int grid_dim, unsigned int block_dim, - unsigned int shared_mem, void **kernel_params); std::vector contexts_; - // Keyed by launch_id, which uniquely identifies a compiled kernel variant - // (each template specialization gets its own launch_id). 
- std::unordered_map cuda_graph_cache_; - bool cuda_graph_cache_used_on_last_call_{false}; - - // JIT-compiled condition kernel for graph_do_while conditional nodes - void *cond_kernel_module_{nullptr}; // CUmodule - void *cond_kernel_func_{nullptr}; // CUfunction + CudaGraphManager graph_manager_; }; } // namespace cuda From 54c8bf07294036a49b7ae3a67e94bb517c46a24f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:08:09 -0400 Subject: [PATCH 073/128] Fix awkward string literal split in QD_TRACE Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 0ca2ea6896..d416f55526 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -430,11 +430,9 @@ bool CudaGraphManager::try_launch( CUDADriver::get_instance().graph_destroy(graph); - QD_TRACE( - "CUDA graph created with {} kernel nodes for launch_id={}" - "{}", - offloaded_tasks.size(), launch_id, - use_graph_do_while ? " (with graph_do_while)" : ""); + QD_TRACE("CUDA graph created with {} kernel nodes for launch_id={}{}", + offloaded_tasks.size(), launch_id, + use_graph_do_while ? 
" (with graph_do_while)" : ""); if (use_graph_do_while) { // Save the flag pointer so we can detect if the user passes a different From cf86442bea39dcf29d9adc341925d411b8f15438 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:12:59 -0400 Subject: [PATCH 074/128] Extract launch_cached_graph from try_launch Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 38 ++++++++++--------- quadrants/runtime/cuda/cuda_graph_manager.h | 2 + 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index d416f55526..331028082f 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -303,6 +303,26 @@ void *CudaGraphManager::add_conditional_while_node( return body_graphs[0]; } +bool CudaGraphManager::launch_cached_graph(CachedCudaGraph &cached, + LaunchContextBuilder &ctx, + bool use_graph_do_while) { + QD_ERROR_IF( + use_graph_do_while && + cached.graph_do_while_flag_dev_ptr != ctx.graph_do_while_flag_dev_ptr, + "graph_do_while condition ndarray changed between calls. 
" + "Reuse the same ndarray for the condition parameter across calls."); + + if (ctx.arg_buffer_size > 0) { + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); + } + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + used_on_last_call_ = true; + return true; +} + bool CudaGraphManager::try_launch( int launch_id, LaunchContextBuilder &ctx, JITModule *cuda_module, const std::vector> ¶meters, @@ -328,23 +348,7 @@ bool CudaGraphManager::try_launch( auto it = cache_.find(launch_id); if (it != cache_.end()) { - auto &cached = it->second; - QD_ERROR_IF( - use_graph_do_while && - cached.graph_do_while_flag_dev_ptr != - ctx.graph_do_while_flag_dev_ptr, - "graph_do_while condition ndarray changed between calls. " - "Reuse the same ndarray for the condition parameter across calls."); - - if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().memcpy_host_to_device( - cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, - cached.arg_buffer_size); - } - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); - used_on_last_call_ = true; - return true; + return launch_cached_graph(it->second, ctx, use_graph_do_while); } CUDAContext::get_instance().make_current(); diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 999dbc9828..952ce91cd3 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -79,6 +79,8 @@ class CudaGraphManager { bool used_on_last_call() const { return used_on_last_call_; } private: + bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx, + bool use_graph_do_while); bool on_cuda_device(void *ptr); bool resolve_ctx_ndarray_ptrs( LaunchContextBuilder &ctx, From 
ddb552e5a2891cefefb0cec272e743c35712ee30 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:16:27 -0400 Subject: [PATCH 075/128] Extract CudaGraphManager from KernelLauncher into separate class Made-with: Cursor --- quadrants/runtime/cuda/CMakeLists.txt | 1 + quadrants/runtime/cuda/cuda_graph_manager.cpp | 226 +++++++++++++++++ quadrants/runtime/cuda/cuda_graph_manager.h | 77 ++++++ quadrants/runtime/cuda/kernel_launcher.cpp | 230 +----------------- quadrants/runtime/cuda/kernel_launcher.h | 49 +--- 5 files changed, 321 insertions(+), 262 deletions(-) create mode 100644 quadrants/runtime/cuda/cuda_graph_manager.cpp create mode 100644 quadrants/runtime/cuda/cuda_graph_manager.h diff --git a/quadrants/runtime/cuda/CMakeLists.txt b/quadrants/runtime/cuda/CMakeLists.txt index 961b895b88..a5c4eeac6e 100644 --- a/quadrants/runtime/cuda/CMakeLists.txt +++ b/quadrants/runtime/cuda/CMakeLists.txt @@ -3,6 +3,7 @@ add_library(cuda_runtime) target_sources(cuda_runtime PRIVATE + cuda_graph_manager.cpp jit_cuda.cpp kernel_launcher.cpp ptx_cache.cpp diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp new file mode 100644 index 0000000000..48fb0417ba --- /dev/null +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -0,0 +1,226 @@ +#include "quadrants/runtime/cuda/cuda_graph_manager.h" +#include "quadrants/rhi/cuda/cuda_context.h" + +namespace quadrants::lang { +namespace cuda { + +CachedCudaGraph::~CachedCudaGraph() { + if (graph_exec) { + CUDADriver::get_instance().graph_exec_destroy(graph_exec); + } + if (persistent_device_arg_buffer) { + CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); + } + if (persistent_device_result_buffer) { + CUDADriver::get_instance().mem_free(persistent_device_result_buffer); + } +} + +CachedCudaGraph::CachedCudaGraph(CachedCudaGraph &&other) noexcept + : graph_exec(other.graph_exec), + persistent_device_arg_buffer(other.persistent_device_arg_buffer), + 
persistent_device_result_buffer(other.persistent_device_result_buffer), + persistent_ctx(other.persistent_ctx), + arg_buffer_size(other.arg_buffer_size), + result_buffer_size(other.result_buffer_size) { + other.graph_exec = nullptr; + other.persistent_device_arg_buffer = nullptr; + other.persistent_device_result_buffer = nullptr; +} + +CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { + if (this != &other) { + if (graph_exec) + CUDADriver::get_instance().graph_exec_destroy(graph_exec); + if (persistent_device_arg_buffer) + CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); + if (persistent_device_result_buffer) + CUDADriver::get_instance().mem_free(persistent_device_result_buffer); + + graph_exec = other.graph_exec; + persistent_device_arg_buffer = other.persistent_device_arg_buffer; + persistent_device_result_buffer = other.persistent_device_result_buffer; + persistent_ctx = other.persistent_ctx; + arg_buffer_size = other.arg_buffer_size; + result_buffer_size = other.result_buffer_size; + + other.graph_exec = nullptr; + other.persistent_device_arg_buffer = nullptr; + other.persistent_device_result_buffer = nullptr; + } + return *this; +} + +bool CudaGraphManager::on_cuda_device(void *ptr) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); + + return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; +} + +// Resolves ndarray parameter handles in the launch context to raw device +// pointers, writing them into the arg buffer via set_ndarray_ptrs. +// +// Unlike the normal launch path, this does not handle host-resident arrays +// (no temporary device allocation or host-to-device transfer). Returns false +// if any external array is on the host, signaling the caller to fall back +// to the non-graph launch path. 
+bool CudaGraphManager::resolve_ctx_ndarray_ptrs( + LaunchContextBuilder &ctx, + const std::vector> ¶meters, + LlvmRuntimeExecutor *executor) { + for (int i = 0; i < (int)parameters.size(); i++) { + const auto &kv = parameters[i]; + const auto &arg_id = kv.first; + const auto ¶meter = kv.second; + if (parameter.is_array) { + const auto arr_sz = ctx.array_runtime_sizes[arg_id]; + if (arr_sz == 0) + continue; + + ArgArrayPtrKey data_ptr_idx{arg_id, TypeFactory::DATA_PTR_POS_IN_NDARRAY}; + ArgArrayPtrKey grad_ptr_idx{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}; + auto data_ptr = ctx.array_ptrs[data_ptr_idx]; + auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; + + if (ctx.device_allocation_type[arg_id] == + LaunchContextBuilder::DevAllocType::kNone) { + if (!on_cuda_device(data_ptr)) { + return false; + } + ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); + } else if (arr_sz > 0) { + DeviceAllocation *ptr = static_cast(data_ptr); + void *dev_data = executor->get_device_alloc_info_ptr(*ptr); + void *dev_grad = nullptr; + if (grad_ptr) { + dev_grad = executor->get_device_alloc_info_ptr( + *static_cast(grad_ptr)); + } + ctx.set_ndarray_ptrs(arg_id, (uint64)dev_data, (uint64)dev_grad); + } + } + } + return true; +} + +void *CudaGraphManager::add_kernel_node(void *graph, void *prev_node, + void *func, unsigned int grid_dim, + unsigned int block_dim, + unsigned int shared_mem, + void **kernel_params) { + CudaKernelNodeParams params{}; + params.func = func; + params.gridDimX = grid_dim; + params.gridDimY = 1; + params.gridDimZ = 1; + params.blockDimX = block_dim; + params.blockDimY = 1; + params.blockDimZ = 1; + params.sharedMemBytes = shared_mem; + params.kernelParams = kernel_params; + params.extra = nullptr; + + void *node = nullptr; + CUDADriver::get_instance().graph_add_kernel_node( + &node, graph, prev_node ? &prev_node : nullptr, prev_node ? 
1 : 0, + ¶ms); + return node; +} + +bool CudaGraphManager::launch_cached_graph(CachedCudaGraph &cached, + LaunchContextBuilder &ctx) { + if (ctx.arg_buffer_size > 0) { + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); + } + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + used_on_last_call_ = true; + return true; +} + +bool CudaGraphManager::try_launch( + int launch_id, LaunchContextBuilder &ctx, JITModule *cuda_module, + const std::vector> ¶meters, + const std::vector &offloaded_tasks, + LlvmRuntimeExecutor *executor) { + if (offloaded_tasks.empty()) { + return false; + } + + QD_ERROR_IF(ctx.result_buffer_size > 0, + "cuda_graph=True is not supported for kernels with struct return " + "values; remove cuda_graph=True or avoid returning values"); + + // Falls back to the normal path if any external array is host-resident, + // since the graph path cannot perform host-to-device transfers. 
+ if (!resolve_ctx_ndarray_ptrs(ctx, parameters, executor)) { + return false; + } + + auto it = cache_.find(launch_id); + if (it != cache_.end()) { + return launch_cached_graph(it->second, ctx); + } + + CUDAContext::get_instance().make_current(); + + CachedCudaGraph cached; + + // --- Allocate persistent buffers --- + cached.result_buffer_size = std::max(ctx.result_buffer_size, sizeof(uint64)); + CUDADriver::get_instance().malloc( + (void **)&cached.persistent_device_result_buffer, + cached.result_buffer_size); + + cached.arg_buffer_size = ctx.arg_buffer_size; + if (cached.arg_buffer_size > 0) { + CUDADriver::get_instance().malloc( + (void **)&cached.persistent_device_arg_buffer, cached.arg_buffer_size); + CUDADriver::get_instance().memcpy_host_to_device( + cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, + cached.arg_buffer_size); + } + + // --- Build persistent RuntimeContext --- + cached.persistent_ctx.runtime = executor->get_llvm_runtime(); + cached.persistent_ctx.arg_buffer = cached.persistent_device_arg_buffer; + cached.persistent_ctx.result_buffer = + (uint64 *)cached.persistent_device_result_buffer; + cached.persistent_ctx.cpu_thread_id = 0; + + // --- Build CUDA graph --- + void *graph = nullptr; + CUDADriver::get_instance().graph_create(&graph, 0); + + void *prev_node = nullptr; + for (const auto &task : offloaded_tasks) { + void *ctx_ptr = &cached.persistent_ctx; + prev_node = add_kernel_node( + graph, prev_node, cuda_module->lookup_function(task.name), + (unsigned int)task.grid_dim, (unsigned int)task.block_dim, + (unsigned int)task.dynamic_shared_array_bytes, &ctx_ptr); + } + + // --- Instantiate and launch --- + CUDADriver::get_instance().graph_instantiate(&cached.graph_exec, graph, + nullptr, nullptr, 0); + + auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); + + CUDADriver::get_instance().graph_destroy(graph); + + QD_TRACE("CUDA graph created with {} kernel 
nodes for launch_id={}", + offloaded_tasks.size(), launch_id); + + cache_.emplace(launch_id, std::move(cached)); + used_on_last_call_ = true; + return true; +} + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h new file mode 100644 index 0000000000..3afc66538a --- /dev/null +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include + +#include "quadrants/codegen/llvm/compiled_kernel_data.h" +#include "quadrants/runtime/llvm/llvm_runtime_executor.h" + +namespace quadrants::lang { +namespace cuda { + +struct CudaKernelNodeParams { + void *func; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + void **kernelParams; + void **extra; +}; + +struct CachedCudaGraph { + // CUgraphExec handle (typed as void* since driver API is loaded dynamically). + // This is the instantiated, launchable form of the captured CUDA graph. + void *graph_exec{nullptr}; + char *persistent_device_arg_buffer{nullptr}; + char *persistent_device_result_buffer{nullptr}; + RuntimeContext persistent_ctx{}; + std::size_t arg_buffer_size{0}; + std::size_t result_buffer_size{0}; + + CachedCudaGraph() = default; + ~CachedCudaGraph(); + CachedCudaGraph(const CachedCudaGraph &) = delete; + CachedCudaGraph &operator=(const CachedCudaGraph &) = delete; + CachedCudaGraph(CachedCudaGraph &&other) noexcept; + CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; +}; + +class CudaGraphManager { + public: + // Attempts to launch the kernel via a cached or newly built CUDA graph. + // Returns true on success; false if the graph path can't be used (e.g. + // host-resident ndarrays) and the caller should fall back to normal launch. 
+ bool try_launch(int launch_id, LaunchContextBuilder &ctx, + JITModule *cuda_module, + const std::vector> ¶meters, + const std::vector &offloaded_tasks, + LlvmRuntimeExecutor *executor); + + void mark_not_used() { used_on_last_call_ = false; } + std::size_t cache_size() const { return cache_.size(); } + bool used_on_last_call() const { return used_on_last_call_; } + + private: + bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx); + bool on_cuda_device(void *ptr); + bool resolve_ctx_ndarray_ptrs( + LaunchContextBuilder &ctx, + const std::vector> ¶meters, + LlvmRuntimeExecutor *executor); + void *add_kernel_node(void *graph, void *prev_node, void *func, + unsigned int grid_dim, unsigned int block_dim, + unsigned int shared_mem, void **kernel_params); + + // Keyed by launch_id, which uniquely identifies a compiled kernel variant + // (each template specialization gets its own launch_id). + std::unordered_map cache_; + bool used_on_last_call_{false}; +}; + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index ce579999dd..da497f37b0 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,236 +1,25 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" -#include +#include namespace quadrants::lang { namespace cuda { -CachedCudaGraph::~CachedCudaGraph() { - if (graph_exec) { - CUDADriver::get_instance().graph_exec_destroy(graph_exec); - } - if (persistent_device_arg_buffer) { - CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); - } - if (persistent_device_result_buffer) { - CUDADriver::get_instance().mem_free(persistent_device_result_buffer); - } -} - -CachedCudaGraph::CachedCudaGraph(CachedCudaGraph &&other) noexcept - : graph_exec(other.graph_exec), - persistent_device_arg_buffer(other.persistent_device_arg_buffer), - 
persistent_device_result_buffer(other.persistent_device_result_buffer), - persistent_ctx(other.persistent_ctx), - arg_buffer_size(other.arg_buffer_size), - result_buffer_size(other.result_buffer_size) { - other.graph_exec = nullptr; - other.persistent_device_arg_buffer = nullptr; - other.persistent_device_result_buffer = nullptr; -} - -CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { - if (this != &other) { - if (graph_exec) - CUDADriver::get_instance().graph_exec_destroy(graph_exec); - if (persistent_device_arg_buffer) - CUDADriver::get_instance().mem_free(persistent_device_arg_buffer); - if (persistent_device_result_buffer) - CUDADriver::get_instance().mem_free(persistent_device_result_buffer); - - graph_exec = other.graph_exec; - persistent_device_arg_buffer = other.persistent_device_arg_buffer; - persistent_device_result_buffer = other.persistent_device_result_buffer; - persistent_ctx = other.persistent_ctx; - arg_buffer_size = other.arg_buffer_size; - result_buffer_size = other.result_buffer_size; - - other.graph_exec = nullptr; - other.persistent_device_arg_buffer = nullptr; - other.persistent_device_result_buffer = nullptr; - } - return *this; -} - -bool KernelLauncher::on_cuda_device(void *ptr) { - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); - - return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; -} - -// Resolves ndarray parameter handles in the launch context to raw device -// pointers, writing them into the arg buffer via set_ndarray_ptrs. -// -// Unlike the normal launch path, this does not handle host-resident arrays -// (no temporary device allocation or host-to-device transfer). Returns false -// if any external array is on the host, signaling the caller to fall back -// to the non-graph launch path. 
-bool KernelLauncher::resolve_ctx_ndarray_ptrs( - LaunchContextBuilder &ctx, - const std::vector> ¶meters) { - auto *executor = get_runtime_executor(); - for (int i = 0; i < (int)parameters.size(); i++) { - const auto &kv = parameters[i]; - const auto &arg_id = kv.first; - const auto ¶meter = kv.second; - if (parameter.is_array) { - const auto arr_sz = ctx.array_runtime_sizes[arg_id]; - if (arr_sz == 0) - continue; - - ArgArrayPtrKey data_ptr_idx{arg_id, TypeFactory::DATA_PTR_POS_IN_NDARRAY}; - ArgArrayPtrKey grad_ptr_idx{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}; - auto data_ptr = ctx.array_ptrs[data_ptr_idx]; - auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; - - if (ctx.device_allocation_type[arg_id] == - LaunchContextBuilder::DevAllocType::kNone) { - if (!on_cuda_device(data_ptr)) { - return false; - } - ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); - } else if (arr_sz > 0) { - DeviceAllocation *ptr = static_cast(data_ptr); - void *dev_data = executor->get_device_alloc_info_ptr(*ptr); - void *dev_grad = nullptr; - if (grad_ptr) { - dev_grad = executor->get_device_alloc_info_ptr( - *static_cast(grad_ptr)); - } - ctx.set_ndarray_ptrs(arg_id, (uint64)dev_data, (uint64)dev_grad); - } - } - } - return true; -} - -bool KernelLauncher::launch_llvm_kernel_graph(Handle handle, - LaunchContextBuilder &ctx) { - int launch_id = handle.get_launch_id(); - - // Populated by register_llvm_kernel, which runs before launch_llvm_kernel - // for all LLVM kernels regardless of whether the graph path is used. 
- auto &launcher_ctx = contexts_[launch_id]; - const auto ¶meters = *launcher_ctx.parameters; - const auto &offloaded_tasks = launcher_ctx.offloaded_tasks; - - if (offloaded_tasks.empty()) { - return false; - } - - QD_ERROR_IF(ctx.result_buffer_size > 0, - "cuda_graph=True is not supported for kernels with struct return " - "values; remove cuda_graph=True or avoid returning values"); - - // Falls back to the normal path if any external array is host-resident, - // since the graph path cannot perform host-to-device transfers. - if (!resolve_ctx_ndarray_ptrs(ctx, parameters)) { - return false; - } - - auto it = cuda_graph_cache_.find(launch_id); - if (it != cuda_graph_cache_.end()) { - auto &cached = it->second; - if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().memcpy_host_to_device( - cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, - cached.arg_buffer_size); - } - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); - return true; - } - - CUDAContext::get_instance().make_current(); - - auto *executor = get_runtime_executor(); - auto *cuda_module = launcher_ctx.jit_module; - - CachedCudaGraph cached; - - // --- Allocate persistent buffers --- - cached.result_buffer_size = std::max(ctx.result_buffer_size, sizeof(uint64)); - CUDADriver::get_instance().malloc( - (void **)&cached.persistent_device_result_buffer, - cached.result_buffer_size); - - cached.arg_buffer_size = ctx.arg_buffer_size; - if (cached.arg_buffer_size > 0) { - CUDADriver::get_instance().malloc( - (void **)&cached.persistent_device_arg_buffer, cached.arg_buffer_size); - CUDADriver::get_instance().memcpy_host_to_device( - cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, - cached.arg_buffer_size); - } - - // --- Build persistent RuntimeContext --- - cached.persistent_ctx.runtime = executor->get_llvm_runtime(); - cached.persistent_ctx.arg_buffer = cached.persistent_device_arg_buffer; - 
cached.persistent_ctx.result_buffer = - (uint64 *)cached.persistent_device_result_buffer; - cached.persistent_ctx.cpu_thread_id = 0; - - // --- Build CUDA graph --- - void *graph = nullptr; - CUDADriver::get_instance().graph_create(&graph, 0); - - void *prev_node = nullptr; - for (const auto &task : offloaded_tasks) { - void *func = cuda_module->lookup_function(task.name); - - void *ctx_ptr = &cached.persistent_ctx; - CudaKernelNodeParams node_params{}; - node_params.func = func; - node_params.gridDimX = (unsigned int)task.grid_dim; - node_params.gridDimY = 1; - node_params.gridDimZ = 1; - node_params.blockDimX = (unsigned int)task.block_dim; - node_params.blockDimY = 1; - node_params.blockDimZ = 1; - node_params.sharedMemBytes = (unsigned int)task.dynamic_shared_array_bytes; - node_params.kernelParams = &ctx_ptr; - // kernelParams and extra are two mutually exclusive ways of passing - // arguments to a CUDA kernel; we use kernelParams, so extra is null. - node_params.extra = nullptr; - - void *node = nullptr; - const void *deps = prev_node; - std::size_t num_deps = prev_node ? 1 : 0; - CUDADriver::get_instance().graph_add_kernel_node( - &node, graph, prev_node ? 
&deps : nullptr, num_deps, &node_params); - prev_node = node; - } - - // --- Instantiate and launch --- - CUDADriver::get_instance().graph_instantiate(&cached.graph_exec, graph, - nullptr, nullptr, 0); - - auto *stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); - - CUDADriver::get_instance().graph_destroy(graph); - - QD_TRACE("CUDA graph created with {} kernel nodes for launch_id={}", - offloaded_tasks.size(), launch_id); - - cuda_graph_cache_.emplace(launch_id, std::move(cached)); - return true; -} - void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) { QD_ASSERT(handle.get_launch_id() < contexts_.size()); if (ctx.use_cuda_graph) { - if (launch_llvm_kernel_graph(handle, ctx)) { - cuda_graph_cache_used_on_last_call_ = true; + auto &lctx = contexts_[handle.get_launch_id()]; + if (graph_manager_.try_launch(handle.get_launch_id(), ctx, + lctx.jit_module, *lctx.parameters, + lctx.offloaded_tasks, + get_runtime_executor())) { return; } } - cuda_graph_cache_used_on_last_call_ = false; + graph_manager_.mark_not_used(); auto launcher_ctx = contexts_[handle.get_launch_id()]; auto *executor = get_runtime_executor(); @@ -288,7 +77,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder::DevAllocType::kNone) { // External array // Note: assuming both data & grad are on the same device - if (on_cuda_device(data_ptr)) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)data_ptr); + if (ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE) { // data_ptr is a raw ptr on CUDA device device_ptrs[data_ptr_idx] = data_ptr; device_ptrs[grad_ptr_idx] = grad_ptr; diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 72e2a18ddb..0027941c42 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ 
b/quadrants/runtime/cuda/kernel_launcher.h @@ -1,44 +1,15 @@ #pragma once -#include +#include +#include #include "quadrants/codegen/llvm/compiled_kernel_data.h" +#include "quadrants/runtime/cuda/cuda_graph_manager.h" #include "quadrants/runtime/llvm/kernel_launcher.h" namespace quadrants::lang { namespace cuda { -struct CudaKernelNodeParams { - void *func; - unsigned int gridDimX; - unsigned int gridDimY; - unsigned int gridDimZ; - unsigned int blockDimX; - unsigned int blockDimY; - unsigned int blockDimZ; - unsigned int sharedMemBytes; - void **kernelParams; - void **extra; -}; - -struct CachedCudaGraph { - // CUgraphExec handle (typed as void* since driver API is loaded dynamically). - // This is the instantiated, launchable form of the captured CUDA graph. - void *graph_exec{nullptr}; - char *persistent_device_arg_buffer{nullptr}; - char *persistent_device_result_buffer{nullptr}; - RuntimeContext persistent_ctx{}; - std::size_t arg_buffer_size{0}; - std::size_t result_buffer_size{0}; - - CachedCudaGraph() = default; - ~CachedCudaGraph(); - CachedCudaGraph(const CachedCudaGraph &) = delete; - CachedCudaGraph &operator=(const CachedCudaGraph &) = delete; - CachedCudaGraph(CachedCudaGraph &&other) noexcept; - CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; -}; - class KernelLauncher : public LLVM::KernelLauncher { using Base = LLVM::KernelLauncher; @@ -55,23 +26,15 @@ class KernelLauncher : public LLVM::KernelLauncher { Handle register_llvm_kernel( const LLVM::CompiledKernelData &compiled) override; std::size_t get_cuda_graph_cache_size() const override { - return cuda_graph_cache_.size(); + return graph_manager_.cache_size(); } bool get_cuda_graph_cache_used_on_last_call() const override { - return cuda_graph_cache_used_on_last_call_; + return graph_manager_.used_on_last_call(); } private: - bool on_cuda_device(void *ptr); - bool resolve_ctx_ndarray_ptrs( - LaunchContextBuilder &ctx, - const std::vector> ¶meters); - bool 
launch_llvm_kernel_graph(Handle handle, LaunchContextBuilder &ctx); std::vector contexts_; - // Keyed by launch_id, which uniquely identifies a compiled kernel variant - // (each template specialization gets its own launch_id). - std::unordered_map cuda_graph_cache_; - bool cuda_graph_cache_used_on_last_call_{false}; + CudaGraphManager graph_manager_; }; } // namespace cuda From 76181bfbc2398a0f8c24e322c9aa1726161f0952 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:19:48 -0400 Subject: [PATCH 076/128] Make on_cuda_device a free function shared by both launch paths Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 2 +- quadrants/runtime/cuda/cuda_graph_manager.h | 3 ++- quadrants/runtime/cuda/kernel_launcher.cpp | 5 +---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 48fb0417ba..a5065000ef 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -51,7 +51,7 @@ CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { return *this; } -bool CudaGraphManager::on_cuda_device(void *ptr) { +bool on_cuda_device(void *ptr) { unsigned int attr_val = 0; uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 3afc66538a..085f7d00b0 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -41,6 +41,8 @@ struct CachedCudaGraph { CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; }; +bool on_cuda_device(void *ptr); + class CudaGraphManager { public: // Attempts to launch the kernel via a cached or newly built CUDA graph. 
@@ -58,7 +60,6 @@ class CudaGraphManager { private: bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx); - bool on_cuda_device(void *ptr); bool resolve_ctx_ndarray_ptrs( LaunchContextBuilder &ctx, const std::vector> ¶meters, diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index da497f37b0..e7d22123c1 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -77,10 +77,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder::DevAllocType::kNone) { // External array // Note: assuming both data & grad are on the same device - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)data_ptr); - if (ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE) { + if (on_cuda_device(data_ptr)) { // data_ptr is a raw ptr on CUDA device device_ptrs[data_ptr_idx] = data_ptr; device_ptrs[grad_ptr_idx] = grad_ptr; From f6d531ea6b5b1e4eb359b05ac6d52c51e7465ed1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:20:19 -0400 Subject: [PATCH 077/128] Make on_cuda_device a free function shared by both launch paths Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 2 +- quadrants/runtime/cuda/cuda_graph_manager.h | 3 ++- quadrants/runtime/cuda/kernel_launcher.cpp | 5 +---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 331028082f..4622c2aef5 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -121,7 +121,7 @@ CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { return *this; } -bool CudaGraphManager::on_cuda_device(void *ptr) { +bool on_cuda_device(void *ptr) { unsigned int attr_val = 0; uint32_t 
ret_code = CUDADriver::get_instance().mem_get_attribute.call( &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 952ce91cd3..5e26936486 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -62,6 +62,8 @@ struct CachedCudaGraph { CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; }; +bool on_cuda_device(void *ptr); + class CudaGraphManager { public: // Attempts to launch the kernel via a cached or newly built CUDA graph. @@ -81,7 +83,6 @@ class CudaGraphManager { private: bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx, bool use_graph_do_while); - bool on_cuda_device(void *ptr); bool resolve_ctx_ndarray_ptrs( LaunchContextBuilder &ctx, const std::vector> ¶meters, diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index da497f37b0..e7d22123c1 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -77,10 +77,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder::DevAllocType::kNone) { // External array // Note: assuming both data & grad are on the same device - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)data_ptr); - if (ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE) { + if (on_cuda_device(data_ptr)) { // data_ptr is a raw ptr on CUDA device device_ptrs[data_ptr_idx] = data_ptr; device_ptrs[grad_ptr_idx] = grad_ptr; From 3fc7a9abfde6b5d0292f6ff77ace7e7fade35aa3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:23:26 -0400 Subject: [PATCH 078/128] Move on_cuda_device to cuda_context where it belongs Made-with: Cursor --- quadrants/rhi/cuda/cuda_context.cpp | 7 +++++++ 
quadrants/rhi/cuda/cuda_context.h | 2 ++ quadrants/runtime/cuda/cuda_graph_manager.cpp | 8 -------- quadrants/runtime/cuda/cuda_graph_manager.h | 2 -- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 76fc3b7f2f..3351c976c3 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -194,4 +194,11 @@ CUDAContext &CUDAContext::get_instance() { return *context; } +bool on_cuda_device(void *ptr) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); + return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; +} + } // namespace quadrants::lang diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index c57baa3d92..cdc265a151 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -122,4 +122,6 @@ class CUDAContext { } }; +bool on_cuda_device(void *ptr); + } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 4622c2aef5..266f6342c6 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -121,14 +121,6 @@ CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { return *this; } -bool on_cuda_device(void *ptr) { - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); - - return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; -} - // Resolves ndarray parameter handles in the launch context to raw device // pointers, writing them into the arg buffer via set_ndarray_ptrs. 
// diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 5e26936486..73ff39a53b 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -62,8 +62,6 @@ struct CachedCudaGraph { CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; }; -bool on_cuda_device(void *ptr); - class CudaGraphManager { public: // Attempts to launch the kernel via a cached or newly built CUDA graph. From 683194df30dd4f3d74d4f7a9f76694efaec0b583 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:23:55 -0400 Subject: [PATCH 079/128] Move on_cuda_device to cuda_context where it belongs Made-with: Cursor --- quadrants/rhi/cuda/cuda_context.cpp | 7 +++++++ quadrants/rhi/cuda/cuda_context.h | 2 ++ quadrants/runtime/cuda/cuda_graph_manager.cpp | 8 -------- quadrants/runtime/cuda/cuda_graph_manager.h | 2 -- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 76fc3b7f2f..3351c976c3 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -194,4 +194,11 @@ CUDAContext &CUDAContext::get_instance() { return *context; } +bool on_cuda_device(void *ptr) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); + return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; +} + } // namespace quadrants::lang diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index c57baa3d92..cdc265a151 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -122,4 +122,6 @@ class CUDAContext { } }; +bool on_cuda_device(void *ptr); + } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 
a5065000ef..a763b11aa2 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -51,14 +51,6 @@ CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { return *this; } -bool on_cuda_device(void *ptr) { - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); - - return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; -} - // Resolves ndarray parameter handles in the launch context to raw device // pointers, writing them into the arg buffer via set_ndarray_ptrs. // diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 085f7d00b0..2bbec8a3d0 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -41,8 +41,6 @@ struct CachedCudaGraph { CachedCudaGraph &operator=(CachedCudaGraph &&other) noexcept; }; -bool on_cuda_device(void *ptr); - class CudaGraphManager { public: // Attempts to launch the kernel via a cached or newly built CUDA graph. 
From c17e59c5ed6c57978d10147a1c78c7b17c323f9e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:26:07 -0400 Subject: [PATCH 080/128] Move on_cuda_device to runtime/cuda/cuda_utils Made-with: Cursor --- quadrants/rhi/cuda/cuda_context.cpp | 7 ------- quadrants/rhi/cuda/cuda_context.h | 2 -- quadrants/runtime/cuda/CMakeLists.txt | 1 + quadrants/runtime/cuda/cuda_graph_manager.cpp | 1 + quadrants/runtime/cuda/cuda_utils.cpp | 15 +++++++++++++++ quadrants/runtime/cuda/cuda_utils.h | 9 +++++++++ quadrants/runtime/cuda/kernel_launcher.cpp | 1 + 7 files changed, 27 insertions(+), 9 deletions(-) create mode 100644 quadrants/runtime/cuda/cuda_utils.cpp create mode 100644 quadrants/runtime/cuda/cuda_utils.h diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 3351c976c3..76fc3b7f2f 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -194,11 +194,4 @@ CUDAContext &CUDAContext::get_instance() { return *context; } -bool on_cuda_device(void *ptr) { - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); - return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; -} - } // namespace quadrants::lang diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index cdc265a151..c57baa3d92 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -122,6 +122,4 @@ class CUDAContext { } }; -bool on_cuda_device(void *ptr); - } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/CMakeLists.txt b/quadrants/runtime/cuda/CMakeLists.txt index a5c4eeac6e..43f5088f37 100644 --- a/quadrants/runtime/cuda/CMakeLists.txt +++ b/quadrants/runtime/cuda/CMakeLists.txt @@ -4,6 +4,7 @@ add_library(cuda_runtime) target_sources(cuda_runtime PRIVATE cuda_graph_manager.cpp + cuda_utils.cpp jit_cuda.cpp kernel_launcher.cpp 
ptx_cache.cpp diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index a763b11aa2..6d781f9b38 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -1,4 +1,5 @@ #include "quadrants/runtime/cuda/cuda_graph_manager.h" +#include "quadrants/runtime/cuda/cuda_utils.h" #include "quadrants/rhi/cuda/cuda_context.h" namespace quadrants::lang { diff --git a/quadrants/runtime/cuda/cuda_utils.cpp b/quadrants/runtime/cuda/cuda_utils.cpp new file mode 100644 index 0000000000..0fd6154817 --- /dev/null +++ b/quadrants/runtime/cuda/cuda_utils.cpp @@ -0,0 +1,15 @@ +#include "quadrants/runtime/cuda/cuda_utils.h" +#include "quadrants/rhi/cuda/cuda_context.h" + +namespace quadrants::lang { +namespace cuda { + +bool on_cuda_device(void *ptr) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); + return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; +} + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/cuda_utils.h b/quadrants/runtime/cuda/cuda_utils.h new file mode 100644 index 0000000000..ed48f3b712 --- /dev/null +++ b/quadrants/runtime/cuda/cuda_utils.h @@ -0,0 +1,9 @@ +#pragma once + +namespace quadrants::lang { +namespace cuda { + +bool on_cuda_device(void *ptr); + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index e7d22123c1..10df2c72ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,4 +1,5 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" +#include "quadrants/runtime/cuda/cuda_utils.h" #include "quadrants/rhi/cuda/cuda_context.h" #include From ccf9a37beb94b9318f40a84ffe6709400941286f Mon Sep 17 00:00:00 2001 From: Hugh Perkins 
Date: Sat, 14 Mar 2026 09:26:46 -0400 Subject: [PATCH 081/128] Move on_cuda_device to runtime/cuda/cuda_utils Made-with: Cursor --- quadrants/rhi/cuda/cuda_context.cpp | 7 ------- quadrants/rhi/cuda/cuda_context.h | 2 -- quadrants/runtime/cuda/CMakeLists.txt | 1 + quadrants/runtime/cuda/cuda_graph_manager.cpp | 1 + quadrants/runtime/cuda/cuda_utils.cpp | 15 +++++++++++++++ quadrants/runtime/cuda/cuda_utils.h | 9 +++++++++ quadrants/runtime/cuda/kernel_launcher.cpp | 1 + 7 files changed, 27 insertions(+), 9 deletions(-) create mode 100644 quadrants/runtime/cuda/cuda_utils.cpp create mode 100644 quadrants/runtime/cuda/cuda_utils.h diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 3351c976c3..76fc3b7f2f 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -194,11 +194,4 @@ CUDAContext &CUDAContext::get_instance() { return *context; } -bool on_cuda_device(void *ptr) { - unsigned int attr_val = 0; - uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); - return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; -} - } // namespace quadrants::lang diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index cdc265a151..c57baa3d92 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -122,6 +122,4 @@ class CUDAContext { } }; -bool on_cuda_device(void *ptr); - } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/CMakeLists.txt b/quadrants/runtime/cuda/CMakeLists.txt index a5c4eeac6e..43f5088f37 100644 --- a/quadrants/runtime/cuda/CMakeLists.txt +++ b/quadrants/runtime/cuda/CMakeLists.txt @@ -4,6 +4,7 @@ add_library(cuda_runtime) target_sources(cuda_runtime PRIVATE cuda_graph_manager.cpp + cuda_utils.cpp jit_cuda.cpp kernel_launcher.cpp ptx_cache.cpp diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp 
b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 266f6342c6..15562d7561 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -1,4 +1,5 @@ #include "quadrants/runtime/cuda/cuda_graph_manager.h" +#include "quadrants/runtime/cuda/cuda_utils.h" #include "quadrants/rhi/cuda/cuda_context.h" #include diff --git a/quadrants/runtime/cuda/cuda_utils.cpp b/quadrants/runtime/cuda/cuda_utils.cpp new file mode 100644 index 0000000000..0fd6154817 --- /dev/null +++ b/quadrants/runtime/cuda/cuda_utils.cpp @@ -0,0 +1,15 @@ +#include "quadrants/runtime/cuda/cuda_utils.h" +#include "quadrants/rhi/cuda/cuda_context.h" + +namespace quadrants::lang { +namespace cuda { + +bool on_cuda_device(void *ptr) { + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (void *)ptr); + return ret_code == CUDA_SUCCESS && attr_val == CU_MEMORYTYPE_DEVICE; +} + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/cuda_utils.h b/quadrants/runtime/cuda/cuda_utils.h new file mode 100644 index 0000000000..ed48f3b712 --- /dev/null +++ b/quadrants/runtime/cuda/cuda_utils.h @@ -0,0 +1,9 @@ +#pragma once + +namespace quadrants::lang { +namespace cuda { + +bool on_cuda_device(void *ptr); + +} // namespace cuda +} // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index e7d22123c1..10df2c72ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,4 +1,5 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" +#include "quadrants/runtime/cuda/cuda_utils.h" #include "quadrants/rhi/cuda/cuda_context.h" #include From 2e42c12e6f0a7092a8fe0d8bf446f42a83adba4c Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:30:19 -0400 Subject: [PATCH 082/128] Extract 
resolve_device_alloc_ptr helper to deduplicate DeviceAllocation unwrapping Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 3 +-- quadrants/runtime/cuda/cuda_utils.h | 9 +++++++++ quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 15562d7561..e8582b30cb 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -162,8 +162,7 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( } resolved_data = data_ptr; } else if (arr_sz > 0) { - DeviceAllocation *ptr = static_cast(data_ptr); - resolved_data = executor->get_device_alloc_info_ptr(*ptr); + resolved_data = resolve_device_alloc_ptr(executor, data_ptr); } if (resolved_data) { diff --git a/quadrants/runtime/cuda/cuda_utils.h b/quadrants/runtime/cuda/cuda_utils.h index ed48f3b712..c62bd56cd6 100644 --- a/quadrants/runtime/cuda/cuda_utils.h +++ b/quadrants/runtime/cuda/cuda_utils.h @@ -1,9 +1,18 @@ #pragma once +#include "quadrants/runtime/llvm/llvm_runtime_executor.h" + namespace quadrants::lang { namespace cuda { bool on_cuda_device(void *ptr); +// Unwraps a DeviceAllocation handle to a raw device pointer. 
+inline void *resolve_device_alloc_ptr(LlvmRuntimeExecutor *executor, + void *alloc) { + return executor->get_device_alloc_info_ptr( + *static_cast(alloc)); +} + } // namespace cuda } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 10df2c72ab..919683acd7 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -109,14 +109,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); } else if (arr_sz > 0) { - // Ndarray - DeviceAllocation *ptr = static_cast(data_ptr); - // Unwrapped raw ptr on device - device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); + device_ptrs[data_ptr_idx] = resolve_device_alloc_ptr(executor, data_ptr); if (grad_ptr != nullptr) { - ptr = static_cast(grad_ptr); - device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); + device_ptrs[grad_ptr_idx] = resolve_device_alloc_ptr(executor, grad_ptr); } else { device_ptrs[grad_ptr_idx] = nullptr; } From c94b41c7434d5f8331ff0b80dfdf85b6b42b4637 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:30:47 -0400 Subject: [PATCH 083/128] Extract resolve_device_alloc_ptr helper to deduplicate DeviceAllocation unwrapping Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 6 ++---- quadrants/runtime/cuda/cuda_utils.h | 9 +++++++++ quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 6d781f9b38..c82fb45481 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -84,12 +84,10 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( } ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, 
(uint64)grad_ptr); } else if (arr_sz > 0) { - DeviceAllocation *ptr = static_cast(data_ptr); - void *dev_data = executor->get_device_alloc_info_ptr(*ptr); + void *dev_data = resolve_device_alloc_ptr(executor, data_ptr); void *dev_grad = nullptr; if (grad_ptr) { - dev_grad = executor->get_device_alloc_info_ptr( - *static_cast(grad_ptr)); + dev_grad = resolve_device_alloc_ptr(executor, grad_ptr); } ctx.set_ndarray_ptrs(arg_id, (uint64)dev_data, (uint64)dev_grad); } diff --git a/quadrants/runtime/cuda/cuda_utils.h b/quadrants/runtime/cuda/cuda_utils.h index ed48f3b712..c62bd56cd6 100644 --- a/quadrants/runtime/cuda/cuda_utils.h +++ b/quadrants/runtime/cuda/cuda_utils.h @@ -1,9 +1,18 @@ #pragma once +#include "quadrants/runtime/llvm/llvm_runtime_executor.h" + namespace quadrants::lang { namespace cuda { bool on_cuda_device(void *ptr); +// Unwraps a DeviceAllocation handle to a raw device pointer. +inline void *resolve_device_alloc_ptr(LlvmRuntimeExecutor *executor, + void *alloc) { + return executor->get_device_alloc_info_ptr( + *static_cast(alloc)); +} + } // namespace cuda } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 10df2c72ab..919683acd7 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -109,14 +109,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); } else if (arr_sz > 0) { - // Ndarray - DeviceAllocation *ptr = static_cast(data_ptr); - // Unwrapped raw ptr on device - device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); + device_ptrs[data_ptr_idx] = resolve_device_alloc_ptr(executor, data_ptr); if (grad_ptr != nullptr) { - ptr = static_cast(grad_ptr); - device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); + device_ptrs[grad_ptr_idx] = resolve_device_alloc_ptr(executor, 
grad_ptr); } else { device_ptrs[grad_ptr_idx] = nullptr; } From 55f03c15125892c0e9f117fbbed78426f67c2f4f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:32:00 -0400 Subject: [PATCH 084/128] Revert "Extract resolve_device_alloc_ptr helper to deduplicate DeviceAllocation unwrapping" This reverts commit c94b41c7434d5f8331ff0b80dfdf85b6b42b4637. --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 6 ++++-- quadrants/runtime/cuda/cuda_utils.h | 9 --------- quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++++++-- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index c82fb45481..6d781f9b38 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -84,10 +84,12 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( } ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); } else if (arr_sz > 0) { - void *dev_data = resolve_device_alloc_ptr(executor, data_ptr); + DeviceAllocation *ptr = static_cast(data_ptr); + void *dev_data = executor->get_device_alloc_info_ptr(*ptr); void *dev_grad = nullptr; if (grad_ptr) { - dev_grad = resolve_device_alloc_ptr(executor, grad_ptr); + dev_grad = executor->get_device_alloc_info_ptr( + *static_cast(grad_ptr)); } ctx.set_ndarray_ptrs(arg_id, (uint64)dev_data, (uint64)dev_grad); } diff --git a/quadrants/runtime/cuda/cuda_utils.h b/quadrants/runtime/cuda/cuda_utils.h index c62bd56cd6..ed48f3b712 100644 --- a/quadrants/runtime/cuda/cuda_utils.h +++ b/quadrants/runtime/cuda/cuda_utils.h @@ -1,18 +1,9 @@ #pragma once -#include "quadrants/runtime/llvm/llvm_runtime_executor.h" - namespace quadrants::lang { namespace cuda { bool on_cuda_device(void *ptr); -// Unwraps a DeviceAllocation handle to a raw device pointer. 
-inline void *resolve_device_alloc_ptr(LlvmRuntimeExecutor *executor, - void *alloc) { - return executor->get_device_alloc_info_ptr( - *static_cast(alloc)); -} - } // namespace cuda } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 919683acd7..10df2c72ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -109,10 +109,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); } else if (arr_sz > 0) { - device_ptrs[data_ptr_idx] = resolve_device_alloc_ptr(executor, data_ptr); + // Ndarray + DeviceAllocation *ptr = static_cast(data_ptr); + // Unwrapped raw ptr on device + device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); if (grad_ptr != nullptr) { - device_ptrs[grad_ptr_idx] = resolve_device_alloc_ptr(executor, grad_ptr); + ptr = static_cast(grad_ptr); + device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); } else { device_ptrs[grad_ptr_idx] = nullptr; } From 680c7dc41456adf992ef7a52027b6f507dc23b15 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:32:15 -0400 Subject: [PATCH 085/128] Revert "Extract resolve_device_alloc_ptr helper to deduplicate DeviceAllocation unwrapping" This reverts commit 2e42c12e6f0a7092a8fe0d8bf446f42a83adba4c. 
--- quadrants/runtime/cuda/cuda_graph_manager.cpp | 3 ++- quadrants/runtime/cuda/cuda_utils.h | 9 --------- quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++++++-- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index e8582b30cb..15562d7561 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -162,7 +162,8 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( } resolved_data = data_ptr; } else if (arr_sz > 0) { - resolved_data = resolve_device_alloc_ptr(executor, data_ptr); + DeviceAllocation *ptr = static_cast(data_ptr); + resolved_data = executor->get_device_alloc_info_ptr(*ptr); } if (resolved_data) { diff --git a/quadrants/runtime/cuda/cuda_utils.h b/quadrants/runtime/cuda/cuda_utils.h index c62bd56cd6..ed48f3b712 100644 --- a/quadrants/runtime/cuda/cuda_utils.h +++ b/quadrants/runtime/cuda/cuda_utils.h @@ -1,18 +1,9 @@ #pragma once -#include "quadrants/runtime/llvm/llvm_runtime_executor.h" - namespace quadrants::lang { namespace cuda { bool on_cuda_device(void *ptr); -// Unwraps a DeviceAllocation handle to a raw device pointer. 
-inline void *resolve_device_alloc_ptr(LlvmRuntimeExecutor *executor, - void *alloc) { - return executor->get_device_alloc_info_ptr( - *static_cast(alloc)); -} - } // namespace cuda } // namespace quadrants::lang diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 919683acd7..10df2c72ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -109,10 +109,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)device_ptrs[grad_ptr_idx]); } else if (arr_sz > 0) { - device_ptrs[data_ptr_idx] = resolve_device_alloc_ptr(executor, data_ptr); + // Ndarray + DeviceAllocation *ptr = static_cast(data_ptr); + // Unwrapped raw ptr on device + device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); if (grad_ptr != nullptr) { - device_ptrs[grad_ptr_idx] = resolve_device_alloc_ptr(executor, grad_ptr); + ptr = static_cast(grad_ptr); + device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(*ptr); } else { device_ptrs[grad_ptr_idx] = nullptr; } From 4f78c21e13f9c5bd30a8c398a0c637e34d6f3dd0 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:33:23 -0400 Subject: [PATCH 086/128] Error on gradient pointers in cuda_graph path instead of silently resolving Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 6d781f9b38..1e95a24703 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -77,21 +77,25 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( auto data_ptr = ctx.array_ptrs[data_ptr_idx]; auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; + QD_ERROR_IF(grad_ptr != nullptr, + "cuda_graph does not support 
autograd; " + "ndarray arg {} has a non-null gradient pointer", arg_id); + + void *resolved_data = nullptr; + if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { if (!on_cuda_device(data_ptr)) { return false; } - ctx.set_ndarray_ptrs(arg_id, (uint64)data_ptr, (uint64)grad_ptr); + resolved_data = data_ptr; } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); - void *dev_data = executor->get_device_alloc_info_ptr(*ptr); - void *dev_grad = nullptr; - if (grad_ptr) { - dev_grad = executor->get_device_alloc_info_ptr( - *static_cast(grad_ptr)); - } - ctx.set_ndarray_ptrs(arg_id, (uint64)dev_data, (uint64)dev_grad); + resolved_data = executor->get_device_alloc_info_ptr(*ptr); + } + + if (resolved_data) { + ctx.set_ndarray_ptrs(arg_id, (uint64)resolved_data, (uint64) nullptr); } } } From e0200e50fd1e27bded1c0d1433d0ee72031f8d29 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:35:35 -0400 Subject: [PATCH 087/128] Add comment explaining scalar parameter skip in resolve_ctx_ndarray_ptrs Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 1e95a24703..829c2b7f75 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -67,6 +67,8 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( const auto &kv = parameters[i]; const auto &arg_id = kv.first; const auto ¶meter = kv.second; + // Scalar parameters are already in the arg buffer and need no resolution; + // only array parameters require translating handles to device pointers. 
if (parameter.is_array) { const auto arr_sz = ctx.array_runtime_sizes[arg_id]; if (arr_sz == 0) From 05a7e4f78ca46bd2939ea693a86452248bda2bd0 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:38:29 -0400 Subject: [PATCH 088/128] Clarify that fields are template parameters and not handled here Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 829c2b7f75..77ddd19040 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -69,6 +69,8 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( const auto ¶meter = kv.second; // Scalar parameters are already in the arg buffer and need no resolution; // only array parameters require translating handles to device pointers. + // Fields are template parameters, and would never arrive here. + // We only need to handle ndarrays and external arrays. 
if (parameter.is_array) { const auto arr_sz = ctx.array_runtime_sizes[arg_id]; if (arr_sz == 0) From a55c23424431f83c9765dc249a0fd40d28d0eaba Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:39:58 -0400 Subject: [PATCH 089/128] Re-add comments lost during merge conflict resolution Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 15562d7561..0dfb613d36 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -137,6 +137,10 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( const auto &kv = parameters[i]; const auto &arg_id = kv.first; const auto ¶meter = kv.second; + // Scalar parameters are already in the arg buffer and need no resolution; + // only array parameters require translating handles to device pointers. + // Fields are template parameters, and would never arrive here. + // We only need to handle ndarrays and external arrays. 
if (parameter.is_array) { const auto arr_sz = ctx.array_runtime_sizes[arg_id]; if (arr_sz == 0) From b73dfb8cb6328c2a23dcdf545b831c8e08ce8899 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:40:55 -0400 Subject: [PATCH 090/128] Add comment explaining resolved_data variable Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 77ddd19040..edfc4329e1 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -85,6 +85,8 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( "cuda_graph does not support autograd; " "ndarray arg {} has a non-null gradient pointer", arg_id); + // Raw device pointer to the array data, resolved from either an + // external array (raw pointer) or a DeviceAllocation handle. void *resolved_data = nullptr; if (ctx.device_allocation_type[arg_id] == From 34f685cf7506df69df4fef830d29d40012c56d52 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:43:05 -0400 Subject: [PATCH 091/128] Add comment noting cache_size and used_on_last_call are for tests Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 2bbec8a3d0..f80687eba9 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -52,6 +52,7 @@ class CudaGraphManager { const std::vector &offloaded_tasks, LlvmRuntimeExecutor *executor); + // cache_size and used_on_last_call used for tests void mark_not_used() { used_on_last_call_ = false; } std::size_t cache_size() const { return cache_.size(); } bool used_on_last_call() const { return used_on_last_call_; } From e88fad49b7bb24c2438c74e4d0438d21355aa38b Mon Sep 17 00:00:00 2001 From: Hugh 
Perkins Date: Sat, 14 Mar 2026 09:45:03 -0400 Subject: [PATCH 092/128] Apply clang-format Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 15 +++++--- quadrants/runtime/cuda/cuda_graph_manager.h | 34 +++++++++++++------ quadrants/runtime/cuda/kernel_launcher.cpp | 5 ++- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index edfc4329e1..7a5df23222 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -83,7 +83,8 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( QD_ERROR_IF(grad_ptr != nullptr, "cuda_graph does not support autograd; " - "ndarray arg {} has a non-null gradient pointer", arg_id); + "ndarray arg {} has a non-null gradient pointer", + arg_id); // Raw device pointer to the array data, resolved from either an // external array (raw pointer) or a DeviceAllocation handle. @@ -108,8 +109,10 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( return true; } -void *CudaGraphManager::add_kernel_node(void *graph, void *prev_node, - void *func, unsigned int grid_dim, +void *CudaGraphManager::add_kernel_node(void *graph, + void *prev_node, + void *func, + unsigned int grid_dim, unsigned int block_dim, unsigned int shared_mem, void **kernel_params) { @@ -133,7 +136,7 @@ void *CudaGraphManager::add_kernel_node(void *graph, void *prev_node, } bool CudaGraphManager::launch_cached_graph(CachedCudaGraph &cached, - LaunchContextBuilder &ctx) { + LaunchContextBuilder &ctx) { if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().memcpy_host_to_device( cached.persistent_device_arg_buffer, ctx.get_context().arg_buffer, @@ -146,7 +149,9 @@ bool CudaGraphManager::launch_cached_graph(CachedCudaGraph &cached, } bool CudaGraphManager::try_launch( - int launch_id, LaunchContextBuilder &ctx, JITModule *cuda_module, + int launch_id, + LaunchContextBuilder &ctx, + JITModule *cuda_module, 
const std::vector> ¶meters, const std::vector &offloaded_tasks, LlvmRuntimeExecutor *executor) { diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index f80687eba9..e92feb34ef 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -46,16 +46,24 @@ class CudaGraphManager { // Attempts to launch the kernel via a cached or newly built CUDA graph. // Returns true on success; false if the graph path can't be used (e.g. // host-resident ndarrays) and the caller should fall back to normal launch. - bool try_launch(int launch_id, LaunchContextBuilder &ctx, - JITModule *cuda_module, - const std::vector> ¶meters, - const std::vector &offloaded_tasks, - LlvmRuntimeExecutor *executor); + bool try_launch( + int launch_id, + LaunchContextBuilder &ctx, + JITModule *cuda_module, + const std::vector> ¶meters, + const std::vector &offloaded_tasks, + LlvmRuntimeExecutor *executor); // cache_size and used_on_last_call used for tests - void mark_not_used() { used_on_last_call_ = false; } - std::size_t cache_size() const { return cache_.size(); } - bool used_on_last_call() const { return used_on_last_call_; } + void mark_not_used() { + used_on_last_call_ = false; + } + std::size_t cache_size() const { + return cache_.size(); + } + bool used_on_last_call() const { + return used_on_last_call_; + } private: bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx); @@ -63,9 +71,13 @@ class CudaGraphManager { LaunchContextBuilder &ctx, const std::vector> ¶meters, LlvmRuntimeExecutor *executor); - void *add_kernel_node(void *graph, void *prev_node, void *func, - unsigned int grid_dim, unsigned int block_dim, - unsigned int shared_mem, void **kernel_params); + void *add_kernel_node(void *graph, + void *prev_node, + void *func, + unsigned int grid_dim, + unsigned int block_dim, + unsigned int shared_mem, + void **kernel_params); // Keyed by launch_id, which uniquely 
identifies a compiled kernel variant // (each template specialization gets its own launch_id). diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 10df2c72ab..ad19d607aa 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -13,9 +13,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.use_cuda_graph) { auto &lctx = contexts_[handle.get_launch_id()]; - if (graph_manager_.try_launch(handle.get_launch_id(), ctx, - lctx.jit_module, *lctx.parameters, - lctx.offloaded_tasks, + if (graph_manager_.try_launch(handle.get_launch_id(), ctx, lctx.jit_module, + *lctx.parameters, lctx.offloaded_tasks, get_runtime_executor())) { return; } From 51f898fc8d9d796349458240b4d7b1d78f054bfe Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:45:24 -0400 Subject: [PATCH 093/128] Apply clang-format Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 30 ++++++++------ quadrants/runtime/cuda/cuda_graph_manager.h | 40 +++++++++++++------ quadrants/runtime/cuda/kernel_launcher.cpp | 5 +-- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 0dfb613d36..d31f919049 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -153,7 +153,8 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( QD_ERROR_IF(grad_ptr != nullptr, "cuda_graph does not support autograd; " - "ndarray arg {} has a non-null gradient pointer", arg_id); + "ndarray arg {} has a non-null gradient pointer", + arg_id); // Raw device pointer to the array data, resolved from either an // external array (raw pointer) or a DeviceAllocation handle. 
@@ -243,8 +244,10 @@ void CudaGraphManager::ensure_condition_kernel_loaded() { cubin_size); } -void *CudaGraphManager::add_kernel_node(void *graph, void *prev_node, - void *func, unsigned int grid_dim, +void *CudaGraphManager::add_kernel_node(void *graph, + void *prev_node, + void *func, + unsigned int grid_dim, unsigned int block_dim, unsigned int shared_mem, void **kernel_params) { @@ -268,7 +271,8 @@ void *CudaGraphManager::add_kernel_node(void *graph, void *prev_node, } void *CudaGraphManager::add_conditional_while_node( - void *graph, unsigned long long *cond_handle_out) { + void *graph, + unsigned long long *cond_handle_out) { ensure_condition_kernel_loaded(); QD_ASSERT(cond_kernel_func_); @@ -301,8 +305,8 @@ void *CudaGraphManager::add_conditional_while_node( } bool CudaGraphManager::launch_cached_graph(CachedCudaGraph &cached, - LaunchContextBuilder &ctx, - bool use_graph_do_while) { + LaunchContextBuilder &ctx, + bool use_graph_do_while) { QD_ERROR_IF( use_graph_do_while && cached.graph_do_while_flag_dev_ptr != ctx.graph_do_while_flag_dev_ptr, @@ -321,7 +325,9 @@ bool CudaGraphManager::launch_cached_graph(CachedCudaGraph &cached, } bool CudaGraphManager::try_launch( - int launch_id, LaunchContextBuilder &ctx, JITModule *cuda_module, + int launch_id, + LaunchContextBuilder &ctx, + JITModule *cuda_module, const std::vector> ¶meters, const std::vector &offloaded_tasks, LlvmRuntimeExecutor *executor) { @@ -387,7 +393,8 @@ bool CudaGraphManager::try_launch( // └── Body graph // ├── Work kernel 1 // ├── Work kernel 2 - // └── Condition kernel (reads flag, calls cudaGraphSetConditional) + // └── Condition kernel (reads flag, calls + // cudaGraphSetConditional) // // The condition kernel must be the last node in the body graph. 
It reads the // flag after the work kernels have updated it, so the loop-continue decision @@ -405,8 +412,7 @@ bool CudaGraphManager::try_launch( for (const auto &task : offloaded_tasks) { void *ctx_ptr = &cached.persistent_ctx; prev_node = add_kernel_node( - kernel_target_graph, prev_node, - cuda_module->lookup_function(task.name), + kernel_target_graph, prev_node, cuda_module->lookup_function(task.name), (unsigned int)task.grid_dim, (unsigned int)task.block_dim, (unsigned int)task.dynamic_shared_array_bytes, &ctx_ptr); } @@ -418,8 +424,8 @@ bool CudaGraphManager::try_launch( void *flag_ptr = ctx.graph_do_while_flag_dev_ptr; void *cond_args[2] = {&cond_handle, &flag_ptr}; - add_kernel_node(kernel_target_graph, prev_node, cond_kernel_func_, - 1, 1, 0, cond_args); + add_kernel_node(kernel_target_graph, prev_node, cond_kernel_func_, 1, 1, 0, + cond_args); } // --- Instantiate and launch --- diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index b66049b614..95274238fa 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -67,20 +67,30 @@ class CudaGraphManager { // Attempts to launch the kernel via a cached or newly built CUDA graph. // Returns true on success; false if the graph path can't be used (e.g. // host-resident ndarrays) and the caller should fall back to normal launch. - // Internally tracks whether the graph was used, queryable via used_on_last_call(). - bool try_launch(int launch_id, LaunchContextBuilder &ctx, - JITModule *cuda_module, - const std::vector> ¶meters, - const std::vector &offloaded_tasks, - LlvmRuntimeExecutor *executor); + // Internally tracks whether the graph was used, queryable via + // used_on_last_call(). 
+ bool try_launch( + int launch_id, + LaunchContextBuilder &ctx, + JITModule *cuda_module, + const std::vector> ¶meters, + const std::vector &offloaded_tasks, + LlvmRuntimeExecutor *executor); // cache_size and used_on_last_call used for tests - void mark_not_used() { used_on_last_call_ = false; } - std::size_t cache_size() const { return cache_.size(); } - bool used_on_last_call() const { return used_on_last_call_; } + void mark_not_used() { + used_on_last_call_ = false; + } + std::size_t cache_size() const { + return cache_.size(); + } + bool used_on_last_call() const { + return used_on_last_call_; + } private: - bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx, + bool launch_cached_graph(CachedCudaGraph &cached, + LaunchContextBuilder &ctx, bool use_graph_do_while); bool resolve_ctx_ndarray_ptrs( LaunchContextBuilder &ctx, @@ -89,9 +99,13 @@ class CudaGraphManager { void ensure_condition_kernel_loaded(); void *add_conditional_while_node(void *graph, unsigned long long *cond_handle_out); - void *add_kernel_node(void *graph, void *prev_node, void *func, - unsigned int grid_dim, unsigned int block_dim, - unsigned int shared_mem, void **kernel_params); + void *add_kernel_node(void *graph, + void *prev_node, + void *func, + unsigned int grid_dim, + unsigned int block_dim, + unsigned int shared_mem, + void **kernel_params); // Keyed by launch_id, which uniquely identifies a compiled kernel variant // (each template specialization gets its own launch_id). 
diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 10df2c72ab..ad19d607aa 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -13,9 +13,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, if (ctx.use_cuda_graph) { auto &lctx = contexts_[handle.get_launch_id()]; - if (graph_manager_.try_launch(handle.get_launch_id(), ctx, - lctx.jit_module, *lctx.parameters, - lctx.offloaded_tasks, + if (graph_manager_.try_launch(handle.get_launch_id(), ctx, lctx.jit_module, + *lctx.parameters, lctx.offloaded_tasks, get_runtime_executor())) { return; } From 90639dc0d034050180da2d8f1a1b2f1fb78e17a7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:48:48 -0400 Subject: [PATCH 094/128] Add comment explaining why CudaGraphNodeParams is defined locally Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 95274238fa..f15492e371 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -25,6 +25,8 @@ struct CudaKernelNodeParams { }; // Mirrors CUDA driver API CUgraphNodeParams / CUDA_CONDITIONAL_NODE_PARAMS. +// We define our own copy because Quadrants loads the CUDA driver dynamically +// rather than linking against it, so we don't have access to those headers. // Field order verified against cuda-python bindings (handle, type, size, // phGraph_out, ctx). Introduced in CUDA 12.4; layout stable through 13.2+. 
struct CudaGraphNodeParams { From e9d4af4b30aefe027b75f7a99b44fb691ccb441d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:51:17 -0400 Subject: [PATCH 095/128] Add comment explaining CudaGraphNodeParams vs CudaKernelNodeParams Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index f15492e371..a7ad57039d 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -29,6 +29,12 @@ struct CudaKernelNodeParams { // rather than linking against it, so we don't have access to those headers. // Field order verified against cuda-python bindings (handle, type, size, // phGraph_out, ctx). Introduced in CUDA 12.4; layout stable through 13.2+. +// +// Used to add the conditional while node via cuGraphAddNode. Normal kernel +// nodes have a dedicated cuGraphAddKernelNode API with CudaKernelNodeParams, +// but conditional nodes use the generic cuGraphAddNode which takes this +// catch-all 256-byte union. The type field selects the variant; we only use +// the conditional node variant, so most of the bytes are padding. 
struct CudaGraphNodeParams { unsigned int type; // CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 int reserved0[3]; From 359c7d8dec69b60c152f80402a16e99b018fd516 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:56:51 -0400 Subject: [PATCH 096/128] Rename increment_loop to graph_loop in test_graph_do_while_counter Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 285ea27af3..8122bb6803 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -21,7 +21,7 @@ def test_graph_do_while_counter(): N = 64 @qd.kernel(graph_do_while="counter") - def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): + def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): @@ -33,7 +33,7 @@ def increment_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarra x.from_numpy(np.zeros(N, dtype=np.int32)) counter.from_numpy(np.array(5, dtype=np.int32)) - increment_loop(x, counter) + graph_loop(x, counter) assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 From 99889ee6792d3afe3f3664e2a3acbcfb0c6db347 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:57:19 -0400 Subject: [PATCH 097/128] Remove unnecessary qd.sync() calls from do-while tests Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 8122bb6803..2ca6d9ec32 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -37,7 +37,6 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd assert 
_cuda_graph_used() assert _cuda_graph_cache_size() == 1 - qd.sync() assert counter.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), np.full(N, 5, dtype=np.int32)) @@ -66,7 +65,6 @@ def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: q assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 - qd.sync() assert keep_going.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), np.full(N, threshold, dtype=np.int32)) @@ -101,7 +99,6 @@ def multi_loop( assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 - qd.sync() assert counter.to_numpy() == 0 np.testing.assert_allclose(x.to_numpy(), np.full(N, 10.0)) np.testing.assert_allclose(y.to_numpy(), np.full(N, 20.0)) @@ -128,7 +125,6 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n inc(x, counter) assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 - qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) # Second call: 7 iterations (graph replay with new counter value) @@ -137,7 +133,6 @@ def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, n inc(x, counter) assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 - qd.sync() np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) From b57582a4035e10df0cdb188f068ffdd064198f96 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:57:56 -0400 Subject: [PATCH 098/128] Add second call with different counter to test_graph_do_while_counter Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 2ca6d9ec32..e65770a3df 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -40,6 +40,16 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd assert 
counter.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), np.full(N, 5, dtype=np.int32)) + x.from_numpy(np.zeros(N, dtype=np.int32)) + counter.from_numpy(np.array(10, dtype=np.int32)) + + graph_loop(x, counter) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 + + assert counter.to_numpy() == 0 + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 10, dtype=np.int32)) + @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_boolean_done(): From 844a4547e17470f2e7cc319b2918d27985abde0e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:58:50 -0400 Subject: [PATCH 099/128] Add second call to all do-while tests to verify graph reuse Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index e65770a3df..56235034b3 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -78,6 +78,16 @@ def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: q assert keep_going.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), np.full(N, threshold, dtype=np.int32)) + x.from_numpy(np.zeros(N, dtype=np.int32)) + keep_going.from_numpy(np.array(1, dtype=np.int32)) + + increment_until_threshold(x, keep_going) + assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 + + assert keep_going.to_numpy() == 0 + np.testing.assert_array_equal(x.to_numpy(), np.full(N, threshold, dtype=np.int32)) + @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_multiple_loops(): @@ -113,6 +123,18 @@ def multi_loop( np.testing.assert_allclose(x.to_numpy(), np.full(N, 10.0)) np.testing.assert_allclose(y.to_numpy(), np.full(N, 20.0)) + x.from_numpy(np.zeros(N, dtype=np.float32)) + y.from_numpy(np.zeros(N, dtype=np.float32)) + counter.from_numpy(np.array(5, dtype=np.int32)) + + multi_loop(x, y, counter) + 
assert _cuda_graph_used() + assert _cuda_graph_cache_size() == 1 + + assert counter.to_numpy() == 0 + np.testing.assert_allclose(x.to_numpy(), np.full(N, 5.0)) + np.testing.assert_allclose(y.to_numpy(), np.full(N, 10.0)) + @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_replay(): From 4a62b03c444b99fdd1d71844c886f03515b70880 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 09:59:29 -0400 Subject: [PATCH 100/128] Use different values on second call in do-while tests Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 56235034b3..f39e6c294f 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -78,7 +78,8 @@ def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: q assert keep_going.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), np.full(N, threshold, dtype=np.int32)) - x.from_numpy(np.zeros(N, dtype=np.int32)) + # Second call: start from 4, so only 3 iterations to reach threshold + x.from_numpy(np.full(N, 4, dtype=np.int32)) keep_going.from_numpy(np.array(1, dtype=np.int32)) increment_until_threshold(x, keep_going) From bf9e9ba98aa3aacc5760a6d85770a054bcc8ff47 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 10:00:43 -0400 Subject: [PATCH 101/128] Make threshold a runtime ndarray parameter in boolean done test Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index f39e6c294f..43d9b4fcd0 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -55,39 +55,45 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd 
def test_graph_do_while_boolean_done(): """Test graph_do_while with a boolean 'continue' flag (non-zero = keep going).""" N = 64 - threshold = 7 @qd.kernel(graph_do_while="keep_going") - def increment_until_threshold(x: qd.types.ndarray(qd.i32, ndim=1), keep_going: qd.types.ndarray(qd.i32, ndim=0)): + def increment_until_threshold( + x: qd.types.ndarray(qd.i32, ndim=1), + threshold: qd.types.ndarray(qd.i32, ndim=0), + keep_going: qd.types.ndarray(qd.i32, ndim=0), + ): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): - if x[0] >= threshold: + if x[0] >= threshold[None]: keep_going[None] = 0 x = qd.ndarray(qd.i32, shape=(N,)) + threshold = qd.ndarray(qd.i32, shape=()) keep_going = qd.ndarray(qd.i32, shape=()) x.from_numpy(np.zeros(N, dtype=np.int32)) + threshold.from_numpy(np.array(7, dtype=np.int32)) keep_going.from_numpy(np.array(1, dtype=np.int32)) - increment_until_threshold(x, keep_going) + increment_until_threshold(x, threshold, keep_going) assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 assert keep_going.to_numpy() == 0 - np.testing.assert_array_equal(x.to_numpy(), np.full(N, threshold, dtype=np.int32)) + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) - # Second call: start from 4, so only 3 iterations to reach threshold - x.from_numpy(np.full(N, 4, dtype=np.int32)) + # Second call: different threshold, start from 0 + x.from_numpy(np.zeros(N, dtype=np.int32)) + threshold.from_numpy(np.array(12, dtype=np.int32)) keep_going.from_numpy(np.array(1, dtype=np.int32)) - increment_until_threshold(x, keep_going) + increment_until_threshold(x, threshold, keep_going) assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 assert keep_going.to_numpy() == 0 - np.testing.assert_array_equal(x.to_numpy(), np.full(N, threshold, dtype=np.int32)) + np.testing.assert_array_equal(x.to_numpy(), np.full(N, 12, dtype=np.int32)) @test_utils.test(arch=[qd.cuda]) From ec0dacafac522e4489704e46333ed125259f3beb Mon Sep 17 
00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 10:01:14 -0400 Subject: [PATCH 102/128] Pass threshold as scalar int instead of ndarray Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 43d9b4fcd0..16e0daf3f1 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -59,24 +59,22 @@ def test_graph_do_while_boolean_done(): @qd.kernel(graph_do_while="keep_going") def increment_until_threshold( x: qd.types.ndarray(qd.i32, ndim=1), - threshold: qd.types.ndarray(qd.i32, ndim=0), + threshold: qd.i32, keep_going: qd.types.ndarray(qd.i32, ndim=0), ): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): - if x[0] >= threshold[None]: + if x[0] >= threshold: keep_going[None] = 0 x = qd.ndarray(qd.i32, shape=(N,)) - threshold = qd.ndarray(qd.i32, shape=()) keep_going = qd.ndarray(qd.i32, shape=()) x.from_numpy(np.zeros(N, dtype=np.int32)) - threshold.from_numpy(np.array(7, dtype=np.int32)) keep_going.from_numpy(np.array(1, dtype=np.int32)) - increment_until_threshold(x, threshold, keep_going) + increment_until_threshold(x, 7, keep_going) assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 @@ -85,10 +83,9 @@ def increment_until_threshold( # Second call: different threshold, start from 0 x.from_numpy(np.zeros(N, dtype=np.int32)) - threshold.from_numpy(np.array(12, dtype=np.int32)) keep_going.from_numpy(np.array(1, dtype=np.int32)) - increment_until_threshold(x, threshold, keep_going) + increment_until_threshold(x, 12, keep_going) assert _cuda_graph_used() assert _cuda_graph_cache_size() == 1 From 5a2a41a6b149f73ab2fb19cbb1ad1d41b97a6eff Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 10:01:40 -0400 Subject: [PATCH 103/128] Remove comment Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 1 - 1 
file changed, 1 deletion(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 16e0daf3f1..e77845e944 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -81,7 +81,6 @@ def increment_until_threshold( assert keep_going.to_numpy() == 0 np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) - # Second call: different threshold, start from 0 x.from_numpy(np.zeros(N, dtype=np.int32)) keep_going.from_numpy(np.array(1, dtype=np.int32)) From 2bdb112fd4fe657e3c8de41e18bddb44b5f4aa67 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 10:03:00 -0400 Subject: [PATCH 104/128] Remove redundant test_graph_do_while_replay Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 32 ------------------------ 1 file changed, 32 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index e77845e944..25497a6462 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -139,38 +139,6 @@ def multi_loop( np.testing.assert_allclose(y.to_numpy(), np.full(N, 10.0)) -@test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_replay(): - """Test that graph_do_while works correctly on subsequent calls (graph replay).""" - N = 16 - - @qd.kernel(graph_do_while="counter") - def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - counter[None] = counter[None] - 1 - - x = qd.ndarray(qd.i32, shape=(N,)) - counter = qd.ndarray(qd.i32, shape=()) - - # First call: 3 iterations - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter.from_numpy(np.array(3, dtype=np.int32)) - inc(x, counter) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 3, dtype=np.int32)) - - # Second call: 7 
iterations (graph replay with new counter value) - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter.from_numpy(np.array(7, dtype=np.int32)) - inc(x, counter) - assert _cuda_graph_used() - assert _cuda_graph_cache_size() == 1 - np.testing.assert_array_equal(x.to_numpy(), np.full(N, 7, dtype=np.int32)) - - @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_replay_new_ndarray_raises(): """Passing a different ndarray for the condition parameter should raise.""" From 955413da9df06decf5fd60ba76bdf54d39df34b7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 10:03:50 -0400 Subject: [PATCH 105/128] Simplify changed-condition-ndarray test Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 26 ++++++++++-------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 25497a6462..b79c2cf69a 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -140,26 +140,22 @@ def multi_loop( @test_utils.test(arch=[qd.cuda]) -def test_graph_do_while_replay_new_ndarray_raises(): +def test_graph_do_while_changed_condition_ndarray_raises(): """Passing a different ndarray for the condition parameter should raise.""" - N = 16 - @qd.kernel(graph_do_while="counter") - def inc(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): + @qd.kernel(graph_do_while="c") + def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): - counter[None] = counter[None] - 1 - - x = qd.ndarray(qd.i32, shape=(N,)) + c[None] = c[None] - 1 - counter1 = qd.ndarray(qd.i32, shape=()) - x.from_numpy(np.zeros(N, dtype=np.int32)) - counter1.from_numpy(np.array(3, dtype=np.int32)) - inc(x, counter1) - assert _cuda_graph_used() + x = qd.ndarray(qd.i32, shape=(4,)) + c1 = qd.ndarray(qd.i32, shape=()) + c1.from_numpy(np.array(1, 
dtype=np.int32)) + k(x, c1) - counter2 = qd.ndarray(qd.i32, shape=()) - counter2.from_numpy(np.array(5, dtype=np.int32)) + c2 = qd.ndarray(qd.i32, shape=()) + c2.from_numpy(np.array(1, dtype=np.int32)) with pytest.raises(RuntimeError, match="condition ndarray changed"): - inc(x, counter2) + k(x, c2) From 3dfcf8a978e5782842d38973cf881ad6fd86d2d6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 10:04:27 -0400 Subject: [PATCH 106/128] Replace [None] with [()] in do-while tests Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index b79c2cf69a..947b6068ae 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -25,7 +25,7 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): - counter[None] = counter[None] - 1 + counter[()] = counter[()] - 1 x = qd.ndarray(qd.i32, shape=(N,)) counter = qd.ndarray(qd.i32, shape=()) @@ -66,7 +66,7 @@ def increment_until_threshold( x[i] = x[i] + 1 for i in range(1): if x[0] >= threshold: - keep_going[None] = 0 + keep_going[()] = 0 x = qd.ndarray(qd.i32, shape=(N,)) keep_going = qd.ndarray(qd.i32, shape=()) @@ -108,7 +108,7 @@ def multi_loop( for i in range(y.shape[0]): y[i] = y[i] + 2.0 for i in range(1): - counter[None] = counter[None] - 1 + counter[()] = counter[()] - 1 x = qd.ndarray(qd.f32, shape=(N,)) y = qd.ndarray(qd.f32, shape=(N,)) @@ -148,7 +148,7 @@ def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): for i in range(x.shape[0]): x[i] = x[i] + 1 for i in range(1): - c[None] = c[None] - 1 + c[()] = c[()] - 1 x = qd.ndarray(qd.i32, shape=(4,)) c1 = qd.ndarray(qd.i32, shape=()) From f17537830521b0962cc7dd8cd80f1a5e253f7a56 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 
Mar 2026 10:42:54 -0400 Subject: [PATCH 107/128] Error instead of fallback when cuda_graph gets host-resident arrays cuda_graph requires all ndarrays to be device-resident. Previously this silently fell back to the non-graph path; now it throws a clear error message. Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 22 ++++++++----------- quadrants/runtime/cuda/cuda_graph_manager.h | 2 +- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 7a5df23222..42679c5947 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -56,10 +56,10 @@ CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { // pointers, writing them into the arg buffer via set_ndarray_ptrs. // // Unlike the normal launch path, this does not handle host-resident arrays -// (no temporary device allocation or host-to-device transfer). Returns false -// if any external array is on the host, signaling the caller to fall back -// to the non-graph launch path. -bool CudaGraphManager::resolve_ctx_ndarray_ptrs( +// (no temporary device allocation or host-to-device transfer). Errors if +// any external array is on the host, since cuda_graph requires all arrays +// to be device-resident. 
+void CudaGraphManager::resolve_ctx_ndarray_ptrs( LaunchContextBuilder &ctx, const std::vector> ¶meters, LlvmRuntimeExecutor *executor) { @@ -92,9 +92,10 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { - if (!on_cuda_device(data_ptr)) { - return false; - } + QD_ERROR_IF(!on_cuda_device(data_ptr), + "cuda_graph requires all ndarrays to be device-resident; " + "ndarray arg {} is host-resident", + arg_id); resolved_data = data_ptr; } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); @@ -106,7 +107,6 @@ bool CudaGraphManager::resolve_ctx_ndarray_ptrs( } } } - return true; } void *CudaGraphManager::add_kernel_node(void *graph, @@ -163,11 +163,7 @@ bool CudaGraphManager::try_launch( "cuda_graph=True is not supported for kernels with struct return " "values; remove cuda_graph=True or avoid returning values"); - // Falls back to the normal path if any external array is host-resident, - // since the graph path cannot perform host-to-device transfers. 
- if (!resolve_ctx_ndarray_ptrs(ctx, parameters, executor)) { - return false; - } + resolve_ctx_ndarray_ptrs(ctx, parameters, executor); auto it = cache_.find(launch_id); if (it != cache_.end()) { diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index e92feb34ef..1d99c1ee6f 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -67,7 +67,7 @@ class CudaGraphManager { private: bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx); - bool resolve_ctx_ndarray_ptrs( + void resolve_ctx_ndarray_ptrs( LaunchContextBuilder &ctx, const std::vector> ¶meters, LlvmRuntimeExecutor *executor); From b222dd15142f6cd1de54b97fdf72aaaad2e48ae4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 11:41:18 -0400 Subject: [PATCH 108/128] Align autograd check and libcudadevrt error message with branch 3 Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 7492baf2fd..6c48926c2b 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -147,14 +147,14 @@ void CudaGraphManager::resolve_ctx_ndarray_ptrs( continue; ArgArrayPtrKey data_ptr_idx{arg_id, TypeFactory::DATA_PTR_POS_IN_NDARRAY}; - ArgArrayPtrKey grad_ptr_idx{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}; auto data_ptr = ctx.array_ptrs[data_ptr_idx]; - auto grad_ptr = ctx.array_ptrs[grad_ptr_idx]; - QD_ERROR_IF(grad_ptr != nullptr, - "cuda_graph does not support autograd; " - "ndarray arg {} has a non-null gradient pointer", - arg_id); + QD_ERROR_IF( + ctx.array_ptrs[{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}] != + nullptr, + "cuda_graph does not support autograd; " + "ndarray arg {} has a non-null gradient pointer", + arg_id); // 
Raw device pointer to the array data, resolved from either an // external array (raw pointer) or a DeviceAllocation handle. @@ -163,9 +163,9 @@ void CudaGraphManager::resolve_ctx_ndarray_ptrs( if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { QD_ERROR_IF(!on_cuda_device(data_ptr), - "cuda_graph requires all ndarrays to be device-resident; " - "ndarray arg {} is host-resident", - arg_id); + "cuda_graph requires all ndarrays to be device-resident; " + "ndarray arg {} is host-resident", + arg_id); resolved_data = data_ptr; } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); @@ -215,8 +215,8 @@ void CudaGraphManager::ensure_condition_kernel_loaded() { } } QD_ERROR_IF(cudadevrt_path.empty(), - "graph_do_while requires libcudadevrt.a but it was not found. " - "Install the CUDA toolkit and/or set CUDA_HOME."); + "Cannot find libcudadevrt.a — required for graph_do_while. " + "Install the CUDA toolkit and set CUDA_HOME."); // CUlinkState handle for the JIT linker session that combines our PTX // with libcudadevrt.a to resolve the cudaGraphSetConditional extern. 
From 339b084a384941cc6526cfda85e0a0e379f6ea4a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 11:42:27 -0400 Subject: [PATCH 109/128] Reorder use_graph_do_while declaration to match branch 3 Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 6c48926c2b..9c79d96728 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -335,12 +335,12 @@ bool CudaGraphManager::try_launch( return false; } + const bool use_graph_do_while = ctx.graph_do_while_arg_id >= 0; + QD_ERROR_IF(ctx.result_buffer_size > 0, "cuda_graph=True is not supported for kernels with struct return " "values; remove cuda_graph=True or avoid returning values"); - const bool use_graph_do_while = ctx.graph_do_while_arg_id >= 0; - resolve_ctx_ndarray_ptrs(ctx, parameters, executor); auto it = cache_.find(launch_id); From dd480d9f851c568825f121ab074f1573c17d6228 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 11:43:36 -0400 Subject: [PATCH 110/128] Fix clang-format indentation in QD_ERROR_IF Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 42679c5947..2142e75418 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -93,9 +93,9 @@ void CudaGraphManager::resolve_ctx_ndarray_ptrs( if (ctx.device_allocation_type[arg_id] == LaunchContextBuilder::DevAllocType::kNone) { QD_ERROR_IF(!on_cuda_device(data_ptr), - "cuda_graph requires all ndarrays to be device-resident; " - "ndarray arg {} is host-resident", - arg_id); + "cuda_graph requires all ndarrays to be device-resident; " + "ndarray arg {} 
is host-resident", + arg_id); resolved_data = data_ptr; } else if (arr_sz > 0) { DeviceAllocation *ptr = static_cast(data_ptr); From 7964d82dfdc74341f54e4c5ffb7bc527d81c9bfc Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 11:55:29 -0400 Subject: [PATCH 111/128] Fix macro parse error: avoid brace-init-list inside QD_ERROR_IF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The C preprocessor treats commas inside {…} as macro argument separators on some compilers, causing build failures. Made-with: Cursor --- quadrants/runtime/cuda/cuda_graph_manager.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 9c79d96728..47e4a59a99 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -147,14 +147,13 @@ void CudaGraphManager::resolve_ctx_ndarray_ptrs( continue; ArgArrayPtrKey data_ptr_idx{arg_id, TypeFactory::DATA_PTR_POS_IN_NDARRAY}; + ArgArrayPtrKey grad_ptr_idx{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}; auto data_ptr = ctx.array_ptrs[data_ptr_idx]; - QD_ERROR_IF( - ctx.array_ptrs[{arg_id, TypeFactory::GRAD_PTR_POS_IN_NDARRAY}] != - nullptr, - "cuda_graph does not support autograd; " - "ndarray arg {} has a non-null gradient pointer", - arg_id); + QD_ERROR_IF(ctx.array_ptrs[grad_ptr_idx] != nullptr, + "cuda_graph does not support autograd; " + "ndarray arg {} has a non-null gradient pointer", + arg_id); // Raw device pointer to the array data, resolved from either an // external array (raw pointer) or a DeviceAllocation handle. From d2563b9ef72a82f6408113756bdc3a99b01fc327 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 12:36:35 -0400 Subject: [PATCH 112/128] Skip graph_do_while tests on SM < 90 The CI GPU runner is SM 75 (Turing). Branch 2 requires SM 9.0+ (Hopper) for graph_do_while with no fallback. 
Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 947b6068ae..1e265711a1 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -18,6 +18,8 @@ def _cuda_graph_used(): @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_counter(): """Test graph_do_while with a counter that decrements each iteration.""" + if qd.lang.impl.get_cuda_compute_capability() < 90: + pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") N = 64 @qd.kernel(graph_do_while="counter") @@ -54,6 +56,8 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_boolean_done(): """Test graph_do_while with a boolean 'continue' flag (non-zero = keep going).""" + if qd.lang.impl.get_cuda_compute_capability() < 90: + pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") N = 64 @qd.kernel(graph_do_while="keep_going") @@ -95,6 +99,8 @@ def increment_until_threshold( @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_multiple_loops(): """Test graph_do_while with multiple top-level loops in the kernel body.""" + if qd.lang.impl.get_cuda_compute_capability() < 90: + pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") N = 32 @qd.kernel(graph_do_while="counter") @@ -142,6 +148,8 @@ def multi_loop( @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_changed_condition_ndarray_raises(): """Passing a different ndarray for the condition parameter should raise.""" + if qd.lang.impl.get_cuda_compute_capability() < 90: + pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") @qd.kernel(graph_do_while="c") def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): From 0c33d055d2df5217069cf3cd43e12913a47c8a16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 12:37:38 -0400 
Subject: [PATCH 113/128] Revert "Skip graph_do_while tests on SM < 90" This reverts commit d2563b9ef72a82f6408113756bdc3a99b01fc327. --- tests/python/test_cuda_graph_do_while.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 1e265711a1..947b6068ae 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -18,8 +18,6 @@ def _cuda_graph_used(): @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_counter(): """Test graph_do_while with a counter that decrements each iteration.""" - if qd.lang.impl.get_cuda_compute_capability() < 90: - pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") N = 64 @qd.kernel(graph_do_while="counter") @@ -56,8 +54,6 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_boolean_done(): """Test graph_do_while with a boolean 'continue' flag (non-zero = keep going).""" - if qd.lang.impl.get_cuda_compute_capability() < 90: - pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") N = 64 @qd.kernel(graph_do_while="keep_going") @@ -99,8 +95,6 @@ def increment_until_threshold( @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_multiple_loops(): """Test graph_do_while with multiple top-level loops in the kernel body.""" - if qd.lang.impl.get_cuda_compute_capability() < 90: - pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") N = 32 @qd.kernel(graph_do_while="counter") @@ -148,8 +142,6 @@ def multi_loop( @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_changed_condition_ndarray_raises(): """Passing a different ndarray for the condition parameter should raise.""" - if qd.lang.impl.get_cuda_compute_capability() < 90: - pytest.skip("graph_do_while requires SM 9.0+ (Hopper)") @qd.kernel(graph_do_while="c") def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): From 
a5ebc33fae34d8a0d882030a05d3a9a04a054e08 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 12:38:50 -0400 Subject: [PATCH 114/128] xfail graph_do_while tests on SM < 90 Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 947b6068ae..f5d4bc4bda 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -15,9 +15,15 @@ def _cuda_graph_used(): return impl.get_runtime().prog.get_cuda_graph_cache_used_on_last_call() +def _on_hopper(): + return qd.lang.impl.get_cuda_compute_capability() >= 90 + + @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_counter(): """Test graph_do_while with a counter that decrements each iteration.""" + if not _on_hopper(): + pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") N = 64 @qd.kernel(graph_do_while="counter") @@ -54,6 +60,8 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_boolean_done(): """Test graph_do_while with a boolean 'continue' flag (non-zero = keep going).""" + if not _on_hopper(): + pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") N = 64 @qd.kernel(graph_do_while="keep_going") @@ -95,6 +103,8 @@ def increment_until_threshold( @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_multiple_loops(): """Test graph_do_while with multiple top-level loops in the kernel body.""" + if not _on_hopper(): + pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") N = 32 @qd.kernel(graph_do_while="counter") @@ -142,6 +152,8 @@ def multi_loop( @test_utils.test(arch=[qd.cuda]) def test_graph_do_while_changed_condition_ndarray_raises(): """Passing a different ndarray for the condition parameter should raise.""" + if not _on_hopper(): + pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") 
@qd.kernel(graph_do_while="c") def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): From 32b1341f039fd5c040b055df137688931c07185f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 16:01:46 -0400 Subject: [PATCH 115/128] Add num_offloaded_tasks query for compiled kernels Expose the number of offloaded tasks (parallel-for loops) as a compile-time property on CompiledKernelData, captured per launch in Program and accessible via pybind11. Assert expected task counts in cuda graph tests. Made-with: Cursor --- quadrants/codegen/compiled_kernel_data.h | 1 + quadrants/codegen/llvm/compiled_kernel_data.h | 3 +++ quadrants/codegen/spirv/compiled_kernel_data.h | 3 +++ quadrants/program/program.cpp | 1 + quadrants/program/program.h | 5 +++++ quadrants/python/export_lang.cpp | 4 +++- tests/cpp/codegen/compiled_kernel_data_test.cpp | 4 ++++ .../kernel_compilation_manager_test.cpp | 4 ++++ tests/python/test_cuda_graph.py | 11 +++++++++++ 9 files changed, 35 insertions(+), 1 deletion(-) diff --git a/quadrants/codegen/compiled_kernel_data.h b/quadrants/codegen/compiled_kernel_data.h index c3bb6c082e..f0cfe956f7 100644 --- a/quadrants/codegen/compiled_kernel_data.h +++ b/quadrants/codegen/compiled_kernel_data.h @@ -103,6 +103,7 @@ class CompiledKernelData { virtual ~CompiledKernelData() = default; virtual Arch arch() const = 0; + virtual size_t num_tasks() const = 0; Err load(std::istream &is); Err dump(std::ostream &os) const; diff --git a/quadrants/codegen/llvm/compiled_kernel_data.h b/quadrants/codegen/llvm/compiled_kernel_data.h index cec2e56d31..2dca14cb34 100644 --- a/quadrants/codegen/llvm/compiled_kernel_data.h +++ b/quadrants/codegen/llvm/compiled_kernel_data.h @@ -50,6 +50,9 @@ class CompiledKernelData : public lang::CompiledKernelData { CompiledKernelData(Arch arch, InternalData data); Arch arch() const override; + size_t num_tasks() const override { + return data_.compiled_data.tasks.size(); + } std::unique_ptr clone() const 
override; Err check() const override; diff --git a/quadrants/codegen/spirv/compiled_kernel_data.h b/quadrants/codegen/spirv/compiled_kernel_data.h index 235b47d1c2..a72f1829b3 100644 --- a/quadrants/codegen/spirv/compiled_kernel_data.h +++ b/quadrants/codegen/spirv/compiled_kernel_data.h @@ -30,6 +30,9 @@ class CompiledKernelData : public lang::CompiledKernelData { CompiledKernelData(Arch arch, InternalData data); Arch arch() const override; + size_t num_tasks() const override { + return data_.metadata.kernel_attribs.tasks_attribs.size(); + } std::unique_ptr clone() const override; const InternalData &get_internal_data() const { diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 7f5dfef2d8..55b3b4b91a 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -166,6 +166,7 @@ CompileResult Program::compile_kernel(const CompileConfig &compile_config, void Program::launch_kernel(const CompiledKernelData &compiled_kernel_data, LaunchContextBuilder &ctx) { + num_offloaded_tasks_on_last_call_ = compiled_kernel_data.num_tasks(); program_impl_->get_kernel_launcher().launch_kernel(compiled_kernel_data, ctx); if (compile_config().debug && arch_uses_llvm(compiled_kernel_data.arch())) { program_impl_->check_runtime_error(result_buffer); diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 7ceee2730e..2e0df16ced 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -142,6 +142,10 @@ class QD_DLL_EXPORT Program { .get_cuda_graph_cache_used_on_last_call(); } + size_t get_num_offloaded_tasks_on_last_call() const { + return num_offloaded_tasks_on_last_call_; + } + DeviceCapabilityConfig get_device_caps() { return program_impl_->get_device_caps(); } @@ -337,6 +341,7 @@ class QD_DLL_EXPORT Program { float64 total_compilation_time_{0.0}; static std::atomic num_instances_; bool finalized_{false}; + size_t num_offloaded_tasks_on_last_call_{0}; // TODO: Move ndarrays_ to be managed by 
runtime std::unordered_map> ndarrays_; diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index fb425991bc..47abc6f369 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -498,7 +498,9 @@ void export_lang(py::module &m) { .def("get_device_caps", &Program::get_device_caps) .def("get_cuda_graph_cache_size", &Program::get_cuda_graph_cache_size) .def("get_cuda_graph_cache_used_on_last_call", - &Program::get_cuda_graph_cache_used_on_last_call); + &Program::get_cuda_graph_cache_used_on_last_call) + .def("get_num_offloaded_tasks_on_last_call", + &Program::get_num_offloaded_tasks_on_last_call); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/tests/cpp/codegen/compiled_kernel_data_test.cpp b/tests/cpp/codegen/compiled_kernel_data_test.cpp index d99f54f6c5..b429919159 100644 --- a/tests/cpp/codegen/compiled_kernel_data_test.cpp +++ b/tests/cpp/codegen/compiled_kernel_data_test.cpp @@ -24,6 +24,10 @@ class FakeCompiledKernelData : public CompiledKernelData { return kFakeArch; } + size_t num_tasks() const override { + return 0; + } + std::unique_ptr clone() const override { return std::make_unique(*this); } diff --git a/tests/cpp/compilation_manager/kernel_compilation_manager_test.cpp b/tests/cpp/compilation_manager/kernel_compilation_manager_test.cpp index a9585e7034..3fdf59c2d2 100644 --- a/tests/cpp/compilation_manager/kernel_compilation_manager_test.cpp +++ b/tests/cpp/compilation_manager/kernel_compilation_manager_test.cpp @@ -29,6 +29,10 @@ class FakeCompiledKernelData : public CompiledKernelData { return kFakeArch; } + size_t num_tasks() const override { + return 0; + } + std::unique_ptr clone() const override { return std::make_unique(data_); } diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index b3775ca606..fcb7a441d6 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -19,6 +19,10 @@ def _on_cuda(): return 
impl.current_cfg().arch == qd.cuda +def _num_offloaded_tasks(): + return impl.get_runtime().prog.get_num_offloaded_tasks_on_last_call() + + @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() def test_cuda_graph_two_loops(tensor_type): @@ -40,6 +44,7 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) + assert _num_offloaded_tasks() == 2 assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) @@ -77,6 +82,7 @@ def three_loops(a: Annotation, b: Annotation, c: Annotation): assert _cuda_graph_cache_size() == 0 three_loops(a, b, c) + assert _num_offloaded_tasks() == 3 assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -118,6 +124,7 @@ def two_loops(x: Annotation, y: Annotation): y = tensor_type(qd.f32, (n,)) two_loops(x, y) + assert _num_offloaded_tasks() == 2 assert not _cuda_graph_used() two_loops(x, y) assert not _cuda_graph_used() @@ -149,6 +156,7 @@ def two_loops(x: Annotation, y: Annotation): y1 = tensor_type(qd.f32, (n,)) assert _cuda_graph_cache_size() == 0 two_loops(x1, y1) + assert _num_offloaded_tasks() == 2 assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x1, y1) @@ -209,6 +217,7 @@ def add_one(x: Annotation, y: Annotation): y1 = tensor_type(qd.f32, (256,)) assert _cuda_graph_cache_size() == 0 add_one(x1, y1) + assert _num_offloaded_tasks() == 2 assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -248,6 +257,7 @@ def add_one(x: Annotation, y: Annotation): x = tensor_type(qd.f32, (n,)) y = tensor_type(qd.f32, (n,)) add_one(x, y) + assert _num_offloaded_tasks() == 2 assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert 
_cuda_graph_used() == platform_supports_graph add_one(x, y) @@ -292,6 +302,7 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) + assert _num_offloaded_tasks() == 2 assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) From 2c1464b4b9d67c51b51dde685c9ee1f8e936d364 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 16:13:17 -0400 Subject: [PATCH 116/128] Expose CUDA graph node count for test assertions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track the number of kernel nodes in each cached CUDA graph and expose it via get_cuda_graph_num_nodes_on_last_call() through the full CudaGraphManager → KernelLauncher → Program → pybind11 chain. Assert node counts match offloaded task counts in all cuda graph tests. Made-with: Cursor --- quadrants/program/kernel_launcher.h | 4 ++++ quadrants/program/program.h | 5 +++++ quadrants/python/export_lang.cpp | 4 +++- quadrants/runtime/cuda/cuda_graph_manager.cpp | 10 ++++++++-- quadrants/runtime/cuda/cuda_graph_manager.h | 6 ++++++ quadrants/runtime/cuda/kernel_launcher.h | 3 +++ tests/python/test_cuda_graph.py | 12 ++++++++++++ 7 files changed, 41 insertions(+), 3 deletions(-) diff --git a/quadrants/program/kernel_launcher.h b/quadrants/program/kernel_launcher.h index f800768269..12f294611e 100644 --- a/quadrants/program/kernel_launcher.h +++ b/quadrants/program/kernel_launcher.h @@ -20,6 +20,10 @@ class KernelLauncher { return false; } + virtual std::size_t get_cuda_graph_num_nodes_on_last_call() const { + return 0; + } + virtual ~KernelLauncher() = default; }; diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 2e0df16ced..1312f0441e 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -146,6 +146,11 @@ class QD_DLL_EXPORT Program { return num_offloaded_tasks_on_last_call_; } + 
std::size_t get_cuda_graph_num_nodes_on_last_call() { + return program_impl_->get_kernel_launcher() + .get_cuda_graph_num_nodes_on_last_call(); + } + DeviceCapabilityConfig get_device_caps() { return program_impl_->get_device_caps(); } diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index 47abc6f369..88fd7a09eb 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -500,7 +500,9 @@ void export_lang(py::module &m) { .def("get_cuda_graph_cache_used_on_last_call", &Program::get_cuda_graph_cache_used_on_last_call) .def("get_num_offloaded_tasks_on_last_call", - &Program::get_num_offloaded_tasks_on_last_call); + &Program::get_num_offloaded_tasks_on_last_call) + .def("get_cuda_graph_num_nodes_on_last_call", + &Program::get_cuda_graph_num_nodes_on_last_call); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/runtime/cuda/cuda_graph_manager.cpp b/quadrants/runtime/cuda/cuda_graph_manager.cpp index 2142e75418..3de86de5c8 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.cpp +++ b/quadrants/runtime/cuda/cuda_graph_manager.cpp @@ -23,7 +23,8 @@ CachedCudaGraph::CachedCudaGraph(CachedCudaGraph &&other) noexcept persistent_device_result_buffer(other.persistent_device_result_buffer), persistent_ctx(other.persistent_ctx), arg_buffer_size(other.arg_buffer_size), - result_buffer_size(other.result_buffer_size) { + result_buffer_size(other.result_buffer_size), + num_nodes(other.num_nodes) { other.graph_exec = nullptr; other.persistent_device_arg_buffer = nullptr; other.persistent_device_result_buffer = nullptr; @@ -44,6 +45,7 @@ CachedCudaGraph &CachedCudaGraph::operator=(CachedCudaGraph &&other) noexcept { persistent_ctx = other.persistent_ctx; arg_buffer_size = other.arg_buffer_size; result_buffer_size = other.result_buffer_size; + num_nodes = other.num_nodes; other.graph_exec = nullptr; other.persistent_device_arg_buffer = nullptr; @@ -145,6 +147,7 @@ bool 
CudaGraphManager::launch_cached_graph(CachedCudaGraph &cached, auto *stream = CUDAContext::get_instance().get_stream(); CUDADriver::get_instance().graph_launch(cached.graph_exec, stream); used_on_last_call_ = true; + num_nodes_on_last_call_ = cached.num_nodes; return true; } @@ -218,9 +221,12 @@ bool CudaGraphManager::try_launch( CUDADriver::get_instance().graph_destroy(graph); + cached.num_nodes = offloaded_tasks.size(); + QD_TRACE("CUDA graph created with {} kernel nodes for launch_id={}", - offloaded_tasks.size(), launch_id); + cached.num_nodes, launch_id); + num_nodes_on_last_call_ = cached.num_nodes; cache_.emplace(launch_id, std::move(cached)); used_on_last_call_ = true; return true; diff --git a/quadrants/runtime/cuda/cuda_graph_manager.h b/quadrants/runtime/cuda/cuda_graph_manager.h index 1d99c1ee6f..1df10bb7bc 100644 --- a/quadrants/runtime/cuda/cuda_graph_manager.h +++ b/quadrants/runtime/cuda/cuda_graph_manager.h @@ -32,6 +32,7 @@ struct CachedCudaGraph { RuntimeContext persistent_ctx{}; std::size_t arg_buffer_size{0}; std::size_t result_buffer_size{0}; + std::size_t num_nodes{0}; CachedCudaGraph() = default; ~CachedCudaGraph(); @@ -57,6 +58,7 @@ class CudaGraphManager { // cache_size and used_on_last_call used for tests void mark_not_used() { used_on_last_call_ = false; + num_nodes_on_last_call_ = 0; } std::size_t cache_size() const { return cache_.size(); @@ -64,6 +66,9 @@ class CudaGraphManager { bool used_on_last_call() const { return used_on_last_call_; } + std::size_t num_nodes_on_last_call() const { + return num_nodes_on_last_call_; + } private: bool launch_cached_graph(CachedCudaGraph &cached, LaunchContextBuilder &ctx); @@ -83,6 +88,7 @@ class CudaGraphManager { // (each template specialization gets its own launch_id). 
std::unordered_map cache_; bool used_on_last_call_{false}; + std::size_t num_nodes_on_last_call_{0}; }; } // namespace cuda diff --git a/quadrants/runtime/cuda/kernel_launcher.h b/quadrants/runtime/cuda/kernel_launcher.h index 0027941c42..cd060fccca 100644 --- a/quadrants/runtime/cuda/kernel_launcher.h +++ b/quadrants/runtime/cuda/kernel_launcher.h @@ -31,6 +31,9 @@ class KernelLauncher : public LLVM::KernelLauncher { bool get_cuda_graph_cache_used_on_last_call() const override { return graph_manager_.used_on_last_call(); } + std::size_t get_cuda_graph_num_nodes_on_last_call() const override { + return graph_manager_.num_nodes_on_last_call(); + } private: std::vector contexts_; diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index fcb7a441d6..68ca80e524 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -23,6 +23,10 @@ def _num_offloaded_tasks(): return impl.get_runtime().prog.get_num_offloaded_tasks_on_last_call() +def _cuda_graph_num_nodes(): + return impl.get_runtime().prog.get_cuda_graph_num_nodes_on_last_call() + + @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() def test_cuda_graph_two_loops(tensor_type): @@ -45,9 +49,11 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) assert _num_offloaded_tasks() == 2 + assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) + assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) @@ -83,6 +89,7 @@ def three_loops(a: Annotation, b: Annotation, c: Annotation): assert _cuda_graph_cache_size() == 0 three_loops(a, b, c) assert _num_offloaded_tasks() == 3 + assert 
_cuda_graph_num_nodes() == (3 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -125,6 +132,7 @@ def two_loops(x: Annotation, y: Annotation): two_loops(x, y) assert _num_offloaded_tasks() == 2 + assert _cuda_graph_num_nodes() == 0 assert not _cuda_graph_used() two_loops(x, y) assert not _cuda_graph_used() @@ -157,6 +165,7 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x1, y1) assert _num_offloaded_tasks() == 2 + assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x1, y1) @@ -218,6 +227,7 @@ def add_one(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 add_one(x1, y1) assert _num_offloaded_tasks() == 2 + assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -258,6 +268,7 @@ def add_one(x: Annotation, y: Annotation): y = tensor_type(qd.f32, (n,)) add_one(x, y) assert _num_offloaded_tasks() == 2 + assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph add_one(x, y) @@ -303,6 +314,7 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) assert _num_offloaded_tasks() == 2 + assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) From 470b0731858703e5c4b8b9c62497f2e63525fd2e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 16:23:28 -0400 
Subject: [PATCH 117/128] Add multi-func cuda graph test with 9 offloaded tasks Test a kernel that calls three @qd.func with 2, 4, and 3 top-level for loops respectively, asserting 9 offloaded tasks and 9 graph nodes. Made-with: Cursor --- tests/python/test_cuda_graph.py | 78 +++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 68ca80e524..38e9095faa 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -112,6 +112,84 @@ def three_loops(a: Annotation, b: Annotation, c: Annotation): assert np.allclose(c_np, 22.0) +@pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) +@test_utils.test() +def test_cuda_graph_multi_func(tensor_type): + """A kernel calling three funcs with 2, 4, and 3 top-level for loops.""" + platform_supports_graph = _on_cuda() + n = 256 + + Annotation = qd.types.NDArray[qd.f32, 1] if tensor_type == qd.ndarray else qd.Template + + @qd.func + def func_a(x: Annotation, y: Annotation): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + + @qd.func + def func_b(a: Annotation, b: Annotation, c: Annotation, d: Annotation): + for i in range(a.shape[0]): + a[i] = a[i] + 3.0 + for i in range(b.shape[0]): + b[i] = b[i] + 4.0 + for i in range(c.shape[0]): + c[i] = c[i] + 5.0 + for i in range(d.shape[0]): + d[i] = d[i] + 6.0 + + @qd.func + def func_c(x: Annotation, y: Annotation, z: Annotation): + for i in range(x.shape[0]): + x[i] = x[i] + 7.0 + for i in range(y.shape[0]): + y[i] = y[i] + 8.0 + for i in range(z.shape[0]): + z[i] = z[i] + 9.0 + + @qd.kernel(cuda_graph=True) + def multi_func(a: Annotation, b: Annotation, c: Annotation, d: Annotation, e: Annotation, f: Annotation): + func_a(a, b) + func_b(a, b, c, d) + func_c(d, e, f) + + a = tensor_type(qd.f32, (n,)) + b = tensor_type(qd.f32, (n,)) + c = tensor_type(qd.f32, (n,)) + d = tensor_type(qd.f32, (n,)) + e = 
tensor_type(qd.f32, (n,)) + f = tensor_type(qd.f32, (n,)) + + assert _cuda_graph_cache_size() == 0 + multi_func(a, b, c, d, e, f) + assert _num_offloaded_tasks() == 9 + assert _cuda_graph_num_nodes() == (9 if platform_supports_graph else 0) + assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) + assert _cuda_graph_used() == platform_supports_graph + + # func_a: a += 1, b += 2 + # func_b: a += 3, b += 4, c += 5, d += 6 + # func_c: d += 7, e += 8, f += 9 + assert np.allclose(a.to_numpy(), 4.0) # 1 + 3 + assert np.allclose(b.to_numpy(), 6.0) # 2 + 4 + assert np.allclose(c.to_numpy(), 5.0) + assert np.allclose(d.to_numpy(), 13.0) # 6 + 7 + assert np.allclose(e.to_numpy(), 8.0) + assert np.allclose(f.to_numpy(), 9.0) + + multi_func(a, b, c, d, e, f) + assert _cuda_graph_num_nodes() == (9 if platform_supports_graph else 0) + assert _cuda_graph_used() == platform_supports_graph + + assert np.allclose(a.to_numpy(), 8.0) + assert np.allclose(b.to_numpy(), 12.0) + assert np.allclose(c.to_numpy(), 10.0) + assert np.allclose(d.to_numpy(), 26.0) + assert np.allclose(e.to_numpy(), 16.0) + assert np.allclose(f.to_numpy(), 18.0) + + @pytest.mark.parametrize("tensor_type", [qd.ndarray, qd.field]) @test_utils.test() def test_no_cuda_graph_annotation(tensor_type): From 70aac93c87939b16de83f2d095ba1b45c69dc40d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 17:48:25 -0400 Subject: [PATCH 118/128] Change graph_do_while syntax from decorator param to in-kernel while loop Replace `@qd.kernel(graph_do_while="var")` with explicit `while qd.graph_do_while(var):` inside the kernel body. The AST transformer recognises the pattern and sets the condition arg without emitting a while-loop IR node. No C++ changes needed. 
Made-with: Cursor --- docs/source/user_guide/cuda_graph.md | 43 +++++++++-------- python/quadrants/lang/ast/ast_transformer.py | 30 ++++++++++++ python/quadrants/lang/kernel_impl.py | 36 ++------------ python/quadrants/lang/misc.py | 18 +++++++ tests/python/test_cuda_graph_do_while.py | 50 +++++++++++--------- 5 files changed, 103 insertions(+), 74 deletions(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index d978a05c73..3ef9e15636 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -55,16 +55,17 @@ When different fields are passed as template arguments, each unique combination ## GPU-side iteration with `graph_do_while` -For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. The `graph_do_while` parameter enables this: +For iterative algorithms (physics solvers, convergence loops), you often want to repeat the kernel body until a condition is met, without returning to the host each iteration. Use `while qd.graph_do_while(flag):` inside a `cuda_graph=True` kernel: ```python -@qd.kernel(graph_do_while="counter") +@qd.kernel(cuda_graph=True) def solve(x: qd.types.ndarray(qd.f32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1.0 - for i in range(1): - counter[()] = counter[()] - 1 + while qd.graph_do_while(counter): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(1): + counter[()] = counter[()] - 1 x = qd.ndarray(qd.f32, shape=(N,)) counter = qd.ndarray(qd.i32, shape=()) @@ -73,38 +74,40 @@ solve(x, counter) # x is now incremented 10 times; counter is 0 ``` -The `graph_do_while` value is the name of a scalar `qd.i32` ndarray parameter. The kernel body repeats while this value is non-zero. +The argument to `qd.graph_do_while()` must be the name of a scalar `qd.i32` ndarray parameter. 
The loop body repeats while this value is non-zero. - On SM 9.0+ (Hopper), this uses CUDA conditional while nodes — the entire iteration runs on the GPU with no host involvement. - Older CUDA GPUs, and non-CUDA backends not currently supported. -- `graph_do_while` implicitly enables `cuda_graph=True`. +- Using `qd.graph_do_while()` implicitly enables `cuda_graph=True` if not already set. ### Patterns **Counter-based**: set the counter to N, decrement each iteration. The body runs exactly N times. ```python -@qd.kernel(graph_do_while="counter") +@qd.kernel(cuda_graph=True) def iterate(x: qd.types.ndarray(qd.f32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1.0 - for i in range(1): - counter[()] = counter[()] - 1 + while qd.graph_do_while(counter): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(1): + counter[()] = counter[()] - 1 ``` **Boolean flag**: set a `keep_going` flag to 1, have the kernel set it to 0 when a convergence criterion is met. ```python -@qd.kernel(graph_do_while="keep_going") +@qd.kernel(cuda_graph=True) def converge(x: qd.types.ndarray(qd.f32, ndim=1), keep_going: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - # ... do work ... - pass - for i in range(1): - if some_condition(x): - keep_going[()] = 0 + while qd.graph_do_while(keep_going): + for i in range(x.shape[0]): + # ... do work ... 
+ pass + for i in range(1): + if some_condition(x): + keep_going[()] = 0 ``` ### Do-while semantics diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 1b13ead0f9..ac4a36b831 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -1193,11 +1193,41 @@ def build_For(ctx: ASTTransformerFuncContext, node: ast.For) -> None: # Struct for return ASTTransformer.build_struct_for(ctx, node, is_grouped=False) + @staticmethod + def _is_graph_do_while_call(node: ast.expr) -> str | None: + """If *node* is ``qd.graph_do_while(var)`` return the arg name, else None.""" + if not isinstance(node, ast.Call): + return None + func = node.func + if isinstance(func, ast.Attribute) and func.attr == "graph_do_while": + if len(node.args) == 1 and isinstance(node.args[0], ast.Name): + return node.args[0].id + if isinstance(func, ast.Name) and func.id == "graph_do_while": + if len(node.args) == 1 and isinstance(node.args[0], ast.Name): + return node.args[0].id + return None + @staticmethod def build_While(ctx: ASTTransformerFuncContext, node: ast.While) -> None: if node.orelse: raise QuadrantsSyntaxError("'else' clause for 'while' not supported in Quadrants kernels") + graph_do_while_arg = ASTTransformer._is_graph_do_while_call(node.test) + if graph_do_while_arg is not None: + kernel = ctx.global_context.current_kernel + arg_names = [m.name for m in kernel.arg_metas] + if graph_do_while_arg not in arg_names: + raise QuadrantsSyntaxError( + f"qd.graph_do_while({graph_do_while_arg!r}) does not match any " + f"parameter of kernel {kernel.func.__name__!r}. 
" + f"Available parameters: {arg_names}" + ) + kernel.graph_do_while_arg = graph_do_while_arg + if not kernel.use_cuda_graph: + kernel.use_cuda_graph = True + build_stmts(ctx, node.body) + return None + with ctx.loop_scope_guard(): stmt_dbg_info = _qd_core.DebugInfo(ctx.get_pos_info(node)) ctx.ast_builder.begin_frontend_while(expr.Expr(1, dtype=primitive_types.i32).ptr, stmt_dbg_info) diff --git a/python/quadrants/lang/kernel_impl.py b/python/quadrants/lang/kernel_impl.py index 395d6b1150..b61eb24de6 100644 --- a/python/quadrants/lang/kernel_impl.py +++ b/python/quadrants/lang/kernel_impl.py @@ -128,28 +128,16 @@ def _kernel_impl( level_of_class_stackframe: int, verbose: bool = False, cuda_graph: bool = False, - graph_do_while: str | None = None, ) -> QuadrantsCallable: # Can decorators determine if a function is being defined inside a class? # https://stackoverflow.com/a/8793684/12003165 is_classkernel = _inside_class(level_of_class_stackframe + 1) - if graph_do_while is not None: - cuda_graph = True - if verbose: print(f"kernel={_func.__name__} is_classkernel={is_classkernel}") primal = Kernel(_func, autodiff_mode=_NONE, _is_classkernel=is_classkernel) adjoint = Kernel(_func, autodiff_mode=_REVERSE, _is_classkernel=is_classkernel) primal.use_cuda_graph = cuda_graph - primal.graph_do_while_arg = graph_do_while - if graph_do_while is not None: - arg_names = [m.name for m in primal.arg_metas] - if graph_do_while not in arg_names: - raise ValueError( - f"graph_do_while={graph_do_while!r} does not match any parameter of " - f"kernel {_func.__name__!r}. Available parameters: {arg_names}" - ) # Having |primal| contains |grad| makes the tape work. primal.grad = adjoint @@ -191,9 +179,7 @@ def wrapped_classkernel(*args, **kwargs): @overload # TODO: This callable should be Callable[[F], F]. # See comments below. -def kernel( - _fn: None = None, *, pure: bool = False, cuda_graph: bool = False, graph_do_while: str | None = None -) -> Callable[[Any], Any]: ... 
+def kernel(_fn: None = None, *, pure: bool = False, cuda_graph: bool = False) -> Callable[[Any], Any]: ... # TODO: This next overload should return F, but currently that will cause issues @@ -203,7 +189,7 @@ def kernel( # However, by making it return Any, we can make the pure parameter # change now, without breaking pyright. @overload -def kernel(_fn: Any, *, pure: bool = False, cuda_graph: bool = False, graph_do_while: str | None = None) -> Any: ... +def kernel(_fn: Any, *, pure: bool = False, cuda_graph: bool = False) -> Any: ... def kernel( @@ -212,7 +198,6 @@ def kernel( pure: bool | None = None, fastcache: bool = False, cuda_graph: bool = False, - graph_do_while: str | None = None, ): """ Marks a function as a Quadrants kernel. @@ -227,17 +212,8 @@ def kernel( Args: cuda_graph: If True, kernels with 2+ top-level for loops are captured into a CUDA graph on first launch and replayed on subsequent - launches, reducing per-kernel launch overhead. Non-CUDA backends are not supported currently. - graph_do_while: Name of a scalar ``qd.i32`` ndarray parameter that - controls GPU-side iteration. The kernel body repeats while the - named argument is non-zero. Uses CUDA conditional while nodes - on SM 9.0+ (Hopper). Implicitly enables - ``cuda_graph=True``. - - **Do-while semantics**: the kernel body always executes at least - once before the condition is checked. The flag value must be >= 1 - at launch time. Passing 0 with a kernel that decrements the - counter will result in an infinite loop. + launches, reducing per-kernel launch overhead. Non-CUDA backends + are not supported currently. 
Example:: @@ -257,9 +233,7 @@ def decorator(fn: F, has_kernel_params: bool = True) -> F: else: level = 4 - wrapped = _kernel_impl( - fn, level_of_class_stackframe=level, cuda_graph=cuda_graph, graph_do_while=graph_do_while - ) + wrapped = _kernel_impl(fn, level_of_class_stackframe=level, cuda_graph=cuda_graph) wrapped.is_pure = pure is not None and pure or fastcache if pure is not None: warnings_helper.warn_once( diff --git a/python/quadrants/lang/misc.py b/python/quadrants/lang/misc.py index fa08aad77c..3af8cf7422 100644 --- a/python/quadrants/lang/misc.py +++ b/python/quadrants/lang/misc.py @@ -701,6 +701,23 @@ def copy(): _bit_vectorize() +def graph_do_while(condition) -> bool: + """Marks a while loop as a CUDA graph do-while conditional node. + + Used as ``while qd.graph_do_while(flag):`` inside a + ``@qd.kernel(cuda_graph=True)`` kernel. The loop body repeats while + ``flag`` (a scalar ``qd.i32`` ndarray) is non-zero. + + On SM 9.0+ (Hopper) GPUs this compiles to a native CUDA graph + conditional while node. On other backends or older GPUs it falls + back to a host-side do-while loop. + + This function should not be called directly at runtime; it is + recognised and transformed during AST compilation. + """ + return bool(condition) + + def global_thread_idx(): """Returns the global thread id of this running thread, only available for cpu and cuda backends. 
@@ -837,6 +854,7 @@ def dump_compile_config() -> None: "python", "vulkan", "extension", + "graph_do_while", "loop_config", "global_thread_idx", "assume_in_range", diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index f5d4bc4bda..0e7108305f 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -26,12 +26,13 @@ def test_graph_do_while_counter(): pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") N = 64 - @qd.kernel(graph_do_while="counter") + @qd.kernel(cuda_graph=True) def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - counter[()] = counter[()] - 1 + while qd.graph_do_while(counter): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + counter[()] = counter[()] - 1 x = qd.ndarray(qd.i32, shape=(N,)) counter = qd.ndarray(qd.i32, shape=()) @@ -64,17 +65,18 @@ def test_graph_do_while_boolean_done(): pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") N = 64 - @qd.kernel(graph_do_while="keep_going") + @qd.kernel(cuda_graph=True) def increment_until_threshold( x: qd.types.ndarray(qd.i32, ndim=1), threshold: qd.i32, keep_going: qd.types.ndarray(qd.i32, ndim=0), ): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - if x[0] >= threshold: - keep_going[()] = 0 + while qd.graph_do_while(keep_going): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + if x[0] >= threshold: + keep_going[()] = 0 x = qd.ndarray(qd.i32, shape=(N,)) keep_going = qd.ndarray(qd.i32, shape=()) @@ -107,18 +109,19 @@ def test_graph_do_while_multiple_loops(): pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") N = 32 - @qd.kernel(graph_do_while="counter") + @qd.kernel(cuda_graph=True) def multi_loop( x: qd.types.ndarray(qd.f32, ndim=1), y: qd.types.ndarray(qd.f32, ndim=1), counter: qd.types.ndarray(qd.i32, ndim=0), ): - 
for i in range(x.shape[0]): - x[i] = x[i] + 1.0 - for i in range(y.shape[0]): - y[i] = y[i] + 2.0 - for i in range(1): - counter[()] = counter[()] - 1 + while qd.graph_do_while(counter): + for i in range(x.shape[0]): + x[i] = x[i] + 1.0 + for i in range(y.shape[0]): + y[i] = y[i] + 2.0 + for i in range(1): + counter[()] = counter[()] - 1 x = qd.ndarray(qd.f32, shape=(N,)) y = qd.ndarray(qd.f32, shape=(N,)) @@ -155,12 +158,13 @@ def test_graph_do_while_changed_condition_ndarray_raises(): if not _on_hopper(): pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)") - @qd.kernel(graph_do_while="c") + @qd.kernel(cuda_graph=True) def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): - for i in range(x.shape[0]): - x[i] = x[i] + 1 - for i in range(1): - c[()] = c[()] - 1 + while qd.graph_do_while(c): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + for i in range(1): + c[()] = c[()] - 1 x = qd.ndarray(qd.i32, shape=(4,)) c1 = qd.ndarray(qd.i32, shape=()) From 791bab4137e632c4f2bd8b6df26192754bf12869 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 17:54:05 -0400 Subject: [PATCH 119/128] Remove implicit cuda_graph=True note from docs Made-with: Cursor --- docs/source/user_guide/cuda_graph.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/user_guide/cuda_graph.md b/docs/source/user_guide/cuda_graph.md index 3ef9e15636..9071260f8c 100644 --- a/docs/source/user_guide/cuda_graph.md +++ b/docs/source/user_guide/cuda_graph.md @@ -78,7 +78,6 @@ The argument to `qd.graph_do_while()` must be the name of a scalar `qd.i32` ndar - On SM 9.0+ (Hopper), this uses CUDA conditional while nodes — the entire iteration runs on the GPU with no host involvement. - Older CUDA GPUs, and non-CUDA backends not currently supported. -- Using `qd.graph_do_while()` implicitly enables `cuda_graph=True` if not already set. 
### Patterns From ee9178cbdf477e5d3c62bc8ea9c9f8ce25980f26 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 17:56:50 -0400 Subject: [PATCH 120/128] Require cuda_graph=True for graph_do_while instead of implicitly enabling it Made-with: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index ac4a36b831..1d9fe0c59b 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -1222,9 +1222,11 @@ def build_While(ctx: ASTTransformerFuncContext, node: ast.While) -> None: f"parameter of kernel {kernel.func.__name__!r}. " f"Available parameters: {arg_names}" ) - kernel.graph_do_while_arg = graph_do_while_arg if not kernel.use_cuda_graph: - kernel.use_cuda_graph = True + raise QuadrantsSyntaxError( + "qd.graph_do_while() requires @qd.kernel(cuda_graph=True)" + ) + kernel.graph_do_while_arg = graph_do_while_arg build_stmts(ctx, node.body) return None From 662337516d667f79c5bc67ecc119f697511d3de9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 17:58:11 -0400 Subject: [PATCH 121/128] Update graph_do_while docstring to reflect SM 9.0+ only support Made-with: Cursor --- python/quadrants/lang/misc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/misc.py b/python/quadrants/lang/misc.py index 3af8cf7422..a40c2dfe0f 100644 --- a/python/quadrants/lang/misc.py +++ b/python/quadrants/lang/misc.py @@ -709,11 +709,12 @@ def graph_do_while(condition) -> bool: ``flag`` (a scalar ``qd.i32`` ndarray) is non-zero. On SM 9.0+ (Hopper) GPUs this compiles to a native CUDA graph - conditional while node. On other backends or older GPUs it falls - back to a host-side do-while loop. + conditional while node. Older CUDA GPUs and non-CUDA backends + are not currently supported. 
This function should not be called directly at runtime; it is recognised and transformed during AST compilation. + Requires ``@qd.kernel(cuda_graph=True)``. """ return bool(condition) From b2d375e38f4a8c0678c99a49ee7b100ae1ac5570 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 18:00:07 -0400 Subject: [PATCH 122/128] Add tests for graph_do_while syntax errors Test that using qd.graph_do_while() without cuda_graph=True and with a non-existent parameter name both raise QuadrantsSyntaxError. Made-with: Cursor --- tests/python/test_cuda_graph_do_while.py | 34 ++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/python/test_cuda_graph_do_while.py b/tests/python/test_cuda_graph_do_while.py index 0e7108305f..59cc30a29a 100644 --- a/tests/python/test_cuda_graph_do_while.py +++ b/tests/python/test_cuda_graph_do_while.py @@ -175,3 +175,37 @@ def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): c2.from_numpy(np.array(1, dtype=np.int32)) with pytest.raises(RuntimeError, match="condition ndarray changed"): k(x, c2) + + +@test_utils.test() +def test_graph_do_while_without_cuda_graph_raises(): + """Using qd.graph_do_while without cuda_graph=True should raise.""" + + @qd.kernel + def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): + while qd.graph_do_while(c): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + + x = qd.ndarray(qd.i32, shape=(4,)) + c = qd.ndarray(qd.i32, shape=()) + c.from_numpy(np.array(1, dtype=np.int32)) + with pytest.raises(qd.QuadrantsSyntaxError, match="requires @qd.kernel\\(cuda_graph=True\\)"): + k(x, c) + + +@test_utils.test() +def test_graph_do_while_nonexistent_arg_raises(): + """Using a variable name that isn't a kernel parameter should raise.""" + + @qd.kernel(cuda_graph=True) + def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)): + while qd.graph_do_while(nonexistent): + for i in range(x.shape[0]): + x[i] = x[i] + 1 + + x = 
qd.ndarray(qd.i32, shape=(4,)) + c = qd.ndarray(qd.i32, shape=()) + c.from_numpy(np.array(1, dtype=np.int32)) + with pytest.raises(qd.QuadrantsSyntaxError, match="does not match any parameter"): + k(x, c) From 6748e1747c134dcedf19bdfb051467f99252bffe Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 18:03:24 -0400 Subject: [PATCH 123/128] Apply black formatting to ast_transformer.py Made-with: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 1d9fe0c59b..79c639761e 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -1223,9 +1223,7 @@ def build_While(ctx: ASTTransformerFuncContext, node: ast.While) -> None: f"Available parameters: {arg_names}" ) if not kernel.use_cuda_graph: - raise QuadrantsSyntaxError( - "qd.graph_do_while() requires @qd.kernel(cuda_graph=True)" - ) + raise QuadrantsSyntaxError("qd.graph_do_while() requires @qd.kernel(cuda_graph=True)") kernel.graph_do_while_arg = graph_do_while_arg build_stmts(ctx, node.body) return None From 2996cb968309c74cc2d79c371e443ef52d316e06 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 18:14:13 -0400 Subject: [PATCH 124/128] Fix offloaded tasks assertions to use >= for x64 ndarray compatibility The LLVM x64 backend generates extra tasks per ndarray argument for serialization/setup, so exact equality checks fail. Use >= instead. 
Made-with: Cursor --- tests/python/test_cuda_graph.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index 38e9095faa..ab701d1254 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -48,7 +48,7 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) - assert _num_offloaded_tasks() == 2 + assert _num_offloaded_tasks() >= 2 assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -88,7 +88,7 @@ def three_loops(a: Annotation, b: Annotation, c: Annotation): assert _cuda_graph_cache_size() == 0 three_loops(a, b, c) - assert _num_offloaded_tasks() == 3 + assert _num_offloaded_tasks() >= 3 assert _cuda_graph_num_nodes() == (3 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -163,7 +163,7 @@ def multi_func(a: Annotation, b: Annotation, c: Annotation, d: Annotation, e: An assert _cuda_graph_cache_size() == 0 multi_func(a, b, c, d, e, f) - assert _num_offloaded_tasks() == 9 + assert _num_offloaded_tasks() >= 9 assert _cuda_graph_num_nodes() == (9 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -209,7 +209,7 @@ def two_loops(x: Annotation, y: Annotation): y = tensor_type(qd.f32, (n,)) two_loops(x, y) - assert _num_offloaded_tasks() == 2 + assert _num_offloaded_tasks() >= 2 assert _cuda_graph_num_nodes() == 0 assert not _cuda_graph_used() two_loops(x, y) @@ -242,7 +242,7 @@ def two_loops(x: Annotation, y: Annotation): y1 = tensor_type(qd.f32, (n,)) assert _cuda_graph_cache_size() == 0 two_loops(x1, y1) - assert 
_num_offloaded_tasks() == 2 + assert _num_offloaded_tasks() >= 2 assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -304,7 +304,7 @@ def add_one(x: Annotation, y: Annotation): y1 = tensor_type(qd.f32, (256,)) assert _cuda_graph_cache_size() == 0 add_one(x1, y1) - assert _num_offloaded_tasks() == 2 + assert _num_offloaded_tasks() >= 2 assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -345,7 +345,7 @@ def add_one(x: Annotation, y: Annotation): x = tensor_type(qd.f32, (n,)) y = tensor_type(qd.f32, (n,)) add_one(x, y) - assert _num_offloaded_tasks() == 2 + assert _num_offloaded_tasks() >= 2 assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -391,7 +391,7 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) - assert _num_offloaded_tasks() == 2 + assert _num_offloaded_tasks() >= 2 assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph From b39c3a9e6d040df65cf3bc210764f44cc3574370 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 16:01:49 -0700 Subject: [PATCH 125/128] Fix cuda graph tests: derive expected node count from offloaded tasks Ndarray kernels can produce additional serial tasks beyond the user-visible loops, so hardcoding expected node counts breaks. Use the actual num_offloaded_tasks instead. 
--- tests/python/test_cuda_graph.py | 46 +++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/tests/python/test_cuda_graph.py b/tests/python/test_cuda_graph.py index ab701d1254..3d4878ab3d 100644 --- a/tests/python/test_cuda_graph.py +++ b/tests/python/test_cuda_graph.py @@ -48,12 +48,14 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) - assert _num_offloaded_tasks() >= 2 - assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) + num_tasks = _num_offloaded_tasks() + assert num_tasks >= 2 + expected_nodes = num_tasks if platform_supports_graph else 0 + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) - assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) @@ -88,8 +90,10 @@ def three_loops(a: Annotation, b: Annotation, c: Annotation): assert _cuda_graph_cache_size() == 0 three_loops(a, b, c) - assert _num_offloaded_tasks() >= 3 - assert _cuda_graph_num_nodes() == (3 if platform_supports_graph else 0) + num_tasks = _num_offloaded_tasks() + assert num_tasks >= 3 + expected_nodes = num_tasks if platform_supports_graph else 0 + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -163,8 +167,10 @@ def multi_func(a: Annotation, b: Annotation, c: Annotation, d: Annotation, e: An assert _cuda_graph_cache_size() == 0 multi_func(a, b, c, d, e, f) - assert _num_offloaded_tasks() >= 9 - assert _cuda_graph_num_nodes() == (9 if platform_supports_graph else 0) + num_tasks = _num_offloaded_tasks() + 
assert num_tasks >= 9 + expected_nodes = num_tasks if platform_supports_graph else 0 + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -179,7 +185,7 @@ def multi_func(a: Annotation, b: Annotation, c: Annotation, d: Annotation, e: An assert np.allclose(f.to_numpy(), 9.0) multi_func(a, b, c, d, e, f) - assert _cuda_graph_num_nodes() == (9 if platform_supports_graph else 0) + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_used() == platform_supports_graph assert np.allclose(a.to_numpy(), 8.0) @@ -242,8 +248,10 @@ def two_loops(x: Annotation, y: Annotation): y1 = tensor_type(qd.f32, (n,)) assert _cuda_graph_cache_size() == 0 two_loops(x1, y1) - assert _num_offloaded_tasks() >= 2 - assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) + num_tasks = _num_offloaded_tasks() + assert num_tasks >= 2 + expected_nodes = num_tasks if platform_supports_graph else 0 + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x1, y1) @@ -304,8 +312,10 @@ def add_one(x: Annotation, y: Annotation): y1 = tensor_type(qd.f32, (256,)) assert _cuda_graph_cache_size() == 0 add_one(x1, y1) - assert _num_offloaded_tasks() >= 2 - assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) + num_tasks = _num_offloaded_tasks() + assert num_tasks >= 2 + expected_nodes = num_tasks if platform_supports_graph else 0 + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph @@ -345,8 +355,10 @@ def add_one(x: Annotation, y: Annotation): x = tensor_type(qd.f32, (n,)) y = tensor_type(qd.f32, (n,)) add_one(x, y) - assert _num_offloaded_tasks() >= 2 - assert _cuda_graph_num_nodes() 
== (2 if platform_supports_graph else 0) + num_tasks = _num_offloaded_tasks() + assert num_tasks >= 2 + expected_nodes = num_tasks if platform_supports_graph else 0 + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph add_one(x, y) @@ -391,8 +403,10 @@ def two_loops(x: Annotation, y: Annotation): assert _cuda_graph_cache_size() == 0 two_loops(x, y) - assert _num_offloaded_tasks() >= 2 - assert _cuda_graph_num_nodes() == (2 if platform_supports_graph else 0) + num_tasks = _num_offloaded_tasks() + assert num_tasks >= 2 + expected_nodes = num_tasks if platform_supports_graph else 0 + assert _cuda_graph_num_nodes() == expected_nodes assert _cuda_graph_cache_size() == (1 if platform_supports_graph else 0) assert _cuda_graph_used() == platform_supports_graph two_loops(x, y) From 349756099336d602a5a201133f1fda9ce88c5534 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 14 Mar 2026 17:51:36 -0700 Subject: [PATCH 126/128] Add graph_do_while to public API test list --- tests/python/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/test_api.py b/tests/python/test_api.py index cf12abc393..9b931488e9 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -140,6 +140,7 @@ def _get_expected_matrix_apis(): "get_addr", "global_thread_idx", "gpu", + "graph_do_while", "grouped", "i", "i16", From 1bdd202ae96c843b5c72b6da7c105a5002797229 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 16 Mar 2026 09:28:08 -0700 Subject: [PATCH 127/128] Fix end-of-file newline in env.sh --- env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/env.sh b/env.sh index 342d019a1b..6efb14a262 100644 --- a/env.sh +++ b/env.sh @@ -8,4 +8,4 @@ export SCCACHE_DIR="/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/cache" export SCCACHE_CACHE_SIZE="40G" export SCCACHE_STARTUP_TIMEOUT="30" export SCCACHE_IDLE_TIMEOUT="0" 
-export QUADRANTS_CMAKE_ARGS="-DCLANG_EXECUTABLE=/usr/bin/clang-20 -DCMAKE_CXX_COMPILER_WORKS=1 -DCMAKE_C_COMPILER_LAUNCHER=/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/bin/sccache -DCMAKE_CXX_COMPILER_LAUNCHER=/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/bin/sccache" \ No newline at end of file +export QUADRANTS_CMAKE_ARGS="-DCLANG_EXECUTABLE=/usr/bin/clang-20 -DCMAKE_CXX_COMPILER_WORKS=1 -DCMAKE_C_COMPILER_LAUNCHER=/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/bin/sccache -DCMAKE_CXX_COMPILER_LAUNCHER=/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/bin/sccache" From df0f753acb12670fc449da62c659cd1fb9d0265d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 16 Mar 2026 10:43:09 -0700 Subject: [PATCH 128/128] Remove env.sh from git and add to .gitignore env.sh is generated by ./build.py and should not be tracked. --- .gitignore | 1 + env.sh | 11 ----------- 2 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 env.sh diff --git a/.gitignore b/.gitignore index 6d05d1ed47..0aaf63e31c 100644 --- a/.gitignore +++ b/.gitignore @@ -96,3 +96,4 @@ imgui.ini stubs/ CHANGELOG.md python/quadrants/_version.py +env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index 6efb14a262..0000000000 --- a/env.sh +++ /dev/null @@ -1,11 +0,0 @@ -export LLVM_DIR="/home/hugh/.cache/ti-build-cache/llvm-22.1.0-x86-202603120808" -export VULKAN_SDK="/home/hugh/.cache/ti-build-cache/vulkan-1.4.321.1/x86_64" -export PATH="/home/hugh/.cache/ti-build-cache/vulkan-1.4.321.1/x86_64/bin:/home/hugh/ais/seagull/quadrants/.venv/bin:/home/hugh/.local/bin:/home/hugh/bin:/home/hugh/.local/bin:/home/hugh/bin:/home/hugh/.local/bin:/home/hugh/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin" -export LD_LIBRARY_PATH="/home/hugh/.cache/ti-build-cache/vulkan-1.4.321.1/x86_64/lib" -export VK_LAYER_PATH="/home/hugh/.cache/ti-build-cache/vulkan-1.4.321.1/x86_64/share/vulkan/explicit_layer.d" -export SCCACHE_LOG="error" 
-export SCCACHE_DIR="/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/cache" -export SCCACHE_CACHE_SIZE="40G" -export SCCACHE_STARTUP_TIMEOUT="30" -export SCCACHE_IDLE_TIMEOUT="0" -export QUADRANTS_CMAKE_ARGS="-DCLANG_EXECUTABLE=/usr/bin/clang-20 -DCMAKE_CXX_COMPILER_WORKS=1 -DCMAKE_C_COMPILER_LAUNCHER=/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/bin/sccache -DCMAKE_CXX_COMPILER_LAUNCHER=/home/hugh/.cache/ti-build-cache/sccache-v0-10-0/bin/sccache"