Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ void AMDGPUContext::launch(void *func,
}

AMDGPUContext::~AMDGPUContext() {
for (auto *s : stream_pool_) {
driver_.stream_destroy(s);
}
stream_pool_.clear();
if (context_) {
driver_.device_primary_ctx_release(device_);
}
Expand Down
18 changes: 18 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class AMDGPUContext {
AMDGPUDriver &driver_;
bool debug_{false};
static thread_local void *stream_;
std::vector<void *> stream_pool_;
std::vector<void *> kernel_arg_pointer_;

public:
Expand Down Expand Up @@ -125,6 +126,23 @@ class AMDGPUContext {
return stream_;
}

void *acquire_stream() {
std::lock_guard<std::mutex> _(lock_);
if (!stream_pool_.empty()) {
auto s = stream_pool_.back();
stream_pool_.pop_back();
return s;
}
void *s = nullptr;
AMDGPUDriver::get_instance().stream_create(&s, 0);
return s;
}

// Returns a stream previously obtained from acquire_stream() to the pool
// so a later acquire can reuse it instead of creating a new one.
void release_stream(void *s) {
  std::lock_guard<std::mutex> guard(lock_);
  stream_pool_.emplace_back(s);
}

static AMDGPUContext &get_instance();
};

Expand Down
11 changes: 4 additions & 7 deletions quadrants/rhi/cuda/cuda_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,13 +180,10 @@ void CUDAContext::launch(void *func,
}

CUDAContext::~CUDAContext() {
  // Destroy every stream still sitting in the pool. Streams are created
  // lazily by acquire_stream() and handed back via release_stream(), so
  // at destruction time the pool owns all of them.
  // (Stale commented-out teardown of the context buffer / modules / context
  // was removed; it was dead code behind a TODO.)
  for (auto *s : stream_pool_) {
    driver_.stream_destroy(s);
  }
  stream_pool_.clear();
}

CUDAContext &CUDAContext::get_instance() {
Expand Down
19 changes: 19 additions & 0 deletions quadrants/rhi/cuda/cuda_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <mutex>
#include <unordered_map>
#include <thread>
#include <vector>

#include "quadrants/program/kernel_profiler.h"
#include "quadrants/rhi/cuda/cuda_driver.h"
Expand Down Expand Up @@ -31,6 +32,7 @@ class CUDAContext {
bool debug_;
bool supports_mem_pool_;
static thread_local void *stream_;
std::vector<void *> stream_pool_;

public:
CUDAContext();
Expand Down Expand Up @@ -120,6 +122,23 @@ class CUDAContext {
// Returns the stream currently bound to the calling thread (stream_ is
// thread_local), or nullptr if none has been set for this thread.
void *get_stream() const {
  return stream_;
}

void *acquire_stream() {
std::lock_guard<std::mutex> _(lock_);
if (!stream_pool_.empty()) {
auto s = stream_pool_.back();
stream_pool_.pop_back();
return s;
}
void *s = nullptr;
CUDADriver::get_instance().stream_create(&s, 0);
return s;
}

// Returns a stream previously obtained from acquire_stream() to the pool
// so a later acquire can reuse it instead of creating a new one.
void release_stream(void *s) {
  std::lock_guard<std::mutex> guard(lock_);
  stream_pool_.emplace_back(s);
}
};

} // namespace quadrants::lang
8 changes: 2 additions & 6 deletions quadrants/runtime/amdgpu/kernel_launcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
i++;
}

// Create one stream per unique group ID. Streams are created/destroyed
// per launch; a stream pool could reduce overhead for hot loops.
std::map<int, void *> stream_by_id;
for (size_t j = group_start; j < i; j++) {
int sid = offloaded_tasks[j].stream_parallel_group_id;
if (stream_by_id.find(sid) == stream_by_id.end()) {
void *s = nullptr;
AMDGPUDriver::get_instance().stream_create(&s, 0);
stream_by_id[sid] = s;
stream_by_id[sid] = AMDGPUContext::get_instance().acquire_stream();
}
}

Expand All @@ -155,7 +151,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
AMDGPUDriver::get_instance().stream_synchronize(s);
}
for (auto &[sid, s] : stream_by_id) {
AMDGPUDriver::get_instance().stream_destroy(s);
AMDGPUContext::get_instance().release_stream(s);
}

AMDGPUContext::get_instance().set_stream(active_stream);
Expand Down
8 changes: 2 additions & 6 deletions quadrants/runtime/cuda/kernel_launcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,15 +159,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
i++;
}

// Create one stream per unique group ID. Streams are created/destroyed
// per launch; a stream pool could reduce overhead for hot loops.
std::map<int, void *> stream_by_id;
for (size_t j = group_start; j < i; j++) {
int sid = offloaded_tasks[j].stream_parallel_group_id;
if (stream_by_id.find(sid) == stream_by_id.end()) {
void *s = nullptr;
CUDADriver::get_instance().stream_create(&s, 0);
stream_by_id[sid] = s;
stream_by_id[sid] = CUDAContext::get_instance().acquire_stream();
}
}

Expand All @@ -187,7 +183,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
CUDADriver::get_instance().stream_synchronize(s);
}
for (auto &[sid, s] : stream_by_id) {
CUDADriver::get_instance().stream_destroy(s);
CUDAContext::get_instance().release_stream(s);
}

CUDAContext::get_instance().set_stream(active_stream);
Expand Down
28 changes: 28 additions & 0 deletions tests/python/test_streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,3 +419,31 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)):
s.synchronize()
assert np.allclose(arr.to_numpy(), 99.0)
s.destroy()


@test_utils.test()
def test_stream_pool_reuse():
    """Repeated stream_parallel invocations reuse pooled streams correctly."""
    N = 128
    a = qd.ndarray(qd.f32, shape=(N,))
    b = qd.ndarray(qd.f32, shape=(N,))

    @qd.kernel
    def parallel_fill(
        x: qd.types.ndarray(dtype=qd.f32, ndim=1),
        y: qd.types.ndarray(dtype=qd.f32, ndim=1),
        val: qd.f32,
    ):
        # Two separate stream_parallel regions in a single kernel: each
        # launch acquires stream(s) from the context's pool and releases
        # them afterwards (see acquire_stream/release_stream in the RHI).
        with qd.stream_parallel():
            for i in range(N):
                x[i] = val
        with qd.stream_parallel():
            for i in range(N):
                y[i] = val * 2.0

    # Launch repeatedly: the first iteration creates streams; later
    # iterations should recycle them from the pool. Each iteration uses a
    # distinct fill value so stale results from a prior launch would be
    # detected by the assertions below.
    for iteration in range(5):
        v = float(iteration + 1)
        parallel_fill(a, b, v)
        qd.sync()
        assert np.allclose(a.to_numpy(), v), f"iteration {iteration}"
        assert np.allclose(b.to_numpy(), v * 2.0), f"iteration {iteration}"
Loading