From 5cd04851c504273f475193dff83da4123ec7bacd Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Tue, 31 Mar 2026 16:29:05 +0000
Subject: [PATCH 1/5] [Realtime] Add Hololink bridge support for HOST_LOOP
 graph dispatch Extends the Hololink bridge skeleton and host dispatcher to
 support CPU-launched CUDA graph dispatch (CUDAQ_DISPATCH_PATH_HOST) over
 Hololink RDMA.

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 .../daemon/bridge/hololink/hololink_wrapper.h |   9 ++
 .../daemon/dispatcher/cudaq_realtime.h        |   6 +
 .../cudaq/realtime/hololink_bridge_common.h   | 141 ++++++++++++++----
 .../lib/daemon/bridge/hololink/CMakeLists.txt |   9 --
 .../bridge/hololink/hololink_wrapper.cpp      |   7 +
 .../lib/daemon/dispatcher/host_dispatcher.cu  |  25 +++-
 realtime/unittests/utils/CMakeLists.txt       |   9 --
 realtime/unittests/utils/hololink_test.sh     |   8 +
 8 files changed, 167 insertions(+), 47 deletions(-)

diff --git a/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h
index 73264287d7e..d4dc766fd34 100644
--- a/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h
+++ b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h
@@ -72,6 +72,15 @@ void hololink_close(hololink_transceiver_t handle);
  */
 void hololink_blocking_monitor(hololink_transceiver_t handle);
 
+/**
+ * Force CPU+GPU accessible allocation for ring flags and data only.
+ * Does NOT change CQ/QP UMEMs or TX kernel handler (BlueFlame preserved).
+ * Must be called after create and before start().
+ * Required when a CPU thread needs to read ring flags/data directly
+ * (e.g. HOST_LOOP dispatcher on Grace-Blackwell dGPU).
+ */
+void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable);
+
 //==============================================================================
 // QP information (for RDMA setup)
 //==============================================================================
diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h
index dcee4cddc9b..4fbdc00ca9a 100644
--- a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h
+++ b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h
@@ -102,6 +102,12 @@ typedef struct {
   cudaq_kernel_type_t kernel_type;     // regular/cooperative kernel
   cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch
   cudaq_dispatch_path_t dispatch_path; // GPU kernel or CPU host loop
+  int skip_tx_markers; // when non-zero, the host dispatcher will NOT write
+                       // sentinel markers (CUDAQ_TX_FLAG_IN_FLIGHT) to
+                       // tx_flags before graph launch.  Set this when an
+                       // external GPU kernel (e.g. Hololink TX) polls the
+                       // same tx_flags array; the sentinel would be
+                       // misinterpreted as a valid address.
 } cudaq_dispatcher_config_t;
 
 // GPU ring buffer pointers. For device backend use device pointers only.
diff --git a/realtime/include/cudaq/realtime/hololink_bridge_common.h b/realtime/include/cudaq/realtime/hololink_bridge_common.h
index dba8eb5fa68..f4c2c6d95ae 100644
--- a/realtime/include/cudaq/realtime/hololink_bridge_common.h
+++ b/realtime/include/cudaq/realtime/hololink_bridge_common.h
@@ -136,6 +136,24 @@ struct BridgeConfig {
   /// Default: cudaq_launch_dispatch_kernel_regular
   cudaq_dispatch_launch_fn_t launch_fn = nullptr;
 
+  // HOST_LOOP graph launch mode -- CPU-side dispatcher that polls Hololink
+  // ring flags and launches CUDA graphs.  Requires a Grace-based system
+  // (Grace-Hopper / DGX Spark, Grace-Blackwell / GB200) where GPU memory is
+  // CPU-accessible via NVLink-C2C, since the HOST_LOOP thread reads DOCA
+  // GPU ring flags directly from the CPU.
+  cudaq_dispatch_path_t dispatch_path = CUDAQ_DISPATCH_PATH_DEVICE;
+
+  /// Host-side function table for GRAPH_LAUNCH entries (HOST_LOOP only).
+  /// Each entry must have dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH
+  /// and handler.graph_exec set to a valid cudaGraphExec_t.
+  cudaq_function_entry_t *h_function_entries = nullptr;
+  size_t h_func_count = 0;
+
+  /// Pinned mailbox for HOST_LOOP graph dispatch (from graph_resources).
+  /// h_mailbox is the host pointer, d_mailbox is the device-mapped view.
+  void **h_mailbox = nullptr;
+  void **d_mailbox = nullptr;
+
   /// @brief Optional cleanup callback invoked during shutdown.
   std::function<void()> cleanup_fn;
 };
@@ -247,8 +265,9 @@ inline int bridge_run(BridgeConfig &config) {
   bool is_igpu = (prop.integrated != 0);
   bool unified_igpu = config.unified && is_igpu;
 
+  bool is_host_loop = (config.dispatch_path == CUDAQ_DISPATCH_PATH_HOST);
   bool use_forward = config.forward || (config.unified && !is_igpu);
-  bool use_3kernel = !config.forward && !config.unified;
+  bool use_3kernel = is_host_loop || (!config.forward && !config.unified);
 
   hololink_transceiver_t transceiver = hololink_create_transceiver(
       config.device.c_str(), 1, // ib_port
@@ -266,6 +285,10 @@ inline int bridge_run(BridgeConfig &config) {
     return 1;
   }
 
+  // HOST_LOOP needs CPU-readable ring flags and data; allocate as CPU_GPU.
+  if (is_host_loop)
+    hololink_set_cpu_ring_buffers(transceiver, 1);
+
   std::cout << "  Connecting to remote QP 0x" << std::hex << config.remote_qp
             << std::dec << " at " << config.peer_ip << "..." << std::endl;
 
@@ -336,7 +359,7 @@ inline int bridge_run(BridgeConfig &config) {
   hololink_doca_transport_ctx unified_ctx{};
 
   if (!config.forward) {
-    if (!config.unified) {
+    if (!config.unified && !is_host_loop) {
       int dispatch_blocks = 0;
       cudaError_t occ_err;
       if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) {
@@ -358,8 +381,10 @@ inline int bridge_run(BridgeConfig &config) {
     // [4] Wire dispatch kernel
     //==========================================================================
     std::cout << "\n[4/5] Wiring dispatch kernel ("
-              << (config.unified ? "unified" : "3-kernel") << ")..."
-              << std::endl;
+              << (is_host_loop     ? "host-loop"
+                  : config.unified ? "unified"
+                                   : "3-kernel")
+              << ")..." << std::endl;
 
     void *tmp_shutdown = nullptr;
     BRIDGE_CUDA_CHECK(
@@ -374,7 +399,14 @@ inline int bridge_run(BridgeConfig &config) {
     BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast<int *>(d_shutdown_flag), &zero,
                                  sizeof(int), cudaMemcpyHostToDevice));
 
-    BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t)));
+    if (is_host_loop) {
+      void *tmp_stats = nullptr;
+      BRIDGE_CUDA_CHECK(
+          cudaHostAlloc(&tmp_stats, sizeof(uint64_t), cudaHostAllocMapped));
+      d_stats = static_cast<uint64_t *>(tmp_stats);
+    } else {
+      BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t)));
+    }
     BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t)));
 
     if (cudaq_dispatch_manager_create(&manager) != CUDAQ_OK) {
@@ -385,20 +417,31 @@ inline int bridge_run(BridgeConfig &config) {
     cudaq_dispatcher_config_t dconfig{};
     dconfig.device_id = config.gpu_id;
     dconfig.vp_id = 0;
-    dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;
-
-    if (config.unified) {
-      dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED;
-      dconfig.num_blocks = 1;
-      dconfig.threads_per_block = 1;
-      dconfig.num_slots = 0;
-      dconfig.slot_size = 0;
-    } else {
-      dconfig.kernel_type = config.kernel_type;
-      dconfig.num_blocks = config.num_blocks;
-      dconfig.threads_per_block = config.threads_per_block;
+
+    if (is_host_loop) {
+      dconfig.dispatch_path = CUDAQ_DISPATCH_PATH_HOST;
+      dconfig.dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH;
       dconfig.num_slots = static_cast<uint32_t>(config.num_pages);
       dconfig.slot_size = static_cast<uint32_t>(config.page_size);
+      // Hololink TX kernel polls tx_flags for ready data; writing sentinel
+      // markers (0xEEEE) would be misinterpreted as a valid TX buffer address.
+      dconfig.skip_tx_markers = 1;
+    } else {
+      dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;
+
+      if (config.unified) {
+        dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED;
+        dconfig.num_blocks = 1;
+        dconfig.threads_per_block = 1;
+        dconfig.num_slots = 0;
+        dconfig.slot_size = 0;
+      } else {
+        dconfig.kernel_type = config.kernel_type;
+        dconfig.num_blocks = config.num_blocks;
+        dconfig.threads_per_block = config.threads_per_block;
+        dconfig.num_slots = static_cast<uint32_t>(config.num_pages);
+        dconfig.slot_size = static_cast<uint32_t>(config.page_size);
+      }
     }
 
     if (cudaq_dispatcher_create(manager, &dconfig, &dispatcher) != CUDAQ_OK) {
@@ -406,7 +449,46 @@ inline int bridge_run(BridgeConfig &config) {
       return 1;
     }
 
-    if (config.unified) {
+    if (is_host_loop) {
+      // HOST_LOOP: wire ringbuffer with Hololink GPU pointers as both
+      // device and host views.  On Grace-based systems (Grace-Hopper,
+      // Grace-Blackwell), GPU memory is CPU-accessible via NVLink-C2C.
+      cudaq_ringbuffer_t ringbuffer{};
+      ringbuffer.rx_flags = reinterpret_cast<volatile uint64_t *>(rx_ring_flag);
+      ringbuffer.tx_flags = reinterpret_cast<volatile uint64_t *>(tx_ring_flag);
+      ringbuffer.rx_data = rx_ring_data;
+      ringbuffer.tx_data = tx_ring_data;
+      ringbuffer.rx_stride_sz = config.page_size;
+      ringbuffer.tx_stride_sz = config.page_size;
+      ringbuffer.rx_flags_host =
+          reinterpret_cast<volatile uint64_t *>(rx_ring_flag);
+      ringbuffer.tx_flags_host =
+          reinterpret_cast<volatile uint64_t *>(tx_ring_flag);
+      ringbuffer.rx_data_host = rx_ring_data;
+      ringbuffer.tx_data_host = tx_ring_data;
+
+      if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) !=
+          CUDAQ_OK) {
+        std::cerr << "ERROR: Failed to set ringbuffer" << std::endl;
+        return 1;
+      }
+
+      cudaq_function_table_t table{};
+      table.entries = config.h_function_entries;
+      table.count = static_cast<uint32_t>(config.h_func_count);
+      if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
+        std::cerr << "ERROR: Failed to set function table" << std::endl;
+        return 1;
+      }
+
+      if (config.h_mailbox) {
+        if (cudaq_dispatcher_set_mailbox(dispatcher, config.h_mailbox) !=
+            CUDAQ_OK) {
+          std::cerr << "ERROR: Failed to set mailbox" << std::endl;
+          return 1;
+        }
+      }
+    } else if (config.unified) {
       // Pack DOCA transport handles into the opaque context
       unified_ctx.gpu_dev_qp = hololink_get_gpu_dev_qp(transceiver);
       unified_ctx.rx_ring_data = rx_ring_data;
@@ -448,12 +530,14 @@ inline int bridge_run(BridgeConfig &config) {
       }
     }
 
-    cudaq_function_table_t table{};
-    table.entries = config.d_function_entries;
-    table.count = config.func_count;
-    if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
-      std::cerr << "ERROR: Failed to set function table" << std::endl;
-      return 1;
+    if (!is_host_loop) {
+      cudaq_function_table_t table{};
+      table.entries = config.d_function_entries;
+      table.count = config.func_count;
+      if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
+        std::cerr << "ERROR: Failed to set function table" << std::endl;
+        return 1;
+      }
     }
 
     if (cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats) !=
@@ -488,6 +572,7 @@ inline int bridge_run(BridgeConfig &config) {
     std::this_thread::sleep_for(std::chrono::milliseconds(500));
   } else {
     std::cout << "\n[5/5] Launching Hololink kernels..." << std::endl;
+    std::cout.flush();
     hololink_thread = std::thread(
         [transceiver]() { hololink_blocking_monitor(transceiver); });
     std::this_thread::sleep_for(std::chrono::milliseconds(500));
@@ -572,8 +657,12 @@ inline int bridge_run(BridgeConfig &config) {
 
   if (shutdown_flag)
     cudaFreeHost(const_cast<int *>(shutdown_flag));
-  if (d_stats)
-    cudaFree(d_stats);
+  if (d_stats) {
+    if (is_host_loop)
+      cudaFreeHost(d_stats);
+    else
+      cudaFree(d_stats);
+  }
 
   // Call tool-specific cleanup
   if (config.cleanup_fn)
diff --git a/realtime/lib/daemon/bridge/hololink/CMakeLists.txt b/realtime/lib/daemon/bridge/hololink/CMakeLists.txt
index 456cfe82062..8a1e60a709d 100644
--- a/realtime/lib/daemon/bridge/hololink/CMakeLists.txt
+++ b/realtime/lib/daemon/bridge/hololink/CMakeLists.txt
@@ -71,14 +71,6 @@ find_library(HOLOLINK_COMMON_LIB
     "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
   NO_DEFAULT_PATH)
 
-find_library(ROCE_RECEIVER_LIB
-  NAMES roce_receiver
-  PATHS
-    "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}"
-    "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver"
-    "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
-  NO_DEFAULT_PATH)
-
 find_library(BASE_RECEIVER_OP_LIB
   NAMES base_receiver_op
   PATHS
@@ -180,7 +172,6 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND
     PRIVATE
       hololink_wrapper_generic
       ${GPU_ROCE_TRANSCEIVER_LIB}
-      ${ROCE_RECEIVER_LIB}
       ${BASE_RECEIVER_OP_LIB}
       ${HOLOLINK_CORE_LIB}
       ${HOLOLINK_COMMON_LIB}
diff --git a/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp b/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp
index e59f5176405..2452e87f810 100644
--- a/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp
+++ b/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp
@@ -100,6 +100,13 @@ void hololink_blocking_monitor(hololink_transceiver_t handle) {
   }
 }
 
+void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable) {
+  if (handle) {
+    auto *impl = reinterpret_cast<HololinkTransceiverImpl *>(handle);
+    impl->transceiver->set_cpu_ring_buffers(enable != 0);
+  }
+}
+
 //==============================================================================
 // QP information
 //==============================================================================
diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu
index 4f80e3e0d36..d1e7edf9248 100644
--- a/realtime/lib/daemon/dispatcher/host_dispatcher.cu
+++ b/realtime/lib/daemon/dispatcher/host_dispatcher.cu
@@ -102,6 +102,21 @@ static int acquire_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
   return __builtin_ffsll(static_cast<long long>(mask)) - 1;
 }
 
+static void
+sweep_completed_workers(const cudaq_host_dispatch_loop_ctx_t *ctx) {
+  uint64_t busy =
+      ~as_atomic_u64(ctx->idle_mask)->load(cuda::std::memory_order_acquire);
+  busy &= (1ULL << ctx->num_workers) - 1;
+  while (busy != 0) {
+    int w = __builtin_ffsll(static_cast<long long>(busy)) - 1;
+    if (cudaStreamQuery(ctx->workers[w].stream) == cudaSuccess) {
+      as_atomic_u64(ctx->idle_mask)
+          ->fetch_or(1ULL << w, cuda::std::memory_order_release);
+    }
+    busy &= ~(1ULL << w);
+  }
+}
+
 static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
                                 int worker_id, void *slot_host,
                                 size_t current_slot) {
@@ -129,8 +144,10 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
     void *d_ctx = d_ctxs + worker_id * sizeof(GraphIOContext);
     ctx->h_mailbox_bank[worker_id] = d_ctx;
 
-    as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
-        CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
+    if (!ctx->config.skip_tx_markers) {
+      as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
+          CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
+    }
     __sync_synchronize();
   } else {
     ctx->h_mailbox_bank[worker_id] = data_dev;
@@ -154,7 +171,7 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
     if (ctx->workers[w].post_launch_fn)
       ctx->workers[w].post_launch_fn(ctx->workers[w].post_launch_data,
                                         data_dev, ctx->workers[w].stream);
-    if (ctx->io_ctxs_host == nullptr) {
+    if (ctx->io_ctxs_host == nullptr && !ctx->config.skip_tx_markers) {
       as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
           CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
     }
@@ -178,6 +195,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) {
             cuda::std::memory_order_acquire);
 
     if (rx_value == 0) {
+      sweep_completed_workers(ctx);
       CUDAQ_REALTIME_CPU_RELAX();
       continue;
     }
@@ -206,6 +224,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) {
       continue;
     }
 
+    sweep_completed_workers(ctx);
     int worker_id =
         acquire_graph_worker(ctx, use_function_table, entry, function_id);
     if (worker_id < 0) {
diff --git a/realtime/unittests/utils/CMakeLists.txt b/realtime/unittests/utils/CMakeLists.txt
index a66cf9bce97..1cd14db74f6 100644
--- a/realtime/unittests/utils/CMakeLists.txt
+++ b/realtime/unittests/utils/CMakeLists.txt
@@ -71,14 +71,6 @@ find_library(HOLOLINK_COMMON_LIB
     "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
   NO_DEFAULT_PATH)
 
-find_library(ROCE_RECEIVER_LIB
-  NAMES roce_receiver
-  PATHS
-    "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}"
-    "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver"
-    "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
-  NO_DEFAULT_PATH)
-
 find_library(BASE_RECEIVER_OP_LIB
   NAMES base_receiver_op
   PATHS
@@ -226,7 +218,6 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND
       cudaq-realtime-bridge-hololink
       hololink_wrapper_generic
       ${GPU_ROCE_TRANSCEIVER_LIB}
-      ${ROCE_RECEIVER_LIB}
       ${BASE_RECEIVER_OP_LIB}
       ${HOLOLINK_CORE_LIB}
       ${HOLOLINK_COMMON_LIB}
diff --git a/realtime/unittests/utils/hololink_test.sh b/realtime/unittests/utils/hololink_test.sh
index 4e171fdffdb..3335edc63c1 100755
--- a/realtime/unittests/utils/hololink_test.sh
+++ b/realtime/unittests/utils/hololink_test.sh
@@ -211,6 +211,14 @@ do_build() {
         target_arch="arm64"
     fi
 
+    # Ensure nvcc is on PATH for detect_cuda_arch() and cmake check_language(CUDA).
+    if [[ -x /usr/local/cuda/bin/nvcc ]]; then
+        case ":$PATH:" in
+            *":/usr/local/cuda/bin:"*) ;;
+            *) export PATH="/usr/local/cuda/bin:$PATH" ;;
+        esac
+    fi
+
     # Detect highest CUDA arch supported by nvcc
     local cuda_arch
     cuda_arch=$(detect_cuda_arch)

From 812af07683438acd5520809979d398ef9000d54e Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Tue, 31 Mar 2026 17:33:23 +0000
Subject: [PATCH 2/5] Apply Hololink patches in CI for HOST_LOOP support

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 .github/workflows/realtime_ci.yml             |   3 +-
 ...ing-flags-as-CPU_GPU-on-iGPU-for-hos.patch | 108 ++++++++++++++++++
 ...g_buffers-for-HOST_LOOP-dGPU-support.patch |  81 +++++++++++++
 3 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch
 create mode 100644 realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch

diff --git a/.github/workflows/realtime_ci.yml b/.github/workflows/realtime_ci.yml
index 5b3f045a7ba..dcf70b60462 100644
--- a/.github/workflows/realtime_ci.yml
+++ b/.github/workflows/realtime_ci.yml
@@ -130,7 +130,8 @@ jobs:
             bash scripts/install_dev_prerequisites.sh
             # Build HSB (GPU RoCE transceiver and hololink_core)
             export CUDA_NATIVE_ARCH="${{ (contains(matrix.cuda_version, '12') && '80-real;90') || '80-real;90-real;100f-real;110-real;120-real;100-virtual' }}"
-            cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge 
+            cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge
+            for p in /workspace/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done
             cmake -G Ninja -S /workspace/holoscan-sensor-bridge -B /workspace/holoscan-sensor-bridge/build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF
             cmake --build /workspace/holoscan-sensor-bridge/build --target roce_receiver gpu_roce_transceiver hololink_core
             # Build CUDA-Q Realtime
diff --git a/realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch b/realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch
new file mode 100644
index 00000000000..71407ef5e3a
--- /dev/null
+++ b/realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch
@@ -0,0 +1,108 @@
+From 407d74a469ab6aca6b11f1b68fc0ff5b1e8e47d1 Mon Sep 17 00:00:00 2001
+From: Chuck Ketcham <cketcham@nvidia.com>
+Date: Mon, 23 Mar 2026 22:30:09 +0000
+Subject: [PATCH 3/4] Allocate RX/TX ring flags as CPU_GPU on iGPU for
+ host-side polling
+
+On integrated GPU systems (Grace-Hopper, Grace-Blackwell), the HOST_LOOP
+dispatcher polls DOCA ring flags from the CPU. The flags were
+unconditionally allocated as DOCA_GPU_MEM_TYPE_GPU, causing a segfault
+when the CPU thread attempted an atomic load. When umem_cpu is set
+(iGPU), allocate flags with DOCA_GPU_MEM_TYPE_CPU_GPU and use memset
+instead of cudaMemset for zero-initialization.
+
+Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
+---
+ .../gpu_roce_transceiver_doca.cpp             | 60 +++++++++++++------
+ 1 file changed, 42 insertions(+), 18 deletions(-)
+
+diff --git a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp
+index 707119e2..12039fa5 100644
+--- a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp
++++ b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp
+@@ -528,21 +528,33 @@ doca_error_t DocaQp::create_ring(size_t stride_sz, unsigned stride_num, struct i
+         return DOCA_ERROR_NOT_SUPPORTED;
+     }
+ 
+-    result = doca_gpu_mem_alloc(gdev,
+-        sizeof(uint64_t) * gpu_rx_ring.stride_num,
+-        get_page_size(),
+-        DOCA_GPU_MEM_TYPE_GPU,
+-        (void**)&(gpu_rx_ring.flag),
+-        nullptr);
++    if (umem_cpu) {
++        result = doca_gpu_mem_alloc(gdev,
++            sizeof(uint64_t) * gpu_rx_ring.stride_num,
++            get_page_size(),
++            DOCA_GPU_MEM_TYPE_CPU_GPU,
++            (void**)&(gpu_rx_ring.flag),
++            (void**)&(gpu_rx_ring.flag));
++    } else {
++        result = doca_gpu_mem_alloc(gdev,
++            sizeof(uint64_t) * gpu_rx_ring.stride_num,
++            get_page_size(),
++            DOCA_GPU_MEM_TYPE_GPU,
++            (void**)&(gpu_rx_ring.flag),
++            nullptr);
++    }
+     if (result != DOCA_SUCCESS || gpu_rx_ring.flag == nullptr) {
+         HSB_LOG_ERROR("Failed to alloc rx flag ring buffer: {}", doca_error_get_descr(result));
+         goto exit;
+     }
+-
+-    result_cuda = cudaMemset(gpu_rx_ring.flag, 0, sizeof(uint64_t) * gpu_rx_ring.stride_num);
+-    if (result_cuda != cudaSuccess) {
+-        HSB_LOG_ERROR("cudaMemset returned error {}", (int)result_cuda);
+-        goto exit;
++    if (umem_cpu) {
++        memset(gpu_rx_ring.flag, 0, sizeof(uint64_t) * gpu_rx_ring.stride_num);
++    } else {
++        result_cuda = cudaMemset(gpu_rx_ring.flag, 0, sizeof(uint64_t) * gpu_rx_ring.stride_num);
++        if (result_cuda != cudaSuccess) {
++            HSB_LOG_ERROR("cudaMemset returned error {}", (int)result_cuda);
++            goto exit;
++        }
+     }
+ 
+     gpu_tx_ring.stride_sz = stride_sz;
+@@ -597,18 +609,30 @@ doca_error_t DocaQp::create_ring(size_t stride_sz, unsigned stride_num, struct i
+         return DOCA_ERROR_NOT_SUPPORTED;
+     }
+ 
+-    result = doca_gpu_mem_alloc(gdev,
+-        sizeof(uint64_t) * gpu_tx_ring.stride_num,
+-        get_page_size(),
+-        DOCA_GPU_MEM_TYPE_GPU,
+-        (void**)&(gpu_tx_ring.flag),
+-        nullptr);
++    if (umem_cpu) {
++        result = doca_gpu_mem_alloc(gdev,
++            sizeof(uint64_t) * gpu_tx_ring.stride_num,
++            get_page_size(),
++            DOCA_GPU_MEM_TYPE_CPU_GPU,
++            (void**)&(gpu_tx_ring.flag),
++            (void**)&(gpu_tx_ring.flag));
++    } else {
++        result = doca_gpu_mem_alloc(gdev,
++            sizeof(uint64_t) * gpu_tx_ring.stride_num,
++            get_page_size(),
++            DOCA_GPU_MEM_TYPE_GPU,
++            (void**)&(gpu_tx_ring.flag),
++            nullptr);
++    }
+     if (result != DOCA_SUCCESS || gpu_tx_ring.flag == nullptr) {
+         HSB_LOG_ERROR("Failed to alloc tx flag ring buffer: {}", doca_error_get_descr(result));
+         goto exit;
+     }
+ 
+-    cudaMemset(gpu_tx_ring.flag, 0, sizeof(uint64_t) * gpu_tx_ring.stride_num);
++    if (umem_cpu)
++        memset(gpu_tx_ring.flag, 0, sizeof(uint64_t) * gpu_tx_ring.stride_num);
++    else
++        cudaMemset(gpu_tx_ring.flag, 0, sizeof(uint64_t) * gpu_tx_ring.stride_num);
+ 
+     return DOCA_SUCCESS;
+ 
+-- 
+2.43.0
+
diff --git a/realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch b/realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch
new file mode 100644
index 00000000000..1ccefc97e3c
--- /dev/null
+++ b/realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch
@@ -0,0 +1,81 @@
+From dce0123f9383cc1fdce11d1f540199de23b183b6 Mon Sep 17 00:00:00 2001
+From: Chuck Ketcham <cketcham@nvidia.com>
+Date: Tue, 24 Mar 2026 18:13:10 +0000
+Subject: [PATCH 4/4] Add set_cpu_ring_buffers() for HOST_LOOP dGPU support
+
+On dGPU systems (prop.integrated=false), GpuRoceTransceiver allocates
+ring flags and data as DOCA_GPU_MEM_TYPE_GPU (GPU-only). The HOST_LOOP
+dispatcher reads ring flags from the CPU, causing a segfault on these
+GPU-only pointers.
+Add set_cpu_ring_buffers(bool) which, when enabled, temporarily forces
+umem_cpu=true around the DocaQp::create_ring() call so ring buffers
+are allocated as CPU_GPU. The original umem_cpu is restored immediately
+after, preserving BlueFlame TX and avoiding the CPU proxy thread on dGPU.
+
+Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
+---
+ .../gpu_roce_transceiver/gpu_roce_transceiver.cpp   | 13 ++++++++++++-
+ .../gpu_roce_transceiver/gpu_roce_transceiver.hpp   |  8 ++++++++
+ 2 files changed, 20 insertions(+), 1 deletion(-)
+
+diff --git a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp
+index d8d28d65..9f852f42 100644
+--- a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp
++++ b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp
+@@ -245,7 +245,8 @@ bool GpuRoceTransceiver::start()
+     umem_cpu = false;
+     if (prop.integrated)
+         umem_cpu = true;
+-    HSB_LOG_INFO("Device {} GPU type {}", gpu_id_, prop.integrated ? "iGPU" : "dGPU");
++    HSB_LOG_INFO("Device {} GPU type {} umem_cpu={} cpu_ring_buffers={}", gpu_id_,
++                 prop.integrated ? "iGPU" : "dGPU", umem_cpu, cpu_ring_buffers_);
+ 
+     result = doca_gpu_create(gpu_bus_id, &doca_gpu_device_);
+     if (result != DOCA_SUCCESS) {
+@@ -338,7 +339,17 @@ bool GpuRoceTransceiver::start()
+         return false;
+     }
+ 
++    // cpu_ring_buffers_: force CPU_GPU allocation for ring flags and data only,
++    // without changing CQ/QP UMEMs or TX kernel handler mode.
++    if (cpu_ring_buffers_ && !umem_cpu)
++        doca_qp->umem_cpu = true;
++
+     result = doca_qp->create_ring(cu_page_size_, pages_, ibv_pd);
++
++    // Restore original umem_cpu so TX kernel handler uses BlueFlame on dGPU.
++    if (cpu_ring_buffers_ && !umem_cpu)
++        doca_qp->umem_cpu = false;
++
+     if (result != DOCA_SUCCESS) {
+         HSB_LOG_ERROR("Failed to create ring buffers: {}", doca_error_get_descr(result));
+         return false;
+diff --git a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp
+index c0f79001..eaebad46 100644
+--- a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp
++++ b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp
+@@ -217,6 +217,13 @@ public:
+     uint32_t get_tx_ring_stride_num();
+     uint64_t* get_tx_ring_flag_addr();
+ 
++    /// Force CPU+GPU accessible allocation for ring flags and data only
++    /// (DOCA_GPU_MEM_TYPE_CPU_GPU) even on dGPU systems.  Must be called
++    /// before start().  Does NOT affect CQ/QP UMEMs or TX kernel handler.
++    /// Required when a CPU thread needs to read ring flags/data directly
++    /// (e.g. HOST_LOOP dispatcher on Grace-Blackwell).
++    void set_cpu_ring_buffers(bool enable) { cpu_ring_buffers_ = enable; }
++
+     /** Blocks until close(); returns false (no CPU frame stream; kernel owns datapath). */
+     bool get_next_frame(unsigned timeout_ms, CUstream cuda_stream);
+ 
+@@ -288,6 +295,7 @@ private:
+     std::mutex& get_lock();
+ 
+     bool umem_cpu;
++    bool cpu_ring_buffers_ = false;
+ 
+     CUdevice cuDevice;
+     CUcontext cuContext;
+-- 
+2.43.0
+

From cbf52eda18732db59b819b1d29a38fc4401c1aca Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Tue, 31 Mar 2026 18:29:06 +0000
Subject: [PATCH 3/5] Apply Hololink patches in Dockerfile for HOST_LOOP
 support

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 realtime/docker/assets.Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/realtime/docker/assets.Dockerfile b/realtime/docker/assets.Dockerfile
index 794a03e79b5..e3080306b09 100644
--- a/realtime/docker/assets.Dockerfile
+++ b/realtime/docker/assets.Dockerfile
@@ -90,6 +90,7 @@ ENV CUDA_NATIVE_ARCH=${cuda_native_arg}
 ARG hsb_version="release-2.6.0-EA"
 # Build HSB
 RUN cd / && git clone -b ${hsb_version} https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge && \
+    for p in /cuda-quantum/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done && \
     cmake -G Ninja -S . -B build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF && \
     cmake --build build --target roce_receiver gpu_roce_transceiver hololink_core
 

From f731fccebebc52207434d52c8092928074ec2c2f Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Tue, 31 Mar 2026 18:54:06 +0000
Subject: [PATCH 4/5] Fix BackpressureWhenAllBusy test for
 sweep_completed_workers

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 realtime/unittests/test_host_dispatcher.cu | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/realtime/unittests/test_host_dispatcher.cu b/realtime/unittests/test_host_dispatcher.cu
index 5c7ac4e7f22..39ce9e3986f 100644
--- a/realtime/unittests/test_host_dispatcher.cu
+++ b/realtime/unittests/test_host_dispatcher.cu
@@ -887,16 +887,10 @@ TEST_F(HostDispatcherLoopTest, BackpressureWhenAllBusy) {
   SignalSlot(0);
   SignalSlot(1);
 
+  // sweep_completed_workers auto-releases workers, so both slots
+  // complete without an explicit RestoreWorker call.
   ASSERT_TRUE(PollTxFlag(0)) << "Timeout on slot 0";
-  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
-
-  // Slot 1 should still be pending — worker is busy.
-  EXPECT_EQ(tx_flags_host_[1], 0u)
-      << "Slot 1 should stall while worker is busy";
-
-  RestoreWorker(0);
-
-  ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1 after restoring worker";
+  ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1";
   ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
 
   const std::uint8_t expected0[] = {1, 2, 3, 4};

From f7b4ede1150fffb8fd5c60139fc8c02111ddb2cd Mon Sep 17 00:00:00 2001
From: Chuck Ketcham <cketcham@nvidia.com>
Date: Tue, 31 Mar 2026 19:54:47 +0000
Subject: [PATCH 5/5] Clarify dispatch_path comment with enum value (PR review)

Signed-off-by: Chuck Ketcham <cketcham@nvidia.com>
---
 .../include/cudaq/realtime/hololink_bridge_common.h    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/realtime/include/cudaq/realtime/hololink_bridge_common.h b/realtime/include/cudaq/realtime/hololink_bridge_common.h
index f4c2c6d95ae..c6ed548dc18 100644
--- a/realtime/include/cudaq/realtime/hololink_bridge_common.h
+++ b/realtime/include/cudaq/realtime/hololink_bridge_common.h
@@ -136,11 +136,11 @@ struct BridgeConfig {
   /// Default: cudaq_launch_dispatch_kernel_regular
   cudaq_dispatch_launch_fn_t launch_fn = nullptr;
 
-  // HOST_LOOP graph launch mode -- CPU-side dispatcher that polls Hololink
-  // ring flags and launches CUDA graphs.  Requires a Grace-based system
-  // (Grace-Hopper / DGX Spark, Grace-Blackwell / GB200) where GPU memory is
-  // CPU-accessible via NVLink-C2C, since the HOST_LOOP thread reads DOCA
-  // GPU ring flags directly from the CPU.
+  // Set this to CUDAQ_DISPATCH_PATH_HOST for the HOST_LOOP graph launch mode --
+  // CPU-side dispatcher that polls Hololink ring flags and launches CUDA
+  // graphs.  Requires a Grace-based system (Grace-Hopper / DGX Spark,
+  // Grace-Blackwell / GB200) where GPU memory is CPU-accessible via NVLink-C2C,
+  // since the HOST_LOOP thread reads DOCA GPU ring flags directly from the CPU.
   cudaq_dispatch_path_t dispatch_path = CUDAQ_DISPATCH_PATH_DEVICE;
 
   /// Host-side function table for GRAPH_LAUNCH entries (HOST_LOOP only).