Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/realtime_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ jobs:
bash scripts/install_dev_prerequisites.sh
# Build HSB (GPU RoCE transceiver and hololink_core)
export CUDA_NATIVE_ARCH="${{ (contains(matrix.cuda_version, '12') && '80-real;90') || '80-real;90-real;100f-real;110-real;120-real;100-virtual' }}"
cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge
cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge
for p in /workspace/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done
cmake -G Ninja -S /workspace/holoscan-sensor-bridge -B /workspace/holoscan-sensor-bridge/build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF
cmake --build /workspace/holoscan-sensor-bridge/build --target roce_receiver gpu_roce_transceiver hololink_core
# Build CUDA-Q Realtime
Expand Down
1 change: 1 addition & 0 deletions realtime/docker/assets.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ ENV CUDA_NATIVE_ARCH=${cuda_native_arg}
ARG hsb_version="release-2.6.0-EA"
# Build HSB
RUN cd / && git clone -b ${hsb_version} https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge && \
for p in /cuda-quantum/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done && \
cmake -G Ninja -S . -B build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF && \
cmake --build build --target roce_receiver gpu_roce_transceiver hololink_core

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,15 @@ void hololink_close(hololink_transceiver_t handle);
*/
void hololink_blocking_monitor(hololink_transceiver_t handle);

/**
* Force CPU+GPU accessible allocation for ring flags and data only.
* Does NOT change CQ/QP UMEMs or TX kernel handler (BlueFlame preserved).
* Must be called after create and before start().
* Required when a CPU thread needs to read ring flags/data directly
* (e.g. HOST_LOOP dispatcher on Grace-Blackwell dGPU).
*/
void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable);

//==============================================================================
// QP information (for RDMA setup)
//==============================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ typedef struct {
cudaq_kernel_type_t kernel_type; // regular/cooperative kernel
cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch
cudaq_dispatch_path_t dispatch_path; // GPU kernel or CPU host loop
int skip_tx_markers; // when non-zero, the host dispatcher will NOT write
// sentinel markers (CUDAQ_TX_FLAG_IN_FLIGHT) to
// tx_flags before graph launch. Set this when an
// external GPU kernel (e.g. Hololink TX) polls the
// same tx_flags array; the sentinel would be
// misinterpreted as a valid address.
} cudaq_dispatcher_config_t;

// GPU ring buffer pointers. For device backend use device pointers only.
Expand Down
141 changes: 115 additions & 26 deletions realtime/include/cudaq/realtime/hololink_bridge_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,24 @@ struct BridgeConfig {
/// Default: cudaq_launch_dispatch_kernel_regular
cudaq_dispatch_launch_fn_t launch_fn = nullptr;

// Set this to CUDAQ_DISPATCH_PATH_HOST for the HOST_LOOP graph launch mode --
// CPU-side dispatcher that polls Hololink ring flags and launches CUDA
// graphs. Requires a Grace-based system (Grace-Hopper / DGX Spark,
// Grace-Blackwell / GB200) where GPU memory is CPU-accessible via NVLink-C2C,
// since the HOST_LOOP thread reads DOCA GPU ring flags directly from the CPU.
cudaq_dispatch_path_t dispatch_path = CUDAQ_DISPATCH_PATH_DEVICE;

/// Host-side function table for GRAPH_LAUNCH entries (HOST_LOOP only).
/// Each entry must have dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH
/// and handler.graph_exec set to a valid cudaGraphExec_t.
cudaq_function_entry_t *h_function_entries = nullptr;
size_t h_func_count = 0;

/// Pinned mailbox for HOST_LOOP graph dispatch (from graph_resources).
/// h_mailbox is the host pointer, d_mailbox is the device-mapped view.
void **h_mailbox = nullptr;
void **d_mailbox = nullptr;

/// @brief Optional cleanup callback invoked during shutdown.
std::function<void()> cleanup_fn;
};
Expand Down Expand Up @@ -247,8 +265,9 @@ inline int bridge_run(BridgeConfig &config) {
bool is_igpu = (prop.integrated != 0);
bool unified_igpu = config.unified && is_igpu;

bool is_host_loop = (config.dispatch_path == CUDAQ_DISPATCH_PATH_HOST);
bool use_forward = config.forward || (config.unified && !is_igpu);
bool use_3kernel = !config.forward && !config.unified;
bool use_3kernel = is_host_loop || (!config.forward && !config.unified);

hololink_transceiver_t transceiver = hololink_create_transceiver(
config.device.c_str(), 1, // ib_port
Expand All @@ -266,6 +285,10 @@ inline int bridge_run(BridgeConfig &config) {
return 1;
}

// HOST_LOOP needs CPU-readable ring flags and data; allocate as CPU_GPU.
if (is_host_loop)
hololink_set_cpu_ring_buffers(transceiver, 1);

std::cout << " Connecting to remote QP 0x" << std::hex << config.remote_qp
<< std::dec << " at " << config.peer_ip << "..." << std::endl;

Expand Down Expand Up @@ -336,7 +359,7 @@ inline int bridge_run(BridgeConfig &config) {
hololink_doca_transport_ctx unified_ctx{};

if (!config.forward) {
if (!config.unified) {
if (!config.unified && !is_host_loop) {
int dispatch_blocks = 0;
cudaError_t occ_err;
if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) {
Expand All @@ -358,8 +381,10 @@ inline int bridge_run(BridgeConfig &config) {
// [4] Wire dispatch kernel
//==========================================================================
std::cout << "\n[4/5] Wiring dispatch kernel ("
<< (config.unified ? "unified" : "3-kernel") << ")..."
<< std::endl;
<< (is_host_loop ? "host-loop"
: config.unified ? "unified"
: "3-kernel")
<< ")..." << std::endl;

void *tmp_shutdown = nullptr;
BRIDGE_CUDA_CHECK(
Expand All @@ -374,7 +399,14 @@ inline int bridge_run(BridgeConfig &config) {
BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast<int *>(d_shutdown_flag), &zero,
sizeof(int), cudaMemcpyHostToDevice));

BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t)));
if (is_host_loop) {
void *tmp_stats = nullptr;
BRIDGE_CUDA_CHECK(
cudaHostAlloc(&tmp_stats, sizeof(uint64_t), cudaHostAllocMapped));
d_stats = static_cast<uint64_t *>(tmp_stats);
} else {
BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t)));
}
BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t)));

if (cudaq_dispatch_manager_create(&manager) != CUDAQ_OK) {
Expand All @@ -385,28 +417,78 @@ inline int bridge_run(BridgeConfig &config) {
cudaq_dispatcher_config_t dconfig{};
dconfig.device_id = config.gpu_id;
dconfig.vp_id = 0;
dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;

if (config.unified) {
dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED;
dconfig.num_blocks = 1;
dconfig.threads_per_block = 1;
dconfig.num_slots = 0;
dconfig.slot_size = 0;
} else {
dconfig.kernel_type = config.kernel_type;
dconfig.num_blocks = config.num_blocks;
dconfig.threads_per_block = config.threads_per_block;

if (is_host_loop) {
dconfig.dispatch_path = CUDAQ_DISPATCH_PATH_HOST;
dconfig.dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH;
dconfig.num_slots = static_cast<uint32_t>(config.num_pages);
dconfig.slot_size = static_cast<uint32_t>(config.page_size);
// Hololink TX kernel polls tx_flags for ready data; writing sentinel
// markers (0xEEEE) would be misinterpreted as a valid TX buffer address.
dconfig.skip_tx_markers = 1;
} else {
dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;

if (config.unified) {
dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED;
dconfig.num_blocks = 1;
dconfig.threads_per_block = 1;
dconfig.num_slots = 0;
dconfig.slot_size = 0;
} else {
dconfig.kernel_type = config.kernel_type;
dconfig.num_blocks = config.num_blocks;
dconfig.threads_per_block = config.threads_per_block;
dconfig.num_slots = static_cast<uint32_t>(config.num_pages);
dconfig.slot_size = static_cast<uint32_t>(config.page_size);
}
}

if (cudaq_dispatcher_create(manager, &dconfig, &dispatcher) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to create dispatcher" << std::endl;
return 1;
}

if (config.unified) {
if (is_host_loop) {
// HOST_LOOP: wire ringbuffer with Hololink GPU pointers as both
// device and host views. On Grace-based systems (Grace-Hopper,
// Grace-Blackwell), GPU memory is CPU-accessible via NVLink-C2C.
cudaq_ringbuffer_t ringbuffer{};
ringbuffer.rx_flags = reinterpret_cast<volatile uint64_t *>(rx_ring_flag);
ringbuffer.tx_flags = reinterpret_cast<volatile uint64_t *>(tx_ring_flag);
ringbuffer.rx_data = rx_ring_data;
ringbuffer.tx_data = tx_ring_data;
ringbuffer.rx_stride_sz = config.page_size;
ringbuffer.tx_stride_sz = config.page_size;
ringbuffer.rx_flags_host =
reinterpret_cast<volatile uint64_t *>(rx_ring_flag);
ringbuffer.tx_flags_host =
reinterpret_cast<volatile uint64_t *>(tx_ring_flag);
ringbuffer.rx_data_host = rx_ring_data;
ringbuffer.tx_data_host = tx_ring_data;

if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) !=
CUDAQ_OK) {
std::cerr << "ERROR: Failed to set ringbuffer" << std::endl;
return 1;
}

cudaq_function_table_t table{};
table.entries = config.h_function_entries;
table.count = static_cast<uint32_t>(config.h_func_count);
if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to set function table" << std::endl;
return 1;
}

if (config.h_mailbox) {
if (cudaq_dispatcher_set_mailbox(dispatcher, config.h_mailbox) !=
CUDAQ_OK) {
std::cerr << "ERROR: Failed to set mailbox" << std::endl;
return 1;
}
}
} else if (config.unified) {
// Pack DOCA transport handles into the opaque context
unified_ctx.gpu_dev_qp = hololink_get_gpu_dev_qp(transceiver);
unified_ctx.rx_ring_data = rx_ring_data;
Expand Down Expand Up @@ -448,12 +530,14 @@ inline int bridge_run(BridgeConfig &config) {
}
}

cudaq_function_table_t table{};
table.entries = config.d_function_entries;
table.count = config.func_count;
if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to set function table" << std::endl;
return 1;
if (!is_host_loop) {
cudaq_function_table_t table{};
table.entries = config.d_function_entries;
table.count = config.func_count;
if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to set function table" << std::endl;
return 1;
}
}

if (cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats) !=
Expand Down Expand Up @@ -488,6 +572,7 @@ inline int bridge_run(BridgeConfig &config) {
std::this_thread::sleep_for(std::chrono::milliseconds(500));
} else {
std::cout << "\n[5/5] Launching Hololink kernels..." << std::endl;
std::cout.flush();
hololink_thread = std::thread(
[transceiver]() { hololink_blocking_monitor(transceiver); });
std::this_thread::sleep_for(std::chrono::milliseconds(500));
Expand Down Expand Up @@ -572,8 +657,12 @@ inline int bridge_run(BridgeConfig &config) {

if (shutdown_flag)
cudaFreeHost(const_cast<int *>(shutdown_flag));
if (d_stats)
cudaFree(d_stats);
if (d_stats) {
if (is_host_loop)
cudaFreeHost(d_stats);
else
cudaFree(d_stats);
}

// Call tool-specific cleanup
if (config.cleanup_fn)
Expand Down
9 changes: 0 additions & 9 deletions realtime/lib/daemon/bridge/hololink/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,6 @@ find_library(HOLOLINK_COMMON_LIB
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
NO_DEFAULT_PATH)

find_library(ROCE_RECEIVER_LIB
NAMES roce_receiver
PATHS
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}"
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver"
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
NO_DEFAULT_PATH)

find_library(BASE_RECEIVER_OP_LIB
NAMES base_receiver_op
PATHS
Expand Down Expand Up @@ -180,7 +172,6 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND
PRIVATE
hololink_wrapper_generic
${GPU_ROCE_TRANSCEIVER_LIB}
${ROCE_RECEIVER_LIB}
${BASE_RECEIVER_OP_LIB}
${HOLOLINK_CORE_LIB}
${HOLOLINK_COMMON_LIB}
Expand Down
7 changes: 7 additions & 0 deletions realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,13 @@ void hololink_blocking_monitor(hololink_transceiver_t handle) {
}
}

// Enable (enable != 0) or disable CPU+GPU accessible allocation for the
// transceiver's ring flag/data buffers by forwarding to the underlying
// transceiver's set_cpu_ring_buffers(). Per the public header: this does NOT
// change CQ/QP UMEMs or the TX kernel handler, and it must be called after
// create and before start(). Silently a no-op when handle is null.
void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable) {
  if (handle) {
    // The opaque C handle wraps a HololinkTransceiverImpl; recover it to
    // reach the C++ transceiver object.
    auto *impl = reinterpret_cast<HololinkTransceiverImpl *>(handle);
    impl->transceiver->set_cpu_ring_buffers(enable != 0);
  }
}

//==============================================================================
// QP information
//==============================================================================
Expand Down
25 changes: 22 additions & 3 deletions realtime/lib/daemon/dispatcher/host_dispatcher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,21 @@ static int acquire_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
return __builtin_ffsll(static_cast<long long>(mask)) - 1;
}

// Scan all currently-busy graph workers and mark as idle any whose CUDA
// stream has drained (cudaStreamQuery returns cudaSuccess, i.e. no pending
// work on that stream). "Busy" is the complement of idle_mask, truncated to
// the low num_workers bits, so workers beyond num_workers are never touched.
// NOTE(review): assumes num_workers <= 63 — (1ULL << num_workers) would be
// UB at 64; confirm against the worker-pool configuration.
static void
sweep_completed_workers(const cudaq_host_dispatch_loop_ctx_t *ctx) {
  // Acquire load pairs with the release fetch_or below / in other threads,
  // so flag reads here see the worker state published at launch time.
  uint64_t busy =
      ~as_atomic_u64(ctx->idle_mask)->load(cuda::std::memory_order_acquire);
  busy &= (1ULL << ctx->num_workers) - 1;
  while (busy != 0) {
    // Index of the lowest set bit (ffs is 1-based, hence the -1).
    int w = __builtin_ffsll(static_cast<long long>(busy)) - 1;
    if (cudaStreamQuery(ctx->workers[w].stream) == cudaSuccess) {
      // Stream is empty: publish worker w back to the idle pool.
      as_atomic_u64(ctx->idle_mask)
          ->fetch_or(1ULL << w, cuda::std::memory_order_release);
    }
    // Clear the bit we just examined and continue with the next busy worker.
    busy &= ~(1ULL << w);
  }
}

static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
int worker_id, void *slot_host,
size_t current_slot) {
Expand Down Expand Up @@ -129,8 +144,10 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
void *d_ctx = d_ctxs + worker_id * sizeof(GraphIOContext);
ctx->h_mailbox_bank[worker_id] = d_ctx;

as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
if (!ctx->config.skip_tx_markers) {
as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
}
__sync_synchronize();
} else {
ctx->h_mailbox_bank[worker_id] = data_dev;
Expand All @@ -154,7 +171,7 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
if (ctx->workers[w].post_launch_fn)
ctx->workers[w].post_launch_fn(ctx->workers[w].post_launch_data,
data_dev, ctx->workers[w].stream);
if (ctx->io_ctxs_host == nullptr) {
if (ctx->io_ctxs_host == nullptr && !ctx->config.skip_tx_markers) {
as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
}
Expand All @@ -178,6 +195,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) {
cuda::std::memory_order_acquire);

if (rx_value == 0) {
sweep_completed_workers(ctx);
CUDAQ_REALTIME_CPU_RELAX();
continue;
}
Expand Down Expand Up @@ -206,6 +224,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) {
continue;
}

sweep_completed_workers(ctx);
int worker_id =
acquire_graph_worker(ctx, use_function_table, entry, function_id);
if (worker_id < 0) {
Expand Down
Loading
Loading