From 5cd04851c504273f475193dff83da4123ec7bacd Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Tue, 31 Mar 2026 16:29:05 +0000 Subject: [PATCH 1/5] [Realtime] Add Hololink bridge support for HOST_LOOP graph dispatch Extends the Hololink bridge skeleton and host dispatcher to support CPU-launched CUDA graph dispatch (CUDAQ_DISPATCH_PATH_HOST) over Hololink RDMA. Signed-off-by: Chuck Ketcham --- .../daemon/bridge/hololink/hololink_wrapper.h | 9 ++ .../daemon/dispatcher/cudaq_realtime.h | 6 + .../cudaq/realtime/hololink_bridge_common.h | 141 ++++++++++++++---- .../lib/daemon/bridge/hololink/CMakeLists.txt | 9 -- .../bridge/hololink/hololink_wrapper.cpp | 7 + .../lib/daemon/dispatcher/host_dispatcher.cu | 25 +++- realtime/unittests/utils/CMakeLists.txt | 9 -- realtime/unittests/utils/hololink_test.sh | 8 + 8 files changed, 167 insertions(+), 47 deletions(-) diff --git a/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h index 73264287d7e..d4dc766fd34 100644 --- a/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h +++ b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h @@ -72,6 +72,15 @@ void hololink_close(hololink_transceiver_t handle); */ void hololink_blocking_monitor(hololink_transceiver_t handle); +/** + * Force CPU+GPU accessible allocation for ring flags and data only. + * Does NOT change CQ/QP UMEMs or TX kernel handler (BlueFlame preserved). + * Must be called after create and before start(). + * Required when a CPU thread needs to read ring flags/data directly + * (e.g. HOST_LOOP dispatcher on Grace-Blackwell dGPU). + */ +void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable); + //============================================================================== // QP information (for RDMA setup) //============================================================================== diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h index dcee4cddc9b..4fbdc00ca9a 100644 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h @@ -102,6 +102,12 @@ typedef struct { cudaq_kernel_type_t kernel_type; // regular/cooperative kernel cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch cudaq_dispatch_path_t dispatch_path; // GPU kernel or CPU host loop + int skip_tx_markers; // when non-zero, the host dispatcher will NOT write + // sentinel markers (CUDAQ_TX_FLAG_IN_FLIGHT) to + // tx_flags before graph launch. Set this when an + // external GPU kernel (e.g. Hololink TX) polls the + // same tx_flags array; the sentinel would be + // misinterpreted as a valid address. } cudaq_dispatcher_config_t; // GPU ring buffer pointers. For device backend use device pointers only. diff --git a/realtime/include/cudaq/realtime/hololink_bridge_common.h b/realtime/include/cudaq/realtime/hololink_bridge_common.h index dba8eb5fa68..f4c2c6d95ae 100644 --- a/realtime/include/cudaq/realtime/hololink_bridge_common.h +++ b/realtime/include/cudaq/realtime/hololink_bridge_common.h @@ -136,6 +136,24 @@ struct BridgeConfig { /// Default: cudaq_launch_dispatch_kernel_regular cudaq_dispatch_launch_fn_t launch_fn = nullptr; + // HOST_LOOP graph launch mode -- CPU-side dispatcher that polls Hololink + // ring flags and launches CUDA graphs. Requires a Grace-based system + // (Grace-Hopper / DGX Spark, Grace-Blackwell / GB200) where GPU memory is + // CPU-accessible via NVLink-C2C, since the HOST_LOOP thread reads DOCA + // GPU ring flags directly from the CPU. + cudaq_dispatch_path_t dispatch_path = CUDAQ_DISPATCH_PATH_DEVICE; + + /// Host-side function table for GRAPH_LAUNCH entries (HOST_LOOP only). + /// Each entry must have dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH + /// and handler.graph_exec set to a valid cudaGraphExec_t. + cudaq_function_entry_t *h_function_entries = nullptr; + size_t h_func_count = 0; + + /// Pinned mailbox for HOST_LOOP graph dispatch (from graph_resources). + /// h_mailbox is the host pointer, d_mailbox is the device-mapped view. + void **h_mailbox = nullptr; + void **d_mailbox = nullptr; + /// @brief Optional cleanup callback invoked during shutdown. std::function cleanup_fn; }; @@ -247,8 +265,9 @@ inline int bridge_run(BridgeConfig &config) { bool is_igpu = (prop.integrated != 0); bool unified_igpu = config.unified && is_igpu; + bool is_host_loop = (config.dispatch_path == CUDAQ_DISPATCH_PATH_HOST); bool use_forward = config.forward || (config.unified && !is_igpu); - bool use_3kernel = !config.forward && !config.unified; + bool use_3kernel = is_host_loop || (!config.forward && !config.unified); hololink_transceiver_t transceiver = hololink_create_transceiver( config.device.c_str(), 1, // ib_port @@ -266,6 +285,10 @@ inline int bridge_run(BridgeConfig &config) { return 1; } + // HOST_LOOP needs CPU-readable ring flags and data; allocate as CPU_GPU. + if (is_host_loop) + hololink_set_cpu_ring_buffers(transceiver, 1); + std::cout << " Connecting to remote QP 0x" << std::hex << config.remote_qp << std::dec << " at " << config.peer_ip << "..." << std::endl; @@ -336,7 +359,7 @@ inline int bridge_run(BridgeConfig &config) { hololink_doca_transport_ctx unified_ctx{}; if (!config.forward) { - if (!config.unified) { + if (!config.unified && !is_host_loop) { int dispatch_blocks = 0; cudaError_t occ_err; if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) { @@ -358,8 +381,10 @@ inline int bridge_run(BridgeConfig &config) { // [4] Wire dispatch kernel //========================================================================== std::cout << "\n[4/5] Wiring dispatch kernel (" - << (config.unified ? "unified" : "3-kernel") << ")..." - << std::endl; + << (is_host_loop ? "host-loop" + : config.unified ? "unified" + : "3-kernel") + << ")..." << std::endl; void *tmp_shutdown = nullptr; BRIDGE_CUDA_CHECK( @@ -374,7 +399,14 @@ inline int bridge_run(BridgeConfig &config) { BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag), &zero, sizeof(int), cudaMemcpyHostToDevice)); - BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + if (is_host_loop) { + void *tmp_stats = nullptr; + BRIDGE_CUDA_CHECK( + cudaHostAlloc(&tmp_stats, sizeof(uint64_t), cudaHostAllocMapped)); + d_stats = static_cast(tmp_stats); + } else { + BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + } BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); if (cudaq_dispatch_manager_create(&manager) != CUDAQ_OK) { @@ -385,20 +417,31 @@ inline int bridge_run(BridgeConfig &config) { cudaq_dispatcher_config_t dconfig{}; dconfig.device_id = config.gpu_id; dconfig.vp_id = 0; - dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; - - if (config.unified) { - dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED; - dconfig.num_blocks = 1; - dconfig.threads_per_block = 1; - dconfig.num_slots = 0; - dconfig.slot_size = 0; - } else { - dconfig.kernel_type = config.kernel_type; - dconfig.num_blocks = config.num_blocks; - dconfig.threads_per_block = config.threads_per_block; + + if (is_host_loop) { + dconfig.dispatch_path = CUDAQ_DISPATCH_PATH_HOST; + dconfig.dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; dconfig.num_slots = static_cast(config.num_pages); dconfig.slot_size = static_cast(config.page_size); + // Hololink TX kernel polls tx_flags for ready data; writing sentinel + // markers (0xEEEE) would be misinterpreted as a valid TX buffer address. + dconfig.skip_tx_markers = 1; + } else { + dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + + if (config.unified) { + dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED; + dconfig.num_blocks = 1; + dconfig.threads_per_block = 1; + dconfig.num_slots = 0; + dconfig.slot_size = 0; + } else { + dconfig.kernel_type = config.kernel_type; + dconfig.num_blocks = config.num_blocks; + dconfig.threads_per_block = config.threads_per_block; + dconfig.num_slots = static_cast(config.num_pages); + dconfig.slot_size = static_cast(config.page_size); + } } if (cudaq_dispatcher_create(manager, &dconfig, &dispatcher) != CUDAQ_OK) { @@ -406,7 +449,46 @@ inline int bridge_run(BridgeConfig &config) { return 1; } - if (config.unified) { + if (is_host_loop) { + // HOST_LOOP: wire ringbuffer with Hololink GPU pointers as both + // device and host views. On Grace-based systems (Grace-Hopper, + // Grace-Blackwell), GPU memory is CPU-accessible via NVLink-C2C. + cudaq_ringbuffer_t ringbuffer{}; + ringbuffer.rx_flags = reinterpret_cast(rx_ring_flag); + ringbuffer.tx_flags = reinterpret_cast(tx_ring_flag); + ringbuffer.rx_data = rx_ring_data; + ringbuffer.tx_data = tx_ring_data; + ringbuffer.rx_stride_sz = config.page_size; + ringbuffer.tx_stride_sz = config.page_size; + ringbuffer.rx_flags_host = + reinterpret_cast(rx_ring_flag); + ringbuffer.tx_flags_host = + reinterpret_cast(tx_ring_flag); + ringbuffer.rx_data_host = rx_ring_data; + ringbuffer.tx_data_host = tx_ring_data; + + if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) != + CUDAQ_OK) { + std::cerr << "ERROR: Failed to set ringbuffer" << std::endl; + return 1; + } + + cudaq_function_table_t table{}; + table.entries = config.h_function_entries; + table.count = static_cast(config.h_func_count); + if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set function table" << std::endl; + return 1; + } + + if (config.h_mailbox) { + if (cudaq_dispatcher_set_mailbox(dispatcher, config.h_mailbox) != + CUDAQ_OK) { + std::cerr << "ERROR: Failed to set mailbox" << std::endl; + return 1; + } + } + } else if (config.unified) { // Pack DOCA transport handles into the opaque context unified_ctx.gpu_dev_qp = hololink_get_gpu_dev_qp(transceiver); unified_ctx.rx_ring_data = rx_ring_data; @@ -448,12 +530,14 @@ inline int bridge_run(BridgeConfig &config) { } } - cudaq_function_table_t table{}; - table.entries = config.d_function_entries; - table.count = config.func_count; - if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) { - std::cerr << "ERROR: Failed to set function table" << std::endl; - return 1; + if (!is_host_loop) { + cudaq_function_table_t table{}; + table.entries = config.d_function_entries; + table.count = config.func_count; + if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set function table" << std::endl; + return 1; + } } if (cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats) != @@ -488,6 +572,7 @@ inline int bridge_run(BridgeConfig &config) { std::this_thread::sleep_for(std::chrono::milliseconds(500)); } else { std::cout << "\n[5/5] Launching Hololink kernels..." << std::endl; + std::cout.flush(); hololink_thread = std::thread( [transceiver]() { hololink_blocking_monitor(transceiver); }); std::this_thread::sleep_for(std::chrono::milliseconds(500)); @@ -572,8 +657,12 @@ inline int bridge_run(BridgeConfig &config) { if (shutdown_flag) cudaFreeHost(const_cast(shutdown_flag)); - if (d_stats) - cudaFree(d_stats); + if (d_stats) { + if (is_host_loop) + cudaFreeHost(d_stats); + else + cudaFree(d_stats); + } // Call tool-specific cleanup if (config.cleanup_fn) diff --git a/realtime/lib/daemon/bridge/hololink/CMakeLists.txt b/realtime/lib/daemon/bridge/hololink/CMakeLists.txt index 456cfe82062..8a1e60a709d 100644 --- a/realtime/lib/daemon/bridge/hololink/CMakeLists.txt +++ b/realtime/lib/daemon/bridge/hololink/CMakeLists.txt @@ -71,14 +71,6 @@ find_library(HOLOLINK_COMMON_LIB "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" NO_DEFAULT_PATH) -find_library(ROCE_RECEIVER_LIB - NAMES roce_receiver - PATHS - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" - NO_DEFAULT_PATH) - find_library(BASE_RECEIVER_OP_LIB NAMES base_receiver_op PATHS @@ -180,7 +172,6 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND PRIVATE hololink_wrapper_generic ${GPU_ROCE_TRANSCEIVER_LIB} - ${ROCE_RECEIVER_LIB} ${BASE_RECEIVER_OP_LIB} ${HOLOLINK_CORE_LIB} ${HOLOLINK_COMMON_LIB} diff --git a/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp b/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp index e59f5176405..2452e87f810 100644 --- a/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp +++ b/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp @@ -100,6 +100,13 @@ void hololink_blocking_monitor(hololink_transceiver_t handle) { } } +void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable) { + if (handle) { + auto *impl = reinterpret_cast(handle); + impl->transceiver->set_cpu_ring_buffers(enable != 0); + } +} + //============================================================================== // QP information //============================================================================== diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu index 4f80e3e0d36..d1e7edf9248 100644 --- a/realtime/lib/daemon/dispatcher/host_dispatcher.cu +++ b/realtime/lib/daemon/dispatcher/host_dispatcher.cu @@ -102,6 +102,21 @@ static int acquire_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx, return __builtin_ffsll(static_cast(mask)) - 1; } +static void +sweep_completed_workers(const cudaq_host_dispatch_loop_ctx_t *ctx) { + uint64_t busy = + ~as_atomic_u64(ctx->idle_mask)->load(cuda::std::memory_order_acquire); + busy &= (1ULL << ctx->num_workers) - 1; + while (busy != 0) { + int w = __builtin_ffsll(static_cast(busy)) - 1; + if (cudaStreamQuery(ctx->workers[w].stream) == cudaSuccess) { + as_atomic_u64(ctx->idle_mask) + ->fetch_or(1ULL << w, cuda::std::memory_order_release); + } + busy &= ~(1ULL << w); + } +} + static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx, int worker_id, void *slot_host, size_t current_slot) { @@ -129,8 +144,10 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx, void *d_ctx = d_ctxs + worker_id * sizeof(GraphIOContext); ctx->h_mailbox_bank[worker_id] = d_ctx; - as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store( - CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release); + if (!ctx->config.skip_tx_markers) { + as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store( + CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release); + } __sync_synchronize(); } else { ctx->h_mailbox_bank[worker_id] = data_dev; @@ -154,7 +171,7 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx, if (ctx->workers[w].post_launch_fn) ctx->workers[w].post_launch_fn(ctx->workers[w].post_launch_data, data_dev, ctx->workers[w].stream); - if (ctx->io_ctxs_host == nullptr) { + if (ctx->io_ctxs_host == nullptr && !ctx->config.skip_tx_markers) { as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store( CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release); } @@ -178,6 +195,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) { cuda::std::memory_order_acquire); if (rx_value == 0) { + sweep_completed_workers(ctx); CUDAQ_REALTIME_CPU_RELAX(); continue; } @@ -206,6 +224,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) { continue; } + sweep_completed_workers(ctx); int worker_id = acquire_graph_worker(ctx, use_function_table, entry, function_id); if (worker_id < 0) { diff --git a/realtime/unittests/utils/CMakeLists.txt b/realtime/unittests/utils/CMakeLists.txt index a66cf9bce97..1cd14db74f6 100644 --- a/realtime/unittests/utils/CMakeLists.txt +++ b/realtime/unittests/utils/CMakeLists.txt @@ -71,14 +71,6 @@ find_library(HOLOLINK_COMMON_LIB "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" NO_DEFAULT_PATH) -find_library(ROCE_RECEIVER_LIB - NAMES roce_receiver - PATHS - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" - NO_DEFAULT_PATH) - find_library(BASE_RECEIVER_OP_LIB NAMES base_receiver_op PATHS @@ -226,7 +218,6 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND cudaq-realtime-bridge-hololink hololink_wrapper_generic ${GPU_ROCE_TRANSCEIVER_LIB} - ${ROCE_RECEIVER_LIB} ${BASE_RECEIVER_OP_LIB} ${HOLOLINK_CORE_LIB} ${HOLOLINK_COMMON_LIB} diff --git a/realtime/unittests/utils/hololink_test.sh b/realtime/unittests/utils/hololink_test.sh index 4e171fdffdb..3335edc63c1 100755 --- a/realtime/unittests/utils/hololink_test.sh +++ b/realtime/unittests/utils/hololink_test.sh @@ -211,6 +211,14 @@ do_build() { target_arch="arm64" fi + # Ensure nvcc is on PATH for detect_cuda_arch() and cmake check_language(CUDA). + if [[ -x /usr/local/cuda/bin/nvcc ]]; then + case ":$PATH:" in + *":/usr/local/cuda/bin:"*) ;; + *) export PATH="/usr/local/cuda/bin:$PATH" ;; + esac + fi + # Detect highest CUDA arch supported by nvcc local cuda_arch cuda_arch=$(detect_cuda_arch) From 812af07683438acd5520809979d398ef9000d54e Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Tue, 31 Mar 2026 17:33:23 +0000 Subject: [PATCH 2/5] Apply Hololink patches in CI for HOST_LOOP support Signed-off-by: Chuck Ketcham --- .github/workflows/realtime_ci.yml | 3 +- ...ing-flags-as-CPU_GPU-on-iGPU-for-hos.patch | 108 ++++++++++++++++++ ...g_buffers-for-HOST_LOOP-dGPU-support.patch | 81 +++++++++++++ 3 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch create mode 100644 realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch diff --git a/.github/workflows/realtime_ci.yml b/.github/workflows/realtime_ci.yml index 5b3f045a7ba..dcf70b60462 100644 --- a/.github/workflows/realtime_ci.yml +++ b/.github/workflows/realtime_ci.yml @@ -130,7 +130,8 @@ jobs: bash scripts/install_dev_prerequisites.sh # Build HSB (GPU RoCE transceiver and hololink_core) export CUDA_NATIVE_ARCH="${{ (contains(matrix.cuda_version, '12') && '80-real;90') || '80-real;90-real;100f-real;110-real;120-real;100-virtual' }}" - cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge + cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge + for p in /workspace/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done cmake -G Ninja -S /workspace/holoscan-sensor-bridge -B /workspace/holoscan-sensor-bridge/build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF cmake --build /workspace/holoscan-sensor-bridge/build --target roce_receiver gpu_roce_transceiver hololink_core # Build CUDA-Q Realtime diff --git a/realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch b/realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch new file mode 100644 index 00000000000..71407ef5e3a --- /dev/null +++ b/realtime/scripts/hololink-patches/0003-Allocate-RX-TX-ring-flags-as-CPU_GPU-on-iGPU-for-hos.patch @@ -0,0 +1,108 @@ +From 407d74a469ab6aca6b11f1b68fc0ff5b1e8e47d1 Mon Sep 17 00:00:00 2001 +From: Chuck Ketcham +Date: Mon, 23 Mar 2026 22:30:09 +0000 +Subject: [PATCH 3/4] Allocate RX/TX ring flags as CPU_GPU on iGPU for + host-side polling + +On integrated GPU systems (Grace-Hopper, Grace-Blackwell), the HOST_LOOP +dispatcher polls DOCA ring flags from the CPU. The flags were +unconditionally allocated as DOCA_GPU_MEM_TYPE_GPU, causing a segfault +when the CPU thread attempted an atomic load. When umem_cpu is set +(iGPU), allocate flags with DOCA_GPU_MEM_TYPE_CPU_GPU and use memset +instead of cudaMemset for zero-initialization. + +Signed-off-by: Chuck Ketcham +--- + .../gpu_roce_transceiver_doca.cpp | 60 +++++++++++++------ + 1 file changed, 42 insertions(+), 18 deletions(-) + +diff --git a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp +index 707119e2..12039fa5 100644 +--- a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp ++++ b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver_doca.cpp +@@ -528,21 +528,33 @@ doca_error_t DocaQp::create_ring(size_t stride_sz, unsigned stride_num, struct i + return DOCA_ERROR_NOT_SUPPORTED; + } + +- result = doca_gpu_mem_alloc(gdev, +- sizeof(uint64_t) * gpu_rx_ring.stride_num, +- get_page_size(), +- DOCA_GPU_MEM_TYPE_GPU, +- (void**)&(gpu_rx_ring.flag), +- nullptr); ++ if (umem_cpu) { ++ result = doca_gpu_mem_alloc(gdev, ++ sizeof(uint64_t) * gpu_rx_ring.stride_num, ++ get_page_size(), ++ DOCA_GPU_MEM_TYPE_CPU_GPU, ++ (void**)&(gpu_rx_ring.flag), ++ (void**)&(gpu_rx_ring.flag)); ++ } else { ++ result = doca_gpu_mem_alloc(gdev, ++ sizeof(uint64_t) * gpu_rx_ring.stride_num, ++ get_page_size(), ++ DOCA_GPU_MEM_TYPE_GPU, ++ (void**)&(gpu_rx_ring.flag), ++ nullptr); ++ } + if (result != DOCA_SUCCESS || gpu_rx_ring.flag == nullptr) { + HSB_LOG_ERROR("Failed to alloc rx flag ring buffer: {}", doca_error_get_descr(result)); + goto exit; + } +- +- result_cuda = cudaMemset(gpu_rx_ring.flag, 0, sizeof(uint64_t) * gpu_rx_ring.stride_num); +- if (result_cuda != cudaSuccess) { +- HSB_LOG_ERROR("cudaMemset returned error {}", (int)result_cuda); +- goto exit; ++ if (umem_cpu) { ++ memset(gpu_rx_ring.flag, 0, sizeof(uint64_t) * gpu_rx_ring.stride_num); ++ } else { ++ result_cuda = cudaMemset(gpu_rx_ring.flag, 0, sizeof(uint64_t) * gpu_rx_ring.stride_num); ++ if (result_cuda != cudaSuccess) { ++ HSB_LOG_ERROR("cudaMemset returned error {}", (int)result_cuda); ++ goto exit; ++ } + } + + gpu_tx_ring.stride_sz = stride_sz; +@@ -597,18 +609,30 @@ doca_error_t DocaQp::create_ring(size_t stride_sz, unsigned stride_num, struct i + return DOCA_ERROR_NOT_SUPPORTED; + } + +- result = doca_gpu_mem_alloc(gdev, +- sizeof(uint64_t) * gpu_tx_ring.stride_num, +- get_page_size(), +- DOCA_GPU_MEM_TYPE_GPU, +- (void**)&(gpu_tx_ring.flag), +- nullptr); ++ if (umem_cpu) { ++ result = doca_gpu_mem_alloc(gdev, ++ sizeof(uint64_t) * gpu_tx_ring.stride_num, ++ get_page_size(), ++ DOCA_GPU_MEM_TYPE_CPU_GPU, ++ (void**)&(gpu_tx_ring.flag), ++ (void**)&(gpu_tx_ring.flag)); ++ } else { ++ result = doca_gpu_mem_alloc(gdev, ++ sizeof(uint64_t) * gpu_tx_ring.stride_num, ++ get_page_size(), ++ DOCA_GPU_MEM_TYPE_GPU, ++ (void**)&(gpu_tx_ring.flag), ++ nullptr); ++ } + if (result != DOCA_SUCCESS || gpu_tx_ring.flag == nullptr) { + HSB_LOG_ERROR("Failed to alloc tx flag ring buffer: {}", doca_error_get_descr(result)); + goto exit; + } + +- cudaMemset(gpu_tx_ring.flag, 0, sizeof(uint64_t) * gpu_tx_ring.stride_num); ++ if (umem_cpu) ++ memset(gpu_tx_ring.flag, 0, sizeof(uint64_t) * gpu_tx_ring.stride_num); ++ else ++ cudaMemset(gpu_tx_ring.flag, 0, sizeof(uint64_t) * gpu_tx_ring.stride_num); + + return DOCA_SUCCESS; + +-- +2.43.0 + diff --git a/realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch b/realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch new file mode 100644 index 00000000000..1ccefc97e3c --- /dev/null +++ b/realtime/scripts/hololink-patches/0004-Add-set_cpu_ring_buffers-for-HOST_LOOP-dGPU-support.patch @@ -0,0 +1,81 @@ +From dce0123f9383cc1fdce11d1f540199de23b183b6 Mon Sep 17 00:00:00 2001 +From: Chuck Ketcham +Date: Tue, 24 Mar 2026 18:13:10 +0000 +Subject: [PATCH 4/4] Add set_cpu_ring_buffers() for HOST_LOOP dGPU support + +On dGPU systems (prop.integrated=false), GpuRoceTransceiver allocates +ring flags and data as DOCA_GPU_MEM_TYPE_GPU (GPU-only). The HOST_LOOP +dispatcher reads ring flags from the CPU, causing a segfault on these +GPU-only pointers. +Add set_cpu_ring_buffers(bool) which, when enabled, temporarily forces +umem_cpu=true around the DocaQp::create_ring() call so ring buffers +are allocated as CPU_GPU. The original umem_cpu is restored immediately +after, preserving BlueFlame TX and avoiding the CPU proxy thread on dGPU. + +Signed-off-by: Chuck Ketcham +--- + .../gpu_roce_transceiver/gpu_roce_transceiver.cpp | 13 ++++++++++++- + .../gpu_roce_transceiver/gpu_roce_transceiver.hpp | 8 ++++++++ + 2 files changed, 20 insertions(+), 1 deletion(-) + +diff --git a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp +index d8d28d65..9f852f42 100644 +--- a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp ++++ b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.cpp +@@ -245,7 +245,8 @@ bool GpuRoceTransceiver::start() + umem_cpu = false; + if (prop.integrated) + umem_cpu = true; +- HSB_LOG_INFO("Device {} GPU type {}", gpu_id_, prop.integrated ? "iGPU" : "dGPU"); ++ HSB_LOG_INFO("Device {} GPU type {} umem_cpu={} cpu_ring_buffers={}", gpu_id_, ++ prop.integrated ? "iGPU" : "dGPU", umem_cpu, cpu_ring_buffers_); + + result = doca_gpu_create(gpu_bus_id, &doca_gpu_device_); + if (result != DOCA_SUCCESS) { +@@ -338,7 +339,17 @@ bool GpuRoceTransceiver::start() + return false; + } + ++ // cpu_ring_buffers_: force CPU_GPU allocation for ring flags and data only, ++ // without changing CQ/QP UMEMs or TX kernel handler mode. ++ if (cpu_ring_buffers_ && !umem_cpu) ++ doca_qp->umem_cpu = true; ++ + result = doca_qp->create_ring(cu_page_size_, pages_, ibv_pd); ++ ++ // Restore original umem_cpu so TX kernel handler uses BlueFlame on dGPU. ++ if (cpu_ring_buffers_ && !umem_cpu) ++ doca_qp->umem_cpu = false; ++ + if (result != DOCA_SUCCESS) { + HSB_LOG_ERROR("Failed to create ring buffers: {}", doca_error_get_descr(result)); + return false; +diff --git a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp +index c0f79001..eaebad46 100644 +--- a/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp ++++ b/src/hololink/operators/gpu_roce_transceiver/gpu_roce_transceiver.hpp +@@ -217,6 +217,13 @@ public: + uint32_t get_tx_ring_stride_num(); + uint64_t* get_tx_ring_flag_addr(); + ++ /// Force CPU+GPU accessible allocation for ring flags and data only ++ /// (DOCA_GPU_MEM_TYPE_CPU_GPU) even on dGPU systems. Must be called ++ /// before start(). Does NOT affect CQ/QP UMEMs or TX kernel handler. ++ /// Required when a CPU thread needs to read ring flags/data directly ++ /// (e.g. HOST_LOOP dispatcher on Grace-Blackwell). ++ void set_cpu_ring_buffers(bool enable) { cpu_ring_buffers_ = enable; } ++ + /** Blocks until close(); returns false (no CPU frame stream; kernel owns datapath). */ + bool get_next_frame(unsigned timeout_ms, CUstream cuda_stream); + +@@ -288,6 +295,7 @@ private: + std::mutex& get_lock(); + + bool umem_cpu; ++ bool cpu_ring_buffers_ = false; + + CUdevice cuDevice; + CUcontext cuContext; +-- +2.43.0 + From cbf52eda18732db59b819b1d29a38fc4401c1aca Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Tue, 31 Mar 2026 18:29:06 +0000 Subject: [PATCH 3/5] Apply Hololink patches in Dockerfile for HOST_LOOP support Signed-off-by: Chuck Ketcham --- realtime/docker/assets.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/realtime/docker/assets.Dockerfile b/realtime/docker/assets.Dockerfile index 794a03e79b5..e3080306b09 100644 --- a/realtime/docker/assets.Dockerfile +++ b/realtime/docker/assets.Dockerfile @@ -90,6 +90,7 @@ ENV CUDA_NATIVE_ARCH=${cuda_native_arg} ARG hsb_version="release-2.6.0-EA" # Build HSB RUN cd / && git clone -b ${hsb_version} https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge && \ + for p in /cuda-quantum/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done && \ cmake -G Ninja -S . -B build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF && \ cmake --build build --target roce_receiver gpu_roce_transceiver hololink_core From f731fccebebc52207434d52c8092928074ec2c2f Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Tue, 31 Mar 2026 18:54:06 +0000 Subject: [PATCH 4/5] Fix BackpressureWhenAllBusy test for sweep_completed_workers Signed-off-by: Chuck Ketcham --- realtime/unittests/test_host_dispatcher.cu | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/realtime/unittests/test_host_dispatcher.cu b/realtime/unittests/test_host_dispatcher.cu index 5c7ac4e7f22..39ce9e3986f 100644 --- a/realtime/unittests/test_host_dispatcher.cu +++ b/realtime/unittests/test_host_dispatcher.cu @@ -887,16 +887,10 @@ TEST_F(HostDispatcherLoopTest, BackpressureWhenAllBusy) { SignalSlot(0); SignalSlot(1); + // sweep_completed_workers auto-releases workers, so both slots + // complete without an explicit RestoreWorker call. ASSERT_TRUE(PollTxFlag(0)) << "Timeout on slot 0"; - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - // Slot 1 should still be pending — worker is busy. - EXPECT_EQ(tx_flags_host_[1], 0u) - << "Slot 1 should stall while worker is busy"; - - RestoreWorker(0); - - ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1 after restoring worker"; + ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1"; ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); const std::uint8_t expected0[] = {1, 2, 3, 4}; From f7b4ede1150fffb8fd5c60139fc8c02111ddb2cd Mon Sep 17 00:00:00 2001 From: Chuck Ketcham Date: Tue, 31 Mar 2026 19:54:47 +0000 Subject: [PATCH 5/5] Clarify dispatch_path comment with enum value (PR review) Signed-off-by: Chuck Ketcham --- .../include/cudaq/realtime/hololink_bridge_common.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/realtime/include/cudaq/realtime/hololink_bridge_common.h b/realtime/include/cudaq/realtime/hololink_bridge_common.h index f4c2c6d95ae..c6ed548dc18 100644 --- a/realtime/include/cudaq/realtime/hololink_bridge_common.h +++ b/realtime/include/cudaq/realtime/hololink_bridge_common.h @@ -136,11 +136,11 @@ struct BridgeConfig { /// Default: cudaq_launch_dispatch_kernel_regular cudaq_dispatch_launch_fn_t launch_fn = nullptr; - // HOST_LOOP graph launch mode -- CPU-side dispatcher that polls Hololink - // ring flags and launches CUDA graphs. Requires a Grace-based system - // (Grace-Hopper / DGX Spark, Grace-Blackwell / GB200) where GPU memory is - // CPU-accessible via NVLink-C2C, since the HOST_LOOP thread reads DOCA - // GPU ring flags directly from the CPU. + // Set this to CUDAQ_DISPATCH_PATH_HOST for the HOST_LOOP graph launch mode -- + // CPU-side dispatcher that polls Hololink ring flags and launches CUDA + // graphs. Requires a Grace-based system (Grace-Hopper / DGX Spark, + // Grace-Blackwell / GB200) where GPU memory is CPU-accessible via NVLink-C2C, + // since the HOST_LOOP thread reads DOCA GPU ring flags directly from the CPU. cudaq_dispatch_path_t dispatch_path = CUDAQ_DISPATCH_PATH_DEVICE; /// Host-side function table for GRAPH_LAUNCH entries (HOST_LOOP only).