Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/realtime_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ jobs:
bash scripts/install_dev_prerequisites.sh
# Build HSB (GPU RoCE transceiver and hololink_core)
export CUDA_NATIVE_ARCH="${{ (contains(matrix.cuda_version, '12') && '80-real;90') || '80-real;90-real;100f-real;110-real;120-real;100-virtual' }}"
cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge
cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge
for p in /workspace/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done
cmake -G Ninja -S /workspace/holoscan-sensor-bridge -B /workspace/holoscan-sensor-bridge/build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF
cmake --build /workspace/holoscan-sensor-bridge/build --target roce_receiver gpu_roce_transceiver hololink_core
# Build CUDA-Q Realtime
Expand Down
1 change: 1 addition & 0 deletions realtime/docker/assets.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ ENV CUDA_NATIVE_ARCH=${cuda_native_arg}
ARG hsb_version="release-2.6.0-EA"
# Build HSB
RUN cd / && git clone -b ${hsb_version} https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge && \
for p in /cuda-quantum/realtime/scripts/hololink-patches/*.patch; do echo "Applying: $(basename $p)"; git apply "$p"; done && \
cmake -G Ninja -S . -B build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF && \
cmake --build build --target roce_receiver gpu_roce_transceiver hololink_core

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,15 @@ void hololink_close(hololink_transceiver_t handle);
*/
void hololink_blocking_monitor(hololink_transceiver_t handle);

/**
* Force CPU+GPU accessible allocation for ring flags and data only.
* Does NOT change CQ/QP UMEMs or TX kernel handler (BlueFlame preserved).
* Must be called after create and before start().
* Required when a CPU thread needs to read ring flags/data directly
* (e.g. HOST_LOOP dispatcher on Grace-Blackwell dGPU).
*/
void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable);

//==============================================================================
// QP information (for RDMA setup)
//==============================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ typedef struct {
cudaq_kernel_type_t kernel_type; // regular/cooperative kernel
cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch
cudaq_dispatch_path_t dispatch_path; // GPU kernel or CPU host loop
int skip_tx_markers; // when non-zero, the host dispatcher will NOT write
// sentinel markers (CUDAQ_TX_FLAG_IN_FLIGHT) to
// tx_flags before graph launch. Set this when an
// external GPU kernel (e.g. Hololink TX) polls the
// same tx_flags array; the sentinel would be
// misinterpreted as a valid address.
} cudaq_dispatcher_config_t;

// GPU ring buffer pointers. For device backend use device pointers only.
Expand Down
141 changes: 115 additions & 26 deletions realtime/include/cudaq/realtime/hololink_bridge_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,24 @@ struct BridgeConfig {
/// Default: cudaq_launch_dispatch_kernel_regular
cudaq_dispatch_launch_fn_t launch_fn = nullptr;

// Set this to CUDAQ_DISPATCH_PATH_HOST for the HOST_LOOP graph launch mode --
// CPU-side dispatcher that polls Hololink ring flags and launches CUDA
// graphs. Requires a Grace-based system (Grace-Hopper / DGX Spark,
// Grace-Blackwell / GB200) where GPU memory is CPU-accessible via NVLink-C2C,
// since the HOST_LOOP thread reads DOCA GPU ring flags directly from the CPU.
cudaq_dispatch_path_t dispatch_path = CUDAQ_DISPATCH_PATH_DEVICE;

/// Host-side function table for GRAPH_LAUNCH entries (HOST_LOOP only).
/// Each entry must have dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH
/// and handler.graph_exec set to a valid cudaGraphExec_t.
cudaq_function_entry_t *h_function_entries = nullptr;
size_t h_func_count = 0;

/// Pinned mailbox for HOST_LOOP graph dispatch (from graph_resources).
/// h_mailbox is the host pointer, d_mailbox is the device-mapped view.
void **h_mailbox = nullptr;
void **d_mailbox = nullptr;

/// @brief Optional cleanup callback invoked during shutdown.
std::function<void()> cleanup_fn;
};
Expand Down Expand Up @@ -247,8 +265,9 @@ inline int bridge_run(BridgeConfig &config) {
bool is_igpu = (prop.integrated != 0);
bool unified_igpu = config.unified && is_igpu;

bool is_host_loop = (config.dispatch_path == CUDAQ_DISPATCH_PATH_HOST);
bool use_forward = config.forward || (config.unified && !is_igpu);
bool use_3kernel = !config.forward && !config.unified;
bool use_3kernel = is_host_loop || (!config.forward && !config.unified);

hololink_transceiver_t transceiver = hololink_create_transceiver(
config.device.c_str(), 1, // ib_port
Expand All @@ -266,6 +285,10 @@ inline int bridge_run(BridgeConfig &config) {
return 1;
}

// HOST_LOOP needs CPU-readable ring flags and data; allocate as CPU_GPU.
if (is_host_loop)
hololink_set_cpu_ring_buffers(transceiver, 1);

std::cout << " Connecting to remote QP 0x" << std::hex << config.remote_qp
<< std::dec << " at " << config.peer_ip << "..." << std::endl;

Expand Down Expand Up @@ -336,7 +359,7 @@ inline int bridge_run(BridgeConfig &config) {
hololink_doca_transport_ctx unified_ctx{};

if (!config.forward) {
if (!config.unified) {
if (!config.unified && !is_host_loop) {
int dispatch_blocks = 0;
cudaError_t occ_err;
if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) {
Expand All @@ -358,8 +381,10 @@ inline int bridge_run(BridgeConfig &config) {
// [4] Wire dispatch kernel
//==========================================================================
std::cout << "\n[4/5] Wiring dispatch kernel ("
<< (config.unified ? "unified" : "3-kernel") << ")..."
<< std::endl;
<< (is_host_loop ? "host-loop"
: config.unified ? "unified"
: "3-kernel")
<< ")..." << std::endl;

void *tmp_shutdown = nullptr;
BRIDGE_CUDA_CHECK(
Expand All @@ -374,7 +399,14 @@ inline int bridge_run(BridgeConfig &config) {
BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast<int *>(d_shutdown_flag), &zero,
sizeof(int), cudaMemcpyHostToDevice));

BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t)));
if (is_host_loop) {
void *tmp_stats = nullptr;
BRIDGE_CUDA_CHECK(
cudaHostAlloc(&tmp_stats, sizeof(uint64_t), cudaHostAllocMapped));
d_stats = static_cast<uint64_t *>(tmp_stats);
} else {
BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t)));
}
BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t)));

if (cudaq_dispatch_manager_create(&manager) != CUDAQ_OK) {
Expand All @@ -385,28 +417,78 @@ inline int bridge_run(BridgeConfig &config) {
cudaq_dispatcher_config_t dconfig{};
dconfig.device_id = config.gpu_id;
dconfig.vp_id = 0;
dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;

if (config.unified) {
dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED;
dconfig.num_blocks = 1;
dconfig.threads_per_block = 1;
dconfig.num_slots = 0;
dconfig.slot_size = 0;
} else {
dconfig.kernel_type = config.kernel_type;
dconfig.num_blocks = config.num_blocks;
dconfig.threads_per_block = config.threads_per_block;

if (is_host_loop) {
dconfig.dispatch_path = CUDAQ_DISPATCH_PATH_HOST;
dconfig.dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH;
dconfig.num_slots = static_cast<uint32_t>(config.num_pages);
dconfig.slot_size = static_cast<uint32_t>(config.page_size);
// Hololink TX kernel polls tx_flags for ready data; writing sentinel
// markers (0xEEEE) would be misinterpreted as a valid TX buffer address.
dconfig.skip_tx_markers = 1;
} else {
dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;

if (config.unified) {
dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED;
dconfig.num_blocks = 1;
dconfig.threads_per_block = 1;
dconfig.num_slots = 0;
dconfig.slot_size = 0;
} else {
dconfig.kernel_type = config.kernel_type;
dconfig.num_blocks = config.num_blocks;
dconfig.threads_per_block = config.threads_per_block;
dconfig.num_slots = static_cast<uint32_t>(config.num_pages);
dconfig.slot_size = static_cast<uint32_t>(config.page_size);
}
}

if (cudaq_dispatcher_create(manager, &dconfig, &dispatcher) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to create dispatcher" << std::endl;
return 1;
}

if (config.unified) {
if (is_host_loop) {
// HOST_LOOP: wire ringbuffer with Hololink GPU pointers as both
// device and host views. On Grace-based systems (Grace-Hopper,
// Grace-Blackwell), GPU memory is CPU-accessible via NVLink-C2C.
cudaq_ringbuffer_t ringbuffer{};
ringbuffer.rx_flags = reinterpret_cast<volatile uint64_t *>(rx_ring_flag);
ringbuffer.tx_flags = reinterpret_cast<volatile uint64_t *>(tx_ring_flag);
ringbuffer.rx_data = rx_ring_data;
ringbuffer.tx_data = tx_ring_data;
ringbuffer.rx_stride_sz = config.page_size;
ringbuffer.tx_stride_sz = config.page_size;
ringbuffer.rx_flags_host =
reinterpret_cast<volatile uint64_t *>(rx_ring_flag);
ringbuffer.tx_flags_host =
reinterpret_cast<volatile uint64_t *>(tx_ring_flag);
ringbuffer.rx_data_host = rx_ring_data;
ringbuffer.tx_data_host = tx_ring_data;

if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) !=
CUDAQ_OK) {
std::cerr << "ERROR: Failed to set ringbuffer" << std::endl;
return 1;
}

cudaq_function_table_t table{};
table.entries = config.h_function_entries;
table.count = static_cast<uint32_t>(config.h_func_count);
if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to set function table" << std::endl;
return 1;
}

if (config.h_mailbox) {
if (cudaq_dispatcher_set_mailbox(dispatcher, config.h_mailbox) !=
CUDAQ_OK) {
std::cerr << "ERROR: Failed to set mailbox" << std::endl;
return 1;
}
}
} else if (config.unified) {
// Pack DOCA transport handles into the opaque context
unified_ctx.gpu_dev_qp = hololink_get_gpu_dev_qp(transceiver);
unified_ctx.rx_ring_data = rx_ring_data;
Expand Down Expand Up @@ -448,12 +530,14 @@ inline int bridge_run(BridgeConfig &config) {
}
}

cudaq_function_table_t table{};
table.entries = config.d_function_entries;
table.count = config.func_count;
if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to set function table" << std::endl;
return 1;
if (!is_host_loop) {
cudaq_function_table_t table{};
table.entries = config.d_function_entries;
table.count = config.func_count;
if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) {
std::cerr << "ERROR: Failed to set function table" << std::endl;
return 1;
}
}

if (cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats) !=
Expand Down Expand Up @@ -488,6 +572,7 @@ inline int bridge_run(BridgeConfig &config) {
std::this_thread::sleep_for(std::chrono::milliseconds(500));
} else {
std::cout << "\n[5/5] Launching Hololink kernels..." << std::endl;
std::cout.flush();
hololink_thread = std::thread(
[transceiver]() { hololink_blocking_monitor(transceiver); });
std::this_thread::sleep_for(std::chrono::milliseconds(500));
Expand Down Expand Up @@ -572,8 +657,12 @@ inline int bridge_run(BridgeConfig &config) {

if (shutdown_flag)
cudaFreeHost(const_cast<int *>(shutdown_flag));
if (d_stats)
cudaFree(d_stats);
if (d_stats) {
if (is_host_loop)
cudaFreeHost(d_stats);
else
cudaFree(d_stats);
}

// Call tool-specific cleanup
if (config.cleanup_fn)
Expand Down
9 changes: 0 additions & 9 deletions realtime/lib/daemon/bridge/hololink/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,6 @@ find_library(HOLOLINK_COMMON_LIB
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
NO_DEFAULT_PATH)

find_library(ROCE_RECEIVER_LIB
NAMES roce_receiver
PATHS
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}"
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver"
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib"
NO_DEFAULT_PATH)

find_library(BASE_RECEIVER_OP_LIB
NAMES base_receiver_op
PATHS
Expand Down Expand Up @@ -180,7 +172,6 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND
PRIVATE
hololink_wrapper_generic
${GPU_ROCE_TRANSCEIVER_LIB}
${ROCE_RECEIVER_LIB}
${BASE_RECEIVER_OP_LIB}
${HOLOLINK_CORE_LIB}
${HOLOLINK_COMMON_LIB}
Expand Down
7 changes: 7 additions & 0 deletions realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,13 @@ void hololink_blocking_monitor(hololink_transceiver_t handle) {
}
}

// Enable (enable != 0) or disable CPU+GPU accessible allocation for the
// transceiver's ring flag/data buffers by forwarding to the underlying
// transceiver's set_cpu_ring_buffers(). Per the public header: this does NOT
// change CQ/QP UMEMs or the TX kernel handler, and it must be called after
// create and before start(). Silently a no-op when handle is null.
void hololink_set_cpu_ring_buffers(hololink_transceiver_t handle, int enable) {
  if (handle) {
    // The opaque C handle wraps a HololinkTransceiverImpl; recover it to
    // reach the C++ transceiver object.
    auto *impl = reinterpret_cast<HololinkTransceiverImpl *>(handle);
    impl->transceiver->set_cpu_ring_buffers(enable != 0);
  }
}

//==============================================================================
// QP information
//==============================================================================
Expand Down
25 changes: 22 additions & 3 deletions realtime/lib/daemon/dispatcher/host_dispatcher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,21 @@ static int acquire_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
return __builtin_ffsll(static_cast<long long>(mask)) - 1;
}

// Scan all currently-busy graph workers and mark as idle any whose CUDA
// stream has drained (cudaStreamQuery returns cudaSuccess, i.e. no pending
// work on that stream). "Busy" is the complement of idle_mask, truncated to
// the low num_workers bits, so workers beyond num_workers are never touched.
// NOTE(review): assumes num_workers <= 63 — (1ULL << num_workers) would be
// UB at 64; confirm against the worker-pool configuration.
static void
sweep_completed_workers(const cudaq_host_dispatch_loop_ctx_t *ctx) {
  // Acquire load pairs with the release fetch_or below / in other threads,
  // so flag reads here see the worker state published at launch time.
  uint64_t busy =
      ~as_atomic_u64(ctx->idle_mask)->load(cuda::std::memory_order_acquire);
  busy &= (1ULL << ctx->num_workers) - 1;
  while (busy != 0) {
    // Index of the lowest set bit (ffs is 1-based, hence the -1).
    int w = __builtin_ffsll(static_cast<long long>(busy)) - 1;
    if (cudaStreamQuery(ctx->workers[w].stream) == cudaSuccess) {
      // Stream is empty: publish worker w back to the idle pool.
      as_atomic_u64(ctx->idle_mask)
          ->fetch_or(1ULL << w, cuda::std::memory_order_release);
    }
    // Clear the bit we just examined and continue with the next busy worker.
    busy &= ~(1ULL << w);
  }
}

static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
int worker_id, void *slot_host,
size_t current_slot) {
Expand Down Expand Up @@ -129,8 +144,10 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
void *d_ctx = d_ctxs + worker_id * sizeof(GraphIOContext);
ctx->h_mailbox_bank[worker_id] = d_ctx;

as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
if (!ctx->config.skip_tx_markers) {
as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
}
__sync_synchronize();
} else {
ctx->h_mailbox_bank[worker_id] = data_dev;
Expand All @@ -154,7 +171,7 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
if (ctx->workers[w].post_launch_fn)
ctx->workers[w].post_launch_fn(ctx->workers[w].post_launch_data,
data_dev, ctx->workers[w].stream);
if (ctx->io_ctxs_host == nullptr) {
if (ctx->io_ctxs_host == nullptr && !ctx->config.skip_tx_markers) {
as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
}
Expand All @@ -178,6 +195,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) {
cuda::std::memory_order_acquire);

if (rx_value == 0) {
sweep_completed_workers(ctx);
CUDAQ_REALTIME_CPU_RELAX();
continue;
}
Expand Down Expand Up @@ -206,6 +224,7 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) {
continue;
}

sweep_completed_workers(ctx);
int worker_id =
acquire_graph_worker(ctx, use_function_table, entry, function_id);
if (worker_id < 0) {
Expand Down
Loading
Loading