diff --git a/include/cudecomp.h b/include/cudecomp.h index 9d5e5dd..902a628 100644 --- a/include/cudecomp.h +++ b/include/cudecomp.h @@ -48,7 +48,8 @@ typedef enum { CUDECOMP_TRANSPOSE_COMM_NCCL = 4, ///< NCCL backend CUDECOMP_TRANSPOSE_COMM_NCCL_PL = 5, ///< NCCL backend with pipelining CUDECOMP_TRANSPOSE_COMM_NVSHMEM = 6, ///< NVSHMEM backend - CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL = 7 ///< NVSHMEM backend with pipelining + CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL = 7, ///< NVSHMEM backend with pipelining + CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM = 8 ///< NVSHMEM backend using SM-based P2P transfers } cudecompTransposeCommBackend_t; /** diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 138de8f..aebd048 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -90,11 +90,12 @@ static inline void checkMpiInt32Limit(int64_t val, cudecompHaloCommBackend_t bac #ifdef ENABLE_NVSHMEM #define CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ 8 // max number of intra-group transfers to schedule between team syncs template -static void -nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, - const std::vector& send_counts, const std::vector& send_offsets, - T* recv_buff, const std::vector& recv_counts, - const std::vector& recv_offsets, cudecompCommAxis comm_axis, cudaStream_t stream) { +static void nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff, + const std::vector& send_counts, + const std::vector& send_offsets, T* recv_buff, + const std::vector& recv_counts, + const std::vector& recv_offsets, cudecompCommAxis comm_axis, bool use_sm, + cudaStream_t stream) { auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info; auto team = comm_info.nvshmem_team; int self_rank = comm_info.rank; @@ -104,16 +105,25 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->nvshmem_sync_event)); // Event dependency on external stream for intra-group transfers - CHECK_CUDA(cudaEventRecord(grid_desc->events[0], stream)); - for (int i = 0; i < handle->device_p2p_ce_count; ++i) { - CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0)); + if (!use_sm) { + CHECK_CUDA(cudaEventRecord(grid_desc->events[0], stream)); + for (int i = 0; i < handle->device_p2p_ce_count; ++i) { + CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0)); + } } + bool need_barrier = false; + bool need_quiet = false; cudecompNvshmemA2AParams params; + cudecompNvshmemP2PParams p2p_params; + p2p_params.send_buff = send_buff; + p2p_params.recv_buff = recv_buff; + p2p_params.block_counters = grid_desc->nvshmem_block_counters; // Inter-group transfers (non-blocking) params.send_buff = send_buff; params.recv_buff = recv_buff; + int count = 0; for (int i = 1; i < send_counts.size(); ++i) { int src_rank, dst_rank; @@ -121,6 +131,8 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank); if (nvshmem_ptr(recv_buff, dst_rank_global)) { continue; } + if (!use_sm) need_barrier = true; + need_quiet = true; params.send_offsets[count] = send_offsets[dst_rank]; params.recv_offsets[count] = recv_offsets[dst_rank]; params.send_counts[count] = send_counts[dst_rank]; @@ -129,59 +141,87 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_ if (count == CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY) { params.ntransfers = count; - cudecomp_nvshmem_alltoallv(params, stream); + cudecomp_nvshmem_alltoallv(params, use_sm ? &comm_info.nvshmem_signals[0] : nullptr, stream); count = 0; } } if (count != 0) { params.ntransfers = count; - cudecomp_nvshmem_alltoallv(params, stream); + cudecomp_nvshmem_alltoallv(params, use_sm ? &comm_info.nvshmem_signals[0] : nullptr, stream); } // Intra-group transfers (blocking, scheduled after non-blocking inter-group transfers for concurrency) count = 0; - for (int i = 1; i < send_counts.size(); ++i) { + for (int i = (use_sm ? 0 : 1); i < send_counts.size(); ++i) { int src_rank, dst_rank; getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank); int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank); if (nvshmem_ptr(recv_buff, dst_rank_global)) { - if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 && count != 0 && - count % CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ == 0) { - // For single group, single P2P CE (e.g. NVSwitch), synchronize NVSHMEM team every - // CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ transfers This helps reduce CE contention due to accumulation of - // jitter. - for (int i = 0; i < handle->device_p2p_ce_count; ++i) { - CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i])); - CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->events[0], 0)); - } + if (!use_sm) { + need_barrier = true; + if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 && count != 0 && + count % CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ == 0) { + // For single group, single P2P CE (e.g. NVSwitch), synchronize NVSHMEM team every + // CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ transfers This helps reduce CE contention due to accumulation of + // jitter. + for (int i = 0; i < handle->device_p2p_ce_count; ++i) { + CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i])); + CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->events[0], 0)); + } - nvshmemx_team_sync_on_stream(team, aux_stream); + nvshmemx_team_sync_on_stream(team, aux_stream); - CHECK_CUDA(cudaEventRecord(grid_desc->events[0], aux_stream)); - for (int i = 0; i < handle->device_p2p_ce_count; ++i) { - CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0)); + CHECK_CUDA(cudaEventRecord(grid_desc->events[0], aux_stream)); + for (int i = 0; i < handle->device_p2p_ce_count; ++i) { + CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0)); + } } - } - nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank], send_buff + send_offsets[dst_rank], - send_counts[dst_rank] * sizeof(T), dst_rank_global, - handle->streams[count % handle->device_p2p_ce_count]); - count++; + nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank], send_buff + send_offsets[dst_rank], + send_counts[dst_rank] * sizeof(T), dst_rank_global, + handle->streams[count % handle->device_p2p_ce_count]); + count++; + } else { + p2p_params.send_offsets[count] = send_offsets[dst_rank]; + p2p_params.recv_offsets[count] = recv_offsets[dst_rank]; + p2p_params.send_counts[count] = send_counts[dst_rank]; + p2p_params.peer_ranks[count] = dst_rank_global; + count++; + + if (count == CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY) { + p2p_params.ntransfers = count; + cudecomp_nvshmem_alltoallv_p2p(handle, p2p_params, &comm_info.nvshmem_signals[0], stream); + count = 0; + } + } } } - // Self-copy with cudaMemcpy - CHECK_CUDA(cudaMemcpyAsync(recv_buff + recv_offsets[self_rank], send_buff + send_offsets[self_rank], - send_counts[self_rank] * sizeof(T), cudaMemcpyDeviceToDevice, stream)); + if (use_sm) { + if (count != 0) { + p2p_params.ntransfers = count; + cudecomp_nvshmem_alltoallv_p2p(handle, p2p_params, &comm_info.nvshmem_signals[0], stream); + } + + if (need_quiet) { nvshmemx_quiet_on_stream(stream); } + nvshmemx_signal_wait_until_on_stream(&comm_info.nvshmem_signals[0], NVSHMEM_CMP_EQ, + static_cast(comm_info.nranks), stream); + } else { + // Self-copy with cudaMemcpy + CHECK_CUDA(cudaMemcpyAsync(recv_buff + recv_offsets[self_rank], send_buff + send_offsets[self_rank], + send_counts[self_rank] * sizeof(T), cudaMemcpyDeviceToDevice, stream)); + } - // Event dependency on internal streams for completion of intra-group transfers - for (int i = 0; i < handle->device_p2p_ce_count; ++i) { - CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i])); - CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[0], 0)); + // Event dependency on internal streams for completion of intra-group transfers (not needed for SM path) + if (!use_sm) { + for (int i = 0; i < handle->device_p2p_ce_count; ++i) { + CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i])); + CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[0], 0)); + } } - nvshmemx_barrier_on_stream(team, stream); + if (need_barrier) { nvshmemx_barrier_on_stream(team, stream); } } #endif @@ -213,11 +253,13 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD std::vector reqs; switch (grid_desc->config.transpose_comm_backend) { - case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: { + case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: + case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: { #ifdef ENABLE_NVSHMEM if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) { nvshmemAlltoallV(handle, grid_desc, send_buff, send_counts, send_offsets, recv_buff, recv_counts, - recv_offsets_nvshmem, comm_axis, stream); + recv_offsets_nvshmem, comm_axis, + grid_desc->config.transpose_comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM, stream); break; } else { THROW_INVALID_USAGE("NVSHMEM communication backends require workspace allocated via cudecompMalloc."); diff --git a/include/internal/common.h b/include/internal/common.h index fde5680..ea52a73 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -114,6 +114,8 @@ struct cudecompHandle { // Miscellaneous int32_t device_p2p_ce_count = 0; // number of P2P CEs available + int32_t device_num_sms = 0; // number of SMs on the device + int32_t device_max_threads_per_sm = 0; // maximum threads per SM bool use_col_major_rank_order = false; // Flag to control whether to use column-major rank order }; @@ -183,6 +185,10 @@ struct cudecompGridDesc { std::vector events{nullptr}; // CUDA events used for scheduling cudaEvent_t nvshmem_sync_event = nullptr; // NVSHMEM event used for synchronization +#ifdef ENABLE_NVSHMEM + int* nvshmem_block_counters = nullptr; // device memory counters for SM alltoallv last-block detection +#endif + cudecomp::graphCache graph_cache; // CUDA graph cache cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle @@ -292,7 +298,8 @@ static inline bool haloBackendRequiresNccl(cudecompHaloCommBackend_t comm_backen } static inline bool transposeBackendRequiresNvshmem(cudecompTransposeCommBackend_t comm_backend) { - return (comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM || comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL); + return (comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM || comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL || + comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM); } static inline bool haloBackendRequiresNvshmem(cudecompHaloCommBackend_t comm_backend) { @@ -380,8 +387,8 @@ static inline void getAlltoallPeerRanks(cudecompGridDesc_t grid_desc, cudecompCo const auto& info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info; - // Quick return for single rank case - if (info.nranks == 1) { + // Return self for single rank case or when iter is zero + if (info.nranks == 1 || iter == 0) { src_rank = info.rank; dst_rank = info.rank; return; diff --git a/include/internal/cudecomp_kernels.cuh b/include/internal/cudecomp_kernels.cuh index f09c3d2..97bcfc9 100644 --- a/include/internal/cudecomp_kernels.cuh +++ b/include/internal/cudecomp_kernels.cuh @@ -18,6 +18,8 @@ #ifndef CUDECOMP_KERNELS_CUH #define CUDECOMP_KERNELS_CUH +#include + #ifdef ENABLE_NVSHMEM #include #endif @@ -28,6 +30,8 @@ #define CUDECOMP_UNROLL_FACTOR (4) #define CUDECOMP_MIN_BLOCKS_PER_SM (16) +#define CUDECOMP_NVSHMEM_NTHREADS (1024) + namespace cudecomp { #ifdef ENABLE_NVSHMEM @@ -47,6 +51,61 @@ __launch_bounds__(CUDECOMP_CUDA_NTHREADS) __global__ nvshmem_putmem_nbi(recv_buff + recv_offset, send_buff + send_offset, send_count * sizeof(T), peer_rank); } + +template +__launch_bounds__(CUDECOMP_CUDA_NTHREADS) __global__ + void cudecomp_nvshmem_alltoallv_signal_k(cudecompNvshmemA2AParams params, uint64_t* sig_addr) { + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= params.ntransfers) return; + + int peer_rank = params.peer_ranks[tid]; + T* send_buff = params.send_buff; + T* recv_buff = params.recv_buff; + size_t send_offset = params.send_offsets[tid]; + size_t recv_offset = params.recv_offsets[tid]; + size_t send_count = params.send_counts[tid]; + + nvshmem_putmem_signal_nbi(recv_buff + recv_offset, send_buff + send_offset, send_count * sizeof(T), sig_addr, 1, + NVSHMEM_SIGNAL_ADD, peer_rank); +} + +template +__launch_bounds__(CUDECOMP_NVSHMEM_NTHREADS) __global__ + void cudecomp_nvshmem_alltoallv_p2p_k(cudecompNvshmemP2PParams params, uint64_t* sig_addr) { + + T* send_buff = params.send_buff; + T* recv_buff = params.recv_buff; + int bid = blockIdx.x; + + if (params.ntransfers > 0) { + int blocks_per_copy = gridDim.x / params.ntransfers; + int copyid = bid / blocks_per_copy; + int block_within_copy = bid % blocks_per_copy; + int peer_rank = params.peer_ranks[copyid]; + size_t send_offset = params.send_offsets[copyid]; + size_t recv_offset = params.recv_offsets[copyid]; + size_t send_count = params.send_counts[copyid]; + + size_t nelems_per_block = (send_count + blocks_per_copy - 1) / blocks_per_copy; + size_t block_offset = (size_t)block_within_copy * nelems_per_block; + if (block_offset < send_count) { + size_t block_count = min(nelems_per_block, send_count - block_offset); + nvshmemx_putmem_block(recv_buff + recv_offset + block_offset, send_buff + send_offset + block_offset, + block_count * sizeof(T), peer_rank); + } + + // Last block to finish this copy signals the destination PE. + nvshmem_fence(); + __syncthreads(); + if (threadIdx.x == 0) { + if (atomicAdd(¶ms.block_counters[peer_rank], 1) + 1 == blocks_per_copy) { + params.block_counters[peer_rank] = 0; + nvshmemx_signal_op(sig_addr, 1, NVSHMEM_SIGNAL_ADD, peer_rank); + } + } + } +} #endif template @@ -107,7 +166,8 @@ __launch_bounds__(CUDECOMP_CUDA_NTHREADS) __global__ } template -void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3DParams& params, +void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(cudecompHandle_t handle, + const cudecompBatchedD2DMemcpy3DParams& params, cudaStream_t stream) { size_t N = params.extents[0][0] * params.extents[1][0] * params.extents[2][0]; @@ -138,12 +198,9 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D int blocks_per_copy_unroll = (blocks_per_copy + CUDECOMP_UNROLL_FACTOR - 1) / CUDECOMP_UNROLL_FACTOR; size_t total_blocks_unroll = params.ncopies * blocks_per_copy_unroll; - // Clamp minimum number of blocks from unrolling - int dev, num_sms; - CHECK_CUDA(cudaGetDevice(&dev)); - CHECK_CUDA(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev)); - - if (total_blocks_unroll > CUDECOMP_MIN_BLOCKS_PER_SM * num_sms) { blocks_per_copy = blocks_per_copy_unroll; } + if (total_blocks_unroll > CUDECOMP_MIN_BLOCKS_PER_SM * handle->device_num_sms) { + blocks_per_copy = blocks_per_copy_unroll; + } switch (src_nd) { case 1: diff --git a/include/internal/cudecomp_kernels.h b/include/internal/cudecomp_kernels.h index 6fcb9d7..17025aa 100644 --- a/include/internal/cudecomp_kernels.h +++ b/include/internal/cudecomp_kernels.h @@ -20,42 +20,77 @@ #include +#include "internal/common.h" + namespace cudecomp { #ifdef ENABLE_NVSHMEM -#define CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY 96 + +// Capacity for the inter-group alltoallv kernel. +#define CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY (96) template struct cudecompNvshmemA2AParams { - int ntransfers; T* send_buff = nullptr; T* recv_buff = nullptr; size_t send_offsets[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY]; size_t recv_offsets[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY]; size_t send_counts[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY]; int peer_ranks[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY]; + int ntransfers; +}; + +// Capacity for the intra-group SM P2P kernel. +#define CUDECOMP_NVSHMEM_MAX_SMS (32) +#define CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY (CUDECOMP_NVSHMEM_MAX_SMS * 2) +template struct cudecompNvshmemP2PParams { + T* send_buff = nullptr; + T* recv_buff = nullptr; + int* block_counters = nullptr; + size_t send_offsets[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY]; + size_t recv_offsets[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY]; + size_t send_counts[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY]; + int peer_ranks[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY]; + int ntransfers; }; -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, cudaStream_t stream); -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, cudaStream_t stream); -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, cudaStream_t stream); -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, uint64_t* sig_addr, cudaStream_t stream); +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, uint64_t* sig_addr, + cudaStream_t stream); +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, uint64_t* sig_addr, cudaStream_t stream); +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, uint64_t* sig_addr, + cudaStream_t stream); + +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams& params, + uint64_t* sig_addr, cudaStream_t stream); +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams& params, + uint64_t* sig_addr, cudaStream_t stream); +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, + const cudecompNvshmemP2PParams>& params, + uint64_t* sig_addr, cudaStream_t stream); +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, + const cudecompNvshmemP2PParams>& params, + uint64_t* sig_addr, cudaStream_t stream); #endif #define CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY 56 template struct cudecompBatchedD2DMemcpy3DParams { - int ncopies; T* src[CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY]; T* dest[CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY]; size_t src_strides[2][CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY]; // [depth stride, row stride] col_stride=1 assumed size_t dest_strides[2][CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY]; // [depth stride, row stride] col_stride=1 assumed size_t extents[3][CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY]; // [depth, height, width] + int ncopies; }; -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams& params, cudaStream_t stream); -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams& params, cudaStream_t stream); -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams>& params, +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams& params, + cudaStream_t stream); +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams& params, + cudaStream_t stream); +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, + cudecompBatchedD2DMemcpy3DParams>& params, cudaStream_t stream); -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams>& params, +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, + cudecompBatchedD2DMemcpy3DParams>& params, cudaStream_t stream); } // namespace cudecomp diff --git a/include/internal/halo.h b/include/internal/halo.h index c9d890f..dd9a5e2 100644 --- a/include/internal/halo.h +++ b/include/internal/halo.h @@ -183,7 +183,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG } memcpy_params.ncopies = 2; - cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream); + cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream); } break; case 1: { @@ -218,7 +218,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG } memcpy_params.ncopies = 2; - cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream); + cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream); std::array counts{static_cast(halo_size), static_cast(halo_size)}; std::array offsets{}; @@ -266,7 +266,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG memcpy_params.ncopies = 2; } - cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream); + cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream); } break; case 2: { diff --git a/include/internal/transpose.h b/include/internal/transpose.h index 6328f93..5275379 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -579,7 +579,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c memcpy_count++; if (memcpy_count == memcpy_limit || j == splits_a.size()) { memcpy_params.ncopies = memcpy_count; - cudecomp_batched_d2d_memcpy_3d(memcpy_params, graph_stream); + cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, graph_stream); memcpy_count = 0; } #if CUDART_VERSION >= 11010 @@ -873,7 +873,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c memcpy_count++; if (memcpy_count == memcpy_limit || j == splits_a.size() - 1) { memcpy_params.ncopies = memcpy_count; - cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream); + cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream); memcpy_count = 0; } } diff --git a/src/autotune.cc b/src/autotune.cc index d4872c8..15a678c 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -105,6 +105,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d if (!options->disable_nvshmem_backends) { comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NVSHMEM); comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL); + comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM); need_nvshmem = true; } #endif @@ -283,6 +284,8 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t)); CHECK_CUDA( cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t))); + CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int))); + CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int))); #endif } @@ -461,6 +464,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team); nvshmem_free(grid_desc->row_comm_info.nvshmem_signals); nvshmem_free(grid_desc->col_comm_info.nvshmem_signals); + CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters)); grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID; grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID; #endif @@ -709,6 +713,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t)); CHECK_CUDA( cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t))); + CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int))); + CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int))); #endif } @@ -824,6 +830,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team); nvshmem_free(grid_desc->row_comm_info.nvshmem_signals); nvshmem_free(grid_desc->col_comm_info.nvshmem_signals); + CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters)); grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID; grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID; #endif diff --git a/src/cudecomp.cc b/src/cudecomp.cc index c37902f..9a841cd 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -102,11 +102,13 @@ static void checkTransposeCommBackend(cudecompTransposeCommBackend_t comm_backen case CUDECOMP_TRANSPOSE_COMM_MPI_A2A: #ifdef ENABLE_NVSHMEM case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: - case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: return; + case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: + case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: return; #else return; case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: - case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: THROW_NOT_SUPPORTED("transpose communication type unsupported"); + case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: + case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: THROW_NOT_SUPPORTED("transpose communication type unsupported"); #endif default: THROW_INVALID_USAGE("unknown transpose communication type"); @@ -529,6 +531,10 @@ cudecompResult_t cudecompInit(cudecompHandle_t* handle_in, MPI_Comm mpi_comm) { handle->device_p2p_ce_count = 2; // Assume 2 P2P CE otherwise (shared D2H/H2D CE) } + // Cache device attributes + CHECK_CUDA(cudaDeviceGetAttribute(&handle->device_num_sms, cudaDevAttrMultiProcessorCount, dev)); + CHECK_CUDA(cudaDeviceGetAttribute(&handle->device_max_threads_per_sm, cudaDevAttrMaxThreadsPerMultiProcessor, dev)); + handle->initialized = true; cudecomp_initialized = true; @@ -791,6 +797,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t)); CHECK_CUDA( cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t))); + CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int))); + CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int))); handle->n_grid_descs_using_nvshmem++; } else { // Finalize nvshmem to reclaim symmetric heap memory if not used @@ -924,6 +932,7 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team); nvshmem_free(grid_desc->col_comm_info.nvshmem_signals); } + CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters)); handle->n_grid_descs_using_nvshmem--; // Finalize nvshmem to reclaim symmetric heap memory if not used @@ -1338,6 +1347,7 @@ const char* cudecompTransposeCommBackendToString(cudecompTransposeCommBackend_t case CUDECOMP_TRANSPOSE_COMM_MPI_A2A: return "MPI_A2A"; case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: return "NVSHMEM"; case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: return "NVSHMEM (pipelined)"; + case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: return "NVSHMEM_SM"; default: return "ERROR"; } } diff --git a/src/cudecomp_kernels.cu b/src/cudecomp_kernels.cu index db6b345..311ebb9 100644 --- a/src/cudecomp_kernels.cu +++ b/src/cudecomp_kernels.cu @@ -22,19 +22,23 @@ namespace cudecomp { -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams& params, cudaStream_t stream) { - cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream); +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams& params, + cudaStream_t stream) { + cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream); } -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams& params, cudaStream_t stream) { - cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream); +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams& params, + cudaStream_t stream) { + cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream); } -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams>& params, +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, + cudecompBatchedD2DMemcpy3DParams>& params, cudaStream_t stream) { - cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream); + cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream); } -void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams>& params, +void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, + cudecompBatchedD2DMemcpy3DParams>& params, cudaStream_t stream) { - cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream); + cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream); } } // namespace cudecomp diff --git a/src/cudecomp_kernels_rdc.cu b/src/cudecomp_kernels_rdc.cu index 2215f39..2711f2b 100644 --- a/src/cudecomp_kernels_rdc.cu +++ b/src/cudecomp_kernels_rdc.cu @@ -23,27 +23,77 @@ namespace cudecomp { #ifdef ENABLE_NVSHMEM -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, cudaStream_t stream) { - cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params); +template +static void launch_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, uint64_t* sig_addr, + cudaStream_t stream) { + if (sig_addr) { + cudecomp_nvshmem_alltoallv_signal_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params, sig_addr); + } else { + cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params); + } CHECK_CUDA_LAUNCH(); } -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, cudaStream_t stream) { - cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params); - CHECK_CUDA_LAUNCH(); +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, uint64_t* sig_addr, + cudaStream_t stream) { + launch_nvshmem_alltoallv(params, sig_addr, stream); } -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams& params, uint64_t* sig_addr, cudaStream_t stream) { - cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params); - CHECK_CUDA_LAUNCH(); + launch_nvshmem_alltoallv(params, sig_addr, stream); } -void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, uint64_t* sig_addr, cudaStream_t stream) { - cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params); + launch_nvshmem_alltoallv(params, sig_addr, stream); +} + +void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams>& params, uint64_t* sig_addr, + cudaStream_t stream) { + launch_nvshmem_alltoallv(params, sig_addr, stream); +} + +static int nvshmem_p2p_nblocks(int ntransfers, cudecompHandle_t handle) { + // Theoretical max blocks per SM based on thread count. Round down to a multiple of ntransfers so + // that all launched blocks fit in a single resident wave and each copy receives exactly the same + // number of blocks, ensuring even NVLink subscription across peers. + int max_blocks_per_sm = handle->device_max_threads_per_sm / CUDECOMP_NVSHMEM_NTHREADS; + int nblocks_max = max_blocks_per_sm * std::min(handle->device_num_sms, (int32_t)CUDECOMP_NVSHMEM_MAX_SMS); + if (ntransfers == 0) return nblocks_max; + int blocks_per_copy = std::max(1, nblocks_max / ntransfers); + return blocks_per_copy * ntransfers; +} + +template +static void launch_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams& params, + uint64_t* sig_addr, cudaStream_t stream) { + int nblocks = nvshmem_p2p_nblocks(params.ntransfers, handle); + cudecomp_nvshmem_alltoallv_p2p_k<<>>(params, sig_addr); CHECK_CUDA_LAUNCH(); } + +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams& params, + uint64_t* sig_addr, cudaStream_t stream) { + launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream); +} + +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams& params, + uint64_t* sig_addr, cudaStream_t stream) { + launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream); +} + +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, + const cudecompNvshmemP2PParams>& params, + uint64_t* sig_addr, cudaStream_t stream) { + launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream); +} + +void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, + const cudecompNvshmemP2PParams>& params, + uint64_t* sig_addr, cudaStream_t stream) { + launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream); +} #endif } // namespace cudecomp diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf index 8d207b6..fb7bb04 100644 --- a/src/cudecomp_m.cuf +++ b/src/cudecomp_m.cuf @@ -30,6 +30,7 @@ module cudecomp enumerator :: CUDECOMP_TRANSPOSE_COMM_NCCL_PL = 5 enumerator :: CUDECOMP_TRANSPOSE_COMM_NVSHMEM = 6 enumerator :: CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL = 7 + enumerator :: CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM = 8 end enum ! enum for cuDecomp halo backend options diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index d11af09..7060d99 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -48,8 +48,8 @@ module transpose_CUDECOMP_DOUBLE_COMPLEX_mod type(cudecompHandle) :: handle integer :: rank, nranks - type(cudecompGridDesc) :: grid_desc_cache(7) - logical :: grid_desc_cache_set(7) = .false. + type(cudecompGridDesc) :: grid_desc_cache(8) + logical :: grid_desc_cache_set(8) = .false. ARRTYPE, pointer, device, contiguous :: work_d(:) integer :: work_backend = -1 @@ -642,7 +642,7 @@ program main endif ! Free grid descriptors - do i = 1, 7 + do i = 1, 8 if (grid_desc_cache_set(i)) then ! Free workspace with correct grid descriptor if (work_backend == i) then diff --git a/tests/test_config.yaml b/tests/test_config.yaml index 08dcbed..f1a94d5 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -18,7 +18,7 @@ transpose_test_base: &transpose_test_base 'hex' ,'hey', 'hez', 'pdx', 'pdy', 'pdz'] - backend: [1, 2, 3, 4, 5, 6, 7] + backend: [1, 2, 3, 4, 5, 6, 7, 8] gx: [128] gy: [124]