Skip to content

Commit e401242

Browse files
authored
Revert "Use native NVSHMEM synchronization APIs in NVSHMEM backends (#107)" (#110)
This reverts commit d5716ce. Signed-off-by: Josh Romero <joshr@nvidia.com>
1 parent 3c68e4a commit e401242

5 files changed

Lines changed: 71 additions & 105 deletions

File tree

include/internal/comm_routines.h

Lines changed: 60 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -95,23 +95,26 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
9595
const std::vector<comm_count_t>& send_counts, const std::vector<comm_count_t>& send_offsets,
9696
T* recv_buff, const std::vector<comm_count_t>& recv_counts,
9797
const std::vector<comm_count_t>& recv_offsets, cudecompCommAxis comm_axis, cudaStream_t stream) {
98-
auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
98+
auto comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
99+
auto comm = comm_info.mpi_comm;
99100
auto team = comm_info.nvshmem_team;
100101
int self_rank = comm_info.rank;
101-
auto aux_stream = handle->streams[handle->device_p2p_ce_count];
102-
103-
// Enforce sync dependency between transpose operations
104-
CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->nvshmem_sync_event));
105102

106103
// Event dependency on external stream for intra-group transfers
107104
CHECK_CUDA(cudaEventRecord(grid_desc->events[0], stream));
108105
for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
109106
CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
110107
}
111108

109+
// Using cudaEventSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
110+
CHECK_CUDA(cudaEventSynchronize(grid_desc->nvshmem_sync_event));
111+
CHECK_MPI(MPI_Barrier(comm));
112+
// nvshmemx_team_sync_on_stream(team, stream);
113+
112114
cudecompNvshmemA2AParams<T> params;
113115

114116
// Inter-group transfers (non-blocking)
117+
bool need_quiet = false;
115118
params.send_buff = send_buff;
116119
params.recv_buff = recv_buff;
117120
int count = 0;
@@ -131,11 +134,13 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
131134
params.ntransfers = count;
132135
cudecomp_nvshmem_alltoallv(params, stream);
133136
count = 0;
137+
need_quiet = true;
134138
}
135139
}
136140
if (count != 0) {
137141
params.ntransfers = count;
138142
cudecomp_nvshmem_alltoallv(params, stream);
143+
need_quiet = true;
139144
}
140145

141146
// Intra-group transfers (blocking, scheduled after non-blocking inter-group transfers for concurrency)
@@ -146,19 +151,19 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
146151
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
147152
if (nvshmem_ptr(recv_buff, dst_rank_global)) {
148153

149-
if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 && count != 0 &&
154+
if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 &&
150155
count % CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ == 0) {
151156
// For single group, single P2P CE (e.g. NVSwitch), synchronize NVSHMEM team every
152157
// CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ transfers. This helps reduce CE contention due to accumulation of
153158
// jitter.
154159
for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
155160
CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i]));
156-
CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->events[0], 0));
161+
CHECK_CUDA(cudaStreamWaitEvent(handle->streams[handle->device_p2p_ce_count], grid_desc->events[0], 0));
157162
}
158163

159-
nvshmemx_team_sync_on_stream(team, aux_stream);
164+
nvshmemx_team_sync_on_stream(team, handle->streams[handle->device_p2p_ce_count]);
160165

161-
CHECK_CUDA(cudaEventRecord(grid_desc->events[0], aux_stream));
166+
CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[handle->device_p2p_ce_count]));
162167
for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
163168
CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
164169
}
@@ -181,7 +186,12 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
181186
CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[0], 0));
182187
}
183188

184-
nvshmemx_barrier_on_stream(team, stream);
189+
if (need_quiet) { nvshmemx_quiet_on_stream(stream); }
190+
191+
// Using cudaStreamSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
192+
CHECK_CUDA(cudaStreamSynchronize(stream));
193+
CHECK_MPI(MPI_Barrier(comm));
194+
// nvshmemx_team_sync_on_stream(team, stream);
185195
}
186196
#endif
187197

@@ -227,7 +237,7 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD
227237
#endif
228238
}
229239
case CUDECOMP_TRANSPOSE_COMM_NCCL: {
230-
auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
240+
auto comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
231241
// For fully intra-group alltoall, use distinct NCCL local comm instead of global comm as it is faster.
232242
auto comm = (comm_info.ngroups == 1) ? *grid_desc->nccl_local_comm : *grid_desc->nccl_comm;
233243

@@ -357,7 +367,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
357367
const std::vector<comm_count_t>& recv_offsets,
358368
const std::vector<comm_count_t>& recv_offsets_nvshmem, cudecompCommAxis comm_axis,
359369
const std::vector<int>& src_ranks, const std::vector<int>& dst_ranks, cudaStream_t stream,
360-
cudecompTransposePerformanceSample* current_sample = nullptr) {
370+
bool& synced, cudecompTransposePerformanceSample* current_sample = nullptr) {
361371

362372
// If there are no transfers to complete, quick return
363373
if (send_counts.size() == 0 && recv_counts.size() == 0) { return; }
@@ -394,17 +404,14 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
394404
case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: {
395405
#ifdef ENABLE_NVSHMEM
396406
if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
397-
auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
407+
auto comm =
408+
(comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.mpi_comm : grid_desc->col_comm_info.mpi_comm;
409+
// auto team = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.nvshmem_team
410+
// : grid_desc->col_comm_info.nvshmem_team;
398411
auto pl_stream = handle->streams[0];
399-
auto aux_stream = handle->streams[handle->device_p2p_ce_count];
400412
int self_rank = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info.rank : grid_desc->col_comm_info.rank;
401413

402-
// Enforce sync dependency between transpose operations
403-
CHECK_CUDA(cudaStreamWaitEvent(pl_stream, grid_desc->nvshmem_sync_event));
404-
405-
bool need_quiet = false;
406-
407-
// Inter-group transfers and self-copy (non-blocking)
414+
bool barrier = false;
408415
for (int i = 0; i < src_ranks.size(); ++i) {
409416
int src_rank = src_ranks[i];
410417
int dst_rank = dst_ranks[i];
@@ -414,44 +421,39 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
414421
CHECK_CUDA(cudaMemcpyAsync(recv_buff + recv_offsets_nvshmem[self_rank], send_buff + send_offsets[self_rank],
415422
send_counts[self_rank] * sizeof(T), cudaMemcpyDeviceToDevice, stream));
416423
} else {
417-
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
418-
if (nvshmem_ptr(recv_buff, dst_rank_global)) { continue; }
419-
420424
CHECK_CUDA(cudaStreamWaitEvent(pl_stream, grid_desc->events[dst_rank], 0));
425+
if (!synced) {
426+
// Using cudaEventSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
427+
CHECK_CUDA(cudaEventSynchronize(grid_desc->nvshmem_sync_event));
428+
CHECK_MPI(MPI_Barrier(comm));
429+
// Only need to sync on the first remote operation of an alltoall sequence to ensure reads on other ranks
430+
// from previous communication have completed.
431+
synced = true;
432+
}
421433

422-
nvshmemx_putmem_signal_nbi_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank],
423-
send_buff + send_offsets[dst_rank], send_counts[dst_rank] * sizeof(T),
424-
&comm_info.nvshmem_signals[comm_info.rank], 1, NVSHMEM_SIGNAL_SET,
425-
dst_rank_global, pl_stream);
434+
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
435+
nvshmemx_putmem_nbi_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank], send_buff + send_offsets[dst_rank],
436+
send_counts[dst_rank] * sizeof(T), dst_rank_global, pl_stream);
426437

427-
need_quiet = true;
438+
barrier = true;
428439
}
429440
}
430441

431-
// Intra-group transfers (blocking, scheduled after non-blocking inter-group transfers for concurrency)
432-
for (int i = 0; i < src_ranks.size(); ++i) {
433-
int src_rank = src_ranks[i];
434-
int dst_rank = dst_ranks[i];
435-
436-
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
437-
if (!nvshmem_ptr(recv_buff, dst_rank_global) || src_rank == self_rank) { continue; }
438-
439-
CHECK_CUDA(cudaStreamWaitEvent(pl_stream, grid_desc->events[dst_rank], 0));
440-
441-
nvshmemx_putmem_signal_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank], send_buff + send_offsets[dst_rank],
442-
send_counts[dst_rank] * sizeof(T), &comm_info.nvshmem_signals[comm_info.rank],
443-
1, NVSHMEM_SIGNAL_SET, dst_rank_global, pl_stream);
444-
}
445-
446-
if (need_quiet) { nvshmemx_quiet_on_stream(pl_stream); }
447-
for (int i = 0; i < src_ranks.size(); ++i) {
448-
int src_rank = src_ranks[i];
449-
int dst_rank = dst_ranks[i];
450-
if (src_rank != self_rank) {
451-
nvshmemx_signal_wait_until_on_stream(&comm_info.nvshmem_signals[src_rank], NVSHMEM_CMP_EQ, 1, pl_stream);
452-
CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], pl_stream));
453-
CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[dst_rank], 0));
454-
}
442+
if (barrier) {
443+
nvshmemx_quiet_on_stream(pl_stream);
444+
// Using cudaStreamSynchronize + barrier instead of nvshmemx_team_sync_on_stream for lower latency
445+
CHECK_CUDA(cudaStreamSynchronize(pl_stream));
446+
CHECK_MPI(MPI_Barrier(comm));
447+
448+
// nvshmemx_team_sync_on_stream(team, pl_stream);
449+
// for (int i = 0; i < src_ranks.size(); ++i) {
450+
// int src_rank = src_ranks[i];
451+
// int dst_rank = dst_ranks[i];
452+
// if (src_rank != self_rank) {
453+
// CHECK_CUDA(cudaEventRecord(grid_desc->events[dst_rank], pl_stream));
454+
// CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[dst_rank], 0));
455+
// }
456+
//}
455457
}
456458
break;
457459
} else {
@@ -592,7 +594,8 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
592594
case CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING: {
593595
#ifdef ENABLE_NVSHMEM
594596
if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
595-
nvshmemx_barrier_all_on_stream(stream);
597+
nvshmemx_quiet_on_stream(stream);
598+
nvshmemx_sync_all_on_stream(stream);
596599
for (int i = 0; i < send_counts.size(); ++i) {
597600
if (peer_ranks[i] == handle->rank) {
598601
// Self-copy with cudaMemcpy
@@ -605,12 +608,14 @@ static void cudecompSendRecvPair(const cudecompHandle_t& handle, const cudecompG
605608
}
606609
}
607610
if (grid_desc->config.halo_comm_backend == CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING) {
608-
nvshmemx_barrier_all_on_stream(stream);
611+
nvshmemx_quiet_on_stream(stream);
612+
nvshmemx_sync_all_on_stream(stream);
609613
}
610614
}
611615

612616
if (grid_desc->config.halo_comm_backend == CUDECOMP_HALO_COMM_NVSHMEM) {
613-
nvshmemx_barrier_all_on_stream(stream);
617+
nvshmemx_quiet_on_stream(stream);
618+
nvshmemx_sync_all_on_stream(stream);
614619
};
615620
break;
616621
} else {

include/internal/common.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ struct cudecompCommInfo {
127127

128128
#ifdef ENABLE_NVSHMEM
129129
nvshmem_team_t nvshmem_team = NVSHMEM_TEAM_INVALID;
130-
uint64_t* nvshmem_signals = nullptr;
131130
#endif
132131

133132
bool mnnvl_active = false; // flag to indicate whether communicator has MNNVL connections

include/internal/transpose.h

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -236,26 +236,12 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
236236
T* o2 = work + pinfo_a.size;
237237
T* o3 = output;
238238

239-
#ifdef ENABLE_NVSHMEM
240239
if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend)) {
241240
auto max_pencil_size_a = getGlobalMaxPencilSize(handle, grid_desc, ax_a);
242241
o2 = work + max_pencil_size_a;
243-
244-
// NVSHMEM team synchronization between transpose operations
245-
if (splits_a.size() != 1) {
246-
auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
247-
auto team = comm_info.nvshmem_team;
248-
auto aux_stream = handle->streams[handle->device_p2p_ce_count];
249-
CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream));
250-
CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->nvshmem_sync_event));
251-
// Zero out signal buffer for this team here.
252-
CHECK_CUDA(cudaMemsetAsync(comm_info.nvshmem_signals, 0, comm_info.nranks * sizeof(uint64_t), aux_stream));
253-
nvshmemx_team_sync_on_stream(team, aux_stream);
254-
CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, aux_stream));
255-
// Delay final stream wait dependency to alltoall to ensure sync runs concurrently with initial transpose/pack
256-
}
242+
// Record event at start of transpose op for NVSHMEM team synchronization
243+
CHECK_CUDA(cudaEventRecord(grid_desc->nvshmem_sync_event, stream));
257244
}
258-
#endif
259245

260246
cudecompTransposePerformanceSample* current_sample = nullptr;
261247
if (handle->performance_report_enable) {
@@ -623,6 +609,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
623609
}
624610

625611
if (pipelined) {
612+
bool nvshmem_synced = false;
626613
for (int j = 0; j < splits_b.size(); ++j) {
627614
int src_rank, dst_rank;
628615
getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
@@ -652,7 +639,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
652639

653640
if (o2 != o1) {
654641
cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
655-
recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, current_sample);
642+
recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
643+
current_sample);
656644
}
657645

658646
if (o2 != o3) {
@@ -716,6 +704,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
716704
if (i > 0) { strides_out[i] = strides_out[i - 1] * extents_h[i - 1]; }
717705
}
718706

707+
bool nvshmem_synced = false;
719708
for (int j = 0; j < splits_b.size(); ++j) {
720709
int src_rank, dst_rank;
721710
getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank);
@@ -746,7 +735,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
746735

747736
if (o2 != o1) {
748737
cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
749-
recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, current_sample);
738+
recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
739+
current_sample);
750740
}
751741
}
752742

@@ -770,6 +760,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
770760
}
771761
} else {
772762
// Unpack
763+
bool nvshmem_synced = false;
773764
int memcpy_count = 0;
774765
cudecompBatchedD2DMemcpy3DParams<T> memcpy_params;
775766
for (int j = 0; j < splits_a.size(); ++j) {
@@ -802,7 +793,8 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
802793

803794
if (o2 != o1) {
804795
cudecompAlltoallPipelined(handle, grid_desc, o1, send_counts, send_offsets, o2, recv_counts, recv_offsets,
805-
recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, current_sample);
796+
recv_offsets_nvshmem, comm_axis, src_ranks, dst_ranks, stream, nvshmem_synced,
797+
current_sample);
806798
}
807799
}
808800

src/autotune.cc

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -275,14 +275,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
275275
nvshmem_team_config_t tmp;
276276
nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
277277
&grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
278-
grid_desc->row_comm_info.nvshmem_signals =
279-
(uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
280-
CHECK_CUDA(
281-
cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
282-
grid_desc->col_comm_info.nvshmem_signals =
283-
(uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
284-
CHECK_CUDA(
285-
cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
286278
#endif
287279
}
288280

@@ -459,8 +451,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
459451
#ifdef ENABLE_NVSHMEM
460452
nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
461453
nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
462-
nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
463-
nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
464454
grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
465455
grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
466456
#endif
@@ -701,14 +691,6 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
701691
nvshmem_team_config_t tmp;
702692
nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
703693
&grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
704-
grid_desc->row_comm_info.nvshmem_signals =
705-
(uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
706-
CHECK_CUDA(
707-
cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
708-
grid_desc->col_comm_info.nvshmem_signals =
709-
(uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
710-
CHECK_CUDA(
711-
cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
712694
#endif
713695
}
714696

@@ -822,8 +804,6 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
822804
#ifdef ENABLE_NVSHMEM
823805
nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
824806
nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
825-
nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
826-
nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
827807
grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
828808
grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
829809
#endif

0 commit comments

Comments
 (0)