Skip to content

Commit 7f96ba4

Browse files
authored
Enforce NVSHMEM minimum version of 2.6.0 to remove old workarounds. Update NVSHMEM usage guidance. (#106)
* Enforce NVSHMEM minimum version of 2.6.0 to remove old workarounds. Update NVSHMEM usage guidance. Signed-off-by: Josh Romero <joshr@nvidia.com> * Formatting. Signed-off-by: Josh Romero <joshr@nvidia.com> * Replace use of nvshmemx_vendor_get_version_info. Signed-off-by: Josh Romero <joshr@nvidia.com> --------- Signed-off-by: Josh Romero <joshr@nvidia.com>
1 parent a8f5668 commit 7f96ba4

4 files changed

Lines changed: 28 additions & 32 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,8 @@ if (CUDECOMP_ENABLE_NVSHMEM)
232232
list(APPEND NVSHMEM_VERSION ${CMAKE_MATCH_1})
233233
list(JOIN NVSHMEM_VERSION "." NVSHMEM_VERSION)
234234

235-
if (NVSHMEM_VERSION VERSION_LESS "2.5")
236-
target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem.a)
235+
if (NVSHMEM_VERSION VERSION_LESS "2.6")
236+
message(FATAL_ERROR "NVSHMEM versions earlier than 2.6.0 are not supported by cuDecomp.")
237237
else()
238238
target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_host.so)
239239
target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_device.a)

docs/nvshmem.rst

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ In general, NVSHMEM operations require the memory they operate on to be allocated on
1212
:code:`nvshmem_malloc`. While cuDecomp attempts to hide this complexity behind :code:`cudecompMalloc`, it is important
1313
to understand that memory allocated for usage with NVSHMEM comes out of a memory pool separate from all other
1414
CUDA allocations. At a high-level, NVSHMEM will preallocate this symmetric heap on each GPU when it is initialized,
15-
with the heap size set by the `NVSHMEM_SYMMETRIC_SIZE <https://docs.nvidia.com/hpc-sdk/nvshmem/api/docs/gen/env.html#c.NVSHMEM_SYMMETRIC_SIZE>`_ environment variable.
15+
with the heap size set by the `NVSHMEM_SYMMETRIC_SIZE <https://docs.nvidia.com/nvshmem/api/gen/env.html#c.NVSHMEM_SYMMETRIC_SIZE>`_ environment variable.
1616
As such, it is important to set the symmetric heap size to a value that is large enough for any necessary allocations from cuDecomp,
1717
but not much larger as that will waste GPU memory space.
1818

@@ -24,11 +24,13 @@ To help with this, the code will produce warnings like the following
2424
2525
if the library detects NVSHMEM allocations that may exceed the symmetric heap size, and suggests an appropriate value for :code:`NVSHMEM_SYMMETRIC_SIZE`.
2626

27+
Note that manual symmetric heap size management is only required if CUDA VMM features are disabled in NVSHMEM via :code:`NVSHMEM_DISABLE_CUDA_VMM=1`.
28+
2729
MPI compatibility
2830
-----------------
29-
As noted in the NVSHMEM documentation `here <https://docs.nvidia.com/hpc-sdk/nvshmem/api/docs/faq.html#interoperability-with-mpi-faqs>`_,
31+
As noted in the NVSHMEM documentation `here <https://docs.nvidia.com/nvshmem/api/faq.html#interoperability-with-mpi-faqs>`_,
3032
memory allocated on the symmetric heap may lead to crashes when used in MPI calls with some MPI implementations, especially when
31-
CUDA VMM features in NVSHMEM are enabled. We strongly encourage users to set :code:`NVSHMEM_DISABLE_CUDA_VMM=1` when using cuDecomp
33+
CUDA VMM features in NVSHMEM are enabled. If you find this is the case for your system, we suggest setting :code:`NVSHMEM_DISABLE_CUDA_VMM=1` when using cuDecomp
3234
with NVSHMEM enabled. However, this is not always sufficient and MPI can still crash when passed NVSHMEM-allocated memory.
3335

3436
Due to this, cuDecomp attempts to avoid using NVSHMEM-allocated memory with MPI where possible, but such usage can still arise in a couple of situations:

include/internal/comm_routines.h

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ static inline void checkMpiInt32Limit(int64_t val, cudecompHaloCommBackend_t bac
8888
}
8989

9090
#ifdef ENABLE_NVSHMEM
91-
#define CUDECOMP_NVSHMEM_CHUNK_SZ (static_cast<size_t>(1024 * 1024 * 1024))
9291
#define CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ 8 // max number of intra-group transfers to schedule between team syncs
9392
template <typename T>
9493
static void
@@ -170,16 +169,9 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
170169
}
171170
}
172171

173-
// Use host call for direct P2P accessible entries
174-
// Need to chunk host API calls due to 2 GiB limitation in API
175-
size_t send_bytes = send_counts[dst_rank] * sizeof(T);
176-
size_t nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
177-
for (size_t j = 0; j < nchunks; ++j) {
178-
nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
179-
send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
180-
std::min(CUDECOMP_NVSHMEM_CHUNK_SZ, send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
181-
dst_rank_global, handle->streams[count % handle->device_p2p_ce_count]);
182-
}
172+
nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank], send_buff + send_offsets[dst_rank],
173+
send_counts[dst_rank] * sizeof(T), dst_rank_global,
174+
handle->streams[count % handle->device_p2p_ce_count]);
183175
count++;
184176
}
185177
}
@@ -440,16 +432,8 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
440432
}
441433

442434
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
443-
// Need to chunk host API calls due to 2 GiB limitation in API
444-
size_t send_bytes = send_counts[dst_rank] * sizeof(T);
445-
int nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
446-
for (int j = 0; j < nchunks; ++j) {
447-
nvshmemx_putmem_nbi_on_stream(
448-
recv_buff + recv_offsets_nvshmem[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
449-
send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
450-
std::min(static_cast<size_t>(CUDECOMP_NVSHMEM_CHUNK_SZ), send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
451-
dst_rank_global, pl_stream);
452-
}
435+
nvshmemx_putmem_nbi_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank], send_buff + send_offsets[dst_rank],
436+
send_counts[dst_rank] * sizeof(T), dst_rank_global, pl_stream);
453437

454438
barrier = true;
455439
}

src/cudecomp.cc

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -378,12 +378,6 @@ static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
378378
char* vmm_str = std::getenv("NVSHMEM_DISABLE_CUDA_VMM");
379379
if (vmm_str) { handle->nvshmem_vmm = std::strtol(vmm_str, nullptr, 10) == 0; }
380380

381-
if (handle->rank == 0 && handle->nvshmem_vmm) {
382-
printf("CUDECOMP:WARN: NVSHMEM_DISABLE_CUDA_VMM is unset. We currently recommend setting it "
383-
"(i.e. NVSHMEM_DISABLE_CUDA_VMM=1) for best compatibility with MPI libraries. See the documentation "
384-
"for more details.\n");
385-
}
386-
387381
// Check NVSHMEM_SYMMETRIC_SIZE
388382
char* symmetric_size_str = std::getenv("NVSHMEM_SYMMETRIC_SIZE");
389383
if (symmetric_size_str) {
@@ -405,6 +399,21 @@ static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
405399
handle->nvshmem_symmetric_size = 1ull << 30;
406400
}
407401
}
402+
403+
static void checkNvshmemVersion() {
404+
int major, minor, patch;
405+
char name[NVSHMEM_MAX_NAME_LEN];
406+
nvshmem_info_get_name(name);
407+
const char* vpos = strchr(name, 'v');
408+
if (!vpos || sscanf(vpos, "v%d.%d.%d", &major, &minor, &patch) != 3) {
409+
THROW_INTERNAL_ERROR("Could not parse NVSHMEM version.");
410+
}
411+
412+
// We have removed workarounds for bugs encountered with NVSHMEM versions earlier than 2.6.0.
413+
if ((major == 2 && minor < 6) || major < 2) {
414+
THROW_NOT_SUPPORTED("NVSHMEM versions earlier than 2.6.0 are not supported.");
415+
}
416+
}
408417
#endif
409418

410419
} // namespace
@@ -691,6 +700,7 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
691700
((autotune_transpose_backend || autotune_halo_backend) && !autotune_disable_nvshmem_backends)) {
692701
#ifdef ENABLE_NVSHMEM
693702
if (!handle->nvshmem_initialized) {
703+
checkNvshmemVersion();
694704
inspectNvshmemEnvVars(handle);
695705
initNvshmemFromMPIComm(handle->mpi_comm);
696706
handle->nvshmem_initialized = true;

0 commit comments

Comments
 (0)