From cde0d5e4184a375bb6bd3c8f95ad10ebf0afd47e Mon Sep 17 00:00:00 2001 From: xuzhibo Date: Wed, 3 Jun 2026 20:31:27 +0800 Subject: [PATCH 1/2] fix(megatron): pre-init NCCL communicator for MoE expert DP group to avoid deadlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For large MoE models (e.g. Qwen3.5-122B-A10B) trained with Expert Parallelism, INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP is lazily initialized by PyTorch/NCCL: dist.new_group() only registers metadata; the actual ncclCommInitRankConfig bootstrap fires on first collective use, which happens at the first optimizer step — by which point GPU memory is near its limit (~125-130 GiB on H200). If any rank cannot allocate the NCCL bootstrap buffer at that moment, its thread stalls silently (NCCL_TIMEOUT does NOT cover the bootstrap phase), causing all other ranks to wait forever (deadlock with no error output). Fix: call a no-op barrier() on this group immediately after initialize_model_parallel() returns, while GPU memory is still empty, forcing NCCL bootstrap at a safe time. The guard on get_inter_distributed_optimizer_instance_group(check_initialized=False) returns None for dense models or EP=1, so this change is a no-op for all non-MoE configurations. --- swift/megatron/utils/megatron_lm_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/swift/megatron/utils/megatron_lm_utils.py b/swift/megatron/utils/megatron_lm_utils.py index 1710d333aa..f14b96a6c4 100644 --- a/swift/megatron/utils/megatron_lm_utils.py +++ b/swift/megatron/utils/megatron_lm_utils.py @@ -82,6 +82,25 @@ def _initialize_mpu(args): f'VPP: {args.virtual_pipeline_model_parallel_size}, CP: {args.context_parallel_size}, ' f'EP: {args.expert_model_parallel_size}, ETP: {args.expert_tensor_parallel_size}') + # before model weights are loaded onto GPU. 
+ # + # Background: PyTorch lazily initializes NCCL communicators — dist.new_group() only registers + # metadata; the actual ncclCommInitRankConfig bootstrap runs on first collective use. + # For MoE models this group's first use is the very first optimizer step, by which point GPU + # memory is near its limit (~125-130 GiB on H200). If any rank cannot allocate the NCCL + # temporary buffer at that moment, its bootstrap thread stalls silently — NCCL_TIMEOUT does + # NOT cover the bootstrap phase — and all other ranks wait forever (deadlock). + # + # Calling a barrier here forces bootstrap while GPU memory is still empty, eliminating the race. + # get_inter_distributed_optimizer_instance_group returns _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP, + # which is None for dense models or when EP=1, so the guard is safe in all configurations. + inter_ep_dp_group = mpu.get_inter_distributed_optimizer_instance_group(check_initialized=False) + if inter_ep_dp_group is not None: + logger.info('Pre-initializing INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP NCCL communicator ' + 'to avoid lazy-init deadlock during first optimizer step.') + torch.distributed.barrier(group=inter_ep_dp_group, device_ids=[torch.cuda.current_device()]) + torch.cuda.synchronize() + def initialize_megatron(args): # Pytorch distributed. From 89b7948e7ef64b14510a6faef045aa9020bde01c Mon Sep 17 00:00:00 2001 From: zb2313 <57031778+zb2313@users.noreply.github.com> Date: Thu, 4 Jun 2026 00:04:39 +0800 Subject: [PATCH 2/2] Update swift/megatron/utils/megatron_lm_utils.py Thanks for the feedback. I've added a compatibility check using `hasattr()` before calling `get_inter_distributed_optimizer_instance_group()` and pushed an update. 
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/megatron/utils/megatron_lm_utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/swift/megatron/utils/megatron_lm_utils.py b/swift/megatron/utils/megatron_lm_utils.py index f14b96a6c4..d7a7b16247 100644 --- a/swift/megatron/utils/megatron_lm_utils.py +++ b/swift/megatron/utils/megatron_lm_utils.py @@ -94,12 +94,13 @@ def _initialize_mpu(args): # Calling a barrier here forces bootstrap while GPU memory is still empty, eliminating the race. # get_inter_distributed_optimizer_instance_group returns _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP, # which is None for dense models or when EP=1, so the guard is safe in all configurations. - inter_ep_dp_group = mpu.get_inter_distributed_optimizer_instance_group(check_initialized=False) - if inter_ep_dp_group is not None: - logger.info('Pre-initializing INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP NCCL communicator ' - 'to avoid lazy-init deadlock during first optimizer step.') - torch.distributed.barrier(group=inter_ep_dp_group, device_ids=[torch.cuda.current_device()]) - torch.cuda.synchronize() + if hasattr(mpu, 'get_inter_distributed_optimizer_instance_group'): + inter_ep_dp_group = mpu.get_inter_distributed_optimizer_instance_group(check_initialized=False) + if inter_ep_dp_group is not None: + logger.info('Pre-initializing INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP NCCL communicator ' + 'to avoid lazy-init deadlock during first optimizer step.') + torch.distributed.barrier(group=inter_ep_dp_group, device_ids=[torch.cuda.current_device()]) + torch.cuda.synchronize() def initialize_megatron(args):