From cde0d5e4184a375bb6bd3c8f95ad10ebf0afd47e Mon Sep 17 00:00:00 2001
From: xuzhibo <xuzhibo1@xiaohongshu.com>
Date: Wed, 3 Jun 2026 20:31:27 +0800
Subject: [PATCH 1/2] fix(megatron): pre-init NCCL communicator for MoE expert
 DP group to avoid deadlock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  For large MoE models (e.g. Qwen3.5-122B-A10B) trained with Expert Parallelism,
  INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP is lazily initialized by PyTorch/NCCL:
  dist.new_group() only registers metadata; the actual ncclCommInitRankConfig
  bootstrap fires on first collective use, which happens at the first optimizer
  step — by which point GPU memory is near its limit (~125-130 GiB on H200).

  If any rank cannot allocate the NCCL bootstrap buffer at that moment, its
  thread stalls silently (NCCL_TIMEOUT does NOT cover the bootstrap phase),
  causing all other ranks to wait forever (deadlock with no error output).

  Fix: call a no-op barrier() on this group immediately after
  initialize_model_parallel() returns, while GPU memory is still empty, forcing
  NCCL bootstrap at a safe time. The call is guarded:
  get_inter_distributed_optimizer_instance_group(check_initialized=False)
  returns None for dense models or EP=1, so this change is a no-op for all
  non-MoE configurations.
---
 swift/megatron/utils/megatron_lm_utils.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/swift/megatron/utils/megatron_lm_utils.py b/swift/megatron/utils/megatron_lm_utils.py
index 1710d333aa..f14b96a6c4 100644
--- a/swift/megatron/utils/megatron_lm_utils.py
+++ b/swift/megatron/utils/megatron_lm_utils.py
@@ -82,6 +82,25 @@ def _initialize_mpu(args):
                         f'VPP: {args.virtual_pipeline_model_parallel_size}, CP: {args.context_parallel_size}, '
                         f'EP: {args.expert_model_parallel_size}, ETP: {args.expert_tensor_parallel_size}')
 
+        # Pre-initialize the MoE expert-DP group's NCCL communicator before model weights are loaded onto GPU.
+        #
+        # Background: PyTorch lazily initializes NCCL communicators — dist.new_group() only registers
+        # metadata; the actual ncclCommInitRankConfig bootstrap runs on first collective use.
+        # For MoE models this group's first use is the very first optimizer step, by which point GPU
+        # memory is near its limit (~125-130 GiB on H200). If any rank cannot allocate the NCCL
+        # temporary buffer at that moment, its bootstrap thread stalls silently — NCCL_TIMEOUT does
+        # NOT cover the bootstrap phase — and all other ranks wait forever (deadlock).
+        #
+        # Calling a barrier here forces bootstrap while GPU memory is still empty, eliminating the race.
+        # get_inter_distributed_optimizer_instance_group returns _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP,
+        # which is None for dense models or when EP=1, so the guard is safe in all configurations.
+        inter_ep_dp_group = mpu.get_inter_distributed_optimizer_instance_group(check_initialized=False)
+        if inter_ep_dp_group is not None:
+            logger.info('Pre-initializing INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP NCCL communicator '
+                        'to avoid lazy-init deadlock during first optimizer step.')
+            torch.distributed.barrier(group=inter_ep_dp_group, device_ids=[torch.cuda.current_device()])
+            torch.cuda.synchronize()
+
 
 def initialize_megatron(args):
     # Pytorch distributed.

From 89b7948e7ef64b14510a6faef045aa9020bde01c Mon Sep 17 00:00:00 2001
From: zb2313 <57031778+zb2313@users.noreply.github.com>
Date: Thu, 4 Jun 2026 00:04:39 +0800
Subject: [PATCH 2/2] Update swift/megatron/utils/megatron_lm_utils.py

Thanks for the feedback. I've added a compatibility check using `hasattr()` before calling `get_inter_distributed_optimizer_instance_group()` and pushed an update.

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 swift/megatron/utils/megatron_lm_utils.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/swift/megatron/utils/megatron_lm_utils.py b/swift/megatron/utils/megatron_lm_utils.py
index f14b96a6c4..d7a7b16247 100644
--- a/swift/megatron/utils/megatron_lm_utils.py
+++ b/swift/megatron/utils/megatron_lm_utils.py
@@ -94,12 +94,13 @@ def _initialize_mpu(args):
         # Calling a barrier here forces bootstrap while GPU memory is still empty, eliminating the race.
         # get_inter_distributed_optimizer_instance_group returns _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP,
         # which is None for dense models or when EP=1, so the guard is safe in all configurations.
-        inter_ep_dp_group = mpu.get_inter_distributed_optimizer_instance_group(check_initialized=False)
-        if inter_ep_dp_group is not None:
-            logger.info('Pre-initializing INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP NCCL communicator '
-                        'to avoid lazy-init deadlock during first optimizer step.')
-            torch.distributed.barrier(group=inter_ep_dp_group, device_ids=[torch.cuda.current_device()])
-            torch.cuda.synchronize()
+        if hasattr(mpu, 'get_inter_distributed_optimizer_instance_group'):
+            inter_ep_dp_group = mpu.get_inter_distributed_optimizer_instance_group(check_initialized=False)
+            if inter_ep_dp_group is not None:
+                logger.info('Pre-initializing INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP NCCL communicator '
+                            'to avoid lazy-init deadlock during first optimizer step.')
+                torch.distributed.barrier(group=inter_ep_dp_group, device_ids=[torch.cuda.current_device()])
+                torch.cuda.synchronize()
 
 
 def initialize_megatron(args):