Commit 700db7d

update code
1 parent 41613f2 commit 700db7d

7 files changed: 42 additions & 40 deletions

docker/Dockerfile_ascend_a3

Lines changed: 1 addition & 0 deletions
@@ -22,5 +22,6 @@ ARG LMDEPLOY_TAG=main
 RUN --mount=type=cache,target=/root/.cache \
     pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
     pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \
+    pip install --no-cache-dir torch==2.9.0 torch-npu==2.9.0 torchvision==0.24.0 && \
     TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \
     LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG}
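
The added line pins torch, torch-npu, and torchvision before the dlinfer and lmdeploy source installs, so those builds resolve against fixed wheel versions rather than whatever pip would otherwise pick. A minimal sanity check for the built image (a hypothetical snippet, not part of this commit; torch_npu only initializes fully on an Ascend device):

```python
# Hypothetical smoke test for the pinned stack; run inside the built image.
import torch
import torchvision

assert torch.__version__.startswith('2.9.0'), torch.__version__
assert torchvision.__version__.startswith('0.24.0'), torchvision.__version__

# torch_npu needs an Ascend runtime; guard the import accordingly.
try:
    import torch_npu  # noqa: F401
    print('torch_npu loaded:', torch.npu.is_available())
except Exception as exc:
    print('torch_npu unavailable outside an Ascend environment:', exc)
```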

docs/zh_cn/supported_models/supported_models.md

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@
 | QWen2.5-VL | 3B - 72B | MLLM | Yes | Yes | - | - | Yes | - | Yes | No |
 | QWen2-MoE | A14.57B | LLM | Yes | - | No | No | - | - | Yes | - |
 | QWen3 | 0.6B-235B | LLM | Yes | Yes | No | No | Yes | Yes | Yes | Yes |
-| DeepSeek-V2 | 16B | LLM | Yes | Yes | No | No | - | - | - | - |
+| DeepSeek-V2 | 16B | LLM | No | Yes | No | No | - | - | - | - |
 | InternVL(v1.5) | 2B-26B | MLLM | Yes | - | Yes | Yes | - | - | Yes | - |
 | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | Yes | Yes | - | Yes | Yes |
 | InternVL2.5 | 1B-78B | MLLM | Yes | Yes | Yes | Yes | Yes | - | Yes | Yes |

lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py

Lines changed: 29 additions & 29 deletions
@@ -16,7 +16,7 @@
 from lmdeploy.pytorch.distributed import get_dist_manager
 from lmdeploy.utils import get_logger

-from ..moe import DlinferMoeMetada, DlinferMoeType
+from ..moe import DlinferMoECommType, DlinferMoeMetadata
 from ..op_backend import DlinferOpsBackend

 logger = get_logger('lmdeploy')
@@ -281,19 +281,19 @@ def get_dist_meta():
         def get_tokens_info(dp_size, tp_size, ep_size, ep_group):
             if ep_size <= 1:
                 return 0, 0, 0
-            # get runtime_tokens_current_rank
+            # get padded_tokens_current_rank
             is_graph = cls.enable_graph and step_context.is_decoding
             if is_graph:
                 from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size
                 actual_tokens_current_rank = step_context.q_seqlens.shape[0]
-                runtime_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank),
-                                                  cls.max_batches)
+                padded_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank),
+                                                 cls.max_batches)
             else:
                 actual_tokens_current_rank = step_context.q_seqlens.sum().item()
-                runtime_tokens_current_rank = actual_tokens_current_rank
+                padded_tokens_current_rank = actual_tokens_current_rank
             # get max_tokens_across_dp
             if dp_size > 1:
-                runtime_tokens_tensor = torch.tensor([runtime_tokens_current_rank],
+                runtime_tokens_tensor = torch.tensor([padded_tokens_current_rank],
                                                      dtype=step_context.q_seqlens.dtype,
                                                      device=torch.npu.current_device())
                 world_size = dp_size * tp_size
@@ -303,49 +303,49 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group):
                 dist.all_gather_into_tensor(runtime_tokens_buffer, runtime_tokens_tensor, ep_group)
                 max_tokens_across_dp = torch.max(runtime_tokens_buffer).item()
             else:
-                max_tokens_across_dp = runtime_tokens_current_rank
-            return actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp
+                max_tokens_across_dp = padded_tokens_current_rank
+            return actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp

         @lru_cache
         def init_mc2_token_capacity(tp_size):
             max_num_tokens = min(cls.max_batches, 512)
             num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
             return num_tokens_per_tp_rank * tp_size

-        def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size):
+        def select_moe_comm_type(max_tokens_across_dp, dp_size, tp_size, ep_size):
             if ep_size <= 1:
-                return DlinferMoeType.ALLGATHER
+                return DlinferMoECommType.ALLGATHER
             mc2_token_capacity = init_mc2_token_capacity(tp_size)
             is_graph = cls.enable_graph and step_context.is_decoding
             if is_graph:
                 import math
                 max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size
             if SocVersion.is_A2():
                 if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16:
-                    return DlinferMoeType.MC2
+                    return DlinferMoECommType.MC2
                 else:
-                    return DlinferMoeType.ALLGATHER
+                    return DlinferMoECommType.ALLGATHER
             elif SocVersion.is_A3():
                 if max_tokens_across_dp <= mc2_token_capacity:
-                    return DlinferMoeType.MC2
+                    return DlinferMoECommType.MC2
                 else:
-                    return DlinferMoeType.ALLTOALL
+                    return DlinferMoECommType.ALLTOALL
             else:
                 raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}')

-        def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, tp_size,
-                         moe_type):
+        def get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp, tp_size,
+                         moe_comm_type):
             x_active_mask = None
-            if moe_type == DlinferMoeType.MC2:
+            if moe_comm_type == DlinferMoECommType.MC2:
                 paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size
-                pad_size = paded_size - runtime_tokens_current_rank
+                pad_size = paded_size - padded_tokens_current_rank
                 x_active_mask = torch.ones(actual_tokens_current_rank,
                                            dtype=torch.bool,
                                            device=torch.npu.current_device())
-            elif moe_type == DlinferMoeType.ALLTOALL:
-                pad_size = tp_size - runtime_tokens_current_rank
-            elif moe_type == DlinferMoeType.ALLGATHER:
-                pad_size = max_tokens_across_dp - runtime_tokens_current_rank
+            elif moe_comm_type == DlinferMoECommType.ALLTOALL:
+                pad_size = tp_size - padded_tokens_current_rank
+            elif moe_comm_type == DlinferMoECommType.ALLGATHER:
+                pad_size = max_tokens_across_dp - padded_tokens_current_rank
             else:
                 pad_size = 0
             return pad_size, x_active_mask
@@ -404,15 +404,15 @@ def get_moe_group_name(group):
         step_context.attn_metadata = attn_metadata

         cls.dist_meta = get_dist_meta()
-        actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp = get_tokens_info(
+        actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp = get_tokens_info(
             cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size, cls.dist_meta.ep_group)
-        moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size,
-                                   cls.dist_meta.ep_size)
-        pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank,
-                                               max_tokens_across_dp, cls.dist_meta.tp_size, moe_type)
+        moe_comm_type = select_moe_comm_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size,
+                                             cls.dist_meta.ep_size)
+        pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank,
+                                               max_tokens_across_dp, cls.dist_meta.tp_size, moe_comm_type)
         moe_group_name = get_moe_group_name(cls.dist_meta.ep_group)

-        moe_metadata = DlinferMoeMetada(
+        moe_metadata = DlinferMoeMetadata(
             max_tokens_across_dp=max_tokens_across_dp,
             pad_size=pad_size,
             dp_size=cls.dist_meta.dp_size,
@@ -422,7 +422,7 @@ def get_moe_group_name(group):
             ep_rank=cls.dist_meta.ep_rank,
             tp_group=cls.dist_meta.tp_group,
             ep_group=cls.dist_meta.ep_group,
-            moe_type=moe_type,
+            moe_comm_type=moe_comm_type,
             x_active_mask=x_active_mask,
             moe_group_name=moe_group_name,
         )
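
Beyond the renames (runtime_tokens_current_rank → padded_tokens_current_rank, moe_type → moe_comm_type, select_moe_type → select_moe_comm_type), the selection logic is unchanged: MC2 when the padded token count fits the MC2 capacity (with an extra dp_size * tp_size >= 16 gate on A2 SoCs), otherwise ALLGATHER on A2 and ALLTOALL on A3. A self-contained sketch of that decision table (a hypothetical standalone form of the nested select_moe_comm_type above; the enum only mirrors dlinfer's MoECommType for illustration):

```python
import math
from enum import Enum


class MoECommType(Enum):
    """Illustration-only mirror of dlinfer's MoECommType."""
    ALLGATHER = 'allgather'
    ALLTOALL = 'alltoall'
    MC2 = 'mc2'


def select_moe_comm_type(max_tokens_across_dp: int, dp_size: int, tp_size: int, ep_size: int,
                         mc2_token_capacity: int, is_a2: bool, is_graph: bool) -> MoECommType:
    # No expert parallelism: the plain allgather path is always used.
    if ep_size <= 1:
        return MoECommType.ALLGATHER
    # Graph mode first pads the token count up to a multiple of tp_size.
    if is_graph:
        max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size
    if is_a2:
        # A2 additionally requires at least 16 ranks for MC2.
        if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16:
            return MoECommType.MC2
        return MoECommType.ALLGATHER
    # A3: fall back to all-to-all when tokens exceed the MC2 capacity.
    if max_tokens_across_dp <= mc2_token_capacity:
        return MoECommType.MC2
    return MoECommType.ALLTOALL


# Example: graph-mode decode on A3 with tokens above the MC2 capacity.
print(select_moe_comm_type(max_tokens_across_dp=600, dp_size=2, tp_size=8, ep_size=16,
                           mc2_token_capacity=512, is_a2=False, is_graph=True))
# MoECommType.ALLTOALL
```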

lmdeploy/pytorch/backends/dlinfer/moe.py

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@

 import torch

-from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetada  # noqa: F401
-from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeType  # noqa: F401
+from lmdeploy.pytorch.kernels.dlinfer import DlinferMoECommType  # noqa: F401
+from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetadata  # noqa: F401
 from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax
 from lmdeploy.pytorch.model_inputs import get_step_ctx_manager

lmdeploy/pytorch/kernels/dlinfer/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,7 @@
 from .awq_kernels import awq_linear
 from .fill_kv_cache import fill_kv_cache
 from .flash_attention import flash_attention_fwd
-from .fused_moe import DlinferMoeMetada, DlinferMoeType, fused_moe
+from .fused_moe import DlinferMoECommType, DlinferMoeMetadata, fused_moe
 from .linear import linear
 from .moe_gating_topk_softmax import moe_gating_topk_softmax
 from .pagedattention import paged_attention_fwd
@@ -15,8 +15,8 @@
     'apply_rotary_pos_emb',
     'awq_linear',
     'fill_kv_cache',
-    'DlinferMoeType',
-    'DlinferMoeMetada',
+    'DlinferMoECommType',
+    'DlinferMoeMetadata',
     'fused_moe',
     'paged_attention_fwd',
     'flash_attention_fwd',
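
Anything importing the old names from lmdeploy.pytorch.kernels.dlinfer must move to the new ones; a minimal before/after of the import surface (assuming lmdeploy and dlinfer are installed at matching post-rename versions):

```python
# Before this commit (no longer valid):
#   from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetada, DlinferMoeType
# After:
from lmdeploy.pytorch.kernels.dlinfer import (DlinferMoECommType, DlinferMoeMetadata, fused_moe,
                                              moe_gating_topk_softmax)

print(DlinferMoECommType.ALLGATHER)  # enum member used by the Ascend backend above
```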

lmdeploy/pytorch/kernels/dlinfer/fused_moe.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import dlinfer.ops as ext_ops
-from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada
-from dlinfer.utils.type_annotation import MoeType as DlinferMoeType  # noqa: F401
+from dlinfer.utils.type_annotation import MoECommType as DlinferMoECommType  # noqa: F401
+from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata
 from torch import Tensor


@@ -13,7 +13,7 @@ def fused_moe(
     topk_ids: Tensor,
     topk: int,
     renormalize: bool,
-    moe_metadata: DlinferMoeMetada,
+    moe_metadata: DlinferMoeMetadata,
 ):
     """Dlinfer fused moe."""
     return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize,
lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py

Lines changed: 3 additions & 2 deletions

@@ -1,9 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import dlinfer.ops as ext_ops
-from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada
+from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata
 from torch import Tensor


-def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: DlinferMoeMetada) -> tuple[Tensor, Tensor]:
+def moe_gating_topk_softmax(router_logits: Tensor, topk: int,
+                            moe_metadata: DlinferMoeMetadata) -> tuple[Tensor, Tensor]:
     routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata)
     return routing_weights, selected_experts
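
The wrapper delegates entirely to ext_ops.moe_gating_topk_softmax, which is device-optimized and metadata-aware. For intuition only, a plain-torch sketch of standard MoE gating semantics — softmax over experts followed by top-k selection (an assumption about the kernel's high-level behavior, not a drop-in replacement):

```python
import torch


def gating_topk_softmax_reference(router_logits: torch.Tensor, topk: int):
    """CPU reference for standard MoE gating; the real kernel may differ in details."""
    routing_weights = torch.softmax(router_logits, dim=-1)
    # Keep the top-k experts per token along the expert dimension.
    routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1)
    return routing_weights, selected_experts


logits = torch.randn(4, 8)  # [num_tokens, num_experts]
weights, experts = gating_topk_softmax_reference(logits, topk=2)
print(weights.shape, experts.shape)  # torch.Size([4, 2]) torch.Size([4, 2])
```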
