From 718bf681f96540581cac5490522081dd1218eeba Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Jun 2026 14:36:11 +0800
Subject: [PATCH 1/5] update deepseek_v4 vllm docs

---
 swift/megatron/init.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/swift/megatron/init.py b/swift/megatron/init.py
index 6396d38903..27adcd7ffc 100644
--- a/swift/megatron/init.py
+++ b/swift/megatron/init.py
@@ -177,12 +177,14 @@ def save_weights(
                     else:
                         llm_config.num_nextn_predict_layers = config.mtp_num_layers
                 HfConfigFactory.del_config_attr(hf_config, 'quantization_config')
+                HfConfigFactory.del_config_attr(hf_config, 'expert_dtype')
                 if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param:
                     from transformers.utils.quantization_config import FineGrainedFP8Config
                     modules_to_not_convert = get_modules_to_not_convert(self.hf_model)
                     if hasattr(self, '_fp8_skip_modules'):
                         modules_to_not_convert = (modules_to_not_convert or []) + list(self._fp8_skip_modules)
                     hf_config.quantization_config = FineGrainedFP8Config(modules_to_not_convert=modules_to_not_convert)
+                    llm_config.expert_dtype = 'fp8'
                 hf_config.save_pretrained(output_dir)
                 if getattr(self.hf_model, '_auto_class') is not None:
                     try:

From eba53061e7c16b8fd5b5c5e4f1aed6dfcb55043c Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Jun 2026 15:40:30 +0800
Subject: [PATCH 2/5] update

---
 docs/source/Megatron-SWIFT/Command-line-parameters.md    | 1 +
 docs/source_en/Megatron-SWIFT/Command-line-parameters.md | 1 +
 swift/megatron/arguments/megatron_args.py                | 1 +
 3 files changed, 3 insertions(+)

diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md
index d3e979e747..88cbbe0d1b 100644
--- a/docs/source/Megatron-SWIFT/Command-line-parameters.md
+++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md
@@ -214,6 +214,7 @@
 **DSA参数**
 - dsa_indexer_loss_coeff: DSA 索引器 KL 散度损失的系数。设置为 0 可禁用索引器损失。默认为`0.`。
 - dsa_indexer_use_sparse_loss: 是否使用稀疏 DSA 索引器损失。如果为 True，索引器损失将使用 top-k 索引进行计算。默认为False。
+- apply_dsa_kernel_fusion: 是否启用融合 DSA 稀疏注意力内核（FlashMLA + cuDNN DSA）。为 False 时使用未融合的 PyTorch 实现。启用时需要安装 flash_mla 和 nvidia-cudnn-frontend >= 1.24.0。默认为False。
 
 **Deepseek-V4**
 - csa_dense_mode: 是否对压缩稀疏注意力使用密集模式。若为 `True`，CSA 索引器将被禁用。默认为False。
diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
index 3eee7608c3..f8412af93a 100644
--- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
+++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
@@ -225,6 +225,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train
 
 - dsa_indexer_loss_coeff: Coefficient for the DSA indexer KL divergence loss. Set to 0 to disable indexer loss. Default is `0.`.
 - dsa_indexer_use_sparse_loss: Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the top-k indices. Default is False.
+- apply_dsa_kernel_fusion: Whether to enable fused DSA sparse-attention kernels (FlashMLA + cuDNN DSA). When False, the unfused PyTorch implementations are used. Enabling it requires flash_mla and nvidia-cudnn-frontend >= 1.24.0. Default is False.
 
 **Deepseek-V4**
 
diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
index ee933929f9..bf5bd47640 100644
--- a/swift/megatron/arguments/megatron_args.py
+++ b/swift/megatron/arguments/megatron_args.py
@@ -630,6 +630,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
     # dsa
     dsa_indexer_loss_coeff: float = 0.
     dsa_indexer_use_sparse_loss: bool = False
+    apply_dsa_kernel_fusion: bool = False
     # deepseek-v4
     csa_dense_mode: bool = False
     use_fused_mhc: bool = False

From 38946baf8a107d63d6075f0859c20e21db873513 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Jun 2026 17:24:31 +0800
Subject: [PATCH 3/5] update

---
 swift/megatron/init.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/swift/megatron/init.py b/swift/megatron/init.py
index 27adcd7ffc..c16e2b12f2 100644
--- a/swift/megatron/init.py
+++ b/swift/megatron/init.py
@@ -177,14 +177,16 @@ def save_weights(
                     else:
                         llm_config.num_nextn_predict_layers = config.mtp_num_layers
                 HfConfigFactory.del_config_attr(hf_config, 'quantization_config')
-                HfConfigFactory.del_config_attr(hf_config, 'expert_dtype')
+                expert_dtype = None
                 if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param:
                     from transformers.utils.quantization_config import FineGrainedFP8Config
                     modules_to_not_convert = get_modules_to_not_convert(self.hf_model)
                     if hasattr(self, '_fp8_skip_modules'):
                         modules_to_not_convert = (modules_to_not_convert or []) + list(self._fp8_skip_modules)
                     hf_config.quantization_config = FineGrainedFP8Config(modules_to_not_convert=modules_to_not_convert)
-                    llm_config.expert_dtype = 'fp8'
+                    expert_dtype = 'fp8'
+                if args.model_type == 'deepseek_v4':
+                    HfConfigFactory.set_config_attr(hf_config, 'expert_dtype', expert_dtype)
                 hf_config.save_pretrained(output_dir)
                 if getattr(self.hf_model, '_auto_class') is not None:
                     try:

From 0eaa5ddc1a9a1d26859bb25cbf7428047b1d0488 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Wed, 24 Jun 2026 10:42:33 +0800
Subject: [PATCH 4/5] update

---
 docs/source/BestPractices/deepseek-v4.md | 38 ++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/docs/source/BestPractices/deepseek-v4.md b/docs/source/BestPractices/deepseek-v4.md
index 7351667c3f..833e7039e8 100644
--- a/docs/source/BestPractices/deepseek-v4.md
+++ b/docs/source/BestPractices/deepseek-v4.md
@@ -214,3 +214,41 @@ swift infer \
 推理结果：
 
 ![result](../../resources/deepseek_v4/infer_result.png)
+
+跑通vLLM推理：
+
+- 如果要使用vLLM推理，你可以参考[这里的文档](https://recipes.vllm.ai/deepseek-ai/DeepSeek-V4-Flash)。你需要FP4/FP8精度的权重。
+- 此外，你需要复制原始的`config.json`文件，并将其中的`expert_dtype`修改为与训练后保存的config.json一致。这是因为使用transformers的`config.save_pretrained`保存的文件与原始文件不同，vLLM不兼容保存后的文件。
+- 如果遇到tilelang问题，可以查看[这个issue](https://github.com/modelscope/ms-swift/issues/9494)。
+- mcore-bridge DeepSeek-V4 FP8修复：[PR](https://github.com/modelscope/mcore-bridge/pull/133)。
+
+先进行量化（该量化会导致LoRA增量信息丢失，此处仅作为示例，建议使用FP8全参数训练并导出FP8权重）：
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+NPROC_PER_NODE=8 \
+megatron export \
+    --model megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged \
+    --output_dir megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \
+    --to_hf true \
+    --fp8_recipe blockwise \
+    --fp8_format e4m3 \
+    --fp8_param_gather true \
+    --mtp_num_layers 1 \
+    --expert_model_parallel_size 8
+```
+
+vLLM启动命令：
+```shell
+vllm serve megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \
+    --trust-remote-code \
+    --kv-cache-dtype fp8 \
+    --block-size 256 \
+    --enable-expert-parallel \
+    --tensor-parallel-size 8 \
+    --max-model-len 8192 \
+    --tokenizer-mode deepseek_v4 \
+    --tool-call-parser deepseek_v4 \
+    --enable-auto-tool-choice \
+    --reasoning-parser deepseek_v4
+```

From cef2008fe69e9e996fc8ba6c652234f937a5c1af Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Wed, 24 Jun 2026 10:45:22 +0800
Subject: [PATCH 5/5] update

---
 docs/source_en/BestPractices/deepseek-v4.md | 38 +++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/docs/source_en/BestPractices/deepseek-v4.md b/docs/source_en/BestPractices/deepseek-v4.md
index 5e40484bfe..8d1cf9a26a 100644
--- a/docs/source_en/BestPractices/deepseek-v4.md
+++ b/docs/source_en/BestPractices/deepseek-v4.md
@@ -214,3 +214,41 @@ swift infer \
 Inference result:
 
 ![result](../../resources/deepseek_v4/infer_result.png)
+
+Running vLLM inference:
+
+- If you want to use vLLM for inference, you can refer to [this documentation](https://recipes.vllm.ai/deepseek-ai/DeepSeek-V4-Flash). You need FP4/FP8 precision weights.
+- Additionally, you need to copy the original `config.json` file and set its `expert_dtype` to match the value in the config.json saved after training. This is necessary because the file written by transformers' `config.save_pretrained` differs from the original file, and vLLM is not compatible with the saved file.
+- If you encounter tilelang issues, you can check [this issue](https://github.com/modelscope/ms-swift/issues/9494).
+- mcore-bridge DeepSeek-V4 FP8 fix: [PR](https://github.com/modelscope/mcore-bridge/pull/133).
+
+First, perform quantization (note: this quantization discards the incremental LoRA information and is shown only as an example; it is recommended to use FP8 full-parameter training and export FP8 weights):
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+NPROC_PER_NODE=8 \
+megatron export \
+    --model megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged \
+    --output_dir megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \
+    --to_hf true \
+    --fp8_recipe blockwise \
+    --fp8_format e4m3 \
+    --fp8_param_gather true \
+    --mtp_num_layers 1 \
+    --expert_model_parallel_size 8
+```
+
+vLLM launch command:
+```shell
+vllm serve megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \
+    --trust-remote-code \
+    --kv-cache-dtype fp8 \
+    --block-size 256 \
+    --enable-expert-parallel \
+    --tensor-parallel-size 8 \
+    --max-model-len 8192 \
+    --tokenizer-mode deepseek_v4 \
+    --tool-call-parser deepseek_v4 \
+    --enable-auto-tool-choice \
+    --reasoning-parser deepseek_v4
+```