From 718bf681f96540581cac5490522081dd1218eeba Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 18 Jun 2026 14:36:11 +0800 Subject: [PATCH 1/5] update deepseek_v 4 vllm docs --- swift/megatron/init.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/megatron/init.py b/swift/megatron/init.py index 6396d38903..27adcd7ffc 100644 --- a/swift/megatron/init.py +++ b/swift/megatron/init.py @@ -177,12 +177,14 @@ def save_weights( else: llm_config.num_nextn_predict_layers = config.mtp_num_layers HfConfigFactory.del_config_attr(hf_config, 'quantization_config') + HfConfigFactory.del_config_attr(hf_config, 'expert_dtype') if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param: from transformers.utils.quantization_config import FineGrainedFP8Config modules_to_not_convert = get_modules_to_not_convert(self.hf_model) if hasattr(self, '_fp8_skip_modules'): modules_to_not_convert = (modules_to_not_convert or []) + list(self._fp8_skip_modules) hf_config.quantization_config = FineGrainedFP8Config(modules_to_not_convert=modules_to_not_convert) + llm_config.expert_dtype = 'fp8' hf_config.save_pretrained(output_dir) if getattr(self.hf_model, '_auto_class') is not None: try: From eba53061e7c16b8fd5b5c5e4f1aed6dfcb55043c Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 18 Jun 2026 15:40:30 +0800 Subject: [PATCH 2/5] update --- docs/source/Megatron-SWIFT/Command-line-parameters.md | 1 + docs/source_en/Megatron-SWIFT/Command-line-parameters.md | 1 + swift/megatron/arguments/megatron_args.py | 1 + 3 files changed, 3 insertions(+) diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md index d3e979e747..88cbbe0d1b 100644 --- a/docs/source/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md @@ -214,6 +214,7 @@ **DSA参数** - dsa_indexer_loss_coeff: DSA 索引器 KL 散度损失的系数。设置为 0 可禁用索引器损失。默认为`0.`。 - dsa_indexer_use_sparse_loss: 是否使用稀疏 DSA 
索引器损失。如果为 True,索引器损失将使用 top-k 索引进行计算。默认为False。 +- apply_dsa_kernel_fusion: 是否启用融合 DSA 稀疏注意力内核(FlashMLA + cuDNN DSA)。设为 False 将回退到未融合的 PyTorch 实现。需要安装 flash_mla 和 nvidia-cudnn-frontend >= 1.24.0。默认为False。 **Deepseek-V4** - csa_dense_mode: 是否对压缩稀疏注意力使用密集模式。若为 `True`,CSA 索引器将被禁用。默认为False。 diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md index 3eee7608c3..f8412af93a 100644 --- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md @@ -225,6 +225,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train - dsa_indexer_loss_coeff: Coefficient for the DSA indexer KL divergence loss. Set to 0 to disable indexer loss. Default is `0.`. - dsa_indexer_use_sparse_loss: Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the top-k indices. Default is False. +- apply_dsa_kernel_fusion: Whether to enable fused DSA sparse-attention kernels (FlashMLA + cuDNN DSA). Set to False to fall back to unfused PyTorch implementations. Requires flash_mla and nvidia-cudnn-frontend >= 1.24.0. Default is False. **Deepseek-V4** diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index ee933929f9..bf5bd47640 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -630,6 +630,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin): # dsa dsa_indexer_loss_coeff: float = 0. 
dsa_indexer_use_sparse_loss: bool = False + apply_dsa_kernel_fusion: bool = False # deepseek-v4 csa_dense_mode: bool = False use_fused_mhc: bool = False From 38946baf8a107d63d6075f0859c20e21db873513 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 18 Jun 2026 17:24:31 +0800 Subject: [PATCH 3/5] update --- swift/megatron/init.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/swift/megatron/init.py b/swift/megatron/init.py index 27adcd7ffc..c16e2b12f2 100644 --- a/swift/megatron/init.py +++ b/swift/megatron/init.py @@ -177,14 +177,16 @@ def save_weights( else: llm_config.num_nextn_predict_layers = config.mtp_num_layers HfConfigFactory.del_config_attr(hf_config, 'quantization_config') - HfConfigFactory.del_config_attr(hf_config, 'expert_dtype') + expert_dtype = None if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param: from transformers.utils.quantization_config import FineGrainedFP8Config modules_to_not_convert = get_modules_to_not_convert(self.hf_model) if hasattr(self, '_fp8_skip_modules'): modules_to_not_convert = (modules_to_not_convert or []) + list(self._fp8_skip_modules) hf_config.quantization_config = FineGrainedFP8Config(modules_to_not_convert=modules_to_not_convert) - llm_config.expert_dtype = 'fp8' + expert_dtype = 'fp8' + if args.model_type == 'deepseek_v4': + HfConfigFactory.set_config_attr(hf_config, 'expert_dtype', expert_dtype) hf_config.save_pretrained(output_dir) if getattr(self.hf_model, '_auto_class') is not None: try: From 0eaa5ddc1a9a1d26859bb25cbf7428047b1d0488 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 24 Jun 2026 10:42:33 +0800 Subject: [PATCH 4/5] update --- docs/source/BestPractices/deepseek-v4.md | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/docs/source/BestPractices/deepseek-v4.md b/docs/source/BestPractices/deepseek-v4.md index 7351667c3f..833e7039e8 100644 --- a/docs/source/BestPractices/deepseek-v4.md +++ 
b/docs/source/BestPractices/deepseek-v4.md @@ -214,3 +214,41 @@ swift infer \ 推理结果: ![result](../../resources/deepseek_v4/infer_result.png) + +跑通vLLM推理: + +- 如果要使用vLLM推理,你可以参考[这里的文档](https://recipes.vllm.ai/deepseek-ai/DeepSeek-V4-Flash)。你需要FP4/FP8精度的权重。 +- 此外你需要copy原始的`config.json`文件,并修改`expert_dtype`(与训练后的config.json一致)。这是因为使用transformers的`config.save_pretrained`保存的文件与原始文件不同,vLLM不兼容保存后的文件。 +- 如果遇到tilelang问题,可以查看[这个issue](https://github.com/modelscope/ms-swift/issues/9494)。 +- mcore-bridge DeepSeek-V4 FP8修复:[PR](https://github.com/modelscope/mcore-bridge/pull/133)。 + +这里先做量化(这里的量化会导致LoRA增量信息丢失,这里只作为例子,建议使用FP8全参数训练并导出FP8权重): + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +NPROC_PER_NODE=8 \ +megatron export \ + --model megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged \ + --output_dir megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \ + --to_hf true \ + --fp8_recipe blockwise \ + --fp8_format e4m3 \ + --fp8_param_gather true \ + --mtp_num_layers 1 \ + --expert_model_parallel_size 8 +``` + +vLLM启动命令: +```shell +vllm serve megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \ + --trust-remote-code \ + --kv-cache-dtype fp8 \ + --block-size 256 \ + --enable-expert-parallel \ + --tensor-parallel-size 8 \ + --max-model-len 8192 \ + --tokenizer-mode deepseek_v4 \ + --tool-call-parser deepseek_v4 \ + --enable-auto-tool-choice \ + --reasoning-parser deepseek_v4 +``` From cef2008fe69e9e996fc8ba6c652234f937a5c1af Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 24 Jun 2026 10:45:22 +0800 Subject: [PATCH 5/5] update --- docs/source_en/BestPractices/deepseek-v4.md | 38 +++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/docs/source_en/BestPractices/deepseek-v4.md b/docs/source_en/BestPractices/deepseek-v4.md index 5e40484bfe..8d1cf9a26a 100644 --- a/docs/source_en/BestPractices/deepseek-v4.md +++ b/docs/source_en/BestPractices/deepseek-v4.md @@ -214,3 +214,41 @@ swift infer \ Inference result: 
![result](../../resources/deepseek_v4/infer_result.png) + +Running vLLM inference: + +- If you want to use vLLM for inference, you can refer to [this documentation](https://recipes.vllm.ai/deepseek-ai/DeepSeek-V4-Flash). You need FP4/FP8 precision weights. +- Additionally, you need to copy the original `config.json` file and change its `expert_dtype` so that it matches the value in the `config.json` saved after training. This is necessary because the file written by transformers' `config.save_pretrained` differs from the original file, and vLLM is not compatible with the saved file. +- If you encounter tilelang issues, you can check [this issue](https://github.com/modelscope/ms-swift/issues/9494). +- mcore-bridge DeepSeek-V4 FP8 fix: [PR](https://github.com/modelscope/mcore-bridge/pull/133). + +First, quantize the weights (note: quantizing here discards the incremental information learned by LoRA, so this step is shown only as an example; FP8 full-parameter training followed by exporting FP8 weights is recommended): + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +NPROC_PER_NODE=8 \ +megatron export \ + --model megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged \ + --output_dir megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \ + --to_hf true \ + --fp8_recipe blockwise \ + --fp8_format e4m3 \ + --fp8_param_gather true \ + --mtp_num_layers 1 \ + --expert_model_parallel_size 8 +``` + +vLLM launch command: +```shell +vllm serve megatron_output/DeepSeek-V4-Flash/vx-xxx/checkpoint-xxx-merged-FP8 \ + --trust-remote-code \ + --kv-cache-dtype fp8 \ + --block-size 256 \ + --enable-expert-parallel \ + --tensor-parallel-size 8 \ + --max-model-len 8192 \ + --tokenizer-mode deepseek_v4 \ + --tool-call-parser deepseek_v4 \ + --enable-auto-tool-choice \ + --reasoning-parser deepseek_v4 +```