diff --git a/README.md b/README.md index 8089b29d..cc92fc2f 100644 --- a/README.md +++ b/README.md @@ -112,25 +112,27 @@ supported on Twinkle✨ framework. > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed > by one training base at a time, and currently it is [Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507). -| Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | -| ------------------- | ------------------------------------------------------------ | :-------------------------------------: | -------------------- | :--------------: | :----------------------------------------------------------: | -| qwen3 series | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) | -| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | -| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | 30B-A3B/A3B-Base,235B-A22B | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | -| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | -| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) | -| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | 
[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | -| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | 0.5B/1.5B/3B/7B/14B/32B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | -| qwen2_moe series | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | -| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) | -| chatglm3 series | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | 6b/6b-base/6b-32k/6b-128k | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) | -| chatglm4 series | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | glm-4-9b/glm-4-9b-chat/glm-4-9b-chat-1m | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | -| | [ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | - | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | -| glm_edge series | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | 1.5b-chat/4b-chat | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | -| internlm2 series | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | 1_8b/chat-1_8b-sft/base-7b/7b/chat-7b/ | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | -| deepseek_v1 | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | V2/V2-Lite/V2-Chat/2-Lite-Chat/V2.5 | 
transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | -| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) | -| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | +| Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | +|---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| +| qwen3 series | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) | +| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | +| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | 30B-A3B/A3B-Base,235B-A22B | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | +| qwen3.5 moe series | [Qwen/Qwen3.5-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.5-35B-A3B) | 35B-A3B,122B-A10B, etc. 
| transformers>=5.20 | ✔ | [Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B) | +| qwen3.5 series | [Qwen/Qwen3.5-9B](https://www.modelscope.cn/models/Qwen/Qwen3.5-9B) | 2B ~ 27B | transformers>=5.20 | ✔ | [Qwen/Qwen3.5-9B](https://huggingface.co/Qwen/Qwen3.5-9B) | +| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | +| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) | +| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | +| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | 0.5B/1.5B/3B/7B/14B/32B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | +| qwen2_moe series | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | +| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) | +| chatglm3 series | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | 6b/6b-base/6b-32k/6b-128k | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) | +| chatglm4 series | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | glm-4-9b/glm-4-9b-chat/glm-4-9b-chat-1m | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | +| | 
[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | - | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | +| glm_edge series | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | 1.5b-chat/4b-chat | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | +| internlm2 series | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | 1_8b/chat-1_8b-sft/base-7b/7b/chat-7b/ | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | +| deepseek_v1 | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | V2/V2-Lite/V2-Chat/2-Lite-Chat/V2.5 | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | +| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) | +| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | | deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | For more detailed model support list 👉 [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md) @@ -159,7 +161,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen3-4B' + base_model = 'ms://Qwen/Qwen3.5-4B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding diff --git a/README_ZH.md b/README_ZH.md index 783597b9..10c1fc88 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -93,25 +93,27 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl >[!Note] > 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507)。 -| Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | -| ------------------- | ------------------------------------------------------------ | :-------------------------------------: | -------------------- | :--------------: | :----------------------------------------------------------: | -| qwen3 全系列 | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) | -| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | -| qwen3_moe 全系列 | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | 30B-A3B/A3B-Base,235B-A22B | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | -| qwen2 全系列 | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | -| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) | -| | 
[Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | -| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | 0.5B/1.5B/3B/7B/14B/32B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | -| qwen2_moe 全系列 | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | -| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) | -| chatglm3 全系列 | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | 6b/6b-base/6b-32k/6b-128k | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) | -| chatglm4 全系列 | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | glm-4-9b/glm-4-9b-chat/glm-4-9b-chat-1m | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | -| | [ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | - | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | -| glm_edge 全系列 | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | 1.5b-chat/4b-chat | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | -| internlm2 全系列 | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | 1_8b/chat-1_8b-sft/base-7b/7b/chat-7b/ | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | -| deepseek_v1 | 
[deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | V2/V2-Lite/V2-Chat/2-Lite-Chat/V2.5 | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | -| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) | -| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | +| Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | +|---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| +| qwen3 全系列 | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) | +| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | +| qwen3_moe 全系列 | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | 30B-A3B/A3B-Base,235B-A22B | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | +| qwen3.5 moe 全系列 | [Qwen/Qwen3.5-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.5-35B-A3B) | 35B-A3B,122B-A10B, etc. 
| transformers>=5.20 | ✔ | [Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B) | +| qwen3.5 全系列 | [Qwen/Qwen3.5-9B](https://www.modelscope.cn/models/Qwen/Qwen3.5-9B) | 2B ~ 27B | transformers>=5.20 | ✔ | [Qwen/Qwen3.5-9B](https://huggingface.co/Qwen/Qwen3.5-9B) | +| qwen2 全系列 | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | +| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) | +| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | +| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | 0.5B/1.5B/3B/7B/14B/32B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | +| qwen2_moe 全系列 | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | +| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) | +| chatglm3 全系列 | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | 6b/6b-base/6b-32k/6b-128k | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) | +| chatglm4 全系列 | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | glm-4-9b/glm-4-9b-chat/glm-4-9b-chat-1m | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | +| | 
[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | - | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | +| glm_edge 全系列 | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | 1.5b-chat/4b-chat | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | +| internlm2 全系列 | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | 1_8b/chat-1_8b-sft/base-7b/7b/chat-7b/ | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | +| deepseek_v1 | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | V2/V2-Lite/V2-Chat/2-Lite-Chat/V2.5 | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | +| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) | +| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | | deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 更详细的模型支持列表 👉 [快速开始.md](docs/source_zh/使用指引/快速开始.md) @@ -139,7 +141,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen3-4B' + base_model = 'ms://Qwen/Qwen3.5-4B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding diff --git a/cookbook/client/tinker/custom_service/lora.py b/cookbook/client/tinker/custom_service/lora.py index e1eab4d3..292e9115 100644 --- a/cookbook/client/tinker/custom_service/lora.py +++ b/cookbook/client/tinker/custom_service/lora.py @@ -61,7 +61,7 @@ # Step 6: Create or resume a training client. # If resume_path is set, it restores both model weights and optimizer state. -base_model = 'Qwen/Qwen3-4B' +base_model = 'Qwen/Qwen3.5-4B' if not resume_path: training_client = service_client.create_lora_training_client(base_model=base_model) else: diff --git a/cookbook/client/tinker/custom_service/megatron/server_config.yaml b/cookbook/client/tinker/custom_service/megatron/server_config.yaml index b8fa1abd..a8103b76 100644 --- a/cookbook/client/tinker/custom_service/megatron/server_config.yaml +++ b/cookbook/client/tinker/custom_service/megatron/server_config.yaml @@ -24,7 +24,7 @@ applications: server_config: per_token_model_limit: 3 # Maximum number of models (adapters) per token (server-globally enforced) supported_models: - - Qwen/Qwen3-4B + - Qwen/Qwen3.5-4B deployments: - name: TinkerCompatServer autoscaling_config: @@ -36,12 +36,12 @@ applications: # 2. Model Service (commented out) - Would host the base model for training. # Uncomment and configure if you need a training model worker. - - name: models-Qwen3-4B - route_prefix: /api/v1/model/Qwen/Qwen3-4B + - name: models-Qwen3.5-4B + route_prefix: /api/v1/model/Qwen/Qwen3.5-4B import_path: model args: use_megatron: true - model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier max_length: 10240 nproc_per_node: 2 # Number of GPU processes per node device_group: @@ -73,11 +73,11 @@ applications: # 3. 
Sampler Service - Runs inference / sampling using vLLM engine # Used for generating text from the model (e.g., evaluating LoRA results). - - name: sampler-Qwen3-4B - route_prefix: /api/v1/sampler/Qwen/Qwen3-4B + - name: sampler-Qwen3.5-4B + route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B import_path: sampler args: - model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier nproc_per_node: 2 # Number of GPU processes per node sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) engine_args: # vLLM engine-specific settings diff --git a/cookbook/client/tinker/custom_service/sample.py b/cookbook/client/tinker/custom_service/sample.py index 278f24bf..132eb63a 100644 --- a/cookbook/client/tinker/custom_service/sample.py +++ b/cookbook/client/tinker/custom_service/sample.py @@ -17,7 +17,7 @@ from tinker import ServiceClient # Step 2: Define the base model and connect to the server -base_model = 'Qwen/Qwen3-4B' +base_model = 'Qwen/Qwen3.5-4B' service_client = ServiceClient( base_url='http://localhost:8000', api_key='EMPTY-TOKEN' diff --git a/cookbook/client/tinker/custom_service/self_cognition.py b/cookbook/client/tinker/custom_service/self_cognition.py index 4acc88f7..e285cc7f 100644 --- a/cookbook/client/tinker/custom_service/self_cognition.py +++ b/cookbook/client/tinker/custom_service/self_cognition.py @@ -24,7 +24,7 @@ from tinker import ServiceClient # The base model to fine-tune / evaluate -base_model = 'Qwen/Qwen3-4B' +base_model = 'Qwen/Qwen3.5-4B' base_url = 'http://localhost:8000' api_key = 'EMPTY_API_KEY' diff --git a/cookbook/client/tinker/custom_service/short_math_grpo.py b/cookbook/client/tinker/custom_service/short_math_grpo.py index d35102b7..6b1e6cea 100644 --- a/cookbook/client/tinker/custom_service/short_math_grpo.py +++ b/cookbook/client/tinker/custom_service/short_math_grpo.py @@ -38,7 +38,7 @@ logger = get_logger() # ========== Configuration ========== 
-BASE_MODEL = 'Qwen/Qwen3-4B' +BASE_MODEL = 'Qwen/Qwen3.5-4B' NUM_GENERATIONS = 8 MAX_NEW_TOKENS = 4096 LEARNING_RATE = 1e-4 diff --git a/cookbook/client/tinker/custom_service/transformer/server_config.yaml b/cookbook/client/tinker/custom_service/transformer/server_config.yaml index 5009ce08..e79ad6f2 100644 --- a/cookbook/client/tinker/custom_service/transformer/server_config.yaml +++ b/cookbook/client/tinker/custom_service/transformer/server_config.yaml @@ -24,7 +24,7 @@ applications: server_config: per_token_model_limit: 3 # Maximum number of models (adapters) per token (server-globally enforced) supported_models: - - Qwen/Qwen3-4B + - Qwen/Qwen3.5-4B deployments: - name: TinkerCompatServer autoscaling_config: @@ -36,12 +36,12 @@ applications: # 2. Model Service (commented out) - Would host the base model for training. # Uncomment and configure if you need a training model worker. - - name: models-Qwen3-4B - route_prefix: /api/v1/model/Qwen/Qwen3-4B + - name: models-Qwen3.5-4B + route_prefix: /api/v1/model/Qwen/Qwen3.5-4B import_path: model args: use_megatron: false # Use HuggingFace Transformers backend - model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier max_length: 10240 nproc_per_node: 2 # Number of GPU processes per node device_group: @@ -70,11 +70,11 @@ applications: # 3. Sampler Service - Runs inference / sampling using vLLM engine # Used for generating text from the model (e.g., evaluating LoRA results). 
- - name: sampler-Qwen3-4B - route_prefix: /api/v1/sampler/Qwen/Qwen3-4B + - name: sampler-Qwen3.5-4B + route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B import_path: sampler args: - model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier nproc_per_node: 2 # Number of GPU processes per node sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) engine_args: # vLLM engine-specific settings diff --git a/cookbook/client/twinkle/grpo.py b/cookbook/client/twinkle/grpo.py index 6db8cee2..1f7c0553 100644 --- a/cookbook/client/twinkle/grpo.py +++ b/cookbook/client/twinkle/grpo.py @@ -44,7 +44,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = 'ms://Qwen/Qwen3-4B' +MODEL_ID = 'ms://Qwen/Qwen3.5-4B' NUM_GENERATIONS = 4 MAX_NEW_TOKENS = 1024 LEARNING_RATE = 1e-5 diff --git a/cookbook/client/twinkle/megatron/server_config.yaml b/cookbook/client/twinkle/megatron/server_config.yaml index 91c300f9..c8efe648 100644 --- a/cookbook/client/twinkle/megatron/server_config.yaml +++ b/cookbook/client/twinkle/megatron/server_config.yaml @@ -34,13 +34,13 @@ applications: # 2. Model Service - Hosts the base model for training (Megatron backend) # This is the actual model worker that performs forward/backward passes. 
- - name: models-Qwen3-4B - route_prefix: /models/Qwen/Qwen3-4B # REST path for this model + - name: models-Qwen3.5-4B + route_prefix: /models/Qwen/Qwen3.5-4B # REST path for this model import_path: model args: use_megatron: true # Use Megatron-LM backend (not HuggingFace) mixed_precision: bf16 - model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier to load + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier to load nproc_per_node: 2 # Number of GPU processes per node device_group: # Logical device group for this model name: model diff --git a/cookbook/client/twinkle/sample.py b/cookbook/client/twinkle/sample.py index 75aeb5b1..9437bb36 100644 --- a/cookbook/client/twinkle/sample.py +++ b/cookbook/client/twinkle/sample.py @@ -22,7 +22,7 @@ logger = get_logger() -MODEL_ID = 'Qwen/Qwen3-4B' +MODEL_ID = 'Qwen/Qwen3.5-4B' # Optional: adapter URI for LoRA inference # This can be a twinkle:// path from a training run checkpoint diff --git a/cookbook/client/twinkle/self_congnition.py b/cookbook/client/twinkle/self_congnition.py index 781b809f..6b907653 100644 --- a/cookbook/client/twinkle/self_congnition.py +++ b/cookbook/client/twinkle/self_congnition.py @@ -51,7 +51,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply a chat template so the data matches the model's expected input format - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) # Replace placeholder names in the dataset with custom model/author names dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) @@ -65,7 +65,7 @@ def train(): # Step 5: Configure the model # Create a multi-LoRA Transformers model pointing to the base model on ModelScope - model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3-4B') + model = 
MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B') # Define LoRA configuration: apply low-rank adapters to all linear layers lora_config = LoraConfig(target_modules='all-linear') diff --git a/cookbook/client/twinkle/transformer/server_config.yaml b/cookbook/client/twinkle/transformer/server_config.yaml index f10b5b3e..e16ced6a 100644 --- a/cookbook/client/twinkle/transformer/server_config.yaml +++ b/cookbook/client/twinkle/transformer/server_config.yaml @@ -34,12 +34,12 @@ applications: # 2. Model Service - Hosts the base model for training # This is the actual model worker that performs forward/backward passes. - - name: models-Qwen3-4B - route_prefix: /models/Qwen/Qwen3-4B # REST path for this model + - name: models-Qwen3.5-4B + route_prefix: /models/Qwen/Qwen3.5-4B # REST path for this model import_path: model args: use_megatron: false # Use HuggingFace Transformers (not Megatron) - model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier to load + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier to load adapter_config: adapter_timeout: 1800 # Seconds before an idle adapter is unloaded nproc_per_node: 2 # Number of GPU processes per node @@ -91,11 +91,11 @@ applications: # 4. Sampler Service - Handles text generation inference # Uses vLLM for efficient batched generation with optional LoRA adapters. 
- - name: sampler-Qwen3-4B - route_prefix: /samplers/Qwen/Qwen3-4B # REST path for this sampler + - name: sampler-Qwen3.5-4B + route_prefix: /samplers/Qwen/Qwen3.5-4B # REST path for this sampler import_path: sampler args: - model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier to load + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier to load sampler_type: vllm # Sampler backend (vllm or torch) nproc_per_node: 2 # Number of GPU processes per node engine_args: # vLLM engine configuration diff --git a/cookbook/megatron/tp.py b/cookbook/megatron/tp.py index b09d1a60..ee457fe7 100644 --- a/cookbook/megatron/tp.py +++ b/cookbook/megatron/tp.py @@ -9,7 +9,7 @@ from twinkle.model import MegatronModel from twinkle.preprocessor import SelfCognitionProcessor -# Construct a device_mesh, tp=pp=cp=2, dp=1 +# Construct a device_mesh, dp=tp=pp=2 -device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2) +device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2) # use torchrun mode twinkle.initialize(mode='local', global_device_mesh=device_mesh) @@ -19,7 +19,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=16) @@ -33,7 +33,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -41,7 +41,7 @@ def train(): -# Global batch size = 1, dp_size = 1 +# Global batch size = 16, dp_size = 2 dataloader =
DataLoader(dataset=dataset, batch_size=16) # Use a MegatronModel - model = MegatronModel(model_id='ms://Qwen/Qwen3-4B') + model = MegatronModel(model_id='ms://Qwen/Qwen3.5-4B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') diff --git a/cookbook/ray/single_controller.py b/cookbook/ray/single_controller.py index cd1482a6..39d99353 100644 --- a/cookbook/ray/single_controller.py +++ b/cookbook/ray/single_controller.py @@ -41,7 +41,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -49,7 +49,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default') + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='default') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') diff --git a/cookbook/rl/grpo.py b/cookbook/rl/grpo.py index 29059fae..590ca719 100644 --- a/cookbook/rl/grpo.py +++ b/cookbook/rl/grpo.py @@ -20,8 +20,8 @@ logger = get_logger() -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B') -USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1'))) +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B') +USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '0'))) MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS',4)) diff --git a/cookbook/transformers/fsdp2.py b/cookbook/transformers/fsdp2.py index ca37d724..5624495d 100644 --- a/cookbook/transformers/fsdp2.py +++ 
b/cookbook/transformers/fsdp2.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -35,7 +35,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -43,7 +43,8 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') + model.model._no_split_modules = {'Qwen3_5DecoderLayer'} lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') diff --git a/cookbook/transformers/sp_fsdp_dense.py b/cookbook/transformers/sp_fsdp_dense.py index da6e2d28..868b61c0 100644 --- a/cookbook/transformers/sp_fsdp_dense.py +++ b/cookbook/transformers/sp_fsdp_dense.py @@ -10,7 +10,7 @@ from twinkle.preprocessor import SelfCognitionProcessor logger = get_logger() -MODEL_ID = 'ms://Qwen/Qwen3-4B' +MODEL_ID = 'ms://Qwen/Qwen3.5-4B' DATASETS = 'ms://swift/self-cognition' device_group = [DeviceGroup( diff --git a/docs/source_en/Components/Advantage/GRPOAdvantage.md b/docs/source_en/Components/Advantage/GRPOAdvantage.md index ba90b4c0..fb92e7b4 100644 --- a/docs/source_en/Components/Advantage/GRPOAdvantage.md +++ 
b/docs/source_en/Components/Advantage/GRPOAdvantage.md @@ -41,8 +41,8 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward # Create components -actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B') -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +actor = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') reward_fn = MathReward() advantage_fn = GRPOAdvantage() diff --git a/docs/source_en/Components/Advantage/RLOOAdvantage.md b/docs/source_en/Components/Advantage/RLOOAdvantage.md index 5479a4ce..e78590b8 100644 --- a/docs/source_en/Components/Advantage/RLOOAdvantage.md +++ b/docs/source_en/Components/Advantage/RLOOAdvantage.md @@ -38,8 +38,8 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward # Create components -actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B') -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +actor = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') reward_fn = MathReward() advantage_fn = RLOOAdvantage() dataloader = ... 
diff --git a/docs/source_en/Components/Data Format/Sampling.md b/docs/source_en/Components/Data Format/Sampling.md index cd21454a..9bc3cc8f 100644 --- a/docs/source_en/Components/Data Format/Sampling.md +++ b/docs/source_en/Components/Data Format/Sampling.md @@ -62,7 +62,7 @@ Usage example: from twinkle.data_format import SamplingParams, SampleResponse from twinkle.sampler import vLLMSampler -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') params = SamplingParams(max_tokens=512, temperature=0.7, top_p=0.9) response: SampleResponse = sampler.sample(trajectories, sampling_params=params, num_samples=4) diff --git a/docs/source_en/Components/Dataset/Dataset.md b/docs/source_en/Components/Dataset/Dataset.md index fc75e6fc..aea56968 100644 --- a/docs/source_en/Components/Dataset/Dataset.md +++ b/docs/source_en/Components/Dataset/Dataset.md @@ -60,7 +60,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r The Template component is responsible for converting string/image multimodal raw data into model input tokens. The dataset can set a Template to complete the `encode` process. ```python -dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) +dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) ``` The set_template method supports passing `kwargs` (such as `max_length` in the example) to be used as constructor parameters for `Template`. diff --git a/docs/source_en/Components/Model/MegatronModel.md b/docs/source_en/Components/Model/MegatronModel.md index d6b26c77..75425d6f 100644 --- a/docs/source_en/Components/Model/MegatronModel.md +++ b/docs/source_en/Components/Model/MegatronModel.md @@ -35,7 +35,7 @@ from twinkle.model import MegatronModel from twinkle import DeviceMesh from twinkle.dataloader import DataLoader dataloader = DataLoader(...) 
-model = MegatronModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor') +model = MegatronModel(model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor') model.add_adapter_to_model(...) model.set_optimizer('default', adapter_name='...') for data in dataloader: diff --git a/docs/source_en/Components/Model/TransformersModel.md b/docs/source_en/Components/Model/TransformersModel.md index ff9eac7e..7e34149f 100644 --- a/docs/source_en/Components/Model/TransformersModel.md +++ b/docs/source_en/Components/Model/TransformersModel.md @@ -41,7 +41,7 @@ from twinkle.model import TransformersModel from twinkle import DeviceMesh from twinkle.dataloader import DataLoader dataloader = DataLoader(...) -model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor') +model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor') model.add_adapter_to_model(...) 
model.set_optimizer(..., adapter_name='...') for data in dataloader: diff --git a/docs/source_en/Components/Reward/Reward.md b/docs/source_en/Components/Reward/Reward.md index 0f9903b7..b010a95a 100644 --- a/docs/source_en/Components/Reward/Reward.md +++ b/docs/source_en/Components/Reward/Reward.md @@ -87,7 +87,7 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward from twinkle.advantage import GRPOAdvantage -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') reward_fn = MathReward() advantage_fn = GRPOAdvantage() diff --git a/docs/source_en/Components/Sampler/TorchSampler.md b/docs/source_en/Components/Sampler/TorchSampler.md index 93d1469c..6076d801 100644 --- a/docs/source_en/Components/Sampler/TorchSampler.md +++ b/docs/source_en/Components/Sampler/TorchSampler.md @@ -9,7 +9,7 @@ from twinkle.sampler import TorchSampler from twinkle import DeviceMesh sampler = TorchSampler( - model_id='ms://Qwen/Qwen3-4B', + model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=1), ) diff --git a/docs/source_en/Components/Sampler/vLLMSampler.md b/docs/source_en/Components/Sampler/vLLMSampler.md index 53c034e7..f6eb1fa6 100644 --- a/docs/source_en/Components/Sampler/vLLMSampler.md +++ b/docs/source_en/Components/Sampler/vLLMSampler.md @@ -11,7 +11,7 @@ from twinkle import DeviceMesh # Create sampler sampler = vLLMSampler( - model_id='ms://Qwen/Qwen3-4B', + model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2), remote_group='sampler_group' ) @@ -60,7 +60,7 @@ twinkle.initialize('ray', groups=device_groups) # Create remote sampler sampler = vLLMSampler( - model_id='ms://Qwen/Qwen3-4B', + model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=4), remote_group='sampler' ) diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md index 4c1e55bb..24820fea 100644 --- 
a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -46,7 +46,7 @@ def train(): dataset = PackingDataset(dataset_meta) dataset.map(SelfCognitionProcessor(model_name='Twinkle Model', model_author='ModelScope Community')) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.encode() dataset.pack_dataset() @@ -90,7 +90,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -98,7 +98,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') @@ -130,7 +130,7 @@ if __name__ == '__main__': train() ``` -In this training code, we constructed a dataset and loaded the Qwen/Qwen3-4B model, used LoRA with the all-linear approach, and completed one training run. In the logs, you can observe the process of loss gradually converging. +In this training code, we constructed a dataset and loaded the Qwen/Qwen3.5-4B model, used LoRA with the all-linear approach, and completed one training run. In the logs, you can observe the process of loss gradually converging. 
### torchrun @@ -158,7 +158,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -166,7 +166,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') @@ -231,7 +231,7 @@ from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward from twinkle.sampler import vLLMSampler from twinkle.template import Template -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B') MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS',4)) NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS @@ -437,7 +437,7 @@ from twinkle_client.sampler import vLLMSampler logger = get_logger() # ========== Configuration ========== -MODEL_ID = 'ms://Qwen/Qwen3-4B' +MODEL_ID = 'ms://Qwen/Qwen3.5-4B' NUM_GENERATIONS = 4 MAX_NEW_TOKENS = 1024 LEARNING_RATE = 1e-5 @@ -682,7 +682,7 @@ from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.tinker.common import input_feature_to_datum # The base model to fine-tune / evaluate -base_model = 'ms://Qwen/Qwen3-4B' +base_model = 'ms://Qwen/Qwen3.5-4B' def train(): @@ -747,7 +747,7 @@ This service shares the same code as the Tinker API section described above. The Switch the prefix. 
```text -ms://Qwen/Qwen3-4B -> hf://Qwen/Qwen3-4B +ms://Qwen/Qwen3.5-4B -> hf://Qwen/Qwen3.5-4B ``` ## 🛠️ Twinkle✨ Modular Ecosystem @@ -874,10 +874,10 @@ DeviceGroup: Define how many resource groups are needed for this training sessio ```python from twinkle.model import TransformersModel -model = TransformersModel(model_id='Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh) +model = TransformersModel(model_id='Qwen/Qwen3.5-4B', remote_group='default', device_mesh=device_mesh) # Or from twinkle.model import MegatronModel -model = MegatronModel(model_id='Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh) +model = MegatronModel(model_id='Qwen/Qwen3.5-4B', remote_group='default', device_mesh=device_mesh) ``` DeviceMesh specifies the topology of components like models within the resource group. It can be understood as how to perform parallelization. This affects a series of framework decisions, such as data acquisition, data consumption, data return, etc. @@ -903,7 +903,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -911,7 +911,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='Qwen/Qwen3-4B', remote_group='default') + model = TransformersModel(model_id='Qwen/Qwen3.5-4B', remote_group='default') lora_config = LoraConfig( r=8, @@ -948,29 +948,3 @@ Start training like this: ```shell python3 train.py ``` - -## Supported Large Language Models List - -| Model Type | Model ID Example | Model Size | Requires | Support 
Megatron | HF Model ID | -| ------------------- | ------------------------------------------------------------ | :-------------------------------------: | -------------------- | :--------------: | :----------------------------------------------------------: | -| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | -| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) | -| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | -| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | 0.5B/1.5B/3B/7B/14B/32B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | -| qwen2_moe series | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | -| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) | -| qwen3 series | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) | -| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | -| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | - | 
transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | -| | [Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B) | - | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) | -| | [Qwen/Qwen3-235B-A22B](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B) | - | transformers>=4.51 | ✔ | [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) | -| chatglm2 series | [ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b) | 6b/6b-32k | transformers<4.42 | ✘ | [zai-org/chatglm2-6b](https://huggingface.co/zai-org/chatglm2-6b) | -| chatglm3 series | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | 6b/6b-base/6b-32k/6b-128k | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) | -| chatglm4 series | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | glm-4-9b/glm-4-9b-chat/glm-4-9b-chat-1m | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | -| | [ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | - | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | -| glm_edge series | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | 1.5b-chat/4b-chat | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | -| internlm2 series | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | 1_8b/chat-1_8b-sft/base-7b/7b/chat-7b/ | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | -| deepseek_v1 | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | V2/V2-Lite/V2-Chat/2-Lite-Chat/V2.5 | transformers>=4.39.3 | ✔ | 
[deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | -| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) | -| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | -| deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index 98579527..5e4cbf0d 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -46,7 +46,7 @@ def train(): dataset = PackingDataset(dataset_meta) dataset.map(SelfCognitionProcessor(model_name='Twinkle模型', model_author='ModelScope社区')) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.encode() dataset.pack_dataset() @@ -90,7 +90,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', 
model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -98,7 +98,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') @@ -131,7 +131,7 @@ if __name__ == '__main__': ``` -在这个训练代码中,我们构造了一个数据集并拉起了Qwen/Qwen3-4B模型,使用all-linear方式加载了lora,并完成了一次训练。在日志中,可以看到loss逐步收敛的过程。 +在这个训练代码中,我们构造了一个数据集并拉起了Qwen/Qwen3.5-4B模型,使用all-linear方式加载了lora,并完成了一次训练。在日志中,可以看到loss逐步收敛的过程。 ### torchrun @@ -159,7 +159,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -167,7 +167,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') @@ -233,7 +233,7 @@ from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward from twinkle.sampler import vLLMSampler from twinkle.template import Template -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B') MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS',4)) NUM_GPUS = MODEL_GPUS + 
SAMPLER_GPUS @@ -439,7 +439,7 @@ from twinkle_client.sampler import vLLMSampler logger = get_logger() # ========== Configuration ========== -MODEL_ID = 'ms://Qwen/Qwen3-4B' +MODEL_ID = 'ms://Qwen/Qwen3.5-4B' NUM_GENERATIONS = 4 MAX_NEW_TOKENS = 1024 LEARNING_RATE = 1e-5 @@ -684,7 +684,7 @@ from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.tinker.common import input_feature_to_datum # The base model to fine-tune / evaluate -base_model = 'Qwen/Qwen3-4B' +base_model = 'Qwen/Qwen3.5-4B' def train(): @@ -749,7 +749,7 @@ if __name__ == '__main__': 切换前缀即可。 ```text -ms://Qwen/Qwen3-4B -> hf://Qwen/Qwen3-4B +ms://Qwen/Qwen3.5-4B -> hf://Qwen/Qwen3.5-4B ``` ## 🛠️ Twinkle✨ 模块化生态系统 @@ -876,10 +876,10 @@ DeviceGroup:定义本次训练需要多少个资源组。定义后,组件可 ```python from twinkle.model import TransformersModel -model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh) +model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='default', device_mesh=device_mesh) # 或者 from twinkle.model import MegatronModel -model = MegatronModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh) +model = MegatronModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='default', device_mesh=device_mesh) ``` DeviceMesh 指定了模型等组件在资源组中的拓扑结构。可以理解为如何进行并行。这会影响一系列的框架决策,例如数据获取、数据消费、数据返回等。 @@ -905,7 +905,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -913,7 +913,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) # Use a TransformersModel - model 
= TransformersModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default') + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', remote_group='default') lora_config = LoraConfig( r=8, @@ -950,29 +950,3 @@ if __name__ == '__main__': ```shell python3 train.py ``` - -## 支持的大语言模型列表 - -| Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | -| ------------------- | ------------------------------------------------------------ | :-------------------------------------: | -------------------- | :--------------: | :----------------------------------------------------------: | -| qwen2 全系列 | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | -| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | 0.5B/1.5B/7B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) | -| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | 0.5B/1.5B/3B/7B/14B/32B/72B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | -| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | 0.5B/1.5B/3B/7B/14B/32B | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | -| qwen2_moe 全系列 | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | -| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | - | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) | -| qwen3 全系列 | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | 0.6B/1.7B/4B/8B/14B | transformers>=4.51 | ✔ | 
[Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) | -| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | 0.6B/1.7B/4B/8B/14B/32B | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | -| qwen3_moe 全系列 | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | - | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | -| | [Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B) | - | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) | -| | [Qwen/Qwen3-235B-A22B](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B) | - | transformers>=4.51 | ✔ | [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) | -| chatglm2 全系列 | [ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b) | 6b/6b-32k | transformers<4.42 | ✘ | [zai-org/chatglm2-6b](https://huggingface.co/zai-org/chatglm2-6b) | -| chatglm3 全系列 | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | 6b/6b-base/6b-32k/6b-128k | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) | -| chatglm4 全系列 | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | glm-4-9b/glm-4-9b-chat/glm-4-9b-chat-1m | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | -| | [ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | - | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | -| glm_edge 全系列 | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | 1.5b-chat/4b-chat | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | -| internlm2 全系列 | 
[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | 1_8b/chat-1_8b-sft/base-7b/7b/chat-7b/ | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | -| deepseek_v1 | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | V2/V2-Lite/V2-Chat/2-Lite-Chat/V2.5 | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | -| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) | -| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | -| deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md" "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md" index 77f8441e..9dc0635d 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md" @@ -41,8 +41,8 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward # 创建组件 -actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B') -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +actor = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') reward_fn = MathReward() advantage_fn = 
GRPOAdvantage() diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md" "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md" index 50a7c53b..d42876b9 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md" @@ -38,8 +38,8 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward # 创建组件 -actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B') -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +actor = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') reward_fn = MathReward() advantage_fn = RLOOAdvantage() diff --git "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md" "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md" index 2728612c..d23555d2 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md" @@ -87,7 +87,7 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward from twinkle.advantage import GRPOAdvantage -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') reward_fn = MathReward() advantage_fn = GRPOAdvantage() diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md" index 71977be1..3605185d 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md" @@ -62,7 +62,7 @@ class SampleResponse: from 
twinkle.data_format import SamplingParams, SampleResponse from twinkle.sampler import vLLMSampler -sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B') params = SamplingParams(max_tokens=512, temperature=0.7, top_p=0.9) response: SampleResponse = sampler.sample(trajectories, sampling_params=params, num_samples=4) diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" index 11d8a716..780ddf17 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" @@ -60,7 +60,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r Template 组件是负责将字符串/图片多模态原始数据转换为模型输入 token 的组件。数据集可以设置一个 Template 来完成 `encode` 过程。 ```python -dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) +dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) ``` set_template 方法支持传入 `kwargs`(例如例子中的 `max_length`),作为 `Template` 的构造参数使用。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md" index 08986bbd..2a4b5b53 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md" @@ -35,7 +35,7 @@ from twinkle.model import MegatronModel from twinkle import DeviceMesh from twinkle.dataloader import DataLoader dataloader = DataLoader(...) 
-model = MegatronModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor') +model = MegatronModel(model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor') model.add_adapter_to_model(...) model.set_optimizer('default', adapter_name='...') for data in dataloader: diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md" index 297ca54f..468b5efb 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md" @@ -41,7 +41,7 @@ from twinkle.model import TransformersModel from twinkle import DeviceMesh from twinkle.dataloader import DataLoader dataloader = DataLoader(...) -model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor') +model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor') model.add_adapter_to_model(...) 
model.set_optimizer(..., adapter_name='...') for data in dataloader: diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" index c23246f7..bcdce5b9 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" @@ -9,7 +9,7 @@ from twinkle.sampler import TorchSampler from twinkle import DeviceMesh sampler = TorchSampler( - model_id='ms://Qwen/Qwen3-4B', + model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=1), ) diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" index 32fe18b6..eced51f7 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" @@ -11,7 +11,7 @@ from twinkle import DeviceMesh # 创建采样器 sampler = vLLMSampler( - model_id='ms://Qwen/Qwen3-4B', + model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2), remote_group='sampler_group' ) @@ -60,7 +60,7 @@ twinkle.initialize('ray', groups=device_groups) # 创建远程采样器 sampler = vLLMSampler( - model_id='ms://Qwen/Qwen3-4B', + model_id='ms://Qwen/Qwen3.5-4B', device_mesh=DeviceMesh.from_sizes(dp_size=4), remote_group='sampler' ) diff --git a/src/twinkle/model/megatron/model/constant.py b/src/twinkle/model/megatron/model/constant.py index 33ac637c..968186ac 100644 --- a/src/twinkle/model/megatron/model/constant.py +++ b/src/twinkle/model/megatron/model/constant.py @@ -32,6 +32,7 @@ class MLLMMegatronModelType: qwen2_5_vl = 'qwen2_5_vl' qwen3_vl = 'qwen3_vl' qwen3_5 = 'qwen3_5' 
+ qwen3_5_moe = 'qwen3_5_moe' class MegatronModelType(LLMMegatronModelType, MLLMMegatronModelType): diff --git a/src/twinkle/model/megatron/model/gpt_bridge.py b/src/twinkle/model/megatron/model/gpt_bridge.py index a5190ae0..daea5c90 100644 --- a/src/twinkle/model/megatron/model/gpt_bridge.py +++ b/src/twinkle/model/megatron/model/gpt_bridge.py @@ -1317,8 +1317,12 @@ def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: bool to_mcore) else: hf_state_dict.update(self._set_mlp_state(mg_mlp, hf_state_dict, f'{hf_mlp_prefix}.', layer_idx, to_mcore)) - self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict, - 'post_attention_layernorm.weight', to_mcore) + if self.args.hf_model_type == 'qwen3_5': + self._set_state_dict(mg_layer, 'pre_mlp_layernorm.weight', hf_state_dict, + 'post_attention_layernorm.weight', to_mcore) + else: + self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict, + 'post_attention_layernorm.weight', to_mcore) return hf_state_dict def _set_layer_state(self, mg_layer, hf_state_dict, hf_prefix: str, layer_idx: int, to_mcore: bool): diff --git a/src/twinkle/model/megatron/model/gpts/qwen3_next.py b/src/twinkle/model/megatron/model/gpts/qwen3_next.py index 94c26470..7ae0c943 100644 --- a/src/twinkle/model/megatron/model/gpts/qwen3_next.py +++ b/src/twinkle/model/megatron/model/gpts/qwen3_next.py @@ -458,10 +458,14 @@ def get_qwen3_next_layer_spec(config, args, gated_delta_net_cls): elif layer_type == 'full_attention': layer_spec.submodules.self_attention.submodules.linear_qkv = TEColumnParallelLinear layer_spec.submodules.self_attention.module = Qwen3NextSelfAttention + # Replace ALL layernorms with Qwen3NextRMSNorm (Zero-Centered) layer_spec.submodules.input_layernorm = layer_norm_impl - if hasattr(layer_spec.submodules, - 'pre_mlp_layernorm') and layer_spec.submodules.pre_mlp_layernorm is not IdentityOp: + if hasattr(layer_spec.submodules, 'pre_mlp_layernorm'): 
layer_spec.submodules.pre_mlp_layernorm = layer_norm_impl + # qwen3.5 dense + if args.hf_model_type == 'qwen3_5': + layer_spec.submodules.mlp.submodules.linear_fc1 = TEColumnParallelLinear + # Replace qk_layernorm if present if hasattr(layer_spec.submodules.self_attention.submodules, 'q_layernorm'): layer_spec.submodules.self_attention.submodules.q_layernorm = layer_norm_impl if hasattr(layer_spec.submodules.self_attention.submodules, 'k_layernorm'): diff --git a/src/twinkle/model/megatron/model/mm_gpts/qwen3_5.py b/src/twinkle/model/megatron/model/mm_gpts/qwen3_5.py index f0dec64f..dbffd992 100644 --- a/src/twinkle/model/megatron/model/mm_gpts/qwen3_5.py +++ b/src/twinkle/model/megatron/model/mm_gpts/qwen3_5.py @@ -139,28 +139,36 @@ def _convert_mtp_extra(self, mtp_layer, hf_state_dict, to_mcore, origin_hf_state except ImportError: Qwen3_5MoeForConditionalGeneration = None -_auto_model_cls = Qwen3_5MoeForConditionalGeneration -if _auto_model_cls is None: - try: - from transformers import AutoModel - _auto_model_cls = AutoModel - except ImportError: - _auto_model_cls = None +try: + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration +except ImportError: + Qwen3_5ForConditionalGeneration = None class Qwen3_5MoeLoader(Qwen3NextLoader): gated_delta_net = Qwen3_5MoeGatedDeltaNet +register_megatron_model( + MegatronModelMeta( + MegatronModelType.qwen3_5_moe, + [ + ModelType.qwen3_5_moe, + ], + bridge_cls=Qwen3_5Bridge, + visual_cls=Qwen3_5Vit, + auto_model_cls=Qwen3_5MoeForConditionalGeneration, + loader=Qwen3_5MoeLoader, + )) + register_megatron_model( MegatronModelMeta( MegatronModelType.qwen3_5, [ ModelType.qwen3_5, - ModelType.qwen3_5_moe, ], bridge_cls=Qwen3_5Bridge, visual_cls=Qwen3_5Vit, - auto_model_cls=_auto_model_cls, + auto_model_cls=Qwen3_5ForConditionalGeneration, loader=Qwen3_5MoeLoader, )) diff --git a/src/twinkle/server/__main__.py b/src/twinkle/server/__main__.py index d6d30582..17ea2e1f 100644 --- 
a/src/twinkle/server/__main__.py +++ b/src/twinkle/server/__main__.py @@ -10,7 +10,7 @@ python -m twinkle.server --config server_config.yaml --server-type tinker # Quick start with minimal args - python -m twinkle.server --server-type tinker --port 8000 --model-id "Qwen/Qwen3-4B" + python -m twinkle.server --server-type tinker --port 8000 --model-id "Qwen/Qwen3.5-4B" """ from __future__ import annotations diff --git a/src/twinkle/server/twinkle/sampler.py b/src/twinkle/server/twinkle/sampler.py index 6efa5bd6..27ffd694 100644 --- a/src/twinkle/server/twinkle/sampler.py +++ b/src/twinkle/server/twinkle/sampler.py @@ -104,7 +104,7 @@ def build_sampler_app(model_id: str, """Build a sampler application for text generation inference. Args: - model_id: Model identifier (e.g., "Qwen/Qwen3-4B") + model_id: Model identifier (e.g., "Qwen/Qwen3.5-4B") nproc_per_node: Number of GPU processes per node device_group: Device group configuration dict device_mesh: Device mesh configuration dict for parallelism