From 663d69123f7312b149c81e67ee53b16966998746 Mon Sep 17 00:00:00 2001 From: chenyushuo <297086016@qq.com> Date: Mon, 9 Feb 2026 12:15:09 +0800 Subject: [PATCH 01/10] 1. Modified the tokenizer in `TinkerModel` to directly retrieve it from the sample client. 2. Updated the `SFTFormatter`, `vLLMRolloutModel`, `SimpleMMWorkflow`, `mm_utils.py`, `verl_trainer.py` and `trainer/verl/utils.py` to enhance multimodal processing capabilities, enabling compatibility with Qwen-series vision-language models. --- examples/grpo_vlm/README.md | 4 +- examples/mix_vlm/README.md | 4 +- pyproject.toml | 3 +- trinity/buffer/schema/formatter.py | 152 ++++++++---------- trinity/common/models/mm_utils.py | 82 +++++++++- trinity/common/models/model.py | 5 + trinity/common/models/tinker_model.py | 5 +- trinity/common/models/vllm_model.py | 65 +++++--- .../common/workflows/simple_mm_workflow.py | 39 ++--- trinity/trainer/verl/utils.py | 43 ++++- trinity/trainer/verl_trainer.py | 34 +--- 11 files changed, 258 insertions(+), 178 deletions(-) diff --git a/examples/grpo_vlm/README.md b/examples/grpo_vlm/README.md index 5ab12488a61..8283e18910d 100644 --- a/examples/grpo_vlm/README.md +++ b/examples/grpo_vlm/README.md @@ -8,8 +8,8 @@ This example shows the usage of GRPO with Qwen2.5-VL-3B-Instruct on the [geometr The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 +transformers>=4.54.0 qwen_vl_utils ``` diff --git a/examples/mix_vlm/README.md b/examples/mix_vlm/README.md index 5a2c8752de0..124a3663aea 100644 --- a/examples/mix_vlm/README.md +++ b/examples/mix_vlm/README.md @@ -8,8 +8,8 @@ This is an example of using the [MIX](../../docs/sphinx_doc/source/tutorial/exam The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 +transformers>=4.54.0 qwen_vl_utils ``` diff --git a/pyproject.toml b/pyproject.toml index b3d2f237c8d..96e55f3ff66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,7 @@ megatron = [ # "mbridge @ git+https://github.com/ISEEKYAN/mbridge.git@20e9ffbbe72ae7b1df83bfe1bc3c11f7382f2612", ] tinker = [ - "tinker; python_version >= '3.11'", + "tinker>=0.10.0; python_version >= '3.11'", ] doc = [ @@ -101,6 +101,7 @@ doc = [ mm = [ "qwen-vl-utils", + "transformers>=4.54.0", ] flash_attn = [ diff --git a/trinity/buffer/schema/formatter.py b/trinity/buffer/schema/formatter.py index 039074521b5..bf96c685f0f 100644 --- a/trinity/buffer/schema/formatter.py +++ b/trinity/buffer/schema/formatter.py @@ -109,7 +109,7 @@ def __init__(self, tokenizer_path: str, format_config: FormatConfig): else: self.processor = None self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path) - self.chat_template = format_config.chat_template or self.tokenizer.chat_template + self.chat_template = format_config.chat_template # For messages type if self.prompt_type == PromptType.MESSAGES: self.messages_key = format_config.messages_key @@ -129,7 +129,6 @@ def _messages_to_experience( self, messages: List[Dict], tools: Optional[List[Dict] | str] = None, - mm_data: Optional[Dict] = None, ) -> Experience: """Convert messages and tools into an Experience object. 
@@ -170,89 +169,63 @@ def _messages_to_experience(
             prompt_length=prompt_length,
             messages=messages,
         )
-        if mm_data:
-            return self.convert_mm_data_to_experiences(messages=messages, mm_data=mm_data)
-        token_ids = self.tokenizer.apply_chat_template(
-            messages,
-            tools=tools,
-            add_generation_prompt=False,
-            return_tensors="pt",
-            chat_template=self.chat_template,
-        )[0]
-        prompt_tokens_ids = self.tokenizer.apply_chat_template(
-            messages[:-1],
-            tools=tools,
-            add_generation_prompt=True,
-            return_tensors="pt",
-            chat_template=self.chat_template,
-        )[0]
-        return Experience(
-            tokens=token_ids,
-            prompt_length=len(prompt_tokens_ids),
-            messages=messages,
-        )
-
-    def load_mm_data(self, sample: Dict) -> Dict:
-        """Load multi-modal data such as images or videos.
-
-        NOTE: You can override this method for custom data loading.
-
-        Args:
-            sample (Dict): The raw sample dictionary containing multi-modal data.
-
-        Returns:
-            Dict: A dictionary containing multi-modal data. Specifically, it may contain:
-                - images: A list of `PIL.Image.Image` if `self.image_key` is set
-                - videos: A list of `numpy.ndarray` if `self.video_key` is set
-        """
-        from verl.utils.dataset.vision_utils import process_image, process_video
-
-        mm_data = {}
-        if self.image_key:
-            mm_data["images"] = [process_image(img) for img in sample[self.image_key]]
-        if self.video_key:
-            mm_data["videos"] = [process_video(vid).numpy() for vid in sample[self.video_key]]
-        return mm_data
-
-    def convert_mm_data_to_experiences(
-        self,
-        messages: List[Dict],
-        mm_data: Dict,
-    ) -> Experience:
-        from trinity.common.models.mm_utils import (
-            build_multi_modal_inputs,
-            convert_messages_to_mm_format,
-        )
+        if self.image_key or self.video_key:
+            from trinity.common.models.mm_utils import (
+                build_mm_input_for_training,
+                build_multi_modal_data,
+            )
 
-        messages = convert_messages_to_mm_format(messages)
-        sequence: str = self.processor.apply_chat_template(
-            messages,
-            add_generation_prompt=False,
-            chat_template=self.chat_template,
-        )
-        prompt: str = self.processor.apply_chat_template(
-            messages[:-1],
-            add_generation_prompt=True,
-            chat_template=self.chat_template,
-        )
-        sequence_data = build_multi_modal_inputs(
-            prompt=sequence,
-            images=mm_data.get("images", None),
-            videos=mm_data.get("videos", None),
-            processor=self.processor,
-        )
-        prompt_data = build_multi_modal_inputs(
-            prompt=prompt,
-            images=mm_data.get("images", None),
-            videos=mm_data.get("videos", None),
-            processor=self.processor,
-        )
-        return Experience(
-            tokens=sequence_data["prompt_token_ids"],
-            prompt_length=len(prompt_data["prompt_token_ids"]),
-            messages=messages,
-            multi_modal_inputs=sequence_data["multi_modal_inputs"],
-        )
+            full_text = self.processor.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=False,
+                chat_template=self.chat_template,
+            )
+            prompt = self.processor.apply_chat_template(
+                messages[:-1],
+                tokenize=False,
+                add_generation_prompt=True,
+                chat_template=self.chat_template,
+            )
+            multi_modal_data = build_multi_modal_data(self.processor, messages)
+            full_text_inputs = build_mm_input_for_training(
+                self.processor,
+                full_text,
+                multi_modal_data,
+            )
+            tokens = full_text_inputs.pop("input_ids")[0]
+            full_text_inputs.pop("attention_mask")
+            prompt_text_inputs = build_mm_input_for_training(
+                self.processor,
+                prompt,
+                multi_modal_data,
+            )
+            return Experience(
+                tokens=tokens,
+                prompt_length=len(prompt_text_inputs["input_ids"][0]),
+                messages=messages,
+                multi_modal_inputs=full_text_inputs,
+            )
+        else:
+            token_ids = 
self.tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=False, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + prompt_tokens_ids = self.tokenizer.apply_chat_template( + messages[:-1], + tools=tools, + add_generation_prompt=True, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + return Experience( + tokens=token_ids, + prompt_length=len(prompt_tokens_ids), + messages=messages, + ) def format(self, sample: Dict) -> Experience: if self.prompt_type == PromptType.MESSAGES: @@ -274,13 +247,18 @@ def format(self, sample: Dict) -> Experience: elif self.system_prompt is not None: system_message = {"role": "system", "content": self.system_prompt} messages.append(system_message) - messages.append({"role": "user", "content": sample[self.prompt_key]}) + prompt = sample[self.prompt_key] + images = sample[self.image_key] if self.image_key else [] + videos = sample[self.video_key] if self.video_key else [] + + from trinity.common.models.mm_utils import build_mm_message + + messages.append(build_mm_message(prompt, images, videos)) messages.append({"role": "assistant", "content": sample[self.response_key]}) else: raise ValueError(f"Unsupported prompt_type: {self.prompt_type}") tools = sample.get(self.tools_key, None) - mm_data = self.load_mm_data(sample) if self.image_key or self.video_key else None - return self._messages_to_experience(messages, tools, mm_data) + return self._messages_to_experience(messages, tools) class DPOFormatter(ExperienceFormatter): diff --git a/trinity/common/models/mm_utils.py b/trinity/common/models/mm_utils.py index e850f190d04..50d24dd3e68 100644 --- a/trinity/common/models/mm_utils.py +++ b/trinity/common/models/mm_utils.py @@ -1,5 +1,5 @@ """"Multi-modal utilities for processing and handling multi-modal data such as images and videos. -Only support Qwen2.5 VL series. +Only support Qwen2.5/3 VL series. Modified from: verl/utils/dataset/rl_dataset.py """ @@ -9,7 +9,86 @@ import numpy as np from PIL import Image +from trinity.utils.annotations import Deprecated + +def build_multi_modal_data( + processor: Any, + messages: List[Dict], +) -> Dict[str, Any]: + """ + Preprocess multi-modal data and build multi-modal inputs + """ + processor_class_name = processor.__class__.__name__ + if "Qwen" in processor_class_name and "VLProcessor" in processor_class_name: + from qwen_vl_utils import process_vision_info + + image_inputs, video_inputs = process_vision_info(messages) + multi_modal_data = {} + if image_inputs: + multi_modal_data["image"] = image_inputs + if video_inputs: + multi_modal_data["video"] = video_inputs + + return multi_modal_data + raise NotImplementedError(f"{processor_class_name} not supported") + + +def build_mm_input_for_training(processor: Any, prompt: str, multi_modal_data: Dict) -> Dict: + processor_class_name = processor.__class__.__name__ + if "Qwen" in processor_class_name and "VLProcessor" in processor_class_name: + inputs = processor( + text=[prompt], + images=multi_modal_data.get("image", None), + videos=multi_modal_data.get("video", None), + padding=True, + return_tensors="pt", + ) + return dict(inputs) + raise NotImplementedError(f"{processor_class_name} not supported") + + +def build_mm_message(prompt: str, images: List, videos: List): + content_list = [] + segments = re.split("(|