diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml index 0889e57ffdd..e9e359e9ce2 100644 --- a/.github/workflows/docker/docker-compose.yaml +++ b/.github/workflows/docker/docker-compose.yaml @@ -1,6 +1,6 @@ services: trinity-node-1: - image: trinity-rft-unittest:20260205 + image: trinity-rft-unittest:20260211 cap_add: - SYS_PTRACE pull_policy: never @@ -15,6 +15,7 @@ services: - TRINITY_MODEL_PATH=/mnt/models/Qwen3-0.6B - TRINITY_API_MODEL_PATH=/mnt/models/Qwen3-1.7B - TRINITY_VLM_MODEL_PATH=/mnt/models/Qwen2.5-VL-3B + - TRINITY_ALTERNATIVE_VLM_MODEL_PATH=/mnt/models/Qwen3-VL-2B-Instruct - VIRTUAL_ENV=/opt/venv working_dir: /workspace networks: @@ -32,7 +33,7 @@ services: capabilities: [gpu] trinity-node-2: - image: trinity-rft-unittest:20260205 + image: trinity-rft-unittest:20260211 cap_add: - SYS_PTRACE pull_policy: never diff --git a/examples/grpo_vlm/README.md b/examples/grpo_vlm/README.md index 5ab12488a61..3435258bc94 100644 --- a/examples/grpo_vlm/README.md +++ b/examples/grpo_vlm/README.md @@ -8,8 +8,8 @@ This example shows the usage of GRPO with Qwen2.5-VL-3B-Instruct on the [geometr The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 # Qwen3 VL requires vllm>=0.11.0; it is recommended to use version >= 0.13.0 +transformers>=4.54.0 qwen_vl_utils ``` @@ -18,3 +18,11 @@ For other detailed information, please refer to the [documentation](../../docs/s The config file is located in [`vlm.yaml`](vlm.yaml), and the curve is shown below. ![vlm](../../docs/sphinx_doc/assets/geometry3k_qwen25_vl_3b_reward.png) + +## Supported Model Architectures + +The following vision-language model series are currently supported: + +1. Qwen2.5-VL series +2. Qwen3-VL series +3. Kimi-VL-A3B-Thinking series diff --git a/examples/mix_vlm/README.md b/examples/mix_vlm/README.md index 5a2c8752de0..0ee57a225fe 100644 --- a/examples/mix_vlm/README.md +++ b/examples/mix_vlm/README.md @@ -8,8 +8,8 @@ This is an example of using the [MIX](../../docs/sphinx_doc/source/tutorial/exam The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 # Qwen3 VL requires vllm>=0.11.0; it is recommended to use version >= 0.13.0 +transformers>=4.54.0 qwen_vl_utils ``` @@ -34,3 +34,11 @@ trinity run --config examples/mix_vlm/mix_vlm.yaml The reward curve is shown below: ![](../../docs/sphinx_doc/assets/mix_vlm_reward.png) + +## Supported Model Architectures + +The following vision-language model series are currently supported: + +1. Qwen2.5-VL series +2. Qwen3-VL series +3. 
Kimi-VL-A3B-Thinking series diff --git a/pyproject.toml b/pyproject.toml index 89f8a7ec997..812f14e6fd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ megatron = [ # "mbridge @ git+https://github.com/ISEEKYAN/mbridge.git@20e9ffbbe72ae7b1df83bfe1bc3c11f7382f2612", ] tinker = [ - "tinker; python_version >= '3.11'", + "tinker>=0.10.0; python_version >= '3.11'", ] doc = [ @@ -103,6 +103,8 @@ doc = [ mm = [ "qwen-vl-utils", + "transformers>=4.54.0", + "blobfile", ] flash_attn = [ @@ -143,6 +145,9 @@ known_third_party = ["wandb"] [tool.uv.extra-build-dependencies] flash-attn = ["torch", "numpy"] +[project.entry-points."vllm.general_plugins"] +vllm_patch = "trinity.common.models.vllm_patch:vllm_patch" + [project.urls] "Homepage" = "https://github.com/agentscope-ai/Trinity-RFT" "Documentation" = "https://agentscope-ai.github.io/Trinity-RFT/" diff --git a/tests/template/data/gsm8k/test.jsonl b/tests/template/data/gsm8k/test.jsonl new file mode 100644 index 00000000000..8c07215ebf4 --- /dev/null +++ b/tests/template/data/gsm8k/test.jsonl @@ -0,0 +1,2 @@ +{"question": "Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", "answer": "Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\n#### 18"} +{"question": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?", "answer": "It takes 2/2=<<2/2=1>>1 bolt of white fiber\nSo the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric\n#### 3"} diff --git a/tests/tools.py b/tests/tools.py index 33cf4952089..bdfeb4100e8 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -24,6 +24,7 @@ API_MODEL_PATH_ENV_VAR = "TRINITY_API_MODEL_PATH" VLM_MODEL_PATH_ENV_VAR = "TRINITY_VLM_MODEL_PATH" +ALTERNATIVE_VLM_MODEL_PATH_ENV_VAR = "TRINITY_ALTERNATIVE_VLM_MODEL_PATH" SFT_DATASET_PATH_ENV_VAR = "TRINITY_SFT_DATASET_PATH" @@ -134,6 +135,15 @@ def get_vision_language_model_path() -> str: return path +def get_alternative_vision_language_model_path() -> str: + path = os.environ.get(ALTERNATIVE_VLM_MODEL_PATH_ENV_VAR) + if not path: + raise EnvironmentError( + f"Please set `export {ALTERNATIVE_VLM_MODEL_PATH_ENV_VAR}=` before running this test." 
+ ) + return path + + def get_lora_config() -> LoRAConfig: return LoRAConfig(name="lora", lora_rank=16, lora_alpha=16) @@ -248,6 +258,20 @@ def get_unittest_dataset_config(dataset_name: str = "countdown", split: str = "t default_workflow_type="simple_mm_workflow", default_reward_fn_type="math_boxed_reward", ) + elif dataset_name == "geometry_sft": + # Multi-modal geometry dataset for sft with 8 samples + return ExperienceBufferConfig( + name=dataset_name, + path=os.path.join(os.path.dirname(__file__), "template", "data", "geometry"), + split="train", + storage_type=StorageType.FILE.value, + format=FormatConfig( + prompt_type=PromptType.PLAINTEXT, + prompt_key="problem", + response_key="answer", + image_key="images", + ), + ) else: raise ValueError(f"Unknown dataset name: {dataset_name}") diff --git a/tests/trainer/trainer_test.py b/tests/trainer/trainer_test.py index 95a689f5bed..5a0560a0a56 100644 --- a/tests/trainer/trainer_test.py +++ b/tests/trainer/trainer_test.py @@ -21,6 +21,7 @@ RayUnittestBase, RayUnittestBaseAsync, TensorBoardParser, + get_alternative_vision_language_model_path, get_checkpoint_path, get_lora_config, get_model_path, @@ -350,7 +351,7 @@ def test_trainer(self, mock_load): mock_load.return_value = deepcopy(self.config) with self.assertRaises(Exception): - run(config_path="dummy.yaml") + run(config="dummy.yaml") ray.shutdown(_exiting_interpreter=True) stage_configs = [cfg.check_and_update() for cfg in deepcopy(self.config)] @@ -375,7 +376,7 @@ def test_trainer(self, mock_load): self.config.stages[1].buffer.explorer_input.taskset.path = old_taskset_path mock_load.return_value = deepcopy(self.config) ray.init(ignore_reinit_error=True, namespace=self.config.ray_namespace) - run(config_path="dummy.yaml") + run(config="dummy.yaml") # grpo stage grpo_config = stage_configs[1] @@ -1205,13 +1206,12 @@ def tearDown(self): class TestMultiModalGRPO(BaseTrainerCase): - @unittest.skip("Require specific vllm/transformers version") def test_trainer(self): """Test both mode with multi-modal data.""" self.config.buffer.explorer_input.taskset = get_unittest_dataset_config( "geometry" ) # Total 8 tasks - self.config.model.model_path = get_vision_language_model_path() + self.config.model.model_path = get_alternative_vision_language_model_path() self.config.algorithm.algorithm_type = "grpo" self.config.algorithm.advantage_fn = "grpo" self.config.algorithm.kl_loss_fn = "none" @@ -1246,12 +1246,11 @@ def tearDown(self): class TestMultiModalSFT(BaseTrainerCase): - @unittest.skip("Require specific vllm/transformers version") def test_trainer(self): """Test SFT mode with multi-modal data.""" self.config.mode = "train" self.config.buffer.trainer_input.experience_buffer = get_unittest_dataset_config( - "geometry" + "geometry_sft" ) # Total 8 tasks self.config.model.model_path = get_vision_language_model_path() self.config.algorithm.algorithm_type = "sft" @@ -1522,7 +1521,6 @@ def tearDown(self): shutil.rmtree(self.config.checkpoint_job_dir, ignore_errors=True) -@unittest.skip("Require agentscope >= 1.0.12") class AgentScopeTunerTest(unittest.IsolatedAsyncioTestCase): def setUp(self) -> None: ray.init(ignore_reinit_error=True) @@ -1622,7 +1620,7 @@ async def judge_func( model_path=get_model_path(), max_model_len=8192, max_tokens=2048, - inference_engine_num=2, + inference_engine_num=1, ) } diff --git a/trinity/algorithm/policy_loss_fn/chord_policy_loss.py b/trinity/algorithm/policy_loss_fn/chord_policy_loss.py index 6653b4d2258..4f42df4ecff 100644 --- 
a/trinity/algorithm/policy_loss_fn/chord_policy_loss.py +++ b/trinity/algorithm/policy_loss_fn/chord_policy_loss.py @@ -199,35 +199,27 @@ def __call__( # type: ignore per_micro_batch_weight_usual = self.gradient_accumulation / self.train_batch_size_usual # type: ignore per_micro_batch_weight_expert = self.gradient_accumulation / self.train_batch_size_expert # type: ignore - if n_usual_exp > 0: - grpo_loss, grpo_metrics = self.grpo_loss_fn( - logprob[~expert_mask], - old_logprob[~expert_mask], - action_mask[~expert_mask], - advantages[~expert_mask], - **kwargs, - ) - grpo_loss = grpo_loss * n_usual_exp * per_micro_batch_weight_usual - grpo_metrics = { - k: v * n_usual_exp * per_micro_batch_weight_usual for k, v in grpo_metrics.items() - } - else: - grpo_loss = torch.tensor(0.0, device=logprob.device) - grpo_metrics = {} + grpo_loss, grpo_metrics = self.grpo_loss_fn( + logprob[~expert_mask], + old_logprob[~expert_mask], + action_mask[~expert_mask], + advantages[~expert_mask], + **kwargs, + ) + grpo_loss = grpo_loss * n_usual_exp * per_micro_batch_weight_usual + grpo_metrics = { + k: v * n_usual_exp * per_micro_batch_weight_usual for k, v in grpo_metrics.items() + } # SFT Loss (expert) - if n_expert_exp > 0: - sft_loss, sft_metrics = self.sft_loss_fn( - logprob[expert_mask], - action_mask[expert_mask], - ) - sft_loss = sft_loss * n_expert_exp * per_micro_batch_weight_expert - sft_metrics = { - k: v * n_expert_exp * per_micro_batch_weight_expert for k, v in sft_metrics.items() - } - else: - sft_loss = torch.tensor(0.0, device=logprob.device) - sft_metrics = {} + sft_loss, sft_metrics = self.sft_loss_fn( + logprob[expert_mask], + action_mask[expert_mask], + ) + sft_loss = sft_loss * n_expert_exp * per_micro_batch_weight_expert + sft_metrics = { + k: v * n_expert_exp * per_micro_batch_weight_expert for k, v in sft_metrics.items() + } mu = mu_schedule_function( current_step, self.mu_warmup_steps, self.mu_decay_steps, self.mu_peak, self.mu_valley diff --git a/trinity/buffer/schema/formatter.py b/trinity/buffer/schema/formatter.py index 039074521b5..f90dd29aaba 100644 --- a/trinity/buffer/schema/formatter.py +++ b/trinity/buffer/schema/formatter.py @@ -109,7 +109,7 @@ def __init__(self, tokenizer_path: str, format_config: FormatConfig): else: self.processor = None self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path) - self.chat_template = format_config.chat_template or self.tokenizer.chat_template + self.chat_template = format_config.chat_template # For messages type if self.prompt_type == PromptType.MESSAGES: self.messages_key = format_config.messages_key @@ -129,7 +129,6 @@ def _messages_to_experience( self, messages: List[Dict], tools: Optional[List[Dict] | str] = None, - mm_data: Optional[Dict] = None, ) -> Experience: """Convert messages and tools into an Experience object. 
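Editor's note, for orientation before the formatter hunk that follows: the sketch below illustrates, outside the `ExperienceFormatter` class, the multi-modal flow that the new `image_key`/`video_key` branch implements. `build_multi_modal_data` and `build_mm_input_for_training` are named in this PR; their exact signatures, the processor checkpoint, and the sample message layout are assumptions, not the project's actual code.

```python
# Minimal sketch (not the project's code) of the multi-modal path added to
# `_messages_to_experience` in the hunk below. Helper names come from this PR;
# signatures and the message layout are assumptions.
from transformers import AutoProcessor

from trinity.common.models.mm_utils import (
    build_mm_input_for_training,
    build_multi_modal_data,
)

processor = AutoProcessor.from_pretrained("/mnt/models/Qwen2.5-VL-3B")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "geometry.png"},  # hypothetical sample image
            {"type": "text", "text": "What is the measure of the marked angle?"},
        ],
    },
    {"role": "assistant", "content": "The marked angle measures 42 degrees."},
]

# Render the full conversation and the prompt-only prefix as text.
full_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt = processor.apply_chat_template(messages[:-1], tokenize=False, add_generation_prompt=True)

# Extract images/videos once, then tokenize both renderings against the same media.
mm_data = build_multi_modal_data(processor, messages)
full_inputs = build_mm_input_for_training(processor, full_text, mm_data)
prompt_inputs = build_mm_input_for_training(processor, prompt, mm_data)

tokens = full_inputs.pop("input_ids")[0]
prompt_length = len(prompt_inputs["input_ids"][0])  # response tokens start at this offset
```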
@@ -170,89 +169,63 @@ def _messages_to_experience( prompt_length=prompt_length, messages=messages, ) - if mm_data: - return self.convert_mm_data_to_experiences(messages=messages, mm_data=mm_data) - token_ids = self.tokenizer.apply_chat_template( - messages, - tools=tools, - add_generation_prompt=False, - return_tensors="pt", - chat_template=self.chat_template, - )[0] - prompt_tokens_ids = self.tokenizer.apply_chat_template( - messages[:-1], - tools=tools, - add_generation_prompt=True, - return_tensors="pt", - chat_template=self.chat_template, - )[0] - return Experience( - tokens=token_ids, - prompt_length=len(prompt_tokens_ids), - messages=messages, - ) - - def load_mm_data(self, sample: Dict) -> Dict: - """Load multi-modal data such as images or videos. - - NOTE: You can override this method for custom data loading. - - Args: - sample (Dict): The raw sample dictionary containing multi-modal data. - - Returns: - Dict: A dictionary containing multi-modal data. Specifically, it may contain: - - images: A list of `PIL.Image.Image` if `self.image_key` is set - - videos: A list of `numpy.ndarray` if `self.video_key` is set - """ - from verl.utils.dataset.vision_utils import process_image, process_video - - mm_data = {} - if self.image_key: - mm_data["images"] = [process_image(img) for img in sample[self.image_key]] - if self.video_key: - mm_data["videos"] = [process_video(vid).numpy() for vid in sample[self.video_key]] - return mm_data - - def convert_mm_data_to_experiences( - self, - messages: List[Dict], - mm_data: Dict, - ) -> Experience: - from trinity.common.models.mm_utils import ( - build_multi_modal_inputs, - convert_messages_to_mm_format, - ) + if self.image_key or self.video_key: + from trinity.common.models.mm_utils import ( + build_mm_input_for_training, + build_multi_modal_data, + ) - messages = convert_messages_to_mm_format(messages) - sequence: str = self.processor.apply_chat_template( - messages, - add_generation_prompt=False, - chat_template=self.chat_template, - ) - prompt: str = self.processor.apply_chat_template( - messages[:-1], - add_generation_prompt=True, - chat_template=self.chat_template, - ) - sequence_data = build_multi_modal_inputs( - prompt=sequence, - images=mm_data.get("images", None), - videos=mm_data.get("videos", None), - processor=self.processor, - ) - prompt_data = build_multi_modal_inputs( - prompt=prompt, - images=mm_data.get("images", None), - videos=mm_data.get("videos", None), - processor=self.processor, - ) - return Experience( - tokens=sequence_data["prompt_token_ids"], - prompt_length=len(prompt_data["prompt_token_ids"]), - messages=messages, - multi_modal_inputs=sequence_data["multi_modal_inputs"], - ) + full_text = self.processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + chat_template=self.chat_template, + ) + prompt = self.processor.apply_chat_template( + messages[:-1], + tokenize=False, + add_generation_prompt=True, + chat_template=self.chat_template, + ) + multi_modal_data = build_multi_modal_data(self.processor, messages) + full_text_inputs = build_mm_input_for_training( + self.processor, + full_text, + multi_modal_data, + ) + tokens = full_text_inputs.pop("input_ids")[0] + full_text_inputs.pop("attention_mask", None) + prompt_text_inputs = build_mm_input_for_training( + self.processor, + prompt, + multi_modal_data, + ) + return Experience( + tokens=tokens, + prompt_length=len(prompt_text_inputs["input_ids"][0]), + messages=messages, + multi_modal_inputs=full_text_inputs, + ) + else: + token_ids = 
self.tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=False, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + prompt_tokens_ids = self.tokenizer.apply_chat_template( + messages[:-1], + tools=tools, + add_generation_prompt=True, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + return Experience( + tokens=token_ids, + prompt_length=len(prompt_tokens_ids), + messages=messages, + ) def format(self, sample: Dict) -> Experience: if self.prompt_type == PromptType.MESSAGES: @@ -274,13 +247,18 @@ def format(self, sample: Dict) -> Experience: elif self.system_prompt is not None: system_message = {"role": "system", "content": self.system_prompt} messages.append(system_message) - messages.append({"role": "user", "content": sample[self.prompt_key]}) + prompt = sample[self.prompt_key] + images = sample[self.image_key] if self.image_key else [] + videos = sample[self.video_key] if self.video_key else [] + + from trinity.common.models.mm_utils import build_mm_message + + messages.append(build_mm_message(prompt, images, videos)) messages.append({"role": "assistant", "content": sample[self.response_key]}) else: raise ValueError(f"Unsupported prompt_type: {self.prompt_type}") tools = sample.get(self.tools_key, None) - mm_data = self.load_mm_data(sample) if self.image_key or self.video_key else None - return self._messages_to_experience(messages, tools, mm_data) + return self._messages_to_experience(messages, tools) class DPOFormatter(ExperienceFormatter): diff --git a/trinity/common/config.py b/trinity/common/config.py index c392bb60dc8..7f4d6edb65c 100644 --- a/trinity/common/config.py +++ b/trinity/common/config.py @@ -444,6 +444,7 @@ class TinkerConfig: class ModelConfig: # source model path model_path: str = "" + trust_remote_code: bool = False critic_model_path: str = "" custom_chat_template: Optional[str] = None @@ -493,6 +494,7 @@ class InferenceModelConfig: # ! DO NOT SET in explorer.rollout_model, automatically set from config.model.model_path model_path: Optional[str] = None name: Optional[str] = None + trust_remote_code: bool = False engine_type: str = "vllm" engine_num: int = 1 @@ -663,7 +665,6 @@ class BufferConfig: # ! 
DO NOT SET FOLLOWING FIELDS explorer_output: Optional[StorageConfig] = None # automatically set tokenizer_path: Optional[str] = None # automatically set - pad_token_id: Optional[int] = None # automatically set cache_dir: Optional[str] = None # automatically set diff --git a/trinity/common/config_validator.py b/trinity/common/config_validator.py index fdad4d5856a..69520fe56dc 100644 --- a/trinity/common/config_validator.py +++ b/trinity/common/config_validator.py @@ -16,6 +16,7 @@ set_if_none, ) from trinity.common.constants import StorageType, SyncMethod, SyncStyle +from trinity.common.patch import kimi_vl_monkey_patch_decorator from trinity.utils.log import get_logger from trinity.utils.lora_utils import create_dummy_lora @@ -595,7 +596,7 @@ def validate(self, config: Config) -> None: model_args = rollout_args + length_args + rope_args # rollout model - for args in model_args + ["model_path"]: + for args in model_args + ["model_path", "trust_remote_code"]: set_if_none(config.explorer.rollout_model, args, getattr(config.model, args)) set_if_none( config.explorer.rollout_model, "chat_template", config.model.custom_chat_template @@ -874,26 +875,6 @@ def validate(self, config: Config) -> None: f"your checkpoint directory: {config.checkpoint_job_dir}" ) from e - # set pad_token_id / tokenizer_path - if config.buffer.pad_token_id is None: - from transformers import AutoTokenizer - - try: - tokenizer = AutoTokenizer.from_pretrained(config.model.model_path) - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id - self.logger.warning( - f"tokenizer.pad_token_id is None. Now set to {tokenizer.eos_token_id}", - stacklevel=1, - ) - config.buffer.pad_token_id = tokenizer.pad_token_id - - except Exception: - self.logger.warning( - f"Failed to get pad token id from model {config.model.model_path}" - ) - config.buffer.pad_token_id = 0 - self._check_explorer_input(config) self._check_trainer_input(config) self._check_data_processor(config) @@ -1266,7 +1247,10 @@ def validate_trainer_memory_usage(self, config: Config) -> None: else: self.logger.info("GPU memory check skipped for non-FSDP strategies.") - def _get_model_params_num_and_config(self, model_path: str) -> Tuple[int, Any]: + @kimi_vl_monkey_patch_decorator + def _get_model_params_num_and_config( + self, model_path: str, trust_remote_code: bool + ) -> Tuple[int, Any]: """Load model configuration and estimate total parameter count without loading weights. Uses `accelerate.init_empty_weights()` to avoid GPU memory allocation during inspection. 
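Editor's note: a standalone sketch of the empty-weights inspection pattern described in the docstring above; the hunk that follows shows how this PR threads `trust_remote_code` through it. The model path and flag value are placeholders.

```python
# Count parameters without allocating real weights (they live on the "meta" device).
import torch
import transformers
from accelerate import init_empty_weights

model_path = "/mnt/models/Qwen3-VL-2B-Instruct"  # placeholder: any local HF checkpoint
trust_remote_code = False  # enable for checkpoints that ship custom modeling code

model_config = transformers.AutoConfig.from_pretrained(
    model_path, trust_remote_code=trust_remote_code
)
with init_empty_weights():
    model = transformers.AutoModel.from_config(
        model_config, trust_remote_code=trust_remote_code, dtype=torch.bfloat16
    )
print(f"{model.num_parameters():,} parameters")
```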
@@ -1286,9 +1270,13 @@ def _get_model_params_num_and_config(self, model_path: str) -> Tuple[int, Any]: import transformers from accelerate import init_empty_weights - model_config = transformers.AutoConfig.from_pretrained(model_path) + model_config = transformers.AutoConfig.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) with init_empty_weights(): - model = transformers.AutoModel.from_config(model_config, torch_dtype=torch.bfloat16) + model = transformers.AutoModel.from_config( + model_config, trust_remote_code=trust_remote_code, dtype=torch.bfloat16 + ) params_num = model.num_parameters() assert params_num > 0, f"No parameters found in the model at path: {model_path}" return params_num, model_config @@ -1382,7 +1370,9 @@ def fsdp_memory_check(self, config: Config) -> None: try: model_path = config.model.model_path - params_num, hf_config = self._get_model_params_num_and_config(model_path) + params_num, hf_config = self._get_model_params_num_and_config( + model_path, config.model.trust_remote_code + ) verl_config: veRLConfig = config.trainer.trainer_config world_size = config.cluster.trainer_gpu_num @@ -1407,7 +1397,7 @@ def fsdp_memory_check(self, config: Config) -> None: critic_hf_config = hf_config else: critic_params_num, critic_hf_config = self._get_model_params_num_and_config( - config.model.critic_model_path + config.model.critic_model_path, config.model.trust_remote_code ) ( @@ -1547,6 +1537,10 @@ def _check_max_memory_in_fsdp_training( params_memory (float): Estimated parameter + optimizer memory (bytes). optim_step_memory (float): Estimated optimizer step memory (bytes). """ + is_vl_model = False + if "VL" in hf_config.__class__.__name__: + hf_config = hf_config.text_config + is_vl_model = True max_activation_memory = self._calc_fsdp_activation_memory( hf_config, num_tokens, logits_memory_type, dtype_coeff ) @@ -1557,6 +1551,12 @@ def _check_max_memory_in_fsdp_training( optim_step_mb = optim_step_memory / (1024**2) gpu_capacity_mb = self.memory_capacity / (1024**2) + if is_vl_model: + self.logger.info( + "Note: This is a vision-language (VL) model. " + "The memory estimate below only covers the text encoder portion. " + "Actual GPU memory usage will be higher due to the vision components." + ) self.logger.info( f"Estimated GPU memory usage for {module_name} model '{model_path}': " f"{total_mb:.2f} MB ({params_mb:.2f} MB params + " diff --git a/trinity/common/models/mm_utils.py b/trinity/common/models/mm_utils.py index e850f190d04..fe012e8d50b 100644 --- a/trinity/common/models/mm_utils.py +++ b/trinity/common/models/mm_utils.py @@ -1,73 +1,218 @@ -""""Multi-modal utilities for processing and handling multi-modal data such as images and videos. -Only support Qwen2.5 VL series. +"""Utilities for processing multi-modal data (images/videos) for specific vision-language models. -Modified from: verl/utils/dataset/rl_dataset.py +Supported models: +- Qwen2.5-VL, Qwen3-VL series +- Kimi VL series + +Provides functions to: +1. Parse prompts with media tags (/
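Editor's note on the `vllm.general_plugins` entry point registered in `pyproject.toml` above: recent vLLM releases discover callables in this entry-point group and invoke them without arguments during engine and worker startup. The body of `trinity/common/models/vllm_patch.py` is not part of this diff, so the sketch below is illustrative only.

```python
# Illustrative only: the real `trinity.common.models.vllm_patch:vllm_patch` is not
# shown in this diff. A general plugin is a zero-argument callable that vLLM may run
# in several processes, so it should be cheap and idempotent.
_applied = False


def vllm_patch() -> None:
    global _applied
    if _applied:  # guard against being invoked more than once per process
        return
    _applied = True
    # Apply the monkey patches needed for the model families this PR enables
    # (e.g. multi-modal input handling for Qwen3-VL / Kimi-VL).
    ...
```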