From 663d69123f7312b149c81e67ee53b16966998746 Mon Sep 17 00:00:00 2001 From: chenyushuo <297086016@qq.com> Date: Mon, 9 Feb 2026 12:15:09 +0800 Subject: [PATCH 01/10] 1. Modified the tokenizer in `TinkerModel` to directly retrieve it from the sample client. 2. Updated the `SFTFormatter`, `vLLMRolloutModel`, `SimpleMMWorkflow`, `mm_utils.py`, `verl_trainer.py` and `trainer/verl/utils.py` to enhance multimodal processing capabilities, enabling compatibility with Qwen-series vision-language models. --- examples/grpo_vlm/README.md | 4 +- examples/mix_vlm/README.md | 4 +- pyproject.toml | 3 +- trinity/buffer/schema/formatter.py | 152 ++++++++---------- trinity/common/models/mm_utils.py | 82 +++++++++- trinity/common/models/model.py | 5 + trinity/common/models/tinker_model.py | 5 +- trinity/common/models/vllm_model.py | 65 +++++--- .../common/workflows/simple_mm_workflow.py | 39 ++--- trinity/trainer/verl/utils.py | 43 ++++- trinity/trainer/verl_trainer.py | 34 +--- 11 files changed, 258 insertions(+), 178 deletions(-) diff --git a/examples/grpo_vlm/README.md b/examples/grpo_vlm/README.md index 5ab12488a61..8283e18910d 100644 --- a/examples/grpo_vlm/README.md +++ b/examples/grpo_vlm/README.md @@ -8,8 +8,8 @@ This example shows the usage of GRPO with Qwen2.5-VL-3B-Instruct on the [geometr The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 +transformers>=4.54.0 qwen_vl_utils ``` diff --git a/examples/mix_vlm/README.md b/examples/mix_vlm/README.md index 5a2c8752de0..124a3663aea 100644 --- a/examples/mix_vlm/README.md +++ b/examples/mix_vlm/README.md @@ -8,8 +8,8 @@ This is an example of using the [MIX](../../docs/sphinx_doc/source/tutorial/exam The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 +transformers>=4.54.0 qwen_vl_utils ``` diff --git a/pyproject.toml b/pyproject.toml index b3d2f237c8d..96e55f3ff66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,7 @@ megatron = [ # "mbridge @ git+https://github.com/ISEEKYAN/mbridge.git@20e9ffbbe72ae7b1df83bfe1bc3c11f7382f2612", ] tinker = [ - "tinker; python_version >= '3.11'", + "tinker>=0.10.0; python_version >= '3.11'", ] doc = [ @@ -101,6 +101,7 @@ doc = [ mm = [ "qwen-vl-utils", + "transformers>=4.54.0", ] flash_attn = [ diff --git a/trinity/buffer/schema/formatter.py b/trinity/buffer/schema/formatter.py index 039074521b5..bf96c685f0f 100644 --- a/trinity/buffer/schema/formatter.py +++ b/trinity/buffer/schema/formatter.py @@ -109,7 +109,7 @@ def __init__(self, tokenizer_path: str, format_config: FormatConfig): else: self.processor = None self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path) - self.chat_template = format_config.chat_template or self.tokenizer.chat_template + self.chat_template = format_config.chat_template # For messages type if self.prompt_type == PromptType.MESSAGES: self.messages_key = format_config.messages_key @@ -129,7 +129,6 @@ def _messages_to_experience( self, messages: List[Dict], tools: Optional[List[Dict] | str] = None, - mm_data: Optional[Dict] = None, ) -> Experience: """Convert messages and tools into an Experience object. 
@@ -170,89 +169,63 @@ def _messages_to_experience(
             prompt_length=prompt_length,
             messages=messages,
         )
-        if mm_data:
-            return self.convert_mm_data_to_experiences(messages=messages, mm_data=mm_data)
-        token_ids = self.tokenizer.apply_chat_template(
-            messages,
-            tools=tools,
-            add_generation_prompt=False,
-            return_tensors="pt",
-            chat_template=self.chat_template,
-        )[0]
-        prompt_tokens_ids = self.tokenizer.apply_chat_template(
-            messages[:-1],
-            tools=tools,
-            add_generation_prompt=True,
-            return_tensors="pt",
-            chat_template=self.chat_template,
-        )[0]
-        return Experience(
-            tokens=token_ids,
-            prompt_length=len(prompt_tokens_ids),
-            messages=messages,
-        )
-
-    def load_mm_data(self, sample: Dict) -> Dict:
-        """Load multi-modal data such as images or videos.
-
-        NOTE: You can override this method for custom data loading.
-
-        Args:
-            sample (Dict): The raw sample dictionary containing multi-modal data.
-
-        Returns:
-            Dict: A dictionary containing multi-modal data. Specifically, it may contain:
-                - images: A list of `PIL.Image.Image` if `self.image_key` is set
-                - videos: A list of `numpy.ndarray` if `self.video_key` is set
-        """
-        from verl.utils.dataset.vision_utils import process_image, process_video
-
-        mm_data = {}
-        if self.image_key:
-            mm_data["images"] = [process_image(img) for img in sample[self.image_key]]
-        if self.video_key:
-            mm_data["videos"] = [process_video(vid).numpy() for vid in sample[self.video_key]]
-        return mm_data
-
-    def convert_mm_data_to_experiences(
-        self,
-        messages: List[Dict],
-        mm_data: Dict,
-    ) -> Experience:
-        from trinity.common.models.mm_utils import (
-            build_multi_modal_inputs,
-            convert_messages_to_mm_format,
-        )
+        if self.image_key or self.video_key:
+            from trinity.common.models.mm_utils import (
+                build_mm_input_for_training,
+                build_multi_modal_data,
+            )
 
-        messages = convert_messages_to_mm_format(messages)
-        sequence: str = self.processor.apply_chat_template(
-            messages,
-            add_generation_prompt=False,
-            chat_template=self.chat_template,
-        )
-        prompt: str = self.processor.apply_chat_template(
-            messages[:-1],
-            add_generation_prompt=True,
-            chat_template=self.chat_template,
-        )
-        sequence_data = build_multi_modal_inputs(
-            prompt=sequence,
-            images=mm_data.get("images", None),
-            videos=mm_data.get("videos", None),
-            processor=self.processor,
-        )
-        prompt_data = build_multi_modal_inputs(
-            prompt=prompt,
-            images=mm_data.get("images", None),
-            videos=mm_data.get("videos", None),
-            processor=self.processor,
-        )
-        return Experience(
-            tokens=sequence_data["prompt_token_ids"],
-            prompt_length=len(prompt_data["prompt_token_ids"]),
-            messages=messages,
-            multi_modal_inputs=sequence_data["multi_modal_inputs"],
-        )
+            full_text = self.processor.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=False,
+                chat_template=self.chat_template,
+            )
+            prompt = self.processor.apply_chat_template(
+                messages[:-1],
+                tokenize=False,
+                add_generation_prompt=True,
+                chat_template=self.chat_template,
+            )
+            multi_modal_data = build_multi_modal_data(self.processor, messages)
+            full_text_inputs = build_mm_input_for_training(
+                self.processor,
+                full_text,
+                multi_modal_data,
+            )
+            tokens = full_text_inputs.pop("input_ids")[0]
+            full_text_inputs.pop("attention_mask")
+            prompt_text_inputs = build_mm_input_for_training(
+                self.processor,
+                prompt,
+                multi_modal_data,
+            )
+            return Experience(
+                tokens=tokens,
+                prompt_length=len(prompt_text_inputs["input_ids"][0]),
+                messages=messages,
+                multi_modal_inputs=full_text_inputs,
+            )
+        else:
+            token_ids = 
self.tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=False, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + prompt_tokens_ids = self.tokenizer.apply_chat_template( + messages[:-1], + tools=tools, + add_generation_prompt=True, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + return Experience( + tokens=token_ids, + prompt_length=len(prompt_tokens_ids), + messages=messages, + ) def format(self, sample: Dict) -> Experience: if self.prompt_type == PromptType.MESSAGES: @@ -274,13 +247,18 @@ def format(self, sample: Dict) -> Experience: elif self.system_prompt is not None: system_message = {"role": "system", "content": self.system_prompt} messages.append(system_message) - messages.append({"role": "user", "content": sample[self.prompt_key]}) + prompt = sample[self.prompt_key] + images = sample[self.image_key] if self.image_key else [] + videos = sample[self.video_key] if self.video_key else [] + + from trinity.common.models.mm_utils import build_mm_message + + messages.append(build_mm_message(prompt, images, videos)) messages.append({"role": "assistant", "content": sample[self.response_key]}) else: raise ValueError(f"Unsupported prompt_type: {self.prompt_type}") tools = sample.get(self.tools_key, None) - mm_data = self.load_mm_data(sample) if self.image_key or self.video_key else None - return self._messages_to_experience(messages, tools, mm_data) + return self._messages_to_experience(messages, tools) class DPOFormatter(ExperienceFormatter): diff --git a/trinity/common/models/mm_utils.py b/trinity/common/models/mm_utils.py index e850f190d04..50d24dd3e68 100644 --- a/trinity/common/models/mm_utils.py +++ b/trinity/common/models/mm_utils.py @@ -1,5 +1,5 @@ """"Multi-modal utilities for processing and handling multi-modal data such as images and videos. -Only support Qwen2.5 VL series. +Only support Qwen2.5/3 VL series. Modified from: verl/utils/dataset/rl_dataset.py """ @@ -9,7 +9,86 @@ import numpy as np from PIL import Image +from trinity.utils.annotations import Deprecated + +def build_multi_modal_data( + processor: Any, + messages: List[Dict], +) -> Dict[str, Any]: + """ + Preprocess multi-modal data and build multi-modal inputs + """ + processor_class_name = processor.__class__.__name__ + if "Qwen" in processor_class_name and "VLProcessor" in processor_class_name: + from qwen_vl_utils import process_vision_info + + image_inputs, video_inputs = process_vision_info(messages) + multi_modal_data = {} + if image_inputs: + multi_modal_data["image"] = image_inputs + if video_inputs: + multi_modal_data["video"] = video_inputs + + return multi_modal_data + raise NotImplementedError(f"{processor_class_name} not supported") + + +def build_mm_input_for_training(processor: Any, prompt: str, multi_modal_data: Dict) -> Dict: + processor_class_name = processor.__class__.__name__ + if "Qwen" in processor_class_name and "VLProcessor" in processor_class_name: + inputs = processor( + text=[prompt], + images=multi_modal_data.get("image", None), + videos=multi_modal_data.get("video", None), + padding=True, + return_tensors="pt", + ) + return dict(inputs) + raise NotImplementedError(f"{processor_class_name} not supported") + + +def build_mm_message(prompt: str, images: List, videos: List): + content_list = [] + segments = re.split("(|