diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml index 0889e57ffdd..e9e359e9ce2 100644 --- a/.github/workflows/docker/docker-compose.yaml +++ b/.github/workflows/docker/docker-compose.yaml @@ -1,6 +1,6 @@ services: trinity-node-1: - image: trinity-rft-unittest:20260205 + image: trinity-rft-unittest:20260211 cap_add: - SYS_PTRACE pull_policy: never @@ -15,6 +15,7 @@ services: - TRINITY_MODEL_PATH=/mnt/models/Qwen3-0.6B - TRINITY_API_MODEL_PATH=/mnt/models/Qwen3-1.7B - TRINITY_VLM_MODEL_PATH=/mnt/models/Qwen2.5-VL-3B + - TRINITY_ALTERNATIVE_VLM_MODEL_PATH=/mnt/models/Qwen3-VL-2B-Instruct - VIRTUAL_ENV=/opt/venv working_dir: /workspace networks: @@ -32,7 +33,7 @@ services: capabilities: [gpu] trinity-node-2: - image: trinity-rft-unittest:20260205 + image: trinity-rft-unittest:20260211 cap_add: - SYS_PTRACE pull_policy: never diff --git a/examples/grpo_vlm/README.md b/examples/grpo_vlm/README.md index 5ab12488a61..3435258bc94 100644 --- a/examples/grpo_vlm/README.md +++ b/examples/grpo_vlm/README.md @@ -8,8 +8,8 @@ This example shows the usage of GRPO with Qwen2.5-VL-3B-Instruct on the [geometr The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 # Qwen3 VL requires vllm>=0.11.0; it is recommended to use version >= 0.13.0 +transformers>=4.54.0 qwen_vl_utils ``` @@ -18,3 +18,11 @@ For other detailed information, please refer to the [documentation](../../docs/s The config file is located in [`vlm.yaml`](vlm.yaml), and the curve is shown below. ![vlm](../../docs/sphinx_doc/assets/geometry3k_qwen25_vl_3b_reward.png) + +## Supported Model Architectures + +The following vision-language model series are currently supported: + +1. Qwen2.5-VL series +2. Qwen3-VL series +3. Kimi-VL-A3B-Thinking series diff --git a/examples/mix_vlm/README.md b/examples/mix_vlm/README.md index 5a2c8752de0..0ee57a225fe 100644 --- a/examples/mix_vlm/README.md +++ b/examples/mix_vlm/README.md @@ -8,8 +8,8 @@ This is an example of using the [MIX](../../docs/sphinx_doc/source/tutorial/exam The specific requirements are: ```yaml -vllm>=0.9.1,<0.10.0 -transformers<4.53.0 +vllm>=0.10.2 # Qwen3 VL requires vllm>=0.11.0; it is recommended to use version >= 0.13.0 +transformers>=4.54.0 qwen_vl_utils ``` @@ -34,3 +34,11 @@ trinity run --config examples/mix_vlm/mix_vlm.yaml The reward curve is shown below: ![](../../docs/sphinx_doc/assets/mix_vlm_reward.png) + +## Supported Model Architectures + +The following vision-language model series are currently supported: + +1. Qwen2.5-VL series +2. Qwen3-VL series +3. 
Kimi-VL-A3B-Thinking series diff --git a/pyproject.toml b/pyproject.toml index 89f8a7ec997..812f14e6fd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ megatron = [ # "mbridge @ git+https://github.com/ISEEKYAN/mbridge.git@20e9ffbbe72ae7b1df83bfe1bc3c11f7382f2612", ] tinker = [ - "tinker; python_version >= '3.11'", + "tinker>=0.10.0; python_version >= '3.11'", ] doc = [ @@ -103,6 +103,8 @@ doc = [ mm = [ "qwen-vl-utils", + "transformers>=4.54.0", + "blobfile", ] flash_attn = [ @@ -143,6 +145,9 @@ known_third_party = ["wandb"] [tool.uv.extra-build-dependencies] flash-attn = ["torch", "numpy"] +[project.entry-points."vllm.general_plugins"] +vllm_patch = "trinity.common.models.vllm_patch:vllm_patch" + [project.urls] "Homepage" = "https://github.com/agentscope-ai/Trinity-RFT" "Documentation" = "https://agentscope-ai.github.io/Trinity-RFT/" diff --git a/tests/template/data/gsm8k/test.jsonl b/tests/template/data/gsm8k/test.jsonl new file mode 100644 index 00000000000..8c07215ebf4 --- /dev/null +++ b/tests/template/data/gsm8k/test.jsonl @@ -0,0 +1,2 @@ +{"question": "Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", "answer": "Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\n#### 18"} +{"question": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?", "answer": "It takes 2/2=<<2/2=1>>1 bolt of white fiber\nSo the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric\n#### 3"} diff --git a/tests/tools.py b/tests/tools.py index 33cf4952089..bdfeb4100e8 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -24,6 +24,7 @@ API_MODEL_PATH_ENV_VAR = "TRINITY_API_MODEL_PATH" VLM_MODEL_PATH_ENV_VAR = "TRINITY_VLM_MODEL_PATH" +ALTERNATIVE_VLM_MODEL_PATH_ENV_VAR = "TRINITY_ALTERNATIVE_VLM_MODEL_PATH" SFT_DATASET_PATH_ENV_VAR = "TRINITY_SFT_DATASET_PATH" @@ -134,6 +135,15 @@ def get_vision_language_model_path() -> str: return path +def get_alternative_vision_language_model_path() -> str: + path = os.environ.get(ALTERNATIVE_VLM_MODEL_PATH_ENV_VAR) + if not path: + raise EnvironmentError( + f"Please set `export {ALTERNATIVE_VLM_MODEL_PATH_ENV_VAR}=` before running this test." 
+ ) + return path + + def get_lora_config() -> LoRAConfig: return LoRAConfig(name="lora", lora_rank=16, lora_alpha=16) @@ -248,6 +258,20 @@ def get_unittest_dataset_config(dataset_name: str = "countdown", split: str = "t default_workflow_type="simple_mm_workflow", default_reward_fn_type="math_boxed_reward", ) + elif dataset_name == "geometry_sft": + # Multi-modal geometry dataset for sft with 8 samples + return ExperienceBufferConfig( + name=dataset_name, + path=os.path.join(os.path.dirname(__file__), "template", "data", "geometry"), + split="train", + storage_type=StorageType.FILE.value, + format=FormatConfig( + prompt_type=PromptType.PLAINTEXT, + prompt_key="problem", + response_key="answer", + image_key="images", + ), + ) else: raise ValueError(f"Unknown dataset name: {dataset_name}") diff --git a/tests/trainer/trainer_test.py b/tests/trainer/trainer_test.py index 95a689f5bed..5a0560a0a56 100644 --- a/tests/trainer/trainer_test.py +++ b/tests/trainer/trainer_test.py @@ -21,6 +21,7 @@ RayUnittestBase, RayUnittestBaseAsync, TensorBoardParser, + get_alternative_vision_language_model_path, get_checkpoint_path, get_lora_config, get_model_path, @@ -350,7 +351,7 @@ def test_trainer(self, mock_load): mock_load.return_value = deepcopy(self.config) with self.assertRaises(Exception): - run(config_path="dummy.yaml") + run(config="dummy.yaml") ray.shutdown(_exiting_interpreter=True) stage_configs = [cfg.check_and_update() for cfg in deepcopy(self.config)] @@ -375,7 +376,7 @@ def test_trainer(self, mock_load): self.config.stages[1].buffer.explorer_input.taskset.path = old_taskset_path mock_load.return_value = deepcopy(self.config) ray.init(ignore_reinit_error=True, namespace=self.config.ray_namespace) - run(config_path="dummy.yaml") + run(config="dummy.yaml") # grpo stage grpo_config = stage_configs[1] @@ -1205,13 +1206,12 @@ def tearDown(self): class TestMultiModalGRPO(BaseTrainerCase): - @unittest.skip("Require specific vllm/transformers version") def test_trainer(self): """Test both mode with multi-modal data.""" self.config.buffer.explorer_input.taskset = get_unittest_dataset_config( "geometry" ) # Total 8 tasks - self.config.model.model_path = get_vision_language_model_path() + self.config.model.model_path = get_alternative_vision_language_model_path() self.config.algorithm.algorithm_type = "grpo" self.config.algorithm.advantage_fn = "grpo" self.config.algorithm.kl_loss_fn = "none" @@ -1246,12 +1246,11 @@ def tearDown(self): class TestMultiModalSFT(BaseTrainerCase): - @unittest.skip("Require specific vllm/transformers version") def test_trainer(self): """Test SFT mode with multi-modal data.""" self.config.mode = "train" self.config.buffer.trainer_input.experience_buffer = get_unittest_dataset_config( - "geometry" + "geometry_sft" ) # Total 8 tasks self.config.model.model_path = get_vision_language_model_path() self.config.algorithm.algorithm_type = "sft" @@ -1522,7 +1521,6 @@ def tearDown(self): shutil.rmtree(self.config.checkpoint_job_dir, ignore_errors=True) -@unittest.skip("Require agentscope >= 1.0.12") class AgentScopeTunerTest(unittest.IsolatedAsyncioTestCase): def setUp(self) -> None: ray.init(ignore_reinit_error=True) @@ -1622,7 +1620,7 @@ async def judge_func( model_path=get_model_path(), max_model_len=8192, max_tokens=2048, - inference_engine_num=2, + inference_engine_num=1, ) } diff --git a/trinity/algorithm/policy_loss_fn/chord_policy_loss.py b/trinity/algorithm/policy_loss_fn/chord_policy_loss.py index 6653b4d2258..4f42df4ecff 100644 --- 
a/trinity/algorithm/policy_loss_fn/chord_policy_loss.py +++ b/trinity/algorithm/policy_loss_fn/chord_policy_loss.py @@ -199,35 +199,27 @@ def __call__( # type: ignore per_micro_batch_weight_usual = self.gradient_accumulation / self.train_batch_size_usual # type: ignore per_micro_batch_weight_expert = self.gradient_accumulation / self.train_batch_size_expert # type: ignore - if n_usual_exp > 0: - grpo_loss, grpo_metrics = self.grpo_loss_fn( - logprob[~expert_mask], - old_logprob[~expert_mask], - action_mask[~expert_mask], - advantages[~expert_mask], - **kwargs, - ) - grpo_loss = grpo_loss * n_usual_exp * per_micro_batch_weight_usual - grpo_metrics = { - k: v * n_usual_exp * per_micro_batch_weight_usual for k, v in grpo_metrics.items() - } - else: - grpo_loss = torch.tensor(0.0, device=logprob.device) - grpo_metrics = {} + grpo_loss, grpo_metrics = self.grpo_loss_fn( + logprob[~expert_mask], + old_logprob[~expert_mask], + action_mask[~expert_mask], + advantages[~expert_mask], + **kwargs, + ) + grpo_loss = grpo_loss * n_usual_exp * per_micro_batch_weight_usual + grpo_metrics = { + k: v * n_usual_exp * per_micro_batch_weight_usual for k, v in grpo_metrics.items() + } # SFT Loss (expert) - if n_expert_exp > 0: - sft_loss, sft_metrics = self.sft_loss_fn( - logprob[expert_mask], - action_mask[expert_mask], - ) - sft_loss = sft_loss * n_expert_exp * per_micro_batch_weight_expert - sft_metrics = { - k: v * n_expert_exp * per_micro_batch_weight_expert for k, v in sft_metrics.items() - } - else: - sft_loss = torch.tensor(0.0, device=logprob.device) - sft_metrics = {} + sft_loss, sft_metrics = self.sft_loss_fn( + logprob[expert_mask], + action_mask[expert_mask], + ) + sft_loss = sft_loss * n_expert_exp * per_micro_batch_weight_expert + sft_metrics = { + k: v * n_expert_exp * per_micro_batch_weight_expert for k, v in sft_metrics.items() + } mu = mu_schedule_function( current_step, self.mu_warmup_steps, self.mu_decay_steps, self.mu_peak, self.mu_valley diff --git a/trinity/buffer/schema/formatter.py b/trinity/buffer/schema/formatter.py index 039074521b5..f90dd29aaba 100644 --- a/trinity/buffer/schema/formatter.py +++ b/trinity/buffer/schema/formatter.py @@ -109,7 +109,7 @@ def __init__(self, tokenizer_path: str, format_config: FormatConfig): else: self.processor = None self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path) - self.chat_template = format_config.chat_template or self.tokenizer.chat_template + self.chat_template = format_config.chat_template # For messages type if self.prompt_type == PromptType.MESSAGES: self.messages_key = format_config.messages_key @@ -129,7 +129,6 @@ def _messages_to_experience( self, messages: List[Dict], tools: Optional[List[Dict] | str] = None, - mm_data: Optional[Dict] = None, ) -> Experience: """Convert messages and tools into an Experience object. 
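Editor's note, for orientation before the formatter hunk that follows: the sketch below illustrates, outside the `ExperienceFormatter` class, the multi-modal flow that the new `image_key`/`video_key` branch implements. `build_multi_modal_data` and `build_mm_input_for_training` are named in this PR; their exact signatures, the processor checkpoint, and the sample message layout are assumptions, not the project's actual code.

```python
# Minimal sketch (not the project's code) of the multi-modal path added to
# `_messages_to_experience` in the hunk below. Helper names come from this PR;
# signatures and the message layout are assumptions.
from transformers import AutoProcessor

from trinity.common.models.mm_utils import (
    build_mm_input_for_training,
    build_multi_modal_data,
)

processor = AutoProcessor.from_pretrained("/mnt/models/Qwen2.5-VL-3B")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "geometry.png"},  # hypothetical sample image
            {"type": "text", "text": "What is the measure of the marked angle?"},
        ],
    },
    {"role": "assistant", "content": "The marked angle measures 42 degrees."},
]

# Render the full conversation and the prompt-only prefix as text.
full_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt = processor.apply_chat_template(messages[:-1], tokenize=False, add_generation_prompt=True)

# Extract images/videos once, then tokenize both renderings against the same media.
mm_data = build_multi_modal_data(processor, messages)
full_inputs = build_mm_input_for_training(processor, full_text, mm_data)
prompt_inputs = build_mm_input_for_training(processor, prompt, mm_data)

tokens = full_inputs.pop("input_ids")[0]
prompt_length = len(prompt_inputs["input_ids"][0])  # response tokens start at this offset
```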
@@ -170,89 +169,63 @@ def _messages_to_experience( prompt_length=prompt_length, messages=messages, ) - if mm_data: - return self.convert_mm_data_to_experiences(messages=messages, mm_data=mm_data) - token_ids = self.tokenizer.apply_chat_template( - messages, - tools=tools, - add_generation_prompt=False, - return_tensors="pt", - chat_template=self.chat_template, - )[0] - prompt_tokens_ids = self.tokenizer.apply_chat_template( - messages[:-1], - tools=tools, - add_generation_prompt=True, - return_tensors="pt", - chat_template=self.chat_template, - )[0] - return Experience( - tokens=token_ids, - prompt_length=len(prompt_tokens_ids), - messages=messages, - ) - - def load_mm_data(self, sample: Dict) -> Dict: - """Load multi-modal data such as images or videos. - - NOTE: You can override this method for custom data loading. - - Args: - sample (Dict): The raw sample dictionary containing multi-modal data. - - Returns: - Dict: A dictionary containing multi-modal data. Specifically, it may contain: - - images: A list of `PIL.Image.Image` if `self.image_key` is set - - videos: A list of `numpy.ndarray` if `self.video_key` is set - """ - from verl.utils.dataset.vision_utils import process_image, process_video - - mm_data = {} - if self.image_key: - mm_data["images"] = [process_image(img) for img in sample[self.image_key]] - if self.video_key: - mm_data["videos"] = [process_video(vid).numpy() for vid in sample[self.video_key]] - return mm_data - - def convert_mm_data_to_experiences( - self, - messages: List[Dict], - mm_data: Dict, - ) -> Experience: - from trinity.common.models.mm_utils import ( - build_multi_modal_inputs, - convert_messages_to_mm_format, - ) + if self.image_key or self.video_key: + from trinity.common.models.mm_utils import ( + build_mm_input_for_training, + build_multi_modal_data, + ) - messages = convert_messages_to_mm_format(messages) - sequence: str = self.processor.apply_chat_template( - messages, - add_generation_prompt=False, - chat_template=self.chat_template, - ) - prompt: str = self.processor.apply_chat_template( - messages[:-1], - add_generation_prompt=True, - chat_template=self.chat_template, - ) - sequence_data = build_multi_modal_inputs( - prompt=sequence, - images=mm_data.get("images", None), - videos=mm_data.get("videos", None), - processor=self.processor, - ) - prompt_data = build_multi_modal_inputs( - prompt=prompt, - images=mm_data.get("images", None), - videos=mm_data.get("videos", None), - processor=self.processor, - ) - return Experience( - tokens=sequence_data["prompt_token_ids"], - prompt_length=len(prompt_data["prompt_token_ids"]), - messages=messages, - multi_modal_inputs=sequence_data["multi_modal_inputs"], - ) + full_text = self.processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + chat_template=self.chat_template, + ) + prompt = self.processor.apply_chat_template( + messages[:-1], + tokenize=False, + add_generation_prompt=True, + chat_template=self.chat_template, + ) + multi_modal_data = build_multi_modal_data(self.processor, messages) + full_text_inputs = build_mm_input_for_training( + self.processor, + full_text, + multi_modal_data, + ) + tokens = full_text_inputs.pop("input_ids")[0] + full_text_inputs.pop("attention_mask", None) + prompt_text_inputs = build_mm_input_for_training( + self.processor, + prompt, + multi_modal_data, + ) + return Experience( + tokens=tokens, + prompt_length=len(prompt_text_inputs["input_ids"][0]), + messages=messages, + multi_modal_inputs=full_text_inputs, + ) + else: + token_ids = 
self.tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=False, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + prompt_tokens_ids = self.tokenizer.apply_chat_template( + messages[:-1], + tools=tools, + add_generation_prompt=True, + return_tensors="pt", + chat_template=self.chat_template, + )[0] + return Experience( + tokens=token_ids, + prompt_length=len(prompt_tokens_ids), + messages=messages, + ) def format(self, sample: Dict) -> Experience: if self.prompt_type == PromptType.MESSAGES: @@ -274,13 +247,18 @@ def format(self, sample: Dict) -> Experience: elif self.system_prompt is not None: system_message = {"role": "system", "content": self.system_prompt} messages.append(system_message) - messages.append({"role": "user", "content": sample[self.prompt_key]}) + prompt = sample[self.prompt_key] + images = sample[self.image_key] if self.image_key else [] + videos = sample[self.video_key] if self.video_key else [] + + from trinity.common.models.mm_utils import build_mm_message + + messages.append(build_mm_message(prompt, images, videos)) messages.append({"role": "assistant", "content": sample[self.response_key]}) else: raise ValueError(f"Unsupported prompt_type: {self.prompt_type}") tools = sample.get(self.tools_key, None) - mm_data = self.load_mm_data(sample) if self.image_key or self.video_key else None - return self._messages_to_experience(messages, tools, mm_data) + return self._messages_to_experience(messages, tools) class DPOFormatter(ExperienceFormatter): diff --git a/trinity/common/config.py b/trinity/common/config.py index c392bb60dc8..7f4d6edb65c 100644 --- a/trinity/common/config.py +++ b/trinity/common/config.py @@ -444,6 +444,7 @@ class TinkerConfig: class ModelConfig: # source model path model_path: str = "" + trust_remote_code: bool = False critic_model_path: str = "" custom_chat_template: Optional[str] = None @@ -493,6 +494,7 @@ class InferenceModelConfig: # ! DO NOT SET in explorer.rollout_model, automatically set from config.model.model_path model_path: Optional[str] = None name: Optional[str] = None + trust_remote_code: bool = False engine_type: str = "vllm" engine_num: int = 1 @@ -663,7 +665,6 @@ class BufferConfig: # ! 
DO NOT SET FOLLOWING FIELDS explorer_output: Optional[StorageConfig] = None # automatically set tokenizer_path: Optional[str] = None # automatically set - pad_token_id: Optional[int] = None # automatically set cache_dir: Optional[str] = None # automatically set diff --git a/trinity/common/config_validator.py b/trinity/common/config_validator.py index fdad4d5856a..69520fe56dc 100644 --- a/trinity/common/config_validator.py +++ b/trinity/common/config_validator.py @@ -16,6 +16,7 @@ set_if_none, ) from trinity.common.constants import StorageType, SyncMethod, SyncStyle +from trinity.common.patch import kimi_vl_monkey_patch_decorator from trinity.utils.log import get_logger from trinity.utils.lora_utils import create_dummy_lora @@ -595,7 +596,7 @@ def validate(self, config: Config) -> None: model_args = rollout_args + length_args + rope_args # rollout model - for args in model_args + ["model_path"]: + for args in model_args + ["model_path", "trust_remote_code"]: set_if_none(config.explorer.rollout_model, args, getattr(config.model, args)) set_if_none( config.explorer.rollout_model, "chat_template", config.model.custom_chat_template @@ -874,26 +875,6 @@ def validate(self, config: Config) -> None: f"your checkpoint directory: {config.checkpoint_job_dir}" ) from e - # set pad_token_id / tokenizer_path - if config.buffer.pad_token_id is None: - from transformers import AutoTokenizer - - try: - tokenizer = AutoTokenizer.from_pretrained(config.model.model_path) - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id - self.logger.warning( - f"tokenizer.pad_token_id is None. Now set to {tokenizer.eos_token_id}", - stacklevel=1, - ) - config.buffer.pad_token_id = tokenizer.pad_token_id - - except Exception: - self.logger.warning( - f"Failed to get pad token id from model {config.model.model_path}" - ) - config.buffer.pad_token_id = 0 - self._check_explorer_input(config) self._check_trainer_input(config) self._check_data_processor(config) @@ -1266,7 +1247,10 @@ def validate_trainer_memory_usage(self, config: Config) -> None: else: self.logger.info("GPU memory check skipped for non-FSDP strategies.") - def _get_model_params_num_and_config(self, model_path: str) -> Tuple[int, Any]: + @kimi_vl_monkey_patch_decorator + def _get_model_params_num_and_config( + self, model_path: str, trust_remote_code: bool + ) -> Tuple[int, Any]: """Load model configuration and estimate total parameter count without loading weights. Uses `accelerate.init_empty_weights()` to avoid GPU memory allocation during inspection. 
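Editor's note: a standalone sketch of the empty-weights inspection pattern described in the docstring above; the hunk that follows shows how this PR threads `trust_remote_code` through it. The model path and flag value are placeholders.

```python
# Count parameters without allocating real weights (they live on the "meta" device).
import torch
import transformers
from accelerate import init_empty_weights

model_path = "/mnt/models/Qwen3-VL-2B-Instruct"  # placeholder: any local HF checkpoint
trust_remote_code = False  # enable for checkpoints that ship custom modeling code

model_config = transformers.AutoConfig.from_pretrained(
    model_path, trust_remote_code=trust_remote_code
)
with init_empty_weights():
    model = transformers.AutoModel.from_config(
        model_config, trust_remote_code=trust_remote_code, dtype=torch.bfloat16
    )
print(f"{model.num_parameters():,} parameters")
```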
@@ -1286,9 +1270,13 @@ def _get_model_params_num_and_config(self, model_path: str) -> Tuple[int, Any]: import transformers from accelerate import init_empty_weights - model_config = transformers.AutoConfig.from_pretrained(model_path) + model_config = transformers.AutoConfig.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) with init_empty_weights(): - model = transformers.AutoModel.from_config(model_config, torch_dtype=torch.bfloat16) + model = transformers.AutoModel.from_config( + model_config, trust_remote_code=trust_remote_code, dtype=torch.bfloat16 + ) params_num = model.num_parameters() assert params_num > 0, f"No parameters found in the model at path: {model_path}" return params_num, model_config @@ -1382,7 +1370,9 @@ def fsdp_memory_check(self, config: Config) -> None: try: model_path = config.model.model_path - params_num, hf_config = self._get_model_params_num_and_config(model_path) + params_num, hf_config = self._get_model_params_num_and_config( + model_path, config.model.trust_remote_code + ) verl_config: veRLConfig = config.trainer.trainer_config world_size = config.cluster.trainer_gpu_num @@ -1407,7 +1397,7 @@ def fsdp_memory_check(self, config: Config) -> None: critic_hf_config = hf_config else: critic_params_num, critic_hf_config = self._get_model_params_num_and_config( - config.model.critic_model_path + config.model.critic_model_path, config.model.trust_remote_code ) ( @@ -1547,6 +1537,10 @@ def _check_max_memory_in_fsdp_training( params_memory (float): Estimated parameter + optimizer memory (bytes). optim_step_memory (float): Estimated optimizer step memory (bytes). """ + is_vl_model = False + if "VL" in hf_config.__class__.__name__: + hf_config = hf_config.text_config + is_vl_model = True max_activation_memory = self._calc_fsdp_activation_memory( hf_config, num_tokens, logits_memory_type, dtype_coeff ) @@ -1557,6 +1551,12 @@ def _check_max_memory_in_fsdp_training( optim_step_mb = optim_step_memory / (1024**2) gpu_capacity_mb = self.memory_capacity / (1024**2) + if is_vl_model: + self.logger.info( + "Note: This is a vision-language (VL) model. " + "The memory estimate below only covers the text encoder portion. " + "Actual GPU memory usage will be higher due to the vision components." + ) self.logger.info( f"Estimated GPU memory usage for {module_name} model '{model_path}': " f"{total_mb:.2f} MB ({params_mb:.2f} MB params + " diff --git a/trinity/common/models/mm_utils.py b/trinity/common/models/mm_utils.py index e850f190d04..fe012e8d50b 100644 --- a/trinity/common/models/mm_utils.py +++ b/trinity/common/models/mm_utils.py @@ -1,73 +1,218 @@ -""""Multi-modal utilities for processing and handling multi-modal data such as images and videos. -Only support Qwen2.5 VL series. +"""Utilities for processing multi-modal data (images/videos) for specific vision-language models. -Modified from: verl/utils/dataset/rl_dataset.py +Supported models: +- Qwen2.5-VL, Qwen3-VL series +- Kimi VL series + +Provides functions to: +1. Parse prompts with media tags (/
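Editor's note on the `vllm.general_plugins` entry point registered in `pyproject.toml` above: recent vLLM releases discover callables in this entry-point group and invoke them without arguments during engine and worker startup. The body of `trinity/common/models/vllm_patch.py` is not part of this diff, so the sketch below is illustrative only.

```python
# Illustrative only: the real `trinity.common.models.vllm_patch:vllm_patch` is not
# shown in this diff. A general plugin is a zero-argument callable that vLLM may run
# in several processes, so it should be cheap and idempotent.
_applied = False


def vllm_patch() -> None:
    global _applied
    if _applied:  # guard against being invoked more than once per process
        return
    _applied = True
    # Apply the monkey patches needed for the model families this PR enables
    # (e.g. multi-modal input handling for Qwen3-VL / Kimi-VL).
    ...
```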