From bcd725b5d04a2bf8c3b07f146c50ac64c691c3d4 Mon Sep 17 00:00:00 2001 From: chawuciren11 <2216740116@qq.com> Date: Thu, 26 Feb 2026 15:09:55 +0800 Subject: [PATCH 1/2] 11 --- .../image_understanding/image_gcot.md | 182 +++++--- .../image_understanding/image_gcot_api.md | 399 ++++++++++++++++++ .../image_understanding/image_gcot.md | 182 +++++--- .../image_understanding/image_gcot_api.md | 390 +++++++++++++++++ 4 files changed, 1044 insertions(+), 109 deletions(-) create mode 100644 docs/en/notes/mm_guide/image_understanding/image_gcot_api.md create mode 100644 docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot.md b/docs/en/notes/mm_guide/image_understanding/image_gcot.md index 85a41201..68569d6a 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_gcot.md +++ b/docs/en/notes/mm_guide/image_understanding/image_gcot.md @@ -27,36 +27,94 @@ The main process of the pipeline includes: ## 2. Quick Start -### Step 1: Create a Working Directory +### Step 1: Create a New DataFlow Working Directory ```bash -mkdir run_gcot -cd run_gcot +mkdir run_dataflow +cd run_dataflow ``` -### Step 2: Prepare the Script +### Step 2: Initialize DataFlow-MM -Save the code in the "Pipeline Example" section below as `image_gcot_pipeline.py`. +```bash +dataflowmm init -### Step 3: Configure Parameters +``` -Ensure you have a VLM model capable of grounding (e.g., Qwen2.5-VL-7B-Instruct). 
+You will then see: ```bash -# Install dependencies -pip install open-dataflow vllm - +gpu_pipelines/image_gcot_pipeline.py ``` -### Step 4: Run +### Step 3: Download Sample Data ```bash -python image_gcot_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_file "data/image_qa.jsonl" +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data +``` + +### Step 4: Configure Parameters +```python +if __name__ == "__main__": + pipe = ImageGCoTPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + ) + pipe.forward() +``` +> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):** +> +> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it. +> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the example above). Failure to do so will also result in the model not being recognized after downloading. + +### Step 5: Run + +```bash +cd gpu_pipelines +python image_gcot_pipeline.py ``` +> **🛠️ Troubleshooting** +> +> **Issue 1:** If you encounter a CUDA library conflict error similar to the following: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> +> **Solution:** This is usually caused by conflicting environment variables. 
Run the script with an empty `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python image_gcot_pipeline.py +> ``` +> +> **Issue 2:** If you are using **Qwen series models** and encounter the following error: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> +> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`. +> +> **Before modification:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` +> +> **After modification:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` --- @@ -74,9 +132,9 @@ The input data for this process typically consists of standard VQA data: ```json { - "image": "./images/cat_dog.jpg", - "question": "Is the cat looking at the dog?", - "answer": "Yes" + "image":"../example_data/capsbench_images/0.png", + "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", + "answer": "Bradley Cooper." } ``` @@ -122,10 +180,7 @@ Finally, the output data generated by the pipeline will contain the following ke **Output Data Example (gcot field)**: ```text -Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left. -Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right. -Step 3: Observe their gaze. The cat is facing the dog. -Answer: Yes +Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. 
The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. ``` @@ -139,6 +194,7 @@ Below is the complete `ImageGCoTPipeline` code implementation. 
import re from typing import List, Dict, Any import argparse +import gc import torch from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm @@ -147,7 +203,6 @@ from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxG from dataflow.operators.core_text import FunctionalRefiner from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate -# 定义 Prompt 模板,强制模型输出推理步骤和关键词 GCOT_PROMPT_TEMPLATE = ( "Question: {question}\n" "Answer: {answer}\n\n" @@ -164,10 +219,8 @@ GCOT_PROMPT_TEMPLATE = ( DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".' -# ----------------- 辅助逻辑函数 ----------------- # - def _parse_base(text: str) -> Dict[str, Any]: - """基础解析逻辑:分离 CoT 文本和 Keywords 行""" + """基础解析逻辑(内部复用)""" if not text: return {"cot": "", "keywords": []} lines = text.split('\n') cot_lines = [] @@ -175,7 +228,6 @@ def _parse_base(text: str) -> Dict[str, Any]: for line in lines: if line.strip().lower().startswith('keywords:'): keyword_str = line.split(':', 1)[-1].strip() - # 简单的分词处理 raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')] keywords = [k for k in raw_kws if k] else: @@ -183,15 +235,42 @@ def _parse_base(text: str) -> Dict[str, Any]: return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords} def extract_clean_cot_logic(text: str) -> str: + """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本""" return _parse_base(text)["cot"] def extract_keywords_logic(text: str) -> List[str]: - return _parse_base(text)["keywords"] + """[For FunctionalRefiner] 提取并合并关键词""" + parsed = _parse_base(text) + kws = parsed["keywords"] + cot = parsed["cot"] + + if not kws or len(kws) <= 1: + return kws + + # 简单的相邻合并逻辑 + cot_lower = cot.lower() + merged = [] + skip_indices = set() + for i in range(len(kws)): + if i in skip_indices: continue + best_match = kws[i] + best_indices = [i] + # 尝试向后合并 3 个词 + for j in range(i + 1, min(i + 4, len(kws))): + if j in skip_indices: 
break + combined = ' '.join(kws[i:j+1]) + if combined.lower() in cot_lower: + best_match = combined + best_indices = list(range(i, j+1)) + else: break + merged.append(best_match) + skip_indices.update(best_indices) + return merged def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str: - """将 BBox 注入回 CoT 文本""" + """[For FunctionalRefiner] 将 BBox 注入回 CoT""" if not cot_text or not bbox_map: return cot_text - # 优先匹配长词,避免子串误匹配 + # 优先匹配长词 sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True) result_text = cot_text replaced = set() @@ -202,37 +281,35 @@ def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str: answer_pos = result_text.find('Answer:') search_limit = answer_pos if answer_pos != -1 else len(result_text) - # 大小写不敏感查找 pos = result_text.lower().find(keyword.lower(), 0, search_limit) if pos == -1: continue boxes = bbox_map[keyword] # List[str] box_str = "".join(boxes) - # 替换:保留原词,追加 Box replacement = f"{keyword} {box_str}" result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):] replaced.add(keyword) return result_text -# ----------------- 流水线定义 ----------------- # - class ImageGCoTPipeline: def __init__( self, model_path: str, *, + hf_cache_dir: str | None = None, + download_dir: str = "./ckpt/models", first_entry_file: str, - cache_path: str = "./cache_gcot", + cache_path: str = "../cache/cache_gcot", file_name_prefix: str = "gcot", - # Keys 配置 + # Keys question_key: str = "question", answer_key: str = "answer", image_key: str = "image", output_key: str = "gcot", + # Config vllm_max_tokens: int = 512 ): - # 1. 存储初始化 self.storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, @@ -240,9 +317,11 @@ class ImageGCoTPipeline: cache_type="jsonl" ) - # 2. 
模型服务 (单一模型) + # [单一模型 Serving] self.vlm_serving = LocalModelVLMServing_vllm( hf_model_name_or_path=model_path, + hf_cache_dir=hf_cache_dir, + hf_local_dir=download_dir, vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_max_tokens=vllm_max_tokens @@ -259,28 +338,28 @@ class ImageGCoTPipeline: "final": output_key } - # 3. 算子链配置 + # ================== Operators ================== - # Step A: 生成 CoT 和 Keywords + # 1. Generate CoT (通用 Generator) self.op_gen_cot = PromptTemplatedVQAGenerator( serving=self.vlm_serving, system_prompt="You are a helpful assistant.", prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE) ) - # Step B: 解析清洗 CoT + # 2. Extract Clean CoT (通用 Refiner + Helper) self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic) - # Step C: 解析 Keywords + # 3. Extract Keywords (通用 Refiner + Helper) self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic) - # Step D: 生成 BBox (Grounding) + # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch) self.op_bbox_gen = VLMBBoxGenerator( serving=self.vlm_serving, prompt_template=DEFAULT_BBOX_PROMPT ) - # Step E: 注入 BBox 到 CoT + # 5. 
Inject GCoT (通用 Refiner + Helper) self.op_inject = FunctionalRefiner(func=inject_bboxes_logic) def forward(self): @@ -289,7 +368,7 @@ class ImageGCoTPipeline: self.storage.step(), input_image_key=self.keys["img"], output_answer_key=self.keys["raw_cot"], - question=self.keys["q"], + question=self.keys["q"], # Template mapping answer=self.keys["a"] ) @@ -297,7 +376,7 @@ class ImageGCoTPipeline: self.op_extract_cot.run( self.storage.step(), output_key=self.keys["clean_cot"], - text=self.keys["raw_cot"] + text=self.keys["raw_cot"] # Param mapping ) self.op_extract_kws.run( self.storage.step(), @@ -325,16 +404,13 @@ class ImageGCoTPipeline: if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl") - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - - args = parser.parse_args() - pipe = ImageGCoTPipeline( - model_path=args.model_path, - first_entry_file=args.input_file + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", ) pipe.forward() + ``` diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md new file mode 100644 index 00000000..1df75290 --- /dev/null +++ b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md @@ -0,0 +1,399 @@ +--- +title: Image Grounded CoT (GCoT) Pipeline +icon: mdi:image-text +createTime: 2026/01/11 20:44:55 +permalink: /en/mm_guide/image_gcot/ +--- +## 1. Overview + +The **Image Grounded Chain-of-Thought (GCoT) Pipeline** is designed to automatically generate **Grounded Chain-of-Thought** data. 
This pipeline generates multi-step reasoning to answer a question and simultaneously spatially locates (via Bounding Boxes) the key objects mentioned during the reasoning process. This significantly enhances the interpretability and precision of multimodal data. + +Unlike traditional methods, this pipeline uses a **Single VLM (e.g., Qwen2.5-VL)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient. + +We support the following application scenarios: + +* **Enhanced Multimodal Data Construction**: Adding interpretability and grounding annotations to VQA datasets. +* **Complex Scene Understanding**: Generating detailed reasoning steps containing object coordinates. +* **Model Reasoning Training**: Building data to train models to be "grounded" and reduce hallucinations. + +The main process of the pipeline includes: + +1. **CoT Generation**: The model generates step-by-step reasoning text and extracts key nouns. +2. **Keyword Parsing**: Cleaning and extracting keywords to be grounded from the generated text. +3. **Visual Grounding**: The model generates bounding boxes (BBoxes) for the extracted keywords. +4. **Information Injection**: Injecting BBox coordinates back into the reasoning text to form the final GCoT. + +--- + +## 2. 
Quick Start + +### Step 1: Create a New DataFlow Working Directory + +```bash +mkdir run_dataflow +cd run_dataflow + +``` + +### Step 2: Initialize DataFlow-MM + +```bash +dataflowmm init + +``` + +You will then see: + +```bash +api_pipelines/image_gcot_api_pipeline.py +``` + +### Step 3: Download Sample Data + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data +``` + +### Step 4: Configure API Key + +Set your API Key environment variable in `api_pipelines/image_gcot_api_pipeline.py`: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + + +### Step 5: Configure Parameters + +Configure the API service and input data paths in `api_pipelines/image_gcot_api_pipeline.py`: + +```python + def __init__( + self, + *, + first_entry_file: str, + cache_path: str = "../cache/cache_gcot", + file_name_prefix: str = "gcot", + # Keys + question_key: str = "question", + answer_key: str = "answer", + image_key: str = "image", + output_key: str = "gcot", + # Config + vllm_max_tokens: int = 512 + ): +``` + +```python +self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + +``` +### Step 6: Run with One Command + +```bash +cd api_pipelines +python image_gcot_api_pipeline.py +``` + +--- + +## 3. Data Flow & Logic + +### 1. **Input Data** + +The input data for this process typically consists of standard VQA data: + +* **image**: Path to the image file. +* **question**: Question about the image. +* **answer**: Standard answer to the question (used to assist CoT generation). + +**Input Data Example**: + +```json +{ + "image":"../example_data/capsbench_images/0.png", + "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", + "answer": "Bradley Cooper." +} + +``` + +### 2. 
**Core Operator Logic** + +This pipeline combines multiple fine-grained operators to achieve complex GCoT generation logic: + +#### A. **CoT Generation (PromptTemplatedVQAGenerator)** + +Uses a predefined `GCOT_PROMPT_TEMPLATE` to guide the model to generate "Step-by-step Reasoning" and a "Keyword List". + +* **Prompt Strategy**: Asks the model to output in the format `Step 1: ...`, `Step 2: ...`, `Keywords: ...`. +* **Output**: Raw string containing reasoning text and keywords. + +#### B. **Text Cleaning & Extraction (FunctionalRefiner)** + +Uses custom functions to parse the output from the previous step: + +* `extract_clean_cot_logic`: Strips the keyword section, keeping pure CoT text. +* `extract_keywords_logic`: Parses the content after `Keywords:` to generate a Python List. + +#### C. **Visual Grounding (VLMBBoxGenerator)** + +Calls the VLM's grounding capability to generate bounding boxes for each extracted keyword. + +* **Input**: Image + List of Keywords. +* **Output**: Dictionary mapping keywords to bounding box coordinates. + +#### D. **Coordinate Injection (FunctionalRefiner)** + +Uses the `inject_bboxes_logic` function to intelligently insert the generated BBox coordinates back into the original CoT text after the corresponding words. + +### 3. **Output Data** + +Finally, the output data generated by the pipeline will contain the following key fields: + +* **raw_cot_output**: Raw text generated by the model. +* **cleaned_cot**: Cleaned reasoning text. +* **bbox_mapping**: Mapping of keywords to their coordinates. +* **gcot**: Final result, reasoning chain containing coordinate information. + +**Output Data Example (gcot field)**: + +```text +Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. 
The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. + +``` + +--- + +## 4. Pipeline Example + +Below is the complete `ImageGCoTAPIPipeline` code implementation. 
+ +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" + +import re +from typing import List, Dict, Any +import argparse +import gc +import torch +from dataflow.utils.storage import FileStorage +from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + +from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator +from dataflow.operators.core_text import FunctionalRefiner +from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai +GCOT_PROMPT_TEMPLATE = ( + "Question: {question}\n" + "Answer: {answer}\n\n" + "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains " + "how to arrive at this answer based on the image.\n" + "Then, extract key nouns and objects mentioned in your reasoning that are " + "visible in the image and can be spatially located.\n\n" + "Format:\n" + "Step 1: ...\n" + "Step 2: ...\n" + "Answer: {answer}\n" + "Keywords: object1, object2\n" +) + +DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".' 
+ +def _parse_base(text: str) -> Dict[str, Any]: + """基础解析逻辑(内部复用)""" + if not text: return {"cot": "", "keywords": []} + lines = text.split('\n') + cot_lines = [] + keywords = [] + for line in lines: + if line.strip().lower().startswith('keywords:'): + keyword_str = line.split(':', 1)[-1].strip() + raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')] + keywords = [k for k in raw_kws if k] + else: + cot_lines.append(line) + return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords} + +def extract_clean_cot_logic(text: str) -> str: + """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本""" + return _parse_base(text)["cot"] + +def extract_keywords_logic(text: str) -> List[str]: + """[For FunctionalRefiner] 提取并合并关键词""" + parsed = _parse_base(text) + kws = parsed["keywords"] + cot = parsed["cot"] + + if not kws or len(kws) <= 1: + return kws + + # 简单的相邻合并逻辑 + cot_lower = cot.lower() + merged = [] + skip_indices = set() + for i in range(len(kws)): + if i in skip_indices: continue + best_match = kws[i] + best_indices = [i] + # 尝试向后合并 3 个词 + for j in range(i + 1, min(i + 4, len(kws))): + if j in skip_indices: break + combined = ' '.join(kws[i:j+1]) + if combined.lower() in cot_lower: + best_match = combined + best_indices = list(range(i, j+1)) + else: break + merged.append(best_match) + skip_indices.update(best_indices) + return merged + +def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str: + """[For FunctionalRefiner] 将 BBox 注入回 CoT""" + if not cot_text or not bbox_map: return cot_text + # 优先匹配长词 + sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True) + result_text = cot_text + replaced = set() + + for keyword in sorted_keywords: + if keyword in replaced: continue + # 简单策略:只在 'Answer:' 之前注入,防止破坏答案区 + answer_pos = result_text.find('Answer:') + search_limit = answer_pos if answer_pos != -1 else len(result_text) + + pos = result_text.lower().find(keyword.lower(), 0, search_limit) + if pos 
== -1: continue + + boxes = bbox_map[keyword] # List[str] + box_str = "".join(boxes) + replacement = f"{keyword} {box_str}" + + result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):] + replaced.add(keyword) + return result_text + +class ImageGCoTPipeline: + def __init__( + self, + *, + first_entry_file: str, + cache_path: str = "../cache/cache_gcot", + file_name_prefix: str = "gcot", + # Keys + question_key: str = "question", + answer_key: str = "answer", + image_key: str = "image", + output_key: str = "gcot", + # Config + vllm_max_tokens: int = 512 + ): + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type="jsonl" + ) + + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + self.keys = { + "q": question_key, + "a": answer_key, + "img": image_key, + "raw_cot": "raw_cot_output", + "clean_cot": "cleaned_cot", + "keywords": "extracted_keywords", + "bbox_map": "bbox_mapping", + "final": output_key + } + + # ================== Operators ================== + + # 1. Generate CoT (通用 Generator) + self.op_gen_cot = PromptTemplatedVQAGenerator( + serving=self.vlm_serving, + system_prompt="You are a helpful assistant.", + prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE) + ) + + # 2. Extract Clean CoT (通用 Refiner + Helper) + self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic) + + # 3. Extract Keywords (通用 Refiner + Helper) + self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic) + + # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch) + self.op_bbox_gen = VLMBBoxGenerator( + serving=self.vlm_serving, + prompt_template=DEFAULT_BBOX_PROMPT + ) + + # 5. 
Inject GCoT (通用 Refiner + Helper) + self.op_inject = FunctionalRefiner(func=inject_bboxes_logic) + + def forward(self): + print(">>> [Pipeline] Step 1: Generating CoT...") + self.op_gen_cot.run( + self.storage.step(), + input_image_key=self.keys["img"], + output_answer_key=self.keys["raw_cot"], + question=self.keys["q"], # Template mapping + answer=self.keys["a"] + ) + + print(">>> [Pipeline] Step 2: Parsing Outputs...") + self.op_extract_cot.run( + self.storage.step(), + output_key=self.keys["clean_cot"], + text=self.keys["raw_cot"] # Param mapping + ) + self.op_extract_kws.run( + self.storage.step(), + output_key=self.keys["keywords"], + text=self.keys["raw_cot"] + ) + + print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...") + self.op_bbox_gen.run( + self.storage.step(), + input_image_key=self.keys["img"], + input_kws_key=self.keys["keywords"], + output_key=self.keys["bbox_map"] + ) + + print(">>> [Pipeline] Step 4: Injecting GCoT...") + self.op_inject.run( + self.storage.step(), + output_key=self.keys["final"], + cot_text=self.keys["clean_cot"], + bbox_map=self.keys["bbox_map"] + ) + + print(f">>> [Pipeline] Done. Final GCoT saved to: {self.keys['final']}") + + +if __name__ == "__main__": + pipe = ImageGCoTPipeline( + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl" + ) + pipe.forward() + +``` diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md index a4a11c3d..3a5add79 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md @@ -27,36 +27,88 @@ permalink: /zh/mm_guide/image_gcot/ ## 2. 
快速开始 -### 第一步:准备工作目录 - +### 第一步:创建新的 DataFlow 工作文件夹 ```bash -mkdir run_gcot -cd run_gcot - +mkdir run_dataflow +cd run_dataflow ``` -### 第二步:准备脚本 - -将下文“流水线示例”中的代码保存为 `image_gcot_pipeline.py`。 +### 第二步:初始化 DataFlow-MM +```bash +dataflowmm init +``` +这时你会看到: +```bash +gpu_pipelines/image_gcot_pipeline.py +``` -### 第三步:配置运行参数 +### 第三步:下载示例数据 +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data +``` -确保你拥有支持定位能力的 VLM 模型(如 Qwen2.5-VL-7B-Instruct)。 +### 第四步:配置参数 ```bash -# 安装依赖 -pip install open-dataflow vllm +if __name__ == "__main__": + pipe = ImageGCoTPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + ) + pipe.forward() ``` +> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):** +> +> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。 +> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。 -### 第四步:一键运行 +### 第五步:一键运行 ```bash -python image_gcot_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_file "data/image_qa.jsonl" - +cd gpu_pipelines +python image_gcot_pipeline.py ``` +> **🛠️ 常见问题排查 (Troubleshooting)** +> +> **问题 1:** 如果遇到类似如下的动态链接库冲突报错: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> +> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python image_gcot_pipeline.py +> ``` +> +> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> +> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 
`"type"` 字段修改为 `"rope_type"` 即可。 +> +> **修改前:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` +> +> **修改后:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` --- @@ -74,9 +126,9 @@ python image_gcot_pipeline.py \ ```json { - "image": "./images/cat_dog.jpg", - "question": "Is the cat looking at the dog?", - "answer": "Yes" + "image":"../example_data/capsbench_images/0.png", + "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", + "answer": "Bradley Cooper." } ``` @@ -122,10 +174,7 @@ python image_gcot_pipeline.py \ **输出数据示例 (gcot 字段)**: ```text -Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left. -Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right. -Step 3: Observe their gaze. The cat is facing the dog. -Answer: Yes +Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. 
The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. ``` @@ -139,6 +188,7 @@ Answer: Yes import re from typing import List, Dict, Any import argparse +import gc import torch from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm @@ -147,7 +197,6 @@ from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxG from dataflow.operators.core_text import FunctionalRefiner from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate -# 定义 Prompt 模板,强制模型输出推理步骤和关键词 GCOT_PROMPT_TEMPLATE = ( "Question: {question}\n" "Answer: {answer}\n\n" @@ -164,10 +213,8 @@ GCOT_PROMPT_TEMPLATE = ( DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".' 
-# ----------------- 辅助逻辑函数 ----------------- # - def _parse_base(text: str) -> Dict[str, Any]: - """基础解析逻辑:分离 CoT 文本和 Keywords 行""" + """基础解析逻辑(内部复用)""" if not text: return {"cot": "", "keywords": []} lines = text.split('\n') cot_lines = [] @@ -175,7 +222,6 @@ def _parse_base(text: str) -> Dict[str, Any]: for line in lines: if line.strip().lower().startswith('keywords:'): keyword_str = line.split(':', 1)[-1].strip() - # 简单的分词处理 raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')] keywords = [k for k in raw_kws if k] else: @@ -183,15 +229,42 @@ def _parse_base(text: str) -> Dict[str, Any]: return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords} def extract_clean_cot_logic(text: str) -> str: + """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本""" return _parse_base(text)["cot"] def extract_keywords_logic(text: str) -> List[str]: - return _parse_base(text)["keywords"] + """[For FunctionalRefiner] 提取并合并关键词""" + parsed = _parse_base(text) + kws = parsed["keywords"] + cot = parsed["cot"] + + if not kws or len(kws) <= 1: + return kws + + # 简单的相邻合并逻辑 + cot_lower = cot.lower() + merged = [] + skip_indices = set() + for i in range(len(kws)): + if i in skip_indices: continue + best_match = kws[i] + best_indices = [i] + # 尝试向后合并 3 个词 + for j in range(i + 1, min(i + 4, len(kws))): + if j in skip_indices: break + combined = ' '.join(kws[i:j+1]) + if combined.lower() in cot_lower: + best_match = combined + best_indices = list(range(i, j+1)) + else: break + merged.append(best_match) + skip_indices.update(best_indices) + return merged def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str: - """将 BBox 注入回 CoT 文本""" + """[For FunctionalRefiner] 将 BBox 注入回 CoT""" if not cot_text or not bbox_map: return cot_text - # 优先匹配长词,避免子串误匹配 + # 优先匹配长词 sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True) result_text = cot_text replaced = set() @@ -202,37 +275,35 @@ def inject_bboxes_logic(cot_text: str, 
bbox_map: Dict[str, List[str]]) -> str: answer_pos = result_text.find('Answer:') search_limit = answer_pos if answer_pos != -1 else len(result_text) - # 大小写不敏感查找 pos = result_text.lower().find(keyword.lower(), 0, search_limit) if pos == -1: continue boxes = bbox_map[keyword] # List[str] box_str = "".join(boxes) - # 替换:保留原词,追加 Box replacement = f"{keyword} {box_str}" result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):] replaced.add(keyword) return result_text -# ----------------- 流水线定义 ----------------- # - class ImageGCoTPipeline: def __init__( self, model_path: str, *, + hf_cache_dir: str | None = None, + download_dir: str = "./ckpt/models", first_entry_file: str, - cache_path: str = "./cache_gcot", + cache_path: str = "../cache/cache_gcot", file_name_prefix: str = "gcot", - # Keys 配置 + # Keys question_key: str = "question", answer_key: str = "answer", image_key: str = "image", output_key: str = "gcot", + # Config vllm_max_tokens: int = 512 ): - # 1. 存储初始化 self.storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, @@ -240,9 +311,11 @@ class ImageGCoTPipeline: cache_type="jsonl" ) - # 2. 模型服务 (单一模型) + # [单一模型 Serving] self.vlm_serving = LocalModelVLMServing_vllm( hf_model_name_or_path=model_path, + hf_cache_dir=hf_cache_dir, + hf_local_dir=download_dir, vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_max_tokens=vllm_max_tokens @@ -259,28 +332,28 @@ class ImageGCoTPipeline: "final": output_key } - # 3. 算子链配置 + # ================== Operators ================== - # Step A: 生成 CoT 和 Keywords + # 1. Generate CoT (通用 Generator) self.op_gen_cot = PromptTemplatedVQAGenerator( serving=self.vlm_serving, system_prompt="You are a helpful assistant.", prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE) ) - # Step B: 解析清洗 CoT + # 2. Extract Clean CoT (通用 Refiner + Helper) self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic) - # Step C: 解析 Keywords + # 3. 
Extract Keywords (通用 Refiner + Helper) self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic) - # Step D: 生成 BBox (Grounding) + # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch) self.op_bbox_gen = VLMBBoxGenerator( serving=self.vlm_serving, prompt_template=DEFAULT_BBOX_PROMPT ) - # Step E: 注入 BBox 到 CoT + # 5. Inject GCoT (通用 Refiner + Helper) self.op_inject = FunctionalRefiner(func=inject_bboxes_logic) def forward(self): @@ -289,7 +362,7 @@ class ImageGCoTPipeline: self.storage.step(), input_image_key=self.keys["img"], output_answer_key=self.keys["raw_cot"], - question=self.keys["q"], + question=self.keys["q"], # Template mapping answer=self.keys["a"] ) @@ -297,7 +370,7 @@ class ImageGCoTPipeline: self.op_extract_cot.run( self.storage.step(), output_key=self.keys["clean_cot"], - text=self.keys["raw_cot"] + text=self.keys["raw_cot"] # Param mapping ) self.op_extract_kws.run( self.storage.step(), @@ -325,16 +398,13 @@ class ImageGCoTPipeline: if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl") - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - - args = parser.parse_args() - pipe = ImageGCoTPipeline( - model_path=args.model_path, - first_entry_file=args.input_file + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", ) pipe.forward() + ``` diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md new file mode 100644 index 00000000..642b0d5a --- /dev/null +++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md @@ -0,0 +1,390 @@ +--- +title: 图像定位思维链 (GCoT) 生成流水线 +icon: mdi:image-text +createTime: 2026/01/11 20:44:55 +permalink: /zh/mm_guide/image_gcot/ +--- +## 
1. 概述 + +**图像定位思维链 (GCoT) 生成流水线** 旨在自动化生成**带视觉定位的思维链(Grounded Chain-of-Thought)**数据。该流水线通过多步推理,不仅生成回答问题的逻辑步骤,还将推理过程中提到的关键物体在图像中进行空间定位(Bounding Box),从而显著提升多模态数据的可解释性和精确度。 + +与传统方法不同,本流水线采用 **单一 VLM(如 Qwen2.5-VL)** 同时完成“推理”和“定位”任务,流程更加精简高效。 + +我们支持以下应用场景: + +* **增强型多模态数据构建**:为 VQA 数据集增加解释性和定位标注。 +* **复杂场景理解**:生成包含物体坐标的详细推理步骤。 +* **模型推理能力训练**:构建数据以训练模型“言之有物”,减少幻觉。 + +流水线的主要流程包括: + +1. **CoT 生成**:模型生成分步推理文本,并提取关键名词。 +2. **关键词解析**:从生成的文本中清洗并提取待定位的关键词。 +3. **视觉定位 (Grounding)**:模型针对提取的关键词生成边界框 (BBox)。 +4. **信息注入**:将 BBox 坐标回填至推理文本中,形成最终的 GCoT。 + +--- + +## 2. 快速开始 + +### 第一步:创建新的 DataFlow 工作文件夹 +```bash +mkdir run_dataflow +cd run_dataflow +``` + +### 第二步:初始化 DataFlow-MM +```bash +dataflowmm init +``` +这时你会看到: +```bash +api_pipelines/image_gcot_api_pipeline.py +``` + +### 第三步:下载示例数据 +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data +``` + +### 第四步:配置 API Key + +在 `api_pipelines/image_gcot_api_pipeline.py` 中设置 API Key 环境变量: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" +``` + +### 第五步:配置参数 + +在 `api_pipelines/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径: + +```python + def __init__( + self, + *, + first_entry_file: str, + cache_path: str = "../cache/cache_gcot", + file_name_prefix: str = "gcot", + # Keys + question_key: str = "question", + answer_key: str = "answer", + image_key: str = "image", + output_key: str = "gcot", + # Config + vllm_max_tokens: int = 512 + ): +``` + +```python +self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) +``` + +### 第六步:一键运行 +```bash +cd api_pipelines +python image_gcot_api_pipeline.py +``` + +--- + +## 3. 数据流与流水线逻辑 + +### 1. 
**输入数据** + +该流程的输入数据通常是标准的 VQA 数据: + +* **image**:图像文件路径。 +* **question**:关于图像的问题。 +* **answer**:问题的标准答案(用于辅助生成 CoT)。 + +**输入数据示例**: + +```json +{ + "image":"../example_data/capsbench_images/0.png", + "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", + "answer": "Bradley Cooper." +} + +``` + +### 2. **核心算子逻辑** + +本流水线通过组合多个细粒度算子来实现复杂的 GCoT 生成逻辑: + +#### A. **CoT 生成 (PromptTemplatedVQAGenerator)** + +利用预设的 `GCOT_PROMPT_TEMPLATE`,引导模型生成“步骤化推理”和“关键词列表”。 + +* **Prompt 策略**:要求模型按 `Step 1: ...`, `Step 2: ...`, `Keywords: ...` 格式输出。 +* **输出**:包含推理文本和关键词的原始字符串。 + +#### B. **文本清洗与提取 (FunctionalRefiner)** + +使用自定义函数对上一步的输出进行解析: + +* `extract_clean_cot_logic`:剥离关键词部分,保留纯净的 CoT 文本。 +* `extract_keywords_logic`:解析 `Keywords:` 后的内容,生成 Python List。 + +#### C. **视觉定位 (VLMBBoxGenerator)** + +针对提取出的每一个关键词,调用 VLM 的定位能力生成边界框。 + +* **输入**:图像 + 关键词列表。 +* **输出**:关键词到边界框坐标的映射字典 (Map)。 + +#### D. **坐标注入 (FunctionalRefiner)** + +使用 `inject_bboxes_logic` 函数,将生成的 BBox 坐标智能插入回原始 CoT 文本中对应的单词之后。 + +### 3. **输出数据** + +最终,流水线生成的输出数据将包含以下关键字段: + +* **raw_cot_output**:模型原始生成的文本。 +* **cleaned_cot**:清洗后的纯推理文本。 +* **bbox_mapping**:关键词与其坐标的映射。 +* **gcot**:最终结果,包含坐标信息的推理链。 + +**输出数据示例(流水线新增字段)**: + +```json +{"raw_cot_output":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. 
The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper."} + +``` + +--- + +## 4. 
流水线示例 + +以下是完整的 `ImageGCoTPipeline`(API 版本)代码实现。 + +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" + +import re +from typing import List, Dict, Any +import argparse +import gc +import torch +from dataflow.utils.storage import FileStorage +from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + +from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator +from dataflow.operators.core_text import FunctionalRefiner +from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai +GCOT_PROMPT_TEMPLATE = ( + "Question: {question}\n" + "Answer: {answer}\n\n" + "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains " + "how to arrive at this answer based on the image.\n" + "Then, extract key nouns and objects mentioned in your reasoning that are " + "visible in the image and can be spatially located.\n\n" + "Format:\n" + "Step 1: ...\n" + "Step 2: ...\n" + "Answer: {answer}\n" + "Keywords: object1, object2\n" +) + +DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".' 
+ +def _parse_base(text: str) -> Dict[str, Any]: + """基础解析逻辑(内部复用)""" + if not text: return {"cot": "", "keywords": []} + lines = text.split('\n') + cot_lines = [] + keywords = [] + for line in lines: + if line.strip().lower().startswith('keywords:'): + keyword_str = line.split(':', 1)[-1].strip() + raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')] + keywords = [k for k in raw_kws if k] + else: + cot_lines.append(line) + return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords} + +def extract_clean_cot_logic(text: str) -> str: + """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本""" + return _parse_base(text)["cot"] + +def extract_keywords_logic(text: str) -> List[str]: + """[For FunctionalRefiner] 提取并合并关键词""" + parsed = _parse_base(text) + kws = parsed["keywords"] + cot = parsed["cot"] + + if not kws or len(kws) <= 1: + return kws + + # 简单的相邻合并逻辑 + cot_lower = cot.lower() + merged = [] + skip_indices = set() + for i in range(len(kws)): + if i in skip_indices: continue + best_match = kws[i] + best_indices = [i] + # 尝试向后合并 3 个词 + for j in range(i + 1, min(i + 4, len(kws))): + if j in skip_indices: break + combined = ' '.join(kws[i:j+1]) + if combined.lower() in cot_lower: + best_match = combined + best_indices = list(range(i, j+1)) + else: break + merged.append(best_match) + skip_indices.update(best_indices) + return merged + +def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str: + """[For FunctionalRefiner] 将 BBox 注入回 CoT""" + if not cot_text or not bbox_map: return cot_text + # 优先匹配长词 + sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True) + result_text = cot_text + replaced = set() + + for keyword in sorted_keywords: + if keyword in replaced: continue + # 简单策略:只在 'Answer:' 之前注入,防止破坏答案区 + answer_pos = result_text.find('Answer:') + search_limit = answer_pos if answer_pos != -1 else len(result_text) + + pos = result_text.lower().find(keyword.lower(), 0, search_limit) + if pos 
== -1: continue + + boxes = bbox_map[keyword] # List[str] + box_str = "".join(boxes) + replacement = f"{keyword} {box_str}" + + result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):] + replaced.add(keyword) + return result_text + +class ImageGCoTPipeline: + def __init__( + self, + *, + first_entry_file: str, + cache_path: str = "../cache/cache_gcot", + file_name_prefix: str = "gcot", + # Keys + question_key: str = "question", + answer_key: str = "answer", + image_key: str = "image", + output_key: str = "gcot", + # Config + vllm_max_tokens: int = 512 + ): + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type="jsonl" + ) + + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + self.keys = { + "q": question_key, + "a": answer_key, + "img": image_key, + "raw_cot": "raw_cot_output", + "clean_cot": "cleaned_cot", + "keywords": "extracted_keywords", + "bbox_map": "bbox_mapping", + "final": output_key + } + + # ================== Operators ================== + + # 1. Generate CoT (通用 Generator) + self.op_gen_cot = PromptTemplatedVQAGenerator( + serving=self.vlm_serving, + system_prompt="You are a helpful assistant.", + prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE) + ) + + # 2. Extract Clean CoT (通用 Refiner + Helper) + self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic) + + # 3. Extract Keywords (通用 Refiner + Helper) + self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic) + + # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch) + self.op_bbox_gen = VLMBBoxGenerator( + serving=self.vlm_serving, + prompt_template=DEFAULT_BBOX_PROMPT + ) + + # 5. 
Inject GCoT (通用 Refiner + Helper) + self.op_inject = FunctionalRefiner(func=inject_bboxes_logic) + + def forward(self): + print(">>> [Pipeline] Step 1: Generating CoT...") + self.op_gen_cot.run( + self.storage.step(), + input_image_key=self.keys["img"], + output_answer_key=self.keys["raw_cot"], + question=self.keys["q"], # Template mapping + answer=self.keys["a"] + ) + + print(">>> [Pipeline] Step 2: Parsing Outputs...") + self.op_extract_cot.run( + self.storage.step(), + output_key=self.keys["clean_cot"], + text=self.keys["raw_cot"] # Param mapping + ) + self.op_extract_kws.run( + self.storage.step(), + output_key=self.keys["keywords"], + text=self.keys["raw_cot"] + ) + + print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...") + self.op_bbox_gen.run( + self.storage.step(), + input_image_key=self.keys["img"], + input_kws_key=self.keys["keywords"], + output_key=self.keys["bbox_map"] + ) + + print(">>> [Pipeline] Step 4: Injecting GCoT...") + self.op_inject.run( + self.storage.step(), + output_key=self.keys["final"], + cot_text=self.keys["clean_cot"], + bbox_map=self.keys["bbox_map"] + ) + + print(f">>> [Pipeline] Done. 
Final GCoT saved to: {self.keys['final']}") + + +if __name__ == "__main__": + pipe = ImageGCoTPipeline( + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl" + ) + pipe.forward() + + +``` From b697def2cc32ce160e3174d90a6a203d5a195c2f Mon Sep 17 00:00:00 2001 From: HankYang Date: Thu, 26 Feb 2026 18:20:51 +0800 Subject: [PATCH 2/2] fix doc --- docs/.vuepress/notes/en/mm_guide.ts | 4 + docs/.vuepress/notes/zh/mm_guide.ts | 4 + .../image_understanding/image_gcot.md | 2 - .../image_understanding/image_gcot_api.md | 17 +- .../image_scale_caption_pipeline.md | 302 ++++++++--- .../image_scale_caption_pipeline_api.md | 477 ++++++++++++++++++ .../image_visual_only_mcq_pipeline.md | 206 +++++--- .../image_visual_only_mcq_pipeline_api.md | 341 +++++++++++++ .../vision_mct_reasoning_pipeline.md | 185 +++++-- .../vision_mct_reasoning_pipeline_api.md | 248 +++++++++ .../image_understanding/image_gcot.md | 2 - .../image_understanding/image_gcot_api.md | 18 +- .../image_scale_caption_pipeline.md | 247 +++++++-- .../image_scale_caption_pipeline_api.md | 477 ++++++++++++++++++ .../image_visual_only_mcq_pipeline.md | 161 ++++-- .../image_visual_only_mcq_pipeline_api.md | 339 +++++++++++++ .../vision_mct_reasoning_pipeline.md | 128 +++-- .../vision_mct_reasoning_pipeline_api.md | 248 +++++++++ 18 files changed, 3053 insertions(+), 353 deletions(-) create mode 100644 docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md create mode 100644 docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md create mode 100644 docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md create mode 100644 docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md create mode 100644 docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md create mode 100644 docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md diff --git 
a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts index 8a5469cc..ab560546 100644 --- a/docs/.vuepress/notes/en/mm_guide.ts +++ b/docs/.vuepress/notes/en/mm_guide.ts @@ -28,11 +28,15 @@ export const MMGuide: ThemeNote = defineNoteConfig({ 'context_vqa', 'context_vqa_api', 'image_gcot', + 'image_gcot_api', 'vision_mct_reasoning_pipeline', + 'vision_mct_reasoning_pipeline_api', 'image_region_caption_pipeline', 'image_region_caption_pipeline_api', 'image_scale_caption_pipeline', + 'image_scale_caption_pipeline_api', 'image_visual_only_mcq_pipeline', + 'image_visual_only_mcq_pipeline_api', ], }, { diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts index 21bece4d..aa439f3a 100644 --- a/docs/.vuepress/notes/zh/mm_guide.ts +++ b/docs/.vuepress/notes/zh/mm_guide.ts @@ -28,11 +28,15 @@ export const MMGuide: ThemeNote = defineNoteConfig({ 'context_vqa', 'context_vqa_api', 'image_gcot', + 'image_gcot_api', 'vision_mct_reasoning_pipeline', + 'vision_mct_reasoning_pipeline_api', 'image_region_caption_pipeline', 'image_region_caption_pipeline_api', 'image_scale_caption_pipeline', + 'image_scale_caption_pipeline_api', 'image_visual_only_mcq_pipeline', + 'image_visual_only_mcq_pipeline_api', ], }, { diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot.md b/docs/en/notes/mm_guide/image_understanding/image_gcot.md index 68569d6a..636d7371 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_gcot.md +++ b/docs/en/notes/mm_guide/image_understanding/image_gcot.md @@ -411,6 +411,4 @@ if __name__ == "__main__": download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", ) pipe.forward() - - ``` diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md index 1df75290..3499879e 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md +++ b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md @@ -1,14 +1,14 @@ 
--- -title: Image Grounded CoT (GCoT) Pipeline +title: Image Grounded CoT (GCoT) Pipeline (API version) icon: mdi:image-text createTime: 2026/01/11 20:44:55 -permalink: /en/mm_guide/image_gcot/ +permalink: /en/mm_guide/image_gcot_api/ --- ## 1. Overview The **Image Grounded Chain-of-Thought (GCoT) Pipeline** is designed to automatically generate **Grounded Chain-of-Thought** data. This pipeline generates multi-step reasoning to answer a question and simultaneously spatially locates (via Bounding Boxes) the key objects mentioned during the reasoning process. This significantly enhances the interpretability and precision of multimodal data. -Unlike traditional methods, this pipeline uses a **Single VLM (e.g., Qwen2.5-VL)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient. +Unlike traditional methods, this pipeline uses a **Single VLM (e.g., GPT-5)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient. 
We support the following application scenarios: @@ -67,7 +67,7 @@ os.environ["DF_API_KEY"] = "your_api_key" ### Step 5: Configure Parameters -Configure the API service and input data paths in `api_pipelines/image_region_caption_api_pipeline.py`: +Configure the API service and input data paths in `api_pipelines/image_gcot_api_pipeline.py`: ```python def __init__( @@ -76,16 +76,20 @@ Configure the API service and input data paths in `api_pipelines/image_region_ca first_entry_file: str, cache_path: str = "../cache/cache_gcot", file_name_prefix: str = "gcot", - # Keys question_key: str = "question", answer_key: str = "answer", image_key: str = "image", output_key: str = "gcot", - # Config vllm_max_tokens: int = 512 ): ``` +```python + pipe = ImageGCoTPipeline( + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl" + ) +``` + ```python self.vlm_serving = APIVLMServing_openai( api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format @@ -395,5 +399,4 @@ if __name__ == "__main__": first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl" ) pipe.forward() - ``` diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md index 8dc770ae..755b4a76 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md +++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md @@ -1,72 +1,147 @@ --- -title: ScaleCap High-Density Captioning Pipeline -createTime: 2026/01/11 22:08:57 +title: ScaleCap High-Density Caption Pipeline icon: mdi:image-text +createTime: 2026/01/11 22:08:57 permalink: /en/mm_guide/image_scale_caption_pipeline/ --- + ## 1. Overview -The **ScaleCap High-Density Captioning Pipeline** implements an advanced **"Generate-Verify-Expand-Fuse"** paradigm for image captioning. 
This pipeline is designed to generate **extremely high information density** captions with **minimal hallucinations**, making it ideal for scenarios requiring deep understanding of image details. +The **Image Scale Caption Pipeline (ScaleCap)** is an advanced image captioning solution based on a **"Generate-Verify-Expand-Integrate"** paradigm. This pipeline is designed to generate image descriptions with **extremely high information density** and **ultra-low hallucination rates**, making it particularly suitable for scenarios requiring deep understanding of image details. -Based on the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*, this method progressively mines object and position details through multi-turn dialogue and visual self-verification (Visual Grounding), filtering out hallucinations along the way. +The theoretical foundation of this method is derived from the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*. It gradually uncovers object and spatial details through multi-turn dialogue and visual grounding, effectively filtering out hallucinations produced by the model. We support the following application scenarios: * **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions. -* **Fine-Grained Image Retrieval**: Providing index text rich in detail. -* **Accessibility/Blind Assistance**: Generating "What You See Is What You Get" (WYSIWYG) detailed narrations. +* **Fine-Grained Image Retrieval**: Providing highly detailed text for indexing. +* **Blind Assistance / Image Accessibility**: Generating "what-you-see-is-what-you-get" detailed narrations. The main process of the pipeline includes: -1. **Initial Caption Generation**: VLM generates a baseline description. -2. **Visual Debiasing**: Splitting the description into sentences and verifying each sentence against visual evidence (Visual Grounding). -3. 
**Detail Expansion**: Generating follow-up questions about object attributes and positions based on verified "Golden Sentences". -4. **Answering & Re-verification**: VLM answers the questions and performs another round of visual grounding to filter incorrect details. -5. **Final Fusion**: Merging all verified information into a coherent, long description. +1. **Initial Caption Generation**: The VLM generates a basic description. +2. **Visual Debiasing**: The description is split into sentences, and each is verified against visual evidence (Visual Grounding). +3. **Detail Questioning**: Targeted questions regarding object attributes and spatial relations are generated based on the verified "Golden Sentences". +4. **Answering & Secondary Verification**: The VLM answers the detail questions, followed by another round of visual grounding to filter out incorrect details. +5. **Final Integration**: All verified information is woven into a coherent, comprehensive long caption. --- ## 2. Quick Start -### Step 1: Create a Working Directory +### Step 1: Create a New DataFlow Working Directory ```bash -mkdir run_scalecap -cd run_scalecap +mkdir run_dataflow +cd run_dataflow ``` -### Step 2: Prepare the Script +### Step 2: Initialize DataFlow-MM -Save the code in the "Pipeline Example" section below as `scalecap_pipeline.py`. +```bash +dataflowmm init -### Step 3: Configure Parameters +``` -Ensure the VLM model path (e.g., Qwen2.5-VL) is correct. 
+You will then see: ```bash -# Install dependencies -pip install open-dataflow vllm +gpu_pipelines/image_scale_caption_pipeline.py ``` -### Step 4: Run +### Step 3: Download Sample Data ```bash -python scalecap_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_jsonl "data/images.jsonl" \ - --output_key "final_caption" +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data ``` +### Step 4: Configure Parameters + +```python +if __name__ == "__main__": + pipe = ImageScaleCaptionPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + device="cuda", + first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path="../cache/image_scale_caption", + file_name_prefix="scalecap", + input_image_key="image", + output_key="final_caption", + vllm_tensor_parallel_size=1, + vllm_max_tokens=1024 + ) + pipe.forward() + +``` + +> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):** +> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it. +> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading. 
+> +> + +### Step 5: Run + +```bash +cd gpu_pipelines +python image_scale_caption_pipeline.py + +``` + +> **🛠️ Troubleshooting** +> **Issue 1:** If you encounter a CUDA library conflict error similar to the following: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python image_scale_caption_pipeline.py +> +> ``` +> +> +> **Issue 2:** If you are using **Qwen series models** and encounter the following error: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`. +> **Before modification:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> +> **After modification:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> + --- ## 3. Data Flow & Logic ### 1. **Input Data** -The input data requires only the image path: +The input data for this process is very simple, requiring only the image path: * **image**: Path to the image file. @@ -74,69 +149,69 @@ The input data requires only the image path: ```json { - "image": "./images/complex_scene.jpg" + "image": "../example_data/capsbench_images/0.png" } ``` ### 2. **Core Operator Logic** -This pipeline is a complex orchestration of multiple atomic operators: +This pipeline orchestrates multiple fine-grained operators to achieve the complex ScaleCap logic: #### A. 
**Initial Generation (PromptedVQAGenerator)** -* **Function**: Generates a preliminary description (`init_caption`) of the image using a basic prompt. +* **Function**: Uses a basic prompt to generate a preliminary description of the image (`init_caption`). #### B. **Visual Debiasing (VisualGroundingRefiner)** * **Function**: The core anti-hallucination mechanism of ScaleCap. * **Logic**: -1. Uses `split_sentences` to break the draft into single sentences. +1. Uses `split_sentences` to break the initial draft into single sentences. 2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?". -3. Keeps only sentences where the answer is "Yes", forming **"Golden Sentences"**. +3. Retains only the sentences that receive a "Yes", forming **"Golden Sentences"**. #### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)** -* **Function**: Generates targeted follow-up questions based on Golden Sentences using LLM capabilities. -* **Logic**: The model generates text like "Describe more details about the [Object]", which is then automatically expanded into **Object Detail** and **Positional Relation** questions via `parse_questions_logic`. +* **Function**: Uses LLM capabilities to generate targeted follow-up questions based on the Golden Sentences. +* **Logic**: The model generates text like "Describe more details about the [Object]". The `parse_questions_logic` function automatically expands these into two categories: **object details** and **spatial relationships**. -#### D. **Batch Answering & Refiltering (BatchVQAGenerator & Refiner)** +#### D. **Batch Answering & Secondary Filtering (BatchVQAGenerator & Refiner)** -* **Function**: Mining deep image information. +* **Function**: Deeply mines visual information. * **Logic**: -1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a batch. -2. Uses `VisualGroundingRefiner` again to check if these new details are accurate. +1. 
Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a single batch. +2. Uses `VisualGroundingRefiner` again to verify if these newly generated details are accurate. 3. Retains reliable details (`final_details`). -#### E. **Final Fusion (PromptTemplatedQAGenerator)** +#### E. **Final Integration (PromptTemplatedQAGenerator)** -* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent text. +* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent, cohesive text. * **Output**: `final_caption`. ### 3. **Output Data** -The output data records the entire pipeline process, facilitating debugging and analysis: +The output data records the entire pipeline process for easy debugging and analysis: -* **init_caption**: Raw generated draft. -* **golden_sentences**: List of sentences that passed the first check. +* **init_caption**: The original initial draft. +* **golden_sentences**: List of sentences that passed the first debiasing check. * **q_list**: List of generated follow-up questions. -* **final_details**: Detailed answers that passed the second check. +* **final_details**: Detailed answers that passed the secondary check. * **final_caption**: The final high-density description. **Output Data Example**: ```json { - "image": "./images/complex_scene.jpg", + "image": "../example_data/capsbench_images/0.png", "init_caption": "A dog sitting on a bench.", "golden_sentences": ["A dog is sitting on a wooden bench."], - "q_list": ["Describe more details about the dog.", "Describe position of the bench."], + "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."], "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."], - "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..." 
+ "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park." } ``` @@ -145,7 +220,7 @@ The output data records the entire pipeline process, facilitating debugging and ## 4. Pipeline Example -Below is the complete `ImageScaleCaptionPipeline` code implementation. +Below is the complete `ImageScaleCaptionPipeline` code implementation (GPU Version). ```python import re @@ -153,12 +228,79 @@ import argparse from typing import Callable, Any, List from dataflow.utils.storage import FileStorage + from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate from dataflow.prompts.image import ImageScaleCaptionPrompt + from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner + +def split_sentences(text: str) -> List[str]: + """将文本拆分为句子列表""" + if not text or not isinstance(text, str): + return [] + # 使用正则按标点符号分割 (. ! ? 。 ! ?) + _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+") + parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()] + return parts or ([text.strip()] if text.strip() else []) + +def join_list(data: Any, separator: str = "\n") -> str: + """将列表连接为字符串""" + if isinstance(data, list): + # 过滤掉非字符串元素或空字符串 + valid_items = [str(x) for x in data if x] + return separator.join(valid_items) + return str(data) if data is not None else "" + +def parse_questions_logic(text: str, max_q: int = 20) -> List[str]: + """ + 解析 LLM 生成的 "Describe more details about..." 文本, + 并自动扩展 position 问题。 + """ + if not text or not isinstance(text, str): + return [] + + lines = [t.strip() for t in text.split("\n") if t.strip()] + obj_qs = [] + + for line in lines: + # 提取包含 "Describe more details about" 的行 + if "Describe more details about" in line: + # 去除可能的序号 (如 "1. 
Describe...") + try: + start_idx = line.find("Describe") + clean = line[start_idx:] + # 去除句末多余内容,保留到第一个句号 + if "." in clean: + clean = clean.split(".")[0] + "." + obj_qs.append(clean) + except Exception: + continue + + # 去重并保持顺序 + seen = set() + unique_obj_qs = [] + for q in obj_qs: + if q not in seen: + unique_obj_qs.append(q) + seen.add(q) + + # 截断 + unique_obj_qs = unique_obj_qs[:max_q] + + # 扩展 Position 问题 + pos_qs = [ + q.replace("Describe more details about", "Describe more details about the position of") + for q in unique_obj_qs + ] + + # 返回合并后的列表 (对象问题 + 位置问题) + return unique_obj_qs + pos_qs + + class ImageScaleCaptionPipeline: def __init__( self, @@ -210,14 +352,19 @@ class ImageScaleCaptionPipeline: # ================== Operator Initialization ================== # --- Step A: Generate Init Caption --- + # 构造固定 Prompt 列 self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"]) + + # 生成初稿 (使用通用 PromptedVQAGenerator) self.gen_init_caption = PromptedVQAGenerator( serving=self.serving, system_prompt="You are a helpful assistant." 
) # --- Step B: Refine Golden Sentences --- + # 分句 self.refine_split = FunctionalRefiner(func=split_sentences) + # 视觉自检 (保留 Yes 的句子) self.refine_golden = VisualGroundingRefiner( serving=self.serving, @@ -225,7 +372,10 @@ class ImageScaleCaptionPipeline: ) # --- Step C: Generate Questions --- + # 列表转字符串 self.refine_join = FunctionalRefiner(func=join_list) + + # 文本生成问题 (Text-to-Text) tpl_q = NamedPlaceholderPromptTemplate( template=self.prompts_db["LLM_PROMPT_1"], join_list_with="\n" @@ -234,16 +384,22 @@ class ImageScaleCaptionPipeline: serving=self.serving, prompt_template=tpl_q ) + + # 解析问题文本为列表 self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic) # --- Step D: Generate Answers --- + # 批量回答 (One Image -> Many Qs) self.gen_answers = BatchVQAGenerator(serving=self.serving) + + # 回答过滤 self.refine_answers = VisualGroundingRefiner( serving=self.serving, prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no." 
) # --- Step E: Integrate Final Caption --- + # 融合 (Text-to-Text) tpl_final = NamedPlaceholderPromptTemplate( template=self.prompts_db["LLM_PROMPT_4"], join_list_with="\n" @@ -255,6 +411,7 @@ class ImageScaleCaptionPipeline: def forward(self): print(">>> [Pipeline] Step 0: Preparing Prompts...") + # 构造 init_prompt 列 self.refine_const_prompt.run( self.storage.step(), output_key="init_prompt" @@ -287,11 +444,14 @@ class ImageScaleCaptionPipeline: output_key="golden_str", data="golden_sentences" ) + + # template: "{sentence}" -> map to col "golden_str" self.gen_questions_text.run( self.storage.step(), output_answer_key="raw_q_text", sentence="golden_str" ) + self.refine_parse_qs.run( self.storage.step(), output_key="q_list", @@ -305,6 +465,7 @@ class ImageScaleCaptionPipeline: input_image_key=self.input_image_key, output_key="raw_answers" ) + self.refine_answers.run( self.storage.step(), input_list_key="raw_answers", @@ -318,48 +479,35 @@ class ImageScaleCaptionPipeline: output_key="details_str", data="final_details" ) + + # template keys: context, object_info, position_info self.gen_final_caption.run( self.storage.step(), output_answer_key=self.output_key, context="golden_str", object_info="details_str", - position_info="details_str" + position_info="details_str" # 简化:同时作为 object 和 position 信息 ) print(f">>> [Pipeline] All Done. 
Result saved to: {self.storage.cache_path}") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline") - - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") - parser.add_argument("--download_dir", default="./ckpt/models") - parser.add_argument("--device", default="cuda") - - parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") - parser.add_argument("--cache_path", default="./cache_scalecap_results") - parser.add_argument("--file_name_prefix", default="scalecap") - parser.add_argument("--input_image_key", default="image") - parser.add_argument("--output_key", default="final_caption") - - parser.add_argument("--tp", type=int, default=1) - parser.add_argument("--max_tokens", type=int, default=1024) - - args = parser.parse_args() - pipe = ImageScaleCaptionPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - device=args.device, - first_entry_file=args.input_jsonl, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - input_image_key=args.input_image_key, - output_key=args.output_key, - vllm_tensor_parallel_size=args.tp, - vllm_max_tokens=args.max_tokens + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + device="cuda", + + first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path="../cache/image_scale_caption", + file_name_prefix="scalecap", + + input_image_key="image", + output_key="final_caption", + + vllm_tensor_parallel_size=1, + vllm_max_tokens=1024 ) pipe.forward() diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md new file mode 
100644 index 00000000..87ad3fa7 --- /dev/null +++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md @@ -0,0 +1,477 @@ +--- +title: ScaleCap High-Density Caption Pipeline (API version) +icon: mdi:image-text +createTime: 2026/01/11 22:08:57 +permalink: /en/mm_guide/image_scale_caption_pipeline_api/ +--- + +## 1. Overview + +The **Image Scale Caption Pipeline (ScaleCap)** is an advanced image captioning solution based on a **"Generate-Verify-Expand-Integrate"** paradigm. This pipeline is designed to generate image descriptions with **extremely high information density** and **ultra-low hallucination rates**, making it particularly suitable for scenarios requiring deep understanding of image details. + +The theoretical foundation of this method is derived from the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*. It gradually uncovers object and spatial details through multi-turn dialogue and visual grounding, effectively filtering out hallucinations produced by the model. + +We support the following application scenarios: + +* **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions. +* **Fine-Grained Image Retrieval**: Providing highly detailed text for indexing. +* **Blind Assistance / Image Accessibility**: Generating "what-you-see-is-what-you-get" detailed narrations. + +The main process of the pipeline includes: + +1. **Initial Caption Generation**: The VLM generates a basic description. +2. **Visual Debiasing**: The description is split into sentences, and each is verified against visual evidence (Visual Grounding). +3. **Detail Questioning**: Targeted questions regarding object attributes and spatial relations are generated based on the verified "Golden Sentences". +4. 
**Answering & Secondary Verification**: The VLM answers the detail questions, followed by another round of visual grounding to filter out incorrect details. +5. **Final Integration**: All verified information is woven into a coherent, comprehensive long caption. + +--- + +## 2. Quick Start + +### Step 1: Create a New DataFlow Working Directory + +```bash +mkdir run_dataflow +cd run_dataflow + +``` + +### Step 2: Initialize DataFlow-MM + +```bash +dataflowmm init + +``` + +You will then see: + +```bash +api_pipelines/image_scale_caption_api_pipeline.py + +``` + +### Step 3: Download Sample Data + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### Step 4: Configure API Key + +Set your API Key environment variable in `api_pipelines/image_scale_caption_api_pipeline.py`: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + +### Step 5: Configure Parameters + +Configure the API service and input data paths in `api_pipelines/image_scale_caption_api_pipeline.py`: + +```python + def __init__( + self, + # Storage params + first_entry_file: str = "../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path: str = "../cache/image_scale_caption", + file_name_prefix: str = "scalecap", + cache_type: str = "jsonl", + # Keys + input_image_key: str = "image", + output_key: str = "final_caption", + ): + +``` + +```python + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + +``` + +### Step 6: Run with One Command + +```bash +cd api_pipelines +python image_scale_caption_api_pipeline.py + +``` + +--- + +## 3. Data Flow & Logic + +### 1. 
**Input Data** + +The input data for this process is very simple, requiring only the image path: + +* **image**: Path to the image file. + +**Input Data Example**: + +```json +{ + "image": "../example_data/capsbench_images/0.png" +} + +``` + +### 2. **Core Operator Logic** + +This pipeline orchestrates multiple fine-grained operators to achieve the complex ScaleCap logic: + +#### A. **Initial Generation (PromptedVQAGenerator)** + +* **Function**: Uses a basic prompt to generate a preliminary description of the image (`init_caption`). + +#### B. **Visual Debiasing (VisualGroundingRefiner)** + +* **Function**: The core anti-hallucination mechanism of ScaleCap. +* **Logic**: +1. Uses `split_sentences` to break the initial draft into single sentences. +2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?". +3. Retains only the sentences that receive a "Yes", forming **"Golden Sentences"**. + + + +#### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)** + +* **Function**: Uses LLM capabilities to generate targeted follow-up questions based on the Golden Sentences. +* **Logic**: The model generates text like "Describe more details about the [Object]". The `parse_questions_logic` function automatically expands these into two categories: **object details** and **spatial relationships**. + +#### D. **Batch Answering & Secondary Filtering (BatchVQAGenerator & Refiner)** + +* **Function**: Deeply mines visual information. +* **Logic**: +1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a single batch. +2. Uses `VisualGroundingRefiner` again to verify if these newly generated details are accurate. +3. Retains reliable details (`final_details`). + + + +#### E. **Final Integration (PromptTemplatedQAGenerator)** + +* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent, cohesive text. +* **Output**: `final_caption`. + +### 3. 
**Output Data** + +The output data records the entire pipeline process for easy debugging and analysis: + +* **init_caption**: The original initial draft. +* **golden_sentences**: List of sentences that passed the first debiasing check. +* **q_list**: List of generated follow-up questions. +* **final_details**: Detailed answers that passed the secondary check. +* **final_caption**: The final high-density description. + +**Output Data Example**: + +```json +{ + "image": "../example_data/capsbench_images/0.png", + "init_caption": "A dog sitting on a bench.", + "golden_sentences": ["A dog is sitting on a wooden bench."], + "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."], + "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."], + "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park." +} + +``` + +--- + +## 4. Pipeline Example + +Below is the complete `ImageScaleCaptionPipeline` code implementation (API Version). + +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" + + +import re +import argparse +from typing import Callable, Any, List + +from dataflow.utils.storage import FileStorage + +from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate +from dataflow.prompts.image import ImageScaleCaptionPrompt + +from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner +from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai + +def split_sentences(text: str) -> List[str]: + """将文本拆分为句子列表""" + if not text or not isinstance(text, str): + return [] + # 使用正则按标点符号分割 (. ! ? 。 ! ?) 
+ _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+") + parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()] + return parts or ([text.strip()] if text.strip() else []) + +def join_list(data: Any, separator: str = "\n") -> str: + """将列表连接为字符串""" + if isinstance(data, list): + # 过滤掉非字符串元素或空字符串 + valid_items = [str(x) for x in data if x] + return separator.join(valid_items) + return str(data) if data is not None else "" + +def parse_questions_logic(text: str, max_q: int = 20) -> List[str]: + """ + 解析 LLM 生成的 "Describe more details about..." 文本, + 并自动扩展 position 问题。 + """ + if not text or not isinstance(text, str): + return [] + + lines = [t.strip() for t in text.split("\n") if t.strip()] + obj_qs = [] + + for line in lines: + # 提取包含 "Describe more details about" 的行 + if "Describe more details about" in line: + # 去除可能的序号 (如 "1. Describe...") + try: + start_idx = line.find("Describe") + clean = line[start_idx:] + # 去除句末多余内容,保留到第一个句号 + if "." in clean: + clean = clean.split(".")[0] + "." + obj_qs.append(clean) + except Exception: + continue + + # 去重并保持顺序 + seen = set() + unique_obj_qs = [] + for q in obj_qs: + if q not in seen: + unique_obj_qs.append(q) + seen.add(q) + + # 截断 + unique_obj_qs = unique_obj_qs[:max_q] + + # 扩展 Position 问题 + pos_qs = [ + q.replace("Describe more details about", "Describe more details about the position of") + for q in unique_obj_qs + ] + + # 返回合并后的列表 (对象问题 + 位置问题) + return unique_obj_qs + pos_qs + + +class ImageScaleCaptionPipeline: + def __init__( + self, + # Storage params + first_entry_file: str = "images.jsonl", + cache_path: str = "./cache_scalecap", + file_name_prefix: str = "scalecap", + cache_type: str = "jsonl", + # Keys + input_image_key: str = "image", + output_key: str = "final_caption", + # VLLM Config + vllm_tensor_parallel_size: int = 1, + vllm_temperature: float = 0.7, + vllm_top_p: float = 0.9, + vllm_max_tokens: int = 512, + ): + # 1. 
Storage + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type=cache_type, + ) + + # 2. Serving + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + # 3. Prompts + self.prompts_db = ImageScaleCaptionPrompt().build_prompt() + + # 4. Keys + self.input_image_key = input_image_key + self.output_key = output_key + + # ================== Operator Initialization ================== + + # --- Step A: Generate Init Caption --- + # 构造固定 Prompt 列 + self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"]) + + # 生成初稿 (使用通用 PromptedVQAGenerator) + self.gen_init_caption = PromptedVQAGenerator( + serving=self.vlm_serving, + system_prompt="You are a helpful assistant." + ) + + # --- Step B: Refine Golden Sentences --- + # 分句 + self.refine_split = FunctionalRefiner(func=split_sentences) + + # 视觉自检 (保留 Yes 的句子) + self.refine_golden = VisualGroundingRefiner( + serving=self.vlm_serving, + prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no." 
+ ) + + # --- Step C: Generate Questions --- + # 列表转字符串 + self.refine_join = FunctionalRefiner(func=join_list) + + # 文本生成问题 (Text-to-Text) + tpl_q = NamedPlaceholderPromptTemplate( + template=self.prompts_db["LLM_PROMPT_1"], + join_list_with="\n" + ) + self.gen_questions_text = PromptTemplatedQAGenerator( + serving=self.vlm_serving, + prompt_template=tpl_q + ) + + # 解析问题文本为列表 + self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic) + + # --- Step D: Generate Answers --- + # 批量回答 (One Image -> Many Qs) + self.gen_answers = BatchVQAGenerator(serving=self.vlm_serving) + + # 回答过滤 + self.refine_answers = VisualGroundingRefiner( + serving=self.vlm_serving, + prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no." + ) + + # --- Step E: Integrate Final Caption --- + # 融合 (Text-to-Text) + tpl_final = NamedPlaceholderPromptTemplate( + template=self.prompts_db["LLM_PROMPT_4"], + join_list_with="\n" + ) + self.gen_final_caption = PromptTemplatedQAGenerator( + serving=self.vlm_serving, + prompt_template=tpl_final + ) + + def forward(self): + print(">>> [Pipeline] Step 0: Preparing Prompts...") + # 构造 init_prompt 列 + self.refine_const_prompt.run( + self.storage.step(), + output_key="init_prompt" + ) + + print(">>> [Pipeline] Step 1: Generating Initial Caption...") + self.gen_init_caption.run( + self.storage.step(), + input_prompt_key="init_prompt", + input_image_key=self.input_image_key, + output_answer_key="init_caption" + ) + + print(">>> [Pipeline] Step 2: Refining Golden Sentences...") + self.refine_split.run( + self.storage.step(), + output_key="sentences", + text="init_caption" + ) + self.refine_golden.run( + self.storage.step(), + input_list_key="sentences", + input_image_key=self.input_image_key, + output_key="golden_sentences" + ) + + print(">>> [Pipeline] Step 3: Generating Details Questions...") + self.refine_join.run( + self.storage.step(), + output_key="golden_str", + 
data="golden_sentences" + ) + + # template: "{sentence}" -> map to col "golden_str" + self.gen_questions_text.run( + self.storage.step(), + output_answer_key="raw_q_text", + sentence="golden_str" + ) + + self.refine_parse_qs.run( + self.storage.step(), + output_key="q_list", + text="raw_q_text" + ) + + print(">>> [Pipeline] Step 4: Generating & Filtering Answers...") + self.gen_answers.run( + self.storage.step(), + input_prompts_key="q_list", + input_image_key=self.input_image_key, + output_key="raw_answers" + ) + + self.refine_answers.run( + self.storage.step(), + input_list_key="raw_answers", + input_image_key=self.input_image_key, + output_key="final_details" + ) + + print(">>> [Pipeline] Step 5: Integrating Final Caption...") + self.refine_join.run( + self.storage.step(), + output_key="details_str", + data="final_details" + ) + + # template keys: context, object_info, position_info + self.gen_final_caption.run( + self.storage.step(), + output_answer_key=self.output_key, + context="golden_str", + object_info="details_str", + position_info="details_str" # 简化:同时作为 object 和 position 信息 + ) + + print(f">>> [Pipeline] All Done. 
Result saved to: {self.storage.cache_path}") + + +if __name__ == "__main__": + + pipe = ImageScaleCaptionPipeline( + first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path="../cache/image_scale_caption", + file_name_prefix="scalecap", + input_image_key="image", + output_key="final_caption", + vllm_tensor_parallel_size=1, + vllm_max_tokens=1024 + ) + + pipe.forward() + +``` diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md index cc3806af..4e495489 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md +++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md @@ -4,27 +4,26 @@ createTime: 2026/01/11 22:13:45 icon: mdi:image-text permalink: /en/mm_guide/image_visual_only_mcq_pipeline/ --- + ## 1. Overview -The **Visual-Only MCQ Pipeline** is a core component of the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple Choice Questions (MCQs) that satisfy **strict visual dependency**: the model must "see" the image to answer correctly; answering based on text alone (guessing or common sense) is not possible. +The **Visual-Only MCQ Pipeline** is a core component within the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple-Choice Questions (MCQs) that strictly satisfy **strong visual dependency**: the model must "see" the image to answer correctly, and cannot rely merely on text guessing or common sense. -This pipeline uses a **Generate-Parse-Verify** three-step method, leveraging **Option Rotation** and **Blind Tests** to rigorously filter out hallucinations or overly simple questions. The generated questions serve as a robust reward signal for Reinforcement Learning. 
+This pipeline utilizes a **"Generate-Parse-Verify"** three-step approach, employing **Option Rotation** and **Blind Test (Text-Only)** mechanisms to rigorously filter out model hallucinations or overly simple questions. The generated questions can be used as reward signals (Reward Model) for reinforcement learning. The main process includes: -1. **MCQ Generation**: VLM generates raw QA pairs based on the image. -2. **Structured Parsing**: Using regex logic to parse text into standard question/option structures. +1. **MCQ Generation**: The VLM generates raw Question-Answer text blocks based on the image. +2. **Structured Parsing**: Uses regex logic to parse the raw text into standard question and option structures. 3. **Visual Dependency Verification**: -* **Rotation Test**: Shuffling options multiple times to eliminate positional bias. -* **Dual Filtering**: Requiring high "Visual Accuracy" and low "Text-only Accuracy". - - + * **Rotation Test**: Randomly shuffles the order of options multiple times to eliminate positional bias. + * **Dual Filtering**: Requires a high "Visual Accuracy" (with image) and a low "Textual Accuracy" (without image). --- ## 2. Quick Start -### Step 1: Create Working Directory +### Step 1: Create a New DataFlow Working Directory ```bash mkdir run_vis_mcq @@ -32,39 +31,109 @@ cd run_vis_mcq ``` -### Step 2: Prepare Script +### Step 2: Initialize DataFlow-MM -Save the code in the "Pipeline Example" section below as `visual_mcq_pipeline.py`. +```bash +dataflowmm init -### Step 3: Configure Parameters +``` -Control filtering thresholds via CLI. 
For example, requiring 100% visual accuracy and less than 25% blind accuracy: +You will then see: ```bash -# Install dependencies -pip install open-dataflow vllm +gpu_pipelines/image_visual_only_mcq_pipeline.py ``` -### Step 4: Run +### Step 3: Download Sample Data ```bash -python visual_mcq_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_file "data/captions.jsonl" \ - --rotate_num 4 \ - --pass_vis 1.0 \ - --pass_txt 0.25 +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### Step 4: Configure Parameters + +Configure the model path and filtering thresholds (e.g., requiring 100% visual accuracy and less than 25% textual accuracy): + +```python +if __name__ == "__main__": + pipe = VisualOnlyMCQPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 + ) + pipe.forward() ``` +> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):** +> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it. +> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading. 
+> +> + +### Step 5: Run + +```bash +cd gpu_pipelines +python image_visual_only_mcq_pipeline.py + +``` + +> **🛠️ Troubleshooting** +> **Issue 1:** If you encounter a CUDA library conflict error similar to the following: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python image_visual_only_mcq_pipeline.py +> +> ``` +> +> +> **Issue 2:** If you are using **Qwen series models** and encounter the following error: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`. +> **Before modification:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> +> **After modification:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> + --- ## 3. Data Flow & Logic ### 1. **Input Data** -Input only requires the image path: +The input data only requires the image path: * **image**: Path to the image file. @@ -79,35 +148,35 @@ Input only requires the image path: ### 2. **Core Operator Logic** -This pipeline chains three key operators: +This pipeline is chained together by three key operators: -#### A. **FixPromptedVQAGenerator (Raw Generation)** +#### A. **Raw Generation (FixPromptedVQAGenerator)** -* **Function**: Uses CapRL predefined Prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to generate 5 MCQs at once. -* **Output**: Unstructured text block containing multiple `#### Question` and options. 
+* **Function**: Uses the preset CapRL prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to instruct the VLM to generate 5 MCQs in one go. +* **Output**: Unstructured text blocks containing multiple `#### Question` headers and options. -#### B. **FunctionalRefiner (Regex Parsing)** +#### B. **Structured Parsing (FunctionalRefiner)** * **Logic Function**: `parse_mcq_text_logic` -* **Function**: Extracts questions, options (A-F), and correct answers from raw text using regex. -* **Output**: Structured MCQ list (`parsed_mcq_list`). +* **Function**: Extracts the questions, options (A-F), and correct answers from the raw text using regular expressions. +* **Output**: A structured list of MCQs (`parsed_mcq_list`). -#### C. **VisualDependencyRefiner (Dependency Verification)** +#### C. **Dependency Verification (VisualDependencyRefiner)** -This is the core filter. It performs N inferences (N = `rotate_num`) for each question: +This is the core filter of the pipeline. It performs N inferences (N = `rotate_num`) for each question: -1. **Option Rotation**: Randomly shuffles options (e.g., moving answer from A to C) to prevent the model from cheating by "always picking A". -2. **Visual Pass**: Input Image + Question. Records the model's accuracy. -3. **Textual Pass**: Input Question only (no image). Records the model's blind guessing accuracy. +1. **Option Rotation**: Randomly shuffles the option order (e.g., moving the answer from A to C) to prevent the model from cheating by "always choosing A". +2. **Visual Pass**: Inputs Image + Question. Records the proportion of correct answers. +3. **Textual Pass (Blind Test)**: Inputs Question only (No Image). Records the proportion of correct blind guesses. 4. **Filtering Criteria**: -* Keep the question IF AND ONLY IF: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`. 
-* *Example*: If a question can be answered correctly without the image (high text accuracy), it tests common sense rather than vision, so it is **discarded**. +* Retains the question if and only if: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`. +* *Example*: If a question can be answered correctly without looking at the image (high textual accuracy), it relies on common sense rather than visual info, and is **discarded**. ### 3. **Output Data** -The output data (`final_mcqs`) contains only questions that passed rigorous verification. These questions possess high quality and visual relevance. +The output data (`final_mcqs`) only contains questions that have passed the rigorous verification. These questions possess extremely high quality and visual relevance. **Output Data Example**: @@ -119,8 +188,8 @@ The output data (`final_mcqs`) contains only questions that passed rigorous veri "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...", "answer": "A", "stats": { - "visual_acc": 1.0, # 4/4 correct with image - "text_acc": 0.0 # 0/4 correct without image + "visual_acc": 1.0, + "text_acc": 0.0 } } ] @@ -132,12 +201,10 @@ The output data (`final_mcqs`) contains only questions that passed rigorous veri ## 4. Pipeline Example -Below is the complete `VisualOnlyMCQPipeline` code implementation. +Below is the complete `VisualOnlyMCQPipeline` code implementation (GPU Version). 
```python import argparse -import re -from typing import List, Dict, Any from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm @@ -145,13 +212,14 @@ from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDepend from dataflow.operators.core_text import FunctionalRefiner from dataflow.prompts.image import ImageCaprlPrompt -# 正则解析逻辑 +import re +from typing import List, Dict, Any + _Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M) _OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$") _ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I) def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]: - """将 VLM 生成的原始文本解析为结构化字典列表""" if not mcq_text or not isinstance(mcq_text, str): return [] indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)] @@ -213,7 +281,9 @@ class VisualOnlyMCQPipeline: model_path: str, *, first_entry_file: str, - cache_path: str = "./cache_mcq", + hf_cache_dir: str | None = None, + download_dir: str = "./ckpt/models", + cache_path: str = "../cache/cache_mcq", file_name_prefix: str = "vis_mcq", # Config rotate_num: int = 4, @@ -227,7 +297,6 @@ class VisualOnlyMCQPipeline: device: str = "cuda", vllm_max_tokens: int = 2048 ): - # 1. 初始化存储 self.storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, @@ -235,15 +304,16 @@ class VisualOnlyMCQPipeline: cache_type="jsonl" ) - # 2. 
初始化 VLM 服务 self.serving = LocalModelVLMServing_vllm( + hf_cache_dir=hf_cache_dir, + hf_local_dir=download_dir, hf_model_name_or_path=model_path, vllm_tensor_parallel_size=1, - vllm_temperature=0.1, # 低温度以保证格式稳定 + vllm_temperature=0.1, vllm_max_tokens=vllm_max_tokens ) - # Keys 配置 + # Keys self.keys = { "img": input_image_key, "raw_text": "raw_mcq_text", @@ -251,23 +321,24 @@ class VisualOnlyMCQPipeline: "final": output_key } - # 加载 Prompt 库 + # --- Prompts --- self.prompts_db = ImageCaprlPrompt().build_prompt() - # ================== 算子初始化 ================== + # ================== Operators ================== - # 算子 1: 生成原始 MCQ 文本 + # 1. Generate Raw MCQs (FixPromptedVQAGenerator) + # 直接使用 prompt 类中的字符串 self.op_gen_raw = FixPromptedVQAGenerator( serving=self.serving, system_prompt=self.prompts_db["SYS_PROMPT_MCQ"], user_prompt=self.prompts_db["USER_PROMPT_MCQ"] ) - # 算子 2: 解析文本为结构化数据 + # 2. Parse MCQs (Refine) self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic) - # 算子 3: 视觉依赖性验证 (核心过滤) - # 包含旋转 (Rotation) 和 无图检测 (Text-only check) + # 3. Verify Visual Dependency (Refine) + # 传入 prompt 模板 self.op_verify = VisualDependencyRefiner( serving=self.serving, instruction_template=self.prompts_db["ANSWER_INSTRUCTION"], @@ -304,22 +375,15 @@ class VisualOnlyMCQPipeline: print(f">>> [Pipeline] Done. 
Results in: {self.keys['final']}") if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--rotate_num", type=int, default=4) - parser.add_argument("--pass_vis", type=float, default=1.0) - parser.add_argument("--pass_txt", type=float, default=0.25) - - args = parser.parse_args() - pipe = VisualOnlyMCQPipeline( - model_path=args.model_path, - first_entry_file=args.input_file, - rotate_num=args.rotate_num, - pass_visual_min=args.pass_vis, - pass_textual_max=args.pass_txt + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 ) pipe.forward() -``` \ No newline at end of file +``` diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md new file mode 100644 index 00000000..054ade54 --- /dev/null +++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md @@ -0,0 +1,341 @@ +--- +title: Visual-Only MCQ Pipeline (API version) +createTime: 2026/01/11 22:13:45 +icon: mdi:image-text +permalink: /en/mm_guide/image_visual_only_mcq_pipeline_api/ +--- + +## 1. Overview + +The **Visual-Only MCQ Pipeline** is a core component within the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple-Choice Questions (MCQs) that strictly satisfy **strong visual dependency**: the model must "see" the image to answer correctly, and cannot rely merely on text guessing or common sense. 
+ +This pipeline utilizes a **"Generate-Parse-Verify"** three-step approach, employing **Option Rotation** and **Blind Test (Text-Only)** mechanisms to rigorously filter out model hallucinations or overly simple questions. The generated questions can be used as reward signals (Reward Model) for reinforcement learning. + +The main process includes: + +1. **MCQ Generation**: The VLM generates raw Question-Answer text blocks based on the image. +2. **Structured Parsing**: Uses regex logic to parse the raw text into standard question and option structures. +3. **Visual Dependency Verification**: + * **Rotation Test**: Randomly shuffles the order of options multiple times to eliminate positional bias. + * **Dual Filtering**: Requires a high "Visual Accuracy" (with image) and a low "Textual Accuracy" (without image). + +--- + +## 2. Quick Start + +### Step 1: Create a New DataFlow Working Directory + +```bash +mkdir run_vis_mcq +cd run_vis_mcq + +``` + +### Step 2: Initialize DataFlow-MM + +```bash +dataflowmm init + +``` + +You will then see: + +```bash +api_pipelines/image_visual_only_mcq_api_pipeline.py + +``` + +### Step 3: Download Sample Data + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### Step 4: Configure API Key + +Set your API Key environment variable in `api_pipelines/image_visual_only_mcq_api_pipeline.py`: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + +### Step 5: Configure Parameters + +Configure the API service and run parameters in `api_pipelines/image_visual_only_mcq_api_pipeline.py` (e.g., requiring 100% visual accuracy and at most 25% textual accuracy): + +```python + pipe = VisualOnlyMCQPipeline( + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 + ) + +``` + +### Step 6: Run with One Command + +```bash +cd api_pipelines +python
image_visual_only_mcq_api_pipeline.py + +``` + +--- + +## 3. Data Flow & Logic + +### 1. **Input Data** + +The input data only requires the image path: + +* **image**: Path to the image file. + +**Input Data Example**: + +```json +{ + "image": "./images/sample_01.jpg" +} + +``` + +### 2. **Core Operator Logic** + +This pipeline is chained together by three key operators: + +#### A. **Raw Generation (FixPromptedVQAGenerator)** + +* **Function**: Uses the preset CapRL prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to instruct the VLM to generate 5 MCQs in one go. +* **Output**: Unstructured text blocks containing multiple `#### Question` headers and options. + +#### B. **Structured Parsing (FunctionalRefiner)** + +* **Logic Function**: `parse_mcq_text_logic` +* **Function**: Extracts the questions, options (A-F), and correct answers from the raw text using regular expressions. +* **Output**: A structured list of MCQs (`parsed_mcq_list`). + +#### C. **Dependency Verification (VisualDependencyRefiner)** + +This is the core filter of the pipeline. It performs N inferences (N = `rotate_num`) for each question: + +1. **Option Rotation**: Randomly shuffles the option order (e.g., moving the answer from A to C) to prevent the model from cheating by "always choosing A". +2. **Visual Pass**: Inputs Image + Question. Records the proportion of correct answers. +3. **Textual Pass (Blind Test)**: Inputs Question only (No Image). Records the proportion of correct blind guesses. +4. **Filtering Criteria**: +* Retains the question if and only if: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`. +* *Example*: If a question can be answered correctly without looking at the image (high textual accuracy), it relies on common sense rather than visual info, and is **discarded**. + + + +### 3. **Output Data** + +The output data (`final_mcqs`) only contains questions that have passed the rigorous verification. 
These questions possess extremely high quality and visual relevance. + +**Output Data Example**: + +```json +{ + "image": "./images/sample_01.jpg", + "final_mcqs": [ + { + "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...", + "answer": "A", + "stats": { + "visual_acc": 1.0, + "text_acc": 0.0 + } + } + ] +} + +``` + +--- + +## 4. Pipeline Example + +Below is the complete `VisualOnlyMCQPipeline` code implementation (API Version). + +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" +import argparse +from dataflow.utils.storage import FileStorage +from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + +from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner +from dataflow.operators.core_text import FunctionalRefiner +from dataflow.prompts.image import ImageCaprlPrompt +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai +import re +from typing import List, Dict, Any + +_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M) +_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$") +_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I) + +def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]: + if not mcq_text or not isinstance(mcq_text, str): return [] + + indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)] + if not indices: return [] + indices.append(len(mcq_text)) + blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)] + + parsed = [] + for block in blocks: + lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()] + q_title_m = _Q_BLOCK_SPLIT.search(block) + if not q_title_m: continue + + q_title = q_title_m.group(1).strip() + options = {} + ans_letter, ans_text = None, None + + for ln in lines: + m_opt = _OPT_LINE_RE.match(ln) + if m_opt: + options[m_opt.group(1)] = m_opt.group(2).strip() + continue + m_ans = 
_ANS_LINE_RE.match(ln) + if m_ans: + ans_letter = m_ans.group(1).upper() + ans_text = m_ans.group(2).strip() + break + + if options and ans_letter and ans_letter in options: + q_lines = [q_title] + for lbl in ["A", "B", "C", "D", "E", "F"]: + if lbl in options: + q_lines.append(f" - {lbl}) {options[lbl]}") + + parsed.append({ + "question": "\n".join(q_lines), + "question_title": q_title, + "options": options, + "answer": ans_letter, + "answer_text": ans_text + }) + + if expected > 0: + parsed = parsed[:expected] + + uniq = [] + seen = set() + for it in parsed: + key = (it["question_title"], it["answer"]) + if key not in seen: + seen.add(key) + uniq.append(it) + return uniq + + +class VisualOnlyMCQPipeline: + def __init__( + self, + *, + first_entry_file: str, + cache_path: str = "../cache/cache_mcq", + file_name_prefix: str = "vis_mcq", + # Config + rotate_num: int = 4, + pass_visual_min: float = 1.0, + pass_textual_max: float = 0.25, + add_none_above: bool = True, + # Keys + input_image_key: str = "image", + output_key: str = "final_mcqs", + # VLLM + vllm_max_tokens: int = 2048 + ): + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type="jsonl" + ) + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + + # Keys + self.keys = { + "img": input_image_key, + "raw_text": "raw_mcq_text", + "parsed_list": "parsed_mcq_list", + "final": output_key + } + + # --- Prompts --- + self.prompts_db = ImageCaprlPrompt().build_prompt() + + # ================== Operators ================== + + # 1.
Generate Raw MCQs (FixPromptedVQAGenerator) + # Use the strings from the prompt class directly + self.op_gen_raw = FixPromptedVQAGenerator( + serving=self.vlm_serving, + system_prompt=self.prompts_db["SYS_PROMPT_MCQ"], + user_prompt=self.prompts_db["USER_PROMPT_MCQ"] + ) + + # 2. Parse MCQs (Refine) + self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic) + + # 3. Verify Visual Dependency (Refine) + # Pass in the prompt template + self.op_verify = VisualDependencyRefiner( + serving=self.vlm_serving, + instruction_template=self.prompts_db["ANSWER_INSTRUCTION"], + rotate_num=rotate_num, + pass_visual_min=pass_visual_min, + pass_textual_max=pass_textual_max, + add_none_above_visual=add_none_above + ) + + def forward(self): + print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...") + self.op_gen_raw.run( + self.storage.step(), + input_image_key=self.keys["img"], + output_answer_key=self.keys["raw_text"] + ) + + print(">>> [Pipeline] Step 2: Parsing MCQs...") + self.op_parse.run( + self.storage.step(), + output_key=self.keys["parsed_list"], + mcq_text=self.keys["raw_text"], + expected=5 + ) + + print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...") + self.op_verify.run( + self.storage.step(), + input_list_key=self.keys["parsed_list"], + input_image_key=self.keys["img"], + output_key=self.keys["final"] + ) + + print(f">>> [Pipeline] Done.
Results in: {self.keys['final']}") + +if __name__ == "__main__": + pipe = VisualOnlyMCQPipeline( + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 + ) + pipe.forward() + +``` diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md index 0904691e..799a867e 100644 --- a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md +++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md @@ -1,3 +1,11 @@ + +``` + +--- + +### 2. 英文 GPU 版 (English GPU Version) + +```markdown --- title: Vision MCTS Reasoning Pipeline icon: mdi:image-text @@ -7,27 +15,27 @@ permalink: /en/mm_guide/vision_mct_reasoning_pipeline/ ## 1. Overview -The **Vision MCTS Reasoning Pipeline** is designed to construct high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two types of data sources: existing Monte Carlo Tree Search (MCTS) trajectory data, or direct generation of new reasoning chains using a VLM. +The **Vision MCTS Reasoning Pipeline** is designed to build high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two sources of data: existing Monte Carlo Tree Search (MCTS) trajectory data, or generating new reasoning chains directly using a VLM. -This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**, converting complex tree-search processes into a linearized `......` format that models can learn from. +This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**. It "linearizes" complex tree-like search processes into a `......` format that the model can learn from. We support the following application scenarios: -* **MCTS Data Extraction**: Converting high-value paths (Rollouts) from search trees into linear training data. 
-* **Hybrid Data Construction**: Automatically falling back to VLM-based CoT generation for samples without search trees. -* **Spatial Reasoning Enhancement**: Supporting the generation of spatial reasoning chains containing explicit coordinates (Bounding Boxes). +* **Data Extraction from MCTS Trees**: Converts high-value paths (Rollouts) in the search tree into linear training data. +* **Hybrid Data Construction**: Automatically falls back to using the VLM for CoT generation for samples without a search tree. +* **Spatial Reasoning Enhancement**: Supports generating spatial reasoning chains that include explicit coordinates (Bounding Boxes). The main process of the pipeline includes: -1. **MCTS Tree Parsing**: Parsing the search tree structure in the input data to extract successful reasoning paths. -2. **Visual Reasoning Generation (Fallback)**: Using a VLM to regenerate reasoning chains for samples where the tree structure is missing or parsing fails. -3. **Data Standardization**: Outputting reasoning chain data in a unified format. +1. **MCTS Tree Parsing**: Parses the search tree structure in the input data and extracts successful reasoning paths. +2. **Visual Reasoning Generation (Fallback)**: For samples with missing tree structures or failed parsing, the VLM is used to regenerate the reasoning chain. +3. **Data Standardization**: Outputs reasoning chain data in a unified format. --- ## 2. Quick Start -### Step 1: Create a Working Directory +### Step 1: Create a New DataFlow Working Directory ```bash mkdir run_mcts_reasoning @@ -35,41 +43,111 @@ cd run_mcts_reasoning ``` -### Step 2: Prepare the Script +### Step 2: Initialize DataFlow-MM -Save the code in the "Pipeline Example" section below as `vision_mcts_pipeline.py`. +```bash +dataflowmm init -### Step 3: Configure Parameters +``` -Ensure the input file (jsonl) contains the `tree` field (for extraction) or just `question/image` (for generation). 
+You will then see: ```bash -# Install dependencies -pip install open-dataflow vllm +gpu_pipelines/vision_mcts_pipeline.py ``` -### Step 4: Run +### Step 3: Download Sample Data ```bash -python vision_mcts_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_file "data/mcts_trajectories.jsonl" \ - --prompt_type "spatial" +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data ``` +### Step 4: Configure Parameters + +Ensure the input file (jsonl) contains a `tree` field (for extraction) or just `question`/`image` (for generation). + +```python +if __name__ == "__main__": + pipe = VisionMCTSReasoningPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + ) + pipe.forward() + +``` + +> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):** +> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it. +> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading. 
+> +> + +### Step 5: Run + +```bash +cd gpu_pipelines +python vision_mcts_pipeline.py + +``` + +> **🛠️ Troubleshooting** +> **Issue 1:** If you encounter a CUDA library conflict error similar to the following: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python vision_mcts_pipeline.py +> +> ``` +> +> +> **Issue 2:** If you are using **Qwen series models** and encounter the following error: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`. +> **Before modification:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> +> **After modification:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> + --- ## 3. Data Flow & Logic ### 1. **Input Data** -Input data typically comes from MCTS search logs or unlabelled image-text pairs: +Input data typically originates from MCTS search process logs, or unannotated image-text pairs: * **image**: Path to the image. -* **question**: Visual question. -* **tree** (optional): JSON structure of the MCTS search tree, containing node values, visit counts, and actions. +* **question**: The visual question. +* **tree** (Optional): JSON structure of the MCTS search tree, containing node Values, Visits, and Actions. **Input Data Example**: @@ -84,30 +162,30 @@ Input data typically comes from MCTS search logs or unlabelled image-text pairs: ### 2. 
**Core Operator Logic** -The pipeline employs an **"Extract First, Fallback to Generate"** hybrid strategy: +This pipeline uses a hybrid strategy of **"Extraction First, Generation as Fallback"**: -#### A. **MCTSTreeRefiner** +#### A. **MCTSTreeRefiner (Tree Structure Parser)** -This operator is responsible for processing the `tree` field. It traverses the tree structure and filters for the best paths from root to leaf based on node Q-values. +This operator handles the `tree` field. It traverses the tree structure and filters out the best path from the root node to a leaf node based on the node's Q-value. * **Input**: `tree` object. -* **Functionality**: Linearizes tree paths, filtering out low-value or incomplete search branches. -* **Output**: List of extracted reasoning chains (`mcts_chains`). +* **Function**: Linearizes tree paths, filtering out low-value or incomplete search branches. +* **Output**: A list of extracted reasoning chains (`mcts_chains`). -#### B. **VisualReasoningGenerator** +#### B. **VisualReasoningGenerator (Visual Reasoning Generator)** -This operator is the "Generation Engine" of the pipeline. It takes the extraction results from the previous step as input. +This operator is the "generation engine" of the pipeline. It receives the extraction result from the previous step as input. * **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`). -* If MCTS parsing was successful (chains exist), it reuses them directly without running inference (saving compute). -* If MCTS chains are empty (tree missing or parsing failed), it calls the VLM to generate reasoning chains from scratch based on the `prompt_type`. +* If MCTS parsing is successful (chain exists), it is reused directly without inference (saving computational resources). +* If the MCTS chain is empty (tree does not exist or parsing failed), it calls the VLM to generate the reasoning chain from scratch based on `prompt_type` (e.g., `spatial`). 
-* **Prompt Type**: Supports modes like `spatial` (spatial coordinate reasoning), `logical` (logical reasoning), etc. +* **Prompt Types**: Supports modes like `spatial` (spatial coordinate reasoning) and `logical` (logical reasoning). ### 3. **Output Data** -The final output data (`final_reasoning_chains`) will contain high-quality Chain-of-Thought data ready for SFT training. +The finally generated output data (`final_reasoning_chains`) will contain high-quality chains of thought that can be directly used for SFT training. **Output Example**: @@ -125,9 +203,9 @@ The final output data (`final_reasoning_chains`) will contain high-quality Chain ## 4. Pipeline Example -Below is the complete `VisionMCTSReasoningPipeline` code implementation. +Below is the complete `VisionMCTSReasoningPipeline` code implementation (GPU Version). + ```python -import argparse from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm @@ -141,8 +219,10 @@ class VisionMCTSReasoningPipeline: model_path: str, *, # Storage + hf_cache_dir: str | None = None, + download_dir: str = "./ckpt/models", first_entry_file: str, - cache_path: str = "./cache_mcts", + cache_path: str = "../cache/cache_mcts", file_name_prefix: str = "mcts_reason", # Config prompt_type: str = "spatial", @@ -155,7 +235,6 @@ class VisionMCTSReasoningPipeline: # VLLM vllm_max_tokens: int = 1024 ): - # 1. 存储初始化 self.storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, @@ -163,8 +242,9 @@ class VisionMCTSReasoningPipeline: cache_type="jsonl" ) - # 2. 
模型服务 self.serving = LocalModelVLMServing_vllm( + hf_cache_dir=hf_cache_dir, + hf_local_dir=download_dir, hf_model_name_or_path=model_path, vllm_tensor_parallel_size=1, vllm_temperature=0.7, @@ -175,20 +255,18 @@ class VisionMCTSReasoningPipeline: "q": input_question_key, "img": input_image_key, "tree": input_tree_key, - "mcts_chains": "mcts_extracted_chains", # 中间结果 + "mcts_chains": "mcts_extracted_chains", "final": output_key } # ================== Operators ================== - # 算子 1: MCTS Tree -> Chains (提取器) - # 负责将树结构扁平化为线性链 + # 1. Refiner: MCTS -> Chains self.op_mcts_refine = MCTSTreeRefiner( max_chains_per_sample=max_samples_per_file ) - # 算子 2: VLM -> Chains (生成器/Fallback) - # 如果 MCTS 提取失败,则使用 VLM 生成;如果成功,则跳过 + # 2. Generator: VLM -> Chains (Fallback) self.op_vlm_gen = VisualReasoningGenerator( serving=self.serving, prompt_type=prompt_type @@ -203,7 +281,8 @@ class VisionMCTSReasoningPipeline: ) print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...") - # 注意:input_existing_chains_key 实现了混合/回退逻辑 + # 将 mcts_chains 作为 input_existing_chains_key 传入 + # 如果 MCTS 解析成功,则复用;否则调用 VLM 生成 self.op_vlm_gen.run( self.storage.step(), input_question_key=self.keys["q"], @@ -214,17 +293,13 @@ class VisionMCTSReasoningPipeline: if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl") - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--prompt_type", default="spatial") - args = parser.parse_args() - pipe = VisionMCTSReasoningPipeline( - model_path=args.model_path, - first_entry_file=args.input_file, - prompt_type=args.prompt_type + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", ) pipe.forward() -``` \ 
No newline at end of file +``` diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md new file mode 100644 index 00000000..8001e5c5 --- /dev/null +++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md @@ -0,0 +1,248 @@ +--- +title: Vision MCTS Reasoning Pipeline (API version) +icon: mdi:image-text +createTime: 2026/01/11 21:59:59 +permalink: /en/mm_guide/vision_mct_reasoning_pipeline_api/ +--- + +## 1. Overview + +The **Vision MCTS Reasoning Pipeline** is designed to build high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two sources of data: existing Monte Carlo Tree Search (MCTS) trajectory data, or generating new reasoning chains directly using a VLM. + +This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**. It "linearizes" complex tree-like search processes into a `......` format that the model can learn from. + +We support the following application scenarios: + +* **Data Extraction from MCTS Trees**: Converts high-value paths (Rollouts) in the search tree into linear training data. +* **Hybrid Data Construction**: Automatically falls back to using the VLM for CoT generation for samples without a search tree. +* **Spatial Reasoning Enhancement**: Supports generating spatial reasoning chains that include explicit coordinates (Bounding Boxes). + +The main process of the pipeline includes: + +1. **MCTS Tree Parsing**: Parses the search tree structure in the input data and extracts successful reasoning paths. +2. **Visual Reasoning Generation (Fallback)**: For samples with missing tree structures or failed parsing, the VLM is used to regenerate the reasoning chain. +3. **Data Standardization**: Outputs reasoning chain data in a unified format. + +--- + +## 2. 
Quick Start + +### Step 1: Create a New DataFlow Working Directory + +```bash +mkdir run_mcts_reasoning +cd run_mcts_reasoning + +``` + +### Step 2: Initialize DataFlow-MM + +```bash +dataflowmm init + +``` + +You will then see: + +```bash +api_pipelines/vision_mcts_api_pipeline.py + +``` + +### Step 3: Download Sample Data + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### Step 4: Configure API Key + +Set your API Key environment variable in `api_pipelines/vision_mcts_api_pipeline.py`: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + +### Step 5: Configure Parameters + +Configure the API service and input data paths in `api_pipelines/vision_mcts_api_pipeline.py`. Ensure the input file (jsonl) contains a `tree` field (for extraction) or just `question`/`image` (for generation). + +```python + pipe = VisionMCTSReasoningPipeline( + first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + ) + +``` + +### Step 6: Run with One Command + +```bash +cd api_pipelines +python vision_mcts_api_pipeline.py + +``` + +--- + +## 3. Data Flow & Logic + +### 1. **Input Data** + +Input data typically originates from MCTS search process logs, or unannotated image-text pairs: + +* **image**: Path to the image. +* **question**: The visual question. +* **tree** (Optional): JSON structure of the MCTS search tree, containing node Values, Visits, and Actions. + +**Input Data Example**: + +```json +{ + "image": "./images/puzzle.jpg", + "question": "What is the next step to solve this?", + "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } } +} + +``` + +### 2. **Core Operator Logic** + +This pipeline uses a hybrid strategy of **"Extraction First, Generation as Fallback"**: + +#### A. **MCTSTreeRefiner (Tree Structure Parser)** + +This operator handles the `tree` field. 
It traverses the tree structure and selects the best path from the root node to a leaf node based on the node's Q-value. + +* **Input**: `tree` object. +* **Function**: Linearizes tree paths, filtering out low-value or incomplete search branches. +* **Output**: A list of extracted reasoning chains (`mcts_chains`). + +#### B. **VisualReasoningGenerator (Visual Reasoning Generator)** + +This operator is the "generation engine" of the pipeline. It receives the extraction result from the previous step as input. + +* **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`). +* If MCTS parsing is successful (chain exists), it is reused directly without inference (saving computational resources). +* If the MCTS chain is empty (tree does not exist or parsing failed), it calls the VLM to generate the reasoning chain from scratch based on `prompt_type` (e.g., `spatial`). + + +* **Prompt Types**: Supports modes like `spatial` (spatial coordinate reasoning) and `logical` (logical reasoning). + +### 3. **Output Data** + +The final output data (`final_reasoning_chains`) contains high-quality chains of thought that can be directly used for SFT training. + +**Output Example**: + +```json +{ + "image": "./images/puzzle.jpg", + "final_reasoning_chains": [ + "<think>First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...</think><answer>Move Red Block</answer>" + ] +} + +``` + +--- + +## 4. Pipeline Example + +Below is the complete `VisionMCTSReasoningPipeline` code implementation (API Version). 
+ +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" +from dataflow.utils.storage import FileStorage +from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + +# Import atomic operators +from dataflow.operators.core_text import MCTSTreeRefiner +from dataflow.operators.core_vision import VisualReasoningGenerator +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai + +class VisionMCTSReasoningPipeline: + def __init__( + self, + first_entry_file: str, + cache_path: str = "../cache/cache_mcts", + file_name_prefix: str = "mcts_reason", + # Config + prompt_type: str = "spatial", + max_samples_per_file: int = 10000, + # Keys + input_question_key: str = "question", + input_image_key: str = "image", + input_tree_key: str = "tree", + output_key: str = "final_reasoning_chains", + + ): + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type="jsonl" + ) + + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + self.keys = { + "q": input_question_key, + "img": input_image_key, + "tree": input_tree_key, + "mcts_chains": "mcts_extracted_chains", + "final": output_key + } + + # ================== Operators ================== + + # 1. Refiner: MCTS -> Chains + self.op_mcts_refine = MCTSTreeRefiner( + max_chains_per_sample=max_samples_per_file + ) + + # 2. 
Generator: VLM -> Chains (Fallback) + self.op_vlm_gen = VisualReasoningGenerator( + serving=self.vlm_serving, + prompt_type=prompt_type + ) + + def forward(self): + print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...") + self.op_mcts_refine.run( + self.storage.step(), + input_tree_key=self.keys["tree"], + output_key=self.keys["mcts_chains"] + ) + + print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...") + # 将 mcts_chains 作为 input_existing_chains_key 传入 + # 如果 MCTS 解析成功,则复用;否则调用 VLM 生成 + self.op_vlm_gen.run( + self.storage.step(), + input_question_key=self.keys["q"], + input_image_key=self.keys["img"], + input_existing_chains_key=self.keys["mcts_chains"], + output_key=self.keys["final"] + ) + + +if __name__ == "__main__": + pipe = VisionMCTSReasoningPipeline( + first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + ) + pipe.forward() + +``` diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md index 3a5add79..db64892a 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md @@ -405,6 +405,4 @@ if __name__ == "__main__": download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", ) pipe.forward() - - ``` diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md index 642b0d5a..a2a419f4 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md @@ -1,14 +1,14 @@ --- -title: 图像定位思维链 (GCoT) 生成流水线 +title: 图像定位思维链 (GCoT) 生成流水线(API版) icon: mdi:image-text createTime: 2026/01/11 20:44:55 -permalink: /zh/mm_guide/image_gcot/ +permalink: /zh/mm_guide/image_gcot_api/ --- ## 1. 
概述 **图像定位思维链 (GCoT) 生成流水线** 旨在自动化生成**带视觉定位的思维链(Grounded Chain-of-Thought)**数据。该流水线通过多步推理,不仅生成回答问题的逻辑步骤,还将推理过程中提到的关键物体在图像中进行空间定位(Bounding Box),从而显著提升多模态数据的可解释性和精确度。 -与传统方法不同,本流水线采用 **单一 VLM(如 Qwen2.5-VL)** 同时完成“推理”和“定位”任务,流程更加精简高效。 +与传统方法不同,本流水线采用 **单一 VLM(如 GPT-5)** 同时完成“推理”和“定位”任务,流程更加精简高效。 我们支持以下应用场景: @@ -58,7 +58,7 @@ os.environ["DF_API_KEY"] = "your_api_key" ### 第五步:配置参数 -在 `api_pipelines/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径: +在 `api_pipelines/image_gcot_api_pipeline.py` 中配置 API 服务和输入数据路径: ```python def __init__( @@ -67,16 +67,20 @@ os.environ["DF_API_KEY"] = "your_api_key" first_entry_file: str, cache_path: str = "../cache/cache_gcot", file_name_prefix: str = "gcot", - # Keys question_key: str = "question", answer_key: str = "answer", image_key: str = "image", output_key: str = "gcot", - # Config vllm_max_tokens: int = 512 ): ``` +```python + pipe = ImageGCoTPipeline( + first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl" + ) +``` + ```python self.vlm_serving = APIVLMServing_openai( api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format @@ -385,6 +389,4 @@ if __name__ == "__main__": first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl" ) pipe.forward() - - ``` diff --git a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md index 3bb6b039..7cfc00bc 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md @@ -29,38 +29,112 @@ permalink: /zh/mm_guide/image_scale_caption_pipeline/ ## 2. 
快速开始 -### 第一步:准备工作目录 +### 第一步:创建新的 DataFlow 工作文件夹 ```bash -mkdir run_scalecap -cd run_scalecap +mkdir run_dataflow +cd run_dataflow ``` -### 第二步:准备脚本 +### 第二步:初始化 DataFlow-MM -将下文“流水线示例”中的代码保存为 `scalecap_pipeline.py`。 +```bash +dataflowmm init -### 第三步:配置运行参数 +``` -确保 VLM 模型(如 Qwen2.5-VL)路径正确。 +这时你会看到: ```bash -# 安装依赖 -pip install open-dataflow vllm +gpu_pipelines/image_scale_caption_pipeline.py + +``` + +### 第三步:下载示例数据 + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### 第四步:配置参数 + +```python +if __name__ == "__main__": + pipe = ImageScaleCaptionPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + device="cuda", + first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path="../cache/image_scale_caption", + file_name_prefix="scalecap", + input_image_key="image", + output_key="final_caption", + vllm_tensor_parallel_size=1, + vllm_max_tokens=1024 + ) + pipe.forward() ``` -### 第四步:一键运行 +> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):** +> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。 +> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。 +> +> + +### 第五步:一键运行 ```bash -python scalecap_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_jsonl "data/images.jsonl" \ - --output_key "final_caption" +cd gpu_pipelines +python image_scale_caption_pipeline.py ``` +> **🛠️ 常见问题排查 (Troubleshooting)** +> **问题 1:** 如果遇到类似如下的动态链接库冲突报错: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`: +> ```bash +> 
LD_LIBRARY_PATH="" python image_scale_caption_pipeline.py +> +> ``` +> +> +> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 `"type"` 字段修改为 `"rope_type"` 即可。 +> **修改前:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> +> **修改后:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> + --- ## 3. 数据流与流水线逻辑 @@ -75,7 +149,7 @@ python scalecap_pipeline.py \ ```json { - "image": "./images/complex_scene.jpg" + "image": "../example_data/capsbench_images/0.png" } ``` @@ -132,12 +206,12 @@ python scalecap_pipeline.py \ ```json { - "image": "./images/complex_scene.jpg", + "image": "../example_data/capsbench_images/0.png", "init_caption": "A dog sitting on a bench.", "golden_sentences": ["A dog is sitting on a wooden bench."], - "q_list": ["Describe more details about the dog.", "Describe position of the bench."], + "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."], "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."], - "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..." + "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park." } ``` @@ -146,7 +220,7 @@ python scalecap_pipeline.py \ ## 4. 
流水线示例 -以下是完整的 `ImageScaleCaptionPipeline` 代码实现。 +以下是完整的 `ImageScaleCaptionPipeline` 代码实现 (GPU 版本)。 ```python import re @@ -154,12 +228,79 @@ import argparse from typing import Callable, Any, List from dataflow.utils.storage import FileStorage + from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate from dataflow.prompts.image import ImageScaleCaptionPrompt + from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner + +def split_sentences(text: str) -> List[str]: + """将文本拆分为句子列表""" + if not text or not isinstance(text, str): + return [] + # 使用正则按标点符号分割 (. ! ? 。 ! ?) + _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+") + parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()] + return parts or ([text.strip()] if text.strip() else []) + +def join_list(data: Any, separator: str = "\n") -> str: + """将列表连接为字符串""" + if isinstance(data, list): + # 过滤掉非字符串元素或空字符串 + valid_items = [str(x) for x in data if x] + return separator.join(valid_items) + return str(data) if data is not None else "" + +def parse_questions_logic(text: str, max_q: int = 20) -> List[str]: + """ + 解析 LLM 生成的 "Describe more details about..." 文本, + 并自动扩展 position 问题。 + """ + if not text or not isinstance(text, str): + return [] + + lines = [t.strip() for t in text.split("\n") if t.strip()] + obj_qs = [] + + for line in lines: + # 提取包含 "Describe more details about" 的行 + if "Describe more details about" in line: + # 去除可能的序号 (如 "1. Describe...") + try: + start_idx = line.find("Describe") + clean = line[start_idx:] + # 去除句末多余内容,保留到第一个句号 + if "." in clean: + clean = clean.split(".")[0] + "." 
+ obj_qs.append(clean) + except Exception: + continue + + # 去重并保持顺序 + seen = set() + unique_obj_qs = [] + for q in obj_qs: + if q not in seen: + unique_obj_qs.append(q) + seen.add(q) + + # 截断 + unique_obj_qs = unique_obj_qs[:max_q] + + # 扩展 Position 问题 + pos_qs = [ + q.replace("Describe more details about", "Describe more details about the position of") + for q in unique_obj_qs + ] + + # 返回合并后的列表 (对象问题 + 位置问题) + return unique_obj_qs + pos_qs + + class ImageScaleCaptionPipeline: def __init__( self, @@ -211,14 +352,19 @@ class ImageScaleCaptionPipeline: # ================== Operator Initialization ================== # --- Step A: Generate Init Caption --- + # 构造固定 Prompt 列 self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"]) + + # 生成初稿 (使用通用 PromptedVQAGenerator) self.gen_init_caption = PromptedVQAGenerator( serving=self.serving, system_prompt="You are a helpful assistant." ) # --- Step B: Refine Golden Sentences --- + # 分句 self.refine_split = FunctionalRefiner(func=split_sentences) + # 视觉自检 (保留 Yes 的句子) self.refine_golden = VisualGroundingRefiner( serving=self.serving, @@ -226,7 +372,10 @@ class ImageScaleCaptionPipeline: ) # --- Step C: Generate Questions --- + # 列表转字符串 self.refine_join = FunctionalRefiner(func=join_list) + + # 文本生成问题 (Text-to-Text) tpl_q = NamedPlaceholderPromptTemplate( template=self.prompts_db["LLM_PROMPT_1"], join_list_with="\n" @@ -235,16 +384,22 @@ class ImageScaleCaptionPipeline: serving=self.serving, prompt_template=tpl_q ) + + # 解析问题文本为列表 self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic) # --- Step D: Generate Answers --- + # 批量回答 (One Image -> Many Qs) self.gen_answers = BatchVQAGenerator(serving=self.serving) + + # 回答过滤 self.refine_answers = VisualGroundingRefiner( serving=self.serving, prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no." 
) # --- Step E: Integrate Final Caption --- + # 融合 (Text-to-Text) tpl_final = NamedPlaceholderPromptTemplate( template=self.prompts_db["LLM_PROMPT_4"], join_list_with="\n" @@ -256,6 +411,7 @@ class ImageScaleCaptionPipeline: def forward(self): print(">>> [Pipeline] Step 0: Preparing Prompts...") + # 构造 init_prompt 列 self.refine_const_prompt.run( self.storage.step(), output_key="init_prompt" @@ -288,11 +444,14 @@ class ImageScaleCaptionPipeline: output_key="golden_str", data="golden_sentences" ) + + # template: "{sentence}" -> map to col "golden_str" self.gen_questions_text.run( self.storage.step(), output_answer_key="raw_q_text", sentence="golden_str" ) + self.refine_parse_qs.run( self.storage.step(), output_key="q_list", @@ -306,6 +465,7 @@ class ImageScaleCaptionPipeline: input_image_key=self.input_image_key, output_key="raw_answers" ) + self.refine_answers.run( self.storage.step(), input_list_key="raw_answers", @@ -319,50 +479,37 @@ class ImageScaleCaptionPipeline: output_key="details_str", data="final_details" ) + + # template keys: context, object_info, position_info self.gen_final_caption.run( self.storage.step(), output_answer_key=self.output_key, context="golden_str", object_info="details_str", - position_info="details_str" + position_info="details_str" # 简化:同时作为 object 和 position 信息 ) print(f">>> [Pipeline] All Done. 
Result saved to: {self.storage.cache_path}") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline") - - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") - parser.add_argument("--download_dir", default="./ckpt/models") - parser.add_argument("--device", default="cuda") - - parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") - parser.add_argument("--cache_path", default="./cache_scalecap_results") - parser.add_argument("--file_name_prefix", default="scalecap") - parser.add_argument("--input_image_key", default="image") - parser.add_argument("--output_key", default="final_caption") - - parser.add_argument("--tp", type=int, default=1) - parser.add_argument("--max_tokens", type=int, default=1024) - - args = parser.parse_args() - pipe = ImageScaleCaptionPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - device=args.device, - first_entry_file=args.input_jsonl, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - input_image_key=args.input_image_key, - output_key=args.output_key, - vllm_tensor_parallel_size=args.tp, - vllm_max_tokens=args.max_tokens + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + device="cuda", + + first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path="../cache/image_scale_caption", + file_name_prefix="scalecap", + + input_image_key="image", + output_key="final_caption", + + vllm_tensor_parallel_size=1, + vllm_max_tokens=1024 ) pipe.forward() -``` +``` \ No newline at end of file diff --git a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md 
b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md new file mode 100644 index 00000000..e504e294 --- /dev/null +++ b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md @@ -0,0 +1,477 @@ +--- +title: ScaleCap 高密度描述生成流水线(API版) +createTime: 2026/01/11 22:08:57 +icon: mdi:image-text +permalink: /zh/mm_guide/image_scale_caption_pipeline_api/ +--- + +## 1. 概述 + +**ScaleCap 高密度描述生成流水线 (Image Scale Caption Pipeline)** 是一种基于**“生成-验证-扩展-融合”**范式的先进图像描述生成方案。该流水线旨在生成**信息密度极高**且**幻觉率极低**的图像描述,特别适用于需要深度理解图像细节的场景。 + +该方法的理论基础源自论文 *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*。它通过多轮对话和视觉自检(Visual Grounding),逐步挖掘图像中的对象与位置细节,并过滤掉模型产生的幻觉。 + +我们支持以下应用场景: + +* **高质量多模态数据集构建**:生成比普通 Caption 更详尽、准确的训练数据。 +* **细粒度图像检索**:提供包含丰富细节的索引文本。 +* **盲人辅助/图像无障碍**:生成“所见即所得”的详细解说。 + +流水线的主要流程包括: + +1. **初稿生成**:VLM 生成基础描述。 +2. **视觉自检 (Debiasing)**:将描述拆分为句子,逐句验证其是否被图像证据支持(Visual Grounding)。 +3. **细节追问**:针对通过验证的“黄金句子”,生成关于对象属性和位置的追问。 +4. **回答与再验证**:VLM 回答追问,并再次进行视觉自检以过滤错误细节。 +5. **最终融合**:将所有经过验证的信息融合成一段连贯的长描述。 + +--- + +## 2. 
快速开始 + +### 第一步:创建新的 DataFlow 工作文件夹 + +```bash +mkdir run_dataflow +cd run_dataflow + +``` + +### 第二步:初始化 DataFlow-MM + +```bash +dataflowmm init + +``` + +这时你会看到: + +```bash +api_pipelines/image_scale_caption_api_pipeline.py + +``` + +### 第三步:下载示例数据 + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### 第四步:配置 API Key + +在 `api_pipelines/image_scale_caption_api_pipeline.py` 中设置 API Key 环境变量: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + +### 第五步:配置参数 + +在 `api_pipelines/image_scale_caption_api_pipeline.py` 中配置 API 服务和输入数据路径: + +```python + def __init__( + self, + # Storage params + first_entry_file: str = "../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path: str = "../cache/image_scale_caption", + file_name_prefix: str = "scalecap", + cache_type: str = "jsonl", + # Keys + input_image_key: str = "image", + output_key: str = "final_caption", + ): + +``` + +```python + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + +``` + +### 第六步:一键运行 + +```bash +cd api_pipelines +python image_scale_caption_api_pipeline.py + +``` + +--- + +## 3. 数据流与流水线逻辑 + +### 1. **输入数据** + +输入数据非常简单,仅需图像路径: + +* **image**:图像文件路径。 + +**输入数据示例**: + +```json +{ + "image": "../example_data/capsbench_images/0.png" +} + +``` + +### 2. **核心算子逻辑** + +该流水线是多个原子算子的复杂编排: + +#### A. **初稿生成 (PromptedVQAGenerator)** + +* **功能**:使用基础 Prompt 生成图像的初步描述 (`init_caption`)。 + +#### B. **视觉自检 (VisualGroundingRefiner)** + +* **功能**:这是 ScaleCap 的核心防幻觉机制。 +* **逻辑**: +1. 使用 `split_sentences` 将初稿拆分为单句。 +2. 调用 VLM 询问:“Given the image, is the description '{text}' directly supported by visual evidence?”。 +3. 
仅保留回答为 "Yes" 的句子,形成 **"Golden Sentences"**。 + + + +#### C. **问题生成与解析 (PromptTemplatedQAGenerator)** + +* **功能**:基于 Golden Sentences,利用 LLM 能力生成针对性的追问。 +* **逻辑**:模型生成如 "Describe more details about the [Object]" 的文本,并通过 `parse_questions_logic` 自动扩展为**对象细节**和**位置关系**两类问题。 + +#### D. **批量回答与二次过滤 (BatchVQAGenerator & Refiner)** + +* **功能**:挖掘图像深层信息。 +* **逻辑**: +1. 使用 `BatchVQAGenerator` 一次性让 VLM 回答上述生成的所有问题。 +2. 再次使用 `VisualGroundingRefiner` 检查这些新生成的细节是否准确。 +3. 保留可靠的细节信息 (`final_details`)。 + + + +#### E. **最终融合 (PromptTemplatedQAGenerator)** + +* **功能**:将“黄金句子”和“验证后的细节”重写为一段流畅的文本。 +* **输出**:`final_caption`。 + +### 3. **输出数据** + +输出数据记录了流水线的全过程,方便调试和分析: + +* **init_caption**:原始生成的初稿。 +* **golden_sentences**:通过第一次自检的句子列表。 +* **q_list**:生成的追问列表。 +* **final_details**:通过第二次自检的细节回答。 +* **final_caption**:最终的高密度描述。 + +**输出数据示例**: + +```json +{ + "image": "../example_data/capsbench_images/0.png", + "init_caption": "A dog sitting on a bench.", + "golden_sentences": ["A dog is sitting on a wooden bench."], + "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."], + "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."], + "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park." +} + +``` + +--- + +## 4. 
流水线示例 + +以下是完整的 `ImageScaleCaptionPipeline` 代码实现 (API 版本)。 + +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" + + +import re +import argparse +from typing import Callable, Any, List + +from dataflow.utils.storage import FileStorage + +from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate +from dataflow.prompts.image import ImageScaleCaptionPrompt + +from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner +from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai + +def split_sentences(text: str) -> List[str]: + """将文本拆分为句子列表""" + if not text or not isinstance(text, str): + return [] + # 使用正则按标点符号分割 (. ! ? 。 ! ?) + _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+") + parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()] + return parts or ([text.strip()] if text.strip() else []) + +def join_list(data: Any, separator: str = "\n") -> str: + """将列表连接为字符串""" + if isinstance(data, list): + # 过滤掉非字符串元素或空字符串 + valid_items = [str(x) for x in data if x] + return separator.join(valid_items) + return str(data) if data is not None else "" + +def parse_questions_logic(text: str, max_q: int = 20) -> List[str]: + """ + 解析 LLM 生成的 "Describe more details about..." 文本, + 并自动扩展 position 问题。 + """ + if not text or not isinstance(text, str): + return [] + + lines = [t.strip() for t in text.split("\n") if t.strip()] + obj_qs = [] + + for line in lines: + # 提取包含 "Describe more details about" 的行 + if "Describe more details about" in line: + # 去除可能的序号 (如 "1. Describe...") + try: + start_idx = line.find("Describe") + clean = line[start_idx:] + # 去除句末多余内容,保留到第一个句号 + if "." in clean: + clean = clean.split(".")[0] + "." 
+ obj_qs.append(clean) + except Exception: + continue + + # 去重并保持顺序 + seen = set() + unique_obj_qs = [] + for q in obj_qs: + if q not in seen: + unique_obj_qs.append(q) + seen.add(q) + + # 截断 + unique_obj_qs = unique_obj_qs[:max_q] + + # 扩展 Position 问题 + pos_qs = [ + q.replace("Describe more details about", "Describe more details about the position of") + for q in unique_obj_qs + ] + + # 返回合并后的列表 (对象问题 + 位置问题) + return unique_obj_qs + pos_qs + + +class ImageScaleCaptionPipeline: + def __init__( + self, + # Storage params + first_entry_file: str = "images.jsonl", + cache_path: str = "./cache_scalecap", + file_name_prefix: str = "scalecap", + cache_type: str = "jsonl", + # Keys + input_image_key: str = "image", + output_key: str = "final_caption", + # VLLM Config + vllm_tensor_parallel_size: int = 1, + vllm_temperature: float = 0.7, + vllm_top_p: float = 0.9, + vllm_max_tokens: int = 512, + ): + # 1. Storage + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type=cache_type, + ) + + # 2. Serving + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + # 3. Prompts + self.prompts_db = ImageScaleCaptionPrompt().build_prompt() + + # 4. Keys + self.input_image_key = input_image_key + self.output_key = output_key + + # ================== Operator Initialization ================== + + # --- Step A: Generate Init Caption --- + # 构造固定 Prompt 列 + self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"]) + + # 生成初稿 (使用通用 PromptedVQAGenerator) + self.gen_init_caption = PromptedVQAGenerator( + serving=self.vlm_serving, + system_prompt="You are a helpful assistant." 
+ ) + + # --- Step B: Refine Golden Sentences --- + # 分句 + self.refine_split = FunctionalRefiner(func=split_sentences) + + # 视觉自检 (保留 Yes 的句子) + self.refine_golden = VisualGroundingRefiner( + serving=self.vlm_serving, + prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no." + ) + + # --- Step C: Generate Questions --- + # 列表转字符串 + self.refine_join = FunctionalRefiner(func=join_list) + + # 文本生成问题 (Text-to-Text) + tpl_q = NamedPlaceholderPromptTemplate( + template=self.prompts_db["LLM_PROMPT_1"], + join_list_with="\n" + ) + self.gen_questions_text = PromptTemplatedQAGenerator( + serving=self.vlm_serving, + prompt_template=tpl_q + ) + + # 解析问题文本为列表 + self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic) + + # --- Step D: Generate Answers --- + # 批量回答 (One Image -> Many Qs) + self.gen_answers = BatchVQAGenerator(serving=self.vlm_serving) + + # 回答过滤 + self.refine_answers = VisualGroundingRefiner( + serving=self.vlm_serving, + prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no." 
+ ) + + # --- Step E: Integrate Final Caption --- + # 融合 (Text-to-Text) + tpl_final = NamedPlaceholderPromptTemplate( + template=self.prompts_db["LLM_PROMPT_4"], + join_list_with="\n" + ) + self.gen_final_caption = PromptTemplatedQAGenerator( + serving=self.vlm_serving, + prompt_template=tpl_final + ) + + def forward(self): + print(">>> [Pipeline] Step 0: Preparing Prompts...") + # 构造 init_prompt 列 + self.refine_const_prompt.run( + self.storage.step(), + output_key="init_prompt" + ) + + print(">>> [Pipeline] Step 1: Generating Initial Caption...") + self.gen_init_caption.run( + self.storage.step(), + input_prompt_key="init_prompt", + input_image_key=self.input_image_key, + output_answer_key="init_caption" + ) + + print(">>> [Pipeline] Step 2: Refining Golden Sentences...") + self.refine_split.run( + self.storage.step(), + output_key="sentences", + text="init_caption" + ) + self.refine_golden.run( + self.storage.step(), + input_list_key="sentences", + input_image_key=self.input_image_key, + output_key="golden_sentences" + ) + + print(">>> [Pipeline] Step 3: Generating Details Questions...") + self.refine_join.run( + self.storage.step(), + output_key="golden_str", + data="golden_sentences" + ) + + # template: "{sentence}" -> map to col "golden_str" + self.gen_questions_text.run( + self.storage.step(), + output_answer_key="raw_q_text", + sentence="golden_str" + ) + + self.refine_parse_qs.run( + self.storage.step(), + output_key="q_list", + text="raw_q_text" + ) + + print(">>> [Pipeline] Step 4: Generating & Filtering Answers...") + self.gen_answers.run( + self.storage.step(), + input_prompts_key="q_list", + input_image_key=self.input_image_key, + output_key="raw_answers" + ) + + self.refine_answers.run( + self.storage.step(), + input_list_key="raw_answers", + input_image_key=self.input_image_key, + output_key="final_details" + ) + + print(">>> [Pipeline] Step 5: Integrating Final Caption...") + self.refine_join.run( + self.storage.step(), + output_key="details_str", + 
data="final_details" + ) + + # template keys: context, object_info, position_info + self.gen_final_caption.run( + self.storage.step(), + output_answer_key=self.output_key, + context="golden_str", + object_info="details_str", + position_info="details_str" # 简化:同时作为 object 和 position 信息 + ) + + print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}") + + +if __name__ == "__main__": + + pipe = ImageScaleCaptionPipeline( + first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl", + cache_path="../cache/image_scale_caption", + file_name_prefix="scalecap", + input_image_key="image", + output_key="final_caption", + vllm_tensor_parallel_size=1, + vllm_max_tokens=1024 + ) + + pipe.forward() + +``` diff --git a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md index a586da66..32c17ddd 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md @@ -18,8 +18,6 @@ permalink: /zh/mm_guide/image_visual_only_mcq_pipeline/ * **旋转测试**:多次打乱选项顺序,消除位置偏见。 * **双重过滤**:要求“有图答对率”高,“无图答对率”低。 - - --- ## 2. 
快速开始 @@ -32,32 +30,102 @@ cd run_vis_mcq ``` -### 第二步:准备脚本 +### 第二步:初始化 DataFlow-MM -将下文“流水线示例”中的代码保存为 `visual_mcq_pipeline.py`。 +```bash +dataflowmm init + +``` -### 第三步:配置运行参数 +这时你会看到: + +```bash +gpu_pipelines/image_visual_only_mcq_pipeline.py + +``` -该流水线通过命令行参数控制过滤阈值。例如,要求有图 100% 正确,无图正确率低于 25%: +### 第三步:下载示例数据 ```bash -# 安装依赖 -pip install open-dataflow vllm +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### 第四步:配置参数 + +配置模型路径和过滤阈值(例如,要求有图 100% 正确,无图正确率低于 25%): + +```python +if __name__ == "__main__": + pipe = VisualOnlyMCQPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 + ) + pipe.forward() ``` -### 第四步:一键运行 +> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):** +> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。 +> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。 +> +> + +### 第五步:一键运行 ```bash -python visual_mcq_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_file "data/captions.jsonl" \ - --rotate_num 4 \ - --pass_vis 1.0 \ - --pass_txt 0.25 +cd gpu_pipelines +python image_visual_only_mcq_pipeline.py ``` +> **🛠️ 常见问题排查 (Troubleshooting)** +> **问题 1:** 如果遇到类似如下的动态链接库冲突报错: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python image_visual_only_mcq_pipeline.py +> +> ``` +> +> +> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错: +> `KeyError: 
"Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 `"type"` 字段修改为 `"rope_type"` 即可。 +> **修改前:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> +> **修改后:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> + --- ## 3. 数据流与流水线逻辑 @@ -98,13 +166,12 @@ python visual_mcq_pipeline.py \ 1. **选项旋转**:随机打乱选项顺序(例如将答案从 A 换到 C),防止模型通过“总是选 A”来作弊。 2. **有图推理 (Visual Pass)**:输入图像 + 题目。记录模型答对的比例。 -3. **无图推理 (Textual Pass)**:仅输入题目(无图像)。记录模型盲猜对的比例。 +3. **无图推理 (Textual Pass)**:仅输入题目(无图像进行盲测)。记录模型盲猜对的比例。 4. **过滤判据**: + * 保留题目,当且仅当:`Visual_Acc >= pass_visual_min` **且** `Textual_Acc <= pass_textual_max`。 * *示例*:如果一道题不看图也能答对(无图准确率高),说明它考的是常识而非视觉,**剔除**。 - - ### 3. **输出数据** 输出数据 (`final_mcqs`) 仅包含通过了严苛验证的题目。这些题目具有极高的质量和视觉相关性。 @@ -132,12 +199,10 @@ python visual_mcq_pipeline.py \ ## 4. 
流水线示例 -以下是完整的 `VisualOnlyMCQPipeline` 代码实现。 +以下是完整的 `VisualOnlyMCQPipeline` 代码实现 (GPU 版本)。 ```python import argparse -import re -from typing import List, Dict, Any from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm @@ -145,13 +210,14 @@ from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDepend from dataflow.operators.core_text import FunctionalRefiner from dataflow.prompts.image import ImageCaprlPrompt -# 正则解析逻辑 +import re +from typing import List, Dict, Any + _Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M) _OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$") _ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I) def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]: - """将 VLM 生成的原始文本解析为结构化字典列表""" if not mcq_text or not isinstance(mcq_text, str): return [] indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)] @@ -213,7 +279,9 @@ class VisualOnlyMCQPipeline: model_path: str, *, first_entry_file: str, - cache_path: str = "./cache_mcq", + hf_cache_dir: str | None = None, + download_dir: str = "./ckpt/models", + cache_path: str = "../cache/cache_mcq", file_name_prefix: str = "vis_mcq", # Config rotate_num: int = 4, @@ -227,7 +295,6 @@ class VisualOnlyMCQPipeline: device: str = "cuda", vllm_max_tokens: int = 2048 ): - # 1. 初始化存储 self.storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, @@ -235,15 +302,16 @@ class VisualOnlyMCQPipeline: cache_type="jsonl" ) - # 2. 
初始化 VLM 服务 self.serving = LocalModelVLMServing_vllm( + hf_cache_dir=hf_cache_dir, + hf_local_dir=download_dir, hf_model_name_or_path=model_path, vllm_tensor_parallel_size=1, - vllm_temperature=0.1, # 低温度以保证格式稳定 + vllm_temperature=0.1, vllm_max_tokens=vllm_max_tokens ) - # Keys 配置 + # Keys self.keys = { "img": input_image_key, "raw_text": "raw_mcq_text", @@ -251,23 +319,24 @@ class VisualOnlyMCQPipeline: "final": output_key } - # 加载 Prompt 库 + # --- Prompts --- self.prompts_db = ImageCaprlPrompt().build_prompt() - # ================== 算子初始化 ================== + # ================== Operators ================== - # 算子 1: 生成原始 MCQ 文本 + # 1. Generate Raw MCQs (FixPromptedVQAGenerator) + # 直接使用 prompt 类中的字符串 self.op_gen_raw = FixPromptedVQAGenerator( serving=self.serving, system_prompt=self.prompts_db["SYS_PROMPT_MCQ"], user_prompt=self.prompts_db["USER_PROMPT_MCQ"] ) - # 算子 2: 解析文本为结构化数据 + # 2. Parse MCQs (Refine) self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic) - # 算子 3: 视觉依赖性验证 (核心过滤) - # 包含旋转 (Rotation) 和 无图检测 (Text-only check) + # 3. Verify Visual Dependency (Refine) + # 传入 prompt 模板 self.op_verify = VisualDependencyRefiner( serving=self.serving, instruction_template=self.prompts_db["ANSWER_INSTRUCTION"], @@ -304,23 +373,15 @@ class VisualOnlyMCQPipeline: print(f">>> [Pipeline] Done. 
Results in: {self.keys['final']}") if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--rotate_num", type=int, default=4) - parser.add_argument("--pass_vis", type=float, default=1.0) - parser.add_argument("--pass_txt", type=float, default=0.25) - - args = parser.parse_args() - pipe = VisualOnlyMCQPipeline( - model_path=args.model_path, - first_entry_file=args.input_file, - rotate_num=args.rotate_num, - pass_visual_min=args.pass_vis, - pass_textual_max=args.pass_txt + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 ) pipe.forward() -``` - +``` \ No newline at end of file diff --git a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md new file mode 100644 index 00000000..55f74fcc --- /dev/null +++ b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md @@ -0,0 +1,339 @@ +--- +title: 视觉依赖 MCQ 生成流水线(API版) +createTime: 2026/01/11 22:13:45 +icon: mdi:image-text +permalink: /zh/mm_guide/image_visual_only_mcq_pipeline_api/ +--- +## 1. 概述 + +**视觉依赖 MCQ 生成流水线 (Visual-Only MCQ Pipeline)** 是 CapRL (Caption Reinforcement Learning) 框架中的核心组件。它的目标是生成一组高质量的多项选择题 (MCQ),且这些题目必须满足**强视觉依赖性**:即模型必须“看”图才能答对,仅凭文本(猜题或常识)无法作答。 + +该流水线通过**生成-解析-验证**三步法,利用**选项旋转 (Rotation)** 和**无图盲测 (Blind Test)** 机制,严格过滤掉模型幻觉或过于简单的题目。生成的题目可作为强化学习的奖励信号(Reward Model)。 + +主要流程包括: + +1. **MCQ 生成**:VLM 基于图像生成原始的问答对文本。 +2. **结构化解析**:利用正则逻辑将文本解析为标准的题目与选项结构。 +3. 
**视觉依赖验证**: +* **旋转测试**:多次打乱选项顺序,消除位置偏见。 +* **双重过滤**:要求“有图答对率”高,“无图答对率”低。 + +--- + +## 2. 快速开始 + +### 第一步:创建工作目录 + +```bash +mkdir run_vis_mcq +cd run_vis_mcq + +``` + +### 第二步:初始化 DataFlow-MM + +```bash +dataflowmm init + +``` + +这时你会看到: + +```bash +api_pipelines/image_visual_only_mcq_api_pipeline.py + +``` + +### 第三步:下载示例数据 + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### 第四步:配置 API Key + +在 `api_pipelines/image_visual_only_mcq_api_pipeline.py` 中设置 API Key 环境变量: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + +### 第五步:配置参数 + +在 `api_pipelines/image_visual_only_mcq_api_pipeline.py` 中配置过滤阈值,例如,要求有图 100% 正确,无图正确率低于 25%: + +```python + pipe = VisualOnlyMCQPipeline( + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 + ) + +``` + +### 第六步:一键运行 + +```bash +cd api_pipelines +python image_visual_only_mcq_api_pipeline.py + +``` + +--- + +## 3. 数据流与流水线逻辑 + +### 1. **输入数据** + +输入仅需包含图像路径: + +* **image**:图像文件路径。 + +**输入数据示例**: + +```json +{ + "image": "./images/sample_01.jpg" +} + +``` + +### 2. **核心算子逻辑** + +该流水线由三个关键算子串联而成: + +#### A. **FixPromptedVQAGenerator(原始生成)** + +* **功能**:使用 CapRL 预设的 Prompt 模板(`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`),让 VLM 一次性生成 5 道 MCQ。 +* **输出**:包含多个 `#### Question` 和选项的非结构化文本块。 + +#### B. **FunctionalRefiner(正则解析)** + +* **逻辑函数**:`parse_mcq_text_logic` +* **功能**:利用正则表达式从原始文本中提取题目、选项(A-F)和正确答案。 +* **输出**:结构化的 MCQ 列表 (`parsed_mcq_list`)。 + +#### C. **VisualDependencyRefiner(依赖性验证)** + +这是本流水线的核心过滤器。它对每道题进行 N 次推理(N = `rotate_num`): + +1. **选项旋转**:随机打乱选项顺序(例如将答案从 A 换到 C),防止模型通过“总是选 A”来作弊。 +2. **有图推理 (Visual Pass)**:输入图像 + 题目。记录模型答对的比例。 +3. **无图推理 (Textual Pass)**:仅输入题目(无图像进行盲测)。记录模型盲猜对的比例。 +4. **过滤判据**: + +* 保留题目,当且仅当:`Visual_Acc >= pass_visual_min` **且** `Textual_Acc <= pass_textual_max`。 +* *示例*:如果一道题不看图也能答对(无图准确率高),说明它考的是常识而非视觉,**剔除**。 + +### 3. 
**输出数据** + +输出数据 (`final_mcqs`) 仅包含通过了严苛验证的题目。这些题目具有极高的质量和视觉相关性。 + +**输出数据示例**: + +```json +{ + "image": "./images/sample_01.jpg", + "final_mcqs": [ + { + "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...", + "answer": "A", + "stats": { + "visual_acc": 1.0, # 4次全对 + "text_acc": 0.0 # 盲猜全错 + } + } + ] +} + +``` + +--- + +## 4. 流水线示例 + +以下是完整的 `VisualOnlyMCQPipeline` 代码实现 (API 版本)。 + +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" +import argparse +from dataflow.utils.storage import FileStorage +from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + +from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner +from dataflow.operators.core_text import FunctionalRefiner +from dataflow.prompts.image import ImageCaprlPrompt +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai +import re +from typing import List, Dict, Any + +_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M) +_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$") +_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I) + +def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]: + if not mcq_text or not isinstance(mcq_text, str): return [] + + indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)] + if not indices: return [] + indices.append(len(mcq_text)) + blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)] + + parsed = [] + for block in blocks: + lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()] + q_title_m = _Q_BLOCK_SPLIT.search(block) + if not q_title_m: continue + + q_title = q_title_m.group(1).strip() + options = {} + ans_letter, ans_text = None, None + + for ln in lines: + m_opt = _OPT_LINE_RE.match(ln) + if m_opt: + options[m_opt.group(1)] = m_opt.group(2).strip() + continue + m_ans = _ANS_LINE_RE.match(ln) + if m_ans: + ans_letter = 
m_ans.group(1).upper() + ans_text = m_ans.group(2).strip() + break + + if options and ans_letter and ans_letter in options: + q_lines = [q_title] + for lbl in ["A", "B", "C", "D", "E", "F"]: + if lbl in options: + q_lines.append(f" - {lbl}) {options[lbl]}") + + parsed.append({ + "question": "\n".join(q_lines), + "question_title": q_title, + "options": options, + "answer": ans_letter, + "answer_text": ans_text + }) + + if expected > 0: + parsed = parsed[:expected] + + uniq = [] + seen = set() + for it in parsed: + key = (it["question_title"], it["answer"]) + if key not in seen: + seen.add(key) + uniq.append(it) + return uniq + + +class VisualOnlyMCQPipeline: + def __init__( + self, + *, + first_entry_file: str, + cache_path: str = "../cache/cache_mcq", + file_name_prefix: str = "vis_mcq", + # Config + rotate_num: int = 4, + pass_visual_min: float = 1.0, + pass_textual_max: float = 0.25, + add_none_above: bool = True, + # Keys + input_image_key: str = "image", + output_key: str = "final_mcqs", + # VLLM + vllm_max_tokens: int = 2048 + ): + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type="jsonl" + ) + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + + # Keys + self.keys = { + "img": input_image_key, + "raw_text": "raw_mcq_text", + "parsed_list": "parsed_mcq_list", + "final": output_key + } + + # --- Prompts --- + self.prompts_db = ImageCaprlPrompt().build_prompt() + + # ================== Operators ================== + + # 1.
Generate Raw MCQs (FixPromptedVQAGenerator) + # 直接使用 prompt 类中的字符串 + self.op_gen_raw = FixPromptedVQAGenerator( + serving=self.vlm_serving, + system_prompt=self.prompts_db["SYS_PROMPT_MCQ"], + user_prompt=self.prompts_db["USER_PROMPT_MCQ"] + ) + + # 2. Parse MCQs (Refine) + self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic) + + # 3. Verify Visual Dependency (Refine) + # 传入 prompt 模板 + self.op_verify = VisualDependencyRefiner( + serving=self.vlm_serving, + instruction_template=self.prompts_db["ANSWER_INSTRUCTION"], + rotate_num=rotate_num, + pass_visual_min=pass_visual_min, + pass_textual_max=pass_textual_max, + add_none_above_visual=add_none_above + ) + + def forward(self): + print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...") + self.op_gen_raw.run( + self.storage.step(), + input_image_key=self.keys["img"], + output_answer_key=self.keys["raw_text"] + ) + + print(">>> [Pipeline] Step 2: Parsing MCQs...") + self.op_parse.run( + self.storage.step(), + output_key=self.keys["parsed_list"], + mcq_text=self.keys["raw_text"], + expected=5 + ) + + print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...") + self.op_verify.run( + self.storage.step(), + input_list_key=self.keys["parsed_list"], + input_image_key=self.keys["img"], + output_key=self.keys["final"] + ) + + print(f">>> [Pipeline] Done. 
Results in: {self.keys['final']}") + +if __name__ == "__main__": + pipe = VisualOnlyMCQPipeline( + first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl", + rotate_num=4, + pass_visual_min=1.0, + pass_textual_max=0.25 + ) + pipe.forward() + +``` diff --git a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md index ee038567..fa803660 100644 --- a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md +++ b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md @@ -35,30 +35,100 @@ cd run_mcts_reasoning ``` -### 第二步:准备脚本 +### 第二步:初始化 DataFlow-MM -将下文“流水线示例”中的代码保存为 `vision_mcts_pipeline.py`。 +```bash +dataflowmm init -### 第三步:配置运行参数 +``` -确保输入文件(jsonl)包含 `tree` 字段(用于提取)或仅包含 `question/image`(用于生成)。 +这时你会看到: ```bash -# 安装依赖 -pip install open-dataflow vllm +gpu_pipelines/vision_mcts_pipeline.py ``` -### 第四步:一键运行 +### 第三步:下载示例数据 ```bash -python vision_mcts_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --input_file "data/mcts_trajectories.jsonl" \ - --prompt_type "spatial" +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data ``` +### 第四步:配置参数 + +确保输入文件(jsonl)包含 `tree` 字段(用于提取)或仅包含 `question/image`(用于生成): + +```python +if __name__ == "__main__": + pipe = VisionMCTSReasoningPipeline( + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", + ) + pipe.forward() + +``` + +> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):** +> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。 +> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` 
**结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。 +> +> + +### 第五步:一键运行 + +```bash +cd gpu_pipelines +python vision_mcts_pipeline.py + +``` + +> **🛠️ 常见问题排查 (Troubleshooting)** +> **问题 1:** 如果遇到类似如下的动态链接库冲突报错: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python vision_mcts_pipeline.py +> +> ``` +> +> +> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 `"type"` 字段修改为 `"rope_type"` 即可。 +> **修改前:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> +> **修改后:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> +> ``` +> +> + --- ## 3. 数据流与流水线逻辑 @@ -125,10 +195,9 @@ python vision_mcts_pipeline.py \ ## 4. 流水线示例 -以下是完整的 `VisionMCTSReasoningPipeline` 代码实现。 +以下是完整的 `VisionMCTSReasoningPipeline` 代码实现 (GPU 版本)。 ```python -import argparse from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm @@ -142,8 +211,10 @@ class VisionMCTSReasoningPipeline: model_path: str, *, # Storage + hf_cache_dir: str | None = None, + download_dir: str = "./ckpt/models", first_entry_file: str, - cache_path: str = "./cache_mcts", + cache_path: str = "../cache/cache_mcts", file_name_prefix: str = "mcts_reason", # Config prompt_type: str = "spatial", @@ -156,7 +227,6 @@ class VisionMCTSReasoningPipeline: # VLLM vllm_max_tokens: int = 1024 ): - # 1. 
存储初始化 self.storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, @@ -164,8 +234,9 @@ class VisionMCTSReasoningPipeline: cache_type="jsonl" ) - # 2. 模型服务 self.serving = LocalModelVLMServing_vllm( + hf_cache_dir=hf_cache_dir, + hf_local_dir=download_dir, hf_model_name_or_path=model_path, vllm_tensor_parallel_size=1, vllm_temperature=0.7, @@ -176,20 +247,18 @@ class VisionMCTSReasoningPipeline: "q": input_question_key, "img": input_image_key, "tree": input_tree_key, - "mcts_chains": "mcts_extracted_chains", # 中间结果 + "mcts_chains": "mcts_extracted_chains", "final": output_key } # ================== Operators ================== - # 算子 1: MCTS Tree -> Chains (提取器) - # 负责将树结构扁平化为线性链 + # 1. Refiner: MCTS -> Chains self.op_mcts_refine = MCTSTreeRefiner( max_chains_per_sample=max_samples_per_file ) - # 算子 2: VLM -> Chains (生成器/Fallback) - # 如果 MCTS 提取失败,则使用 VLM 生成;如果成功,则跳过 + # 2. Generator: VLM -> Chains (Fallback) self.op_vlm_gen = VisualReasoningGenerator( serving=self.serving, prompt_type=prompt_type @@ -204,7 +273,8 @@ class VisionMCTSReasoningPipeline: ) print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...") - # 注意:input_existing_chains_key 实现了混合/回退逻辑 + # 将 mcts_chains 作为 input_existing_chains_key 传入 + # 如果 MCTS 解析成功,则复用;否则调用 VLM 生成 self.op_vlm_gen.run( self.storage.step(), input_question_key=self.keys["q"], @@ -215,16 +285,12 @@ class VisionMCTSReasoningPipeline: if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl") - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--prompt_type", default="spatial") - args = parser.parse_args() - pipe = VisionMCTSReasoningPipeline( - model_path=args.model_path, - first_entry_file=args.input_file, - prompt_type=args.prompt_type + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + 
first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + hf_cache_dir="~/.cache/huggingface", + download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct", ) pipe.forward() diff --git a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md new file mode 100644 index 00000000..1c391d8b --- /dev/null +++ b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md @@ -0,0 +1,248 @@ +--- +title: 视觉 MCTS 推理链生成流水线(API版) +icon: mdi:image-text +createTime: 2026/01/11 21:59:59 +permalink: /zh/mm_guide/vision_mct_reasoning_pipeline_api/ +--- + +## 1. 概述 + +**视觉 MCTS 推理链生成流水线 (Vision MCTS Reasoning Pipeline)** 旨在为多模态大模型构建高质量的**过程监督数据(Process Supervision Data)**。该流水线能够处理两种数据来源:已有的蒙特卡洛树搜索(MCTS)轨迹数据,或直接利用 VLM 生成新的推理链。 + +该流水线是 **Grounded-RL** 和 **SFT 数据构建**的核心工具,它将复杂的树状搜索过程“线性化”为模型可学习的 `<think>...</think><answer>...</answer>` 格式。 + +我们支持以下应用场景: + +* **从 MCTS 树提取数据**:将搜索树中高价值的路径(Rollouts)转化为线性训练数据。 +* **混合数据构建**:对于没有搜索树的样本,自动回退到使用 VLM 进行 CoT 生成。 +* **空间推理增强**:支持生成包含显式坐标(Bounding Box)的空间推理链。 + +流水线的主要流程包括: + +1. **MCTS 树解析**:解析输入数据中的搜索树结构,提取成功的推理路径。 +2. **视觉推理生成 (Fallback)**:对于缺失树结构或解析失败的样本,利用 VLM 重新生成推理链。 +3. **数据标准化**:输出统一格式的推理链数据。 + +--- + +## 2.
快速开始 + +### 第一步:准备工作目录 + +```bash +mkdir run_mcts_reasoning +cd run_mcts_reasoning + +``` + +### 第二步:初始化 DataFlow-MM + +```bash +dataflowmm init + +``` + +这时你会看到: + +```bash +api_pipelines/vision_mcts_api_pipeline.py + +``` + +### 第三步:下载示例数据 + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### 第四步:配置 API Key + +在 `api_pipelines/vision_mcts_api_pipeline.py` 中设置 API Key 环境变量: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + +### 第五步:配置参数 + +配置 API 服务和输入数据路径。确保输入文件(jsonl)包含 `tree` 字段(用于提取)或仅包含 `question/image`(用于生成): + +```python + pipe = VisionMCTSReasoningPipeline( + first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + ) + +``` + +### 第六步:一键运行 + +```bash +cd api_pipelines +python vision_mcts_api_pipeline.py + +``` + +--- + +## 3. 数据流与流水线逻辑 + +### 1. **输入数据** + +输入数据通常来源于 MCTS 搜索过程的日志,或未标注的图文对: + +* **image**:图像路径。 +* **question**:视觉问题。 +* **tree**(可选):MCTS 搜索树的 JSON 结构,包含节点值(Value)、访问次数(Visits)和动作(Actions)。 + +**输入数据示例**: + +```json +{ + "image": "./images/puzzle.jpg", + "question": "What is the next step to solve this?", + "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } } +} + +``` + +### 2. **核心算子逻辑** + +该流水线采用 **“提取优先,生成兜底”** 的混合策略: + +#### A. **MCTSTreeRefiner(树结构解析器)** + +该算子负责处理 `tree` 字段。它遍历树结构,根据节点价值(Q-value)筛选出从根节点到叶子节点的最佳路径。 + +* **输入**:`tree` 对象。 +* **功能**:线性化树路径,过滤掉低价值或未完成的搜索分支。 +* **输出**:提取出的推理链列表(`mcts_chains`)。 + +#### B. **VisualReasoningGenerator(视觉推理生成器)** + +该算子是流水线的“生成引擎”。它接收上一步的提取结果作为输入。 + +* **机制**:检查 `input_existing_chains_key`(即 `mcts_chains`)。 +* 如果 MCTS 解析成功(链存在),则直接复用,不进行推理(节省计算资源)。 +* 如果 MCTS 链为空(树不存在或解析失败),则调用 VLM,根据 `prompt_type`(如 `spatial`)从头生成推理链。 + + +* **Prompt 类型**:支持 `spatial`(空间坐标推理)、`logical`(逻辑推理)等模式。 + +### 3. 
**输出数据** + +最终生成的输出数据(`final_reasoning_chains`)将包含高质量的思维链,可直接用于 SFT 训练。 + +**输出示例**: + +```json +{ + "image": "./images/puzzle.jpg", + "final_reasoning_chains": [ + "<think>First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...</think><answer>Move Red Block</answer>" + ] +} + +``` + +--- + +## 4. 流水线示例 + +以下是完整的 `VisionMCTSReasoningPipeline` 代码实现 (API 版本)。 + +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" +from dataflow.utils.storage import FileStorage +from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm + +# 引入原子算子 +from dataflow.operators.core_text import MCTSTreeRefiner +from dataflow.operators.core_vision import VisualReasoningGenerator +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai + +class VisionMCTSReasoningPipeline: + def __init__( + self, + first_entry_file: str, + cache_path: str = "../cache/cache_mcts", + file_name_prefix: str = "mcts_reason", + # Config + prompt_type: str = "spatial", + max_samples_per_file: int = 10000, + # Keys + input_question_key: str = "question", + input_image_key: str = "image", + input_tree_key: str = "tree", + output_key: str = "final_reasoning_chains", + + ): + self.storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type="jsonl" + ) + + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + + self.keys = { + "q": input_question_key, + "img": input_image_key, + "tree": input_tree_key, + "mcts_chains": "mcts_extracted_chains", + "final": output_key + } + + # ================== Operators ================== + + # 1.
Refiner: MCTS -> Chains + self.op_mcts_refine = MCTSTreeRefiner( + max_chains_per_sample=max_samples_per_file + ) + + # 2. Generator: VLM -> Chains (Fallback) + self.op_vlm_gen = VisualReasoningGenerator( + serving=self.vlm_serving, + prompt_type=prompt_type + ) + + def forward(self): + print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...") + self.op_mcts_refine.run( + self.storage.step(), + input_tree_key=self.keys["tree"], + output_key=self.keys["mcts_chains"] + ) + + print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...") + # 将 mcts_chains 作为 input_existing_chains_key 传入 + # 如果 MCTS 解析成功,则复用;否则调用 VLM 生成 + self.op_vlm_gen.run( + self.storage.step(), + input_question_key=self.keys["q"], + input_image_key=self.keys["img"], + input_existing_chains_key=self.keys["mcts_chains"], + output_key=self.keys["final"] + ) + + +if __name__ == "__main__": + pipe = VisionMCTSReasoningPipeline( + first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl", + prompt_type="spatial", + ) + pipe.forward() + +```