diff --git a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts
index 8a5469cc..ab560546 100644
--- a/docs/.vuepress/notes/en/mm_guide.ts
+++ b/docs/.vuepress/notes/en/mm_guide.ts
@@ -28,11 +28,15 @@ export const MMGuide: ThemeNote = defineNoteConfig({
'context_vqa',
'context_vqa_api',
'image_gcot',
+ 'image_gcot_api',
'vision_mct_reasoning_pipeline',
+ 'vision_mct_reasoning_pipeline_api',
'image_region_caption_pipeline',
'image_region_caption_pipeline_api',
'image_scale_caption_pipeline',
+ 'image_scale_caption_pipeline_api',
'image_visual_only_mcq_pipeline',
+ 'image_visual_only_mcq_pipeline_api',
],
},
{
diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts
index 21bece4d..aa439f3a 100644
--- a/docs/.vuepress/notes/zh/mm_guide.ts
+++ b/docs/.vuepress/notes/zh/mm_guide.ts
@@ -28,11 +28,15 @@ export const MMGuide: ThemeNote = defineNoteConfig({
'context_vqa',
'context_vqa_api',
'image_gcot',
+ 'image_gcot_api',
'vision_mct_reasoning_pipeline',
+ 'vision_mct_reasoning_pipeline_api',
'image_region_caption_pipeline',
'image_region_caption_pipeline_api',
'image_scale_caption_pipeline',
+ 'image_scale_caption_pipeline_api',
'image_visual_only_mcq_pipeline',
+ 'image_visual_only_mcq_pipeline_api',
],
},
{
diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot.md b/docs/en/notes/mm_guide/image_understanding/image_gcot.md
index 96be0e2a..636d7371 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_gcot.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_gcot.md
@@ -27,33 +27,94 @@ The main process of the pipeline includes:
## 2. Quick Start
-### Step 1: Create a Working Directory
+### Step 1: Create a New DataFlow Working Directory
```bash
-mkdir run_gcot
-cd run_gcot
+mkdir run_dataflow
+cd run_dataflow
```
-### Step 2: Prepare the Script
+### Step 2: Initialize DataFlow-MM
-Save the code in the "Pipeline Example" section below as `image_gcot_pipeline.py`.
+```bash
+dataflowmm init
-### Step 3: Download Example Data
+```
-```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
+You will then see:
+```bash
+gpu_pipelines/image_gcot_pipeline.py
```
-### Step 4: Run
+### Step 3: Download Sample Data
```bash
-python image_gcot_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_file "data/image_qa.jsonl"
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
+### Step 4: Configure Parameters
+```python
+if __name__ == "__main__":
+ pipe = ImageGCoTPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ )
+ pipe.forward()
```
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+>
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading.
+
+
+### Step 5: Run
+
+```bash
+cd gpu_pipelines
+python image_gcot_pipeline.py
+```
+> **🛠️ Troubleshooting**
+>
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+>
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_gcot_pipeline.py
+> ```
+>
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+>
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+>
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+> ```
+>
+> **After modification:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+> ```
---
@@ -71,9 +132,9 @@ The input data for this process typically consists of standard VQA data:
```json
{
- "image": "./images/cat_dog.jpg",
- "question": "Is the cat looking at the dog?",
- "answer": "Yes"
+ "image":"../example_data/capsbench_images/0.png",
+ "question":"Who is the lead actor in the movie \"Nightmare Alley\"?",
+ "answer": "Bradley Cooper."
}
```
@@ -119,10 +180,7 @@ Finally, the output data generated by the pipeline will contain the following ke
**Output Data Example (gcot field)**:
```text
-Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left.
-Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right.
-Step 3: Observe their gaze. The cat is facing the dog.
-Answer: Yes
+{"raw_cot_output":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper."}
```
@@ -136,6 +194,7 @@ Below is the complete `ImageGCoTPipeline` code implementation.
import re
from typing import List, Dict, Any
import argparse
+import gc
import torch
from dataflow.utils.storage import FileStorage
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -144,7 +203,6 @@ from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxG
from dataflow.operators.core_text import FunctionalRefiner
from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
-# 定义 Prompt 模板,强制模型输出推理步骤和关键词
GCOT_PROMPT_TEMPLATE = (
"Question: {question}\n"
"Answer: {answer}\n\n"
@@ -161,10 +219,8 @@ GCOT_PROMPT_TEMPLATE = (
DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
-# ----------------- 辅助逻辑函数 ----------------- #
-
def _parse_base(text: str) -> Dict[str, Any]:
- """基础解析逻辑:分离 CoT 文本和 Keywords 行"""
+ """基础解析逻辑(内部复用)"""
if not text: return {"cot": "", "keywords": []}
lines = text.split('\n')
cot_lines = []
@@ -172,7 +228,6 @@ def _parse_base(text: str) -> Dict[str, Any]:
for line in lines:
if line.strip().lower().startswith('keywords:'):
keyword_str = line.split(':', 1)[-1].strip()
- # 简单的分词处理
raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
keywords = [k for k in raw_kws if k]
else:
@@ -180,15 +235,42 @@ def _parse_base(text: str) -> Dict[str, Any]:
return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
def extract_clean_cot_logic(text: str) -> str:
+ """[For FunctionalRefiner] Return only the cleaned CoT text."""
return _parse_base(text)["cot"]
def extract_keywords_logic(text: str) -> List[str]:
- return _parse_base(text)["keywords"]
+ """[For FunctionalRefiner] Extract keywords and merge adjacent ones that occur together in the CoT."""
+ parsed = _parse_base(text)
+ kws = parsed["keywords"]
+ cot = parsed["cot"]
+
+ if not kws or len(kws) <= 1:
+ return kws
+
+ # Simple adjacent-merge logic: join neighbouring keywords with a space when the joined phrase occurs in the CoT
+ cot_lower = cot.lower()
+ merged = []
+ skip_indices = set()
+ for i in range(len(kws)):
+ if i in skip_indices: continue
+ best_match = kws[i]
+ best_indices = [i]
+ # Try to merge up to 3 following keywords
+ for j in range(i + 1, min(i + 4, len(kws))):
+ if j in skip_indices: break
+ combined = ' '.join(kws[i:j+1])
+ if combined.lower() in cot_lower:
+ best_match = combined
+ best_indices = list(range(i, j+1))
+ else: break
+ merged.append(best_match)
+ skip_indices.update(best_indices)
+ return merged
def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
- """将 BBox 注入回 CoT 文本"""
+ """[For FunctionalRefiner] 将 BBox 注入回 CoT"""
if not cot_text or not bbox_map: return cot_text
- # 优先匹配长词,避免子串误匹配
+ # 优先匹配长词
sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
result_text = cot_text
replaced = set()
@@ -199,37 +281,35 @@ def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
answer_pos = result_text.find('Answer:')
search_limit = answer_pos if answer_pos != -1 else len(result_text)
- # 大小写不敏感查找
pos = result_text.lower().find(keyword.lower(), 0, search_limit)
if pos == -1: continue
boxes = bbox_map[keyword] # List[str]
box_str = "".join(boxes)
- # 替换:保留原词,追加 Box
replacement = f"{keyword} {box_str}"
result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
replaced.add(keyword)
return result_text
-# ----------------- 流水线定义 ----------------- #
-
class ImageGCoTPipeline:
def __init__(
self,
model_path: str,
*,
+ hf_cache_dir: str | None = None,
+ download_dir: str = "./ckpt/models",
first_entry_file: str,
- cache_path: str = "./cache_gcot",
+ cache_path: str = "../cache/cache_gcot",
file_name_prefix: str = "gcot",
- # Keys 配置
+ # Keys
question_key: str = "question",
answer_key: str = "answer",
image_key: str = "image",
output_key: str = "gcot",
+ # Config
vllm_max_tokens: int = 512
):
- # 1. 存储初始化
self.storage = FileStorage(
first_entry_file_name=first_entry_file,
cache_path=cache_path,
@@ -237,9 +317,11 @@ class ImageGCoTPipeline:
cache_type="jsonl"
)
- # 2. 模型服务 (单一模型)
+ # [单一模型 Serving]
self.vlm_serving = LocalModelVLMServing_vllm(
hf_model_name_or_path=model_path,
+ hf_cache_dir=hf_cache_dir,
+ hf_local_dir=download_dir,
vllm_tensor_parallel_size=1,
vllm_temperature=0.7,
vllm_max_tokens=vllm_max_tokens
@@ -256,28 +338,28 @@ class ImageGCoTPipeline:
"final": output_key
}
- # 3. 算子链配置
+ # ================== Operators ==================
- # Step A: 生成 CoT 和 Keywords
+ # 1. Generate CoT (通用 Generator)
self.op_gen_cot = PromptTemplatedVQAGenerator(
serving=self.vlm_serving,
system_prompt="You are a helpful assistant.",
prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
)
- # Step B: 解析清洗 CoT
+ # 2. Extract Clean CoT (通用 Refiner + Helper)
self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
- # Step C: 解析 Keywords
+ # 3. Extract Keywords (通用 Refiner + Helper)
self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
- # Step D: 生成 BBox (Grounding)
+ # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch)
self.op_bbox_gen = VLMBBoxGenerator(
serving=self.vlm_serving,
prompt_template=DEFAULT_BBOX_PROMPT
)
- # Step E: 注入 BBox 到 CoT
+ # 5. Inject GCoT (通用 Refiner + Helper)
self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
def forward(self):
@@ -286,7 +368,7 @@ class ImageGCoTPipeline:
self.storage.step(),
input_image_key=self.keys["img"],
output_answer_key=self.keys["raw_cot"],
- question=self.keys["q"],
+ question=self.keys["q"], # Template mapping
answer=self.keys["a"]
)
@@ -294,7 +376,7 @@ class ImageGCoTPipeline:
self.op_extract_cot.run(
self.storage.step(),
output_key=self.keys["clean_cot"],
- text=self.keys["raw_cot"]
+ text=self.keys["raw_cot"] # Param mapping
)
self.op_extract_kws.run(
self.storage.step(),
@@ -322,16 +404,11 @@ class ImageGCoTPipeline:
if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl")
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-
- args = parser.parse_args()
-
pipe = ImageGCoTPipeline(
- model_path=args.model_path,
- first_entry_file=args.input_file
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
)
pipe.forward()
-
```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
new file mode 100644
index 00000000..3499879e
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
@@ -0,0 +1,402 @@
+---
+title: Image Grounded CoT (GCoT) Pipeline (API version)
+icon: mdi:image-text
+createTime: 2026/01/11 20:44:55
+permalink: /en/mm_guide/image_gcot_api/
+---
+## 1. Overview
+
+The **Image Grounded Chain-of-Thought (GCoT) Pipeline** is designed to automatically generate **Grounded Chain-of-Thought** data. This pipeline generates multi-step reasoning to answer a question and simultaneously spatially locates (via Bounding Boxes) the key objects mentioned during the reasoning process. This significantly enhances the interpretability and precision of multimodal data.
+
+Unlike traditional methods, this pipeline uses a **Single VLM (e.g., GPT-5)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient.
+
+We support the following application scenarios:
+
+* **Enhanced Multimodal Data Construction**: Adding interpretability and grounding annotations to VQA datasets.
+* **Complex Scene Understanding**: Generating detailed reasoning steps containing object coordinates.
+* **Model Reasoning Training**: Building data to train models to be "grounded" and reduce hallucinations.
+
+The main process of the pipeline includes:
+
+1. **CoT Generation**: The model generates step-by-step reasoning text and extracts key nouns.
+2. **Keyword Parsing**: Cleaning and extracting keywords to be grounded from the generated text.
+3. **Visual Grounding**: The model generates bounding boxes (BBoxes) for the extracted keywords.
+4. **Information Injection**: Injecting BBox coordinates back into the reasoning text to form the final GCoT.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_dataflow
+cd run_dataflow
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/image_gcot_api_pipeline.py
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/image_gcot_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+
+### Step 5: Configure Parameters
+
+Configure the API service and input data paths in `api_pipelines/image_gcot_api_pipeline.py`:
+
+```python
+ def __init__(
+ self,
+ *,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_gcot",
+ file_name_prefix: str = "gcot",
+ question_key: str = "question",
+ answer_key: str = "answer",
+ image_key: str = "image",
+ output_key: str = "gcot",
+ vllm_max_tokens: int = 512
+ ):
+```
+
+```python
+ pipe = ImageGCoTPipeline(
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+ )
+```
+
+```python
+self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+```
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python image_gcot_api_pipeline.py
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data for this process typically consists of standard VQA data:
+
+* **image**: Path to the image file.
+* **question**: Question about the image.
+* **answer**: Standard answer to the question (used to assist CoT generation).
+
+**Input Data Example**:
+
+```json
+{
+ "image":"../example_data/capsbench_images/0.png",
+ "question":"Who is the lead actor in the movie \"Nightmare Alley\"?",
+ "answer": "Bradley Cooper."
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline combines multiple fine-grained operators to achieve complex GCoT generation logic:
+
+#### A. **CoT Generation (PromptTemplatedVQAGenerator)**
+
+Uses a predefined `GCOT_PROMPT_TEMPLATE` to guide the model to generate "Step-by-step Reasoning" and a "Keyword List".
+
+* **Prompt Strategy**: Asks the model to output in the format `Step 1: ...`, `Step 2: ...`, `Keywords: ...`.
+* **Output**: Raw string containing reasoning text and keywords.
+
+#### B. **Text Cleaning & Extraction (FunctionalRefiner)**
+
+Uses custom functions to parse the output from the previous step:
+
+* `extract_clean_cot_logic`: Strips the keyword section, keeping pure CoT text.
+* `extract_keywords_logic`: Parses the content after `Keywords:` to generate a Python List.
+
+#### C. **Visual Grounding (VLMBBoxGenerator)**
+
+Calls the VLM's grounding capability to generate bounding boxes for each extracted keyword.
+
+* **Input**: Image + List of Keywords.
+* **Output**: Dictionary mapping keywords to bounding box coordinates.
+
+#### D. **Coordinate Injection (FunctionalRefiner)**
+
+Uses the `inject_bboxes_logic` function to intelligently insert the generated BBox coordinates back into the original CoT text after the corresponding words.
+
+### 3. **Output Data**
+
+Finally, the output data generated by the pipeline will contain the following key fields:
+
+* **raw_cot_output**: Raw text generated by the model.
+* **cleaned_cot**: Cleaned reasoning text.
+* **bbox_mapping**: Mapping of keywords to their coordinates.
+* **gcot**: Final result, reasoning chain containing coordinate information.
+
+**Output Data Example (full record; the final result is in the `gcot` field)**:
+
+```json
+{"raw_cot_output":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper."}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete code of the API-version pipeline (the class is named `ImageGCoTPipeline`).
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+import re
+from typing import List, Dict, Any
+import argparse
+import gc
+import torch
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+GCOT_PROMPT_TEMPLATE = (
+ "Question: {question}\n"
+ "Answer: {answer}\n\n"
+ "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains "
+ "how to arrive at this answer based on the image.\n"
+ "Then, extract key nouns and objects mentioned in your reasoning that are "
+ "visible in the image and can be spatially located.\n\n"
+ "Format:\n"
+ "Step 1: ...\n"
+ "Step 2: ...\n"
+ "Answer: {answer}\n"
+ "Keywords: object1, object2\n"
+)
+
+DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
+
+def _parse_base(text: str) -> Dict[str, Any]:
+ """Shared parsing helper: split the text into CoT lines ("cot") and the list parsed from a "Keywords:" line ("keywords")."""
+ if not text: return {"cot": "", "keywords": []}
+ lines = text.split('\n')
+ cot_lines = []
+ keywords = []
+ for line in lines:
+ if line.strip().lower().startswith('keywords:'):
+ keyword_str = line.split(':', 1)[-1].strip()
+ raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
+ keywords = [k for k in raw_kws if k]
+ else:
+ cot_lines.append(line)
+ return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
+
+def extract_clean_cot_logic(text: str) -> str:
+ """[For FunctionalRefiner] Return only the cleaned CoT text."""
+ return _parse_base(text)["cot"]
+
+def extract_keywords_logic(text: str) -> List[str]:
+ """[For FunctionalRefiner] Extract keywords and merge adjacent ones that occur together in the CoT."""
+ parsed = _parse_base(text)
+ kws = parsed["keywords"]
+ cot = parsed["cot"]
+
+ if not kws or len(kws) <= 1:
+ return kws
+
+ # Simple adjacent-merge logic: join neighbouring keywords with a space when the joined phrase occurs in the CoT
+ cot_lower = cot.lower()
+ merged = []
+ skip_indices = set()
+ for i in range(len(kws)):
+ if i in skip_indices: continue
+ best_match = kws[i]
+ best_indices = [i]
+ # Try to merge up to 3 following keywords
+ for j in range(i + 1, min(i + 4, len(kws))):
+ if j in skip_indices: break
+ combined = ' '.join(kws[i:j+1])
+ if combined.lower() in cot_lower:
+ best_match = combined
+ best_indices = list(range(i, j+1))
+ else: break
+ merged.append(best_match)
+ skip_indices.update(best_indices)
+ return merged
+
+def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
+ """[For FunctionalRefiner] Inject the BBox strings back into the CoT text, after the first match of each keyword."""
+ if not cot_text or not bbox_map: return cot_text
+ # Match longer keywords first
+ sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
+ result_text = cot_text
+ replaced = set()
+
+ for keyword in sorted_keywords:
+ if keyword in replaced: continue
+ # Simple strategy: only inject before 'Answer:' so the answer section is left untouched
+ answer_pos = result_text.find('Answer:')
+ search_limit = answer_pos if answer_pos != -1 else len(result_text)
+
+ pos = result_text.lower().find(keyword.lower(), 0, search_limit)
+ if pos == -1: continue
+
+ boxes = bbox_map[keyword] # List[str]
+ box_str = "".join(boxes)
+ replacement = f"{keyword} {box_str}"
+
+ result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
+ replaced.add(keyword)
+ return result_text
+
+class ImageGCoTPipeline:
+ def __init__(
+ self,
+ *,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_gcot",
+ file_name_prefix: str = "gcot",
+ # Keys
+ question_key: str = "question",
+ answer_key: str = "answer",
+ image_key: str = "image",
+ output_key: str = "gcot",
+ # Config (NOTE: vllm_max_tokens is accepted but not referenced anywhere in this class)
+ vllm_max_tokens: int = 512
+ ):
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type="jsonl"
+ )
+
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+ self.keys = {
+ "q": question_key,
+ "a": answer_key,
+ "img": image_key,
+ "raw_cot": "raw_cot_output",
+ "clean_cot": "cleaned_cot",
+ "keywords": "extracted_keywords",
+ "bbox_map": "bbox_mapping",
+ "final": output_key
+ }
+
+ # ================== Operators ==================
+
+ # 1. Generate CoT (generic Generator)
+ self.op_gen_cot = PromptTemplatedVQAGenerator(
+ serving=self.vlm_serving,
+ system_prompt="You are a helpful assistant.",
+ prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
+ )
+
+ # 2. Extract Clean CoT (generic Refiner + helper function)
+ self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
+
+ # 3. Extract Keywords (generic Refiner + helper function)
+ self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
+
+ # 4. Generate BBox (dedicated Generator, because it involves per-row batching)
+ self.op_bbox_gen = VLMBBoxGenerator(
+ serving=self.vlm_serving,
+ prompt_template=DEFAULT_BBOX_PROMPT
+ )
+
+ # 5. Inject GCoT (generic Refiner + helper function)
+ self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
+
+ def forward(self):
+ print(">>> [Pipeline] Step 1: Generating CoT...")
+ self.op_gen_cot.run(
+ self.storage.step(),
+ input_image_key=self.keys["img"],
+ output_answer_key=self.keys["raw_cot"],
+ question=self.keys["q"], # Template mapping
+ answer=self.keys["a"]
+ )
+
+ print(">>> [Pipeline] Step 2: Parsing Outputs...")
+ self.op_extract_cot.run(
+ self.storage.step(),
+ output_key=self.keys["clean_cot"],
+ text=self.keys["raw_cot"] # Param mapping
+ )
+ self.op_extract_kws.run(
+ self.storage.step(),
+ output_key=self.keys["keywords"],
+ text=self.keys["raw_cot"]
+ )
+
+ print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...")
+ self.op_bbox_gen.run(
+ self.storage.step(),
+ input_image_key=self.keys["img"],
+ input_kws_key=self.keys["keywords"],
+ output_key=self.keys["bbox_map"]
+ )
+
+ print(">>> [Pipeline] Step 4: Injecting GCoT...")
+ self.op_inject.run(
+ self.storage.step(),
+ output_key=self.keys["final"],
+ cot_text=self.keys["clean_cot"],
+ bbox_map=self.keys["bbox_map"]
+ )
+
+ print(f">>> [Pipeline] Done. Final GCoT saved to: {self.keys['final']}")
+
+
+if __name__ == "__main__":
+ pipe = ImageGCoTPipeline(
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+ )
+ pipe.forward()
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
index aeff0c1d..755b4a76 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
@@ -1,69 +1,147 @@
---
-title: ScaleCap High-Density Captioning Pipeline
-createTime: 2026/01/11 22:08:57
+title: ScaleCap High-Density Caption Pipeline
icon: mdi:image-text
+createTime: 2026/01/11 22:08:57
permalink: /en/mm_guide/image_scale_caption_pipeline/
---
+
## 1. Overview
-The **ScaleCap High-Density Captioning Pipeline** implements an advanced **"Generate-Verify-Expand-Fuse"** paradigm for image captioning. This pipeline is designed to generate **extremely high information density** captions with **minimal hallucinations**, making it ideal for scenarios requiring deep understanding of image details.
+The **Image Scale Caption Pipeline (ScaleCap)** is an advanced image captioning solution based on a **"Generate-Verify-Expand-Integrate"** paradigm. This pipeline is designed to generate image descriptions with **extremely high information density** and **ultra-low hallucination rates**, making it particularly suitable for scenarios requiring deep understanding of image details.
-Based on the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*, this method progressively mines object and position details through multi-turn dialogue and visual self-verification (Visual Grounding), filtering out hallucinations along the way.
+The theoretical foundation of this method is derived from the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*. It gradually uncovers object and spatial details through multi-turn dialogue and visual grounding, effectively filtering out hallucinations produced by the model.
We support the following application scenarios:
* **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions.
-* **Fine-Grained Image Retrieval**: Providing index text rich in detail.
-* **Accessibility/Blind Assistance**: Generating "What You See Is What You Get" (WYSIWYG) detailed narrations.
+* **Fine-Grained Image Retrieval**: Providing highly detailed text for indexing.
+* **Blind Assistance / Image Accessibility**: Generating "what-you-see-is-what-you-get" detailed narrations.
The main process of the pipeline includes:
-1. **Initial Caption Generation**: VLM generates a baseline description.
-2. **Visual Debiasing**: Splitting the description into sentences and verifying each sentence against visual evidence (Visual Grounding).
-3. **Detail Expansion**: Generating follow-up questions about object attributes and positions based on verified "Golden Sentences".
-4. **Answering & Re-verification**: VLM answers the questions and performs another round of visual grounding to filter incorrect details.
-5. **Final Fusion**: Merging all verified information into a coherent, long description.
+1. **Initial Caption Generation**: The VLM generates a basic description.
+2. **Visual Debiasing**: The description is split into sentences, and each is verified against visual evidence (Visual Grounding).
+3. **Detail Questioning**: Targeted questions regarding object attributes and spatial relations are generated based on the verified "Golden Sentences".
+4. **Answering & Secondary Verification**: The VLM answers the detail questions, followed by another round of visual grounding to filter out incorrect details.
+5. **Final Integration**: All verified information is woven into a coherent, comprehensive long caption.
---
## 2. Quick Start
-### Step 1: Create a Working Directory
+### Step 1: Create a New DataFlow Working Directory
```bash
-mkdir run_scalecap
-cd run_scalecap
+mkdir run_dataflow
+cd run_dataflow
```
-### Step 2: Prepare the Script
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
-Save the code in the "Pipeline Example" section below as `scalecap_pipeline.py`.
+```
-### Step 3: Download Example Data
+You will then see:
```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
+gpu_pipelines/image_scale_caption_pipeline.py
+
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure Parameters
+
+```python
+if __name__ == "__main__":
+ pipe = ImageScaleCaptionPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ device="cuda",
+ first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path="../cache/image_scale_caption",
+ file_name_prefix="scalecap",
+ input_image_key="image",
+ output_key="final_caption",
+ vllm_tensor_parallel_size=1,
+ vllm_max_tokens=1024
+ )
+ pipe.forward()
```
-### Step 4: Run
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading.
+>
+>
+
+### Step 5: Run
```bash
-python scalecap_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_jsonl "data/images.jsonl" \
- --output_key "final_caption"
+cd gpu_pipelines
+python image_scale_caption_pipeline.py
```
+> **🛠️ Troubleshooting**
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_scale_caption_pipeline.py
+>
+> ```
+>
+>
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+> **After modification:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+
---
## 3. Data Flow & Logic
### 1. **Input Data**
-The input data requires only the image path:
+The input data for this process is very simple, requiring only the image path:
* **image**: Path to the image file.
@@ -71,69 +149,69 @@ The input data requires only the image path:
```json
{
- "image": "./images/complex_scene.jpg"
+ "image": "../example_data/capsbench_images/0.png"
}
```
### 2. **Core Operator Logic**
-This pipeline is a complex orchestration of multiple atomic operators:
+This pipeline orchestrates multiple fine-grained operators to achieve the complex ScaleCap logic:
#### A. **Initial Generation (PromptedVQAGenerator)**
-* **Function**: Generates a preliminary description (`init_caption`) of the image using a basic prompt.
+* **Function**: Uses a basic prompt to generate a preliminary description of the image (`init_caption`).
#### B. **Visual Debiasing (VisualGroundingRefiner)**
* **Function**: The core anti-hallucination mechanism of ScaleCap.
* **Logic**:
-1. Uses `split_sentences` to break the draft into single sentences.
+1. Uses `split_sentences` to break the initial draft into single sentences.
2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?".
-3. Keeps only sentences where the answer is "Yes", forming **"Golden Sentences"**.
+3. Retains only the sentences that receive a "Yes", forming **"Golden Sentences"**.
#### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)**
-* **Function**: Generates targeted follow-up questions based on Golden Sentences using LLM capabilities.
-* **Logic**: The model generates text like "Describe more details about the [Object]", which is then automatically expanded into **Object Detail** and **Positional Relation** questions via `parse_questions_logic`.
+* **Function**: Uses LLM capabilities to generate targeted follow-up questions based on the Golden Sentences.
+* **Logic**: The model generates text like "Describe more details about the [Object]". The `parse_questions_logic` function automatically expands these into two categories: **object details** and **spatial relationships**.
-#### D. **Batch Answering & Refiltering (BatchVQAGenerator & Refiner)**
+#### D. **Batch Answering & Secondary Filtering (BatchVQAGenerator & Refiner)**
-* **Function**: Mining deep image information.
+* **Function**: Deeply mines visual information.
* **Logic**:
-1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a batch.
-2. Uses `VisualGroundingRefiner` again to check if these new details are accurate.
+1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a single batch.
+2. Uses `VisualGroundingRefiner` again to verify if these newly generated details are accurate.
3. Retains reliable details (`final_details`).
-#### E. **Final Fusion (PromptTemplatedQAGenerator)**
+#### E. **Final Integration (PromptTemplatedQAGenerator)**
-* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent text.
+* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent, cohesive text.
* **Output**: `final_caption`.
### 3. **Output Data**
-The output data records the entire pipeline process, facilitating debugging and analysis:
+The output data records the entire pipeline process for easy debugging and analysis:
-* **init_caption**: Raw generated draft.
-* **golden_sentences**: List of sentences that passed the first check.
+* **init_caption**: The original initial draft.
+* **golden_sentences**: List of sentences that passed the first debiasing check.
* **q_list**: List of generated follow-up questions.
-* **final_details**: Detailed answers that passed the second check.
+* **final_details**: Detailed answers that passed the secondary check.
* **final_caption**: The final high-density description.
**Output Data Example**:
```json
{
- "image": "./images/complex_scene.jpg",
+ "image": "../example_data/capsbench_images/0.png",
"init_caption": "A dog sitting on a bench.",
"golden_sentences": ["A dog is sitting on a wooden bench."],
- "q_list": ["Describe more details about the dog.", "Describe position of the bench."],
+ "q_list": ["Describe more details about the dog.", "Describe more details about the bench.", "Describe more details about the position of the dog.", "Describe more details about the position of the bench."],
"final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
- "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..."
+ "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
}
```
@@ -142,7 +220,7 @@ The output data records the entire pipeline process, facilitating debugging and
## 4. Pipeline Example
-Below is the complete `ImageScaleCaptionPipeline` code implementation.
+Below is the complete `ImageScaleCaptionPipeline` code implementation (GPU Version).
```python
import re
@@ -150,12 +228,79 @@ import argparse
from typing import Callable, Any, List
from dataflow.utils.storage import FileStorage
+
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
from dataflow.prompts.image import ImageScaleCaptionPrompt
+
from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+
+def split_sentences(text: str) -> List[str]:
+ """Split text into a list of sentences."""
+ if not text or not isinstance(text, str):
+ return []
+ # Split on whitespace that follows sentence-ending punctuation (. ! ? 。 ! ?)
+ _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+")
+ parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+ return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+ """Join a list into a single string."""
+ if isinstance(data, list):
+ # Drop falsy items (None, empty strings) and stringify the rest
+ valid_items = [str(x) for x in data if x]
+ return separator.join(valid_items)
+ return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+ """
+ Parse the LLM-generated "Describe more details about..." text
+ and automatically add the matching position questions.
+ """
+ if not text or not isinstance(text, str):
+ return []
+
+ lines = [t.strip() for t in text.split("\n") if t.strip()]
+ obj_qs = []
+
+ for line in lines:
+ # Keep only lines containing "Describe more details about"
+ if "Describe more details about" in line:
+ # Strip any leading numbering (e.g. "1. Describe...")
+ try:
+ start_idx = line.find("Describe")
+ clean = line[start_idx:]
+ # Drop trailing content; keep the text up to the first period
+ if "." in clean:
+ clean = clean.split(".")[0] + "."
+ obj_qs.append(clean)
+ except Exception:
+ continue
+
+ # De-duplicate while preserving order
+ seen = set()
+ unique_obj_qs = []
+ for q in obj_qs:
+ if q not in seen:
+ unique_obj_qs.append(q)
+ seen.add(q)
+
+ # Truncate to at most max_q questions
+ unique_obj_qs = unique_obj_qs[:max_q]
+
+ # Derive a position question from each object question
+ pos_qs = [
+ q.replace("Describe more details about", "Describe more details about the position of")
+ for q in unique_obj_qs
+ ]
+
+ # Return the combined list (object questions + position questions)
+ return unique_obj_qs + pos_qs
+
+
class ImageScaleCaptionPipeline:
def __init__(
self,
@@ -207,14 +352,19 @@ class ImageScaleCaptionPipeline:
# ================== Operator Initialization ==================
# --- Step A: Generate Init Caption ---
+ # Build a column holding the fixed prompt
self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+
+ # Generate the initial draft (using the generic PromptedVQAGenerator)
self.gen_init_caption = PromptedVQAGenerator(
serving=self.serving,
system_prompt="You are a helpful assistant."
)
# --- Step B: Refine Golden Sentences ---
+ # Split into sentences
self.refine_split = FunctionalRefiner(func=split_sentences)
+
# 视觉自检 (保留 Yes 的句子)
self.refine_golden = VisualGroundingRefiner(
serving=self.serving,
@@ -222,7 +372,10 @@ class ImageScaleCaptionPipeline:
)
# --- Step C: Generate Questions ---
+ # Convert the list to a string
self.refine_join = FunctionalRefiner(func=join_list)
+
+ # Generate questions from text (Text-to-Text)
tpl_q = NamedPlaceholderPromptTemplate(
template=self.prompts_db["LLM_PROMPT_1"],
join_list_with="\n"
@@ -231,16 +384,22 @@ class ImageScaleCaptionPipeline:
serving=self.serving,
prompt_template=tpl_q
)
+
+ # Parse the question text into a list
self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
# --- Step D: Generate Answers ---
+ # Batch answering (One Image -> Many Qs)
self.gen_answers = BatchVQAGenerator(serving=self.serving)
+
+ # Filter the answers
self.refine_answers = VisualGroundingRefiner(
serving=self.serving,
prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
)
# --- Step E: Integrate Final Caption ---
+ # Fuse into the final caption (Text-to-Text)
tpl_final = NamedPlaceholderPromptTemplate(
template=self.prompts_db["LLM_PROMPT_4"],
join_list_with="\n"
@@ -252,6 +411,7 @@ class ImageScaleCaptionPipeline:
def forward(self):
print(">>> [Pipeline] Step 0: Preparing Prompts...")
+ # Build the init_prompt column
self.refine_const_prompt.run(
self.storage.step(),
output_key="init_prompt"
@@ -284,11 +444,14 @@ class ImageScaleCaptionPipeline:
output_key="golden_str",
data="golden_sentences"
)
+
+ # template: "{sentence}" -> map to col "golden_str"
self.gen_questions_text.run(
self.storage.step(),
output_answer_key="raw_q_text",
sentence="golden_str"
)
+
self.refine_parse_qs.run(
self.storage.step(),
output_key="q_list",
@@ -302,6 +465,7 @@ class ImageScaleCaptionPipeline:
input_image_key=self.input_image_key,
output_key="raw_answers"
)
+
self.refine_answers.run(
self.storage.step(),
input_list_key="raw_answers",
@@ -315,48 +479,35 @@ class ImageScaleCaptionPipeline:
output_key="details_str",
data="final_details"
)
+
+ # template keys: context, object_info, position_info
self.gen_final_caption.run(
self.storage.step(),
output_answer_key=self.output_key,
context="golden_str",
object_info="details_str",
- position_info="details_str"
+ position_info="details_str" # Simplification: the same string serves as both object and position info
)
print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline")
-
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
- parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
- parser.add_argument("--download_dir", default="./ckpt/models")
- parser.add_argument("--device", default="cuda")
-
- parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
- parser.add_argument("--cache_path", default="./cache_scalecap_results")
- parser.add_argument("--file_name_prefix", default="scalecap")
- parser.add_argument("--input_image_key", default="image")
- parser.add_argument("--output_key", default="final_caption")
-
- parser.add_argument("--tp", type=int, default=1)
- parser.add_argument("--max_tokens", type=int, default=1024)
-
- args = parser.parse_args()
-
pipe = ImageScaleCaptionPipeline(
- model_path=args.model_path,
- hf_cache_dir=args.hf_cache_dir,
- download_dir=args.download_dir,
- device=args.device,
- first_entry_file=args.input_jsonl,
- cache_path=args.cache_path,
- file_name_prefix=args.file_name_prefix,
- input_image_key=args.input_image_key,
- output_key=args.output_key,
- vllm_tensor_parallel_size=args.tp,
- vllm_max_tokens=args.max_tokens
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ device="cuda",
+
+ first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path="../cache/image_scale_caption",
+ file_name_prefix="scalecap",
+
+ input_image_key="image",
+ output_key="final_caption",
+
+ vllm_tensor_parallel_size=1,
+ vllm_max_tokens=1024
)
pipe.forward()
diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
new file mode 100644
index 00000000..87ad3fa7
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
@@ -0,0 +1,477 @@
+---
+title: ScaleCap High-Density Caption Pipeline (API version)
+icon: mdi:image-text
+createTime: 2026/01/11 22:08:57
+permalink: /en/mm_guide/image_scale_caption_pipeline_api/
+---
+
+## 1. Overview
+
+The **Image Scale Caption Pipeline (ScaleCap)** is an advanced image captioning solution based on a **"Generate-Verify-Expand-Integrate"** paradigm. This pipeline is designed to generate image descriptions with **extremely high information density** and **ultra-low hallucination rates**, making it particularly suitable for scenarios requiring deep understanding of image details.
+
+The theoretical foundation of this method is derived from the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*. It gradually uncovers object and spatial details through multi-turn dialogue and visual grounding, effectively filtering out hallucinations produced by the model.
+
+We support the following application scenarios:
+
+* **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions.
+* **Fine-Grained Image Retrieval**: Providing highly detailed text for indexing.
+* **Blind Assistance / Image Accessibility**: Generating "what-you-see-is-what-you-get" detailed narrations.
+
+The main process of the pipeline includes:
+
+1. **Initial Caption Generation**: The VLM generates a basic description.
+2. **Visual Debiasing**: The description is split into sentences, and each is verified against visual evidence (Visual Grounding).
+3. **Detail Questioning**: Targeted questions regarding object attributes and spatial relations are generated based on the verified "Golden Sentences".
+4. **Answering & Secondary Verification**: The VLM answers the detail questions, followed by another round of visual grounding to filter out incorrect details.
+5. **Final Integration**: All verified information is woven into a coherent, comprehensive long caption.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_dataflow
+cd run_dataflow
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/image_scale_caption_api_pipeline.py
+
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/image_scale_caption_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### Step 5: Configure Parameters
+
+Configure the API service and input data paths in `api_pipelines/image_scale_caption_api_pipeline.py`:
+
+```python
+ def __init__(
+ self,
+ # Storage params
+ first_entry_file: str = "../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path: str = "../cache/image_scale_caption",
+ file_name_prefix: str = "scalecap",
+ cache_type: str = "jsonl",
+ # Keys
+ input_image_key: str = "image",
+ output_key: str = "final_caption",
+ ):
+
+```
+
+```python
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+```
+
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python image_scale_caption_api_pipeline.py
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data for this process is very simple, requiring only the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+ "image": "../example_data/capsbench_images/0.png"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline orchestrates multiple fine-grained operators to achieve the complex ScaleCap logic:
+
+#### A. **Initial Generation (PromptedVQAGenerator)**
+
+* **Function**: Uses a basic prompt to generate a preliminary description of the image (`init_caption`).
+
+#### B. **Visual Debiasing (VisualGroundingRefiner)**
+
+* **Function**: The core anti-hallucination mechanism of ScaleCap.
+* **Logic**:
+1. Uses `split_sentences` to break the initial draft into single sentences.
+2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?".
+3. Retains only the sentences that receive a "Yes", forming **"Golden Sentences"**.
+
+
+
+#### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)**
+
+* **Function**: Uses LLM capabilities to generate targeted follow-up questions based on the Golden Sentences.
+* **Logic**: The model generates text like "Describe more details about the [Object]". The `parse_questions_logic` function automatically expands these into two categories: **object details** and **spatial relationships**.
+
+#### D. **Batch Answering & Secondary Filtering (BatchVQAGenerator & Refiner)**
+
+* **Function**: Deeply mines visual information.
+* **Logic**:
+1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a single batch.
+2. Uses `VisualGroundingRefiner` again to verify if these newly generated details are accurate.
+3. Retains reliable details (`final_details`).
+
+
+
+#### E. **Final Integration (PromptTemplatedQAGenerator)**
+
+* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent, cohesive text.
+* **Output**: `final_caption`.
+
+### 3. **Output Data**
+
+The output data records the entire pipeline process for easy debugging and analysis:
+
+* **init_caption**: The original initial draft.
+* **golden_sentences**: List of sentences that passed the first debiasing check.
+* **q_list**: List of generated follow-up questions.
+* **final_details**: Detailed answers that passed the secondary check.
+* **final_caption**: The final high-density description.
+
+**Output Data Example**:
+
+```json
+{
+ "image": "../example_data/capsbench_images/0.png",
+ "init_caption": "A dog sitting on a bench.",
+ "golden_sentences": ["A dog is sitting on a wooden bench."],
+ "q_list": ["Describe more details about the dog.", "Describe more details about the bench.", "Describe more details about the position of the dog.", "Describe more details about the position of the bench."],
+ "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
+ "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageScaleCaptionPipeline` code implementation (API Version).
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+
+import re
+import argparse
+from typing import Callable, Any, List
+
+from dataflow.utils.storage import FileStorage
+
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.prompts.image import ImageScaleCaptionPrompt
+
+from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
+from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+def split_sentences(text: str) -> List[str]:
+ """Split text into a list of sentences."""
+ if not text or not isinstance(text, str):
+ return []
+ # Split on whitespace that follows sentence-ending punctuation (. ! ? 。 ! ?)
+ _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+")
+ parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+ return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+ """Join a list into a single string."""
+ if isinstance(data, list):
+ # Drop falsy items (None, empty strings) and stringify the rest
+ valid_items = [str(x) for x in data if x]
+ return separator.join(valid_items)
+ return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+ """
+ Parse the LLM-generated "Describe more details about..." text
+ and automatically add the matching position questions.
+ """
+ if not text or not isinstance(text, str):
+ return []
+
+ lines = [t.strip() for t in text.split("\n") if t.strip()]
+ obj_qs = []
+
+ for line in lines:
+ # Keep only lines containing "Describe more details about"
+ if "Describe more details about" in line:
+ # Strip any leading numbering (e.g. "1. Describe...")
+ try:
+ start_idx = line.find("Describe")
+ clean = line[start_idx:]
+ # Drop trailing content; keep the text up to the first period
+ if "." in clean:
+ clean = clean.split(".")[0] + "."
+ obj_qs.append(clean)
+ except Exception:
+ continue
+
+ # De-duplicate while preserving order
+ seen = set()
+ unique_obj_qs = []
+ for q in obj_qs:
+ if q not in seen:
+ unique_obj_qs.append(q)
+ seen.add(q)
+
+ # Truncate to at most max_q questions
+ unique_obj_qs = unique_obj_qs[:max_q]
+
+ # Derive a position question from each object question
+ pos_qs = [
+ q.replace("Describe more details about", "Describe more details about the position of")
+ for q in unique_obj_qs
+ ]
+
+ # Return the combined list (object questions + position questions)
+ return unique_obj_qs + pos_qs
+
+
+class ImageScaleCaptionPipeline:
+ def __init__(
+ self,
+ # Storage params
+ first_entry_file: str = "images.jsonl",
+ cache_path: str = "./cache_scalecap",
+ file_name_prefix: str = "scalecap",
+ cache_type: str = "jsonl",
+ # Keys
+ input_image_key: str = "image",
+ output_key: str = "final_caption",
+ # VLLM Config (not used by the API serving configured below)
+ vllm_tensor_parallel_size: int = 1,
+ vllm_temperature: float = 0.7,
+ vllm_top_p: float = 0.9,
+ vllm_max_tokens: int = 512,
+ ):
+ # 1. Storage
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type=cache_type,
+ )
+
+ # 2. Serving
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+ # 3. Prompts
+ self.prompts_db = ImageScaleCaptionPrompt().build_prompt()
+
+ # 4. Keys
+ self.input_image_key = input_image_key
+ self.output_key = output_key
+
+ # ================== Operator Initialization ==================
+
+ # --- Step A: Generate Init Caption ---
+ # Build a column holding the fixed prompt
+ self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+
+ # Generate the initial draft (using the generic PromptedVQAGenerator)
+ self.gen_init_caption = PromptedVQAGenerator(
+ serving=self.vlm_serving,
+ system_prompt="You are a helpful assistant."
+ )
+
+ # --- Step B: Refine Golden Sentences ---
+ # Split into sentences
+ self.refine_split = FunctionalRefiner(func=split_sentences)
+
+ # Visual self-check (keep only the sentences answered "Yes")
+ self.refine_golden = VisualGroundingRefiner(
+ serving=self.vlm_serving,
+ prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no."
+ )
+
+ # --- Step C: Generate Questions ---
+ # Convert the list to a string
+ self.refine_join = FunctionalRefiner(func=join_list)
+
+ # Generate questions from text (Text-to-Text)
+ tpl_q = NamedPlaceholderPromptTemplate(
+ template=self.prompts_db["LLM_PROMPT_1"],
+ join_list_with="\n"
+ )
+ self.gen_questions_text = PromptTemplatedQAGenerator(
+ serving=self.vlm_serving,
+ prompt_template=tpl_q
+ )
+
+ # Parse the question text into a list
+ self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
+
+ # --- Step D: Generate Answers ---
+ # Batch answering (One Image -> Many Qs)
+ self.gen_answers = BatchVQAGenerator(serving=self.vlm_serving)
+
+ # Filter the answers
+ self.refine_answers = VisualGroundingRefiner(
+ serving=self.vlm_serving,
+ prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
+ )
+
+ # --- Step E: Integrate Final Caption ---
+ # Fuse into the final caption (Text-to-Text)
+ tpl_final = NamedPlaceholderPromptTemplate(
+ template=self.prompts_db["LLM_PROMPT_4"],
+ join_list_with="\n"
+ )
+ self.gen_final_caption = PromptTemplatedQAGenerator(
+ serving=self.vlm_serving,
+ prompt_template=tpl_final
+ )
+
+ def forward(self):
+ print(">>> [Pipeline] Step 0: Preparing Prompts...")
+ # Build the init_prompt column
+ self.refine_const_prompt.run(
+ self.storage.step(),
+ output_key="init_prompt"
+ )
+
+ print(">>> [Pipeline] Step 1: Generating Initial Caption...")
+ self.gen_init_caption.run(
+ self.storage.step(),
+ input_prompt_key="init_prompt",
+ input_image_key=self.input_image_key,
+ output_answer_key="init_caption"
+ )
+
+ print(">>> [Pipeline] Step 2: Refining Golden Sentences...")
+ self.refine_split.run(
+ self.storage.step(),
+ output_key="sentences",
+ text="init_caption"
+ )
+ self.refine_golden.run(
+ self.storage.step(),
+ input_list_key="sentences",
+ input_image_key=self.input_image_key,
+ output_key="golden_sentences"
+ )
+
+ print(">>> [Pipeline] Step 3: Generating Details Questions...")
+ self.refine_join.run(
+ self.storage.step(),
+ output_key="golden_str",
+ data="golden_sentences"
+ )
+
+ # template: "{sentence}" -> map to col "golden_str"
+ self.gen_questions_text.run(
+ self.storage.step(),
+ output_answer_key="raw_q_text",
+ sentence="golden_str"
+ )
+
+ self.refine_parse_qs.run(
+ self.storage.step(),
+ output_key="q_list",
+ text="raw_q_text"
+ )
+
+ print(">>> [Pipeline] Step 4: Generating & Filtering Answers...")
+ self.gen_answers.run(
+ self.storage.step(),
+ input_prompts_key="q_list",
+ input_image_key=self.input_image_key,
+ output_key="raw_answers"
+ )
+
+ self.refine_answers.run(
+ self.storage.step(),
+ input_list_key="raw_answers",
+ input_image_key=self.input_image_key,
+ output_key="final_details"
+ )
+
+ print(">>> [Pipeline] Step 5: Integrating Final Caption...")
+ self.refine_join.run(
+ self.storage.step(),
+ output_key="details_str",
+ data="final_details"
+ )
+
+ # template keys: context, object_info, position_info
+ self.gen_final_caption.run(
+ self.storage.step(),
+ output_answer_key=self.output_key,
+ context="golden_str",
+ object_info="details_str",
+ position_info="details_str" # Simplification: the same string serves as both object and position info
+ )
+
+ print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
+
+
+if __name__ == "__main__":
+
+ pipe = ImageScaleCaptionPipeline(
+ first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path="../cache/image_scale_caption",
+ file_name_prefix="scalecap",
+ input_image_key="image",
+ output_key="final_caption",
+ vllm_tensor_parallel_size=1,
+ vllm_max_tokens=1024
+ )
+
+ pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
index f6c04ad8..4e495489 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
@@ -4,27 +4,26 @@ createTime: 2026/01/11 22:13:45
icon: mdi:image-text
permalink: /en/mm_guide/image_visual_only_mcq_pipeline/
---
+
## 1. Overview
-The **Visual-Only MCQ Pipeline** is a core component of the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple Choice Questions (MCQs) that satisfy **strict visual dependency**: the model must "see" the image to answer correctly; answering based on text alone (guessing or common sense) is not possible.
+The **Visual-Only MCQ Pipeline** is a core component within the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple-Choice Questions (MCQs) that strictly satisfy **strong visual dependency**: the model must "see" the image to answer correctly, and cannot rely merely on text guessing or common sense.
-This pipeline uses a **Generate-Parse-Verify** three-step method, leveraging **Option Rotation** and **Blind Tests** to rigorously filter out hallucinations or overly simple questions. The generated questions serve as a robust reward signal for Reinforcement Learning.
+This pipeline utilizes a **"Generate-Parse-Verify"** three-step approach, employing **Option Rotation** and **Blind Test (Text-Only)** mechanisms to rigorously filter out model hallucinations or overly simple questions. The generated questions can be used as reward signals (Reward Model) for reinforcement learning.
The main process includes:
-1. **MCQ Generation**: VLM generates raw QA pairs based on the image.
-2. **Structured Parsing**: Using regex logic to parse text into standard question/option structures.
+1. **MCQ Generation**: The VLM generates raw Question-Answer text blocks based on the image.
+2. **Structured Parsing**: Uses regex logic to parse the raw text into standard question and option structures.
3. **Visual Dependency Verification**:
-* **Rotation Test**: Shuffling options multiple times to eliminate positional bias.
-* **Dual Filtering**: Requiring high "Visual Accuracy" and low "Text-only Accuracy".
-
-
+ * **Rotation Test**: Randomly shuffles the order of options multiple times to eliminate positional bias.
+ * **Dual Filtering**: Requires a high "Visual Accuracy" (with image) and a low "Textual Accuracy" (without image).
---
## 2. Quick Start
-### Step 1: Create Working Directory
+### Step 1: Create a New DataFlow Working Directory
```bash
mkdir run_vis_mcq
@@ -32,36 +31,109 @@ cd run_vis_mcq
```
-### Step 2: Prepare Script
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
-Save the code in the "Pipeline Example" section below as `visual_mcq_pipeline.py`.
+```bash
+gpu_pipelines/image_visual_only_mcq_pipeline.py
+
+```
-### Step 3: Download Example Data
+### Step 3: Download Sample Data
```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure Parameters
+
+Configure the model path and filtering thresholds (e.g., requiring 100% visual accuracy and at most 25% textual accuracy):
+
+```python
+if __name__ == "__main__":
+ pipe = VisualOnlyMCQPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
+ )
+ pipe.forward()
```
-### Step 4: Run
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the example above; the class default `./ckpt/models` does not satisfy this). Otherwise, the model will not be recognized after downloading.
+>
+>
+
+### Step 5: Run
```bash
-python visual_mcq_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_file "data/captions.jsonl" \
- --rotate_num 4 \
- --pass_vis 1.0 \
- --pass_txt 0.25
+cd gpu_pipelines
+python image_visual_only_mcq_pipeline.py
```
+> **🛠️ Troubleshooting**
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_visual_only_mcq_pipeline.py
+>
+> ```
+>
+>
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+> **After modification:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+
---
## 3. Data Flow & Logic
### 1. **Input Data**
-Input only requires the image path:
+The input data only requires the image path:
* **image**: Path to the image file.
@@ -76,35 +148,35 @@ Input only requires the image path:
### 2. **Core Operator Logic**
-This pipeline chains three key operators:
+This pipeline chains together three key operators:
-#### A. **FixPromptedVQAGenerator (Raw Generation)**
+#### A. **Raw Generation (FixPromptedVQAGenerator)**
-* **Function**: Uses CapRL predefined Prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to generate 5 MCQs at once.
-* **Output**: Unstructured text block containing multiple `#### Question` and options.
+* **Function**: Uses the preset CapRL prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to instruct the VLM to generate 5 MCQs in one go.
+* **Output**: Unstructured text blocks containing multiple `#### Question` headers and options.
-#### B. **FunctionalRefiner (Regex Parsing)**
+#### B. **Structured Parsing (FunctionalRefiner)**
* **Logic Function**: `parse_mcq_text_logic`
-* **Function**: Extracts questions, options (A-F), and correct answers from raw text using regex.
-* **Output**: Structured MCQ list (`parsed_mcq_list`).
+* **Function**: Extracts the questions, options (A-F), and correct answers from the raw text using regular expressions.
+* **Output**: A structured list of MCQs (`parsed_mcq_list`).
-#### C. **VisualDependencyRefiner (Dependency Verification)**
+#### C. **Dependency Verification (VisualDependencyRefiner)**
-This is the core filter. It performs N inferences (N = `rotate_num`) for each question:
+This is the core filter of the pipeline. It performs N inferences (N = `rotate_num`) for each question:
-1. **Option Rotation**: Randomly shuffles options (e.g., moving answer from A to C) to prevent the model from cheating by "always picking A".
-2. **Visual Pass**: Input Image + Question. Records the model's accuracy.
-3. **Textual Pass**: Input Question only (no image). Records the model's blind guessing accuracy.
+1. **Option Rotation**: Randomly shuffles the option order (e.g., moving the answer from A to C) to prevent the model from cheating by "always choosing A".
+2. **Visual Pass**: Inputs Image + Question. Records the proportion of correct answers.
+3. **Textual Pass (Blind Test)**: Inputs Question only (No Image). Records the proportion of correct blind guesses.
4. **Filtering Criteria**:
-* Keep the question IF AND ONLY IF: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
-* *Example*: If a question can be answered correctly without the image (high text accuracy), it tests common sense rather than vision, so it is **discarded**.
+* Retains the question if and only if: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
+* *Example*: If a question can be answered correctly without looking at the image (high textual accuracy), it relies on common sense rather than visual info, and is **discarded**.
### 3. **Output Data**
-The output data (`final_mcqs`) contains only questions that passed rigorous verification. These questions possess high quality and visual relevance.
+The output data (`final_mcqs`) only contains questions that have passed the rigorous verification. These questions possess extremely high quality and visual relevance.
**Output Data Example**:
@@ -116,8 +188,8 @@ The output data (`final_mcqs`) contains only questions that passed rigorous veri
"question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
"answer": "A",
"stats": {
- "visual_acc": 1.0, # 4/4 correct with image
- "text_acc": 0.0 # 0/4 correct without image
+ "visual_acc": 1.0,
+ "text_acc": 0.0
}
}
]
@@ -129,12 +201,10 @@ The output data (`final_mcqs`) contains only questions that passed rigorous veri
## 4. Pipeline Example
-Below is the complete `VisualOnlyMCQPipeline` code implementation.
+Below is the complete `VisualOnlyMCQPipeline` code implementation (GPU Version).
```python
import argparse
-import re
-from typing import List, Dict, Any
from dataflow.utils.storage import FileStorage
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -142,13 +212,14 @@ from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDepend
from dataflow.operators.core_text import FunctionalRefiner
from dataflow.prompts.image import ImageCaprlPrompt
-# 正则解析逻辑
+import re
+from typing import List, Dict, Any
+
_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
- """将 VLM 生成的原始文本解析为结构化字典列表"""
if not mcq_text or not isinstance(mcq_text, str): return []
indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
@@ -210,7 +281,9 @@ class VisualOnlyMCQPipeline:
model_path: str,
*,
first_entry_file: str,
- cache_path: str = "./cache_mcq",
+ hf_cache_dir: str | None = None,
+ download_dir: str = "./ckpt/models",
+ cache_path: str = "../cache/cache_mcq",
file_name_prefix: str = "vis_mcq",
# Config
rotate_num: int = 4,
@@ -224,7 +297,6 @@ class VisualOnlyMCQPipeline:
device: str = "cuda",
vllm_max_tokens: int = 2048
):
- # 1. 初始化存储
self.storage = FileStorage(
first_entry_file_name=first_entry_file,
cache_path=cache_path,
@@ -232,15 +304,16 @@ class VisualOnlyMCQPipeline:
cache_type="jsonl"
)
- # 2. 初始化 VLM 服务
self.serving = LocalModelVLMServing_vllm(
+ hf_cache_dir=hf_cache_dir,
+ hf_local_dir=download_dir,
hf_model_name_or_path=model_path,
vllm_tensor_parallel_size=1,
- vllm_temperature=0.1, # 低温度以保证格式稳定
+ vllm_temperature=0.1,
vllm_max_tokens=vllm_max_tokens
)
- # Keys 配置
+ # Keys
self.keys = {
"img": input_image_key,
"raw_text": "raw_mcq_text",
@@ -248,23 +321,24 @@ class VisualOnlyMCQPipeline:
"final": output_key
}
- # 加载 Prompt 库
+ # --- Prompts ---
self.prompts_db = ImageCaprlPrompt().build_prompt()
- # ================== 算子初始化 ==================
+ # ================== Operators ==================
- # 算子 1: 生成原始 MCQ 文本
+ # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+ # Use the prompt strings from the prompt class directly
self.op_gen_raw = FixPromptedVQAGenerator(
serving=self.serving,
system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
)
- # 算子 2: 解析文本为结构化数据
+ # 2. Parse MCQs (Refine)
self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
- # 算子 3: 视觉依赖性验证 (核心过滤)
- # 包含旋转 (Rotation) 和 无图检测 (Text-only check)
+ # 3. Verify Visual Dependency (Refine)
+ # Pass in the prompt template
self.op_verify = VisualDependencyRefiner(
serving=self.serving,
instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
@@ -301,22 +375,15 @@ class VisualOnlyMCQPipeline:
print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
- parser.add_argument("--rotate_num", type=int, default=4)
- parser.add_argument("--pass_vis", type=float, default=1.0)
- parser.add_argument("--pass_txt", type=float, default=0.25)
-
- args = parser.parse_args()
-
pipe = VisualOnlyMCQPipeline(
- model_path=args.model_path,
- first_entry_file=args.input_file,
- rotate_num=args.rotate_num,
- pass_visual_min=args.pass_vis,
- pass_textual_max=args.pass_txt
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
)
pipe.forward()
-```
\ No newline at end of file
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
new file mode 100644
index 00000000..054ade54
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
@@ -0,0 +1,341 @@
+---
+title: Visual-Only MCQ Pipeline (API version)
+createTime: 2026/01/11 22:13:45
+icon: mdi:image-text
+permalink: /en/mm_guide/image_visual_only_mcq_pipeline_api/
+---
+
+## 1. Overview
+
+The **Visual-Only MCQ Pipeline** is a core component within the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple-Choice Questions (MCQs) that strictly satisfy **strong visual dependency**: the model must "see" the image to answer correctly, and cannot rely merely on text guessing or common sense.
+
+This pipeline utilizes a **"Generate-Parse-Verify"** three-step approach, employing **Option Rotation** and **Blind Test (Text-Only)** mechanisms to rigorously filter out model hallucinations or overly simple questions. The generated questions can be used as reward signals (Reward Model) for reinforcement learning.
+
+The main process includes:
+
+1. **MCQ Generation**: The VLM generates raw Question-Answer text blocks based on the image.
+2. **Structured Parsing**: Uses regex logic to parse the raw text into standard question and option structures.
+3. **Visual Dependency Verification**:
+ * **Rotation Test**: Randomly shuffles the order of options multiple times to eliminate positional bias.
+ * **Dual Filtering**: Requires a high "Visual Accuracy" (with image) and a low "Textual Accuracy" (without image).
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_vis_mcq
+cd run_vis_mcq
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/image_visual_only_mcq_api_pipeline.py
+
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/image_visual_only_mcq_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### Step 5: Configure Parameters
+
+Configure the API service and run parameters in `api_pipelines/image_visual_only_mcq_api_pipeline.py` (e.g., requiring 100% visual accuracy and at most 25% textual accuracy):
+
+```python
+ pipe = VisualOnlyMCQPipeline(
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
+ )
+
+```
+
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python image_visual_only_mcq_api_pipeline.py
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data only requires the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+ "image": "./images/sample_01.jpg"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline chains together three key operators:
+
+#### A. **Raw Generation (FixPromptedVQAGenerator)**
+
+* **Function**: Uses the preset CapRL prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to instruct the VLM to generate 5 MCQs in one go.
+* **Output**: Unstructured text blocks containing multiple `#### Question` headers and options.
+
+#### B. **Structured Parsing (FunctionalRefiner)**
+
+* **Logic Function**: `parse_mcq_text_logic`
+* **Function**: Extracts the questions, options (A-F), and correct answers from the raw text using regular expressions.
+* **Output**: A structured list of MCQs (`parsed_mcq_list`).
+
+#### C. **Dependency Verification (VisualDependencyRefiner)**
+
+This is the core filter of the pipeline. It performs N inferences (N = `rotate_num`) for each question:
+
+1. **Option Rotation**: Randomly shuffles the option order (e.g., moving the answer from A to C) to prevent the model from cheating by "always choosing A".
+2. **Visual Pass**: Inputs Image + Question. Records the proportion of correct answers.
+3. **Textual Pass (Blind Test)**: Inputs Question only (No Image). Records the proportion of correct blind guesses.
+4. **Filtering Criteria**:
+* Retains the question if and only if: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
+* *Example*: If a question can be answered correctly without looking at the image (high textual accuracy), it relies on common sense rather than visual info, and is **discarded**.
+
+
+
+### 3. **Output Data**
+
+The output data (`final_mcqs`) only contains questions that have passed the rigorous verification. These questions possess extremely high quality and visual relevance.
+
+**Output Data Example**:
+
+```json
+{
+ "image": "./images/sample_01.jpg",
+ "final_mcqs": [
+ {
+ "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
+ "answer": "A",
+ "stats": {
+ "visual_acc": 1.0,
+ "text_acc": 0.0
+ }
+ }
+ ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisualOnlyMCQPipeline` code implementation (API Version).
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+import argparse
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.image import ImageCaprlPrompt
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+import re
+from typing import List, Dict, Any
+
+_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
+_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
+_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
+
+def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
+ if not mcq_text or not isinstance(mcq_text, str): return []
+
+ indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
+ if not indices: return []
+ indices.append(len(mcq_text))
+ blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)]
+
+ parsed = []
+ for block in blocks:
+ lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()]
+ q_title_m = _Q_BLOCK_SPLIT.search(block)
+ if not q_title_m: continue
+
+ q_title = q_title_m.group(1).strip()
+ options = {}
+ ans_letter, ans_text = None, None
+
+ for ln in lines:
+ m_opt = _OPT_LINE_RE.match(ln)
+ if m_opt:
+ options[m_opt.group(1)] = m_opt.group(2).strip()
+ continue
+ m_ans = _ANS_LINE_RE.match(ln)
+ if m_ans:
+ ans_letter = m_ans.group(1).upper()
+ ans_text = m_ans.group(2).strip()
+ break
+
+ if options and ans_letter and ans_letter in options:
+ q_lines = [q_title]
+ for lbl in ["A", "B", "C", "D", "E", "F"]:
+ if lbl in options:
+ q_lines.append(f" - {lbl}) {options[lbl]}")
+
+ parsed.append({
+ "question": "\n".join(q_lines),
+ "question_title": q_title,
+ "options": options,
+ "answer": ans_letter,
+ "answer_text": ans_text
+ })
+
+ if expected > 0:
+ parsed = parsed[:expected]
+
+ uniq = []
+ seen = set()
+ for it in parsed:
+ key = (it["question_title"], it["answer"])
+ if key not in seen:
+ seen.add(key)
+ uniq.append(it)
+ return uniq
+
+
+class VisualOnlyMCQPipeline:
+ def __init__(
+ self,
+ *,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_mcq",
+ file_name_prefix: str = "vis_mcq",
+ # Config
+ rotate_num: int = 4,
+ pass_visual_min: float = 1.0,
+ pass_textual_max: float = 0.25,
+ add_none_above: bool = True,
+ # Keys
+ input_image_key: str = "image",
+ output_key: str = "final_mcqs",
+ # VLLM
+ vllm_max_tokens: int = 2048
+ ):
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type="jsonl"
+ )
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+
+ # Keys
+ self.keys = {
+ "img": input_image_key,
+ "raw_text": "raw_mcq_text",
+ "parsed_list": "parsed_mcq_list",
+ "final": output_key
+ }
+
+ # --- Prompts ---
+ self.prompts_db = ImageCaprlPrompt().build_prompt()
+
+ # ================== Operators ==================
+
+ # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+ # Use the prompt strings from the prompt class directly
+ self.op_gen_raw = FixPromptedVQAGenerator(
+ serving=self.vlm_serving,
+ system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
+ user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
+ )
+
+ # 2. Parse MCQs (Refine)
+ self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
+
+ # 3. Verify Visual Dependency (Refine)
+ # Pass in the prompt template
+ self.op_verify = VisualDependencyRefiner(
+ serving=self.vlm_serving,
+ instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
+ rotate_num=rotate_num,
+ pass_visual_min=pass_visual_min,
+ pass_textual_max=pass_textual_max,
+ add_none_above_visual=add_none_above
+ )
+
+ def forward(self):
+ print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...")
+ self.op_gen_raw.run(
+ self.storage.step(),
+ input_image_key=self.keys["img"],
+ output_answer_key=self.keys["raw_text"]
+ )
+
+ print(">>> [Pipeline] Step 2: Parsing MCQs...")
+ self.op_parse.run(
+ self.storage.step(),
+ output_key=self.keys["parsed_list"],
+ mcq_text=self.keys["raw_text"],
+ expected=5
+ )
+
+ print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...")
+ self.op_verify.run(
+ self.storage.step(),
+ input_list_key=self.keys["parsed_list"],
+ input_image_key=self.keys["img"],
+ output_key=self.keys["final"]
+ )
+
+ print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
+
+if __name__ == "__main__":
+ pipe = VisualOnlyMCQPipeline(
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
+ )
+ pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
index c559087b..799a867e 100644
--- a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
@@ -7,27 +7,27 @@ permalink: /en/mm_guide/vision_mct_reasoning_pipeline/
## 1. Overview
-The **Vision MCTS Reasoning Pipeline** is designed to construct high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two types of data sources: existing Monte Carlo Tree Search (MCTS) trajectory data, or direct generation of new reasoning chains using a VLM.
+The **Vision MCTS Reasoning Pipeline** is designed to build high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two sources of data: existing Monte Carlo Tree Search (MCTS) trajectory data, or generating new reasoning chains directly using a VLM.
-This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**, converting complex tree-search processes into a linearized `......` format that models can learn from.
+This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**. It "linearizes" complex tree-like search processes into a `......` format that the model can learn from.
We support the following application scenarios:
-* **MCTS Data Extraction**: Converting high-value paths (Rollouts) from search trees into linear training data.
-* **Hybrid Data Construction**: Automatically falling back to VLM-based CoT generation for samples without search trees.
-* **Spatial Reasoning Enhancement**: Supporting the generation of spatial reasoning chains containing explicit coordinates (Bounding Boxes).
+* **Data Extraction from MCTS Trees**: Converts high-value paths (Rollouts) in the search tree into linear training data.
+* **Hybrid Data Construction**: Automatically falls back to using the VLM for CoT generation for samples without a search tree.
+* **Spatial Reasoning Enhancement**: Supports generating spatial reasoning chains that include explicit coordinates (Bounding Boxes).
The main process of the pipeline includes:
-1. **MCTS Tree Parsing**: Parsing the search tree structure in the input data to extract successful reasoning paths.
-2. **Visual Reasoning Generation (Fallback)**: Using a VLM to regenerate reasoning chains for samples where the tree structure is missing or parsing fails.
-3. **Data Standardization**: Outputting reasoning chain data in a unified format.
+1. **MCTS Tree Parsing**: Parses the search tree structure in the input data and extracts successful reasoning paths.
+2. **Visual Reasoning Generation (Fallback)**: For samples with missing tree structures or failed parsing, the VLM is used to regenerate the reasoning chain.
+3. **Data Standardization**: Outputs reasoning chain data in a unified format.
---
## 2. Quick Start
-### Step 1: Create a Working Directory
+### Step 1: Create a New DataFlow Working Directory
```bash
mkdir run_mcts_reasoning
@@ -35,38 +43,111 @@ cd run_mcts_reasoning
```
-### Step 2: Prepare the Script
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
-Save the code in the "Pipeline Example" section below as `vision_mcts_pipeline.py`.
+```bash
+gpu_pipelines/vision_mcts_pipeline.py
+
+```
-### Step 3: Download Example Data
+### Step 3: Download Sample Data
```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
```
-### Step 4: Run
+### Step 4: Configure Parameters
+
+Ensure the input file (jsonl) contains a `tree` field (for extraction) or just `question`/`image` (for generation).
+
+```python
+if __name__ == "__main__":
+ pipe = VisionMCTSReasoningPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ )
+ pipe.forward()
+
+```
+
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the example above; the class default `./ckpt/models` does not satisfy this). Otherwise, the model will not be recognized after downloading.
+>
+>
+
+### Step 5: Run
```bash
-python vision_mcts_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_file "data/mcts_trajectories.jsonl" \
- --prompt_type "spatial"
+cd gpu_pipelines
+python vision_mcts_pipeline.py
```
+> **🛠️ Troubleshooting**
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python vision_mcts_pipeline.py
+>
+> ```
+>
+>
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+> **After modification:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+
---
## 3. Data Flow & Logic
### 1. **Input Data**
-Input data typically comes from MCTS search logs or unlabelled image-text pairs:
+Input data typically originates from MCTS search process logs, or unannotated image-text pairs:
* **image**: Path to the image.
-* **question**: Visual question.
-* **tree** (optional): JSON structure of the MCTS search tree, containing node values, visit counts, and actions.
+* **question**: The visual question.
+* **tree** (Optional): JSON structure of the MCTS search tree, containing node Values, Visits, and Actions.
**Input Data Example**:
@@ -81,30 +162,30 @@ Input data typically comes from MCTS search logs or unlabelled image-text pairs:
### 2. **Core Operator Logic**
-The pipeline employs an **"Extract First, Fallback to Generate"** hybrid strategy:
+This pipeline uses a hybrid strategy of **"Extraction First, Generation as Fallback"**:
-#### A. **MCTSTreeRefiner**
+#### A. **MCTSTreeRefiner (Tree Structure Parser)**
-This operator is responsible for processing the `tree` field. It traverses the tree structure and filters for the best paths from root to leaf based on node Q-values.
+This operator handles the `tree` field. It traverses the tree structure and selects the best path from the root node to a leaf node based on node Q-values.
* **Input**: `tree` object.
-* **Functionality**: Linearizes tree paths, filtering out low-value or incomplete search branches.
-* **Output**: List of extracted reasoning chains (`mcts_chains`).
+* **Function**: Linearizes tree paths, filtering out low-value or incomplete search branches.
+* **Output**: A list of extracted reasoning chains (`mcts_chains`).
-#### B. **VisualReasoningGenerator**
+#### B. **VisualReasoningGenerator (Visual Reasoning Generator)**
-This operator is the "Generation Engine" of the pipeline. It takes the extraction results from the previous step as input.
+This operator is the "generation engine" of the pipeline. It receives the extraction result from the previous step as input.
* **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`).
-* If MCTS parsing was successful (chains exist), it reuses them directly without running inference (saving compute).
-* If MCTS chains are empty (tree missing or parsing failed), it calls the VLM to generate reasoning chains from scratch based on the `prompt_type`.
+* If MCTS parsing is successful (chain exists), it is reused directly without inference (saving computational resources).
+* If the MCTS chain is empty (tree does not exist or parsing failed), it calls the VLM to generate the reasoning chain from scratch based on `prompt_type` (e.g., `spatial`).
-* **Prompt Type**: Supports modes like `spatial` (spatial coordinate reasoning), `logical` (logical reasoning), etc.
+* **Prompt Types**: Supports modes like `spatial` (spatial coordinate reasoning) and `logical` (logical reasoning).
### 3. **Output Data**
-The final output data (`final_reasoning_chains`) will contain high-quality Chain-of-Thought data ready for SFT training.
+The final output data (`final_reasoning_chains`) contains high-quality chains of thought that can be used directly for SFT training.
**Output Example**:
@@ -122,9 +203,9 @@ The final output data (`final_reasoning_chains`) will contain high-quality Chain
## 4. Pipeline Example
-Below is the complete `VisionMCTSReasoningPipeline` code implementation.
+Below is the complete `VisionMCTSReasoningPipeline` code implementation (GPU Version).
+
```python
-import argparse
from dataflow.utils.storage import FileStorage
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -138,8 +219,10 @@ class VisionMCTSReasoningPipeline:
model_path: str,
*,
# Storage
+ hf_cache_dir: str | None = None,
+ download_dir: str = "./ckpt/models",
first_entry_file: str,
- cache_path: str = "./cache_mcts",
+ cache_path: str = "../cache/cache_mcts",
file_name_prefix: str = "mcts_reason",
# Config
prompt_type: str = "spatial",
@@ -152,7 +235,6 @@ class VisionMCTSReasoningPipeline:
# VLLM
vllm_max_tokens: int = 1024
):
- # 1. 存储初始化
self.storage = FileStorage(
first_entry_file_name=first_entry_file,
cache_path=cache_path,
@@ -160,8 +242,9 @@ class VisionMCTSReasoningPipeline:
cache_type="jsonl"
)
- # 2. 模型服务
self.serving = LocalModelVLMServing_vllm(
+ hf_cache_dir=hf_cache_dir,
+ hf_local_dir=download_dir,
hf_model_name_or_path=model_path,
vllm_tensor_parallel_size=1,
vllm_temperature=0.7,
@@ -172,20 +255,18 @@ class VisionMCTSReasoningPipeline:
"q": input_question_key,
"img": input_image_key,
"tree": input_tree_key,
- "mcts_chains": "mcts_extracted_chains", # 中间结果
+ "mcts_chains": "mcts_extracted_chains",
"final": output_key
}
# ================== Operators ==================
- # 算子 1: MCTS Tree -> Chains (提取器)
- # 负责将树结构扁平化为线性链
+ # 1. Refiner: MCTS -> Chains
self.op_mcts_refine = MCTSTreeRefiner(
max_chains_per_sample=max_samples_per_file
)
- # 算子 2: VLM -> Chains (生成器/Fallback)
- # 如果 MCTS 提取失败,则使用 VLM 生成;如果成功,则跳过
+ # 2. Generator: VLM -> Chains (Fallback)
self.op_vlm_gen = VisualReasoningGenerator(
serving=self.serving,
prompt_type=prompt_type
@@ -200,7 +281,8 @@ class VisionMCTSReasoningPipeline:
)
print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
- # 注意:input_existing_chains_key 实现了混合/回退逻辑
+        # Pass mcts_chains in as input_existing_chains_key.
+        # If MCTS parsing succeeded, the chains are reused; otherwise the VLM generates them.
self.op_vlm_gen.run(
self.storage.step(),
input_question_key=self.keys["q"],
@@ -211,17 +293,13 @@ class VisionMCTSReasoningPipeline:
if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl")
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
- parser.add_argument("--prompt_type", default="spatial")
- args = parser.parse_args()
-
pipe = VisionMCTSReasoningPipeline(
- model_path=args.model_path,
- first_entry_file=args.input_file,
- prompt_type=args.prompt_type
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
)
pipe.forward()
-```
\ No newline at end of file
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
new file mode 100644
index 00000000..8001e5c5
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
@@ -0,0 +1,248 @@
+---
+title: Vision MCTS Reasoning Pipeline (API version)
+icon: mdi:image-text
+createTime: 2026/01/11 21:59:59
+permalink: /en/mm_guide/vision_mct_reasoning_pipeline_api/
+---
+
+## 1. Overview
+
+The **Vision MCTS Reasoning Pipeline** is designed to build high-quality **Process Supervision Data** for multimodal large models. It supports two data sources: extracting reasoning chains from existing Monte Carlo Tree Search (MCTS) trajectory data, or generating new reasoning chains directly with a VLM.
+
+This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**. It "linearizes" complex tree-like search processes into a `<think>...</think><answer>...</answer>` format that the model can learn from.
+
+We support the following application scenarios:
+
+* **Data Extraction from MCTS Trees**: Converts high-value paths (Rollouts) in the search tree into linear training data.
+* **Hybrid Data Construction**: Automatically falls back to using the VLM for CoT generation for samples without a search tree.
+* **Spatial Reasoning Enhancement**: Supports generating spatial reasoning chains that include explicit coordinates (Bounding Boxes).
+
+The main process of the pipeline includes:
+
+1. **MCTS Tree Parsing**: Parses the search tree structure in the input data and extracts successful reasoning paths.
+2. **Visual Reasoning Generation (Fallback)**: For samples with missing tree structures or failed parsing, the VLM is used to regenerate the reasoning chain.
+3. **Data Standardization**: Outputs reasoning chain data in a unified format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_mcts_reasoning
+cd run_mcts_reasoning
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/vision_mcts_api_pipeline.py
+
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/vision_mcts_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### Step 5: Configure Parameters
+
+Configure the API service and input data paths in `api_pipelines/vision_mcts_api_pipeline.py`. Ensure the input file (jsonl) contains a `tree` field (for extraction) or just `question`/`image` (for generation).
+
+```python
+ pipe = VisionMCTSReasoningPipeline(
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ )
+
+```
+
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python vision_mcts_api_pipeline.py
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input data typically originates from MCTS search process logs, or unannotated image-text pairs:
+
+* **image**: Path to the image.
+* **question**: The visual question.
+* **tree** (Optional): JSON structure of the MCTS search tree, containing node Values, Visits, and Actions.
+
+**Input Data Example**:
+
+```json
+{
+ "image": "./images/puzzle.jpg",
+ "question": "What is the next step to solve this?",
+ "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } }
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline uses a hybrid strategy of **"Extraction First, Generation as Fallback"**:
+
+#### A. **MCTSTreeRefiner (Tree Structure Parser)**
+
+This operator handles the `tree` field. It traverses the tree structure and selects the best path from the root node to a leaf node based on each node's Q-value.
+
+* **Input**: `tree` object.
+* **Function**: Linearizes tree paths, filtering out low-value or incomplete search branches.
+* **Output**: A list of extracted reasoning chains (`mcts_chains`).
+
+#### B. **VisualReasoningGenerator (Visual Reasoning Generator)**
+
+This operator is the "generation engine" of the pipeline. It receives the extraction result from the previous step as input.
+
+* **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`).
+* If MCTS parsing is successful (chain exists), it is reused directly without inference (saving computational resources).
+* If the MCTS chain is empty (tree does not exist or parsing failed), it calls the VLM to generate the reasoning chain from scratch based on `prompt_type` (e.g., `spatial`).
+
+
+* **Prompt Types**: Supports modes like `spatial` (spatial coordinate reasoning) and `logical` (logical reasoning).
+
+### 3. **Output Data**
+
+The final output data (`final_reasoning_chains`) contains high-quality chains of thought that can be used directly for SFT training.
+
+**Output Example**:
+
+```json
+{
+ "image": "./images/puzzle.jpg",
+ "final_reasoning_chains": [
+    "<think>First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...</think><answer>Move Red Block</answer>"
+ ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisionMCTSReasoningPipeline` code implementation (API Version).
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+# Import the atomic operators
+from dataflow.operators.core_text import MCTSTreeRefiner
+from dataflow.operators.core_vision import VisualReasoningGenerator
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+class VisionMCTSReasoningPipeline:
+ def __init__(
+ self,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_mcts",
+ file_name_prefix: str = "mcts_reason",
+ # Config
+ prompt_type: str = "spatial",
+ max_samples_per_file: int = 10000,
+ # Keys
+ input_question_key: str = "question",
+ input_image_key: str = "image",
+ input_tree_key: str = "tree",
+ output_key: str = "final_reasoning_chains",
+
+ ):
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type="jsonl"
+ )
+
+ self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+ self.keys = {
+ "q": input_question_key,
+ "img": input_image_key,
+ "tree": input_tree_key,
+ "mcts_chains": "mcts_extracted_chains",
+ "final": output_key
+ }
+
+ # ================== Operators ==================
+
+ # 1. Refiner: MCTS -> Chains
+ self.op_mcts_refine = MCTSTreeRefiner(
+ max_chains_per_sample=max_samples_per_file
+ )
+
+ # 2. Generator: VLM -> Chains (Fallback)
+ self.op_vlm_gen = VisualReasoningGenerator(
+ serving=self.vlm_serving,
+ prompt_type=prompt_type
+ )
+
+ def forward(self):
+ print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...")
+ self.op_mcts_refine.run(
+ self.storage.step(),
+ input_tree_key=self.keys["tree"],
+ output_key=self.keys["mcts_chains"]
+ )
+
+ print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
+        # Pass mcts_chains in as input_existing_chains_key.
+        # If MCTS parsing succeeded, the chains are reused; otherwise the VLM generates them.
+ self.op_vlm_gen.run(
+ self.storage.step(),
+ input_question_key=self.keys["q"],
+ input_image_key=self.keys["img"],
+ input_existing_chains_key=self.keys["mcts_chains"],
+ output_key=self.keys["final"]
+ )
+
+
+if __name__ == "__main__":
+ pipe = VisionMCTSReasoningPipeline(
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ )
+ pipe.forward()
+
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
index a4a11c3d..db64892a 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
@@ -27,36 +27,88 @@ permalink: /zh/mm_guide/image_gcot/
## 2. 快速开始
-### 第一步:准备工作目录
-
+### 第一步:创建新的 DataFlow 工作文件夹
```bash
-mkdir run_gcot
-cd run_gcot
-
+mkdir run_dataflow
+cd run_dataflow
```
-### 第二步:准备脚本
-
-将下文“流水线示例”中的代码保存为 `image_gcot_pipeline.py`。
+### 第二步:初始化 DataFlow-MM
+```bash
+dataflowmm init
+```
+这时你会看到:
+```bash
+gpu_pipelines/image_gcot_pipeline.py
+```
-### 第三步:配置运行参数
+### 第三步:下载示例数据
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
-确保你拥有支持定位能力的 VLM 模型(如 Qwen2.5-VL-7B-Instruct)。
+### 第四步:配置参数
```bash
-# 安装依赖
-pip install open-dataflow vllm
+if __name__ == "__main__":
+ pipe = ImageGCoTPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ )
+ pipe.forward()
```
+> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):**
+>
+> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。
-### 第四步:一键运行
+### 第五步:一键运行
```bash
-python image_gcot_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_file "data/image_qa.jsonl"
-
+cd gpu_pipelines
+python image_gcot_pipeline.py
```
+> **🛠️ 常见问题排查 (Troubleshooting)**
+>
+> **问题 1:** 如果遇到类似如下的动态链接库冲突报错:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+>
+> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_gcot_pipeline.py
+> ```
+>
+> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+>
+> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 `"type"` 字段修改为 `"rope_type"` 即可。
+>
+> **修改前:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+> ```
+>
+> **修改后:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+> ```
---
@@ -74,9 +126,9 @@ python image_gcot_pipeline.py \
```json
{
- "image": "./images/cat_dog.jpg",
- "question": "Is the cat looking at the dog?",
- "answer": "Yes"
+ "image":"../example_data/capsbench_images/0.png",
+ "question":"Who is the lead actor in the movie \"Nightmare Alley\"?",
+ "answer": "Bradley Cooper."
}
```
@@ -122,10 +174,7 @@ python image_gcot_pipeline.py \
**输出数据示例 (gcot 字段)**:
```text
-Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left.
-Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right.
-Step 3: Observe their gaze. The cat is facing the dog.
-Answer: Yes
+Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.
```
@@ -139,6 +188,7 @@ Answer: Yes
import re
from typing import List, Dict, Any
import argparse
+import gc
import torch
from dataflow.utils.storage import FileStorage
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -147,7 +197,6 @@ from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxG
from dataflow.operators.core_text import FunctionalRefiner
from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
-# 定义 Prompt 模板,强制模型输出推理步骤和关键词
GCOT_PROMPT_TEMPLATE = (
"Question: {question}\n"
"Answer: {answer}\n\n"
@@ -164,10 +213,8 @@ GCOT_PROMPT_TEMPLATE = (
DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
-# ----------------- 辅助逻辑函数 ----------------- #
-
def _parse_base(text: str) -> Dict[str, Any]:
- """基础解析逻辑:分离 CoT 文本和 Keywords 行"""
+ """基础解析逻辑(内部复用)"""
if not text: return {"cot": "", "keywords": []}
lines = text.split('\n')
cot_lines = []
@@ -175,7 +222,6 @@ def _parse_base(text: str) -> Dict[str, Any]:
for line in lines:
if line.strip().lower().startswith('keywords:'):
keyword_str = line.split(':', 1)[-1].strip()
- # 简单的分词处理
raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
keywords = [k for k in raw_kws if k]
else:
@@ -183,15 +229,42 @@ def _parse_base(text: str) -> Dict[str, Any]:
return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
def extract_clean_cot_logic(text: str) -> str:
+ """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本"""
return _parse_base(text)["cot"]
def extract_keywords_logic(text: str) -> List[str]:
- return _parse_base(text)["keywords"]
+ """[For FunctionalRefiner] 提取并合并关键词"""
+ parsed = _parse_base(text)
+ kws = parsed["keywords"]
+ cot = parsed["cot"]
+
+ if not kws or len(kws) <= 1:
+ return kws
+
+ # 简单的相邻合并逻辑
+ cot_lower = cot.lower()
+ merged = []
+ skip_indices = set()
+ for i in range(len(kws)):
+ if i in skip_indices: continue
+ best_match = kws[i]
+ best_indices = [i]
+ # 尝试向后合并 3 个词
+ for j in range(i + 1, min(i + 4, len(kws))):
+ if j in skip_indices: break
+ combined = ' '.join(kws[i:j+1])
+ if combined.lower() in cot_lower:
+ best_match = combined
+ best_indices = list(range(i, j+1))
+ else: break
+ merged.append(best_match)
+ skip_indices.update(best_indices)
+ return merged
def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
- """将 BBox 注入回 CoT 文本"""
+ """[For FunctionalRefiner] 将 BBox 注入回 CoT"""
if not cot_text or not bbox_map: return cot_text
- # 优先匹配长词,避免子串误匹配
+ # 优先匹配长词
sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
result_text = cot_text
replaced = set()
@@ -202,37 +275,35 @@ def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
answer_pos = result_text.find('Answer:')
search_limit = answer_pos if answer_pos != -1 else len(result_text)
- # 大小写不敏感查找
pos = result_text.lower().find(keyword.lower(), 0, search_limit)
if pos == -1: continue
boxes = bbox_map[keyword] # List[str]
box_str = "".join(boxes)
- # 替换:保留原词,追加 Box
replacement = f"{keyword} {box_str}"
result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
replaced.add(keyword)
return result_text
-# ----------------- 流水线定义 ----------------- #
-
class ImageGCoTPipeline:
def __init__(
self,
model_path: str,
*,
+ hf_cache_dir: str | None = None,
+ download_dir: str = "./ckpt/models",
first_entry_file: str,
- cache_path: str = "./cache_gcot",
+ cache_path: str = "../cache/cache_gcot",
file_name_prefix: str = "gcot",
- # Keys 配置
+ # Keys
question_key: str = "question",
answer_key: str = "answer",
image_key: str = "image",
output_key: str = "gcot",
+ # Config
vllm_max_tokens: int = 512
):
- # 1. 存储初始化
self.storage = FileStorage(
first_entry_file_name=first_entry_file,
cache_path=cache_path,
@@ -240,9 +311,11 @@ class ImageGCoTPipeline:
cache_type="jsonl"
)
- # 2. 模型服务 (单一模型)
+ # [单一模型 Serving]
self.vlm_serving = LocalModelVLMServing_vllm(
hf_model_name_or_path=model_path,
+ hf_cache_dir=hf_cache_dir,
+ hf_local_dir=download_dir,
vllm_tensor_parallel_size=1,
vllm_temperature=0.7,
vllm_max_tokens=vllm_max_tokens
@@ -259,28 +332,28 @@ class ImageGCoTPipeline:
"final": output_key
}
- # 3. 算子链配置
+ # ================== Operators ==================
- # Step A: 生成 CoT 和 Keywords
+ # 1. Generate CoT (通用 Generator)
self.op_gen_cot = PromptTemplatedVQAGenerator(
serving=self.vlm_serving,
system_prompt="You are a helpful assistant.",
prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
)
- # Step B: 解析清洗 CoT
+ # 2. Extract Clean CoT (通用 Refiner + Helper)
self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
- # Step C: 解析 Keywords
+ # 3. Extract Keywords (通用 Refiner + Helper)
self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
- # Step D: 生成 BBox (Grounding)
+ # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch)
self.op_bbox_gen = VLMBBoxGenerator(
serving=self.vlm_serving,
prompt_template=DEFAULT_BBOX_PROMPT
)
- # Step E: 注入 BBox 到 CoT
+ # 5. Inject GCoT (通用 Refiner + Helper)
self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
def forward(self):
@@ -289,7 +362,7 @@ class ImageGCoTPipeline:
self.storage.step(),
input_image_key=self.keys["img"],
output_answer_key=self.keys["raw_cot"],
- question=self.keys["q"],
+ question=self.keys["q"], # Template mapping
answer=self.keys["a"]
)
@@ -297,7 +370,7 @@ class ImageGCoTPipeline:
self.op_extract_cot.run(
self.storage.step(),
output_key=self.keys["clean_cot"],
- text=self.keys["raw_cot"]
+ text=self.keys["raw_cot"] # Param mapping
)
self.op_extract_kws.run(
self.storage.step(),
@@ -325,16 +398,11 @@ class ImageGCoTPipeline:
if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl")
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-
- args = parser.parse_args()
-
pipe = ImageGCoTPipeline(
- model_path=args.model_path,
- first_entry_file=args.input_file
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
)
pipe.forward()
-
```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md
new file mode 100644
index 00000000..a2a419f4
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md
@@ -0,0 +1,392 @@
+---
+title: 图像定位思维链 (GCoT) 生成流水线(API版)
+icon: mdi:image-text
+createTime: 2026/01/11 20:44:55
+permalink: /zh/mm_guide/image_gcot_api/
+---
+## 1. 概述
+
+**图像定位思维链 (GCoT) 生成流水线** 旨在自动化生成**带视觉定位的思维链(Grounded Chain-of-Thought)**数据。该流水线通过多步推理,不仅生成回答问题的逻辑步骤,还将推理过程中提到的关键物体在图像中进行空间定位(Bounding Box),从而显著提升多模态数据的可解释性和精确度。
+
+与传统方法不同,本流水线采用 **单一 VLM(如 GPT-5)** 同时完成“推理”和“定位”任务,流程更加精简高效。
+
+我们支持以下应用场景:
+
+* **增强型多模态数据构建**:为 VQA 数据集增加解释性和定位标注。
+* **复杂场景理解**:生成包含物体坐标的详细推理步骤。
+* **模型推理能力训练**:构建数据以训练模型“言之有物”,减少幻觉。
+
+流水线的主要流程包括:
+
+1. **CoT 生成**:模型生成分步推理文本,并提取关键名词。
+2. **关键词解析**:从生成的文本中清洗并提取待定位的关键词。
+3. **视觉定位 (Grounding)**:模型针对提取的关键词生成边界框 (BBox)。
+4. **信息注入**:将 BBox 坐标回填至推理文本中,形成最终的 GCoT。
+
+---
+
+## 2. 快速开始
+
+### 第一步:创建新的 DataFlow 工作文件夹
+```bash
+mkdir run_dataflow
+cd run_dataflow
+```
+
+### 第二步:初始化 DataFlow-MM
+```bash
+dataflowmm init
+```
+这时你会看到:
+```bash
+api_pipelines/image_gcot_api_pipeline.py
+```
+
+### 第三步:下载示例数据
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
+
+### 第四步:配置 API Key
+
+在 `api_pipelines/image_gcot_api_pipeline.py` 中设置 API Key 环境变量:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+```
+
+### 第五步:配置参数
+
+在 `api_pipelines/image_gcot_api_pipeline.py` 中配置 API 服务和输入数据路径:
+
+```python
+ def __init__(
+ self,
+ *,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_gcot",
+ file_name_prefix: str = "gcot",
+ question_key: str = "question",
+ answer_key: str = "answer",
+ image_key: str = "image",
+ output_key: str = "gcot",
+ vllm_max_tokens: int = 512
+ ):
+```
+
+```python
+ pipe = ImageGCoTPipeline(
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+ )
+```
+
+```python
+self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+```
+
+### 第六步:一键运行
+```bash
+cd api_pipelines
+python image_gcot_api_pipeline.py
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+该流程的输入数据通常是标准的 VQA 数据:
+
+* **image**:图像文件路径。
+* **question**:关于图像的问题。
+* **answer**:问题的标准答案(用于辅助生成 CoT)。
+
+**输入数据示例**:
+
+```json
+{
+ "image":"../example_data/capsbench_images/0.png",
+ "question":"Who is the lead actor in the movie \"Nightmare Alley\"?",
+ "answer": "Bradley Cooper."
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+本流水线通过组合多个细粒度算子来实现复杂的 GCoT 生成逻辑:
+
+#### A. **CoT 生成 (PromptTemplatedVQAGenerator)**
+
+利用预设的 `GCOT_PROMPT_TEMPLATE`,引导模型生成“步骤化推理”和“关键词列表”。
+
+* **Prompt 策略**:要求模型按 `Step 1: ...`, `Step 2: ...`, `Keywords: ...` 格式输出。
+* **输出**:包含推理文本和关键词的原始字符串。
+
+#### B. **文本清洗与提取 (FunctionalRefiner)**
+
+使用自定义函数对上一步的输出进行解析:
+
+* `extract_clean_cot_logic`:剥离关键词部分,保留纯净的 CoT 文本。
+* `extract_keywords_logic`:解析 `Keywords:` 后的内容,生成 Python List。
+
+#### C. **视觉定位 (VLMBBoxGenerator)**
+
+针对提取出的每一个关键词,调用 VLM 的定位能力生成边界框。
+
+* **输入**:图像 + 关键词列表。
+* **输出**:关键词到边界框坐标的映射字典 (Map)。
+
+#### D. **坐标注入 (FunctionalRefiner)**
+
+使用 `inject_bboxes_logic` 函数,将生成的 BBox 坐标智能插入回原始 CoT 文本中对应的单词之后。
+
+### 3. **输出数据**
+
+最终,流水线生成的输出数据将包含以下关键字段:
+
+* **raw_cot_output**:模型原始生成的文本。
+* **cleaned_cot**:清洗后的纯推理文本。
+* **bbox_mapping**:关键词与其坐标的映射。
+* **gcot**:最终结果,包含坐标信息的推理链。
+
+**输出数据示例 (gcot 字段)**:
+
+```text
+Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper. \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `ImageGCoTPipeline` 代码实现(API 版)。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+import re
+from typing import List, Dict, Any
+import argparse
+import gc
+import torch
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+GCOT_PROMPT_TEMPLATE = (
+ "Question: {question}\n"
+ "Answer: {answer}\n\n"
+ "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains "
+ "how to arrive at this answer based on the image.\n"
+ "Then, extract key nouns and objects mentioned in your reasoning that are "
+ "visible in the image and can be spatially located.\n\n"
+ "Format:\n"
+ "Step 1: ...\n"
+ "Step 2: ...\n"
+ "Answer: {answer}\n"
+ "Keywords: object1, object2\n"
+)
+
+DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
+
+def _parse_base(text: str) -> Dict[str, Any]:
+ """基础解析逻辑(内部复用)"""
+ if not text: return {"cot": "", "keywords": []}
+ lines = text.split('\n')
+ cot_lines = []
+ keywords = []
+ for line in lines:
+ if line.strip().lower().startswith('keywords:'):
+ keyword_str = line.split(':', 1)[-1].strip()
+ raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
+ keywords = [k for k in raw_kws if k]
+ else:
+ cot_lines.append(line)
+ return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
+
+def extract_clean_cot_logic(text: str) -> str:
+ """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本"""
+ return _parse_base(text)["cot"]
+
+def extract_keywords_logic(text: str) -> List[str]:
+ """[For FunctionalRefiner] 提取并合并关键词"""
+ parsed = _parse_base(text)
+ kws = parsed["keywords"]
+ cot = parsed["cot"]
+
+ if not kws or len(kws) <= 1:
+ return kws
+
+ # 简单的相邻合并逻辑
+ cot_lower = cot.lower()
+ merged = []
+ skip_indices = set()
+ for i in range(len(kws)):
+ if i in skip_indices: continue
+ best_match = kws[i]
+ best_indices = [i]
+ # 尝试向后合并 3 个词
+ for j in range(i + 1, min(i + 4, len(kws))):
+ if j in skip_indices: break
+ combined = ' '.join(kws[i:j+1])
+ if combined.lower() in cot_lower:
+ best_match = combined
+ best_indices = list(range(i, j+1))
+ else: break
+ merged.append(best_match)
+ skip_indices.update(best_indices)
+ return merged
+
+def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
+ """[For FunctionalRefiner] 将 BBox 注入回 CoT"""
+ if not cot_text or not bbox_map: return cot_text
+ # 优先匹配长词
+ sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
+ result_text = cot_text
+ replaced = set()
+
+ for keyword in sorted_keywords:
+ if keyword in replaced: continue
+ # 简单策略:只在 'Answer:' 之前注入,防止破坏答案区
+ answer_pos = result_text.find('Answer:')
+ search_limit = answer_pos if answer_pos != -1 else len(result_text)
+
+ pos = result_text.lower().find(keyword.lower(), 0, search_limit)
+ if pos == -1: continue
+
+ boxes = bbox_map[keyword] # List[str]
+ box_str = "".join(boxes)
+ replacement = f"{keyword} {box_str}"
+
+ result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
+ replaced.add(keyword)
+ return result_text
+
+class ImageGCoTPipeline:
+ def __init__(
+ self,
+ *,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_gcot",
+ file_name_prefix: str = "gcot",
+ # Keys
+ question_key: str = "question",
+ answer_key: str = "answer",
+ image_key: str = "image",
+ output_key: str = "gcot",
+ # Config
+ vllm_max_tokens: int = 512
+ ):
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type="jsonl"
+ )
+
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+ self.keys = {
+ "q": question_key,
+ "a": answer_key,
+ "img": image_key,
+ "raw_cot": "raw_cot_output",
+ "clean_cot": "cleaned_cot",
+ "keywords": "extracted_keywords",
+ "bbox_map": "bbox_mapping",
+ "final": output_key
+ }
+
+ # ================== Operators ==================
+
+ # 1. Generate CoT (通用 Generator)
+ self.op_gen_cot = PromptTemplatedVQAGenerator(
+ serving=self.vlm_serving,
+ system_prompt="You are a helpful assistant.",
+ prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
+ )
+
+ # 2. Extract Clean CoT (通用 Refiner + Helper)
+ self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
+
+ # 3. Extract Keywords (通用 Refiner + Helper)
+ self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
+
+ # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch)
+ self.op_bbox_gen = VLMBBoxGenerator(
+ serving=self.vlm_serving,
+ prompt_template=DEFAULT_BBOX_PROMPT
+ )
+
+ # 5. Inject GCoT (通用 Refiner + Helper)
+ self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
+
+ def forward(self):
+ print(">>> [Pipeline] Step 1: Generating CoT...")
+ self.op_gen_cot.run(
+ self.storage.step(),
+ input_image_key=self.keys["img"],
+ output_answer_key=self.keys["raw_cot"],
+ question=self.keys["q"], # Template mapping
+ answer=self.keys["a"]
+ )
+
+ print(">>> [Pipeline] Step 2: Parsing Outputs...")
+ self.op_extract_cot.run(
+ self.storage.step(),
+ output_key=self.keys["clean_cot"],
+ text=self.keys["raw_cot"] # Param mapping
+ )
+ self.op_extract_kws.run(
+ self.storage.step(),
+ output_key=self.keys["keywords"],
+ text=self.keys["raw_cot"]
+ )
+
+ print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...")
+ self.op_bbox_gen.run(
+ self.storage.step(),
+ input_image_key=self.keys["img"],
+ input_kws_key=self.keys["keywords"],
+ output_key=self.keys["bbox_map"]
+ )
+
+ print(">>> [Pipeline] Step 4: Injecting GCoT...")
+ self.op_inject.run(
+ self.storage.step(),
+ output_key=self.keys["final"],
+ cot_text=self.keys["clean_cot"],
+ bbox_map=self.keys["bbox_map"]
+ )
+
+ print(f">>> [Pipeline] Done. Final GCoT saved to: {self.keys['final']}")
+
+
+if __name__ == "__main__":
+ pipe = ImageGCoTPipeline(
+ first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+ )
+ pipe.forward()
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
index 3bb6b039..7cfc00bc 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
@@ -29,38 +29,112 @@ permalink: /zh/mm_guide/image_scale_caption_pipeline/
## 2. 快速开始
-### 第一步:准备工作目录
+### 第一步:创建新的 DataFlow 工作文件夹
```bash
-mkdir run_scalecap
-cd run_scalecap
+mkdir run_dataflow
+cd run_dataflow
```
-### 第二步:准备脚本
+### 第二步:初始化 DataFlow-MM
-将下文“流水线示例”中的代码保存为 `scalecap_pipeline.py`。
+```bash
+dataflowmm init
-### 第三步:配置运行参数
+```
-确保 VLM 模型(如 Qwen2.5-VL)路径正确。
+这时你会看到:
```bash
-# 安装依赖
-pip install open-dataflow vllm
+gpu_pipelines/image_scale_caption_pipeline.py
+
+```
+
+### 第三步:下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步:配置参数
+
+```python
+if __name__ == "__main__":
+ pipe = ImageScaleCaptionPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ device="cuda",
+ first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path="../cache/image_scale_caption",
+ file_name_prefix="scalecap",
+ input_image_key="image",
+ output_key="final_caption",
+ vllm_tensor_parallel_size=1,
+ vllm_max_tokens=1024
+ )
+ pipe.forward()
```
-### 第四步:一键运行
+> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):**
+> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。
+>
+>
+
+### 第五步:一键运行
```bash
-python scalecap_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_jsonl "data/images.jsonl" \
- --output_key "final_caption"
+cd gpu_pipelines
+python image_scale_caption_pipeline.py
```
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> **问题 1:** 如果遇到类似如下的动态链接库冲突报错:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_scale_caption_pipeline.py
+>
+> ```
+>
+>
+> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 `"type"` 字段修改为 `"rope_type"` 即可。
+> **修改前:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+> **修改后:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+
---
## 3. 数据流与流水线逻辑
@@ -75,7 +149,7 @@ python scalecap_pipeline.py \
```json
{
- "image": "./images/complex_scene.jpg"
+ "image": "../example_data/capsbench_images/0.png"
}
```
@@ -132,12 +206,12 @@ python scalecap_pipeline.py \
```json
{
- "image": "./images/complex_scene.jpg",
+ "image": "../example_data/capsbench_images/0.png",
"init_caption": "A dog sitting on a bench.",
"golden_sentences": ["A dog is sitting on a wooden bench."],
- "q_list": ["Describe more details about the dog.", "Describe position of the bench."],
+ "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."],
"final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
- "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..."
+ "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
}
```
@@ -146,7 +220,7 @@ python scalecap_pipeline.py \
## 4. 流水线示例
-以下是完整的 `ImageScaleCaptionPipeline` 代码实现。
+以下是完整的 `ImageScaleCaptionPipeline` 代码实现 (GPU 版本)。
```python
import re
@@ -154,12 +228,79 @@ import argparse
from typing import Callable, Any, List
from dataflow.utils.storage import FileStorage
+
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
from dataflow.prompts.image import ImageScaleCaptionPrompt
+
from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+
+def split_sentences(text: str) -> List[str]:
+ """将文本拆分为句子列表"""
+ if not text or not isinstance(text, str):
+ return []
+ # 使用正则在句末标点 (. ! ? 。 ! ?) 之后的空白处分割
+ _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+")
+ parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+ return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+ """将列表连接为字符串"""
+ if isinstance(data, list):
+ # 过滤掉空值 (None、空字符串等),其余元素统一转为字符串
+ valid_items = [str(x) for x in data if x]
+ return separator.join(valid_items)
+ return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+ """
+ 解析 LLM 生成的 "Describe more details about..." 文本,
+ 并自动扩展 position 问题。
+ """
+ if not text or not isinstance(text, str):
+ return []
+
+ lines = [t.strip() for t in text.split("\n") if t.strip()]
+ obj_qs = []
+
+ for line in lines:
+ # 提取包含 "Describe more details about" 的行
+ if "Describe more details about" in line:
+ # 去除可能的序号 (如 "1. Describe...")
+ try:
+ start_idx = line.find("Describe")
+ clean = line[start_idx:]
+ # 去除句末多余内容,保留到第一个句号
+ if "." in clean:
+ clean = clean.split(".")[0] + "."
+ obj_qs.append(clean)
+ except Exception:
+ continue
+
+ # 去重并保持顺序
+ seen = set()
+ unique_obj_qs = []
+ for q in obj_qs:
+ if q not in seen:
+ unique_obj_qs.append(q)
+ seen.add(q)
+
+ # 截断
+ unique_obj_qs = unique_obj_qs[:max_q]
+
+ # 扩展 Position 问题
+ pos_qs = [
+ q.replace("Describe more details about", "Describe more details about the position of")
+ for q in unique_obj_qs
+ ]
+
+ # 返回合并后的列表 (对象问题 + 位置问题)
+ return unique_obj_qs + pos_qs
+
+
class ImageScaleCaptionPipeline:
def __init__(
self,
@@ -211,14 +352,19 @@ class ImageScaleCaptionPipeline:
# ================== Operator Initialization ==================
# --- Step A: Generate Init Caption ---
+ # 构造固定 Prompt 列
self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+
+ # 生成初稿 (使用通用 PromptedVQAGenerator)
self.gen_init_caption = PromptedVQAGenerator(
serving=self.serving,
system_prompt="You are a helpful assistant."
)
# --- Step B: Refine Golden Sentences ---
+ # 分句
self.refine_split = FunctionalRefiner(func=split_sentences)
+
# 视觉自检 (保留 Yes 的句子)
self.refine_golden = VisualGroundingRefiner(
serving=self.serving,
@@ -226,7 +372,10 @@ class ImageScaleCaptionPipeline:
)
# --- Step C: Generate Questions ---
+ # 列表转字符串
self.refine_join = FunctionalRefiner(func=join_list)
+
+ # 文本生成问题 (Text-to-Text)
tpl_q = NamedPlaceholderPromptTemplate(
template=self.prompts_db["LLM_PROMPT_1"],
join_list_with="\n"
@@ -235,16 +384,22 @@ class ImageScaleCaptionPipeline:
serving=self.serving,
prompt_template=tpl_q
)
+
+ # 解析问题文本为列表
self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
# --- Step D: Generate Answers ---
+ # 批量回答 (One Image -> Many Qs)
self.gen_answers = BatchVQAGenerator(serving=self.serving)
+
+ # 回答过滤
self.refine_answers = VisualGroundingRefiner(
serving=self.serving,
prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
)
# --- Step E: Integrate Final Caption ---
+ # 融合 (Text-to-Text)
tpl_final = NamedPlaceholderPromptTemplate(
template=self.prompts_db["LLM_PROMPT_4"],
join_list_with="\n"
@@ -256,6 +411,7 @@ class ImageScaleCaptionPipeline:
def forward(self):
print(">>> [Pipeline] Step 0: Preparing Prompts...")
+ # 构造 init_prompt 列
self.refine_const_prompt.run(
self.storage.step(),
output_key="init_prompt"
@@ -288,11 +444,14 @@ class ImageScaleCaptionPipeline:
output_key="golden_str",
data="golden_sentences"
)
+
+ # template: "{sentence}" -> map to col "golden_str"
self.gen_questions_text.run(
self.storage.step(),
output_answer_key="raw_q_text",
sentence="golden_str"
)
+
self.refine_parse_qs.run(
self.storage.step(),
output_key="q_list",
@@ -306,6 +465,7 @@ class ImageScaleCaptionPipeline:
input_image_key=self.input_image_key,
output_key="raw_answers"
)
+
self.refine_answers.run(
self.storage.step(),
input_list_key="raw_answers",
@@ -319,50 +479,37 @@ class ImageScaleCaptionPipeline:
output_key="details_str",
data="final_details"
)
+
+ # template keys: context, object_info, position_info
self.gen_final_caption.run(
self.storage.step(),
output_answer_key=self.output_key,
context="golden_str",
object_info="details_str",
- position_info="details_str"
+ position_info="details_str" # 简化:同时作为 object 和 position 信息
)
print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline")
-
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
- parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
- parser.add_argument("--download_dir", default="./ckpt/models")
- parser.add_argument("--device", default="cuda")
-
- parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
- parser.add_argument("--cache_path", default="./cache_scalecap_results")
- parser.add_argument("--file_name_prefix", default="scalecap")
- parser.add_argument("--input_image_key", default="image")
- parser.add_argument("--output_key", default="final_caption")
-
- parser.add_argument("--tp", type=int, default=1)
- parser.add_argument("--max_tokens", type=int, default=1024)
-
- args = parser.parse_args()
-
pipe = ImageScaleCaptionPipeline(
- model_path=args.model_path,
- hf_cache_dir=args.hf_cache_dir,
- download_dir=args.download_dir,
- device=args.device,
- first_entry_file=args.input_jsonl,
- cache_path=args.cache_path,
- file_name_prefix=args.file_name_prefix,
- input_image_key=args.input_image_key,
- output_key=args.output_key,
- vllm_tensor_parallel_size=args.tp,
- vllm_max_tokens=args.max_tokens
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ device="cuda",
+
+ first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path="../cache/image_scale_caption",
+ file_name_prefix="scalecap",
+
+ input_image_key="image",
+ output_key="final_caption",
+
+ vllm_tensor_parallel_size=1,
+ vllm_max_tokens=1024
)
pipe.forward()
-```
+```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
new file mode 100644
index 00000000..e504e294
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
@@ -0,0 +1,477 @@
+---
+title: ScaleCap 高密度描述生成流水线(API版)
+createTime: 2026/01/11 22:08:57
+icon: mdi:image-text
+permalink: /zh/mm_guide/image_scale_caption_pipeline_api/
+---
+
+## 1. 概述
+
+**ScaleCap 高密度描述生成流水线 (Image Scale Caption Pipeline)** 是一种基于**“生成-验证-扩展-融合”**范式的先进图像描述生成方案。该流水线旨在生成**信息密度极高**且**幻觉率极低**的图像描述,特别适用于需要深度理解图像细节的场景。
+
+该方法的理论基础源自论文 *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*。它通过多轮对话和视觉自检(Visual Grounding),逐步挖掘图像中的对象与位置细节,并过滤掉模型产生的幻觉。
+
+我们支持以下应用场景:
+
+* **高质量多模态数据集构建**:生成比普通 Caption 更详尽、准确的训练数据。
+* **细粒度图像检索**:提供包含丰富细节的索引文本。
+* **盲人辅助/图像无障碍**:生成“所见即所得”的详细解说。
+
+流水线的主要流程包括:
+
+1. **初稿生成**:VLM 生成基础描述。
+2. **视觉自检 (Debiasing)**:将描述拆分为句子,逐句验证其是否被图像证据支持(Visual Grounding)。
+3. **细节追问**:针对通过验证的“黄金句子”,生成关于对象属性和位置的追问。
+4. **回答与再验证**:VLM 回答追问,并再次进行视觉自检以过滤错误细节。
+5. **最终融合**:将所有经过验证的信息融合成一段连贯的长描述。
+
+---
+
+## 2. 快速开始
+
+### 第一步:创建新的 DataFlow 工作文件夹
+
+```bash
+mkdir run_dataflow
+cd run_dataflow
+
+```
+
+### 第二步:初始化 DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+这时你会看到:
+
+```bash
+api_pipelines/image_scale_caption_api_pipeline.py
+
+```
+
+### 第三步:下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步:配置 API Key
+
+在 `api_pipelines/image_scale_caption_api_pipeline.py` 中设置 API Key 环境变量:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### 第五步:配置参数
+
+在 `api_pipelines/image_scale_caption_api_pipeline.py` 中配置 API 服务和输入数据路径:
+
+```python
+ def __init__(
+ self,
+ # Storage params
+ first_entry_file: str = "../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path: str = "../cache/image_scale_caption",
+ file_name_prefix: str = "scalecap",
+ cache_type: str = "jsonl",
+ # Keys
+ input_image_key: str = "image",
+ output_key: str = "final_caption",
+ ):
+
+```
+
+```python
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+```
+
+### 第六步:一键运行
+
+```bash
+cd api_pipelines
+python image_scale_caption_api_pipeline.py
+
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+输入数据非常简单,仅需图像路径:
+
+* **image**:图像文件路径。
+
+**输入数据示例**:
+
+```json
+{
+ "image": "../example_data/capsbench_images/0.png"
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+该流水线是多个原子算子的复杂编排:
+
+#### A. **初稿生成 (PromptedVQAGenerator)**
+
+* **功能**:使用基础 Prompt 生成图像的初步描述 (`init_caption`)。
+
+#### B. **视觉自检 (VisualGroundingRefiner)**
+
+* **功能**:这是 ScaleCap 的核心防幻觉机制。
+* **逻辑**:
+1. 使用 `split_sentences` 将初稿拆分为单句。
+2. 调用 VLM 询问:“Given the image, is the description '{text}' directly supported by visual evidence?”。
+3. 仅保留回答为 "Yes" 的句子,形成 **"Golden Sentences"**。
+
+
+
+#### C. **问题生成与解析 (PromptTemplatedQAGenerator)**
+
+* **功能**:基于 Golden Sentences,利用 LLM 能力生成针对性的追问。
+* **逻辑**:模型生成如 "Describe more details about the [Object]" 的文本,并通过 `parse_questions_logic` 自动扩展为**对象细节**和**位置关系**两类问题。
+
+#### D. **批量回答与二次过滤 (BatchVQAGenerator & Refiner)**
+
+* **功能**:挖掘图像深层信息。
+* **逻辑**:
+1. 使用 `BatchVQAGenerator` 一次性让 VLM 回答上述生成的所有问题。
+2. 再次使用 `VisualGroundingRefiner` 检查这些新生成的细节是否准确。
+3. 保留可靠的细节信息 (`final_details`)。
+
+
+
+#### E. **最终融合 (PromptTemplatedQAGenerator)**
+
+* **功能**:将“黄金句子”和“验证后的细节”重写为一段流畅的文本。
+* **输出**:`final_caption`。
+
+### 3. **输出数据**
+
+输出数据记录了流水线的全过程,方便调试和分析:
+
+* **init_caption**:原始生成的初稿。
+* **golden_sentences**:通过第一次自检的句子列表。
+* **q_list**:生成的追问列表。
+* **final_details**:通过第二次自检的细节回答。
+* **final_caption**:最终的高密度描述。
+
+**输出数据示例**:
+
+```json
+{
+ "image": "../example_data/capsbench_images/0.png",
+ "init_caption": "A dog sitting on a bench.",
+ "golden_sentences": ["A dog is sitting on a wooden bench."],
+ "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."],
+ "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
+ "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
+}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `ImageScaleCaptionPipeline` 代码实现 (API 版本)。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+
+import re
+import argparse
+from typing import Callable, Any, List
+
+from dataflow.utils.storage import FileStorage
+
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.prompts.image import ImageScaleCaptionPrompt
+
+from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
+from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+def split_sentences(text: str) -> List[str]:
+ """将文本拆分为句子列表"""
+ if not text or not isinstance(text, str):
+ return []
+ # 使用正则在句末标点 (. ! ? 。 ! ?) 之后的空白处分割
+ _SENT_SPLIT = re.compile(r"(?<=[.!?。!?])\s+")
+ parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+ return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+ """将列表连接为字符串"""
+ if isinstance(data, list):
+ # 过滤掉空值 (None、空字符串等),其余元素统一转为字符串
+ valid_items = [str(x) for x in data if x]
+ return separator.join(valid_items)
+ return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+ """
+ 解析 LLM 生成的 "Describe more details about..." 文本,
+ 并自动扩展 position 问题。
+ """
+ if not text or not isinstance(text, str):
+ return []
+
+ lines = [t.strip() for t in text.split("\n") if t.strip()]
+ obj_qs = []
+
+ for line in lines:
+ # 提取包含 "Describe more details about" 的行
+ if "Describe more details about" in line:
+ # 去除可能的序号 (如 "1. Describe...")
+ try:
+ start_idx = line.find("Describe")
+ clean = line[start_idx:]
+ # 去除句末多余内容,保留到第一个句号
+ if "." in clean:
+ clean = clean.split(".")[0] + "."
+ obj_qs.append(clean)
+ except Exception:
+ continue
+
+ # 去重并保持顺序
+ seen = set()
+ unique_obj_qs = []
+ for q in obj_qs:
+ if q not in seen:
+ unique_obj_qs.append(q)
+ seen.add(q)
+
+ # 截断
+ unique_obj_qs = unique_obj_qs[:max_q]
+
+ # 扩展 Position 问题
+ pos_qs = [
+ q.replace("Describe more details about", "Describe more details about the position of")
+ for q in unique_obj_qs
+ ]
+
+ # 返回合并后的列表 (对象问题 + 位置问题)
+ return unique_obj_qs + pos_qs
+
+
+class ImageScaleCaptionPipeline:
+ def __init__(
+ self,
+ # Storage params
+ first_entry_file: str = "images.jsonl",
+ cache_path: str = "./cache_scalecap",
+ file_name_prefix: str = "scalecap",
+ cache_type: str = "jsonl",
+ # Keys
+ input_image_key: str = "image",
+ output_key: str = "final_caption",
+ # VLLM Config (本 API 版本的类中未使用这些参数)
+ vllm_tensor_parallel_size: int = 1,
+ vllm_temperature: float = 0.7,
+ vllm_top_p: float = 0.9,
+ vllm_max_tokens: int = 512,
+ ):
+ # 1. Storage
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type=cache_type,
+ )
+
+ # 2. Serving
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+ # 3. Prompts
+ self.prompts_db = ImageScaleCaptionPrompt().build_prompt()
+
+ # 4. Keys
+ self.input_image_key = input_image_key
+ self.output_key = output_key
+
+ # ================== Operator Initialization ==================
+
+ # --- Step A: Generate Init Caption ---
+ # 构造固定 Prompt 列
+ self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+
+ # 生成初稿 (使用通用 PromptedVQAGenerator)
+ self.gen_init_caption = PromptedVQAGenerator(
+ serving=self.vlm_serving,
+ system_prompt="You are a helpful assistant."
+ )
+
+ # --- Step B: Refine Golden Sentences ---
+ # 分句
+ self.refine_split = FunctionalRefiner(func=split_sentences)
+
+ # 视觉自检 (保留 Yes 的句子)
+ self.refine_golden = VisualGroundingRefiner(
+ serving=self.vlm_serving,
+ prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no."
+ )
+
+ # --- Step C: Generate Questions ---
+ # 列表转字符串
+ self.refine_join = FunctionalRefiner(func=join_list)
+
+ # 文本生成问题 (Text-to-Text)
+ tpl_q = NamedPlaceholderPromptTemplate(
+ template=self.prompts_db["LLM_PROMPT_1"],
+ join_list_with="\n"
+ )
+ self.gen_questions_text = PromptTemplatedQAGenerator(
+ serving=self.vlm_serving,
+ prompt_template=tpl_q
+ )
+
+ # 解析问题文本为列表
+ self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
+
+ # --- Step D: Generate Answers ---
+ # 批量回答 (One Image -> Many Qs)
+ self.gen_answers = BatchVQAGenerator(serving=self.vlm_serving)
+
+ # 回答过滤
+ self.refine_answers = VisualGroundingRefiner(
+ serving=self.vlm_serving,
+ prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
+ )
+
+ # --- Step E: Integrate Final Caption ---
+ # 融合 (Text-to-Text)
+ tpl_final = NamedPlaceholderPromptTemplate(
+ template=self.prompts_db["LLM_PROMPT_4"],
+ join_list_with="\n"
+ )
+ self.gen_final_caption = PromptTemplatedQAGenerator(
+ serving=self.vlm_serving,
+ prompt_template=tpl_final
+ )
+
+ def forward(self):
+ print(">>> [Pipeline] Step 0: Preparing Prompts...")
+ # 构造 init_prompt 列
+ self.refine_const_prompt.run(
+ self.storage.step(),
+ output_key="init_prompt"
+ )
+
+ print(">>> [Pipeline] Step 1: Generating Initial Caption...")
+ self.gen_init_caption.run(
+ self.storage.step(),
+ input_prompt_key="init_prompt",
+ input_image_key=self.input_image_key,
+ output_answer_key="init_caption"
+ )
+
+ print(">>> [Pipeline] Step 2: Refining Golden Sentences...")
+ self.refine_split.run(
+ self.storage.step(),
+ output_key="sentences",
+ text="init_caption"
+ )
+ self.refine_golden.run(
+ self.storage.step(),
+ input_list_key="sentences",
+ input_image_key=self.input_image_key,
+ output_key="golden_sentences"
+ )
+
+ print(">>> [Pipeline] Step 3: Generating Details Questions...")
+ self.refine_join.run(
+ self.storage.step(),
+ output_key="golden_str",
+ data="golden_sentences"
+ )
+
+ # template: "{sentence}" -> map to col "golden_str"
+ self.gen_questions_text.run(
+ self.storage.step(),
+ output_answer_key="raw_q_text",
+ sentence="golden_str"
+ )
+
+ self.refine_parse_qs.run(
+ self.storage.step(),
+ output_key="q_list",
+ text="raw_q_text"
+ )
+
+ print(">>> [Pipeline] Step 4: Generating & Filtering Answers...")
+ self.gen_answers.run(
+ self.storage.step(),
+ input_prompts_key="q_list",
+ input_image_key=self.input_image_key,
+ output_key="raw_answers"
+ )
+
+ self.refine_answers.run(
+ self.storage.step(),
+ input_list_key="raw_answers",
+ input_image_key=self.input_image_key,
+ output_key="final_details"
+ )
+
+ print(">>> [Pipeline] Step 5: Integrating Final Caption...")
+ self.refine_join.run(
+ self.storage.step(),
+ output_key="details_str",
+ data="final_details"
+ )
+
+ # template keys: context, object_info, position_info
+ self.gen_final_caption.run(
+ self.storage.step(),
+ output_answer_key=self.output_key,
+ context="golden_str",
+ object_info="details_str",
+ position_info="details_str" # 简化:同时作为 object 和 position 信息
+ )
+
+ print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
+
+
+if __name__ == "__main__":
+
+ pipe = ImageScaleCaptionPipeline(
+ first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+ cache_path="../cache/image_scale_caption",
+ file_name_prefix="scalecap",
+ input_image_key="image",
+ output_key="final_caption",
+ vllm_tensor_parallel_size=1,
+ vllm_max_tokens=1024
+ )
+
+ pipe.forward()
+
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
index a586da66..32c17ddd 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
@@ -18,8 +18,6 @@ permalink: /zh/mm_guide/image_visual_only_mcq_pipeline/
* **旋转测试**:多次打乱选项顺序,消除位置偏见。
* **双重过滤**:要求“有图答对率”高,“无图答对率”低。
-
-
---
## 2. 快速开始
@@ -32,32 +30,102 @@ cd run_vis_mcq
```
-### 第二步:准备脚本
+### 第二步:初始化 DataFlow-MM
-将下文“流水线示例”中的代码保存为 `visual_mcq_pipeline.py`。
+```bash
+dataflowmm init
+
+```
-### 第三步:配置运行参数
+这时你会看到:
+
+```bash
+gpu_pipelines/image_visual_only_mcq_pipeline.py
+
+```
-该流水线通过命令行参数控制过滤阈值。例如,要求有图 100% 正确,无图正确率低于 25%:
+### 第三步:下载示例数据
```bash
-# 安装依赖
-pip install open-dataflow vllm
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步:配置参数
+
+配置模型路径和过滤阈值(例如,要求有图 100% 正确,无图正确率不超过 25%):
+
+```python
+if __name__ == "__main__":
+ pipe = VisualOnlyMCQPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
+ )
+ pipe.forward()
```
-### 第四步:一键运行
+> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):**
+> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。
+>
+>
+
+### 第五步:一键运行
```bash
-python visual_mcq_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_file "data/captions.jsonl" \
- --rotate_num 4 \
- --pass_vis 1.0 \
- --pass_txt 0.25
+cd gpu_pipelines
+python image_visual_only_mcq_pipeline.py
```
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> **问题 1:** 如果遇到类似如下的动态链接库冲突报错:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_visual_only_mcq_pipeline.py
+>
+> ```
+>
+>
+> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 `"type"` 字段修改为 `"rope_type"` 即可。
+> **修改前:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+> **修改后:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+
---
## 3. 数据流与流水线逻辑
@@ -98,13 +166,12 @@ python visual_mcq_pipeline.py \
1. **选项旋转**:随机打乱选项顺序(例如将答案从 A 换到 C),防止模型通过“总是选 A”来作弊。
2. **有图推理 (Visual Pass)**:输入图像 + 题目。记录模型答对的比例。
-3. **无图推理 (Textual Pass)**:仅输入题目(无图像)。记录模型盲猜对的比例。
+3. **无图推理 (Textual Pass)**:仅输入题目(无图像进行盲测)。记录模型盲猜对的比例。
4. **过滤判据**:
+
* 保留题目,当且仅当:`Visual_Acc >= pass_visual_min` **且** `Textual_Acc <= pass_textual_max`。
* *示例*:如果一道题不看图也能答对(无图准确率高),说明它考的是常识而非视觉,**剔除**。
-
-
### 3. **输出数据**
输出数据 (`final_mcqs`) 仅包含通过了严苛验证的题目。这些题目具有极高的质量和视觉相关性。
@@ -132,12 +199,10 @@ python visual_mcq_pipeline.py \
## 4. 流水线示例
-以下是完整的 `VisualOnlyMCQPipeline` 代码实现。
+以下是完整的 `VisualOnlyMCQPipeline` 代码实现 (GPU 版本)。
```python
import argparse
-import re
-from typing import List, Dict, Any
from dataflow.utils.storage import FileStorage
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -145,13 +210,14 @@ from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDepend
from dataflow.operators.core_text import FunctionalRefiner
from dataflow.prompts.image import ImageCaprlPrompt
-# 正则解析逻辑
+import re
+from typing import List, Dict, Any
+
_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
- """将 VLM 生成的原始文本解析为结构化字典列表"""
if not mcq_text or not isinstance(mcq_text, str): return []
indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
@@ -213,7 +279,9 @@ class VisualOnlyMCQPipeline:
model_path: str,
*,
first_entry_file: str,
- cache_path: str = "./cache_mcq",
+ hf_cache_dir: str | None = None,
+ download_dir: str = "./ckpt/models",
+ cache_path: str = "../cache/cache_mcq",
file_name_prefix: str = "vis_mcq",
# Config
rotate_num: int = 4,
@@ -227,7 +295,6 @@ class VisualOnlyMCQPipeline:
device: str = "cuda",
vllm_max_tokens: int = 2048
):
- # 1. 初始化存储
self.storage = FileStorage(
first_entry_file_name=first_entry_file,
cache_path=cache_path,
@@ -235,15 +302,16 @@ class VisualOnlyMCQPipeline:
cache_type="jsonl"
)
- # 2. 初始化 VLM 服务
self.serving = LocalModelVLMServing_vllm(
+ hf_cache_dir=hf_cache_dir,
+ hf_local_dir=download_dir,
hf_model_name_or_path=model_path,
vllm_tensor_parallel_size=1,
- vllm_temperature=0.1, # 低温度以保证格式稳定
+ vllm_temperature=0.1,
vllm_max_tokens=vllm_max_tokens
)
- # Keys 配置
+ # Keys
self.keys = {
"img": input_image_key,
"raw_text": "raw_mcq_text",
@@ -251,23 +319,24 @@ class VisualOnlyMCQPipeline:
"final": output_key
}
- # 加载 Prompt 库
+ # --- Prompts ---
self.prompts_db = ImageCaprlPrompt().build_prompt()
- # ================== 算子初始化 ==================
+ # ================== Operators ==================
- # 算子 1: 生成原始 MCQ 文本
+ # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+ # 直接使用 prompt 类中的字符串
self.op_gen_raw = FixPromptedVQAGenerator(
serving=self.serving,
system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
)
- # 算子 2: 解析文本为结构化数据
+ # 2. Parse MCQs (Refine)
self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
- # 算子 3: 视觉依赖性验证 (核心过滤)
- # 包含旋转 (Rotation) 和 无图检测 (Text-only check)
+ # 3. Verify Visual Dependency (Refine)
+ # 传入 prompt 模板
self.op_verify = VisualDependencyRefiner(
serving=self.serving,
instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
@@ -304,23 +373,15 @@ class VisualOnlyMCQPipeline:
print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
- parser.add_argument("--rotate_num", type=int, default=4)
- parser.add_argument("--pass_vis", type=float, default=1.0)
- parser.add_argument("--pass_txt", type=float, default=0.25)
-
- args = parser.parse_args()
-
pipe = VisualOnlyMCQPipeline(
- model_path=args.model_path,
- first_entry_file=args.input_file,
- rotate_num=args.rotate_num,
- pass_visual_min=args.pass_vis,
- pass_textual_max=args.pass_txt
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
)
pipe.forward()
-```
-
+```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
new file mode 100644
index 00000000..55f74fcc
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
@@ -0,0 +1,339 @@
+---
+title: 视觉依赖 MCQ 生成流水线(API版)
+createTime: 2026/01/11 22:13:45
+icon: mdi:image-text
+permalink: /zh/mm_guide/image_visual_only_mcq_pipeline_api/
+---
+## 1. 概述
+
+**视觉依赖 MCQ 生成流水线 (Visual-Only MCQ Pipeline)** 是 CapRL (Caption Reinforcement Learning) 框架中的核心组件。它的目标是生成一组高质量的多项选择题 (MCQ),且这些题目必须满足**强视觉依赖性**:即模型必须“看”图才能答对,仅凭文本(猜题或常识)无法作答。
+
+该流水线通过**生成-解析-验证**三步法,利用**选项旋转 (Rotation)** 和**无图盲测 (Blind Test)** 机制,严格过滤掉模型幻觉或过于简单的题目。生成的题目可作为强化学习的奖励信号(Reward Model)。
+
+主要流程包括:
+
+1. **MCQ 生成**:VLM 基于图像生成原始的问答对文本。
+2. **结构化解析**:利用正则逻辑将文本解析为标准的题目与选项结构。
+3. **视觉依赖验证**:
+* **旋转测试**:多次打乱选项顺序,消除位置偏见。
+* **双重过滤**:要求“有图答对率”高,“无图答对率”低。
+
+---
+
+## 2. 快速开始
+
+### 第一步:创建工作目录
+
+```bash
+mkdir run_vis_mcq
+cd run_vis_mcq
+
+```
+
+### 第二步:初始化 DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+这时你会看到:
+
+```bash
+api_pipelines/image_visual_only_mcq_api_pipeline.py
+
+```
+
+### 第三步:下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步:配置 API Key
+
+在 `api_pipelines/image_visual_only_mcq_api_pipeline.py` 中设置 API Key 环境变量:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### 第五步:配置参数
+
+在 `api_pipelines/image_visual_only_mcq_api_pipeline.py` 中配置过滤阈值,例如,要求有图 100% 正确,无图正确率低于 25%:
+
+```python
+ pipe = VisualOnlyMCQPipeline(
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
+ )
+
+```
+
+### 第六步:一键运行
+
+```bash
+cd api_pipelines
+python image_visual_only_mcq_api_pipeline.py
+
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+输入仅需包含图像路径:
+
+* **image**:图像文件路径。
+
+**输入数据示例**:
+
+```json
+{
+ "image": "./images/sample_01.jpg"
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+该流水线由三个关键算子串联而成:
+
+#### A. **FixPromptedVQAGenerator(原始生成)**
+
+* **功能**:使用 CapRL 预设的 Prompt 模板(`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`),让 VLM 一次性生成 5 道 MCQ。
+* **输出**:包含多个 `#### Question` 和选项的非结构化文本块。
+
+#### B. **FunctionalRefiner(正则解析)**
+
+* **逻辑函数**:`parse_mcq_text_logic`
+* **功能**:利用正则表达式从原始文本中提取题目、选项(A-F)和正确答案。
+* **输出**:结构化的 MCQ 列表 (`parsed_mcq_list`)。
+
+#### C. **VisualDependencyRefiner(依赖性验证)**
+
+这是本流水线的核心过滤器。它对每道题进行 N 次推理(N = `rotate_num`):
+
+1. **选项旋转**:随机打乱选项顺序(例如将答案从 A 换到 C),防止模型通过“总是选 A”来作弊。
+2. **有图推理 (Visual Pass)**:输入图像 + 题目。记录模型答对的比例。
+3. **无图推理 (Textual Pass)**:仅输入题目(无图像进行盲测)。记录模型盲猜对的比例。
+4. **过滤判据**:
+
+* 保留题目,当且仅当:`Visual_Acc >= pass_visual_min` **且** `Textual_Acc <= pass_textual_max`。
+* *示例*:如果一道题不看图也能答对(无图准确率高),说明它考的是常识而非视觉,**剔除**。
+
+### 3. **输出数据**
+
+输出数据 (`final_mcqs`) 仅包含通过了严苛验证的题目。这些题目具有极高的质量和视觉相关性。
+
+**输出数据示例**:
+
+```json
+{
+ "image": "./images/sample_01.jpg",
+ "final_mcqs": [
+ {
+ "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
+ "answer": "A",
+ "stats": {
+ "visual_acc": 1.0, # 4次全对
+ "text_acc": 0.0 # 盲猜全错
+ }
+ }
+ ]
+}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `VisualOnlyMCQPipeline` 代码实现 (API 版本)。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+import argparse
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.image import ImageCaprlPrompt
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+import re
+from typing import List, Dict, Any
+
+_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
+_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
+_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
+
+def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
+ if not mcq_text or not isinstance(mcq_text, str): return []
+
+ indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
+ if not indices: return []
+ indices.append(len(mcq_text))
+ blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)]
+
+ parsed = []
+ for block in blocks:
+ lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()]
+ q_title_m = _Q_BLOCK_SPLIT.search(block)
+ if not q_title_m: continue
+
+ q_title = q_title_m.group(1).strip()
+ options = {}
+ ans_letter, ans_text = None, None
+
+ for ln in lines:
+ m_opt = _OPT_LINE_RE.match(ln)
+ if m_opt:
+ options[m_opt.group(1)] = m_opt.group(2).strip()
+ continue
+ m_ans = _ANS_LINE_RE.match(ln)
+ if m_ans:
+ ans_letter = m_ans.group(1).upper()
+ ans_text = m_ans.group(2).strip()
+ break
+
+ if options and ans_letter and ans_letter in options:
+ q_lines = [q_title]
+ for lbl in ["A", "B", "C", "D", "E", "F"]:
+ if lbl in options:
+ q_lines.append(f" - {lbl}) {options[lbl]}")
+
+ parsed.append({
+ "question": "\n".join(q_lines),
+ "question_title": q_title,
+ "options": options,
+ "answer": ans_letter,
+ "answer_text": ans_text
+ })
+
+ if expected > 0:
+ parsed = parsed[:expected]
+
+ uniq = []
+ seen = set()
+ for it in parsed:
+ key = (it["question_title"], it["answer"])
+ if key not in seen:
+ seen.add(key)
+ uniq.append(it)
+ return uniq
+
+
+class VisualOnlyMCQPipeline:
+ def __init__(
+ self,
+ *,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_mcq",
+ file_name_prefix: str = "vis_mcq",
+ # Config
+ rotate_num: int = 4,
+ pass_visual_min: float = 1.0,
+ pass_textual_max: float = 0.25,
+ add_none_above: bool = True,
+ # Keys
+ input_image_key: str = "image",
+ output_key: str = "final_mcqs",
+ # VLLM
+ vllm_max_tokens: int = 2048
+ ):
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type="jsonl"
+ )
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+
+ # Keys
+ self.keys = {
+ "img": input_image_key,
+ "raw_text": "raw_mcq_text",
+ "parsed_list": "parsed_mcq_list",
+ "final": output_key
+ }
+
+ # --- Prompts ---
+ self.prompts_db = ImageCaprlPrompt().build_prompt()
+
+ # ================== Operators ==================
+
+ # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+ # 直接使用 prompt 类中的字符串
+ self.op_gen_raw = FixPromptedVQAGenerator(
+ serving=self.vlm_serving,
+ system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
+ user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
+ )
+
+ # 2. Parse MCQs (Refine)
+ self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
+
+ # 3. Verify Visual Dependency (Refine)
+ # 传入 prompt 模板
+ self.op_verify = VisualDependencyRefiner(
+ serving=self.vlm_serving,
+ instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
+ rotate_num=rotate_num,
+ pass_visual_min=pass_visual_min,
+ pass_textual_max=pass_textual_max,
+ add_none_above_visual=add_none_above
+ )
+
+ def forward(self):
+ print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...")
+ self.op_gen_raw.run(
+ self.storage.step(),
+ input_image_key=self.keys["img"],
+ output_answer_key=self.keys["raw_text"]
+ )
+
+ print(">>> [Pipeline] Step 2: Parsing MCQs...")
+ self.op_parse.run(
+ self.storage.step(),
+ output_key=self.keys["parsed_list"],
+ mcq_text=self.keys["raw_text"],
+ expected=5
+ )
+
+ print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...")
+ self.op_verify.run(
+ self.storage.step(),
+ input_list_key=self.keys["parsed_list"],
+ input_image_key=self.keys["img"],
+ output_key=self.keys["final"]
+ )
+
+ print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
+
+if __name__ == "__main__":
+ pipe = VisualOnlyMCQPipeline(
+ first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+ rotate_num=4,
+ pass_visual_min=1.0,
+ pass_textual_max=0.25
+ )
+ pipe.forward()
+
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
index ee038567..fa803660 100644
--- a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
@@ -35,30 +35,100 @@ cd run_mcts_reasoning
```
-### 第二步:准备脚本
+### 第二步:初始化 DataFlow-MM
-将下文“流水线示例”中的代码保存为 `vision_mcts_pipeline.py`。
+```bash
+dataflowmm init
-### 第三步:配置运行参数
+```
-确保输入文件(jsonl)包含 `tree` 字段(用于提取)或仅包含 `question/image`(用于生成)。
+这时你会看到:
```bash
-# 安装依赖
-pip install open-dataflow vllm
+gpu_pipelines/vision_mcts_pipeline.py
```
-### 第四步:一键运行
+### 第三步:下载示例数据
```bash
-python vision_mcts_pipeline.py \
- --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
- --input_file "data/mcts_trajectories.jsonl" \
- --prompt_type "spatial"
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
```
+### 第四步:配置参数
+
+确保输入文件(jsonl)包含 `tree` 字段(用于提取)或仅包含 `question/image`(用于生成):
+
+```python
+if __name__ == "__main__":
+ pipe = VisionMCTSReasoningPipeline(
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+ )
+ pipe.forward()
+
+```
+
+> **⚠️ 模型路径配置的重要提示(以 `Qwen2.5-VL-3B-Instruct` 为例):**
+> * **如果您已经下载好了模型文件**:请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`,否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型(需要自动下载)**:请一定要指定 `download_dir` 参数,并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**(正如默认参数所示),否则下载完成后同样会导致框架无法识别模型。
+>
+>
+
+### 第五步:一键运行
+
+```bash
+cd gpu_pipelines
+python vision_mcts_pipeline.py
+
+```
+
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> **问题 1:** 如果遇到类似如下的动态链接库冲突报错:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **解决方法:** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python vision_mcts_pipeline.py
+>
+> ```
+>
+>
+> **问题 2:** 如果您使用的是 **Qwen 系列模型**,并且遇到以下报错:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **解决方法:** 打开模型文件夹下的 `config.json` 文件,找到 `rope_scaling` 配置块,将 `"type"` 字段修改为 `"rope_type"` 即可。
+> **修改前:**
+> ```json
+> "rope_scaling": {
+> "type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+> **修改后:**
+> ```json
+> "rope_scaling": {
+> "rope_type": "mrope",
+> "mrope_section": [
+> 16,
+> 24,
+> 24
+> ]
+> }
+>
+> ```
+>
+>
+
---
## 3. 数据流与流水线逻辑
@@ -125,10 +195,9 @@ python vision_mcts_pipeline.py \
## 4. 流水线示例
-以下是完整的 `VisionMCTSReasoningPipeline` 代码实现。
+以下是完整的 `VisionMCTSReasoningPipeline` 代码实现 (GPU 版本)。
```python
-import argparse
from dataflow.utils.storage import FileStorage
from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -142,8 +211,10 @@ class VisionMCTSReasoningPipeline:
model_path: str,
*,
# Storage
+ hf_cache_dir: str | None = None,
+ download_dir: str = "./ckpt/models",
first_entry_file: str,
- cache_path: str = "./cache_mcts",
+ cache_path: str = "../cache/cache_mcts",
file_name_prefix: str = "mcts_reason",
# Config
prompt_type: str = "spatial",
@@ -156,7 +227,6 @@ class VisionMCTSReasoningPipeline:
# VLLM
vllm_max_tokens: int = 1024
):
- # 1. 存储初始化
self.storage = FileStorage(
first_entry_file_name=first_entry_file,
cache_path=cache_path,
@@ -164,8 +234,9 @@ class VisionMCTSReasoningPipeline:
cache_type="jsonl"
)
- # 2. 模型服务
self.serving = LocalModelVLMServing_vllm(
+ hf_cache_dir=hf_cache_dir,
+ hf_local_dir=download_dir,
hf_model_name_or_path=model_path,
vllm_tensor_parallel_size=1,
vllm_temperature=0.7,
@@ -176,20 +247,18 @@ class VisionMCTSReasoningPipeline:
"q": input_question_key,
"img": input_image_key,
"tree": input_tree_key,
- "mcts_chains": "mcts_extracted_chains", # 中间结果
+ "mcts_chains": "mcts_extracted_chains",
"final": output_key
}
# ================== Operators ==================
- # 算子 1: MCTS Tree -> Chains (提取器)
- # 负责将树结构扁平化为线性链
+ # 1. Refiner: MCTS -> Chains
self.op_mcts_refine = MCTSTreeRefiner(
max_chains_per_sample=max_samples_per_file
)
- # 算子 2: VLM -> Chains (生成器/Fallback)
- # 如果 MCTS 提取失败,则使用 VLM 生成;如果成功,则跳过
+ # 2. Generator: VLM -> Chains (Fallback)
self.op_vlm_gen = VisualReasoningGenerator(
serving=self.serving,
prompt_type=prompt_type
@@ -204,7 +273,8 @@ class VisionMCTSReasoningPipeline:
)
print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
- # 注意:input_existing_chains_key 实现了混合/回退逻辑
+ # 将 mcts_chains 作为 input_existing_chains_key 传入
+ # 如果 MCTS 解析成功,则复用;否则调用 VLM 生成
self.op_vlm_gen.run(
self.storage.step(),
input_question_key=self.keys["q"],
@@ -215,16 +285,12 @@ class VisionMCTSReasoningPipeline:
if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl")
- parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
- parser.add_argument("--prompt_type", default="spatial")
- args = parser.parse_args()
-
pipe = VisionMCTSReasoningPipeline(
- model_path=args.model_path,
- first_entry_file=args.input_file,
- prompt_type=args.prompt_type
+ model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ hf_cache_dir="~/.cache/huggingface",
+ download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
)
pipe.forward()
diff --git a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
new file mode 100644
index 00000000..1c391d8b
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
@@ -0,0 +1,248 @@
+---
+title: 视觉 MCTS 推理链生成流水线(API版)
+icon: mdi:image-text
+createTime: 2026/01/11 21:59:59
+permalink: /zh/mm_guide/vision_mct_reasoning_pipeline_api/
+---
+
+## 1. 概述
+
+**视觉 MCTS 推理链生成流水线 (Vision MCTS Reasoning Pipeline)** 旨在为多模态大模型构建高质量的**过程监督数据(Process Supervision Data)**。该流水线能够处理两种数据来源:已有的蒙特卡洛树搜索(MCTS)轨迹数据,或直接利用 VLM 生成新的推理链。
+
+该流水线是 **Grounded-RL** 和 **SFT 数据构建**的核心工具,它将复杂的树状搜索过程“线性化”为模型可学习的 `<think>...</think><answer>...</answer>` 格式。
+
+我们支持以下应用场景:
+
+* **从 MCTS 树提取数据**:将搜索树中高价值的路径(Rollouts)转化为线性训练数据。
+* **混合数据构建**:对于没有搜索树的样本,自动回退到使用 VLM 进行 CoT 生成。
+* **空间推理增强**:支持生成包含显式坐标(Bounding Box)的空间推理链。
+
+流水线的主要流程包括:
+
+1. **MCTS 树解析**:解析输入数据中的搜索树结构,提取成功的推理路径。
+2. **视觉推理生成 (Fallback)**:对于缺失树结构或解析失败的样本,利用 VLM 重新生成推理链。
+3. **数据标准化**:输出统一格式的推理链数据。
+
+---
+
+## 2. 快速开始
+
+### 第一步:准备工作目录
+
+```bash
+mkdir run_mcts_reasoning
+cd run_mcts_reasoning
+
+```
+
+### 第二步:初始化 DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+这时你会看到:
+
+```bash
+api_pipelines/vision_mcts_api_pipeline.py
+
+```
+
+### 第三步:下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步:配置 API Key
+
+在 `api_pipelines/vision_mcts_api_pipeline.py` 中设置 API Key 环境变量:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### 第五步:配置参数
+
+配置 API 服务和输入数据路径。确保输入文件(jsonl)包含 `tree` 字段(用于提取)或仅包含 `question/image`(用于生成):
+
+```python
+ pipe = VisionMCTSReasoningPipeline(
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ )
+
+```
+
+### 第六步:一键运行
+
+```bash
+cd api_pipelines
+python vision_mcts_api_pipeline.py
+
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+输入数据通常来源于 MCTS 搜索过程的日志,或未标注的图文对:
+
+* **image**:图像路径。
+* **question**:视觉问题。
+* **tree**(可选):MCTS 搜索树的 JSON 结构,包含节点值(Value)、访问次数(Visits)和动作(Actions)。
+
+**输入数据示例**:
+
+```json
+{
+ "image": "./images/puzzle.jpg",
+ "question": "What is the next step to solve this?",
+ "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } }
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+该流水线采用 **“提取优先,生成兜底”** 的混合策略:
+
+#### A. **MCTSTreeRefiner(树结构解析器)**
+
+该算子负责处理 `tree` 字段。它遍历树结构,根据节点价值(Q-value)筛选出从根节点到叶子节点的最佳路径。
+
+* **输入**:`tree` 对象。
+* **功能**:线性化树路径,过滤掉低价值或未完成的搜索分支。
+* **输出**:提取出的推理链列表(`mcts_chains`)。
+
+#### B. **VisualReasoningGenerator(视觉推理生成器)**
+
+该算子是流水线的“生成引擎”。它接收上一步的提取结果作为输入。
+
+* **机制**:检查 `input_existing_chains_key`(即 `mcts_chains`)。
+* 如果 MCTS 解析成功(链存在),则直接复用,不进行推理(节省计算资源)。
+* 如果 MCTS 链为空(树不存在或解析失败),则调用 VLM,根据 `prompt_type`(如 `spatial`)从头生成推理链。
+
+
+* **Prompt 类型**:支持 `spatial`(空间坐标推理)、`logical`(逻辑推理)等模式。
+
+### 3. **输出数据**
+
+最终生成的输出数据(`final_reasoning_chains`)将包含高质量的思维链,可直接用于 SFT 训练。
+
+**输出示例**:
+
+```json
+{
+ "image": "./images/puzzle.jpg",
+ "final_reasoning_chains": [
+ "<think>First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...</think><answer>Move Red Block</answer>"
+ ]
+}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `VisionMCTSReasoningPipeline` 代码实现 (API 版本)。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+# 引入原子算子
+from dataflow.operators.core_text import MCTSTreeRefiner
+from dataflow.operators.core_vision import VisualReasoningGenerator
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+class VisionMCTSReasoningPipeline:
+ def __init__(
+ self,
+ first_entry_file: str,
+ cache_path: str = "../cache/cache_mcts",
+ file_name_prefix: str = "mcts_reason",
+ # Config
+ prompt_type: str = "spatial",
+ max_samples_per_file: int = 10000,
+ # Keys
+ input_question_key: str = "question",
+ input_image_key: str = "image",
+ input_tree_key: str = "tree",
+ output_key: str = "final_reasoning_chains",
+
+ ):
+ self.storage = FileStorage(
+ first_entry_file_name=first_entry_file,
+ cache_path=cache_path,
+ file_name_prefix=file_name_prefix,
+ cache_type="jsonl"
+ )
+
+ self.vlm_serving = APIVLMServing_openai(
+ api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+ model_name="gpt-4o-mini",
+ image_io=None,
+ send_request_stream=False,
+ max_workers=10,
+ timeout=1800
+ )
+
+ self.keys = {
+ "q": input_question_key,
+ "img": input_image_key,
+ "tree": input_tree_key,
+ "mcts_chains": "mcts_extracted_chains",
+ "final": output_key
+ }
+
+ # ================== Operators ==================
+
+ # 1. Refiner: MCTS -> Chains
+ self.op_mcts_refine = MCTSTreeRefiner(
+ max_chains_per_sample=max_samples_per_file
+ )
+
+ # 2. Generator: VLM -> Chains (Fallback)
+ self.op_vlm_gen = VisualReasoningGenerator(
+ serving=self.vlm_serving,
+ prompt_type=prompt_type
+ )
+
+ def forward(self):
+ print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...")
+ self.op_mcts_refine.run(
+ self.storage.step(),
+ input_tree_key=self.keys["tree"],
+ output_key=self.keys["mcts_chains"]
+ )
+
+ print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
+ # 将 mcts_chains 作为 input_existing_chains_key 传入
+ # 如果 MCTS 解析成功,则复用;否则调用 VLM 生成
+ self.op_vlm_gen.run(
+ self.storage.step(),
+ input_question_key=self.keys["q"],
+ input_image_key=self.keys["img"],
+ input_existing_chains_key=self.keys["mcts_chains"],
+ output_key=self.keys["final"]
+ )
+
+
+if __name__ == "__main__":
+ pipe = VisionMCTSReasoningPipeline(
+ first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+ prompt_type="spatial",
+ )
+ pipe.forward()
+
+```