From bcd725b5d04a2bf8c3b07f146c50ac64c691c3d4 Mon Sep 17 00:00:00 2001
From: chawuciren11 <2216740116@qq.com>
Date: Thu, 26 Feb 2026 15:09:55 +0800
Subject: [PATCH 1/2] docs: revise image GCoT guide and add API pipeline guide (en/zh)

---
 .../image_understanding/image_gcot.md         | 182 +++++---
 .../image_understanding/image_gcot_api.md     | 399 ++++++++++++++++++
 .../image_understanding/image_gcot.md         | 182 +++++---
 .../image_understanding/image_gcot_api.md     | 390 +++++++++++++++++
 4 files changed, 1044 insertions(+), 109 deletions(-)
 create mode 100644 docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
 create mode 100644 docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md

diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot.md b/docs/en/notes/mm_guide/image_understanding/image_gcot.md
index 85a41201..68569d6a 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_gcot.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_gcot.md
@@ -27,36 +27,94 @@ The main process of the pipeline includes:
 
 ## 2. Quick Start
 
-### Step 1: Create a Working Directory
+### Step 1: Create a New DataFlow Working Directory
 
 ```bash
-mkdir run_gcot
-cd run_gcot
+mkdir run_dataflow
+cd run_dataflow
 
 ```
 
-### Step 2: Prepare the Script
+### Step 2: Initialize DataFlow-MM
 
-Save the code in the "Pipeline Example" section below as `image_gcot_pipeline.py`.
+```bash
+dataflowmm init
 
-### Step 3: Configure Parameters
+```
 
-Ensure you have a VLM model capable of grounding (e.g., Qwen2.5-VL-7B-Instruct).
+You will then see:
 
 ```bash
-# Install dependencies
-pip install open-dataflow vllm
-
+gpu_pipelines/image_gcot_pipeline.py
 ```
 
-### Step 4: Run
+### Step 3: Download Sample Data
 
 ```bash
-python image_gcot_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_file "data/image_qa.jsonl"
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
+
+### Step 4: Configure Parameters
+```python
+if __name__ == "__main__":
+    pipe = ImageGCoTPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+    )
+    pipe.forward()
+```
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> 
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading.
 
+
+### Step 5: Run
+
+```bash
+cd gpu_pipelines
+python image_gcot_pipeline.py
 ```
+> **🛠️ Troubleshooting**
+> 
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> 
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_gcot_pipeline.py
+> ```
+> 
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> 
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> 
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
+> 
+> **After modification:**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
 
 ---
 
@@ -74,9 +132,9 @@ The input data for this process typically consists of standard VQA data:
 
 ```json
 {
-    "image": "./images/cat_dog.jpg",
-    "question": "Is the cat looking at the dog?",
-    "answer": "Yes"
+    "image":"../example_data/capsbench_images/0.png",
+    "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", 
+    "answer": "Bradley Cooper."
 }
 
 ```
@@ -122,10 +180,7 @@ Finally, the output data generated by the pipeline will contain the following ke
 **Output Data Example (gcot field)**:
 
 ```text
-Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left.
-Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right.
-Step 3: Observe their gaze. The cat is facing the dog.
-Answer: Yes
+{"raw_cot_output":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.  \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper."}
 
 ```
 
@@ -139,6 +194,7 @@ Below is the complete `ImageGCoTPipeline` code implementation.
 import re
 from typing import List, Dict, Any
 import argparse
+import gc
 import torch
 from dataflow.utils.storage import FileStorage
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -147,7 +203,6 @@ from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxG
 from dataflow.operators.core_text import FunctionalRefiner
 from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
 
-# 定义 Prompt 模板，强制模型输出推理步骤和关键词
 GCOT_PROMPT_TEMPLATE = (
     "Question: {question}\n"
     "Answer: {answer}\n\n"
@@ -164,10 +219,8 @@ GCOT_PROMPT_TEMPLATE = (
 
 DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
 
-# ----------------- 辅助逻辑函数 ----------------- #
-
 def _parse_base(text: str) -> Dict[str, Any]:
-    """基础解析逻辑：分离 CoT 文本和 Keywords 行"""
+    """Base parsing logic (reused internally)."""
     if not text: return {"cot": "", "keywords": []}
     lines = text.split('\n')
     cot_lines = []
@@ -175,7 +228,6 @@ def _parse_base(text: str) -> Dict[str, Any]:
     for line in lines:
         if line.strip().lower().startswith('keywords:'):
             keyword_str = line.split(':', 1)[-1].strip()
-            # 简单的分词处理
             raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
             keywords = [k for k in raw_kws if k]
         else:
@@ -183,15 +235,42 @@ def _parse_base(text: str) -> Dict[str, Any]:
     return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
 
 def extract_clean_cot_logic(text: str) -> str:
+    """[For FunctionalRefiner] Return only the cleaned CoT text."""
     return _parse_base(text)["cot"]
 
 def extract_keywords_logic(text: str) -> List[str]:
-    return _parse_base(text)["keywords"]
+    """[For FunctionalRefiner] Extract and merge keywords."""
+    parsed = _parse_base(text)
+    kws = parsed["keywords"]
+    cot = parsed["cot"]
+    
+    if not kws or len(kws) <= 1:
+        return kws
+    
+    # Simple adjacent-keyword merge logic
+    cot_lower = cot.lower()
+    merged = []
+    skip_indices = set()
+    for i in range(len(kws)):
+        if i in skip_indices: continue
+        best_match = kws[i]
+        best_indices = [i]
+        # Try to merge with up to 3 following keywords
+        for j in range(i + 1, min(i + 4, len(kws))):
+            if j in skip_indices: break
+            combined = ' '.join(kws[i:j+1])
+            if combined.lower() in cot_lower:
+                best_match = combined
+                best_indices = list(range(i, j+1))
+            else: break
+        merged.append(best_match)
+        skip_indices.update(best_indices)
+    return merged
 
 def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
-    """将 BBox 注入回 CoT 文本"""
+    """[For FunctionalRefiner] Inject BBoxes back into the CoT."""
     if not cot_text or not bbox_map: return cot_text
-    # 优先匹配长词，避免子串误匹配
+    # Match longer keywords first
     sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
     result_text = cot_text
     replaced = set()
@@ -202,37 +281,35 @@ def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
         answer_pos = result_text.find('Answer:')
         search_limit = answer_pos if answer_pos != -1 else len(result_text)
         
-        # 大小写不敏感查找
         pos = result_text.lower().find(keyword.lower(), 0, search_limit)
         if pos == -1: continue
         
         boxes = bbox_map[keyword] # List[str]
         box_str = "".join(boxes)
-        # 替换：保留原词，追加 Box
         replacement = f"{keyword} {box_str}"
         
         result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
         replaced.add(keyword)
     return result_text
 
-# ----------------- 流水线定义 ----------------- #
-
 class ImageGCoTPipeline:
     def __init__(
         self,
         model_path: str,
         *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
         first_entry_file: str,
-        cache_path: str = "./cache_gcot",
+        cache_path: str = "../cache/cache_gcot",
         file_name_prefix: str = "gcot",
-        # Keys 配置
+        # Keys
         question_key: str = "question",
         answer_key: str = "answer",
         image_key: str = "image",
         output_key: str = "gcot",
+        # Config
         vllm_max_tokens: int = 512
     ):
-        # 1. 存储初始化
         self.storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
@@ -240,9 +317,11 @@ class ImageGCoTPipeline:
             cache_type="jsonl"
         )
         
-        # 2. 模型服务 (单一模型)
+        # [Single-model serving]
         self.vlm_serving = LocalModelVLMServing_vllm(
             hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
             vllm_tensor_parallel_size=1,
             vllm_temperature=0.7,
             vllm_max_tokens=vllm_max_tokens
@@ -259,28 +338,28 @@ class ImageGCoTPipeline:
             "final": output_key
         }
 
-        # 3. 算子链配置
+        # ================== Operators ==================
         
-        # Step A: 生成 CoT 和 Keywords
+        # 1. Generate CoT (generic Generator)
         self.op_gen_cot = PromptTemplatedVQAGenerator(
             serving=self.vlm_serving,
             system_prompt="You are a helpful assistant.",
             prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
         )
         
-        # Step B: 解析清洗 CoT
+        # 2. Extract Clean CoT (generic Refiner + helper)
         self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
         
-        # Step C: 解析 Keywords
+        # 3. Extract Keywords (generic Refiner + helper)
         self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
 
-        # Step D: 生成 BBox (Grounding)
+        # 4. Generate BBox (dedicated Generator, because it involves per-row batching)
         self.op_bbox_gen = VLMBBoxGenerator(
             serving=self.vlm_serving,
             prompt_template=DEFAULT_BBOX_PROMPT
         )
         
-        # Step E: 注入 BBox 到 CoT
+        # 5. Inject GCoT (generic Refiner + helper)
         self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
 
     def forward(self):
@@ -289,7 +368,7 @@ class ImageGCoTPipeline:
             self.storage.step(),
             input_image_key=self.keys["img"],
             output_answer_key=self.keys["raw_cot"],
-            question=self.keys["q"],
+            question=self.keys["q"], # Template mapping
             answer=self.keys["a"]
         )
         
@@ -297,7 +376,7 @@ class ImageGCoTPipeline:
         self.op_extract_cot.run(
             self.storage.step(),
             output_key=self.keys["clean_cot"],
-            text=self.keys["raw_cot"]
+            text=self.keys["raw_cot"] # Param mapping
         )
         self.op_extract_kws.run(
             self.storage.step(),
@@ -325,16 +404,13 @@ class ImageGCoTPipeline:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    
-    args = parser.parse_args()
-    
     pipe = ImageGCoTPipeline(
-        model_path=args.model_path,
-        first_entry_file=args.input_file
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
     )
     pipe.forward()
 
+
 ```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
new file mode 100644
index 00000000..1df75290
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
@@ -0,0 +1,399 @@
+---
+title: Image Grounded CoT (GCoT) Pipeline (API Version)
+icon: mdi:image-text
+createTime: 2026/01/11 20:44:55
+permalink: /en/mm_guide/image_gcot_api/
+---
+## 1. Overview
+
+The **Image Grounded Chain-of-Thought (GCoT) Pipeline** is designed to automatically generate **Grounded Chain-of-Thought** data. This pipeline generates multi-step reasoning to answer a question and simultaneously spatially locates (via Bounding Boxes) the key objects mentioned during the reasoning process. This significantly enhances the interpretability and precision of multimodal data.
+
+Unlike traditional methods, this pipeline uses a **Single VLM (e.g., Qwen2.5-VL)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient.
+
+We support the following application scenarios:
+
+* **Enhanced Multimodal Data Construction**: Adding interpretability and grounding annotations to VQA datasets.
+* **Complex Scene Understanding**: Generating detailed reasoning steps containing object coordinates.
+* **Model Reasoning Training**: Building data to train models to be "grounded" and reduce hallucinations.
+
+The main process of the pipeline includes:
+
+1. **CoT Generation**: The model generates step-by-step reasoning text and extracts key nouns.
+2. **Keyword Parsing**: Cleaning and extracting keywords to be grounded from the generated text.
+3. **Visual Grounding**: The model generates bounding boxes (BBoxes) for the extracted keywords.
+4. **Information Injection**: Injecting BBox coordinates back into the reasoning text to form the final GCoT.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_dataflow
+cd run_dataflow
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/image_gcot_api_pipeline.py
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/image_gcot_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+
+### Step 5: Configure Parameters
+
+Configure the API service and input data paths in `api_pipelines/image_gcot_api_pipeline.py`:
+
+```python
+    def __init__(
+        self,
+        *,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_gcot",
+        file_name_prefix: str = "gcot",
+        # Keys
+        question_key: str = "question",
+        answer_key: str = "answer",
+        image_key: str = "image",
+        output_key: str = "gcot",
+        # Config
+        vllm_max_tokens: int = 512
+    ):
+```
+
+```python
+self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+
+```
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python image_gcot_api_pipeline.py
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data for this process typically consists of standard VQA data:
+
+* **image**: Path to the image file.
+* **question**: Question about the image.
+* **answer**: Standard answer to the question (used to assist CoT generation).
+
+**Input Data Example**:
+
+```json
+{
+    "image":"../example_data/capsbench_images/0.png",
+    "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", 
+    "answer": "Bradley Cooper."
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline combines multiple fine-grained operators to achieve complex GCoT generation logic:
+
+#### A. **CoT Generation (PromptTemplatedVQAGenerator)**
+
+Uses a predefined `GCOT_PROMPT_TEMPLATE` to guide the model to generate "Step-by-step Reasoning" and a "Keyword List".
+
+* **Prompt Strategy**: Asks the model to output in the format `Step 1: ...`, `Step 2: ...`, `Keywords: ...`.
+* **Output**: Raw string containing reasoning text and keywords.
+
+#### B. **Text Cleaning & Extraction (FunctionalRefiner)**
+
+Uses custom functions to parse the output from the previous step:
+
+* `extract_clean_cot_logic`: Strips the keyword section, keeping pure CoT text.
+* `extract_keywords_logic`: Parses the content after `Keywords:` to generate a Python List.
+
+#### C. **Visual Grounding (VLMBBoxGenerator)**
+
+Calls the VLM's grounding capability to generate bounding boxes for each extracted keyword.
+
+* **Input**: Image + List of Keywords.
+* **Output**: Dictionary mapping keywords to bounding box coordinates.
+
+#### D. **Coordinate Injection (FunctionalRefiner)**
+
+Uses the `inject_bboxes_logic` function to intelligently insert the generated BBox coordinates back into the original CoT text after the corresponding words.
+
+### 3. **Output Data**
+
+Finally, the output data generated by the pipeline will contain the following key fields:
+
+* **raw_cot_output**: Raw text generated by the model.
+* **cleaned_cot**: Cleaned reasoning text.
+* **bbox_mapping**: Mapping of keywords to their coordinates.
+* **gcot**: Final result, reasoning chain containing coordinate information.
+
+**Output Data Example (gcot field)**:
+
+```text
+{"raw_cot_output":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.  \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper."}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete API-served `ImageGCoTPipeline` code implementation.
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+import re
+from typing import List, Dict, Any
+import argparse
+import gc
+import torch
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+GCOT_PROMPT_TEMPLATE = (
+    "Question: {question}\n"
+    "Answer: {answer}\n\n"
+    "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains "
+    "how to arrive at this answer based on the image.\n"
+    "Then, extract key nouns and objects mentioned in your reasoning that are "
+    "visible in the image and can be spatially located.\n\n"
+    "Format:\n"
+    "Step 1: ...\n"
+    "Step 2: ...\n"
+    "Answer: {answer}\n"
+    "Keywords: object1, object2\n"
+)
+
+DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
+
+def _parse_base(text: str) -> Dict[str, Any]:
+    """Base parsing logic (reused internally)."""
+    if not text: return {"cot": "", "keywords": []}
+    lines = text.split('\n')
+    cot_lines = []
+    keywords = []
+    for line in lines:
+        if line.strip().lower().startswith('keywords:'):
+            keyword_str = line.split(':', 1)[-1].strip()
+            raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
+            keywords = [k for k in raw_kws if k]
+        else:
+            cot_lines.append(line)
+    return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
+
+def extract_clean_cot_logic(text: str) -> str:
+    """[For FunctionalRefiner] Return only the cleaned CoT text."""
+    return _parse_base(text)["cot"]
+
+def extract_keywords_logic(text: str) -> List[str]:
+    """[For FunctionalRefiner] Extract and merge keywords."""
+    parsed = _parse_base(text)
+    kws = parsed["keywords"]
+    cot = parsed["cot"]
+    
+    if not kws or len(kws) <= 1:
+        return kws
+    
+    # Simple adjacent-keyword merge logic
+    cot_lower = cot.lower()
+    merged = []
+    skip_indices = set()
+    for i in range(len(kws)):
+        if i in skip_indices: continue
+        best_match = kws[i]
+        best_indices = [i]
+        # Try to merge with up to 3 following keywords
+        for j in range(i + 1, min(i + 4, len(kws))):
+            if j in skip_indices: break
+            combined = ' '.join(kws[i:j+1])
+            if combined.lower() in cot_lower:
+                best_match = combined
+                best_indices = list(range(i, j+1))
+            else: break
+        merged.append(best_match)
+        skip_indices.update(best_indices)
+    return merged
+
+def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
+    """[For FunctionalRefiner] Inject BBoxes back into the CoT."""
+    if not cot_text or not bbox_map: return cot_text
+    # Match longer keywords first
+    sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
+    result_text = cot_text
+    replaced = set()
+    
+    for keyword in sorted_keywords:
+        if keyword in replaced: continue
+        # Simple strategy: inject only before 'Answer:' so the answer section is not corrupted
+        answer_pos = result_text.find('Answer:')
+        search_limit = answer_pos if answer_pos != -1 else len(result_text)
+        
+        pos = result_text.lower().find(keyword.lower(), 0, search_limit)
+        if pos == -1: continue
+        
+        boxes = bbox_map[keyword] # List[str]
+        box_str = "".join(boxes)
+        replacement = f"{keyword} {box_str}"
+        
+        result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
+        replaced.add(keyword)
+    return result_text
+
+class ImageGCoTPipeline:
+    def __init__(
+        self,
+        *,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_gcot",
+        file_name_prefix: str = "gcot",
+        # Keys
+        question_key: str = "question",
+        answer_key: str = "answer",
+        image_key: str = "image",
+        output_key: str = "gcot",
+        # Config
+        vllm_max_tokens: int = 512
+    ):
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+        
+        self.keys = {
+            "q": question_key,
+            "a": answer_key,
+            "img": image_key,
+            "raw_cot": "raw_cot_output",
+            "clean_cot": "cleaned_cot",
+            "keywords": "extracted_keywords",
+            "bbox_map": "bbox_mapping",
+            "final": output_key
+        }
+
+        # ================== Operators ==================
+        
+        # 1. Generate CoT (generic Generator)
+        self.op_gen_cot = PromptTemplatedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt="You are a helpful assistant.",
+            prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
+        )
+        
+        # 2. Extract Clean CoT (generic Refiner + helper)
+        self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
+        
+        # 3. Extract Keywords (generic Refiner + helper)
+        self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
+
+        # 4. Generate BBox (dedicated Generator, because it involves per-row batching)
+        self.op_bbox_gen = VLMBBoxGenerator(
+            serving=self.vlm_serving,
+            prompt_template=DEFAULT_BBOX_PROMPT
+        )
+        
+        # 5. Inject GCoT (generic Refiner + helper)
+        self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating CoT...")
+        self.op_gen_cot.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_cot"],
+            question=self.keys["q"], # Template mapping
+            answer=self.keys["a"]
+        )
+        
+        print(">>> [Pipeline] Step 2: Parsing Outputs...")
+        self.op_extract_cot.run(
+            self.storage.step(),
+            output_key=self.keys["clean_cot"],
+            text=self.keys["raw_cot"] # Param mapping
+        )
+        self.op_extract_kws.run(
+            self.storage.step(),
+            output_key=self.keys["keywords"],
+            text=self.keys["raw_cot"]
+        )
+        
+        print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...")
+        self.op_bbox_gen.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            input_kws_key=self.keys["keywords"],
+            output_key=self.keys["bbox_map"]
+        )
+        
+        print(">>> [Pipeline] Step 4: Injecting GCoT...")
+        self.op_inject.run(
+            self.storage.step(),
+            output_key=self.keys["final"],
+            cot_text=self.keys["clean_cot"],
+            bbox_map=self.keys["bbox_map"]
+        )
+        
+        print(f">>> [Pipeline] Done. Final GCoT saved to: {self.keys['final']}")
+
+
+if __name__ == "__main__":
+    pipe = ImageGCoTPipeline(
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+    )
+    pipe.forward()
+
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
index a4a11c3d..3a5add79 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
@@ -27,36 +27,88 @@ permalink: /zh/mm_guide/image_gcot/
 
 ## 2. 快速开始
 
-### 第一步：准备工作目录
-
+### 第一步：创建新的 DataFlow 工作文件夹
 ```bash
-mkdir run_gcot
-cd run_gcot
-
+mkdir run_dataflow
+cd run_dataflow
 ```
 
-### 第二步：准备脚本
-
-将下文“流水线示例”中的代码保存为 `image_gcot_pipeline.py`。
+### 第二步：初始化 DataFlow-MM
+```bash
+dataflowmm init
+```
+这时你会看到：
+```bash
+gpu_pipelines/image_gcot_pipeline.py
+```
 
-### 第三步：配置运行参数
+### 第三步：下载示例数据
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
 
-确保你拥有支持定位能力的 VLM 模型（如 Qwen2.5-VL-7B-Instruct）。
+### 第四步：配置参数
 
 ```bash
-# 安装依赖
-pip install open-dataflow vllm
+if __name__ == "__main__":
+    pipe = ImageGCoTPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+    )
+    pipe.forward()
 
 ```
+> **⚠️ 模型路径配置的重要提示（以 `Qwen2.5-VL-3B-Instruct` 为例）：**
+> 
+> * **如果您已经下载好了模型文件**：请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`，否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型（需要自动下载）**：请一定要指定 `download_dir` 参数，并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**（正如默认参数所示），否则下载完成后同样会导致框架无法识别模型。
 
-### 第四步：一键运行
+### 第五步：一键运行
 
 ```bash
-python image_gcot_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_file "data/image_qa.jsonl"
-
+cd gpu_pipelines
+python image_gcot_pipeline.py
 ```
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> 
+> **问题 1：** 如果遇到类似如下的动态链接库冲突报错：
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> 
+> **解决方法：** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`：
+> ```bash
+> LD_LIBRARY_PATH="" python image_gcot_pipeline.py
+> ```
+> 
+> **问题 2：** 如果您使用的是 **Qwen 系列模型**，并且遇到以下报错：
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> 
+> **解决方法：** 打开模型文件夹下的 `config.json` 文件，找到 `rope_scaling` 配置块，将 `"type"` 字段修改为 `"rope_type"` 即可。
+> 
+> **修改前：**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
+> 
+> **修改后：**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
 
 ---
 
@@ -74,9 +126,9 @@ python image_gcot_pipeline.py \
 
 ```json
 {
-    "image": "./images/cat_dog.jpg",
-    "question": "Is the cat looking at the dog?",
-    "answer": "Yes"
+    "image":"../example_data/capsbench_images/0.png",
+    "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", 
+    "answer": "Bradley Cooper."
 }
 
 ```
@@ -122,10 +174,7 @@ python image_gcot_pipeline.py \
 **输出数据示例 (gcot 字段)**：
 
 ```text
-Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left.
-Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right.
-Step 3: Observe their gaze. The cat is facing the dog.
-Answer: Yes
+{"raw_cot_output":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.  \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper."}
 
 ```
 
@@ -139,6 +188,7 @@ Answer: Yes
 import re
 from typing import List, Dict, Any
 import argparse
+import gc
 import torch
 from dataflow.utils.storage import FileStorage
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
@@ -147,7 +197,6 @@ from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxG
 from dataflow.operators.core_text import FunctionalRefiner
 from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
 
-# 定义 Prompt 模板，强制模型输出推理步骤和关键词
 GCOT_PROMPT_TEMPLATE = (
     "Question: {question}\n"
     "Answer: {answer}\n\n"
@@ -164,10 +213,8 @@ GCOT_PROMPT_TEMPLATE = (
 
 DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
 
-# ----------------- 辅助逻辑函数 ----------------- #
-
 def _parse_base(text: str) -> Dict[str, Any]:
-    """基础解析逻辑：分离 CoT 文本和 Keywords 行"""
+    """基础解析逻辑（内部复用）"""
     if not text: return {"cot": "", "keywords": []}
     lines = text.split('\n')
     cot_lines = []
@@ -175,7 +222,6 @@ def _parse_base(text: str) -> Dict[str, Any]:
     for line in lines:
         if line.strip().lower().startswith('keywords:'):
             keyword_str = line.split(':', 1)[-1].strip()
-            # 简单的分词处理
             raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
             keywords = [k for k in raw_kws if k]
         else:
@@ -183,15 +229,42 @@ def _parse_base(text: str) -> Dict[str, Any]:
     return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
 
 def extract_clean_cot_logic(text: str) -> str:
+    """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本"""
     return _parse_base(text)["cot"]
 
 def extract_keywords_logic(text: str) -> List[str]:
-    return _parse_base(text)["keywords"]
+    """[For FunctionalRefiner] 提取并合并关键词"""
+    parsed = _parse_base(text)
+    kws = parsed["keywords"]
+    cot = parsed["cot"]
+    
+    if not kws or len(kws) <= 1:
+        return kws
+    
+    # 简单的相邻合并逻辑
+    cot_lower = cot.lower()
+    merged = []
+    skip_indices = set()
+    for i in range(len(kws)):
+        if i in skip_indices: continue
+        best_match = kws[i]
+        best_indices = [i]
+        # 尝试向后合并 3 个词
+        for j in range(i + 1, min(i + 4, len(kws))):
+            if j in skip_indices: break
+            combined = ' '.join(kws[i:j+1])
+            if combined.lower() in cot_lower:
+                best_match = combined
+                best_indices = list(range(i, j+1))
+            else: break
+        merged.append(best_match)
+        skip_indices.update(best_indices)
+    return merged
 
 def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
-    """将 BBox 注入回 CoT 文本"""
+    """[For FunctionalRefiner] 将 BBox 注入回 CoT"""
     if not cot_text or not bbox_map: return cot_text
-    # 优先匹配长词，避免子串误匹配
+    # 优先匹配长词
     sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
     result_text = cot_text
     replaced = set()
@@ -202,37 +275,35 @@ def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
         answer_pos = result_text.find('Answer:')
         search_limit = answer_pos if answer_pos != -1 else len(result_text)
         
-        # 大小写不敏感查找
         pos = result_text.lower().find(keyword.lower(), 0, search_limit)
         if pos == -1: continue
         
         boxes = bbox_map[keyword] # List[str]
         box_str = "".join(boxes)
-        # 替换：保留原词，追加 Box
         replacement = f"{keyword} {box_str}"
         
         result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
         replaced.add(keyword)
     return result_text
 
-# ----------------- 流水线定义 ----------------- #
-
 class ImageGCoTPipeline:
     def __init__(
         self,
         model_path: str,
         *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
         first_entry_file: str,
-        cache_path: str = "./cache_gcot",
+        cache_path: str = "../cache/cache_gcot",
         file_name_prefix: str = "gcot",
-        # Keys 配置
+        # Keys
         question_key: str = "question",
         answer_key: str = "answer",
         image_key: str = "image",
         output_key: str = "gcot",
+        # Config
         vllm_max_tokens: int = 512
     ):
-        # 1. 存储初始化
         self.storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
@@ -240,9 +311,11 @@ class ImageGCoTPipeline:
             cache_type="jsonl"
         )
         
-        # 2. 模型服务 (单一模型)
+        # [单一模型 Serving]
         self.vlm_serving = LocalModelVLMServing_vllm(
             hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
             vllm_tensor_parallel_size=1,
             vllm_temperature=0.7,
             vllm_max_tokens=vllm_max_tokens
@@ -259,28 +332,28 @@ class ImageGCoTPipeline:
             "final": output_key
         }
 
-        # 3. 算子链配置
+        # ================== Operators ==================
         
-        # Step A: 生成 CoT 和 Keywords
+        # 1. Generate CoT (通用 Generator)
         self.op_gen_cot = PromptTemplatedVQAGenerator(
             serving=self.vlm_serving,
             system_prompt="You are a helpful assistant.",
             prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
         )
         
-        # Step B: 解析清洗 CoT
+        # 2. Extract Clean CoT (通用 Refiner + Helper)
         self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
         
-        # Step C: 解析 Keywords
+        # 3. Extract Keywords (通用 Refiner + Helper)
         self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
 
-        # Step D: 生成 BBox (Grounding)
+        # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch)
         self.op_bbox_gen = VLMBBoxGenerator(
             serving=self.vlm_serving,
             prompt_template=DEFAULT_BBOX_PROMPT
         )
         
-        # Step E: 注入 BBox 到 CoT
+        # 5. Inject GCoT (通用 Refiner + Helper)
         self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
 
     def forward(self):
@@ -289,7 +362,7 @@ class ImageGCoTPipeline:
             self.storage.step(),
             input_image_key=self.keys["img"],
             output_answer_key=self.keys["raw_cot"],
-            question=self.keys["q"],
+            question=self.keys["q"], # Template mapping
             answer=self.keys["a"]
         )
         
@@ -297,7 +370,7 @@ class ImageGCoTPipeline:
         self.op_extract_cot.run(
             self.storage.step(),
             output_key=self.keys["clean_cot"],
-            text=self.keys["raw_cot"]
+            text=self.keys["raw_cot"] # Param mapping
         )
         self.op_extract_kws.run(
             self.storage.step(),
@@ -325,16 +398,13 @@ class ImageGCoTPipeline:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    
-    args = parser.parse_args()
-    
     pipe = ImageGCoTPipeline(
-        model_path=args.model_path,
-        first_entry_file=args.input_file
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
     )
     pipe.forward()
 
+
 ```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md
new file mode 100644
index 00000000..642b0d5a
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md
@@ -0,0 +1,390 @@
+---
+title: 图像定位思维链 (GCoT) 生成流水线
+icon: mdi:image-text
+createTime: 2026/01/11 20:44:55
+permalink: /zh/mm_guide/image_gcot/
+---
+## 1. 概述
+
+**图像定位思维链 (GCoT) 生成流水线** 旨在自动化生成**带视觉定位的思维链（Grounded Chain-of-Thought）**数据。该流水线通过多步推理，不仅生成回答问题的逻辑步骤，还将推理过程中提到的关键物体在图像中进行空间定位（Bounding Box），从而显著提升多模态数据的可解释性和精确度。
+
+与传统方法不同，本流水线采用 **单一 VLM（如 Qwen2.5-VL）** 同时完成“推理”和“定位”任务，流程更加精简高效。
+
+我们支持以下应用场景：
+
+* **增强型多模态数据构建**：为 VQA 数据集增加解释性和定位标注。
+* **复杂场景理解**：生成包含物体坐标的详细推理步骤。
+* **模型推理能力训练**：构建数据以训练模型“言之有物”，减少幻觉。
+
+流水线的主要流程包括：
+
+1. **CoT 生成**：模型生成分步推理文本，并提取关键名词。
+2. **关键词解析**：从生成的文本中清洗并提取待定位的关键词。
+3. **视觉定位 (Grounding)**：模型针对提取的关键词生成边界框 (BBox)。
+4. **信息注入**：将 BBox 坐标回填至推理文本中，形成最终的 GCoT。
+
+---
+
+## 2. 快速开始
+
+### 第一步：创建新的 DataFlow 工作文件夹
+```bash
+mkdir run_dataflow
+cd run_dataflow
+```
+
+### 第二步：初始化 DataFlow-MM
+```bash
+dataflowmm init
+```
+这时你会看到：
+```bash
+api_pipelines/image_gcot_api_pipeline.py
+```
+
+### 第三步：下载示例数据
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
+
+### 第四步：配置 API Key
+
+在 `api_pipelines/image_gcot_api_pipeline.py` 中设置 API Key 环境变量：
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+```
+
+### 第五步：配置参数
+
+在 `api_pipelines/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：
+
+```python
+    def __init__(
+        self,
+        *,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_gcot",
+        file_name_prefix: str = "gcot",
+        # Keys
+        question_key: str = "question",
+        answer_key: str = "answer",
+        image_key: str = "image",
+        output_key: str = "gcot",
+        # Config
+        vllm_max_tokens: int = 512
+    ):
+```
+
+```python
+self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+```
+
+### 第六步：一键运行
+```bash
+cd api_pipelines
+python image_gcot_api_pipeline.py
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+该流程的输入数据通常是标准的 VQA 数据：
+
+* **image**：图像文件路径。
+* **question**：关于图像的问题。
+* **answer**：问题的标准答案（用于辅助生成 CoT）。
+
+**输入数据示例**：
+
+```json
+{
+    "image":"../example_data/capsbench_images/0.png",
+    "question":"Who is the lead actor in the movie \"Nightmare Alley\"?", 
+    "answer": "Bradley Cooper."
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+本流水线通过组合多个细粒度算子来实现复杂的 GCoT 生成逻辑：
+
+#### A. **CoT 生成 (PromptTemplatedVQAGenerator)**
+
+利用预设的 `GCOT_PROMPT_TEMPLATE`，引导模型生成“步骤化推理”和“关键词列表”。
+
+* **Prompt 策略**：要求模型按 `Step 1: ...`, `Step 2: ...`, `Keywords: ...` 格式输出。
+* **输出**：包含推理文本和关键词的原始字符串。
+
+#### B. **文本清洗与提取 (FunctionalRefiner)**
+
+使用自定义函数对上一步的输出进行解析：
+
+* `extract_clean_cot_logic`：剥离关键词部分，保留纯净的 CoT 文本。
+* `extract_keywords_logic`：解析 `Keywords:` 后的内容，生成 Python List。
+
+#### C. **视觉定位 (VLMBBoxGenerator)**
+
+针对提取出的每一个关键词，调用 VLM 的定位能力生成边界框。
+
+* **输入**：图像 + 关键词列表。
+* **输出**：关键词到边界框坐标的映射字典 (Map)。
+
+#### D. **坐标注入 (FunctionalRefiner)**
+
+使用 `inject_bboxes_logic` 函数，将生成的 BBox 坐标智能插入回原始 CoT 文本中对应的单词之后。
+
+### 3. **输出数据**
+
+最终，流水线生成的输出数据将包含以下关键字段：
+
+* **raw_cot_output**：模型原始生成的文本。
+* **cleaned_cot**：清洗后的纯推理文本。
+* **bbox_mapping**：关键词与其坐标的映射。
+* **gcot**：最终结果，包含坐标信息的推理链。
+
+**输出数据示例 (gcot 字段)**：
+
+```text
+{"raw_cot_output":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.  \nKeywords: Nightmare Alley, cast list, poster.","cleaned_cot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper.","extracted_keywords":["Nightmare Alley","cast list","poster"],"bbox_mapping":{},"gcot":"Step 1: Analyze the text visible in the image, which includes a list of actors beneath the title of the movie \"Nightmare Alley.\"\n\nStep 2: Identify the names listed. The first name listed is \"Bradley Cooper,\" indicating he is prominent in the film.\n\nStep 3: Recognize that the image is a promotional poster for \"Nightmare Alley,\" suggesting the individuals mentioned are likely key cast members.\n\nStep 4: Confirm that Bradley Cooper is identified as the lead actor based on his position at the top of the cast list.\n\nAnswer: Bradley Cooper."}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 API 版 `ImageGCoTPipeline` 代码实现。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+import re
+from typing import List, Dict, Any
+import argparse
+import gc
+import torch
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+GCOT_PROMPT_TEMPLATE = (
+    "Question: {question}\n"
+    "Answer: {answer}\n\n"
+    "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains "
+    "how to arrive at this answer based on the image.\n"
+    "Then, extract key nouns and objects mentioned in your reasoning that are "
+    "visible in the image and can be spatially located.\n\n"
+    "Format:\n"
+    "Step 1: ...\n"
+    "Step 2: ...\n"
+    "Answer: {answer}\n"
+    "Keywords: object1, object2\n"
+)
+
+DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
+
+def _parse_base(text: str) -> Dict[str, Any]:
+    """基础解析逻辑（内部复用）"""
+    if not text: return {"cot": "", "keywords": []}
+    lines = text.split('\n')
+    cot_lines = []
+    keywords = []
+    for line in lines:
+        if line.strip().lower().startswith('keywords:'):
+            keyword_str = line.split(':', 1)[-1].strip()
+            raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
+            keywords = [k for k in raw_kws if k]
+        else:
+            cot_lines.append(line)
+    return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
+
+def extract_clean_cot_logic(text: str) -> str:
+    """[For FunctionalRefiner] 仅返回清洗后的 CoT 文本"""
+    return _parse_base(text)["cot"]
+
+def extract_keywords_logic(text: str) -> List[str]:
+    """[For FunctionalRefiner] 提取并合并关键词"""
+    parsed = _parse_base(text)
+    kws = parsed["keywords"]
+    cot = parsed["cot"]
+    
+    if not kws or len(kws) <= 1:
+        return kws
+    
+    # 简单的相邻合并逻辑
+    cot_lower = cot.lower()
+    merged = []
+    skip_indices = set()
+    for i in range(len(kws)):
+        if i in skip_indices: continue
+        best_match = kws[i]
+        best_indices = [i]
+        # 尝试向后合并 3 个词
+        for j in range(i + 1, min(i + 4, len(kws))):
+            if j in skip_indices: break
+            combined = ' '.join(kws[i:j+1])
+            if combined.lower() in cot_lower:
+                best_match = combined
+                best_indices = list(range(i, j+1))
+            else: break
+        merged.append(best_match)
+        skip_indices.update(best_indices)
+    return merged
+
+def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
+    """[For FunctionalRefiner] 将 BBox 注入回 CoT"""
+    if not cot_text or not bbox_map: return cot_text
+    # 优先匹配长词
+    sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
+    result_text = cot_text
+    replaced = set()
+    
+    for keyword in sorted_keywords:
+        if keyword in replaced: continue
+        # 简单策略：只在 'Answer:' 之前注入，防止破坏答案区
+        answer_pos = result_text.find('Answer:')
+        search_limit = answer_pos if answer_pos != -1 else len(result_text)
+        
+        pos = result_text.lower().find(keyword.lower(), 0, search_limit)
+        if pos == -1: continue
+        
+        boxes = bbox_map[keyword] # List[str]
+        box_str = "".join(boxes)
+        replacement = f"{keyword} {box_str}"
+        
+        result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
+        replaced.add(keyword)
+    return result_text
+
+class ImageGCoTPipeline:
+    def __init__(
+        self,
+        *,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_gcot",
+        file_name_prefix: str = "gcot",
+        # Keys
+        question_key: str = "question",
+        answer_key: str = "answer",
+        image_key: str = "image",
+        output_key: str = "gcot",
+        # Config
+        vllm_max_tokens: int = 512
+    ):
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+        
+        self.keys = {
+            "q": question_key,
+            "a": answer_key,
+            "img": image_key,
+            "raw_cot": "raw_cot_output",
+            "clean_cot": "cleaned_cot",
+            "keywords": "extracted_keywords",
+            "bbox_map": "bbox_mapping",
+            "final": output_key
+        }
+
+        # ================== Operators ==================
+        
+        # 1. Generate CoT (通用 Generator)
+        self.op_gen_cot = PromptTemplatedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt="You are a helpful assistant.",
+            prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
+        )
+        
+        # 2. Extract Clean CoT (通用 Refiner + Helper)
+        self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
+        
+        # 3. Extract Keywords (通用 Refiner + Helper)
+        self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
+
+        # 4. Generate BBox (专用 Generator, 因为涉及行内 Batch)
+        self.op_bbox_gen = VLMBBoxGenerator(
+            serving=self.vlm_serving,
+            prompt_template=DEFAULT_BBOX_PROMPT
+        )
+        
+        # 5. Inject GCoT (通用 Refiner + Helper)
+        self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating CoT...")
+        self.op_gen_cot.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_cot"],
+            question=self.keys["q"], # Template mapping
+            answer=self.keys["a"]
+        )
+        
+        print(">>> [Pipeline] Step 2: Parsing Outputs...")
+        self.op_extract_cot.run(
+            self.storage.step(),
+            output_key=self.keys["clean_cot"],
+            text=self.keys["raw_cot"] # Param mapping
+        )
+        self.op_extract_kws.run(
+            self.storage.step(),
+            output_key=self.keys["keywords"],
+            text=self.keys["raw_cot"]
+        )
+        
+        print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...")
+        self.op_bbox_gen.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            input_kws_key=self.keys["keywords"],
+            output_key=self.keys["bbox_map"]
+        )
+        
+        print(">>> [Pipeline] Step 4: Injecting GCoT...")
+        self.op_inject.run(
+            self.storage.step(),
+            output_key=self.keys["final"],
+            cot_text=self.keys["clean_cot"],
+            bbox_map=self.keys["bbox_map"]
+        )
+        
+        print(f">>> [Pipeline] Done. Final GCoT saved to: {self.keys['final']}")
+
+
+if __name__ == "__main__":
+    pipe = ImageGCoTPipeline(
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+    )
+    pipe.forward()
+
+
+```

From b697def2cc32ce160e3174d90a6a203d5a195c2f Mon Sep 17 00:00:00 2001
From: HankYang <hankyang428@gmail.com>
Date: Thu, 26 Feb 2026 18:20:51 +0800
Subject: [PATCH 2/2] fix doc

---
 docs/.vuepress/notes/en/mm_guide.ts           |   4 +
 docs/.vuepress/notes/zh/mm_guide.ts           |   4 +
 .../image_understanding/image_gcot.md         |   2 -
 .../image_understanding/image_gcot_api.md     |  17 +-
 .../image_scale_caption_pipeline.md           | 302 ++++++++---
 .../image_scale_caption_pipeline_api.md       | 477 ++++++++++++++++++
 .../image_visual_only_mcq_pipeline.md         | 206 +++++---
 .../image_visual_only_mcq_pipeline_api.md     | 341 +++++++++++++
 .../vision_mct_reasoning_pipeline.md          | 185 +++++--
 .../vision_mct_reasoning_pipeline_api.md      | 248 +++++++++
 .../image_understanding/image_gcot.md         |   2 -
 .../image_understanding/image_gcot_api.md     |  18 +-
 .../image_scale_caption_pipeline.md           | 247 +++++++--
 .../image_scale_caption_pipeline_api.md       | 477 ++++++++++++++++++
 .../image_visual_only_mcq_pipeline.md         | 161 ++++--
 .../image_visual_only_mcq_pipeline_api.md     | 339 +++++++++++++
 .../vision_mct_reasoning_pipeline.md          | 128 +++--
 .../vision_mct_reasoning_pipeline_api.md      | 248 +++++++++
 18 files changed, 3053 insertions(+), 353 deletions(-)
 create mode 100644 docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
 create mode 100644 docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
 create mode 100644 docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
 create mode 100644 docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
 create mode 100644 docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
 create mode 100644 docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md

diff --git a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts
index 8a5469cc..ab560546 100644
--- a/docs/.vuepress/notes/en/mm_guide.ts
+++ b/docs/.vuepress/notes/en/mm_guide.ts
@@ -28,11 +28,15 @@ export const MMGuide: ThemeNote = defineNoteConfig({
                 'context_vqa',
                 'context_vqa_api',
                 'image_gcot',
+                'image_gcot_api',
                 'vision_mct_reasoning_pipeline',
+                'vision_mct_reasoning_pipeline_api',
                 'image_region_caption_pipeline',
                 'image_region_caption_pipeline_api',
                 'image_scale_caption_pipeline',
+                'image_scale_caption_pipeline_api',
                 'image_visual_only_mcq_pipeline',
+                'image_visual_only_mcq_pipeline_api',
             ],
         },
         {
diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts
index 21bece4d..aa439f3a 100644
--- a/docs/.vuepress/notes/zh/mm_guide.ts
+++ b/docs/.vuepress/notes/zh/mm_guide.ts
@@ -28,11 +28,15 @@ export const MMGuide: ThemeNote = defineNoteConfig({
                 'context_vqa',
                 'context_vqa_api',
                 'image_gcot',
+                'image_gcot_api',
                 'vision_mct_reasoning_pipeline',
+                'vision_mct_reasoning_pipeline_api',
                 'image_region_caption_pipeline',
                 'image_region_caption_pipeline_api',
                 'image_scale_caption_pipeline',
+                'image_scale_caption_pipeline_api',
                 'image_visual_only_mcq_pipeline',
+                'image_visual_only_mcq_pipeline_api',
             ],
         },
         {
diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot.md b/docs/en/notes/mm_guide/image_understanding/image_gcot.md
index 68569d6a..636d7371 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_gcot.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_gcot.md
@@ -411,6 +411,4 @@ if __name__ == "__main__":
         download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
     )
     pipe.forward()
-
-
 ```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
index 1df75290..3499879e 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_gcot_api.md
@@ -1,14 +1,14 @@
 ---
-title: Image Grounded CoT (GCoT) Pipeline
+title: Image Grounded CoT (GCoT) Pipeline (API version)
 icon: mdi:image-text
 createTime: 2026/01/11 20:44:55
-permalink: /en/mm_guide/image_gcot/
+permalink: /en/mm_guide/image_gcot_api/
 ---
 ## 1. Overview
 
 The **Image Grounded Chain-of-Thought (GCoT) Pipeline** is designed to automatically generate **Grounded Chain-of-Thought** data. This pipeline generates multi-step reasoning to answer a question and simultaneously spatially locates (via Bounding Boxes) the key objects mentioned during the reasoning process. This significantly enhances the interpretability and precision of multimodal data.
 
-Unlike traditional methods, this pipeline uses a **Single VLM (e.g., Qwen2.5-VL)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient.
+Unlike traditional methods, this pipeline uses a **Single VLM (e.g., GPT-5)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient.
 
 We support the following application scenarios:
 
@@ -67,7 +67,7 @@ os.environ["DF_API_KEY"] = "your_api_key"
 
 ### Step 5: Configure Parameters
 
-Configure the API service and input data paths in `api_pipelines/image_region_caption_api_pipeline.py`:
+Configure the API service and input data paths in `api_pipelines/image_gcot_api_pipeline.py`:
 
 ```python
     def __init__(
@@ -76,16 +76,20 @@ Configure the API service and input data paths in `api_pipelines/image_region_ca
         first_entry_file: str,
         cache_path: str = "../cache/cache_gcot",
         file_name_prefix: str = "gcot",
-        # Keys
         question_key: str = "question",
         answer_key: str = "answer",
         image_key: str = "image",
         output_key: str = "gcot",
-        # Config
         vllm_max_tokens: int = 512
     ):
 ```
 
+```python
+    pipe = ImageGCoTPipeline(
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+    )
+```
+
 ```python
 self.vlm_serving = APIVLMServing_openai(
             api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
@@ -395,5 +399,4 @@ if __name__ == "__main__":
         first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
     )
     pipe.forward()
-
 ```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
index 8dc770ae..755b4a76 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
@@ -1,72 +1,147 @@
 ---
-title: ScaleCap High-Density Captioning Pipeline
-createTime: 2026/01/11 22:08:57
+title: ScaleCap High-Density Caption Pipeline
 icon: mdi:image-text
+createTime: 2026/01/11 22:08:57
 permalink: /en/mm_guide/image_scale_caption_pipeline/
 ---
+
 ## 1. Overview
 
-The **ScaleCap High-Density Captioning Pipeline** implements an advanced **"Generate-Verify-Expand-Fuse"** paradigm for image captioning. This pipeline is designed to generate **extremely high information density** captions with **minimal hallucinations**, making it ideal for scenarios requiring deep understanding of image details.
+The **Image Scale Caption Pipeline (ScaleCap)** is an advanced image captioning solution based on a **"Generate-Verify-Expand-Integrate"** paradigm. This pipeline is designed to generate image descriptions with **extremely high information density** and **ultra-low hallucination rates**, making it particularly suitable for scenarios requiring deep understanding of image details.
 
-Based on the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*, this method progressively mines object and position details through multi-turn dialogue and visual self-verification (Visual Grounding), filtering out hallucinations along the way.
+The theoretical foundation of this method is derived from the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*. It gradually uncovers object and spatial details through multi-turn dialogue and visual grounding, effectively filtering out hallucinations produced by the model.
 
 We support the following application scenarios:
 
 * **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions.
-* **Fine-Grained Image Retrieval**: Providing index text rich in detail.
-* **Accessibility/Blind Assistance**: Generating "What You See Is What You Get" (WYSIWYG) detailed narrations.
+* **Fine-Grained Image Retrieval**: Providing highly detailed text for indexing.
+* **Blind Assistance / Image Accessibility**: Generating "what-you-see-is-what-you-get" detailed narrations.
 
 The main process of the pipeline includes:
 
-1. **Initial Caption Generation**: VLM generates a baseline description.
-2. **Visual Debiasing**: Splitting the description into sentences and verifying each sentence against visual evidence (Visual Grounding).
-3. **Detail Expansion**: Generating follow-up questions about object attributes and positions based on verified "Golden Sentences".
-4. **Answering & Re-verification**: VLM answers the questions and performs another round of visual grounding to filter incorrect details.
-5. **Final Fusion**: Merging all verified information into a coherent, long description.
+1. **Initial Caption Generation**: The VLM generates a basic description.
+2. **Visual Debiasing**: The description is split into sentences, and each is verified against visual evidence (Visual Grounding).
+3. **Detail Questioning**: Targeted questions regarding object attributes and spatial relations are generated based on the verified "Golden Sentences".
+4. **Answering & Secondary Verification**: The VLM answers the detail questions, followed by another round of visual grounding to filter out incorrect details.
+5. **Final Integration**: All verified information is woven into a coherent, comprehensive long caption.
 
 ---
 
 ## 2. Quick Start
 
-### Step 1: Create a Working Directory
+### Step 1: Create a New DataFlow Working Directory
 
 ```bash
-mkdir run_scalecap
-cd run_scalecap
+mkdir run_dataflow
+cd run_dataflow
 
 ```
 
-### Step 2: Prepare the Script
+### Step 2: Initialize DataFlow-MM
 
-Save the code in the "Pipeline Example" section below as `scalecap_pipeline.py`.
+```bash
+dataflowmm init
 
-### Step 3: Configure Parameters
+```
 
-Ensure the VLM model path (e.g., Qwen2.5-VL) is correct.
+You will then see:
 
 ```bash
-# Install dependencies
-pip install open-dataflow vllm
+gpu_pipelines/image_scale_caption_pipeline.py
 
 ```
 
-### Step 4: Run
+### Step 3: Download Sample Data
 
 ```bash
-python scalecap_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_jsonl "data/images.jsonl" \
-  --output_key "final_caption"
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 
 ```
 
+### Step 4: Configure Parameters
+
+```python
+if __name__ == "__main__":
+    pipe = ImageScaleCaptionPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        device="cuda",
+        first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path="../cache/image_scale_caption",
+        file_name_prefix="scalecap",
+        input_image_key="image",
+        output_key="final_caption",
+        vllm_tensor_parallel_size=1,
+        vllm_max_tokens=1024
+    )
+    pipe.forward()
+
+```
+
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading.
+> 
+> 
+
+### Step 5: Run
+
+```bash
+cd gpu_pipelines
+python image_scale_caption_pipeline.py
+
+```
+
+> **🛠️ Troubleshooting**
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_scale_caption_pipeline.py
+> 
+> ```
+> 
+> 
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+> **After modification:**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+
 ---
 
 ## 3. Data Flow & Logic
 
 ### 1. **Input Data**
 
-The input data requires only the image path:
+The input data for this process is very simple, requiring only the image path:
 
 * **image**: Path to the image file.
 
@@ -74,69 +149,69 @@ The input data requires only the image path:
 
 ```json
 {
-    "image": "./images/complex_scene.jpg"
+    "image": "../example_data/capsbench_images/0.png"
 }
 
 ```
 
 ### 2. **Core Operator Logic**
 
-This pipeline is a complex orchestration of multiple atomic operators:
+This pipeline orchestrates multiple fine-grained operators to achieve the complex ScaleCap logic:
 
 #### A. **Initial Generation (PromptedVQAGenerator)**
 
-* **Function**: Generates a preliminary description (`init_caption`) of the image using a basic prompt.
+* **Function**: Uses a basic prompt to generate a preliminary description of the image (`init_caption`).
 
 #### B. **Visual Debiasing (VisualGroundingRefiner)**
 
 * **Function**: The core anti-hallucination mechanism of ScaleCap.
 * **Logic**:
-1. Uses `split_sentences` to break the draft into single sentences.
+1. Uses `split_sentences` to break the initial draft into single sentences.
 2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?".
-3. Keeps only sentences where the answer is "Yes", forming **"Golden Sentences"**.
+3. Retains only the sentences that receive a "Yes", forming **"Golden Sentences"**.
 
 
 
 #### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)**
 
-* **Function**: Generates targeted follow-up questions based on Golden Sentences using LLM capabilities.
-* **Logic**: The model generates text like "Describe more details about the [Object]", which is then automatically expanded into **Object Detail** and **Positional Relation** questions via `parse_questions_logic`.
+* **Function**: Uses LLM capabilities to generate targeted follow-up questions based on the Golden Sentences.
+* **Logic**: The model generates text like "Describe more details about the [Object]". The `parse_questions_logic` function automatically expands these into two categories: **object details** and **spatial relationships**.
 
-#### D. **Batch Answering & Refiltering (BatchVQAGenerator & Refiner)**
+#### D. **Batch Answering & Secondary Filtering (BatchVQAGenerator & Refiner)**
 
-* **Function**: Mining deep image information.
+* **Function**: Deeply mines visual information.
 * **Logic**:
-1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a batch.
-2. Uses `VisualGroundingRefiner` again to check if these new details are accurate.
+1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a single batch.
+2. Uses `VisualGroundingRefiner` again to verify if these newly generated details are accurate.
 3. Retains reliable details (`final_details`).
 
 
 
-#### E. **Final Fusion (PromptTemplatedQAGenerator)**
+#### E. **Final Integration (PromptTemplatedQAGenerator)**
 
-* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent text.
+* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent, cohesive text.
 * **Output**: `final_caption`.
 
 ### 3. **Output Data**
 
-The output data records the entire pipeline process, facilitating debugging and analysis:
+The output data records the entire pipeline process for easy debugging and analysis:
 
-* **init_caption**: Raw generated draft.
-* **golden_sentences**: List of sentences that passed the first check.
+* **init_caption**: The original initial draft.
+* **golden_sentences**: List of sentences that passed the first debiasing check.
 * **q_list**: List of generated follow-up questions.
-* **final_details**: Detailed answers that passed the second check.
+* **final_details**: Detailed answers that passed the secondary check.
 * **final_caption**: The final high-density description.
 
 **Output Data Example**:
 
 ```json
 {
-    "image": "./images/complex_scene.jpg",
+    "image": "../example_data/capsbench_images/0.png",
     "init_caption": "A dog sitting on a bench.",
     "golden_sentences": ["A dog is sitting on a wooden bench."],
-    "q_list": ["Describe more details about the dog.", "Describe position of the bench."],
+    "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."],
     "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
-    "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..."
+    "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
 }
 
 ```
@@ -145,7 +220,7 @@ The output data records the entire pipeline process, facilitating debugging and
 
 ## 4. Pipeline Example
 
-Below is the complete `ImageScaleCaptionPipeline` code implementation.
+Below is the complete `ImageScaleCaptionPipeline` code implementation (GPU Version).
 
 ```python
 import re
@@ -153,12 +228,79 @@ import argparse
 from typing import Callable, Any, List
 
 from dataflow.utils.storage import FileStorage
+
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
 from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
 from dataflow.prompts.image import ImageScaleCaptionPrompt
+
 from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
 from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
 
+
+def split_sentences(text: str) -> List[str]:
+    """Split text into a list of sentences."""
+    if not text or not isinstance(text, str):
+        return []
+    # Split on whitespace that follows sentence-ending punctuation (. ! ? 。 ！ ？)
+    _SENT_SPLIT = re.compile(r"(?<=[.!?。！？])\s+")
+    parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+    return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+    """Join a list into a single string."""
+    if isinstance(data, list):
+        # Drop falsy items (None, empty strings) and convert the rest to str
+        valid_items = [str(x) for x in data if x]
+        return separator.join(valid_items)
+    return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+    """
+    Parse the LLM-generated "Describe more details about..." text
+    and automatically add the matching position questions.
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    lines = [t.strip() for t in text.split("\n") if t.strip()]
+    obj_qs = []
+    
+    for line in lines:
+        # Keep only lines that contain "Describe more details about"
+        if "Describe more details about" in line:
+            # Strip any leading numbering (e.g. "1. Describe...")
+            try:
+                start_idx = line.find("Describe")
+                clean = line[start_idx:]
+                # Drop trailing text, keeping everything up to the first period
+                if "." in clean:
+                    clean = clean.split(".")[0] + "."
+                obj_qs.append(clean)
+            except Exception:
+                continue
+    
+    # Deduplicate while preserving order
+    seen = set()
+    unique_obj_qs = []
+    for q in obj_qs:
+        if q not in seen:
+            unique_obj_qs.append(q)
+            seen.add(q)
+    
+    # Truncate to at most max_q questions
+    unique_obj_qs = unique_obj_qs[:max_q]
+    
+    # Derive a position question from each object question
+    pos_qs = [
+        q.replace("Describe more details about", "Describe more details about the position of")
+        for q in unique_obj_qs
+    ]
+    
+    # Return the combined list (object questions + position questions)
+    return unique_obj_qs + pos_qs
+
+
 class ImageScaleCaptionPipeline:
     def __init__(
         self,
@@ -210,14 +352,19 @@ class ImageScaleCaptionPipeline:
         # ================== Operator Initialization ==================
 
         # --- Step A: Generate Init Caption ---
+        # Build a constant prompt column
         self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+        
+        # Generate the initial draft (using the generic PromptedVQAGenerator)
         self.gen_init_caption = PromptedVQAGenerator(
             serving=self.serving,
             system_prompt="You are a helpful assistant."
         )
 
         # --- Step B: Refine Golden Sentences ---
+        # Split into sentences
         self.refine_split = FunctionalRefiner(func=split_sentences)
+        
         # 视觉自检 (保留 Yes 的句子)
         self.refine_golden = VisualGroundingRefiner(
             serving=self.serving,
@@ -225,7 +372,10 @@ class ImageScaleCaptionPipeline:
         )
 
         # --- Step C: Generate Questions ---
+        # Convert list to string
         self.refine_join = FunctionalRefiner(func=join_list)
+        
+        # Generate questions from text (Text-to-Text)
         tpl_q = NamedPlaceholderPromptTemplate(
             template=self.prompts_db["LLM_PROMPT_1"], 
             join_list_with="\n"
@@ -234,16 +384,22 @@ class ImageScaleCaptionPipeline:
             serving=self.serving,
             prompt_template=tpl_q
         )
+        
+        # Parse the question text into a list
         self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
 
         # --- Step D: Generate Answers ---
+        # Batch answering (One Image -> Many Qs)
         self.gen_answers = BatchVQAGenerator(serving=self.serving)
+        
+        # Filter the answers
         self.refine_answers = VisualGroundingRefiner(
             serving=self.serving,
             prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
         )
 
         # --- Step E: Integrate Final Caption ---
+        # Integrate (Text-to-Text)
         tpl_final = NamedPlaceholderPromptTemplate(
             template=self.prompts_db["LLM_PROMPT_4"], 
             join_list_with="\n"
@@ -255,6 +411,7 @@ class ImageScaleCaptionPipeline:
 
     def forward(self):
         print(">>> [Pipeline] Step 0: Preparing Prompts...")
+        # Build the init_prompt column
         self.refine_const_prompt.run(
             self.storage.step(), 
             output_key="init_prompt"
@@ -287,11 +444,14 @@ class ImageScaleCaptionPipeline:
             output_key="golden_str", 
             data="golden_sentences"
         )
+        
+        # template: "{sentence}" -> map to col "golden_str"
         self.gen_questions_text.run(
             self.storage.step(), 
             output_answer_key="raw_q_text", 
             sentence="golden_str"
         )
+        
         self.refine_parse_qs.run(
             self.storage.step(), 
             output_key="q_list", 
@@ -305,6 +465,7 @@ class ImageScaleCaptionPipeline:
             input_image_key=self.input_image_key, 
             output_key="raw_answers"
         )
+        
         self.refine_answers.run(
             self.storage.step(), 
             input_list_key="raw_answers", 
@@ -318,48 +479,35 @@ class ImageScaleCaptionPipeline:
             output_key="details_str", 
             data="final_details"
         )
+        
+        # template keys: context, object_info, position_info
         self.gen_final_caption.run(
             self.storage.step(),
             output_answer_key=self.output_key,
             context="golden_str",
             object_info="details_str",
-            position_info="details_str"
+            position_info="details_str" # Simplification: reused as both object and position info
         )
 
         print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline")
-    
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-    parser.add_argument("--download_dir", default="./ckpt/models")
-    parser.add_argument("--device", default="cuda")
-
-    parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
-    parser.add_argument("--cache_path", default="./cache_scalecap_results")
-    parser.add_argument("--file_name_prefix", default="scalecap")
-    parser.add_argument("--input_image_key", default="image")
-    parser.add_argument("--output_key", default="final_caption")
-
-    parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--max_tokens", type=int, default=1024)
-
-    args = parser.parse_args()
-
     pipe = ImageScaleCaptionPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        device=args.device,
-        first_entry_file=args.input_jsonl,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        input_image_key=args.input_image_key,
-        output_key=args.output_key,
-        vllm_tensor_parallel_size=args.tp,
-        vllm_max_tokens=args.max_tokens
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        device="cuda",
+        
+        first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path="../cache/image_scale_caption",
+        file_name_prefix="scalecap",
+        
+        input_image_key="image",
+        output_key="final_caption",
+        
+        vllm_tensor_parallel_size=1,
+        vllm_max_tokens=1024
     )
     
     pipe.forward()
diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
new file mode 100644
index 00000000..87ad3fa7
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
@@ -0,0 +1,477 @@
+---
+title: ScaleCap High-Density Caption Pipeline (API version)
+icon: mdi:image-text
+createTime: 2026/01/11 22:08:57
+permalink: /en/mm_guide/image_scale_caption_pipeline_api/
+---
+
+## 1. Overview
+
+The **Image Scale Caption Pipeline (ScaleCap)** is an advanced image captioning solution based on a **"Generate-Verify-Expand-Integrate"** paradigm. This pipeline is designed to generate image descriptions with **extremely high information density** and **ultra-low hallucination rates**, making it particularly suitable for scenarios requiring deep understanding of image details.
+
+The theoretical foundation of this method is derived from the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*. It gradually uncovers object and spatial details through multi-turn dialogue and visual grounding, effectively filtering out hallucinations produced by the model.
+
+We support the following application scenarios:
+
+* **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions.
+* **Fine-Grained Image Retrieval**: Providing highly detailed text for indexing.
+* **Blind Assistance / Image Accessibility**: Generating "what-you-see-is-what-you-get" detailed narrations.
+
+The main process of the pipeline includes:
+
+1. **Initial Caption Generation**: The VLM generates a basic description.
+2. **Visual Debiasing**: The description is split into sentences, and each is verified against visual evidence (Visual Grounding).
+3. **Detail Questioning**: Targeted questions regarding object attributes and spatial relations are generated based on the verified "Golden Sentences".
+4. **Answering & Secondary Verification**: The VLM answers the detail questions, followed by another round of visual grounding to filter out incorrect details.
+5. **Final Integration**: All verified information is woven into a coherent, comprehensive long caption.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_dataflow
+cd run_dataflow
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/image_scale_caption_api_pipeline.py
+
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/image_scale_caption_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### Step 5: Configure Parameters
+
+Configure the API service and input data paths in `api_pipelines/image_scale_caption_api_pipeline.py`:
+
+```python
+    def __init__(
+        self,
+        # Storage params
+        first_entry_file: str = "../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path: str = "../cache/image_scale_caption",
+        file_name_prefix: str = "scalecap",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_caption",
+    ):
+
+```
+
+```python
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+
+```
+
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python image_scale_caption_api_pipeline.py
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data for this process is very simple, requiring only the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+    "image": "../example_data/capsbench_images/0.png"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline orchestrates multiple fine-grained operators to achieve the complex ScaleCap logic:
+
+#### A. **Initial Generation (PromptedVQAGenerator)**
+
+* **Function**: Uses a basic prompt to generate a preliminary description of the image (`init_caption`).
+
+#### B. **Visual Debiasing (VisualGroundingRefiner)**
+
+* **Function**: The core anti-hallucination mechanism of ScaleCap.
+* **Logic**:
+1. Uses `split_sentences` to break the initial draft into single sentences.
+2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?".
+3. Retains only the sentences that receive a "Yes", forming **"Golden Sentences"**.
+
+
+
+#### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)**
+
+* **Function**: Uses LLM capabilities to generate targeted follow-up questions based on the Golden Sentences.
+* **Logic**: The model generates text like "Describe more details about the [Object]". The `parse_questions_logic` function automatically expands these into two categories: **object details** and **spatial relationships**.
+
+#### D. **Batch Answering & Secondary Filtering (BatchVQAGenerator & Refiner)**
+
+* **Function**: Deeply mines visual information.
+* **Logic**:
+1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a single batch.
+2. Uses `VisualGroundingRefiner` again to verify if these newly generated details are accurate.
+3. Retains reliable details (`final_details`).
+
+
+
+#### E. **Final Integration (PromptTemplatedQAGenerator)**
+
+* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent, cohesive text.
+* **Output**: `final_caption`.
+
+### 3. **Output Data**
+
+The output data records the entire pipeline process for easy debugging and analysis:
+
+* **init_caption**: The original initial draft.
+* **golden_sentences**: List of sentences that passed the first debiasing check.
+* **q_list**: List of generated follow-up questions.
+* **final_details**: Detailed answers that passed the secondary check.
+* **final_caption**: The final high-density description.
+
+**Output Data Example**:
+
+```json
+{
+    "image": "../example_data/capsbench_images/0.png",
+    "init_caption": "A dog sitting on a bench.",
+    "golden_sentences": ["A dog is sitting on a wooden bench."],
+    "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."],
+    "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
+    "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageScaleCaptionPipeline` code implementation (API Version).
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+
+import re
+import argparse
+from typing import Callable, Any, List
+
+from dataflow.utils.storage import FileStorage
+
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.prompts.image import ImageScaleCaptionPrompt
+
+from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
+from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+def split_sentences(text: str) -> List[str]:
+    """Split text into a list of sentences."""
+    if not text or not isinstance(text, str):
+        return []
+    # Split on whitespace that follows sentence-ending punctuation (. ! ? 。 ！ ？)
+    _SENT_SPLIT = re.compile(r"(?<=[.!?。！？])\s+")
+    parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+    return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+    """Join a list into a single string."""
+    if isinstance(data, list):
+        # Drop falsy items (None, empty strings) and convert the rest to str
+        valid_items = [str(x) for x in data if x]
+        return separator.join(valid_items)
+    return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+    """
+    Parse the LLM-generated "Describe more details about..." text
+    and automatically add the matching position questions.
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    lines = [t.strip() for t in text.split("\n") if t.strip()]
+    obj_qs = []
+    
+    for line in lines:
+        # Keep only lines that contain "Describe more details about"
+        if "Describe more details about" in line:
+            # Strip any leading numbering (e.g. "1. Describe...")
+            try:
+                start_idx = line.find("Describe")
+                clean = line[start_idx:]
+                # Drop trailing text, keeping everything up to the first period
+                if "." in clean:
+                    clean = clean.split(".")[0] + "."
+                obj_qs.append(clean)
+            except Exception:
+                continue
+    
+    # Deduplicate while preserving order
+    seen = set()
+    unique_obj_qs = []
+    for q in obj_qs:
+        if q not in seen:
+            unique_obj_qs.append(q)
+            seen.add(q)
+    
+    # Truncate to at most max_q questions
+    unique_obj_qs = unique_obj_qs[:max_q]
+    
+    # Derive a position question from each object question
+    pos_qs = [
+        q.replace("Describe more details about", "Describe more details about the position of")
+        for q in unique_obj_qs
+    ]
+    
+    # Return the combined list (object questions + position questions)
+    return unique_obj_qs + pos_qs
+
+
+class ImageScaleCaptionPipeline:
+    def __init__(
+        self,
+        # Storage params
+        first_entry_file: str = "images.jsonl",
+        cache_path: str = "./cache_scalecap",
+        file_name_prefix: str = "scalecap",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_caption",
+        # VLLM Config
+        vllm_tensor_parallel_size: int = 1,
+        vllm_temperature: float = 0.7,
+        vllm_top_p: float = 0.9,
+        vllm_max_tokens: int = 512,
+    ):
+        # 1. Storage
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type,
+        )
+
+        # 2. Serving
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+
+        # 3. Prompts
+        self.prompts_db = ImageScaleCaptionPrompt().build_prompt()
+
+        # 4. Keys
+        self.input_image_key = input_image_key
+        self.output_key = output_key
+
+        # ================== Operator Initialization ==================
+
+        # --- Step A: Generate Init Caption ---
+        # Build a constant prompt column
+        self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+        
+        # Generate the initial draft (using the generic PromptedVQAGenerator)
+        self.gen_init_caption = PromptedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt="You are a helpful assistant."
+        )
+
+        # --- Step B: Refine Golden Sentences ---
+        # Split into sentences
+        self.refine_split = FunctionalRefiner(func=split_sentences)
+        
+        # Visual self-check (keep only sentences answered "Yes")
+        self.refine_golden = VisualGroundingRefiner(
+            serving=self.vlm_serving,
+            prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no."
+        )
+
+        # --- Step C: Generate Questions ---
+        # Convert the list into a string
+        self.refine_join = FunctionalRefiner(func=join_list)
+        
+        # Generate questions from text (Text-to-Text)
+        tpl_q = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_1"], 
+            join_list_with="\n"
+        )
+        self.gen_questions_text = PromptTemplatedQAGenerator(
+            serving=self.vlm_serving,
+            prompt_template=tpl_q
+        )
+        
+        # Parse the question text into a list
+        self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
+
+        # --- Step D: Generate Answers ---
+        # Batch answering (One Image -> Many Qs)
+        self.gen_answers = BatchVQAGenerator(serving=self.vlm_serving)
+        
+        # Filter the answers
+        self.refine_answers = VisualGroundingRefiner(
+            serving=self.vlm_serving,
+            prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
+        )
+
+        # --- Step E: Integrate Final Caption ---
+        # Fusion (Text-to-Text)
+        tpl_final = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_4"], 
+            join_list_with="\n"
+        )
+        self.gen_final_caption = PromptTemplatedQAGenerator(
+            serving=self.vlm_serving,
+            prompt_template=tpl_final
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 0: Preparing Prompts...")
+        # Build the init_prompt column
+        self.refine_const_prompt.run(
+            self.storage.step(), 
+            output_key="init_prompt"
+        )
+
+        print(">>> [Pipeline] Step 1: Generating Initial Caption...")
+        self.gen_init_caption.run(
+            self.storage.step(),
+            input_prompt_key="init_prompt",
+            input_image_key=self.input_image_key,
+            output_answer_key="init_caption"
+        )
+
+        print(">>> [Pipeline] Step 2: Refining Golden Sentences...")
+        self.refine_split.run(
+            self.storage.step(), 
+            output_key="sentences", 
+            text="init_caption"
+        )
+        self.refine_golden.run(
+            self.storage.step(), 
+            input_list_key="sentences", 
+            input_image_key=self.input_image_key, 
+            output_key="golden_sentences"
+        )
+
+        print(">>> [Pipeline] Step 3: Generating Details Questions...")
+        self.refine_join.run(
+            self.storage.step(), 
+            output_key="golden_str", 
+            data="golden_sentences"
+        )
+        
+        # template: "{sentence}" -> map to col "golden_str"
+        self.gen_questions_text.run(
+            self.storage.step(), 
+            output_answer_key="raw_q_text", 
+            sentence="golden_str"
+        )
+        
+        self.refine_parse_qs.run(
+            self.storage.step(), 
+            output_key="q_list", 
+            text="raw_q_text"
+        )
+
+        print(">>> [Pipeline] Step 4: Generating & Filtering Answers...")
+        self.gen_answers.run(
+            self.storage.step(), 
+            input_prompts_key="q_list", 
+            input_image_key=self.input_image_key, 
+            output_key="raw_answers"
+        )
+        
+        self.refine_answers.run(
+            self.storage.step(), 
+            input_list_key="raw_answers", 
+            input_image_key=self.input_image_key, 
+            output_key="final_details"
+        )
+
+        print(">>> [Pipeline] Step 5: Integrating Final Caption...")
+        self.refine_join.run(
+            self.storage.step(), 
+            output_key="details_str", 
+            data="final_details"
+        )
+        
+        # template keys: context, object_info, position_info
+        self.gen_final_caption.run(
+            self.storage.step(),
+            output_answer_key=self.output_key,
+            context="golden_str",
+            object_info="details_str",
+            position_info="details_str" # Simplification: used as both object and position info
+        )
+
+        print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
+
+
+if __name__ == "__main__":
+
+    pipe = ImageScaleCaptionPipeline( 
+        first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path="../cache/image_scale_caption",
+        file_name_prefix="scalecap",
+        input_image_key="image",
+        output_key="final_caption",
+        vllm_tensor_parallel_size=1,
+        vllm_max_tokens=1024
+    )
+    
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
index cc3806af..4e495489 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
@@ -4,27 +4,26 @@ createTime: 2026/01/11 22:13:45
 icon: mdi:image-text
 permalink: /en/mm_guide/image_visual_only_mcq_pipeline/
 ---
+
 ## 1. Overview
 
-The **Visual-Only MCQ Pipeline** is a core component of the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple Choice Questions (MCQs) that satisfy **strict visual dependency**: the model must "see" the image to answer correctly; answering based on text alone (guessing or common sense) is not possible.
+The **Visual-Only MCQ Pipeline** is a core component within the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple-Choice Questions (MCQs) that strictly satisfy **strong visual dependency**: the model must "see" the image to answer correctly, and cannot rely merely on text guessing or common sense.
 
-This pipeline uses a **Generate-Parse-Verify** three-step method, leveraging **Option Rotation** and **Blind Tests** to rigorously filter out hallucinations or overly simple questions. The generated questions serve as a robust reward signal for Reinforcement Learning.
+This pipeline utilizes a **"Generate-Parse-Verify"** three-step approach, employing **Option Rotation** and **Blind Test (Text-Only)** mechanisms to rigorously filter out model hallucinations or overly simple questions. The generated questions can be used as reward signals (Reward Model) for reinforcement learning.
 
 The main process includes:
 
-1. **MCQ Generation**: VLM generates raw QA pairs based on the image.
-2. **Structured Parsing**: Using regex logic to parse text into standard question/option structures.
+1. **MCQ Generation**: The VLM generates raw Question-Answer text blocks based on the image.
+2. **Structured Parsing**: Uses regex logic to parse the raw text into standard question and option structures.
 3. **Visual Dependency Verification**:
-* **Rotation Test**: Shuffling options multiple times to eliminate positional bias.
-* **Dual Filtering**: Requiring high "Visual Accuracy" and low "Text-only Accuracy".
-
-
+   * **Rotation Test**: Randomly shuffles the order of options multiple times to eliminate positional bias.
+   * **Dual Filtering**: Requires a high "Visual Accuracy" (with image) and a low "Textual Accuracy" (without image).
 
 ---
 
 ## 2. Quick Start
 
-### Step 1: Create Working Directory
+### Step 1: Create a New DataFlow Working Directory
 
 ```bash
 mkdir run_vis_mcq
@@ -32,39 +31,109 @@ cd run_vis_mcq
 
 ```
 
-### Step 2: Prepare Script
+### Step 2: Initialize DataFlow-MM
 
-Save the code in the "Pipeline Example" section below as `visual_mcq_pipeline.py`.
+```bash
+dataflowmm init
 
-### Step 3: Configure Parameters
+```
 
-Control filtering thresholds via CLI. For example, requiring 100% visual accuracy and less than 25% blind accuracy:
+You will then see:
 
 ```bash
-# Install dependencies
-pip install open-dataflow vllm
+gpu_pipelines/image_visual_only_mcq_pipeline.py
 
 ```
 
-### Step 4: Run
+### Step 3: Download Sample Data
 
 ```bash
-python visual_mcq_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_file "data/captions.jsonl" \
-  --rotate_num 4 \
-  --pass_vis 1.0 \
-  --pass_txt 0.25
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure Parameters
+
+Configure the model path and filtering thresholds (e.g., requiring 100% visual accuracy and less than 25% textual accuracy):
+
+```python
+if __name__ == "__main__":
+    pipe = VisualOnlyMCQPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
+    )
+    pipe.forward()
 
 ```
 
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading.
+> 
+> 
+
+### Step 5: Run
+
+```bash
+cd gpu_pipelines
+python image_visual_only_mcq_pipeline.py
+
+```
+
+> **🛠️ Troubleshooting**
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_visual_only_mcq_pipeline.py
+> 
+> ```
+> 
+> 
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+> **After modification:**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+
 ---
 
 ## 3. Data Flow & Logic
 
 ### 1. **Input Data**
 
-Input only requires the image path:
+The input data only requires the image path:
 
 * **image**: Path to the image file.
 
@@ -79,35 +148,35 @@ Input only requires the image path:
 
 ### 2. **Core Operator Logic**
 
-This pipeline chains three key operators:
+This pipeline is chained together by three key operators:
 
-#### A. **FixPromptedVQAGenerator (Raw Generation)**
+#### A. **Raw Generation (FixPromptedVQAGenerator)**
 
-* **Function**: Uses CapRL predefined Prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to generate 5 MCQs at once.
-* **Output**: Unstructured text block containing multiple `#### Question` and options.
+* **Function**: Uses the preset CapRL prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to instruct the VLM to generate 5 MCQs in one go.
+* **Output**: Unstructured text blocks containing multiple `#### Question` headers and options.
 
-#### B. **FunctionalRefiner (Regex Parsing)**
+#### B. **Structured Parsing (FunctionalRefiner)**
 
 * **Logic Function**: `parse_mcq_text_logic`
-* **Function**: Extracts questions, options (A-F), and correct answers from raw text using regex.
-* **Output**: Structured MCQ list (`parsed_mcq_list`).
+* **Function**: Extracts the questions, options (A-F), and correct answers from the raw text using regular expressions.
+* **Output**: A structured list of MCQs (`parsed_mcq_list`).
 
-#### C. **VisualDependencyRefiner (Dependency Verification)**
+#### C. **Dependency Verification (VisualDependencyRefiner)**
 
-This is the core filter. It performs N inferences (N = `rotate_num`) for each question:
+This is the core filter of the pipeline. It performs N inferences (N = `rotate_num`) for each question:
 
-1. **Option Rotation**: Randomly shuffles options (e.g., moving answer from A to C) to prevent the model from cheating by "always picking A".
-2. **Visual Pass**: Input Image + Question. Records the model's accuracy.
-3. **Textual Pass**: Input Question only (no image). Records the model's blind guessing accuracy.
+1. **Option Rotation**: Randomly shuffles the option order (e.g., moving the answer from A to C) to prevent the model from cheating by "always choosing A".
+2. **Visual Pass**: Inputs Image + Question. Records the proportion of correct answers.
+3. **Textual Pass (Blind Test)**: Inputs Question only (No Image). Records the proportion of correct blind guesses.
 4. **Filtering Criteria**:
-* Keep the question IF AND ONLY IF: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
-* *Example*: If a question can be answered correctly without the image (high text accuracy), it tests common sense rather than vision, so it is **discarded**.
+   * Retains the question if and only if: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
+   * *Example*: If a question can be answered correctly without looking at the image (high textual accuracy), it relies on common sense rather than visual info, and is **discarded**.
 
 
 
 ### 3. **Output Data**
 
-The output data (`final_mcqs`) contains only questions that passed rigorous verification. These questions possess high quality and visual relevance.
+The output data (`final_mcqs`) only contains questions that have passed the rigorous verification. These questions possess extremely high quality and visual relevance.
 
 **Output Data Example**:
 
@@ -119,8 +188,8 @@ The output data (`final_mcqs`) contains only questions that passed rigorous veri
             "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
             "answer": "A",
             "stats": {
-                "visual_acc": 1.0,  # 4/4 correct with image
-                "text_acc": 0.0     # 0/4 correct without image
+                "visual_acc": 1.0,  
+                "text_acc": 0.0     
             }
         }
     ]
@@ -132,12 +201,10 @@ The output data (`final_mcqs`) contains only questions that passed rigorous veri
 
 ## 4. Pipeline Example
 
-Below is the complete `VisualOnlyMCQPipeline` code implementation.
+Below is the complete `VisualOnlyMCQPipeline` code implementation (GPU Version).
 
 ```python
 import argparse
-import re
-from typing import List, Dict, Any
 from dataflow.utils.storage import FileStorage
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
 
@@ -145,13 +212,14 @@ from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDepend
 from dataflow.operators.core_text import FunctionalRefiner
 from dataflow.prompts.image import ImageCaprlPrompt
 
-# 正则解析逻辑
+import re
+from typing import List, Dict, Any
+
 _Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
 _OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
 _ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
 
 def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
-    """将 VLM 生成的原始文本解析为结构化字典列表"""
     if not mcq_text or not isinstance(mcq_text, str): return []
     
     indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
@@ -213,7 +281,9 @@ class VisualOnlyMCQPipeline:
         model_path: str,
         *,
         first_entry_file: str,
-        cache_path: str = "./cache_mcq",
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
+        cache_path: str = "../cache/cache_mcq",
         file_name_prefix: str = "vis_mcq",
         # Config
         rotate_num: int = 4,
@@ -227,7 +297,6 @@ class VisualOnlyMCQPipeline:
         device: str = "cuda",
         vllm_max_tokens: int = 2048
     ):
-        # 1. 初始化存储
         self.storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
@@ -235,15 +304,16 @@ class VisualOnlyMCQPipeline:
             cache_type="jsonl"
         )
         
-        # 2. 初始化 VLM 服务
         self.serving = LocalModelVLMServing_vllm(
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
             hf_model_name_or_path=model_path,
             vllm_tensor_parallel_size=1,
-            vllm_temperature=0.1,  # 低温度以保证格式稳定
+            vllm_temperature=0.1, 
             vllm_max_tokens=vllm_max_tokens
         )
         
-        # Keys 配置
+        # Keys
         self.keys = {
             "img": input_image_key,
             "raw_text": "raw_mcq_text",
@@ -251,23 +321,24 @@ class VisualOnlyMCQPipeline:
             "final": output_key
         }
         
-        # 加载 Prompt 库
+        # --- Prompts ---
         self.prompts_db = ImageCaprlPrompt().build_prompt()
 
-        # ================== 算子初始化 ==================
+        # ================== Operators ==================
         
-        # 算子 1: 生成原始 MCQ 文本
+        # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+        # Use the strings from the prompt class directly
         self.op_gen_raw = FixPromptedVQAGenerator(
             serving=self.serving,
             system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
             user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
         )
         
-        # 算子 2: 解析文本为结构化数据
+        # 2. Parse MCQs (Refine)
         self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
         
-        # 算子 3: 视觉依赖性验证 (核心过滤)
-        # 包含旋转 (Rotation) 和 无图检测 (Text-only check)
+        # 3. Verify Visual Dependency (Refine)
+        # Pass in the prompt template
         self.op_verify = VisualDependencyRefiner(
             serving=self.serving,
             instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
@@ -304,22 +375,15 @@ class VisualOnlyMCQPipeline:
         print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--rotate_num", type=int, default=4)
-    parser.add_argument("--pass_vis", type=float, default=1.0)
-    parser.add_argument("--pass_txt", type=float, default=0.25)
-    
-    args = parser.parse_args()
-    
     pipe = VisualOnlyMCQPipeline(
-        model_path=args.model_path,
-        first_entry_file=args.input_file,
-        rotate_num=args.rotate_num,
-        pass_visual_min=args.pass_vis,
-        pass_textual_max=args.pass_txt
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
     )
     pipe.forward()
 
-```
\ No newline at end of file
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
new file mode 100644
index 00000000..054ade54
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
@@ -0,0 +1,341 @@
+---
+title: Visual-Only MCQ Pipeline (API version)
+createTime: 2026/01/11 22:13:45
+icon: mdi:image-text
+permalink: /en/mm_guide/image_visual_only_mcq_pipeline_api/
+---
+
+## 1. Overview
+
+The **Visual-Only MCQ Pipeline** is a core component within the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple-Choice Questions (MCQs) that strictly satisfy **strong visual dependency**: the model must "see" the image to answer correctly, and cannot rely merely on text guessing or common sense.
+
+This pipeline utilizes a **"Generate-Parse-Verify"** three-step approach, employing **Option Rotation** and **Blind Test (Text-Only)** mechanisms to rigorously filter out model hallucinations or overly simple questions. The generated questions can be used as reward signals (Reward Model) for reinforcement learning.
+
+The main process includes:
+
+1. **MCQ Generation**: The VLM generates raw Question-Answer text blocks based on the image.
+2. **Structured Parsing**: Uses regex logic to parse the raw text into standard question and option structures.
+3. **Visual Dependency Verification**:
+   * **Rotation Test**: Randomly shuffles the order of options multiple times to eliminate positional bias.
+   * **Dual Filtering**: Requires a high "Visual Accuracy" (with image) and a low "Textual Accuracy" (without image).
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_vis_mcq
+cd run_vis_mcq
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/image_visual_only_mcq_api_pipeline.py
+
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/image_visual_only_mcq_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### Step 5: Configure Parameters
+
+Configure the API service and run parameters in `api_pipelines/image_visual_only_mcq_api_pipeline.py` (e.g., requiring 100% visual accuracy and less than 25% textual accuracy):
+
+```python
+    pipe = VisualOnlyMCQPipeline(
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
+    )
+
+```
+
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python image_visual_only_mcq_api_pipeline.py
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data only requires the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+    "image": "./images/sample_01.jpg"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline is chained together by three key operators:
+
+#### A. **Raw Generation (FixPromptedVQAGenerator)**
+
+* **Function**: Uses the preset CapRL prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to instruct the VLM to generate 5 MCQs in one go.
+* **Output**: Unstructured text blocks containing multiple `#### Question` headers and options.
+
+#### B. **Structured Parsing (FunctionalRefiner)**
+
+* **Logic Function**: `parse_mcq_text_logic`
+* **Function**: Extracts the questions, options (A-F), and correct answers from the raw text using regular expressions.
+* **Output**: A structured list of MCQs (`parsed_mcq_list`).
+
+#### C. **Dependency Verification (VisualDependencyRefiner)**
+
+This is the core filter of the pipeline. It performs N inferences (N = `rotate_num`) for each question:
+
+1. **Option Rotation**: Randomly shuffles the option order (e.g., moving the answer from A to C) to prevent the model from cheating by "always choosing A".
+2. **Visual Pass**: Inputs Image + Question. Records the proportion of correct answers.
+3. **Textual Pass (Blind Test)**: Inputs Question only (No Image). Records the proportion of correct blind guesses.
+4. **Filtering Criteria**:
+   * Retains the question if and only if: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
+   * *Example*: If a question can be answered correctly without looking at the image (high textual accuracy), it relies on common sense rather than visual info, and is **discarded**.
+
+
+
+### 3. **Output Data**
+
+The output data (`final_mcqs`) only contains questions that have passed the rigorous verification. These questions possess extremely high quality and visual relevance.
+
+**Output Data Example**:
+
+```json
+{
+    "image": "./images/sample_01.jpg",
+    "final_mcqs": [
+        {
+            "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
+            "answer": "A",
+            "stats": {
+                "visual_acc": 1.0,  
+                "text_acc": 0.0     
+            }
+        }
+    ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisualOnlyMCQPipeline` code implementation (API Version).
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+import argparse
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.image import ImageCaprlPrompt
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+import re
+from typing import List, Dict, Any
+
+_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
+_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
+_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
+
+def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
+    if not mcq_text or not isinstance(mcq_text, str): return []
+    
+    indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
+    if not indices: return []
+    indices.append(len(mcq_text))
+    blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)]
+    
+    parsed = []
+    for block in blocks:
+        lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()]
+        q_title_m = _Q_BLOCK_SPLIT.search(block)
+        if not q_title_m: continue
+        
+        q_title = q_title_m.group(1).strip()
+        options = {}
+        ans_letter, ans_text = None, None
+        
+        for ln in lines:
+            m_opt = _OPT_LINE_RE.match(ln)
+            if m_opt:
+                options[m_opt.group(1)] = m_opt.group(2).strip()
+                continue
+            m_ans = _ANS_LINE_RE.match(ln)
+            if m_ans:
+                ans_letter = m_ans.group(1).upper()
+                ans_text = m_ans.group(2).strip()
+                break
+        
+        if options and ans_letter and ans_letter in options:
+            q_lines = [q_title]
+            for lbl in ["A", "B", "C", "D", "E", "F"]:
+                if lbl in options:
+                    q_lines.append(f"   - {lbl}) {options[lbl]}")
+            
+            parsed.append({
+                "question": "\n".join(q_lines),
+                "question_title": q_title,
+                "options": options,
+                "answer": ans_letter,
+                "answer_text": ans_text
+            })
+            
+    if expected > 0:
+        parsed = parsed[:expected]
+        
+    uniq = []
+    seen = set()
+    for it in parsed:
+        key = (it["question_title"], it["answer"])
+        if key not in seen:
+            seen.add(key)
+            uniq.append(it)
+    return uniq
+
+
+class VisualOnlyMCQPipeline:
+    def __init__(
+        self,
+        *,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_mcq",
+        file_name_prefix: str = "vis_mcq",
+        # Config
+        rotate_num: int = 4,
+        pass_visual_min: float = 1.0,
+        pass_textual_max: float = 0.25,
+        add_none_above: bool = True,
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_mcqs",
+        # VLLM
+        vllm_max_tokens: int = 2048
+    ):
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+
+        
+        # Keys
+        self.keys = {
+            "img": input_image_key,
+            "raw_text": "raw_mcq_text",
+            "parsed_list": "parsed_mcq_list",
+            "final": output_key
+        }
+        
+        # --- Prompts ---
+        self.prompts_db = ImageCaprlPrompt().build_prompt()
+
+        # ================== Operators ==================
+        
+        # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+        # Use the strings from the prompt class directly
+        self.op_gen_raw = FixPromptedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
+            user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
+        )
+        
+        # 2. Parse MCQs (Refine)
+        self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
+        
+        # 3. Verify Visual Dependency (Refine)
+        # Pass in the prompt template
+        self.op_verify = VisualDependencyRefiner(
+            serving=self.vlm_serving,
+            instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
+            rotate_num=rotate_num,
+            pass_visual_min=pass_visual_min,
+            pass_textual_max=pass_textual_max,
+            add_none_above_visual=add_none_above
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...")
+        self.op_gen_raw.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_text"]
+        )
+        
+        print(">>> [Pipeline] Step 2: Parsing MCQs...")
+        self.op_parse.run(
+            self.storage.step(),
+            output_key=self.keys["parsed_list"],
+            mcq_text=self.keys["raw_text"], 
+            expected=5
+        )
+        
+        print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...")
+        self.op_verify.run(
+            self.storage.step(),
+            input_list_key=self.keys["parsed_list"],
+            input_image_key=self.keys["img"],
+            output_key=self.keys["final"]
+        )
+        
+        print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
+
+if __name__ == "__main__":
+    pipe = VisualOnlyMCQPipeline(
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
+    )
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
index 0904691e..799a867e 100644
--- a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
@@ -1,3 +1,11 @@
+
+```
+
+---
+
+### 2. 英文 GPU 版 (English GPU Version)
+
+```markdown
 ---
 title: Vision MCTS Reasoning Pipeline
 icon: mdi:image-text
@@ -7,27 +15,27 @@ permalink: /en/mm_guide/vision_mct_reasoning_pipeline/
 
 ## 1. Overview
 
-The **Vision MCTS Reasoning Pipeline** is designed to construct high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two types of data sources: existing Monte Carlo Tree Search (MCTS) trajectory data, or direct generation of new reasoning chains using a VLM.
+The **Vision MCTS Reasoning Pipeline** is designed to build high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two sources of data: existing Monte Carlo Tree Search (MCTS) trajectory data, or generating new reasoning chains directly using a VLM.
 
-This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**, converting complex tree-search processes into a linearized `<think>...</think><answer>...</answer>` format that models can learn from.
+This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**. It "linearizes" complex tree-like search processes into a `<think>...</think><answer>...</answer>` format that the model can learn from.
 
 We support the following application scenarios:
 
-* **MCTS Data Extraction**: Converting high-value paths (Rollouts) from search trees into linear training data.
-* **Hybrid Data Construction**: Automatically falling back to VLM-based CoT generation for samples without search trees.
-* **Spatial Reasoning Enhancement**: Supporting the generation of spatial reasoning chains containing explicit coordinates (Bounding Boxes).
+* **Data Extraction from MCTS Trees**: Converts high-value paths (Rollouts) in the search tree into linear training data.
+* **Hybrid Data Construction**: Automatically falls back to using the VLM for CoT generation for samples without a search tree.
+* **Spatial Reasoning Enhancement**: Supports generating spatial reasoning chains that include explicit coordinates (Bounding Boxes).
 
 The main process of the pipeline includes:
 
-1. **MCTS Tree Parsing**: Parsing the search tree structure in the input data to extract successful reasoning paths.
-2. **Visual Reasoning Generation (Fallback)**: Using a VLM to regenerate reasoning chains for samples where the tree structure is missing or parsing fails.
-3. **Data Standardization**: Outputting reasoning chain data in a unified format.
+1. **MCTS Tree Parsing**: Parses the search tree structure in the input data and extracts successful reasoning paths.
+2. **Visual Reasoning Generation (Fallback)**: For samples with missing tree structures or failed parsing, the VLM is used to regenerate the reasoning chain.
+3. **Data Standardization**: Outputs reasoning chain data in a unified format.
 
 ---
 
 ## 2. Quick Start
 
-### Step 1: Create a Working Directory
+### Step 1: Create a New DataFlow Working Directory
 
 ```bash
 mkdir run_mcts_reasoning
@@ -35,41 +43,111 @@ cd run_mcts_reasoning
 
 ```
 
-### Step 2: Prepare the Script
+### Step 2: Initialize DataFlow-MM
 
-Save the code in the "Pipeline Example" section below as `vision_mcts_pipeline.py`.
+```bash
+dataflowmm init
 
-### Step 3: Configure Parameters
+```
 
-Ensure the input file (jsonl) contains the `tree` field (for extraction) or just `question/image` (for generation).
+You will then see:
 
 ```bash
-# Install dependencies
-pip install open-dataflow vllm
+gpu_pipelines/vision_mcts_pipeline.py
 
 ```
 
-### Step 4: Run
+### Step 3: Download Sample Data
 
 ```bash
-python vision_mcts_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_file "data/mcts_trajectories.jsonl" \
-  --prompt_type "spatial"
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 
 ```
 
+### Step 4: Configure Parameters
+
+Ensure the input file (jsonl) contains a `tree` field (for extraction) or just `question`/`image` (for generation).
+
+```python
+if __name__ == "__main__":
+    pipe = VisionMCTSReasoningPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+    )
+    pipe.forward()
+
+```
+
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading.
+> 
+> 
+
+### Step 5: Run
+
+```bash
+cd gpu_pipelines
+python vision_mcts_pipeline.py
+
+```
+
+> **🛠️ Troubleshooting**
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python vision_mcts_pipeline.py
+> 
+> ```
+> 
+> 
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+> **After modification:**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+
 ---
 
 ## 3. Data Flow & Logic
 
 ### 1. **Input Data**
 
-Input data typically comes from MCTS search logs or unlabelled image-text pairs:
+Input data typically originates from MCTS search process logs, or unannotated image-text pairs:
 
 * **image**: Path to the image.
-* **question**: Visual question.
-* **tree** (optional): JSON structure of the MCTS search tree, containing node values, visit counts, and actions.
+* **question**: The visual question.
+* **tree** (Optional): JSON structure of the MCTS search tree, containing node values, visit counts, and actions.
 
 **Input Data Example**:
 
@@ -84,30 +162,30 @@ Input data typically comes from MCTS search logs or unlabelled image-text pairs:
 
 ### 2. **Core Operator Logic**
 
-The pipeline employs an **"Extract First, Fallback to Generate"** hybrid strategy:
+This pipeline uses a hybrid strategy of **"Extraction First, Generation as Fallback"**:
 
-#### A. **MCTSTreeRefiner**
+#### A. **MCTSTreeRefiner (Tree Structure Parser)**
 
-This operator is responsible for processing the `tree` field. It traverses the tree structure and filters for the best paths from root to leaf based on node Q-values.
+This operator handles the `tree` field. It traverses the tree structure and selects the best path from the root node to a leaf node based on each node's Q-value.
 
 * **Input**: `tree` object.
-* **Functionality**: Linearizes tree paths, filtering out low-value or incomplete search branches.
-* **Output**: List of extracted reasoning chains (`mcts_chains`).
+* **Function**: Linearizes tree paths, filtering out low-value or incomplete search branches.
+* **Output**: A list of extracted reasoning chains (`mcts_chains`).
 
-#### B. **VisualReasoningGenerator**
+#### B. **VisualReasoningGenerator (Visual Reasoning Generator)**
 
-This operator is the "Generation Engine" of the pipeline. It takes the extraction results from the previous step as input.
+This operator is the "generation engine" of the pipeline. It receives the extraction result from the previous step as input.
 
 * **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`).
-* If MCTS parsing was successful (chains exist), it reuses them directly without running inference (saving compute).
-* If MCTS chains are empty (tree missing or parsing failed), it calls the VLM to generate reasoning chains from scratch based on the `prompt_type`.
+* If MCTS parsing succeeds (chains exist), the extracted chains are reused directly without running inference (saving computational resources).
+* If the MCTS chain is empty (tree does not exist or parsing failed), it calls the VLM to generate the reasoning chain from scratch based on `prompt_type` (e.g., `spatial`).
 
 
-* **Prompt Type**: Supports modes like `spatial` (spatial coordinate reasoning), `logical` (logical reasoning), etc.
+* **Prompt Types**: Supports modes like `spatial` (spatial coordinate reasoning) and `logical` (logical reasoning).
 
 ### 3. **Output Data**
 
-The final output data (`final_reasoning_chains`) will contain high-quality Chain-of-Thought data ready for SFT training.
+The final output data (`final_reasoning_chains`) contains high-quality chains of thought that can be used directly for SFT training.
 
 **Output Example**:
 
@@ -125,9 +203,9 @@ The final output data (`final_reasoning_chains`) will contain high-quality Chain
 
 ## 4. Pipeline Example
 
-Below is the complete `VisionMCTSReasoningPipeline` code implementation.
+Below is the complete `VisionMCTSReasoningPipeline` code implementation (GPU Version).
+
 ```python
-import argparse
 from dataflow.utils.storage import FileStorage
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
 
@@ -141,8 +219,10 @@ class VisionMCTSReasoningPipeline:
         model_path: str,
         *,
         # Storage
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
         first_entry_file: str,
-        cache_path: str = "./cache_mcts",
+        cache_path: str = "../cache/cache_mcts",
         file_name_prefix: str = "mcts_reason",
         # Config
         prompt_type: str = "spatial",
@@ -155,7 +235,6 @@ class VisionMCTSReasoningPipeline:
         # VLLM
         vllm_max_tokens: int = 1024
     ):
-        # 1. 存储初始化
         self.storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
@@ -163,8 +242,9 @@ class VisionMCTSReasoningPipeline:
             cache_type="jsonl"
         )
         
-        # 2. 模型服务
         self.serving = LocalModelVLMServing_vllm(
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
             hf_model_name_or_path=model_path,
             vllm_tensor_parallel_size=1,
             vllm_temperature=0.7,
@@ -175,20 +255,18 @@ class VisionMCTSReasoningPipeline:
             "q": input_question_key,
             "img": input_image_key,
             "tree": input_tree_key,
-            "mcts_chains": "mcts_extracted_chains", # 中间结果
+            "mcts_chains": "mcts_extracted_chains",
             "final": output_key
         }
 
         # ================== Operators ==================
         
-        # 算子 1: MCTS Tree -> Chains (提取器)
-        # 负责将树结构扁平化为线性链
+        # 1. Refiner: MCTS -> Chains
         self.op_mcts_refine = MCTSTreeRefiner(
             max_chains_per_sample=max_samples_per_file
         )
         
-        # 算子 2: VLM -> Chains (生成器/Fallback)
-        # 如果 MCTS 提取失败，则使用 VLM 生成；如果成功，则跳过
+        # 2. Generator: VLM -> Chains (Fallback)
         self.op_vlm_gen = VisualReasoningGenerator(
             serving=self.serving,
             prompt_type=prompt_type
@@ -203,7 +281,8 @@ class VisionMCTSReasoningPipeline:
         )
         
         print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
-        # 注意：input_existing_chains_key 实现了混合/回退逻辑
+        # Pass mcts_chains in as input_existing_chains_key:
+        # if MCTS parsing succeeded, the chains are reused; otherwise the VLM generates them
         self.op_vlm_gen.run(
             self.storage.step(),
             input_question_key=self.keys["q"],
@@ -214,17 +293,13 @@ class VisionMCTSReasoningPipeline:
         
         
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--prompt_type", default="spatial")
-    args = parser.parse_args()
-    
     pipe = VisionMCTSReasoningPipeline(
-        model_path=args.model_path,
-        first_entry_file=args.input_file,
-        prompt_type=args.prompt_type
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
     )
     pipe.forward()
 
-```
\ No newline at end of file
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
new file mode 100644
index 00000000..8001e5c5
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
@@ -0,0 +1,248 @@
+---
+title: Vision MCTS Reasoning Pipeline (API version)
+icon: mdi:image-text
+createTime: 2026/01/11 21:59:59
+permalink: /en/mm_guide/vision_mct_reasoning_pipeline_api/
+---
+
+## 1. Overview
+
+The **Vision MCTS Reasoning Pipeline** is designed to build high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two sources of data: existing Monte Carlo Tree Search (MCTS) trajectory data, or generating new reasoning chains directly using a VLM.
+
+This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**. It "linearizes" complex tree-like search processes into a `<think>...</think><answer>...</answer>` format that the model can learn from.
+
+We support the following application scenarios:
+
+* **Data Extraction from MCTS Trees**: Converts high-value paths (Rollouts) in the search tree into linear training data.
+* **Hybrid Data Construction**: Automatically falls back to using the VLM for CoT generation for samples without a search tree.
+* **Spatial Reasoning Enhancement**: Supports generating spatial reasoning chains that include explicit coordinates (Bounding Boxes).
+
+The main process of the pipeline includes:
+
+1. **MCTS Tree Parsing**: Parses the search tree structure in the input data and extracts successful reasoning paths.
+2. **Visual Reasoning Generation (Fallback)**: For samples with missing tree structures or failed parsing, the VLM is used to regenerate the reasoning chain.
+3. **Data Standardization**: Outputs reasoning chain data in a unified format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_mcts_reasoning
+cd run_mcts_reasoning
+
+```
+
+### Step 2: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/vision_mcts_api_pipeline.py
+
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/vision_mcts_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### Step 5: Configure Parameters
+
+Configure the API service and input data paths in `api_pipelines/vision_mcts_api_pipeline.py`. Ensure the input file (jsonl) contains a `tree` field (for extraction) or just `question`/`image` (for generation).
+
+```python
+    pipe = VisionMCTSReasoningPipeline(
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+    )
+
+```
+
+### Step 6: Run with One Command
+
+```bash
+cd api_pipelines
+python vision_mcts_api_pipeline.py
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input data typically originates from MCTS search process logs, or unannotated image-text pairs:
+
+* **image**: Path to the image.
+* **question**: The visual question.
+* **tree** (Optional): JSON structure of the MCTS search tree, containing node values, visit counts, and actions.
+
+**Input Data Example**:
+
+```json
+{
+    "image": "./images/puzzle.jpg",
+    "question": "What is the next step to solve this?",
+    "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } }
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline uses a hybrid strategy of **"Extraction First, Generation as Fallback"**:
+
+#### A. **MCTSTreeRefiner (Tree Structure Parser)**
+
+This operator handles the `tree` field. It traverses the tree structure and selects the best path from the root node to a leaf node based on each node's Q-value.
+
+* **Input**: `tree` object.
+* **Function**: Linearizes tree paths, filtering out low-value or incomplete search branches.
+* **Output**: A list of extracted reasoning chains (`mcts_chains`).
+
+#### B. **VisualReasoningGenerator (Visual Reasoning Generator)**
+
+This operator is the "generation engine" of the pipeline. It receives the extraction result from the previous step as input.
+
+* **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`).
+* If MCTS parsing succeeds (chains exist), the extracted chains are reused directly without running inference (saving computational resources).
+* If the MCTS chain is empty (tree does not exist or parsing failed), it calls the VLM to generate the reasoning chain from scratch based on `prompt_type` (e.g., `spatial`).
+
+
+* **Prompt Types**: Supports modes like `spatial` (spatial coordinate reasoning) and `logical` (logical reasoning).
+
+### 3. **Output Data**
+
+The final output data (`final_reasoning_chains`) contains high-quality chains of thought that can be used directly for SFT training.
+
+**Output Example**:
+
+```json
+{
+    "image": "./images/puzzle.jpg",
+    "final_reasoning_chains": [
+        "<think>First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...</think><answer>Move Red Block</answer>"
+    ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisionMCTSReasoningPipeline` code implementation (API Version).
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+# Import atomic operators
+from dataflow.operators.core_text import MCTSTreeRefiner
+from dataflow.operators.core_vision import VisualReasoningGenerator
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+class VisionMCTSReasoningPipeline:
+    def __init__(
+        self,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_mcts",
+        file_name_prefix: str = "mcts_reason",
+        # Config
+        prompt_type: str = "spatial",
+        max_samples_per_file: int = 10000,
+        # Keys
+        input_question_key: str = "question",
+        input_image_key: str = "image",
+        input_tree_key: str = "tree",
+        output_key: str = "final_reasoning_chains",
+
+    ):
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+        
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+        
+        self.keys = {
+            "q": input_question_key,
+            "img": input_image_key,
+            "tree": input_tree_key,
+            "mcts_chains": "mcts_extracted_chains",
+            "final": output_key
+        }
+
+        # ================== Operators ==================
+        
+        # 1. Refiner: MCTS -> Chains
+        self.op_mcts_refine = MCTSTreeRefiner(
+            max_chains_per_sample=max_samples_per_file
+        )
+        
+        # 2. Generator: VLM -> Chains (Fallback)
+        self.op_vlm_gen = VisualReasoningGenerator(
+            serving=self.vlm_serving,
+            prompt_type=prompt_type
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...")
+        self.op_mcts_refine.run(
+            self.storage.step(),
+            input_tree_key=self.keys["tree"],
+            output_key=self.keys["mcts_chains"]
+        )
+        
+        print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
+        # Pass mcts_chains in as input_existing_chains_key:
+        # if MCTS parsing succeeded, the chains are reused; otherwise the VLM generates them
+        self.op_vlm_gen.run(
+            self.storage.step(),
+            input_question_key=self.keys["q"],
+            input_image_key=self.keys["img"],
+            input_existing_chains_key=self.keys["mcts_chains"],
+            output_key=self.keys["final"]
+        )
+        
+        
+if __name__ == "__main__":
+    pipe = VisionMCTSReasoningPipeline(
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+    )
+    pipe.forward()
+
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
index 3a5add79..db64892a 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot.md
@@ -405,6 +405,4 @@ if __name__ == "__main__":
         download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
     )
     pipe.forward()
-
-
 ```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md
index 642b0d5a..a2a419f4 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_gcot_api.md
@@ -1,14 +1,14 @@
 ---
-title: 图像定位思维链 (GCoT) 生成流水线
+title: 图像定位思维链 (GCoT) 生成流水线（API版）
 icon: mdi:image-text
 createTime: 2026/01/11 20:44:55
-permalink: /zh/mm_guide/image_gcot/
+permalink: /zh/mm_guide/image_gcot_api/
 ---
 ## 1. 概述
 
 **图像定位思维链 (GCoT) 生成流水线** 旨在自动化生成**带视觉定位的思维链（Grounded Chain-of-Thought）**数据。该流水线通过多步推理，不仅生成回答问题的逻辑步骤，还将推理过程中提到的关键物体在图像中进行空间定位（Bounding Box），从而显著提升多模态数据的可解释性和精确度。
 
-与传统方法不同，本流水线采用 **单一 VLM（如 Qwen2.5-VL）** 同时完成“推理”和“定位”任务，流程更加精简高效。
+与传统方法不同，本流水线采用 **单一 VLM（如 GPT-5）** 同时完成“推理”和“定位”任务，流程更加精简高效。
 
 我们支持以下应用场景：
 
@@ -58,7 +58,7 @@ os.environ["DF_API_KEY"] = "your_api_key"
 
 ### 第五步：配置参数
 
-在 `api_pipelines/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：
+在 `api_pipelines/image_gcot_api_pipeline.py` 中配置 API 服务和输入数据路径：
 
 ```python
     def __init__(
@@ -67,16 +67,20 @@ os.environ["DF_API_KEY"] = "your_api_key"
         first_entry_file: str,
         cache_path: str = "../cache/cache_gcot",
         file_name_prefix: str = "gcot",
-        # Keys
         question_key: str = "question",
         answer_key: str = "answer",
         image_key: str = "image",
         output_key: str = "gcot",
-        # Config
         vllm_max_tokens: int = 512
     ):
 ```
 
+```python
+    pipe = ImageGCoTPipeline(
+        first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
+    )
+```
+
 ```python
 self.vlm_serving = APIVLMServing_openai(
             api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
@@ -385,6 +389,4 @@ if __name__ == "__main__":
         first_entry_file="../example_data/capsbench_images/image_gcot_demo.jsonl"
     )
     pipe.forward()
-
-
 ```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
index 3bb6b039..7cfc00bc 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
@@ -29,38 +29,112 @@ permalink: /zh/mm_guide/image_scale_caption_pipeline/
 
 ## 2. 快速开始
 
-### 第一步：准备工作目录
+### 第一步：创建新的 DataFlow 工作文件夹
 
 ```bash
-mkdir run_scalecap
-cd run_scalecap
+mkdir run_dataflow
+cd run_dataflow
 
 ```
 
-### 第二步：准备脚本
+### 第二步：初始化 DataFlow-MM
 
-将下文“流水线示例”中的代码保存为 `scalecap_pipeline.py`。
+```bash
+dataflowmm init
 
-### 第三步：配置运行参数
+```
 
-确保 VLM 模型（如 Qwen2.5-VL）路径正确。
+这时你会看到：
 
 ```bash
-# 安装依赖
-pip install open-dataflow vllm
+gpu_pipelines/image_scale_caption_pipeline.py
+
+```
+
+### 第三步：下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步：配置参数
+
+```python
+if __name__ == "__main__":
+    pipe = ImageScaleCaptionPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        device="cuda",
+        first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path="../cache/image_scale_caption",
+        file_name_prefix="scalecap",
+        input_image_key="image",
+        output_key="final_caption",
+        vllm_tensor_parallel_size=1,
+        vllm_max_tokens=1024
+    )
+    pipe.forward()
 
 ```
 
-### 第四步：一键运行
+> **⚠️ 模型路径配置的重要提示（以 `Qwen2.5-VL-3B-Instruct` 为例）：**
+> * **如果您已经下载好了模型文件**：请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`，否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型（需要自动下载）**：请一定要指定 `download_dir` 参数，并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**（正如默认参数所示），否则下载完成后同样会导致框架无法识别模型。
+> 
+> 
+
+### 第五步：一键运行
 
 ```bash
-python scalecap_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_jsonl "data/images.jsonl" \
-  --output_key "final_caption"
+cd gpu_pipelines
+python image_scale_caption_pipeline.py
 
 ```
 
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> **问题 1：** 如果遇到类似如下的动态链接库冲突报错：
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **解决方法：** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`：
+> ```bash
+> LD_LIBRARY_PATH="" python image_scale_caption_pipeline.py
+> 
+> ```
+> 
+> 
+> **问题 2：** 如果您使用的是 **Qwen 系列模型**，并且遇到以下报错：
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **解决方法：** 打开模型文件夹下的 `config.json` 文件，找到 `rope_scaling` 配置块，将 `"type"` 字段修改为 `"rope_type"` 即可。
+> **修改前：**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+> **修改后：**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+
 ---
 
 ## 3. 数据流与流水线逻辑
@@ -75,7 +149,7 @@ python scalecap_pipeline.py \
 
 ```json
 {
-    "image": "./images/complex_scene.jpg"
+    "image": "../example_data/capsbench_images/0.png"
 }
 
 ```
@@ -132,12 +206,12 @@ python scalecap_pipeline.py \
 
 ```json
 {
-    "image": "./images/complex_scene.jpg",
+    "image": "../example_data/capsbench_images/0.png",
     "init_caption": "A dog sitting on a bench.",
     "golden_sentences": ["A dog is sitting on a wooden bench."],
-    "q_list": ["Describe more details about the dog.", "Describe position of the bench."],
+    "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."],
     "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
-    "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..."
+    "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
 }
 
 ```
@@ -146,7 +220,7 @@ python scalecap_pipeline.py \
 
 ## 4. 流水线示例
 
-以下是完整的 `ImageScaleCaptionPipeline` 代码实现。
+以下是完整的 `ImageScaleCaptionPipeline` 代码实现 (GPU 版本)。
 
 ```python
 import re
@@ -154,12 +228,79 @@ import argparse
 from typing import Callable, Any, List
 
 from dataflow.utils.storage import FileStorage
+
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
 from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
 from dataflow.prompts.image import ImageScaleCaptionPrompt
+
 from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
 from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
 
+
+def split_sentences(text: str) -> List[str]:
+    """将文本拆分为句子列表"""
+    if not text or not isinstance(text, str):
+        return []
+    # 使用正则按标点符号分割 (. ! ? 。 ！ ？)
+    _SENT_SPLIT = re.compile(r"(?<=[.!?。！？])\s+")
+    parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+    return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+    """将列表连接为字符串"""
+    if isinstance(data, list):
+        # 过滤掉空值（None、空字符串等），并将其余元素转换为字符串
+        valid_items = [str(x) for x in data if x]
+        return separator.join(valid_items)
+    return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+    """
+    解析 LLM 生成的 "Describe more details about..." 文本，
+    并自动扩展 position 问题。
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    lines = [t.strip() for t in text.split("\n") if t.strip()]
+    obj_qs = []
+    
+    for line in lines:
+        # 提取包含 "Describe more details about" 的行
+        if "Describe more details about" in line:
+            # 去除可能的序号 (如 "1. Describe...")
+            try:
+                start_idx = line.find("Describe")
+                clean = line[start_idx:]
+                # 去除句末多余内容，保留到第一个句号
+                if "." in clean:
+                    clean = clean.split(".")[0] + "."
+                obj_qs.append(clean)
+            except Exception:
+                continue
+    
+    # 去重并保持顺序
+    seen = set()
+    unique_obj_qs = []
+    for q in obj_qs:
+        if q not in seen:
+            unique_obj_qs.append(q)
+            seen.add(q)
+    
+    # 截断
+    unique_obj_qs = unique_obj_qs[:max_q]
+    
+    # 扩展 Position 问题
+    pos_qs = [
+        q.replace("Describe more details about", "Describe more details about the position of")
+        for q in unique_obj_qs
+    ]
+    
+    # 返回合并后的列表 (对象问题 + 位置问题)
+    return unique_obj_qs + pos_qs
+
+
 class ImageScaleCaptionPipeline:
     def __init__(
         self,
@@ -211,14 +352,19 @@ class ImageScaleCaptionPipeline:
         # ================== Operator Initialization ==================
 
         # --- Step A: Generate Init Caption ---
+        # 构造固定 Prompt 列
         self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+        
+        # 生成初稿 (使用通用 PromptedVQAGenerator)
         self.gen_init_caption = PromptedVQAGenerator(
             serving=self.serving,
             system_prompt="You are a helpful assistant."
         )
 
         # --- Step B: Refine Golden Sentences ---
+        # 分句
         self.refine_split = FunctionalRefiner(func=split_sentences)
+        
         # 视觉自检 (保留 Yes 的句子)
         self.refine_golden = VisualGroundingRefiner(
             serving=self.serving,
@@ -226,7 +372,10 @@ class ImageScaleCaptionPipeline:
         )
 
         # --- Step C: Generate Questions ---
+        # 列表转字符串
         self.refine_join = FunctionalRefiner(func=join_list)
+        
+        # 文本生成问题 (Text-to-Text)
         tpl_q = NamedPlaceholderPromptTemplate(
             template=self.prompts_db["LLM_PROMPT_1"], 
             join_list_with="\n"
@@ -235,16 +384,22 @@ class ImageScaleCaptionPipeline:
             serving=self.serving,
             prompt_template=tpl_q
         )
+        
+        # 解析问题文本为列表
         self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
 
         # --- Step D: Generate Answers ---
+        # 批量回答 (One Image -> Many Qs)
         self.gen_answers = BatchVQAGenerator(serving=self.serving)
+        
+        # 回答过滤
         self.refine_answers = VisualGroundingRefiner(
             serving=self.serving,
             prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
         )
 
         # --- Step E: Integrate Final Caption ---
+        # 融合 (Text-to-Text)
         tpl_final = NamedPlaceholderPromptTemplate(
             template=self.prompts_db["LLM_PROMPT_4"], 
             join_list_with="\n"
@@ -256,6 +411,7 @@ class ImageScaleCaptionPipeline:
 
     def forward(self):
         print(">>> [Pipeline] Step 0: Preparing Prompts...")
+        # 构造 init_prompt 列
         self.refine_const_prompt.run(
             self.storage.step(), 
             output_key="init_prompt"
@@ -288,11 +444,14 @@ class ImageScaleCaptionPipeline:
             output_key="golden_str", 
             data="golden_sentences"
         )
+        
+        # template: "{sentence}" -> map to col "golden_str"
         self.gen_questions_text.run(
             self.storage.step(), 
             output_answer_key="raw_q_text", 
             sentence="golden_str"
         )
+        
         self.refine_parse_qs.run(
             self.storage.step(), 
             output_key="q_list", 
@@ -306,6 +465,7 @@ class ImageScaleCaptionPipeline:
             input_image_key=self.input_image_key, 
             output_key="raw_answers"
         )
+        
         self.refine_answers.run(
             self.storage.step(), 
             input_list_key="raw_answers", 
@@ -319,50 +479,37 @@ class ImageScaleCaptionPipeline:
             output_key="details_str", 
             data="final_details"
         )
+        
+        # template keys: context, object_info, position_info
         self.gen_final_caption.run(
             self.storage.step(),
             output_answer_key=self.output_key,
             context="golden_str",
             object_info="details_str",
-            position_info="details_str"
+            position_info="details_str" # 简化：同时作为 object 和 position 信息
         )
 
         print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline")
-    
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-    parser.add_argument("--download_dir", default="./ckpt/models")
-    parser.add_argument("--device", default="cuda")
-
-    parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
-    parser.add_argument("--cache_path", default="./cache_scalecap_results")
-    parser.add_argument("--file_name_prefix", default="scalecap")
-    parser.add_argument("--input_image_key", default="image")
-    parser.add_argument("--output_key", default="final_caption")
-
-    parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--max_tokens", type=int, default=1024)
-
-    args = parser.parse_args()
-
     pipe = ImageScaleCaptionPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        device=args.device,
-        first_entry_file=args.input_jsonl,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        input_image_key=args.input_image_key,
-        output_key=args.output_key,
-        vllm_tensor_parallel_size=args.tp,
-        vllm_max_tokens=args.max_tokens
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        device="cuda",
+        
+        first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path="../cache/image_scale_caption",
+        file_name_prefix="scalecap",
+        
+        input_image_key="image",
+        output_key="final_caption",
+        
+        vllm_tensor_parallel_size=1,
+        vllm_max_tokens=1024
     )
     
     pipe.forward()
 
-```
+```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
new file mode 100644
index 00000000..e504e294
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/image_scale_caption_pipeline_api.md
@@ -0,0 +1,477 @@
+---
+title: ScaleCap 高密度描述生成流水线（API版）
+createTime: 2026/01/11 22:08:57
+icon: mdi:image-text
+permalink: /zh/mm_guide/image_scale_caption_pipeline_api/
+---
+
+## 1. 概述
+
+**ScaleCap 高密度描述生成流水线 (Image Scale Caption Pipeline)** 是一种基于“**生成-验证-扩展-融合**”范式的先进图像描述生成方案。该流水线旨在生成**信息密度极高**且**幻觉率极低**的图像描述，特别适用于需要深度理解图像细节的场景。
+
+该方法的理论基础源自论文 *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*。它通过多轮对话和视觉自检（Visual Grounding），逐步挖掘图像中的对象与位置细节，并过滤掉模型产生的幻觉。
+
+我们支持以下应用场景：
+
+* **高质量多模态数据集构建**：生成比普通 Caption 更详尽、准确的训练数据。
+* **细粒度图像检索**：提供包含丰富细节的索引文本。
+* **盲人辅助/图像无障碍**：生成“所见即所得”的详细解说。
+
+流水线的主要流程包括：
+
+1. **初稿生成**：VLM 生成基础描述。
+2. **视觉自检 (Debiasing)**：将描述拆分为句子，逐句验证其是否被图像证据支持（Visual Grounding）。
+3. **细节追问**：针对通过验证的“黄金句子”，生成关于对象属性和位置的追问。
+4. **回答与再验证**：VLM 回答追问，并再次进行视觉自检以过滤错误细节。
+5. **最终融合**：将所有经过验证的信息融合成一段连贯的长描述。
+
+---
+
+## 2. 快速开始
+
+### 第一步：创建新的 DataFlow 工作文件夹
+
+```bash
+mkdir run_dataflow
+cd run_dataflow
+
+```
+
+### 第二步：初始化 DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+这时你会看到：
+
+```bash
+api_pipelines/image_scale_caption_api_pipeline.py
+
+```
+
+### 第三步：下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步：配置 API Key
+
+在 `api_pipelines/image_scale_caption_api_pipeline.py` 中设置 API Key 环境变量：
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### 第五步：配置参数
+
+在 `api_pipelines/image_scale_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：
+
+```python
+    def __init__(
+        self,
+        # Storage params
+        first_entry_file: str = "../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path: str = "../cache/image_scale_caption",
+        file_name_prefix: str = "scalecap",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_caption",
+    ):
+
+```
+
+```python
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+
+```
+
+### 第六步：一键运行
+
+```bash
+cd api_pipelines
+python image_scale_caption_api_pipeline.py
+
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+输入数据非常简单，仅需图像路径：
+
+* **image**：图像文件路径。
+
+**输入数据示例**：
+
+```json
+{
+    "image": "../example_data/capsbench_images/0.png"
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+该流水线是多个原子算子的复杂编排：
+
+#### A. **初稿生成 (PromptedVQAGenerator)**
+
+* **功能**：使用基础 Prompt 生成图像的初步描述 (`init_caption`)。
+
+#### B. **视觉自检 (VisualGroundingRefiner)**
+
+* **功能**：这是 ScaleCap 的核心防幻觉机制。
+* **逻辑**：
+1. 使用 `split_sentences` 将初稿拆分为单句。
+2. 调用 VLM 询问：“Given the image, is the description '{text}' directly supported by visual evidence?”。
+3. 仅保留回答为 "Yes" 的句子，形成 **"Golden Sentences"**。
+
+
+
+#### C. **问题生成与解析 (PromptTemplatedQAGenerator)**
+
+* **功能**：基于 Golden Sentences，利用 LLM 能力生成针对性的追问。
+* **逻辑**：模型生成如 "Describe more details about the [Object]" 的文本，并通过 `parse_questions_logic` 自动扩展为**对象细节**和**位置关系**两类问题。
+
+#### D. **批量回答与二次过滤 (BatchVQAGenerator & Refiner)**
+
+* **功能**：挖掘图像深层信息。
+* **逻辑**：
+1. 使用 `BatchVQAGenerator` 一次性让 VLM 回答上述生成的所有问题。
+2. 再次使用 `VisualGroundingRefiner` 检查这些新生成的细节是否准确。
+3. 保留可靠的细节信息 (`final_details`)。
+
+
+
+#### E. **最终融合 (PromptTemplatedQAGenerator)**
+
+* **功能**：将“黄金句子”和“验证后的细节”重写为一段流畅的文本。
+* **输出**：`final_caption`。
+
+### 3. **输出数据**
+
+输出数据记录了流水线的全过程，方便调试和分析：
+
+* **init_caption**：原始生成的初稿。
+* **golden_sentences**：通过第一次自检的句子列表。
+* **q_list**：生成的追问列表。
+* **final_details**：通过第二次自检的细节回答。
+* **final_caption**：最终的高密度描述。
+
+**输出数据示例**：
+
+```json
+{
+    "image": "../example_data/capsbench_images/0.png",
+    "init_caption": "A dog sitting on a bench.",
+    "golden_sentences": ["A dog is sitting on a wooden bench."],
+    "q_list": ["Describe more details about the dog.", "Describe more details about the position of the bench."],
+    "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
+    "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park."
+}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `ImageScaleCaptionPipeline` 代码实现 (API 版本)。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+
+
+import re
+import argparse
+from typing import Callable, Any, List
+
+from dataflow.utils.storage import FileStorage
+
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.prompts.image import ImageScaleCaptionPrompt
+
+from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
+from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+def split_sentences(text: str) -> List[str]:
+    """将文本拆分为句子列表"""
+    if not text or not isinstance(text, str):
+        return []
+    # 使用正则按标点符号分割 (. ! ? 。 ！ ？)
+    _SENT_SPLIT = re.compile(r"(?<=[.!?。！？])\s+")
+    parts = [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+    return parts or ([text.strip()] if text.strip() else [])
+
+def join_list(data: Any, separator: str = "\n") -> str:
+    """将列表连接为字符串"""
+    if isinstance(data, list):
+        # 过滤掉非字符串元素或空字符串
+        valid_items = [str(x) for x in data if x]
+        return separator.join(valid_items)
+    return str(data) if data is not None else ""
+
+def parse_questions_logic(text: str, max_q: int = 20) -> List[str]:
+    """
+    解析 LLM 生成的 "Describe more details about..." 文本，
+    并自动扩展 position 问题。
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    lines = [t.strip() for t in text.split("\n") if t.strip()]
+    obj_qs = []
+    
+    for line in lines:
+        # 提取包含 "Describe more details about" 的行
+        if "Describe more details about" in line:
+            # 去除可能的序号 (如 "1. Describe...")
+            try:
+                start_idx = line.find("Describe")
+                clean = line[start_idx:]
+                # 去除句末多余内容，保留到第一个句号
+                if "." in clean:
+                    clean = clean.split(".")[0] + "."
+                obj_qs.append(clean)
+            except Exception:
+                continue
+    
+    # 去重并保持顺序
+    seen = set()
+    unique_obj_qs = []
+    for q in obj_qs:
+        if q not in seen:
+            unique_obj_qs.append(q)
+            seen.add(q)
+    
+    # 截断
+    unique_obj_qs = unique_obj_qs[:max_q]
+    
+    # 扩展 Position 问题
+    pos_qs = [
+        q.replace("Describe more details about", "Describe more details about the position of")
+        for q in unique_obj_qs
+    ]
+    
+    # 返回合并后的列表 (对象问题 + 位置问题)
+    return unique_obj_qs + pos_qs
+
+
+class ImageScaleCaptionPipeline:
+    def __init__(
+        self,
+        # Storage params
+        first_entry_file: str = "images.jsonl",
+        cache_path: str = "./cache_scalecap",
+        file_name_prefix: str = "scalecap",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_caption",
+        # VLLM Config
+        vllm_tensor_parallel_size: int = 1,
+        vllm_temperature: float = 0.7,
+        vllm_top_p: float = 0.9,
+        vllm_max_tokens: int = 512,
+    ):
+        # 1. Storage
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type,
+        )
+
+        # 2. Serving
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+
+        # 3. Prompts
+        self.prompts_db = ImageScaleCaptionPrompt().build_prompt()
+
+        # 4. Keys
+        self.input_image_key = input_image_key
+        self.output_key = output_key
+
+        # ================== Operator Initialization ==================
+
+        # --- Step A: Generate Init Caption ---
+        # 构造固定 Prompt 列
+        self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+        
+        # 生成初稿 (使用通用 PromptedVQAGenerator)
+        self.gen_init_caption = PromptedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt="You are a helpful assistant."
+        )
+
+        # --- Step B: Refine Golden Sentences ---
+        # 分句
+        self.refine_split = FunctionalRefiner(func=split_sentences)
+        
+        # 视觉自检 (保留 Yes 的句子)
+        self.refine_golden = VisualGroundingRefiner(
+            serving=self.vlm_serving,
+            prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no."
+        )
+
+        # --- Step C: Generate Questions ---
+        # 列表转字符串
+        self.refine_join = FunctionalRefiner(func=join_list)
+        
+        # 文本生成问题 (Text-to-Text)
+        tpl_q = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_1"], 
+            join_list_with="\n"
+        )
+        self.gen_questions_text = PromptTemplatedQAGenerator(
+            serving=self.vlm_serving,
+            prompt_template=tpl_q
+        )
+        
+        # 解析问题文本为列表
+        self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
+
+        # --- Step D: Generate Answers ---
+        # 批量回答 (One Image -> Many Qs)
+        self.gen_answers = BatchVQAGenerator(serving=self.vlm_serving)
+        
+        # 回答过滤
+        self.refine_answers = VisualGroundingRefiner(
+            serving=self.vlm_serving,
+            prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
+        )
+
+        # --- Step E: Integrate Final Caption ---
+        # 融合 (Text-to-Text)
+        tpl_final = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_4"], 
+            join_list_with="\n"
+        )
+        self.gen_final_caption = PromptTemplatedQAGenerator(
+            serving=self.vlm_serving,
+            prompt_template=tpl_final
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 0: Preparing Prompts...")
+        # 构造 init_prompt 列
+        self.refine_const_prompt.run(
+            self.storage.step(), 
+            output_key="init_prompt"
+        )
+
+        print(">>> [Pipeline] Step 1: Generating Initial Caption...")
+        self.gen_init_caption.run(
+            self.storage.step(),
+            input_prompt_key="init_prompt",
+            input_image_key=self.input_image_key,
+            output_answer_key="init_caption"
+        )
+
+        print(">>> [Pipeline] Step 2: Refining Golden Sentences...")
+        self.refine_split.run(
+            self.storage.step(), 
+            output_key="sentences", 
+            text="init_caption"
+        )
+        self.refine_golden.run(
+            self.storage.step(), 
+            input_list_key="sentences", 
+            input_image_key=self.input_image_key, 
+            output_key="golden_sentences"
+        )
+
+        print(">>> [Pipeline] Step 3: Generating Details Questions...")
+        self.refine_join.run(
+            self.storage.step(), 
+            output_key="golden_str", 
+            data="golden_sentences"
+        )
+        
+        # template: "{sentence}" -> map to col "golden_str"
+        self.gen_questions_text.run(
+            self.storage.step(), 
+            output_answer_key="raw_q_text", 
+            sentence="golden_str"
+        )
+        
+        self.refine_parse_qs.run(
+            self.storage.step(), 
+            output_key="q_list", 
+            text="raw_q_text"
+        )
+
+        print(">>> [Pipeline] Step 4: Generating & Filtering Answers...")
+        self.gen_answers.run(
+            self.storage.step(), 
+            input_prompts_key="q_list", 
+            input_image_key=self.input_image_key, 
+            output_key="raw_answers"
+        )
+        
+        self.refine_answers.run(
+            self.storage.step(), 
+            input_list_key="raw_answers", 
+            input_image_key=self.input_image_key, 
+            output_key="final_details"
+        )
+
+        print(">>> [Pipeline] Step 5: Integrating Final Caption...")
+        self.refine_join.run(
+            self.storage.step(), 
+            output_key="details_str", 
+            data="final_details"
+        )
+        
+        # template keys: context, object_info, position_info
+        self.gen_final_caption.run(
+            self.storage.step(),
+            output_answer_key=self.output_key,
+            context="golden_str",
+            object_info="details_str",
+            position_info="details_str" # 简化：同时作为 object 和 position 信息
+        )
+
+        print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
+
+
+if __name__ == "__main__":
+
+    pipe = ImageScaleCaptionPipeline( 
+        first_entry_file="../example_data/capsbench_images/image_scale_caption_demo.jsonl",
+        cache_path="../cache/image_scale_caption",
+        file_name_prefix="scalecap",
+        input_image_key="image",
+        output_key="final_caption",
+        vllm_tensor_parallel_size=1,
+        vllm_max_tokens=1024
+    )
+    
+    pipe.forward()
+
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
index a586da66..32c17ddd 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
@@ -18,8 +18,6 @@ permalink: /zh/mm_guide/image_visual_only_mcq_pipeline/
 * **旋转测试**：多次打乱选项顺序，消除位置偏见。
 * **双重过滤**：要求“有图答对率”高，“无图答对率”低。
 
-
-
 ---
 
 ## 2. 快速开始
@@ -32,32 +30,102 @@ cd run_vis_mcq
 
 ```
 
-### 第二步：准备脚本
+### 第二步：初始化 DataFlow-MM
 
-将下文“流水线示例”中的代码保存为 `visual_mcq_pipeline.py`。
+```bash
+dataflowmm init
+
+```
 
-### 第三步：配置运行参数
+这时你会看到：
+
+```bash
+gpu_pipelines/image_visual_only_mcq_pipeline.py
+
+```
 
-该流水线通过命令行参数控制过滤阈值。例如，要求有图 100% 正确，无图正确率低于 25%：
+### 第三步：下载示例数据
 
 ```bash
-# 安装依赖
-pip install open-dataflow vllm
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步：配置参数
+
+配置模型路径和过滤阈值（例如，要求有图 100% 正确，无图正确率不高于 25%）：
+
+```python
+if __name__ == "__main__":
+    pipe = VisualOnlyMCQPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
+    )
+    pipe.forward()
 
 ```
 
-### 第四步：一键运行
+> **⚠️ 模型路径配置的重要提示（以 `Qwen2.5-VL-3B-Instruct` 为例）：**
+> * **如果您已经下载好了模型文件**：请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`，否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型（需要自动下载）**：请一定要指定 `download_dir` 参数，并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**（正如上方示例代码所示；注意函数签名中的默认值 `./ckpt/models` 并不满足该要求），否则下载完成后同样会导致框架无法识别模型。
+> 
+> 
+
+### 第五步：一键运行
 
 ```bash
-python visual_mcq_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_file "data/captions.jsonl" \
-  --rotate_num 4 \
-  --pass_vis 1.0 \
-  --pass_txt 0.25
+cd gpu_pipelines
+python image_visual_only_mcq_pipeline.py
 
 ```
 
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> **问题 1：** 如果遇到类似如下的动态链接库冲突报错：
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **解决方法：** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`：
+> ```bash
+> LD_LIBRARY_PATH="" python image_visual_only_mcq_pipeline.py
+> 
+> ```
+> 
+> 
+> **问题 2：** 如果您使用的是 **Qwen 系列模型**，并且遇到以下报错：
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **解决方法：** 打开模型文件夹下的 `config.json` 文件，找到 `rope_scaling` 配置块，将 `"type"` 字段修改为 `"rope_type"` 即可。
+> **修改前：**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+> **修改后：**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+
 ---
 
 ## 3. 数据流与流水线逻辑
@@ -98,13 +166,12 @@ python visual_mcq_pipeline.py \
 
 1. **选项旋转**：随机打乱选项顺序（例如将答案从 A 换到 C），防止模型通过“总是选 A”来作弊。
 2. **有图推理 (Visual Pass)**：输入图像 + 题目。记录模型答对的比例。
-3. **无图推理 (Textual Pass)**：仅输入题目（无图像）。记录模型盲猜对的比例。
+3. **无图推理 (Textual Pass)**：仅输入题目（无图像进行盲测）。记录模型盲猜对的比例。
 4. **过滤判据**：
+
 * 保留题目，当且仅当：`Visual_Acc >= pass_visual_min` **且** `Textual_Acc <= pass_textual_max`。
 * *示例*：如果一道题不看图也能答对（无图准确率高），说明它考的是常识而非视觉，**剔除**。
 
-
-
 ### 3. **输出数据**
 
 输出数据 (`final_mcqs`) 仅包含通过了严苛验证的题目。这些题目具有极高的质量和视觉相关性。
@@ -132,12 +199,10 @@ python visual_mcq_pipeline.py \
 
 ## 4. 流水线示例
 
-以下是完整的 `VisualOnlyMCQPipeline` 代码实现。
+以下是完整的 `VisualOnlyMCQPipeline` 代码实现 (GPU 版本)。
 
 ```python
 import argparse
-import re
-from typing import List, Dict, Any
 from dataflow.utils.storage import FileStorage
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
 
@@ -145,13 +210,14 @@ from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDepend
 from dataflow.operators.core_text import FunctionalRefiner
 from dataflow.prompts.image import ImageCaprlPrompt
 
-# 正则解析逻辑
+import re
+from typing import List, Dict, Any
+
 _Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
 _OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
 _ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
 
 def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
-    """将 VLM 生成的原始文本解析为结构化字典列表"""
     if not mcq_text or not isinstance(mcq_text, str): return []
     
     indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
@@ -213,7 +279,9 @@ class VisualOnlyMCQPipeline:
         model_path: str,
         *,
         first_entry_file: str,
-        cache_path: str = "./cache_mcq",
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
+        cache_path: str = "../cache/cache_mcq",
         file_name_prefix: str = "vis_mcq",
         # Config
         rotate_num: int = 4,
@@ -227,7 +295,6 @@ class VisualOnlyMCQPipeline:
         device: str = "cuda",
         vllm_max_tokens: int = 2048
     ):
-        # 1. 初始化存储
         self.storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
@@ -235,15 +302,16 @@ class VisualOnlyMCQPipeline:
             cache_type="jsonl"
         )
         
-        # 2. 初始化 VLM 服务
         self.serving = LocalModelVLMServing_vllm(
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
             hf_model_name_or_path=model_path,
             vllm_tensor_parallel_size=1,
-            vllm_temperature=0.1,  # 低温度以保证格式稳定
+            vllm_temperature=0.1, 
             vllm_max_tokens=vllm_max_tokens
         )
         
-        # Keys 配置
+        # Keys
         self.keys = {
             "img": input_image_key,
             "raw_text": "raw_mcq_text",
@@ -251,23 +319,24 @@ class VisualOnlyMCQPipeline:
             "final": output_key
         }
         
-        # 加载 Prompt 库
+        # --- Prompts ---
         self.prompts_db = ImageCaprlPrompt().build_prompt()
 
-        # ================== 算子初始化 ==================
+        # ================== Operators ==================
         
-        # 算子 1: 生成原始 MCQ 文本
+        # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+        # 直接使用 prompt 类中的字符串
         self.op_gen_raw = FixPromptedVQAGenerator(
             serving=self.serving,
             system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
             user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
         )
         
-        # 算子 2: 解析文本为结构化数据
+        # 2. Parse MCQs (Refine)
         self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
         
-        # 算子 3: 视觉依赖性验证 (核心过滤)
-        # 包含旋转 (Rotation) 和 无图检测 (Text-only check)
+        # 3. Verify Visual Dependency (Refine)
+        # 传入 prompt 模板
         self.op_verify = VisualDependencyRefiner(
             serving=self.serving,
             instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
@@ -304,23 +373,15 @@ class VisualOnlyMCQPipeline:
         print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--rotate_num", type=int, default=4)
-    parser.add_argument("--pass_vis", type=float, default=1.0)
-    parser.add_argument("--pass_txt", type=float, default=0.25)
-    
-    args = parser.parse_args()
-    
     pipe = VisualOnlyMCQPipeline(
-        model_path=args.model_path,
-        first_entry_file=args.input_file,
-        rotate_num=args.rotate_num,
-        pass_visual_min=args.pass_vis,
-        pass_textual_max=args.pass_txt
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
     )
     pipe.forward()
 
-```
-
+```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
new file mode 100644
index 00000000..55f74fcc
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline_api.md
@@ -0,0 +1,339 @@
+---
+title: 视觉依赖 MCQ 生成流水线（API版）
+createTime: 2026/01/11 22:13:45
+icon: mdi:image-text
+permalink: /zh/mm_guide/image_visual_only_mcq_pipeline_api/
+---
+## 1. 概述
+
+**视觉依赖 MCQ 生成流水线 (Visual-Only MCQ Pipeline)** 是 CapRL (Caption Reinforcement Learning) 框架中的核心组件。它的目标是生成一组高质量的多项选择题 (MCQ)，且这些题目必须满足**强视觉依赖性**：即模型必须“看”图才能答对，仅凭文本（猜题或常识）无法作答。
+
+该流水线通过**生成-解析-验证**三步法，利用**选项旋转 (Rotation)** 和**无图盲测 (Blind Test)** 机制，严格过滤掉模型幻觉或过于简单的题目。生成的题目可作为强化学习的奖励信号（Reward Model）。
+
+主要流程包括：
+
+1. **MCQ 生成**：VLM 基于图像生成原始的问答对文本。
+2. **结构化解析**：利用正则逻辑将文本解析为标准的题目与选项结构。
+3. **视觉依赖验证**：
+* **旋转测试**：多次打乱选项顺序，消除位置偏见。
+* **双重过滤**：要求“有图答对率”高，“无图答对率”低。
+
+---
+
+## 2. 快速开始
+
+### 第一步：创建工作目录
+
+```bash
+mkdir run_vis_mcq
+cd run_vis_mcq
+
+```
+
+### 第二步：初始化 DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+这时你会看到：
+
+```bash
+api_pipelines/image_visual_only_mcq_api_pipeline.py
+
+```
+
+### 第三步：下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步：配置 API Key
+
+在 `api_pipelines/image_visual_only_mcq_api_pipeline.py` 中设置 API Key 环境变量：
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### 第五步：配置参数
+
+在 `api_pipelines/image_visual_only_mcq_api_pipeline.py` 中配置过滤阈值，例如，要求有图 100% 正确，无图正确率不高于 25%：
+
+```python
+    pipe = VisualOnlyMCQPipeline(
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
+    )
+
+```
+
+### 第六步：一键运行
+
+```bash
+cd api_pipelines
+python image_visual_only_mcq_api_pipeline.py
+
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+输入仅需包含图像路径：
+
+* **image**：图像文件路径。
+
+**输入数据示例**：
+
+```json
+{
+    "image": "./images/sample_01.jpg"
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+该流水线由三个关键算子串联而成：
+
+#### A. **FixPromptedVQAGenerator（原始生成）**
+
+* **功能**：使用 CapRL 预设的 Prompt 模板（`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`），让 VLM 一次性生成 5 道 MCQ。
+* **输出**：包含多个 `#### Question` 和选项的非结构化文本块。
+
+#### B. **FunctionalRefiner（正则解析）**
+
+* **逻辑函数**：`parse_mcq_text_logic`
+* **功能**：利用正则表达式从原始文本中提取题目、选项（A-F）和正确答案。
+* **输出**：结构化的 MCQ 列表 (`parsed_mcq_list`)。
+
+#### C. **VisualDependencyRefiner（依赖性验证）**
+
+这是本流水线的核心过滤器。它对每道题进行 N 次推理（N = `rotate_num`）：
+
+1. **选项旋转**：随机打乱选项顺序（例如将答案从 A 换到 C），防止模型通过“总是选 A”来作弊。
+2. **有图推理 (Visual Pass)**：输入图像 + 题目。记录模型答对的比例。
+3. **无图推理 (Textual Pass)**：仅输入题目（无图像进行盲测）。记录模型盲猜对的比例。
+4. **过滤判据**：
+
+* 保留题目，当且仅当：`Visual_Acc >= pass_visual_min` **且** `Textual_Acc <= pass_textual_max`。
+* *示例*：如果一道题不看图也能答对（无图准确率高），说明它考的是常识而非视觉，**剔除**。
+
+### 3. **输出数据**
+
+输出数据 (`final_mcqs`) 仅包含通过了严苛验证的题目。这些题目具有极高的质量和视觉相关性。
+
+**输出数据示例**：
+
+```json
+{
+    "image": "./images/sample_01.jpg",
+    "final_mcqs": [
+        {
+            "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
+            "answer": "A",
+            "stats": {
+                "visual_acc": 1.0,  # 4次全对
+                "text_acc": 0.0     # 盲猜全错
+            }
+        }
+    ]
+}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `VisualOnlyMCQPipeline` 代码实现 (API 版本)。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+import argparse
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.image import ImageCaprlPrompt
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+import re
+from typing import List, Dict, Any
+
+_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
+_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
+_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
+
+def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
+    if not mcq_text or not isinstance(mcq_text, str): return []
+    
+    indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
+    if not indices: return []
+    indices.append(len(mcq_text))
+    blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)]
+    
+    parsed = []
+    for block in blocks:
+        lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()]
+        q_title_m = _Q_BLOCK_SPLIT.search(block)
+        if not q_title_m: continue
+        
+        q_title = q_title_m.group(1).strip()
+        options = {}
+        ans_letter, ans_text = None, None
+        
+        for ln in lines:
+            m_opt = _OPT_LINE_RE.match(ln)
+            if m_opt:
+                options[m_opt.group(1)] = m_opt.group(2).strip()
+                continue
+            m_ans = _ANS_LINE_RE.match(ln)
+            if m_ans:
+                ans_letter = m_ans.group(1).upper()
+                ans_text = m_ans.group(2).strip()
+                break
+        
+        if options and ans_letter and ans_letter in options:
+            q_lines = [q_title]
+            for lbl in ["A", "B", "C", "D", "E", "F"]:
+                if lbl in options:
+                    q_lines.append(f"   - {lbl}) {options[lbl]}")
+            
+            parsed.append({
+                "question": "\n".join(q_lines),
+                "question_title": q_title,
+                "options": options,
+                "answer": ans_letter,
+                "answer_text": ans_text
+            })
+            
+    if expected > 0:
+        parsed = parsed[:expected]
+        
+    uniq = []
+    seen = set()
+    for it in parsed:
+        key = (it["question_title"], it["answer"])
+        if key not in seen:
+            seen.add(key)
+            uniq.append(it)
+    return uniq
+
+
+class VisualOnlyMCQPipeline:
+    def __init__(
+        self,
+        *,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_mcq",
+        file_name_prefix: str = "vis_mcq",
+        # Config
+        rotate_num: int = 4,
+        pass_visual_min: float = 1.0,
+        pass_textual_max: float = 0.25,
+        add_none_above: bool = True,
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_mcqs",
+        # VLLM
+        vllm_max_tokens: int = 2048
+    ):
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+
+        
+        # Keys
+        self.keys = {
+            "img": input_image_key,
+            "raw_text": "raw_mcq_text",
+            "parsed_list": "parsed_mcq_list",
+            "final": output_key
+        }
+        
+        # --- Prompts ---
+        self.prompts_db = ImageCaprlPrompt().build_prompt()
+
+        # ================== Operators ==================
+        
+        # 1. Generate Raw MCQs (FixPromptedVQAGenerator)
+        # 直接使用 prompt 类中的字符串
+        self.op_gen_raw = FixPromptedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
+            user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
+        )
+        
+        # 2. Parse MCQs (Refine)
+        self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
+        
+        # 3. Verify Visual Dependency (Refine)
+        # 传入 prompt 模板
+        self.op_verify = VisualDependencyRefiner(
+            serving=self.vlm_serving,
+            instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
+            rotate_num=rotate_num,
+            pass_visual_min=pass_visual_min,
+            pass_textual_max=pass_textual_max,
+            add_none_above_visual=add_none_above
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...")
+        self.op_gen_raw.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_text"]
+        )
+        
+        print(">>> [Pipeline] Step 2: Parsing MCQs...")
+        self.op_parse.run(
+            self.storage.step(),
+            output_key=self.keys["parsed_list"],
+            mcq_text=self.keys["raw_text"], 
+            expected=5
+        )
+        
+        print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...")
+        self.op_verify.run(
+            self.storage.step(),
+            input_list_key=self.keys["parsed_list"],
+            input_image_key=self.keys["img"],
+            output_key=self.keys["final"]
+        )
+        
+        print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
+
+if __name__ == "__main__":
+    pipe = VisualOnlyMCQPipeline(
+        first_entry_file="../example_data/capsbench_images/image_visual_only_mcq_demo.jsonl",
+        rotate_num=4,
+        pass_visual_min=1.0,
+        pass_textual_max=0.25
+    )
+    pipe.forward()
+
+```
diff --git a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
index ee038567..fa803660 100644
--- a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
@@ -35,30 +35,100 @@ cd run_mcts_reasoning
 
 ```
 
-### 第二步：准备脚本
+### 第二步：初始化 DataFlow-MM
 
-将下文“流水线示例”中的代码保存为 `vision_mcts_pipeline.py`。
+```bash
+dataflowmm init
 
-### 第三步：配置运行参数
+```
 
-确保输入文件（jsonl）包含 `tree` 字段（用于提取）或仅包含 `question/image`（用于生成）。
+这时你会看到：
 
 ```bash
-# 安装依赖
-pip install open-dataflow vllm
+gpu_pipelines/vision_mcts_pipeline.py
 
 ```
 
-### 第四步：一键运行
+### 第三步：下载示例数据
 
 ```bash
-python vision_mcts_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --input_file "data/mcts_trajectories.jsonl" \
-  --prompt_type "spatial"
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 
 ```
 
+### 第四步：配置参数
+
+确保输入文件（jsonl）包含 `tree` 字段（用于提取）或仅包含 `question/image`（用于生成）：
+
+```python
+if __name__ == "__main__":
+    pipe = VisionMCTSReasoningPipeline(
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
+    )
+    pipe.forward()
+
+```
+
+> **⚠️ 模型路径配置的重要提示（以 `Qwen2.5-VL-3B-Instruct` 为例）：**
+> * **如果您已经下载好了模型文件**：请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`，否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型（需要自动下载）**：请一定要指定 `download_dir` 参数，并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**（正如默认参数所示），否则下载完成后同样会导致框架无法识别模型。
+> 
+> 
+
+### 第五步：一键运行
+
+```bash
+cd gpu_pipelines
+python vision_mcts_pipeline.py
+
+```
+
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> **问题 1：** 如果遇到类似如下的动态链接库冲突报错：
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> **解决方法：** 这通常是环境变量干扰导致的。请在运行命令前清空 `LD_LIBRARY_PATH`：
+> ```bash
+> LD_LIBRARY_PATH="" python vision_mcts_pipeline.py
+> 
+> ```
+> 
+> 
+> **问题 2：** 如果您使用的是 **Qwen 系列模型**，并且遇到以下报错：
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> **解决方法：** 打开模型文件夹下的 `config.json` 文件，找到 `rope_scaling` 配置块，将 `"type"` 字段修改为 `"rope_type"` 即可。
+> **修改前：**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+> **修改后：**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> 
+> ```
+> 
+> 
+
 ---
 
 ## 3. 数据流与流水线逻辑
@@ -125,10 +195,9 @@ python vision_mcts_pipeline.py \
 
 ## 4. 流水线示例
 
-以下是完整的 `VisionMCTSReasoningPipeline` 代码实现。
+以下是完整的 `VisionMCTSReasoningPipeline` 代码实现 (GPU 版本)。
 
 ```python
-import argparse
 from dataflow.utils.storage import FileStorage
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
 
@@ -142,8 +211,10 @@ class VisionMCTSReasoningPipeline:
         model_path: str,
         *,
         # Storage
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
         first_entry_file: str,
-        cache_path: str = "./cache_mcts",
+        cache_path: str = "../cache/cache_mcts",
         file_name_prefix: str = "mcts_reason",
         # Config
         prompt_type: str = "spatial",
@@ -156,7 +227,6 @@ class VisionMCTSReasoningPipeline:
         # VLLM
         vllm_max_tokens: int = 1024
     ):
-        # 1. 存储初始化
         self.storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
@@ -164,8 +234,9 @@ class VisionMCTSReasoningPipeline:
             cache_type="jsonl"
         )
         
-        # 2. 模型服务
         self.serving = LocalModelVLMServing_vllm(
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
             hf_model_name_or_path=model_path,
             vllm_tensor_parallel_size=1,
             vllm_temperature=0.7,
@@ -176,20 +247,18 @@ class VisionMCTSReasoningPipeline:
             "q": input_question_key,
             "img": input_image_key,
             "tree": input_tree_key,
-            "mcts_chains": "mcts_extracted_chains", # 中间结果
+            "mcts_chains": "mcts_extracted_chains",
             "final": output_key
         }
 
         # ================== Operators ==================
         
-        # 算子 1: MCTS Tree -> Chains (提取器)
-        # 负责将树结构扁平化为线性链
+        # 1. Refiner: MCTS -> Chains
         self.op_mcts_refine = MCTSTreeRefiner(
             max_chains_per_sample=max_samples_per_file
         )
         
-        # 算子 2: VLM -> Chains (生成器/Fallback)
-        # 如果 MCTS 提取失败，则使用 VLM 生成；如果成功，则跳过
+        # 2. Generator: VLM -> Chains (Fallback)
         self.op_vlm_gen = VisualReasoningGenerator(
             serving=self.serving,
             prompt_type=prompt_type
@@ -204,7 +273,8 @@ class VisionMCTSReasoningPipeline:
         )
         
         print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
-        # 注意：input_existing_chains_key 实现了混合/回退逻辑
+        # 将 mcts_chains 作为 input_existing_chains_key 传入
+        # 如果 MCTS 解析成功，则复用；否则调用 VLM 生成
         self.op_vlm_gen.run(
             self.storage.step(),
             input_question_key=self.keys["q"],
@@ -215,16 +285,12 @@ class VisionMCTSReasoningPipeline:
         
         
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--prompt_type", default="spatial")
-    args = parser.parse_args()
-    
     pipe = VisionMCTSReasoningPipeline(
-        model_path=args.model_path,
-        first_entry_file=args.input_file,
-        prompt_type=args.prompt_type
+        model_path="Qwen/Qwen2.5-VL-3B-Instruct",
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+        hf_cache_dir="~/.cache/huggingface",
+        download_dir="../ckpt/models/Qwen2.5-VL-3B-Instruct",
     )
     pipe.forward()
 
diff --git a/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
new file mode 100644
index 00000000..1c391d8b
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline_api.md
@@ -0,0 +1,248 @@
+---
+title: 视觉 MCTS 推理链生成流水线（API版）
+icon: mdi:image-text
+createTime: 2026/01/11 21:59:59
+permalink: /zh/mm_guide/vision_mct_reasoning_pipeline_api/
+---
+
+## 1. 概述
+
+**视觉 MCTS 推理链生成流水线 (Vision MCTS Reasoning Pipeline)** 旨在为多模态大模型构建高质量的**过程监督数据（Process Supervision Data）**。该流水线能够处理两种数据来源：已有的蒙特卡洛树搜索（MCTS）轨迹数据，或直接利用 VLM 生成新的推理链。
+
+该流水线是 **Grounded-RL** 和 **SFT 数据构建**的核心工具，它将复杂的树状搜索过程“线性化”为模型可学习的 `<think>...</think><answer>...</answer>` 格式。
+
+我们支持以下应用场景：
+
+* **从 MCTS 树提取数据**：将搜索树中高价值的路径（Rollouts）转化为线性训练数据。
+* **混合数据构建**：对于没有搜索树的样本，自动回退到使用 VLM 进行 CoT 生成。
+* **空间推理增强**：支持生成包含显式坐标（Bounding Box）的空间推理链。
+
+流水线的主要流程包括：
+
+1. **MCTS 树解析**：解析输入数据中的搜索树结构，提取成功的推理路径。
+2. **视觉推理生成 (Fallback)**：对于缺失树结构或解析失败的样本，利用 VLM 重新生成推理链。
+3. **数据标准化**：输出统一格式的推理链数据。
+
+---
+
+## 2. 快速开始
+
+### 第一步：准备工作目录
+
+```bash
+mkdir run_mcts_reasoning
+cd run_mcts_reasoning
+
+```
+
+### 第二步：初始化 DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+这时你会看到：
+
+```bash
+api_pipelines/vision_mcts_api_pipeline.py
+
+```
+
+### 第三步：下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+
+```
+
+### 第四步：配置 API Key
+
+在 `api_pipelines/vision_mcts_api_pipeline.py` 中设置 API Key 环境变量：
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### 第五步：配置参数
+
+配置 API 服务和输入数据路径。确保输入文件（jsonl）包含 `tree` 字段（用于提取）或仅包含 `question/image`（用于生成）：
+
+```python
+    pipe = VisionMCTSReasoningPipeline(
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+    )
+
+```
+
+### 第六步：一键运行
+
+```bash
+cd api_pipelines
+python vision_mcts_api_pipeline.py
+
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+输入数据通常来源于 MCTS 搜索过程的日志，或未标注的图文对：
+
+* **image**：图像路径。
+* **question**：视觉问题。
+* **tree**（可选）：MCTS 搜索树的 JSON 结构，包含节点值（Value）、访问次数（Visits）和动作（Actions）。
+
+**输入数据示例**：
+
+```json
+{
+    "image": "./images/puzzle.jpg",
+    "question": "What is the next step to solve this?",
+    "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } }
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+该流水线采用 **“提取优先，生成兜底”** 的混合策略：
+
+#### A. **MCTSTreeRefiner（树结构解析器）**
+
+该算子负责处理 `tree` 字段。它遍历树结构，根据节点价值（Q-value）筛选出从根节点到叶子节点的最佳路径。
+
+* **输入**：`tree` 对象。
+* **功能**：线性化树路径，过滤掉低价值或未完成的搜索分支。
+* **输出**：提取出的推理链列表（`mcts_chains`）。
+
+#### B. **VisualReasoningGenerator（视觉推理生成器）**
+
+该算子是流水线的“生成引擎”。它接收上一步的提取结果作为输入。
+
+* **机制**：检查 `input_existing_chains_key`（即 `mcts_chains`）。
+  * 如果 MCTS 解析成功（链存在），则直接复用，不进行推理（节省计算资源）。
+  * 如果 MCTS 链为空（树不存在或解析失败），则调用 VLM，根据 `prompt_type`（如 `spatial`）从头生成推理链。
+
+
+* **Prompt 类型**：支持 `spatial`（空间坐标推理）、`logical`（逻辑推理）等模式。
+
+### 3. **输出数据**
+
+最终生成的输出数据（`final_reasoning_chains`）将包含高质量的思维链，可直接用于 SFT 训练。
+
+**输出示例**：
+
+```json
+{
+    "image": "./images/puzzle.jpg",
+    "final_reasoning_chains": [
+        "<think>First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...</think><answer>Move Red Block</answer>"
+    ]
+}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `VisionMCTSReasoningPipeline` 代码实现 (API 版本)。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "sk-xxxx"
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+# 引入原子算子
+from dataflow.operators.core_text import MCTSTreeRefiner
+from dataflow.operators.core_vision import VisualReasoningGenerator
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+
+class VisionMCTSReasoningPipeline:
+    def __init__(
+        self,
+        first_entry_file: str,
+        cache_path: str = "../cache/cache_mcts",
+        file_name_prefix: str = "mcts_reason",
+        # Config
+        prompt_type: str = "spatial",
+        max_samples_per_file: int = 10000,
+        # Keys
+        input_question_key: str = "question",
+        input_image_key: str = "image",
+        input_tree_key: str = "tree",
+        output_key: str = "final_reasoning_chains",
+
+    ):
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+        
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+        
+        self.keys = {
+            "q": input_question_key,
+            "img": input_image_key,
+            "tree": input_tree_key,
+            "mcts_chains": "mcts_extracted_chains",
+            "final": output_key
+        }
+
+        # ================== Operators ==================
+        
+        # 1. Refiner: MCTS -> Chains
+        self.op_mcts_refine = MCTSTreeRefiner(
+            max_chains_per_sample=max_samples_per_file
+        )
+        
+        # 2. Generator: VLM -> Chains (Fallback)
+        self.op_vlm_gen = VisualReasoningGenerator(
+            serving=self.vlm_serving,
+            prompt_type=prompt_type
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...")
+        self.op_mcts_refine.run(
+            self.storage.step(),
+            input_tree_key=self.keys["tree"],
+            output_key=self.keys["mcts_chains"]
+        )
+        
+        print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
+        # 将 mcts_chains 作为 input_existing_chains_key 传入
+        # 如果 MCTS 解析成功，则复用；否则调用 VLM 生成
+        self.op_vlm_gen.run(
+            self.storage.step(),
+            input_question_key=self.keys["q"],
+            input_image_key=self.keys["img"],
+            input_existing_chains_key=self.keys["mcts_chains"],
+            output_key=self.keys["final"]
+        )
+        
+        
+if __name__ == "__main__":
+    pipe = VisionMCTSReasoningPipeline(
+        first_entry_file="../example_data/capsbench_images/visual_mct_reasoning_demo.jsonl",
+        prompt_type="spatial",
+    )
+    pipe.forward()
+
+```