From 0d74b070f585ac7314fc27c15e00788bef9b029c Mon Sep 17 00:00:00 2001
From: chawuciren11 <2216740116@qq.com>
Date: Wed, 11 Feb 2026 05:20:40 +0800
Subject: [PATCH 1/7] region-caption

---
 docs/.vuepress/notes/en/mm_guide.ts           |   1 +
 docs/.vuepress/notes/zh/mm_guide.ts           |   1 +
 .../image_region_caption_pipeline.md          | 145 ++++------
 .../image_region_caption_pipeline_api.md      | 266 ++++++++++++++++++
 .../generate/image_bbox_generator.md          |  51 ++--
 .../image_region_caption_pipeline.md          | 146 ++++------
 .../image_region_caption_pipeline_api.md      | 257 +++++++++++++++++
 .../generate/image_bbox_generator.md          |  52 ++--
 8 files changed, 683 insertions(+), 236 deletions(-)
 create mode 100644 docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
 create mode 100644 docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md

diff --git a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts
index 06a5d0ab..82ad44cf 100644
--- a/docs/.vuepress/notes/en/mm_guide.ts
+++ b/docs/.vuepress/notes/en/mm_guide.ts
@@ -28,6 +28,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({
                 'image_gcot',
                 'vision_mct_reasoning_pipeline',
                 'image_region_caption_pipeline',
+                'image_region_caption_pipeline_api',
                 'image_scale_caption_pipeline',
                 'image_visual_only_mcq_pipeline',
             ],
diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts
index 355c1e0d..ff3e3ca3 100644
--- a/docs/.vuepress/notes/zh/mm_guide.ts
+++ b/docs/.vuepress/notes/zh/mm_guide.ts
@@ -28,6 +28,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({
                 'image_gcot',
                 'vision_mct_reasoning_pipeline',
                 'image_region_caption_pipeline',
+                'image_region_caption_pipeline_api',
                 'image_scale_caption_pipeline',
                 'image_visual_only_mcq_pipeline',
             ],
diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index a033dcf1..d35b7385 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -1,12 +1,12 @@
 ---
-title: Image Region Captioning Pipeline
+title: Image Region Caption Pipeline
 createTime: 2026/01/11 22:04:27
 icon: mdi:image-text
 permalink: /en/mm_guide/image_region_caption_pipeline/
 ---
 ## 1. Overview
 
-The **Image Region Captioning Pipeline** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them.
+The **Image Region Caption Pipeline** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them.
 
 This pipeline supports processing **pre-defined Bounding Box** data, visualizing these boxes, and then feeding them into a VLM for caption generation.
 
@@ -26,25 +26,31 @@ The main process of the pipeline includes:
 
 ## 2. Quick Start
 
-### Step 1: Create a Working Directory
+### Step 1: Create a New DataFlow Working Directory
 
 ```bash
-mkdir run_region_caption
-cd run_region_caption
+mkdir run_dataflow
+cd run_dataflow
 
 ```
 
-### Step 2: Prepare the Script
+### Step 2: Initialize DataFlow-MM
 
-Save the code in the "Pipeline Example" section below as `region_caption_pipeline.py`.
+```bash
+dataflowmm init
 
-### Step 3: Configure Parameters
+```
 
-Ensure the input file (jsonl) contains `image` and `bbox` fields.
+You will then see:
 
 ```bash
-# Install dependencies
-pip install open-dataflow vllm
+api_pipelines/image_region_caption_api_pipeline.py
+```
+
+### Step 3: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
 
 ```
 
@@ -53,9 +59,6 @@ pip install open-dataflow vllm
 ```bash
 python region_caption_pipeline.py \
   --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --first_entry_file "data/region_captions.jsonl" \
-  --output_jsonl_path "data/results.jsonl"
-
 ```
 
 ---
@@ -64,16 +67,16 @@ python region_caption_pipeline.py \
 
 ### 1. **Input Data**
 
-The input data typically contains the image path and a list of corresponding bounding boxes:
+The input data typically contains the image path and a list of corresponding bounding boxes (optional):
 
 * **image**: Path to the image file.
-* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` or `[[x1, y1, x2, y2], ...]` format (depending on configuration).
+* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` format.
 
 **Input Data Example**:
 
 ```json
 {
-    "image": "./images/kitchen.jpg",
+    "image": "./data/image_region_caption/20.jpg",
     "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
 }
 
@@ -89,43 +92,50 @@ This operator handles the vision-level tasks.
 
 * **Input**: Raw image + `bbox` data.
 * **Functionality**: Reads bounding boxes and draws them onto the image (visualization) or preprocesses them according to configuration.
-* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls parameters like `max_boxes` and visualization options (`draw_visualization`).
-* **Output**: Generates a new image path containing visual markers (`image_with_bbox`).
+* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls the maximum number of bounding boxes and the input/output paths.
+* **Output**: JSON output path for the new image with visual markers.
 
 #### B. **PromptedVQAGenerator**
 
 This operator is responsible for generating text using the VLM.
 
-* **Input**: The `image_with_bbox` generated in the previous step.
+* **Input**: The result generated in the previous step.
 * **Functionality**: The VLM receives the marked image and generates descriptions for the corresponding regions based on prompts.
 * **Output**: Region description text.
 
 ### 3. **Output Data**
 
-The final output data will contain the processed image path and the generated descriptions:
+The final generated output data includes the processed image path and the generated descriptions:
 
-* **image_with_bbox**: Path to the image with drawn boxes.
-* **mdvp_record**: List of generated region descriptions.
+* **image**: The input image path.
+* **type**: Indicates whether a bounding box is provided.
+* **bbox**: Bounding box parameters.
+* **normalized_bbox**: Normalized bounding box parameters.
+* **result_file**: The output path for the results.
+* **image_with_bbox**: Path to the image with drawn bounding boxes.
+* **valid_bboxes_num**: The number of valid bounding boxes.
+* **prompt**: The prompt received by the VLM.
+* **answer**: The generated region description text (one string covering all marked regions).
 
 **Output Data Example**:
 
 ```json
 {
-    "image": "./images/kitchen.jpg",
-    "image_with_bbox": "./images/kitchen_visualized.jpg",
-    "mdvp_record": [
-        "A wooden chair located near the table.",
-        "A white refrigerator in the background."
-    ]
+    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "bbox":[[196,104,310,495]],
+    "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
+    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "valid_bboxes_num":1,
+    "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
+    "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
 }
-
 ```
 
 ---
 
 ## 4. Pipeline Example
 
-Below is the complete `ImageRegionCaptioningPipeline` code implementation.
+Below is the complete `ImageRegionCaptionPipeline` code implementation.
 
 ```python
 import argparse
@@ -140,57 +150,42 @@ from dataflow.operators.core_vision.generate.prompted_vqa_generator import (
 from dataflow.utils.storage import FileStorage
 
 
-class ImageRegionCaptioningPipeline:
+class ImageRegionCaptionPipeline:
     def __init__(
         self,
         model_path: str,
         *,
         hf_cache_dir: str | None = None,
         download_dir: str = "./ckpt/models",
-        device: str = "cuda",
-        # Storage & Paths
-        first_entry_file: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
-        cache_path: str = "./dataflow/example/cache",
+        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "./cache/image_region_caption",
         file_name_prefix: str = "region_caption",
         cache_type: str = "jsonl",
-        # Keys
         input_image_key: str = "image",
         input_bbox_key: str = "bbox",
-        image_with_bbox_path: str = 'image_with_bbox', # Key for intermediate image
-        output_key: str = "mdvp_record",
-        # BBox Config
+        image_with_bbox_path: str = 'image_with_bbox',
         max_boxes: int = 10,
-        input_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
-        output_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl",
-        output_image_with_bbox_path: str = "./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl",
-        draw_visualization: bool = True
+        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
-        # 1. 初始化存储 (Storage)
-        # 用于 BBox 生成阶段的存储
         self.bbox_storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
             file_name_prefix=file_name_prefix,
             cache_type=cache_type
         )
-        
-        # 2. 配置 BBox 生成器
+
         self.cfg = ExistingBBoxDataGenConfig(
             max_boxes=max_boxes,
-            input_jsonl_path=input_jsonl_path,
+            input_jsonl_path=first_entry_file,
             output_jsonl_path=output_image_with_bbox_path,
         )
 
-        # 3. 初始化 Caption 阶段的存储
-        # 注意：这里接续了上一步的输出路径
         self.caption_storage = FileStorage(
             first_entry_file_name=output_image_with_bbox_path,
             cache_path=cache_path,
             file_name_prefix=file_name_prefix,
             cache_type=cache_type
         )
-
-        # 4. 初始化 VLM 服务
         self.serving = LocalModelVLMServing_vllm(
             hf_model_name_or_path=model_path,
             hf_cache_dir=hf_cache_dir,
@@ -200,75 +195,55 @@ class ImageRegionCaptioningPipeline:
             vllm_top_p=0.9,
             vllm_max_tokens=512,
         )
-
-        # 5. 初始化核心算子
         self.bbox_generator = ImageBboxGenerator(config=self.cfg)
-        self.caption_generator = PromptedVQAGenerator(serving=self.serving)
-        
+        self.caption_generator = PromptedVQAGenerator(serving=self.serving,)
         self.input_image_key = input_image_key
         self.input_bbox_key = input_bbox_key
-        self.output_key = output_key
-        self.image_with_bbox_path = image_with_bbox_path
+        self.image_with_bbox_path=image_with_bbox_path
+        self.bbox_record=None
 
     def forward(self):
-        # 步骤 1: 生成带 BBox 可视化的图像
-        print(">>> [Pipeline] Step 1: Processing BBoxes and Visualizing...")
         self.bbox_generator.run(
             storage=self.bbox_storage.step(),
             input_image_key=self.input_image_key,
             input_bbox_key=self.input_bbox_key,
-            output_key=self.image_with_bbox_path,
         )
 
-        # 步骤 2: 基于可视化图像生成描述
-        print(">>> [Pipeline] Step 2: Generating Region Captions...")
         self.caption_generator.run(
             storage=self.caption_storage.step(),
-            input_image_key='image_with_bbox' # 使用上一步生成的带框图像
+            input_image_key='image_with_bbox',
+            input_prompt_key='prompt'
         )
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Image region captioning with DataFlow")
- 
-    parser.add_argument("--model_path", default="/data0/happykeyan/Models/Qwen2.5-VL-3B-Instruct")
+    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
     parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
     parser.add_argument("--download_dir", default="./ckpt/models")
-    parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
-
-    parser.add_argument("--first_entry_file", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
-    parser.add_argument("--cache_path", default="./dataflow/example/cache")
+    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
+    parser.add_argument("--cache_path", default="./cache/image_region_caption")
     parser.add_argument("--file_name_prefix", default="region_caption")
     parser.add_argument("--cache_type", default="jsonl")
-    
     parser.add_argument("--input_image_key", default="image")
     parser.add_argument("--input_bbox_key", default="bbox")
-    parser.add_argument("--output_key", default="mdvp_record")
-
     parser.add_argument("--max_boxes", type=int, default=10)
-    parser.add_argument("--input_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
-    parser.add_argument("--output_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl")
-    parser.add_argument("--output_image_with_bbox_path", default="./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl")
-    parser.add_argument("--draw_visualization", type=bool, default=True)
+    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
 
     args = parser.parse_args()
 
-    pipe = ImageRegionCaptioningPipeline(
+    pipe = ImageRegionCaptionPipeline(
         model_path=args.model_path,
         hf_cache_dir=args.hf_cache_dir,
         download_dir=args.download_dir,
-        device=args.device,
         first_entry_file=args.first_entry_file,
         cache_path=args.cache_path,
         file_name_prefix=args.file_name_prefix,
         cache_type=args.cache_type,
         input_image_key=args.input_image_key,
         input_bbox_key=args.input_bbox_key,
-        output_key=args.output_key,
         max_boxes=args.max_boxes,
-        input_jsonl_path=args.input_jsonl_path,
-        output_image_with_bbox_path=args.output_image_with_bbox_path,
-        draw_visualization=args.draw_visualization
+        output_image_with_bbox_path=args.output_image_with_bbox_path
     )
     pipe.forward()
 
diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
new file mode 100644
index 00000000..2c40bdf9
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -0,0 +1,266 @@
+---
+title: Image Region Caption Pipeline (API version)
+createTime: 2026/01/11 22:04:27
+icon: mdi:image-text
+permalink: /en/mm_guide/image_region_caption_pipeline_api/
+---
+## 1. Overview
+
+The **Image Region Caption Pipeline (API version)** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them.
+
+This pipeline supports processing **pre-defined Bounding Box** data, visualizing these boxes, and then feeding them into a VLM for caption generation.
+
+We support the following application scenarios:
+
+* **Dense Captioning**: Generating descriptions for multiple objects within a single image.
+* **Fine-grained Image Understanding**: Focusing on local details rather than global descriptions.
+* **Dataset Augmentation**: Constructing image-text pair datasets that include localization information.
+
+The main process of the pipeline includes:
+
+1. **Data Loading**: Reading source data containing image paths and bounding box information.
+2. **BBox Processing & Visualization**: Processing input bounding boxes and generating a version of the image with visual markers (e.g., drawn boxes).
+3. **Region Caption Generation**: Using a VLM to generate text descriptions based on the marked images or specific regions.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Configure API Key
+
+Set your API Key environment variable in your script:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
+### Step 2: Create a New DataFlow Working Directory
+
+```bash
+mkdir run_dataflow
+cd run_dataflow
+
+```
+
+### Step 3: Initialize DataFlow-MM
+
+```bash
+dataflowmm init
+
+```
+
+You will then see:
+
+```bash
+api_pipelines/image_region_caption_api_pipeline.py
+```
+
+### Step 4: Download Sample Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+
+```
+
+### Step 5: Configure Parameters
+
+Configure the API service and input data paths in `api_pipelines/image_region_caption_api_pipeline.py`:
+
+```python
+self.vlm_serving = APIVLMServing_openai(
+    api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any OpenAI-compatible API platform
+    model_name="gpt-4o-mini",
+    image_io=None,
+    send_request_stream=False,
+    max_workers=10,
+    timeout=1800
+)
+
+```
+### Step 6: Run with One Command
+
+```bash
+python api_pipelines/image_region_caption_api_pipeline.py
+
+```
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data typically contains the image path and a list of corresponding bounding boxes (optional):
+
+* **image**: Path to the image file.
+* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` format.
+
+**Input Data Example**:
+
+```json
+{
+    "image": "./data/image_region_caption/20.jpg",
+    "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline chains two core operators to complete the task:
+
+#### A. **ImageBboxGenerator**
+
+This operator handles the vision-level tasks.
+
+* **Input**: Raw image + `bbox` data.
+* **Functionality**: Reads bounding boxes and draws them onto the image (visualization) or preprocesses them according to configuration.
+* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls the maximum number of bounding boxes and the input/output paths.
+* **Output**: JSON output path for the new image with visual markers.
+
+#### B. **PromptedVQAGenerator**
+
+This operator is responsible for generating text using the VLM.
+
+* **Input**: The result generated in the previous step.
+* **Functionality**: The VLM receives the marked image and generates descriptions for the corresponding regions based on prompts.
+* **Output**: Region description text.
+
+### 3. **Output Data**
+
+The final generated output data includes the processed image path and the generated descriptions:
+
+* **image**: The input image path.
+* **type**: Indicates whether a bounding box is provided.
+* **bbox**: Bounding box parameters.
+* **normalized_bbox**: Normalized bounding box parameters.
+* **result_file**: The output path for the results.
+* **image_with_bbox**: Path to the image with drawn bounding boxes.
+* **valid_bboxes_num**: The number of valid bounding boxes.
+* **prompt**: The prompt received by the VLM.
+* **answer**: The generated region description text (one string covering all marked regions).
+
+**Output Data Example**:
+
+```json
+{
+    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "bbox":[[196,104,310,495]],
+    "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
+    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "valid_bboxes_num":1,
+    "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
+    "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
+}
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete code implementation of the API version of `ImageRegionCaptionPipeline`.
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+import argparse
+from dataflow.operators.core_vision.generate.image_bbox_generator import (
+    ImageBboxGenerator, 
+    ExistingBBoxDataGenConfig
+)
+from dataflow.operators.core_vision.generate.prompted_vqa_generator import (
+    PromptedVQAGenerator
+)
+from dataflow.utils.storage import FileStorage
+
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+class ImageRegionCaptionPipeline:
+    def __init__(
+        self,
+        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "./cache/image_region_caption",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        image_with_bbox_path: str = 'image_with_bbox',
+        max_boxes: int = 10,
+        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
+    ):
+        self.bbox_storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+
+        self.cfg = ExistingBBoxDataGenConfig(
+            max_boxes=max_boxes,
+            input_jsonl_path=first_entry_file,
+            output_jsonl_path=output_image_with_bbox_path,
+        )
+
+        self.caption_storage = FileStorage(
+            first_entry_file_name=output_image_with_bbox_path,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+        self.bbox_generator = ImageBboxGenerator(config=self.cfg)
+        self.caption_generator = PromptedVQAGenerator(serving=self.vlm_serving,system_prompt="You are a helpful assistant.")
+        self.input_image_key = input_image_key
+        self.input_bbox_key = input_bbox_key
+        self.image_with_bbox_path=image_with_bbox_path
+        self.bbox_record=None
+
+    def forward(self):
+        self.bbox_generator.run(
+            storage=self.bbox_storage.step(),
+            input_image_key=self.input_image_key,
+            input_bbox_key=self.input_bbox_key
+        )
+
+        self.caption_generator.run(
+            storage=self.caption_storage.step(),
+            input_image_key='image_with_bbox',
+            input_prompt_key='prompt'
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
+    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
+    parser.add_argument("--cache_path", default="./cache/image_region_caption")
+    parser.add_argument("--file_name_prefix", default="region_caption")
+    parser.add_argument("--cache_type", default="jsonl")
+    parser.add_argument("--input_image_key", default="image")
+    parser.add_argument("--input_bbox_key", default="bbox")
+
+    parser.add_argument("--max_boxes", type=int, default=10)
+    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
+
+    args = parser.parse_args()
+
+    pipe = ImageRegionCaptionPipeline(
+        first_entry_file=args.first_entry_file,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        cache_type=args.cache_type,
+        input_image_key=args.input_image_key,
+        input_bbox_key=args.input_bbox_key,
+        max_boxes=args.max_boxes,
+        output_image_with_bbox_path=args.output_image_with_bbox_path,
+    )
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
index e930c005..0d6f0c57 100644
--- a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
+++ b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
@@ -49,8 +49,7 @@ def run(
     self, 
     storage: DataFlowStorage, 
     input_image_key: str = "image", 
-    input_bbox_key: str = "bbox", 
-    output_key: str = "mdvp_record"
+    input_bbox_key: str = "bbox"
 ):
     ...
 
@@ -90,7 +89,6 @@ Reads raw data from `config.input_jsonl_path`.
 | `storage` | `DataFlowStorage` | N/A | Storage object, mainly used to provide the `cache_path`. |
 | `input_image_key` | `str` | `"image"` | Field name for image paths in the input JSONL. |
 | `input_bbox_key` | `str` | `"bbox"` | Field name for BBox data in the input JSONL. |
-| `output_key` | `str` | `"mdvp_record"` | (Reserved) Key name for the output record. |
 
 ## 🧩 Example Usage
 
@@ -98,50 +96,41 @@ Reads raw data from `config.input_jsonl_path`.
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig
 
-# 1) Configure Parameters
-config = ExistingBBoxDataGenConfig(
-    max_boxes=5,
-    input_jsonl_path="./data/raw_images.jsonl",
-    output_jsonl_path="./data/processed_with_prompts.jsonl"
+config = ExistingBBoxDataGenConfig(
+    max_boxes=10,
+    input_jsonl_path="./data/image_region_caption/image_region_caption_demo.jsonl",
+    output_jsonl_path="./cache/image_region_caption/image_with_bbox_result.jsonl",
 )
-
-# 2) Initialize Operator
-# Note: This operator is for data prep and does not require a Serving instance
 generator = ImageBboxGenerator(config=config)
 
-# 3) Prepare Storage (Only for providing cache path)
 storage = FileStorage(
-    cache_path="./cache_vis_images",
-    file_name_prefix="bbox_gen"
+    first_entry_file_name="./data/image_region_caption/image_region_caption_demo.jsonl",
+    cache_path="./cache/image_region_caption",
+    file_name_prefix="region_caption",
+    cache_type="jsonl"
 )
 
-# 4) Execute Processing
-# Automatically reads from config input, writes to config output
 generator.run(
     storage=storage,
-    input_image_key="image_path",
-    input_bbox_key="ground_truth_bbox" # Will auto-extract if this column is missing
+    input_image_key="image",
+    input_bbox_key="bbox"
 )
-
 ```
 
 ### 🧾 Output Data Format (Output JSONL)
 
-Each line in the `output_jsonl_path` file contains:
+Each line in the `image_with_bbox_result.jsonl` file contains:
 
 ```json
 {
-  "image": "/data/raw/cat.jpg",
-  "type": "without_bbox", // or "with_bbox"
-  "bbox": [[100, 200, 50, 60], ...], // Raw pixel coords [x, y, w, h]
-  "normalized_bbox": [
-      [0.1, 0.2, 0.15, 0.26], 
-      [0.0, 0.0, 0.0, 0.0] // Zero-padded
-  ],
-  "result_file": "./cache_vis_images",
-  "image_with_bbox": "./cache_vis_images/1_bbox_vis.jpg", // Path to visualized image
-  "valid_bboxes_num": 1,
-  "prompt": "Describe the content of each marked region in the image. There are 1 regions: \<region1\> to \<region1\>."
+    "image": "./data/image_region_caption/20.png", 
+    "type": "with_bbox", 
+    "bbox": [[196, 104, 310, 495]], 
+    "normalized_bbox": [[0.128, 0.125, 0.329, 0.72], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], 
+    "result_file": "./cache/image_region_caption", 
+    "image_with_bbox": "./cache/image_region_caption\\2_bbox_vis.jpg", 
+    "valid_bboxes_num": 1, 
+    "prompt": "Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>."
 }
 
 ```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index 245a43db..b56ba441 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -1,12 +1,12 @@
 ---
-title: 图像区域描述生成流水线
+title: 图像区域描述生成流水线RegionCap
 createTime: 2026/01/11 22:04:27
 icon: mdi:image-text
 permalink: /zh/mm_guide/image_region_caption_pipeline/
 ---
 ## 1. 概述
 
-**图像区域描述生成流水线 (Image Region Captioning Pipeline)** 旨在为图像中的特定区域生成详细的文本描述。该流水线结合了计算机视觉的定位能力与多模态大模型的理解能力，能够识别图像中的感兴趣区域（ROI），并为其生成精确的自然语言标注。
+**图像区域描述生成流水线 (Image Region Caption Pipeline)** 旨在为图像中的特定区域生成详细的文本描述。该流水线结合了计算机视觉的定位能力与多模态大模型的理解能力，能够识别图像中的感兴趣区域（ROI），并为其生成精确的自然语言标注。
 
 该流水线支持处理**预定义边界框 (Bounding Box)** 数据，并将其可视化后输入 VLM 进行描述生成。
 
@@ -26,35 +26,31 @@ permalink: /zh/mm_guide/image_region_caption_pipeline/
 
 ## 2. 快速开始
 
-### 第一步：准备工作目录
-
+### 第一步：创建新的 DataFlow 工作文件夹
 ```bash
-mkdir run_region_caption
-cd run_region_caption
-
+mkdir run_dataflow
+cd run_dataflow
 ```
 
-### 第二步：准备脚本
-
-将下文“流水线示例”中的代码保存为 `region_caption_pipeline.py`。
-
-### 第三步：配置运行参数
-
-确保输入文件（jsonl）包含 `image` 和 `bbox` 字段。
-
+### 第二步：初始化 DataFlow-MM
+```bash
+dataflowmm init
+```
+这时你会看到：
 ```bash
-# 安装依赖
-pip install open-dataflow vllm
+api_pipelines/image_region_caption_api_pipeline.py
+```
 
+### 第三步：下载示例数据
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
 ```
 
 ### 第四步：一键运行
 
 ```bash
-python region_caption_pipeline.py \
+python api_pipelines/image_region_caption_api_pipeline.py \
   --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
-  --first_entry_file "data/region_captions.jsonl" \
-  --output_jsonl_path "data/results.jsonl"
 
 ```
 
@@ -64,16 +60,16 @@ python region_caption_pipeline.py \
 
 ### 1. **输入数据**
 
-输入数据通常包含图像路径和对应的边界框列表：
+输入数据通常包含图像路径和对应的边界框列表（可选）：
 
 * **image**：图像文件路径。
-* **bbox**：边界框坐标列表，通常格式为 `[[x, y, w, h], ...]` 或 `[[x1, y1, x2, y2], ...]`（取决于具体配置）。
+* **bbox**：边界框坐标列表，通常格式为 `[[x, y, w, h], ...]`。
 
 **输入数据示例**：
 
 ```json
 {
-    "image": "./images/kitchen.jpg",
+    "image": "./data/image_region_caption/20.jpg",
     "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
 }
 
@@ -89,34 +85,41 @@ python region_caption_pipeline.py \
 
 * **输入**：原始图像 + `bbox` 数据。
 * **功能**：读取边界框，将其绘制在图像上（可视化），或者根据配置进行预处理。
-* **配置 (`ExistingBBoxDataGenConfig`)**：控制最大框数量 (`max_boxes`) 和可视化选项 (`draw_visualization`)。
-* **输出**：生成带有视觉标记的新图像路径（`image_with_bbox`）。
+* **配置 (`ExistingBBoxDataGenConfig`)**：控制最大框数量 (`max_boxes`) 和输入输出路径。
+* **输出**：写入 `output_jsonl_path` 的 JSONL 文件，每条记录包含带视觉标记的图像路径（`image_with_bbox`）和提示词（`prompt`）。
 
 #### B. **PromptedVQAGenerator（VQA 生成器）**
 
 该算子负责利用 VLM 生成文本。
 
-* **输入**：上一步生成的 `image_with_bbox`。
+* **输入**：上一步的输出。
 * **功能**：VLM 接收带有标记的图像，根据提示生成对应区域的描述。
 * **输出**：区域描述文本。
 
 ### 3. **输出数据**
 
 最终生成的输出数据将包含处理后的图像路径和生成的描述：
-
+* **image**：输入的图片路径。
+* **type**：是否给定边界框。
+* **bbox**：边界框参数。
+* **normalized_bbox**：标准化后的边界框参数。
+* **result_file**：结果输出路径。
 * **image_with_bbox**：画了框的图像路径。
-* **mdvp_record**：生成的区域描述列表。
+* **valid_bboxes_num**：有效边界框数量。
+* **prompt**：VLM接收的提示词。
+* **answer**：VLM 生成的区域描述文本。
 
 **输出数据示例**：
 
 ```json
 {
-    "image": "./images/kitchen.jpg",
-    "image_with_bbox": "./images/kitchen_visualized.jpg",
-    "mdvp_record": [
-        "A wooden chair located near the table.",
-        "A white refrigerator in the background."
-    ]
+    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "bbox":[[196,104,310,495]],
+    "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
+    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "valid_bboxes_num":1,
+    "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
+    "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
 }
 
 ```
@@ -125,7 +128,7 @@ python region_caption_pipeline.py \
 
 ## 4. 流水线示例
 
-以下是完整的 `ImageRegionCaptioningPipeline` 代码实现。
+以下是完整的 `ImageRegionCaptionPipeline` 代码实现。
 
 ```python
 import argparse
@@ -140,57 +143,42 @@ from dataflow.operators.core_vision.generate.prompted_vqa_generator import (
 from dataflow.utils.storage import FileStorage
 
 
-class ImageRegionCaptioningPipeline:
+class ImageRegionCaptionPipeline:
     def __init__(
         self,
         model_path: str,
         *,
         hf_cache_dir: str | None = None,
         download_dir: str = "./ckpt/models",
-        device: str = "cuda",
-        # Storage & Paths
-        first_entry_file: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
-        cache_path: str = "./dataflow/example/cache",
+        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "./cache/image_region_caption",
         file_name_prefix: str = "region_caption",
         cache_type: str = "jsonl",
-        # Keys
         input_image_key: str = "image",
         input_bbox_key: str = "bbox",
-        image_with_bbox_path: str = 'image_with_bbox', # Key for intermediate image
-        output_key: str = "mdvp_record",
-        # BBox Config
+        image_with_bbox_path: str = 'image_with_bbox',
         max_boxes: int = 10,
-        input_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
-        output_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl",
-        output_image_with_bbox_path: str = "./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl",
-        draw_visualization: bool = True
+        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
-        # 1. 初始化存储 (Storage)
-        # 用于 BBox 生成阶段的存储
         self.bbox_storage = FileStorage(
             first_entry_file_name=first_entry_file,
             cache_path=cache_path,
             file_name_prefix=file_name_prefix,
             cache_type=cache_type
         )
-        
-        # 2. 配置 BBox 生成器
+
         self.cfg = ExistingBBoxDataGenConfig(
             max_boxes=max_boxes,
-            input_jsonl_path=input_jsonl_path,
+            input_jsonl_path=first_entry_file,
             output_jsonl_path=output_image_with_bbox_path,
         )
 
-        # 3. 初始化 Caption 阶段的存储
-        # 注意：这里接续了上一步的输出路径
         self.caption_storage = FileStorage(
             first_entry_file_name=output_image_with_bbox_path,
             cache_path=cache_path,
             file_name_prefix=file_name_prefix,
             cache_type=cache_type
         )
-
-        # 4. 初始化 VLM 服务
         self.serving = LocalModelVLMServing_vllm(
             hf_model_name_or_path=model_path,
             hf_cache_dir=hf_cache_dir,
@@ -200,75 +188,55 @@ class ImageRegionCaptioningPipeline:
             vllm_top_p=0.9,
             vllm_max_tokens=512,
         )
-
-        # 5. 初始化核心算子
         self.bbox_generator = ImageBboxGenerator(config=self.cfg)
-        self.caption_generator = PromptedVQAGenerator(serving=self.serving)
-        
+        self.caption_generator = PromptedVQAGenerator(serving=self.serving,)
         self.input_image_key = input_image_key
         self.input_bbox_key = input_bbox_key
-        self.output_key = output_key
-        self.image_with_bbox_path = image_with_bbox_path
+        self.image_with_bbox_path=image_with_bbox_path
+        self.bbox_record=None
 
     def forward(self):
-        # 步骤 1: 生成带 BBox 可视化的图像
-        print(">>> [Pipeline] Step 1: Processing BBoxes and Visualizing...")
         self.bbox_generator.run(
             storage=self.bbox_storage.step(),
             input_image_key=self.input_image_key,
             input_bbox_key=self.input_bbox_key,
-            output_key=self.image_with_bbox_path,
         )
 
-        # 步骤 2: 基于可视化图像生成描述
-        print(">>> [Pipeline] Step 2: Generating Region Captions...")
         self.caption_generator.run(
             storage=self.caption_storage.step(),
-            input_image_key='image_with_bbox' # 使用上一步生成的带框图像
+            input_image_key='image_with_bbox',
+            input_prompt_key='prompt'
         )
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Image region captioning with DataFlow")
- 
-    parser.add_argument("--model_path", default="/data0/happykeyan/Models/Qwen2.5-VL-3B-Instruct")
+    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
     parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
     parser.add_argument("--download_dir", default="./ckpt/models")
-    parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
-
-    parser.add_argument("--first_entry_file", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
-    parser.add_argument("--cache_path", default="./dataflow/example/cache")
+    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
+    parser.add_argument("--cache_path", default="./cache/image_region_caption")
     parser.add_argument("--file_name_prefix", default="region_caption")
     parser.add_argument("--cache_type", default="jsonl")
-    
     parser.add_argument("--input_image_key", default="image")
     parser.add_argument("--input_bbox_key", default="bbox")
-    parser.add_argument("--output_key", default="mdvp_record")
-
     parser.add_argument("--max_boxes", type=int, default=10)
-    parser.add_argument("--input_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
-    parser.add_argument("--output_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl")
-    parser.add_argument("--output_image_with_bbox_path", default="./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl")
-    parser.add_argument("--draw_visualization", type=bool, default=True)
+    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
 
     args = parser.parse_args()
 
-    pipe = ImageRegionCaptioningPipeline(
+    pipe = ImageRegionCaptionPipeline(
         model_path=args.model_path,
         hf_cache_dir=args.hf_cache_dir,
         download_dir=args.download_dir,
-        device=args.device,
         first_entry_file=args.first_entry_file,
         cache_path=args.cache_path,
         file_name_prefix=args.file_name_prefix,
         cache_type=args.cache_type,
         input_image_key=args.input_image_key,
         input_bbox_key=args.input_bbox_key,
-        output_key=args.output_key,
         max_boxes=args.max_boxes,
-        input_jsonl_path=args.input_jsonl_path,
-        output_image_with_bbox_path=args.output_image_with_bbox_path,
-        draw_visualization=args.draw_visualization
+        output_image_with_bbox_path=args.output_image_with_bbox_path
     )
     pipe.forward()
 
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
new file mode 100644
index 00000000..ac974089
--- /dev/null
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -0,0 +1,257 @@
+---
+title: 图像区域描述生成流水线RegionCap（API版）
+createTime: 2026/01/11 22:04:27
+icon: mdi:image-text
+permalink: /zh/mm_guide/image_region_caption_pipeline_api/
+---
+## 1. 概述
+
+**图像区域描述生成流水线（API版）** 旨在为图像中的特定区域生成详细的文本描述。该流水线结合了计算机视觉的定位能力与多模态大模型的理解能力，能够识别图像中的感兴趣区域（ROI），并为其生成精确的自然语言标注。
+
+该流水线支持处理**预定义边界框 (Bounding Box)** 数据，并将其可视化后输入 VLM 进行描述生成。
+
+我们支持以下应用场景：
+
+* **密集描述生成 (Dense Captioning)**：为图像中的多个物体分别生成描述。
+* **细粒度图像理解**：关注图像的局部细节而非全局描述。
+* **数据集增强**：构建带定位信息的图文对数据集。
+
+流水线的主要流程包括：
+
+1. **数据加载**：读取包含图像和边界框信息的源数据。
+2. **边界框处理与可视化**：处理输入的边界框，生成带有可视化标记（如画框）的图像版本。
+3. **区域描述生成**：利用 VLM 针对标记后的图像或特定区域生成文本描述。
+
+---
+
+## 2. 快速开始
+
+### 第一步：配置 API Key
+
+在脚本中设置 API Key 环境变量：
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+```
+
+### 第二步：创建新的 DataFlow 工作文件夹
+```bash
+mkdir run_dataflow
+cd run_dataflow
+```
+
+### 第三步：初始化 DataFlow-MM
+```bash
+dataflowmm init
+```
+这时你会看到：
+```bash
+api_pipelines/image_region_caption_api_pipeline.py
+```
+
+### 第四步：下载示例数据
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+```
+
+### 第五步：配置参数
+
+在 `api_pipeline/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：
+
+```python
+self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # 任意兼容OpenAI格式的API平台
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+```
+
+### 第六步：一键运行
+```bash
+python api_pipelines/image_region_caption_api_pipeline.py
+```
+
+---
+
+## 3. 数据流与流水线逻辑
+
+### 1. **输入数据**
+
+输入数据通常包含图像路径和对应的边界框列表（可选）：
+
+* **image**：图像文件路径。
+* **bbox**：边界框坐标列表，通常格式为 `[[x, y, w, h], ...]` 。
+
+**输入数据示例**：
+
+```json
+{
+    "image": "./data/image_region_caption/20.jpg",
+    "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
+}
+
+```
+
+### 2. **核心算子逻辑**
+
+该流水线通过串联两个核心算子来完成任务：
+
+#### A. **ImageBboxGenerator（边界框处理器）**
+
+该算子负责处理视觉层面的任务。
+
+* **输入**：原始图像 + `bbox` 数据。
+* **功能**：读取边界框，将其绘制在图像上（可视化），或者根据配置进行预处理。
+* **配置 (`ExistingBBoxDataGenConfig`)**：控制最大框数量 (`max_boxes`) 和输入输出路径。
+* **输出**：写入 `output_jsonl_path` 的 JSONL 文件，每条记录包含带视觉标记的图像路径（`image_with_bbox`）和提示词（`prompt`）。
+
+#### B. **PromptedVQAGenerator（VQA 生成器）**
+
+该算子负责利用 VLM 生成文本。
+
+* **输入**：上一步的输出。
+* **功能**：VLM 接收带有标记的图像，根据提示生成对应区域的描述。
+* **输出**：区域描述文本。
+
+### 3. **输出数据**
+
+最终生成的输出数据将包含处理后的图像路径和生成的描述：
+* **image**：输入的图片路径。
+* **type**：是否给定边界框。
+* **bbox**：边界框参数。
+* **normalized_bbox**：标准化后的边界框参数。
+* **result_file**：结果输出路径。
+* **image_with_bbox**：画了框的图像路径。
+* **valid_bboxes_num**：有效边界框数量。
+* **prompt**：VLM接收的提示词。
+* **answer**：VLM 生成的区域描述文本。
+
+**输出数据示例**：
+
+```json
+{
+    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "bbox":[[196,104,310,495]],
+    "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
+    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "valid_bboxes_num":1,
+    "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
+    "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
+}
+
+```
+
+---
+
+## 4. 流水线示例
+
+以下是完整的 `ImageRegionCaptionPipeline`（API 版）代码实现。
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+import argparse
+from dataflow.operators.core_vision.generate.image_bbox_generator import (
+    ImageBboxGenerator, 
+    ExistingBBoxDataGenConfig
+)
+from dataflow.operators.core_vision.generate.prompted_vqa_generator import (
+    PromptedVQAGenerator
+)
+from dataflow.utils.storage import FileStorage
+
+from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
+class ImageRegionCaptionPipeline:
+    def __init__(
+        self,
+        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "./cache/image_region_caption",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        image_with_bbox_path: str = 'image_with_bbox',
+        max_boxes: int = 10,
+        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
+    ):
+        self.bbox_storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+
+        self.cfg = ExistingBBoxDataGenConfig(
+            max_boxes=max_boxes,
+            input_jsonl_path=first_entry_file,
+            output_jsonl_path=output_image_with_bbox_path,
+        )
+
+        self.caption_storage = FileStorage(
+            first_entry_file_name=output_image_with_bbox_path,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+        self.vlm_serving = APIVLMServing_openai(
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format
+            model_name="gpt-4o-mini",
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
+        )
+        self.bbox_generator = ImageBboxGenerator(config=self.cfg)
+        self.caption_generator = PromptedVQAGenerator(serving=self.vlm_serving,system_prompt="You are a helpful assistant.")
+        self.input_image_key = input_image_key
+        self.input_bbox_key = input_bbox_key
+        self.image_with_bbox_path=image_with_bbox_path
+        self.bbox_record=None
+
+    def forward(self):
+        self.bbox_generator.run(
+            storage=self.bbox_storage.step(),
+            input_image_key=self.input_image_key,
+            input_bbox_key=self.input_bbox_key
+        )
+
+        self.caption_generator.run(
+            storage=self.caption_storage.step(),
+            input_image_key='image_with_bbox',
+            input_prompt_key='prompt'
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
+    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
+    parser.add_argument("--cache_path", default="./cache/image_region_caption")
+    parser.add_argument("--file_name_prefix", default="region_caption")
+    parser.add_argument("--cache_type", default="jsonl")
+    parser.add_argument("--input_image_key", default="image")
+    parser.add_argument("--input_bbox_key", default="bbox")
+
+    parser.add_argument("--max_boxes", type=int, default=10)
+    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
+
+    args = parser.parse_args()
+
+    pipe = ImageRegionCaptionPipeline(
+        first_entry_file=args.first_entry_file,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        cache_type=args.cache_type,
+        input_image_key=args.input_image_key,
+        input_bbox_key=args.input_bbox_key,
+        max_boxes=args.max_boxes,
+        output_image_with_bbox_path=args.output_image_with_bbox_path,
+    )
+    pipe.forward()
+
+```
+
diff --git a/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
index 661a1562..eab99df6 100644
--- a/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
+++ b/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
@@ -49,8 +49,7 @@ def run(
     self, 
     storage: DataFlowStorage, 
     input_image_key: str = "image", 
-    input_bbox_key: str = "bbox", 
-    output_key: str = "mdvp_record"
+    input_bbox_key: str = "bbox"
 ):
     ...
 
@@ -69,7 +68,7 @@ def run(
 3. **标准化与可视化 (Normalize & Visualize)**
 *
 * **标准化**：将 `[x, y, w, h]` 转换为归一化的 `[x1, y1, x2, y2]` 格式，并根据 `max_boxes` 进行截断或补零 (`0.0, 0.0, 0.0, 0.0`)。
-* **可视化**：在原图上绘制绿色矩形框和数字标签，保存至 `storage.cache_path`。
+* **可视化**：在原图上绘制矩形框和数字标签，保存至 `storage.cache_path`。
 
 
 4. **Prompt 生成**
@@ -89,7 +88,6 @@ def run(
 | `storage` | `DataFlowStorage` | 无 | DataFlow 存储对象，主要用于获取缓存路径 (`cache_path`)。 |
 | `input_image_key` | `str` | `"image"` | 输入 JSONL 中图像路径的字段名。 |
 | `input_bbox_key` | `str` | `"bbox"` | 输入 JSONL 中 BBox 数据的字段名。 |
-| `output_key` | `str` | `"mdvp_record"` | (保留字段) 用于标识输出记录的键名。 |
 
 ## 🧩 示例用法
 
@@ -97,49 +95,41 @@ def run(
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig
 
-# 1) 配置参数
-config = ExistingBBoxDataGenConfig(
-    max_boxes=5,
-    input_jsonl_path="./data/raw_images.jsonl",
-    output_jsonl_path="./data/processed_with_prompts.jsonl"
+config = ExistingBBoxDataGenConfig(
+    max_boxes=10,
+    input_jsonl_path="./data/image_region_caption/image_region_caption_demo.jsonl",
+    output_jsonl_path="./cache/image_region_caption/image_with_bbox_result.jsonl",
 )
-
-# 2) 初始化算子
-# 注意：此算子主要用于数据准备，不依赖 Serving 实例
 generator = ImageBboxGenerator(config=config)
 
-# 3) 准备 Storage (仅用于提供缓存路径)
 storage = FileStorage(
-    cache_path="./cache_vis_images",
-    file_name_prefix="bbox_gen"
+    first_entry_file_name="./data/image_region_caption/image_region_caption_demo.jsonl",
+    cache_path="./cache/image_region_caption",
+    file_name_prefix="region_caption",
+    cache_type="jsonl"
 )
 
-# 4) 执行处理
-# 自动读取 config 中的 input_jsonl_path，结果写入 output_jsonl_path
 generator.run(
     storage=storage,
-    input_image_key="image_path",
-    input_bbox_key="ground_truth_bbox" # 若文件中无此列，将自动提取 BBox
+    input_image_key="image",
+    input_bbox_key="bbox"
 )
 
 ```
 
 ### 🧾 输出数据格式 (Output JSONL)
 
-生成的 `output_jsonl_path` 文件中，每一行包含以下结构：
+生成的 `image_with_bbox_result.jsonl` 文件中，每一行包含以下结构：
 
 ```json
 {
-  "image": "/data/raw/cat.jpg",
-  "type": "without_bbox", // 或 "with_bbox"
-  "bbox": [[100, 200, 50, 60], ...], // 原始像素坐标 [x, y, w, h]
-  "normalized_bbox": [
-      [0.1, 0.2, 0.15, 0.26], 
-      [0.0, 0.0, 0.0, 0.0] // 补零填充
-  ],
-  "result_file": "./cache_vis_images",
-  "image_with_bbox": "./cache_vis_images/1_bbox_vis.jpg", // 可视化图片路径
-  "valid_bboxes_num": 1,
-  "prompt": "Describe the content of each marked region in the image. There are 1 regions: \<region1\> to \<region1\>."
+    "image": "./data/image_region_caption/20.png", 
+    "type": "with_bbox", 
+    "bbox": [[196, 104, 310, 495]], 
+    "normalized_bbox": [[0.128, 0.125, 0.329, 0.72], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], 
+    "result_file": "./cache/image_region_caption", 
+    "image_with_bbox": "./cache/image_region_caption\\2_bbox_vis.jpg", 
+    "valid_bboxes_num": 1, 
+    "prompt": "Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>."
 }
 ```

From 8d147e0b4475a415686a6f6fe41b275dc7614808 Mon Sep 17 00:00:00 2001
From: chawuciren11 <2216740116@qq.com>
Date: Sat, 21 Feb 2026 10:18:43 +0800
Subject: [PATCH 2/7] 11

---
 .../image_understanding/image_region_caption_pipeline.md     | 5 ++---
 .../image_understanding/image_region_caption_pipeline_api.md | 2 +-
 .../image_understanding/image_region_caption_pipeline.md     | 5 ++---
 .../image_understanding/image_region_caption_pipeline_api.md | 2 +-
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index d35b7385..760bbb61 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -44,7 +44,7 @@ dataflowmm init
 You will then see:
 
 ```bash
-api_pipelines/image_region_caption_api_pipeline.py
+gpu_pipelines/image_region_caption_pipeline.py
 ```
 
 ### Step 3: Download Sample Data
@@ -57,8 +57,7 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
 ### Step 4: Run
 
 ```bash
-python region_caption_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+python gpu_pipelines/image_region_caption_pipeline.py
 ```
 
 ---
diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index 2c40bdf9..fc622643 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -66,7 +66,7 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
 
 ### Step 5: Configure Parameters
 
-Configure the API service and input data paths in `api_pipeline/image_region_caption_api_pipeline.py`:
+Configure the API service and input data paths in `api_pipelines/image_region_caption_api_pipeline.py`:
 
 ```python
 self.vlm_serving = APIVLMServing_openai(
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index b56ba441..2f99c257 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -38,7 +38,7 @@ dataflowmm init
 ```
 这时你会看到：
 ```bash
-api_pipelines/image_region_caption_api_pipeline.py
+gpu_pipelines/image_region_caption_pipeline.py
 ```
 
 ### 第三步：下载示例数据
@@ -49,8 +49,7 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
 ### 第四步：一键运行
 
 ```bash
-python api_pipelines/image_region_caption_api_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+python gpu_pipelines/image_region_caption_pipeline.py
 
 ```
 
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index ac974089..a55fe63d 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -57,7 +57,7 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
 
 ### 第五步：配置参数
 
-在 `api_pipeline/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：
+在 `api_pipelines/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：
 
 ```python
 self.vlm_serving = APIVLMServing_openai(

From 139bde76b13a7a631726d3c8fe26e69f753903f2 Mon Sep 17 00:00:00 2001
From: chawuciren11 <2216740116@qq.com>
Date: Tue, 24 Feb 2026 17:13:44 +0800
Subject: [PATCH 3/7] 11

---
 .../image_region_caption_pipeline.md          | 24 ++++++++++++++---
 .../image_region_caption_pipeline_api.md      | 19 ++++++++++++--
 .../image_region_caption_pipeline.md          | 26 ++++++++++++++++---
 .../image_region_caption_pipeline_api.md      | 21 ++++++++++++---
 4 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index 760bbb61..a67175f4 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -50,14 +50,32 @@ gpu_pipelines/image_region_caption_pipeline.py
 ### Step 3: Download Sample Data
 
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
+```
 
+### Step 4: Configure Parameters
+```python
+    def __init__(
+        self,
+        model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir: str = "~/.cache/huggingface",
+        download_dir: str = "./ckpt/models",
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        max_boxes: int = 10,
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
+    ):
 ```
 
-### Step 4: Run
+### Step 5: Run
 
 ```bash
-python gpu_pipelines/image_region_caption_pipeline.py
+cd gpu_pipelines
+python image_region_caption_pipeline.py
 ```
 
 ---
diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index fc622643..66391423 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -60,7 +60,7 @@ api_pipelines/image_region_caption_api_pipeline.py
 ### Step 4: Download Sample Data
 
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 
 ```
 
@@ -68,6 +68,20 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
 
 Configure the API service and input data paths in `api_pipelines/image_region_caption_api_pipeline.py`:
 
+```python
+    def __init__(
+        self,
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        max_boxes: int = 10,
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
+    ):
+```
+
 ```python
 self.vlm_serving = APIVLMServing_openai(
     api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any OpenAI-compatible API platform
@@ -82,7 +96,8 @@ self.vlm_serving = APIVLMServing_openai(
 ### Step 6: Run with One Command
 
 ```bash
-python api_pipelines/image_region_caption_api_pipeline.py
+cd api_pipelines
+python image_region_caption_api_pipeline.py
 
 ```
 ---
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index 2f99c257..af575f54 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -43,14 +43,32 @@ gpu_pipelines/image_region_caption_pipeline.py
 
 ### 第三步：下载示例数据
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 ```
 
-### 第四步：一键运行
+### 第四步：配置参数
+```python
+    def __init__(
+        self,
+        model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir: str = "~/.cache/huggingface",
+        download_dir: str = "./ckpt/models",
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        max_boxes: int = 10,
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
+    ):
+```
 
-```bash
-python gpu_pipelines/image_region_caption_pipeline.py
+### 第五步：一键运行
 
+```bash
+cd gpu_pipelines
+python image_region_caption_pipeline.py
 ```
 
 ---
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index a55fe63d..8f465dc1 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -52,16 +52,30 @@ api_pipelines/image_region_caption_api_pipeline.py
 
 ### 第四步：下载示例数据
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 ```
 
 ### 第五步：配置参数
 
 在 `api_pipelines/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：
 
+```python
+    def __init__(
+        self,
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        max_boxes: int = 10,
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
+    ):
+```
+
 ```python
 self.vlm_serving = APIVLMServing_openai(
-            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # 任意兼容OpenAI格式的API平台
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
             model_name="gpt-4o-mini",
             image_io=None,
             send_request_stream=False,
@@ -72,7 +86,8 @@ self.vlm_serving = APIVLMServing_openai(
 
 ### 第六步：一键运行
 ```bash
-python api_pipelines/image_region_caption_api_pipeline.py
+cd api_pipelines
+python image_region_caption_api_pipeline.py
 ```
 
 ---

From b6300dc2ffddc8dbdf1ab8c4bdd4af5c3aecf7e6 Mon Sep 17 00:00:00 2001
From: happykeyan <hankyang428@163.com>
Date: Wed, 25 Feb 2026 00:27:49 +0800
Subject: [PATCH 4/7] fix data path

---
 docs/.vuepress/notes/en/mm_operators.ts       |   4 +-
 docs/.vuepress/notes/zh/mm_operators.ts       |   4 +-
 .../image_understanding/context_vqa.md        | 236 ++++++------------
 .../image_understanding/context_vqa_api.md    | 139 +++++------
 .../image_understanding/image_caption_api.md  | 114 ++++-----
 .../image_understanding/image_vqa_api.md      | 104 ++++----
 .../image_understanding/context_vqa.md        | 221 +++++++---------
 .../image_understanding/context_vqa_api.md    | 152 +++++------
 .../image_understanding/image_caption_api.md  |  72 +++---
 .../image_understanding/image_vqa_api.md      |  68 ++---
 10 files changed, 460 insertions(+), 654 deletions(-)

diff --git a/docs/.vuepress/notes/en/mm_operators.ts b/docs/.vuepress/notes/en/mm_operators.ts
index 1c7faadd..5e811093 100644
--- a/docs/.vuepress/notes/en/mm_operators.ts
+++ b/docs/.vuepress/notes/en/mm_operators.ts
@@ -26,8 +26,8 @@ export const MMOperators: ThemeNote = defineNoteConfig({
                     collapsed: false,
                     prefix: 'generate/',
                     items: [
-                        'image_caption',
-                        'image_qa',
+                        // 'image_caption',
+                        // 'image_qa',
                         'image_pers_qa',
                         'multimodal_math',
                         "prompt_templated_vqa_generator",
diff --git a/docs/.vuepress/notes/zh/mm_operators.ts b/docs/.vuepress/notes/zh/mm_operators.ts
index d8f05499..c35993a2 100644
--- a/docs/.vuepress/notes/zh/mm_operators.ts
+++ b/docs/.vuepress/notes/zh/mm_operators.ts
@@ -27,8 +27,8 @@ export const MMOperators: ThemeNote = defineNoteConfig({
                     collapsed: false,
                     prefix: 'generate/',
                     items: [
-                        'image_caption',
-                        'image_qa',
+                        // 'image_caption',
+                        // 'image_qa',
                         'image_pers_qa',
                         'multimodal_math',
                         'prompt_templated_vqa_generator',
diff --git a/docs/en/notes/mm_guide/image_understanding/context_vqa.md b/docs/en/notes/mm_guide/image_understanding/context_vqa.md
index 52ef9148..52bbe3b3 100644
--- a/docs/en/notes/mm_guide/image_understanding/context_vqa.md
+++ b/docs/en/notes/mm_guide/image_understanding/context_vqa.md
@@ -7,27 +7,25 @@ permalink: /en/mm_guide/contextvqa_pipeline/
 
 ## 1. Overview
 
-The **ContextVQA Multimodal QA Data Generation Pipeline** is designed to automatically generate **Visual Question Answering (VQA) data equipped with external knowledge contexts** starting from images. This pipeline utilizes Vision-Language Models (VLM) to generate Wikipedia-style articles related to the image and corresponding QA pairs, which are then parsed into structured data.
-
-
+The **ContextVQA Multimodal QA Data Generation Pipeline** is designed to automatically generate **visual question answering data with external knowledge context (Context-based VQA)** starting from images. This pipeline utilizes Vision-Language Models (VLM) to generate Wikipedia-style articles related to the images and corresponding QA pairs, which are then parsed into structured data.
 
 We support the following application scenarios:
 
 * **Knowledge-based VQA Data Synthesis**: Building QA datasets that require external knowledge reasoning.
 * **Multimodal RAG Data Construction**: Generating high-quality data for training Retrieval-Augmented Generation (RAG) systems.
-* **Visual Reasoning Training**: Generating data where the question points to the image, but the answer must be reasoned from the accompanying text context.
+* **Visual Reasoning Training**: Generating questions that point to the image, but require answers reasoned from the textual context.
 
-The main stages of the pipeline include:
+The main flow includes:
 
-1.  **Data Loading**: Reading data files containing image paths.
-2.  **Context and QA Generation**: Using a VLM to generate a Wikipedia-style article and raw QA pairs based on the image.
-3.  **Data Cleaning and Structuring**: Parsing raw text to extract a structured `{context, qas}` format.
+1. **Data Loading**: Reading data files containing image paths.
+2. **Context and QA Generation**: Utilizing a locally deployed VLM to generate Wikipedia-style articles and raw QA pairs based on the image.
+3. **Data Cleaning and Structuring**: Parsing raw text to extract a structured `{context, qas}` format.
 
 ---
 
 ## 2. Quick Start
 
-### Step 1: Create a New DataFlow Working Directory
+### Step 1: Create a New DataFlow Work Folder
 
 ```bash
 mkdir run_dataflow_mm
@@ -42,34 +40,45 @@ dataflow init
 
 ```
 
-After initialization, you will see the generated file structure, including:
+You will now see:
 
 ```bash
 gpu_pipelines/context_vqa.py  
 
 ```
 
-### Step 3: Configure Model and Data Paths
+### Step 3: Download Example Data
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
+
+```
+
+### Step 4: Configure Model and Data Paths
 
-Modify the VLM model path and dataset location in `context_vqa.py`:
+Modify the class initialization parameters directly in `context_vqa.py` (they are no longer passed via command-line arguments):
 
 ```python
-parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") # Update to your local model path
-parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-parser.add_argument("--download_dir", default="./ckpt")
-parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
-
-# Update the data path below. 
-# We provide example data at: run_dataflow_mm/example_data/image_to_text_pipeline/capsbench_captions.json
-# Note: You can download the actual images using the "source" URLs provided within the JSON file.
-parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.json") 
-parser.add_argument("--cache_path", default="./cache_local")
-parser.add_argument("--file_name_prefix", default="context_vqa")
-parser.add_argument("--cache_type", default="json")
+# Model Serving Configuration
+self.serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
+    hf_cache_dir="~/.cache/huggingface",
+    hf_local_dir="./ckpt",
+    vllm_tensor_parallel_size=1,
+    vllm_max_tokens=512,
+)
+
+# Data Storage Configuration
+self.storage = FileStorage(
+    first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+    cache_path="./cache_local",
+    file_name_prefix="context_vqa",
+    cache_type="json",
+)
 
 ```
 
-### Step 4: Launch the Pipeline
+### Step 5: One-Click Run
 
 ```bash
 python gpu_pipelines/context_vqa.py
@@ -82,106 +91,74 @@ python gpu_pipelines/context_vqa.py
 
 ### 1. **Input Data**
 
-The input data for this process primarily contains the following fields:
-
-* **image**: Path to the image file (local path or URL).
-* **id** (optional): Unique identifier for the data.
-
-Data is managed via `FileStorage`, which supports breakpoint resumption (checkpointing).
+Input data is managed through `FileStorage`, supporting breakpoint resumption.
 
-**Input Data Example**:
+**Input Data Example (`sample_data.json`)**:
 
-```jsonl
-{"id": 1, "image": "./images/landmark.jpg"}
-{"id": 2, "image": "./images/animal.jpg"}
+```json
+[
+    {
+        "image": ["./example_data/image_contextvqa/person.png"],
+        "conversation": [
+            {
+                "from": "human",
+                "value": "Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs..."
+            }
+        ]
+    }
+]
 
 ```
 
-Example images can be found at `https://huggingface.co/datasets/OpenDCAI/dataflow-demo-image/tree/main/capsbench_images`. Additionally, we have synthesized 200k high-quality context VQA data records for the community to experience at `https://huggingface.co/datasets/OpenDCAI/dataflow-mm-context_vqa`.
-
 ### 2. **Core Operator Logic**
 
-The pipeline completes its task by concatenating two core operators:
-
-#### A. **FixPromptedVQAGenerator (Context Generation)**
-
-This operator uses the VLM model to generate raw text according to a preset prompt template.
-
-**Functions:**
-
-* Generates a Wikipedia-style science article based on the image.
-* Generates QA pairs based on the article.
-* **Prompt Constraints**: The question points to the image but avoids direct mention of object names; answers must come from the article content and not be objects in the image; answers should be concise.
+#### A. **PromptedVQAGenerator (Context Generation)**
 
-**Model Serving Configuration**:
-
-```python
-self.serving = LocalModelVLMServing_vllm(
-    hf_model_name_or_path=model_path,
-    hf_cache_dir=hf_cache_dir,
-    vllm_tensor_parallel_size=1,
-    vllm_temperature=0.7,  # Maintain a level of creativity
-    vllm_top_p=0.9,
-    vllm_max_tokens=512,
-)
-
-```
+This operator calls the local VLM model to generate raw text based on built-in Wikipedia-style prompt templates.
 
 **Operator Execution**:
 
 ```python
 self.vqa_generator.run(
     storage=self.storage.step(),
-    input_image_key="image",
-    output_answer_key="vqa" # Outputs the raw generated text
+    input_conversation_key="conversation",
+    input_image_key=input_image_key,
+    output_answer_key=output_answer_key,
 )
 
 ```
 
-#### B. **WikiQARefiner (Result Refinement)**
-
-This operator is responsible for cleaning the unstructured text generated by the VLM and converting it into a standard format.
-
-**Functions:**
+#### B. **WikiQARefiner (Result Parsing)**
 
-* Cleans Markdown formatting and redundant white space.
-* Separates article content (Context) from QA pairs (QAs).
+This operator cleans the unstructured text generated by the VLM and converts it into a standard format, separating the article content (Context) from the question-answer pairs (QAs).
 
 **Operator Execution**:
 
 ```python
 self.refiner.run(
     storage=self.storage.step(),
-    input_key="vqa",          # Inputs raw text from the previous step
-    output_key="context_vqa"  # Outputs final structured data
+    input_key="vqa",          # Raw text from the previous step
+    output_key="context_vqa"  # Final structured data
 )
 
 ```
 
 ### 3. **Output Data**
 
-Ultimately, the output generated by the pipeline will include:
-
-* **image**: Original image path.
-* **vqa**: Raw text generated by the VLM (intermediate result).
-* **context_vqa**: Final structured result containing `context` (article) and `qas` (QA list).
+The final structured data includes `context` (article) and `qas` (list of questions and answers).
 
 **Output Data Example**:
 
 ```json
 {
     "id": 1,
-    "image": "./images/landmark.jpg",
+    "image": ["./example_data/image_contextvqa/person.png"],
     "context_vqa": {
-        "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France...",
+        "context": "Nightmare Alley is a 2021 American psychological thriller film...",
         "qas": [
             {
-                "question": "In which city is this structure located?",
-                "answer": "Paris"
-            },
-            {
-                "question": "What material is the tower primarily constructed from?",
-                "answer": "wrought-iron"
+                "question": "What genre does this film belong to?",
+                "answer": "Psychological thriller"
             }
         ]
     }
@@ -193,45 +170,35 @@ Ultimately, the output generated by the pipeline will include:
 
 ## 4. Pipeline Example
 
-Below is the complete implementation of `ContextVQAPipeline`, supporting command-line arguments.
+Below is the complete `ContextVQAPipeline` code implementation.
 
 ```python
 import argparse
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
-from dataflow.operators.core_vision import FixPromptedVQAGenerator
-from dataflow.operators.core_vision import WikiQARefiner
+from dataflow.operators.core_vision import PromptedVQAGenerator, WikiQARefiner
+
 
 class ContextVQAPipeline:
     """
-    Complete batch ContextVQA Caption generation for images with a single command.
+    Batch generate ContextVQA data for images with a single command.
     """
 
-    def __init__(
-        self,
-        model_path: str,
-        *,
-        hf_cache_dir: str | None = None,
-        download_dir: str = "./ckpt",
-        device: str = "cuda",
-        first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl",
-        cache_path: str = "./cache_local_skvqa",
-        file_name_prefix: str = "skvqa_cache_step",
-        cache_type: str = "jsonl",
-    ):
+    def __init__(self, llm_serving: LLMServingABC = None):
         # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="context_vqa",
+            cache_type="json",
         )
 
         # ---------- 2. Serving ----------
-        self.serving = LocalModelVLMServing_vllm(
-            hf_model_name_or_path=model_path,
-            hf_cache_dir=hf_cache_dir,
-            hf_local_dir=download_dir,
+        self.vlm_serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
+            hf_cache_dir="~/.cache/huggingface",
+            hf_local_dir="./ckpt",
             vllm_tensor_parallel_size=1,
             vllm_temperature=0.7,
             vllm_top_p=0.9,
@@ -239,72 +206,35 @@ class ContextVQAPipeline:
         )
 
         # ---------- 3. Operator ----------
-        # Generate Wiki-style articles and QA using a specific Prompt
-        self.vqa_generator = FixPromptedVQAGenerator(
-            serving=self.serving,
-            system_prompt="You are a helpful assistant.",
-            user_prompt= """
-            Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. The question answer pairs should satisfy the following criteria.
-            1: The question should refer to the image.
-            2: The question should avoid mentioning the name of the object in the image.
-            3: The question should be answered by reasoning over the Wikipedia article.
-            4: The question should sound natural and concise.
-            5: The answer should be extracted from the Wikipedia article.
-            6: The answer should not be any objects in the image.
-            7: The answer should be a single word or phrase and list all correct answers separated by commas.
-            8: The answer should not contain 'and', 'or', rather you can split them into multiple answers.
-            """
+        self.vqa_generator = PromptedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt= "You are a helpful assistant."
         )
 
-        # Result cleaning and structuring
         self.refiner = WikiQARefiner()
-
+        
     # ------------------------------------------------------------------ #
     def forward(self):
         input_image_key = "image"
         output_answer_key = "vqa"
         output_wiki_key = "context_vqa"
 
-        # Step 1: Generate raw text
         self.vqa_generator.run(
             storage=self.storage.step(),
+            input_conversation_key="conversation",
             input_image_key=input_image_key,
             output_answer_key=output_answer_key
         )
 
-        # Step 2: Parse into structured data
         self.refiner.run(
             storage=self.storage.step(),
             input_key=output_answer_key,
             output_key=output_wiki_key
         )
 
-# ---------------------------- CLI Entry -------------------------------- #
+# ---------------------------- CLI Entry ------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow")
-
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-    parser.add_argument("--download_dir", default="./ckpt")
-    parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
-
-    parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
-    parser.add_argument("--cache_path", default="./cache_local")
-    parser.add_argument("--file_name_prefix", default="context_vqa")
-    parser.add_argument("--cache_type", default="jsonl")
-
-    args = parser.parse_args()
-
-    pipe = ContextVQAPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        device=args.device,
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
+    pipe = ContextVQAPipeline()
     pipe.forward()
 
 ```
\ No newline at end of file
diff --git a/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md b/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md
index 1e49a8e8..5fad70a6 100644
--- a/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md
@@ -7,19 +7,19 @@ permalink: /en/mm_guide/contextvqa_api_pipeline/
 
 ## 1. Overview
 
-The **ContextVQA Multimodal QA Data Generation Pipeline (API Version)** is designed to automatically generate **Context-based Visual Question Answering (VQA) data** starting from images. This pipeline utilizes Vision-Language Models (VLM) via API to generate Wikipedia-style articles and QA pairs, which are then parsed into structured data. It is ideal for building knowledge-intensive VQA and multimodal RAG (Retrieval-Augmented Generation) datasets.
+The **ContextVQA Multimodal QA Data Generation Pipeline (API Version)** is designed to automatically generate **visual question answering data with external knowledge context (Context-based VQA)** starting from an image. This pipeline uses a Vision-Language Model (VLM) via API to generate Wikipedia-style articles and QA pairs, which are then parsed into structured data. This is ideal for building knowledge-based VQA and multimodal RAG (Retrieval-Augmented Generation) datasets.
 
-We support the following use cases:
+We support the following application scenarios:
 
-* **Knowledge-based VQA Data Synthesis**: Building QA datasets that require external knowledge reasoning.
-* **Multimodal RAG Data Construction**: Generating high-quality data for training RAG systems.
-* **Visual Reasoning Training**: Generating data where questions refer to an image, but answers must be reasoned from text context.
+* **Knowledge-based VQA Data Synthesis**: Constructing QA datasets that require external knowledge reasoning.
+* **Multimodal RAG Data Construction**: Generating high-quality data for training retrieval-augmented generation models.
+* **Visual Reasoning Training**: Generating questions that point to an image but require answers derived from textual context reasoning.
 
-The pipeline consists of three main stages:
+The main flow of the pipeline includes:
 
 1. **Data Loading**: Reading data files containing image paths.
-2. **Context and QA Generation**: Using VLM APIs to generate Wikipedia-style articles and raw QA pairs based on images.
-3. **Data Cleaning and Structuring**: Parsing raw text to extract structured `{context, qas}` formats.
+2. **Context and QA Generation**: Using a VLM API to generate Wikipedia-style articles and raw QA pairs based on images.
+3. **Data Cleaning and Structuring**: Parsing raw text to extract a structured `{context, qas}` format.
 
 ---
 
@@ -27,15 +27,15 @@ The pipeline consists of three main stages:
 
 ### Step 1: Configure API Key
 
-Set your API Key environment variable in your script:
+Set the API Key environment variable in your script:
 
 ```python
 import os
-os.environ["DF_API_KEY"] = "your_api_key"
+os.environ["DF_API_KEY"] = "sk-xxx"
 
 ```
 
-### Step 2: Create a New DataFlow Working Directory
+### Step 2: Create a New DataFlow Work Folder
 
 ```bash
 mkdir run_dataflow
@@ -50,29 +50,31 @@ dataflowmm init
 
 ```
 
-You will then see:
+You will see the following file created:
 
 ```bash
 api_pipelines/image_contextvqa.py
 
 ```
 
-### Step 4: Download Sample Data
+### Step 4: Download Example Data
 
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
 
 ```
 
 ### Step 5: Configure Parameters
 
-Configure the API service and input data paths in `image_contextvqa.py`:
+In `image_contextvqa.py`, configure the API service and input data paths (no `argparse` required; modify the default paths directly in the code):
 
 ```python
 self.vlm_serving = APIVLMServing_openai(
-    api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any OpenAI-compatible API platform
-    key_name_of_api_key="DF_API_KEY", # API key set in Step 1
-    model_name="qwen3-vl-8b-instruct",
+    api_url="http://172.96.141.132:3001/v1", # Any OpenAI-compatible API platform
+    key_name_of_api_key="DF_API_KEY", # Corresponding API key set in Step 1
+    model_name="gpt-5-nano-2025-08-07",
+    image_io=None,
+    send_request_stream=False,
     max_workers=10,
     timeout=1800
 )
@@ -80,14 +82,16 @@ self.vlm_serving = APIVLMServing_openai(
 ```
 
 ```python
-parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json")
-parser.add_argument("--cache_path", default="./cache_local")
-parser.add_argument("--file_name_prefix", default="context_vqa")
-parser.add_argument("--cache_type", default="json")
+self.storage = FileStorage(
+    first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+    cache_path="./cache_local",
+    file_name_prefix="context_vqa",
+    cache_type="json",
+)
 
 ```
 
-### Step 6: Run with One Command
+### Step 6: One-Click Run
 
 ```bash
 python api_pipelines/image_contextvqa.py
@@ -100,24 +104,24 @@ python api_pipelines/image_contextvqa.py
 
 ### 1. **Input Data**
 
-The input data for this process primarily includes the following fields:
+The input data for this process mainly includes the following fields:
 
-* **image**: Image file path (local path or URL).
+* **image**: Path to the image file (local path or URL).
 * **id** (Optional): Unique identifier for the data.
-* **conversation** (Optional): Conversation-formatted text used to guide context generation.
+* **conversation** (Optional): Text in dialogue format used to supplement context generation.
 
-Data is managed via `FileStorage`, supporting breakpoint resumption.
+Data is managed through `FileStorage`, which supports breakpoint resumption.
 
 **Input Data Example**:
 
 ```json
 [
     {
-        "image": ["./data/image_contextvqa/person.png"],
+        "image": ["./example_data/image_contextvqa/person.png"],
         "conversation": [
             {
                 "from": "human",
-                "value": "Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs..."
+                "value": "Write a Wikipedia article related to this image without directly referring to the image..."
             }
         ]
     }
@@ -127,17 +131,17 @@ Data is managed via `FileStorage`, supporting breakpoint resumption.
 
 ### 2. **Core Operator Logic**
 
-The pipeline chains two core operators:
+This pipeline completes the task by chaining two core operators:
 
 #### A. **PromptedVQAGenerator (Context Generation)**
 
-This operator calls the VLM API to generate raw text based on the prompt template.
+This operator is responsible for calling the VLM API to generate raw text based on a prompt template.
 
 **Features:**
 
-* Generates a Wikipedia-style encyclopedia article based on the image.
+* Generates a Wikipedia-style popular science article based on the image.
 * Generates QA pairs based on the article.
-* **Prompt Constraints**: Questions point to the image but avoid direct object naming; answers must come from the article and not be objects in the image; answers must be concise.
+* **Prompt Constraints**: Questions refer to the image but avoid mentioning object names; answers are from the article and are not objects in the image; answers are concise.
 
 **Operator Execution**:
 
@@ -153,12 +157,12 @@ self.vqa_generator.run(
 
 #### B. **WikiQARefiner (Result Parsing)**
 
-This operator cleans the unstructured text generated by the VLM and converts it into a standard format.
+This operator cleans the raw text generated by the VLM and converts it into a standard format.
 
 **Features:**
 
-* Cleans Markdown formatting and redundant whitespace.
-* Separates article content (Context) and QA pairs (QAs).
+* Cleans Markdown formatting and extra whitespace.
+* Separates the article content (Context) from the QA pairs (QAs).
 
 **Operator Execution**:
 
@@ -173,28 +177,24 @@ self.refiner.run(
 
 ### 3. **Output Data**
 
-The final output contains:
+The final output data generated by the pipeline will contain:
 
 * **image**: Original image path.
-* **vqa**: Raw text generated by VLM (intermediate result).
-* **context_vqa**: Structured final result containing `context` (article) and `qas` (QA list).
+* **vqa**: Raw text generated by the VLM (intermediate result).
+* **context_vqa**: Final structured result containing `context` (article) and `qas` (QA list).
 
 **Output Data Example**:
 
 ```json
 [
   {
-    "image": ["./data/image_contextvqa/person.png"],
+    "image": ["./example_data/image_contextvqa/person.png"],
     "context_vqa": {
-      "context": "**Wikipedia Article:** Nightmare Alley is a 2021 American psychological thriller film...",
+      "context": "**Wikipedia Article:** *Nightmare Alley* is a 2021 American psychological thriller...",
       "qas": [
         {
           "question": "What genre does this film belong to?",
           "answer": "Psychological thriller"
-        },
-        {
-          "question": "Who directed this film?",
-          "answer": "Guillermo del Toro"
         }
       ]
     }
@@ -207,44 +207,40 @@ The final output contains:
 
 ## 4. Pipeline Example
 
-The following is the complete `ContextVQAPipeline` implementation supporting CLI arguments.
+Below is the complete `ContextVQAPipeline` implementation.
 
 ```python
 import os
-import argparse
+
+# Set API Key environment variable
+os.environ["DF_API_KEY"] = "sk-xxx"
+
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 from dataflow.operators.core_vision import PromptedVQAGenerator
 from dataflow.operators.core_vision import WikiQARefiner
 
-# Set API Key environment variable
-os.environ["DF_API_KEY"] = "sk-xxxx"
 
 class ContextVQAPipeline:
     """
-    Generate batch ContextVQA captions with a single command.
+    Generate batch ContextVQA data for images with a single command.
     """
 
-    def __init__(
-        self,
-        first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl",
-        cache_path: str = "./cache_local_skvqa",
-        file_name_prefix: str = "skvqa_cache_step",
-        cache_type: str = "jsonl",
-    ):
+    def __init__(self, llm_serving: LLMServingABC = None):
         # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="context_vqa",
+            cache_type="json",
         )
 
         # ---------- 2. Serving ----------
         self.vlm_serving = APIVLMServing_openai(
-            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+            api_url="http://172.96.141.132:3001/v1",
             key_name_of_api_key="DF_API_KEY",
-            model_name="qwen3-vl-8b-instruct",
+            model_name="gpt-5-nano-2025-08-07",
             image_io=None,
             send_request_stream=False,
             max_workers=10,
@@ -277,23 +273,8 @@ class ContextVQAPipeline:
             output_key=output_wiki_key
         )
 
-# ---------------------------- CLI Entry -------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch ContextVQA generation with DataFlow")
-
-    parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json")
-    parser.add_argument("--cache_path", default="./cache_local")
-    parser.add_argument("--file_name_prefix", default="context_vqa")
-    parser.add_argument("--cache_type", default="json")
-
-    args = parser.parse_args()
-
-    pipe = ContextVQAPipeline(
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
+    pipe = ContextVQAPipeline()
     pipe.forward()
 
 ```
\ No newline at end of file
diff --git a/docs/en/notes/mm_guide/image_understanding/image_caption_api.md b/docs/en/notes/mm_guide/image_understanding/image_caption_api.md
index e2607f6b..ecb5153d 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_caption_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_caption_api.md
@@ -3,17 +3,17 @@ title: Image Caption Generation Pipeline (API Version)
 icon: mdi:image-edit 
 createTime: 2026/01/24 16:37:37 
 permalink: /en/mm_guide/image_caption_api_pipeline/
----
 
+---
 ## 1. Overview
 
-The **Image Caption Generation Pipeline (API Version)** is designed to leverage advanced Vision-Language Models (VLM) to automatically generate high-quality, accurate, and informative text descriptions for large-scale image datasets. By calling APIs compatible with the OpenAI format, this pipeline rapidly processes images and generates structured annotation data. It is an ideal choice for building multimodal pre-training datasets, image retrieval systems, and accessibility features.
+The **Image Caption Generation Pipeline (API Version)** is designed to leverage advanced Vision-Language Models (VLM) to automatically generate high-quality, accurate, and informative textual descriptions for large-scale image datasets. By calling APIs compatible with the OpenAI format, this pipeline can quickly process images and generate structured annotation data. It is an ideal choice for building multimodal pre-training datasets, image retrieval systems, and accessibility features.
 
 We support the following application scenarios:
 
-* **Multimodal Dataset Annotation**: Batch generate precise text descriptions for massive image libraries.
-* **Image Content Understanding**: Automatically extract key objects, scenes, and textual information from images.
-* **Search & Retrieval Optimization**: Enhance image searchability through rich textual descriptions.
+* **Multimodal Dataset Annotation**: Batch generate precise text descriptions for large-scale image libraries.
+* **Image Content Understanding**: Automatically extract key objects, scenes, and text information from images.
+* **Search and Retrieval Optimization**: Enhance image searchability through textual descriptions.
 
 ---
 
@@ -29,9 +29,9 @@ os.environ["DF_API_KEY"] = "your_api_key_here"
 
 ```
 
-### Step 2: Prepare the Environment
+### Step 2: Environment Preparation
 
-Create a working directory and initialize:
+Create a working directory and initialize it:
 
 ```bash
 mkdir run_caption_pipeline
@@ -40,20 +40,20 @@ dataflowmm init
 
 ```
 
-### Step 3: Download Sample Data
+### Step 3: Download Example Data
 
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
 
 ```
 
-### Step 4: Configure Core Parameters
+### Step 4: Core Parameter Configuration
 
 Configure the API information in the generated `api_pipelines/image_caption.py` script:
 
 ```python
 self.vlm_serving = APIVLMServing_openai(
-    api_url="http://172.96.141.132:3001/v1", # Replace with your API endpoint
+    api_url="http://172.96.141.132:3001/v1", # Replace with your API address
     key_name_of_api_key="DF_API_KEY",
     model_name="gpt-5-nano-2025-08-07",
     max_workers=10,
@@ -65,22 +65,22 @@ self.vlm_serving = APIVLMServing_openai(
 ### Step 5: Run the Pipeline
 
 ```bash
-python api_pipelines/image_caption.py --images_file data/image_caption/sample_data.json
+python api_pipelines/image_caption.py
 
 ```
 
 ---
 
-## 3. Data Flow & Logic
+## 3. Data Flow and Logic Description
 
 ### 1. **Input Data Structure**
 
-The pipeline accepts standard JSON/JSONL formats containing image paths and prompts:
+The pipeline receives standard JSON/JSONL formats containing image paths and prompts:
 
 ```json
 [
     {
-        "image": ["./data/image_caption/person.png"],
+        "image": ["./example_data/image_caption/person.png"],
         "conversation": [
             {
                 "from": "human",
@@ -94,20 +94,20 @@ The pipeline accepts standard JSON/JSONL formats containing image paths and prom
 
 ### 2. **Core Operator: PromptedVQAGenerator**
 
-In this workflow, we use `PromptedVQAGenerator` as the core operator. It transforms the VLM into a specialized image captioning engine via a system prompt.
+In this process, we use `PromptedVQAGenerator` as the core operator. It transforms the VLM into a specialized image caption generator via a System Prompt.
 
-* **System Prompt**: "You are an image caption generator. Your task is to generate a concise and informative caption for the given image content."
-* **Concurrency Control**: Supports multi-threaded concurrent requests via the `max_workers` parameter, significantly improving processing efficiency for large datasets.
-* **Fault Tolerance**: Built-in timeout and retry mechanisms ensure stability of API calls under high load.
+* **System Prompt**: "You are a image caption generator. Your task is to generate a concise and informative caption for the given image content."
+* **Concurrency Control**: Supports multi-threaded concurrent requests via the `max_workers` parameter, significantly improving processing efficiency for large-scale data.
+* **Error Handling**: Built-in timeout and retry mechanisms ensure API call stability under high loads.
 
 ### 3. **Output Data Example**
 
-Once processing is complete, the `caption` field is appended directly to the data object:
+After processing, the `caption` field is added directly to the data object:
 
 ```json
 [
   {
-    "image": ["./data/image_caption/person.png"],
+    "image": ["./example_data/image_caption/person.png"],
     "conversation": [...],
     "caption": "Promotional poster for Nightmare Alley in grayscale, showing a man in a formal tuxedo with a white bow tie. The cast names run down the left side (Bradley Cooper, Cate Blanchett, Toni Collette, Willem Dafoe, and more), and the gold title Nightmare Alley appears near the bottom left with release text and Regal branding."
   }
@@ -117,79 +117,69 @@ Once processing is complete, the `caption` field is appended directly to the dat
 
 ---
 
-## 4. Full Pipeline Code
+## 4. Complete Pipeline Code
 
-You can directly use or modify the following Python code to implement your custom image captioning task.
+You can directly use or modify the following Python code to implement custom image captioning tasks.
 
 ```python
 import os
-import argparse
+
+# Set API Key environment variable
+os.environ["DF_API_KEY"] = "sk-xxx"
+
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 from dataflow.operators.core_vision import PromptedVQAGenerator
 
-# Set API Key environment variable
-os.environ["DF_API_KEY"] = "sk-xxx"
 
 class ImageCaptionPipeline:
     """
-    Batch image caption generation with a single command.
+    Complete batch image caption generation with a single command.
     """
 
-    def __init__(
-        self,
-        first_entry_file: str,
-        cache_path: str = "./cache_local",
-        file_name_prefix: str = "caption",
-        cache_type: str = "json",
-    ):
-        # ---------- 1. Storage: Manage data reading and checkpoints ----------
+    def __init__(self, llm_serving: LLMServingABC = None):
+
+        # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_caption/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="caption",
+            cache_type="json",
         )
 
-        # ---------- 2. Serving: Configure API Service ----------
+        # ---------- 2. Serving ----------
         self.vlm_serving = APIVLMServing_openai(
-            api_url="http://172.96.141.132:3001/v1", 
-            key_name_of_api_key="DF_API_KEY",
+            api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format
+            key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4
             model_name="gpt-5-nano-2025-08-07",
+            image_io=None,
+            send_request_stream=False,
             max_workers=10,
             timeout=1800
         )
 
-        # ---------- 3. Operator: Define Generation Logic ----------
+        # ---------- 3. Operator ----------
         self.vqa_generator = PromptedVQAGenerator(
             serving=self.vlm_serving,
-            system_prompt="You are an image caption generator. Your task is to generate a concise and informative caption for the given image content."
+            system_prompt= "You are a image caption generator. Your task is to generate a concise and informative caption for the given image content."
         )
 
+    # ------------------------------------------------------------------ #
     def forward(self):
-        # Run the pipeline
+        input_image_key = "image"
+        output_answer_key = "caption"
+
         self.vqa_generator.run(
             storage=self.storage.step(),
             input_conversation_key="conversation",
-            input_image_key="image",
-            output_answer_key="caption",
+            input_image_key=input_image_key,
+            output_answer_key=output_answer_key,
         )
 
+# ---------------------------- CLI Entry ------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch image caption generation with DataFlow")
-    parser.add_argument("--images_file", default="data/image_caption/sample_data.json")
-    parser.add_argument("--cache_path", default="./cache_local")
-    parser.add_argument("--file_name_prefix", default="caption")
-    parser.add_argument("--cache_type", default="json")
-
-    args = parser.parse_args()
-
-    pipe = ImageCaptionPipeline(
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
+    pipe = ImageCaptionPipeline()
     pipe.forward()
 
-```
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md b/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md
index 062ba560..a7f230f0 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md
@@ -7,13 +7,13 @@ permalink: /en/mm_guide/image_vqa_api_pipeline/
 
 ## 1. Overview
 
-The **Image VQA Generation Pipeline (API Version)** focuses on automatically constructing high-quality **Question-Answer Pairs** directly from image content. By leveraging high-performance VLM APIs, the pipeline generates questions and accurate answers that align with human logic based on visual features. This is highly valuable for training multimodal dialogue models, evaluating visual understanding capabilities, and building domain-specific VQA datasets (e.g., medical, security, e-commerce).
+The **Image VQA Generation Pipeline (API Version)** focuses on automatically constructing high-quality **Question-Answer (QA) Pairs** directly from image content. Leveraging high-performance VLM APIs, this pipeline generates human-like questions and accurate answers based on the visual features of an image. This is highly valuable for training multimodal dialogue models, evaluating visual understanding capabilities, and building industry-specific VQA datasets (e.g., medical, security, e-commerce).
 
 We support the following application scenarios:
 
-* **Instruction Tuning Data Synthesis**: Generate diverse questioning styles to enhance model interaction capabilities.
-* **Visual Understanding Evaluation**: Create judgment, descriptive, or reasoning-based Q&A focused on image details.
-* **Automated Annotation**: Replace manual labor for large-scale image Q&A labeling, reducing data production costs.
+* **Instruction Fine-tuning Data Synthesis**: Generate diverse questioning styles to enhance model interaction capabilities.
+* **Visual Understanding Evaluation**: Produce judgment, descriptive, or reasoning-based QAs targeting specific image details.
+* **Automated Annotation**: Replace manual labor for large-scale image QA annotation, reducing data production costs.
 
 ---
 
@@ -21,7 +21,7 @@ We support the following application scenarios:
 
 ### Step 1: Configure API Key
 
-Ensure your environment variables are set with API access permissions:
+Ensure your API key is set in the environment variables:
 
 ```python
 import os
@@ -32,7 +32,7 @@ os.environ["DF_API_KEY"] = "sk-your-key-here"
 ### Step 2: Initialize Environment
 
 ```bash
-# Create and enter the working directory
+# Create and enter the workspace
 mkdir run_vqa_dataflow
 cd run_vqa_dataflow
 
@@ -41,16 +41,16 @@ dataflowmm init
 
 ```
 
-### Step 3: Download Sample Data
+### Step 3: Download Example Data
 
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
 
 ```
 
-### Step 4: Configure the Script
+### Step 4: Configure the Run Script
 
-In the generated `api_pipelines/image_vqa.py`, you can customize the VLM model name and API information:
+In `api_pipelines/image_vqa.py`, you can customize the VLM model name and API information:
 
 ```python
 self.vlm_serving = APIVLMServing_openai(
@@ -65,22 +65,22 @@ self.vlm_serving = APIVLMServing_openai(
 ### Step 5: Execute the Pipeline
 
 ```bash
-python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json
+python api_pipelines/image_vqa.py 
 
 ```
 
 ---
 
-## 3. Data Flow & Logic
+## 3. Data Flow and Logic Description
 
 ### 1. **Input Data Format**
 
-The input file must contain the image path and a prompt to trigger VQA generation:
+The input file must contain the image path and a prompt to guide the VQA generation:
 
 ```json
 [
     {
-        "image": ["./data/image_vqa/person.png"],
+        "image": ["./example_data/image_vqa/person.png"],
         "conversation": [
             {
                 "from": "human",
@@ -94,20 +94,20 @@ The input file must contain the image path and a prompt to trigger VQA generatio
 
 ### 2. **Core Operator: PromptedVQAGenerator**
 
-This operator is the core engine for generating Q&A pairs:
+This operator serves as the engine for generating QA pairs:
 
-* **Role Definition**: Through the `system_prompt` set as "image question-answer generator", the model is guided to output standard Q&A formats.
-* **Multi-turn Support**: Capable of combining historical context or specific instructions in the `conversation` field to optimize the focus of generated questions.
-* **High-Throughput Processing**: Utilizes `max_workers` for parallel calls, suitable for processing image datasets at scales of  entries.
+* **Role Definition**: Through the `system_prompt`, the model is set as an "image question-answer generator," guiding it to output standard QA formats.
+* **Multi-turn Support**: It can combine historical context or specific instructions in the `conversation` field to refine the focus of question generation.
+* **High Throughput Processing**: Utilizes `max_workers` to implement parallel calls, suitable for processing data at a scale of tens of thousands of images or more.
 
-### 3. **Output Example**
+### 3. **Output Result Example**
 
-Generated VQA results are stored as text in the `vqa` field, typically containing multiple Q&A sets:
+The generated VQA results are stored as text in the `vqa` field, typically containing multiple QA pairs:
 
 ```json
 [
   {
-    "image": ["./data/image_vqa/person.png"],
+    "image": ["./example_data/image_vqa/person.png"],
     "vqa": "- Q: What is the title of the movie shown on the poster?\n  A: Nightmare Alley\n\n- Q: What color is the film’s title text?\n  A: Gold"
   }
 ]
@@ -116,67 +116,67 @@ Generated VQA results are stored as text in the `vqa` field, typically containin
 
 ---
 
-## 4. Full Pipeline Code
+## 4. Complete Pipeline Code
 
 ```python
 import os
-import argparse
+
+# Set API Key environment variable
+os.environ["DF_API_KEY"] = "sk-xxx"
+
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 from dataflow.operators.core_vision import PromptedVQAGenerator
 
-# Configure API Environment
-os.environ["DF_API_KEY"] = "sk-xxx"
 
 class ImageVQAPipeline:
     """
-    One-click batch image VQA generation pipeline
+    Generate batch VQA for images with a single command.
     """
 
-    def __init__(
-        self,
-        first_entry_file: str,
-        cache_path: str = "./cache_local_vqa",
-        file_name_prefix: str = "vqa_task",
-        cache_type: str = "json",
-    ):
-        # 1. Initialize Storage: Supports checkpoints and multi-format export
+    def __init__(self, llm_serving: LLMServingABC = None):
+
+        # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_vqa/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="qa",
+            cache_type="json",
         )
 
-        # 2. Configure VLM API Service
+        # ---------- 2. Serving ----------
         self.vlm_serving = APIVLMServing_openai(
-            api_url="http://172.96.141.132:3001/v1",
-            key_name_of_api_key="DF_API_KEY",
+            api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format
+            key_name_of_api_key="DF_API_KEY", # Set the API key in environment variable or line 4
             model_name="gpt-5-nano-2025-08-07",
-            max_workers=10
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
         )
 
-        # 3. Initialize VQA Operator
+        # ---------- 3. Operator ----------
         self.vqa_generator = PromptedVQAGenerator(
             serving=self.vlm_serving,
-            system_prompt="You are an image question-answer generator. Your task is to generate a question-answer pair for the given image content."
+            system_prompt= "You are a image question-answer generator. Your task is to generate a question-answer pair for the given image content."
         )
 
+    # ------------------------------------------------------------------ #
     def forward(self):
-        # Execute inference task
+        input_image_key = "image"
+        output_answer_key = "vqa"
+
         self.vqa_generator.run(
             storage=self.storage.step(),
             input_conversation_key="conversation",
-            input_image_key="image",
-            output_answer_key="vqa",
+            input_image_key=input_image_key,
+            output_answer_key=output_answer_key,
         )
 
+# ---------------------------- CLI Entry ------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch VQA generation")
-    parser.add_argument("--images_file", default="data/image_vqa/sample_data.json")
-    args = parser.parse_args()
-
-    pipe = ImageVQAPipeline(first_entry_file=args.images_file)
+    pipe = ImageVQAPipeline()
     pipe.forward()
 
 ```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/context_vqa.md b/docs/zh/notes/mm_guide/image_understanding/context_vqa.md
index d9775d1d..cd9ac1da 100644
--- a/docs/zh/notes/mm_guide/image_understanding/context_vqa.md
+++ b/docs/zh/notes/mm_guide/image_understanding/context_vqa.md
@@ -1,8 +1,9 @@
 ---
-title: ContextVQA 多模态问答数据生成流水线
-icon: mdi:image-text
-createTime: 2026/01/24 16:37:37
+title: ContextVQA 多模态问答数据生成流水线 
+icon: mdi:image-text 
+createTime: 2026/01/24 16:37:37 
 permalink: /zh/mm_guide/contextvqa_pipeline/
+
 ---
 ## 1. 概述
 
@@ -17,7 +18,7 @@ permalink: /zh/mm_guide/contextvqa_pipeline/
 流水线的主要流程包括：
 
 1. **数据加载**：读取包含图像路径的数据文件。
-2. **上下文与问答生成**：利用 VLM 基于图像生成 Wikipedia 风格文章及原始问答对。
+2. **上下文与问答生成**：利用本地部署的 VLM 基于图像生成 Wikipedia 风格文章及原始问答对。
 3. **数据清洗与结构化**：解析原始文本，提取结构化的 `{context, qas}` 格式。
 
 ---
@@ -25,40 +26,63 @@ permalink: /zh/mm_guide/contextvqa_pipeline/
 ## 2. 快速开始
 
 ### 第一步：创建新的 DataFlow 工作文件夹
+
 ```bash
 mkdir run_dataflow_mm
 cd run_dataflow_mm
+
 ```
 
 ### 第二步：初始化 DataFlow-MM
+
 ```bash
 dataflow init
+
 ```
+
 这时你会看到：
+
 ```bash
 gpu_pipelines/context_vqa.py  
+
+```
+
+### 第三步：下载示例数据
+
+```bash
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
+
 ```
 
-### 第三步：配置模型路径
+### 第四步：配置模型与数据路径
 
-在 `context_vqa.py` 中配置 VLM 模型路径和示例数据
+在 `context_vqa.py` 中直接修改类初始化参数（不再通过命令行参数传递）：
 
 ```python
-parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") # 修改为你的模型路径
-parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-parser.add_argument("--download_dir", default="./ckpt")
-parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
-
-parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.json") # 修改为你的数据地址，我们提供示例数据在在run_dataflow_mm/example_data/image_to_text_pipeline/capsbench_captions.json，具体里面的图片可以从json中"source"来源下载
-parser.add_argument("--cache_path", default="./cache_local")
-parser.add_argument("--file_name_prefix", default="context_vqa")
-parser.add_argument("--cache_type", default="json")
+# 模型服务配置
+self.serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
+    hf_cache_dir="~/.cache/huggingface",
+    hf_local_dir="./ckpt",
+    vllm_tensor_parallel_size=1,
+    vllm_max_tokens=512,
+)
+
+# 数据存储配置
+self.storage = FileStorage(
+    first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+    cache_path="./cache_local",
+    file_name_prefix="context_vqa",
+    cache_type="json",
+)
+
 ```
 
-### 第四步：一键运行
+### 第五步：一键运行
 
 ```bash
 python gpu_pipelines/context_vqa.py
+
 ```
 
 ---
@@ -67,67 +91,46 @@ python gpu_pipelines/context_vqa.py
 
 ### 1. **输入数据**
 
-该流程的输入数据主要包含以下字段：
+该流程的输入数据通过 `FileStorage` 进行管理，支持断点续传。
 
-* **image**：图像文件路径（本地路径或 URL）。
-* **id**（可选）：数据的唯一标识符。
+**输入数据示例 (`sample_data.json`)**：
 
-数据通过 `FileStorage` 进行管理，支持断点续传。
-
-**输入数据示例**：
+```json
+[
+    {
+        "image": ["./example_data/image_contextvqa/person.png"],
+        "conversation": [
+            {
+                "from": "human",
+                "value": "Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. The question answer pairs should satisfy the following criteria.\n1: The question should refer to the image.\n2: The question should avoid mentioning the name of the object in the image.\n3: The question should be answered by reasoning over the Wikipedia article.\n4: The question should sound natural and concise.\n5: The answer should be extracted from the Wikipedia article.\n6: The answer should not be any objects in the image.\n7: The answer should be a single word or phrase and list all correct answers separated by commas.\n8: The answer should not contain 'and', 'or', rather you can split them into multiple answers."
+            }
+        ]
+    }
+]
 
-```jsonl
-{"id": 1, "image": "./images/landmark.jpg"}
-{"id": 2, "image": "./images/animal.jpg"}
 ```
-示例图片可以在`https://huggingface.co/datasets/OpenDCAI/dataflow-demo-image/tree/main/capsbench_images`中找到；此外我们已经合成了20w高质量context vqa数据供社区使用体验，在https://huggingface.co/datasets/OpenDCAI/dataflow-mm-context_vqa中。
-### 2. **核心算子逻辑**
 
-该流水线通过串联两个核心算子来完成任务：
+### 2. **核心算子逻辑**
 
 #### A. **FixPromptedVQAGenerator（上下文生成）**
 
-该算子负责利用 VLM 模型，根据预设的 Prompt 模板生成原始文本。
-
-**功能：**
-
-* 基于图像生成一段 Wikipedia 风格的科普文章。
-* 基于文章生成问答对。
-* **Prompt 约束**：问题指向图像但避免直接提及物体名称；答案必须来自文章内容且非图像中的物体；答案简练。
-
-**模型服务配置**：
-
-```python
-self.serving = LocalModelVLMServing_vllm(
-    hf_model_name_or_path=model_path,
-    hf_cache_dir=hf_cache_dir,
-    vllm_tensor_parallel_size=1,
-    vllm_temperature=0.7,  # 保持一定的创造性
-    vllm_top_p=0.9,
-    vllm_max_tokens=512,
-)
-
-```
+该算子负责调用本地 VLM 模型，根据内置的 Wikipedia 风格 Prompt 模板生成原始文本。
 
 **算子运行**：
 
 ```python
 self.vqa_generator.run(
     storage=self.storage.step(),
-    input_image_key="image",
-    output_answer_key="vqa" # 输出原始生成的文本
+    input_conversation_key="conversation",
+    input_image_key=input_image_key,
+    output_answer_key=output_answer_key,
 )
 
 ```
 
 #### B. **WikiQARefiner（结果解析）**
 
-该算子负责将 VLM 生成的非结构化文本清洗并转换为标准格式。
-
-**功能：**
-
-* 清洗 Markdown 格式和多余的空白字符。
-* 分离文章内容（Context）和问答对（QAs）。
+该算子负责将 VLM 生成的非结构化文本清洗并转换为标准格式，分离文章内容（Context）和问答对（QAs）。
 
 **算子运行**：
 
@@ -142,28 +145,20 @@ self.refiner.run(
 
 ### 3. **输出数据**
 
-最终，流水线生成的输出数据将包含以下内容：
-
-* **image**：原始图像路径。
-* **vqa**：VLM 生成的原始文本（中间结果）。
-* **context_vqa**：结构化的最终结果，包含 `context`（文章）和 `qas`（问答列表）。
+最终生成的结构化数据包含 `context`（文章）和 `qas`（问答列表）。
 
 **输出数据示例**：
 
 ```json
 {
     "id": 1,
-    "image": "./images/landmark.jpg",
+    "image": ["./example_data/image_contextvqa/person.png"],
     "context_vqa": {
-        "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France...",
+        "context": "Nightmare Alley is a 2021 American psychological thriller film...",
         "qas": [
             {
-                "question": "In which city is this structure located?",
-                "answer": "Paris"
-            },
-            {
-                "question": "What material is the tower primarily constructed from?",
-                "answer": "wrought-iron"
+                "question": "What genre does this film belong to?",
+                "answer": "Psychological thriller"
             }
         ]
     }
@@ -175,45 +170,35 @@ self.refiner.run(
 
 ## 4. 流水线示例
 
-以下是完整的 `ContextVQAPipeline` 代码实现，支持命令行参数调用。
+以下是完整的 `ContextVQAPipeline` 代码实现。
 
 ```python
 import argparse
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
-from dataflow.operators.core_vision import FixPromptedVQAGenerator
-from dataflow.operators.core_vision import WikiQARefiner
+from dataflow.operators.core_vision import PromptedVQAGenerator, WikiQARefiner
+
 
 class ContextVQAPipeline:
     """
     一行命令即可完成图片批量 ContextVQA Caption 生成。
     """
 
-    def __init__(
-        self,
-        model_path: str,
-        *,
-        hf_cache_dir: str | None = None,
-        download_dir: str = "./ckpt",
-        device: str = "cuda",
-        first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl",
-        cache_path: str = "./cache_local_skvqa",
-        file_name_prefix: str = "skvqa_cache_step",
-        cache_type: str = "jsonl",
-    ):
+    def __init__(self, llm_serving: LLMServingABC = None):
         # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="context_vqa",
+            cache_type="json",
         )
 
         # ---------- 2. Serving ----------
-        self.serving = LocalModelVLMServing_vllm(
-            hf_model_name_or_path=model_path,
-            hf_cache_dir=hf_cache_dir,
-            hf_local_dir=download_dir,
+        self.vlm_serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
+            hf_cache_dir="~/.cache/huggingface",
+            hf_local_dir="./ckpt",
             vllm_tensor_parallel_size=1,
             vllm_temperature=0.7,
             vllm_top_p=0.9,
@@ -221,40 +206,25 @@ class ContextVQAPipeline:
         )
 
         # ---------- 3. Operator ----------
-        # 使用特定 Prompt 生成 Wiki 风格文章与问答
-        self.vqa_generator = FixPromptedVQAGenerator(
-            serving=self.serving,
-            system_prompt="You are a helpful assistant.",
-            user_prompt= """
-            Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. The question answer pairs should satisfy the following criteria.
-            1: The question should refer to the image.
-            2: The question should avoid mentioning the name of the object in the image.
-            3: The question should be answered by reasoning over the Wikipedia article.
-            4: The question should sound natural and concise.
-            5: The answer should be extracted from the Wikipedia article.
-            6: The answer should not be any objects in the image.
-            7: The answer should be a single word or phrase and list all correct answers separated by commas.
-            8: The answer should not contain 'and', 'or', rather you can split them into multiple answers.
-            """
+        self.vqa_generator = PromptedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt= "You are a helpful assistant."
         )
 
-        # 结果清洗与结构化
         self.refiner = WikiQARefiner()
-
     # ------------------------------------------------------------------ #
     def forward(self):
         input_image_key = "image"
         output_answer_key = "vqa"
         output_wiki_key = "context_vqa"
 
-        # 步骤 1: 生成原始文本
         self.vqa_generator.run(
             storage=self.storage.step(),
+            input_conversation_key="conversation",
             input_image_key=input_image_key,
             output_answer_key=output_answer_key
         )
 
-        # 步骤 2: 解析为结构化数据
         self.refiner.run(
             storage=self.storage.step(),
             input_key=output_answer_key,
@@ -263,30 +233,7 @@ class ContextVQAPipeline:
 
 # ---------------------------- CLI 入口 -------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow")
-
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-    parser.add_argument("--download_dir", default="./ckpt")
-    parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
-
-    parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
-    parser.add_argument("--cache_path", default="./cache_local")
-    parser.add_argument("--file_name_prefix", default="context_vqa")
-    parser.add_argument("--cache_type", default="jsonl")
-
-    args = parser.parse_args()
-
-    pipe = ContextVQAPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        device=args.device,
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
+    pipe = ContextVQAPipeline()
     pipe.forward()
 
-```
+```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md b/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md
index d21ecff8..3ecb894a 100644
--- a/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md
@@ -1,13 +1,12 @@
 ---
-title: ContextVQA 多模态问答数据生成流水线（API版）
-icon: mdi:image-text
-createTime: 2026/01/24 16:37:37
+title: ContextVQA 多模态问答数据生成流水线（API版） 
+icon: mdi:image-text 
+createTime: 2026/01/24 16:37:37 
 permalink: /zh/mm_guide/contextvqa_api_pipeline/
 ---
-
 ## 1. 概述
 
-**ContextVQA 多模态问答数据生成流水线（API版）**旨在从图像出发，自动生成**具备外部知识上下文的视觉问答（Context-based VQA）数据**。该流水线通过 API 形式的视觉语言模型（VLM）生成 Wikipedia 风格文章及问答对，并将其解析为结构化数据，便于构建知识型 VQA 与多模态 RAG 数据集。
+**ContextVQA 多模态问答数据生成流水线（API版）** 旨在从图像出发，自动生成**具备外部知识上下文的视觉问答（Context-based VQA）数据**。该流水线通过 API 形式的视觉语言模型（VLM）生成 Wikipedia 风格文章及问答对，并将其解析为结构化数据，便于构建知识型 VQA 与多模态 RAG 数据集。
 
 我们支持以下应用场景：
 
@@ -31,53 +30,71 @@ permalink: /zh/mm_guide/contextvqa_api_pipeline/
 
 ```python
 import os
-os.environ["DF_API_KEY"] = "your_api_key"
+os.environ["DF_API_KEY"] = "sk-xxx"
+
 ```
 
 ### 第二步：创建新的 DataFlow 工作文件夹
+
 ```bash
 mkdir run_dataflow
 cd run_dataflow
+
 ```
 
 ### 第三步：初始化 DataFlow-MM
+
 ```bash
 dataflowmm init
+
 ```
+
 这时你会看到：
+
 ```bash
 api_pipelines/image_contextvqa.py
+
 ```
 
 ### 第四步：下载示例数据
+
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
+
 ```
 
 ### 第五步：配置参数
 
-在 `image_contextvqa.py` 中配置 API 服务和输入数据路径：
+在 `image_contextvqa.py` 中配置 API 服务和输入数据路径（无需 `argparse`，直接在代码中修改默认路径）：
 
 ```python
 self.vlm_serving = APIVLMServing_openai(
-    api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # 任意兼容openai 格式的api平台
+    api_url="http://172.96.141.132:3001/v1", # 任意兼容openai 格式的api平台
     key_name_of_api_key="DF_API_KEY", # 对应的api key，在第一步中设置
-    model_name="qwen3-vl-8b-instruct",
+    model_name="gpt-5-nano-2025-08-07",
+    image_io=None,
+    send_request_stream=False,
     max_workers=10,
     timeout=1800
 )
+
 ```
 
 ```python
-parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json")
-parser.add_argument("--cache_path", default="./cache_local")
-parser.add_argument("--file_name_prefix", default="context_vqa")
-parser.add_argument("--cache_type", default="json")
+self.storage = FileStorage(
+    first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+    cache_path="./cache_local",
+    file_name_prefix="context_vqa",
+    cache_type="json",
+)
+
 ```
 
-### 第五步：一键运行
+### 第六步：一键运行
+
 ```bash
 python api_pipelines/image_contextvqa.py
+
 ```
 
 ---
@@ -99,7 +116,7 @@ python api_pipelines/image_contextvqa.py
 ```json
 [
     {
-        "image": ["./data/image_contextvqa/person.png"],
+        "image": ["./example_data/image_contextvqa/person.png"],
         "conversation": [
             {
                 "from": "human",
@@ -128,12 +145,13 @@ python api_pipelines/image_contextvqa.py
 
 ```python
 self.vlm_serving = APIVLMServing_openai(
-    api_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+    api_url="http://172.96.141.132:3001/v1",
     key_name_of_api_key="DF_API_KEY",
-    model_name="qwen3-vl-8b-instruct",
+    model_name="gpt-5-nano-2025-08-07",
     max_workers=10,
     timeout=1800
 )
+
 ```
 
 **算子运行**：
@@ -145,6 +163,7 @@ self.vqa_generator.run(
     input_image_key="image",
     output_answer_key="vqa"
 )
+
 ```
 
 #### B. **WikiQARefiner（结果解析）**
@@ -164,6 +183,7 @@ self.refiner.run(
     input_key="vqa",
     output_key="context_vqa"
 )
+
 ```
 
 ### 3. **输出数据**
@@ -180,105 +200,66 @@ self.refiner.run(
 [
   {
     "image":[
-      ".\/data\/image_contextvqa\/person.png"
+      "./example_data/image_contextvqa/person.png"
     ],
     "conversation":[
       {
         "from":"human",
-        "value":"Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. The question answer pairs should satisfy the following criteria.\n1: The question should refer to the image.\n2: The question should avoid mentioning the name of the object in the image.\n3: The question should be answered by reasoning over the Wikipedia article.\n4: The question should sound natural and concise.\n5: The answer should be extracted from the Wikipedia article.\n6: The answer should not be any objects in the image.\n7: The answer should be a single word or phrase and list all correct answers separated by commas.\n8: The answer should not contain 'and', 'or', rather you can split them into multiple answers."
+        "value":"Write a Wikipedia article related to this image..."
       }
     ],
     "context_vqa":{
-      "context":"**Wikipedia Article:** *Nightmare Alley* is a 2021 American psychological thriller film directed by Guillermo del Toro and written by del Toro and Kim Morgan. The film is based on the 1946 novel of the same name by William Lindsay Gresham. It follows the rise and fall of a street-smart con man who becomes involved with a carnival showman and his wife, eventually becoming embroiled in a dangerous world of deception and manipulation. The film stars Bradley Cooper as Stanton “Stan” Carlisle, Cate Blanchett as Pearl Holland, Toni Collette as Molly, Willem Dafoe as Dr. John L. Thorne, Richard Jenkins as Mr. O’Malley, Rooney Mara as Vera, Ron Perlman as The Duke, Mary Steenburgen as Mrs. Hargrove, and David Strathairn as Mr. Hargrove. The screenplay was adapted from the original novel by William Lindsay Gresham, which had previously been adapted into a 1947 film starring Tyrone Power. The film premiered at the Venice International Film Festival on September 1, 2021, and was released in the United States on December 17, 2021. It received critical acclaim for its direction, performances, and cinematography. The film’s score was composed by Benjamin Wallfisch, and it features a haunting atmosphere that complements its dark themes. *Nightmare Alley* explores themes of ambition, morality, and the corrupting nature of power. It was nominated for several awards, including Best Picture at the Academy Awards, and won Best Supporting Actor for Willem Dafoe. The film's production design and visual style were praised for their evocative portrayal of 1940s America.",
+      "context":"**Wikipedia Article:** *Nightmare Alley* is a 2021 American psychological thriller film directed by Guillermo del Toro...",
       "qas":[
         {
           "question":"What genre does this film belong to?",
           "answer":"Psychological thriller"
-        },
-        {
-          "question":"Who directed this film?",
-          "answer":"Guillermo del Toro"
-        },
-        {
-          "question":"What year was this film released?",
-          "answer":"2021"
-        },
-        {
-          "question":"Which actor plays the main character?",
-          "answer":"Bradley Cooper"
-        },
-        {
-          "question":"What is the original source material for this film?",
-          "answer":"Novel"
-        },
-        {
-          "question":"What festival did this film premiere at?",
-          "answer":"Venice International Film Festival"
-        },
-        {
-          "question":"What award nomination did this film receive?",
-          "answer":"Best Picture"
-        },
-        {
-          "question":"What theme does this film explore?",
-          "answer":"Ambition"
-        },
-        {
-          "question":"What decade does the setting primarily reflect?",
-          "answer":"1940s"
-        },
-        {
-          "question":"What is the title of the film’s score composer?",
-          "answer":"Benjamin Wallfisch"
         }
       ]
     }
   }
 ]
+
 ```
 
 ---
 
 ## 4. 流水线示例
 
-以下是完整的 `ContextVQAPipeline` 代码实现，支持命令行参数调用。
+以下是完整的 `ContextVQAPipeline` 代码实现。
 
 ```python
 import os
-import argparse
+
+# 设置 API Key 环境变量
+os.environ["DF_API_KEY"] = "sk-xxx"
+
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 from dataflow.operators.core_vision import PromptedVQAGenerator
 from dataflow.operators.core_vision import WikiQARefiner
 
-# 设置 API Key 环境变量
-os.environ["DF_API_KEY"] = "sk-xxxx"
 
 class ContextVQAPipeline:
     """
     一行命令即可完成图片批量 ContextVQA Caption 生成。
     """
 
-    def __init__(
-        self,
-        first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl",
-        cache_path: str = "./cache_local_skvqa",
-        file_name_prefix: str = "skvqa_cache_step",
-        cache_type: str = "jsonl",
-    ):
+    def __init__(self, llm_serving: LLMServingABC = None):
         # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_contextvqa/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="context_vqa",
+            cache_type="json",
         )
 
         # ---------- 2. Serving ----------
         self.vlm_serving = APIVLMServing_openai(
-            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
-            key_name_of_api_key="DF_API_KEY",
-            model_name="qwen3-vl-8b-instruct",
+            api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format
+            key_name_of_api_key="DF_API_KEY", # Set the API key in environment variable
+            model_name="gpt-5-nano-2025-08-07",
             image_io=None,
             send_request_stream=False,
             max_workers=10,
@@ -288,7 +269,7 @@ class ContextVQAPipeline:
         # ---------- 3. Operator ----------
         self.vqa_generator = PromptedVQAGenerator(
             serving=self.vlm_serving,
-            system_prompt="You are a helpful assistant."
+            system_prompt= "You are a helpful assistant."
         )
 
         self.refiner = WikiQARefiner()
@@ -314,20 +295,7 @@ class ContextVQAPipeline:
 
 # ---------------------------- CLI 入口 -------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow")
-
-    parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json")
-    parser.add_argument("--cache_path", default="./cache_local")
-    parser.add_argument("--file_name_prefix", default="context_vqa")
-    parser.add_argument("--cache_type", default="json")
-
-    args = parser.parse_args()
-
-    pipe = ContextVQAPipeline(
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
+    pipe = ContextVQAPipeline()
     pipe.forward()
-```
+
+```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md b/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md
index 9b8291b2..15a86c2e 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md
@@ -43,7 +43,7 @@ dataflowmm init
 ### 第三步：下载示例数据
 
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
 
 ```
 
@@ -65,7 +65,7 @@ self.vlm_serving = APIVLMServing_openai(
 ### 第五步：运行流水线
 
 ```bash
-python api_pipelines/image_caption.py --images_file data/image_caption/sample_data.json
+python api_pipelines/image_caption.py
 
 ```
 
@@ -80,7 +80,7 @@ python api_pipelines/image_caption.py --images_file data/image_caption/sample_da
 ```json
 [
     {
-        "image": ["./data/image_caption/person.png"],
+        "image": ["./example_data/image_caption/person.png"],
         "conversation": [
             {
                 "from": "human",
@@ -107,7 +107,7 @@ python api_pipelines/image_caption.py --images_file data/image_caption/sample_da
 ```json
 [
   {
-    "image": ["./data/image_caption/person.png"],
+    "image": ["./example_data/image_caption/person.png"],
     "conversation": [...],
     "caption": "Promotional poster for Nightmare Alley in grayscale, showing a man in a formal tuxedo with a white bow tie. The cast names run down the left side (Bradley Cooper, Cate Blanchett, Toni Collette, Willem Dafoe, and more), and the gold title Nightmare Alley appears near the bottom left with release text and Regal branding."
   }
@@ -123,73 +123,63 @@ python api_pipelines/image_caption.py --images_file data/image_caption/sample_da
 
 ```python
 import os
-import argparse
+
+# 设置 API Key 环境变量
+os.environ["DF_API_KEY"] = "sk-xxx"
+
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 from dataflow.operators.core_vision import PromptedVQAGenerator
 
-# 设置 API Key 环境变量
-os.environ["DF_API_KEY"] = "sk-xxx"
 
 class ImageCaptionPipeline:
     """
     一行命令即可完成图片批量 Caption 生成。
     """
 
-    def __init__(
-        self,
-        first_entry_file: str,
-        cache_path: str = "./cache_local",
-        file_name_prefix: str = "caption",
-        cache_type: str = "json",
-    ):
-        # ---------- 1. Storage: 管理数据读取与断点续传 ----------
+    def __init__(self, llm_serving: LLMServingABC = None):
+
+        # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_caption/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="caption",
+            cache_type="json",
         )
 
-        # ---------- 2. Serving: 配置 API 服务 ----------
+        # ---------- 2. Serving ----------
         self.vlm_serving = APIVLMServing_openai(
-            api_url="http://172.96.141.132:3001/v1", 
-            key_name_of_api_key="DF_API_KEY",
+            api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format
+            key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4
             model_name="gpt-5-nano-2025-08-07",
+            image_io=None,
+            send_request_stream=False,
             max_workers=10,
             timeout=1800
         )
 
-        # ---------- 3. Operator: 定义生成逻辑 ----------
+        # ---------- 3. Operator ----------
         self.vqa_generator = PromptedVQAGenerator(
             serving=self.vlm_serving,
-            system_prompt="You are a image caption generator. Your task is to generate a concise and informative caption for the given image content."
+            system_prompt= "You are a image caption generator. Your task is to generate a concise and informative caption for the given image content."
         )
 
+    # ------------------------------------------------------------------ #
     def forward(self):
-        # 运行流水线
+        input_image_key = "image"
+        output_answer_key = "caption"
+
         self.vqa_generator.run(
             storage=self.storage.step(),
             input_conversation_key="conversation",
-            input_image_key="image",
-            output_answer_key="caption",
+            input_image_key=input_image_key,
+            output_answer_key=output_answer_key,
         )
 
+# ---------------------------- CLI 入口 -------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch image caption generation with DataFlow")
-    parser.add_argument("--images_file", default="data/image_caption/sample_data.json")
-    parser.add_argument("--cache_path", default="./cache_local")
-    parser.add_argument("--file_name_prefix", default="caption")
-    parser.add_argument("--cache_type", default="json")
-
-    args = parser.parse_args()
-
-    pipe = ImageCaptionPipeline(
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
+    pipe = ImageCaptionPipeline()
     pipe.forward()
 
 ```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md b/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md
index 49ac7714..67120d2a 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md
@@ -44,7 +44,7 @@ dataflowmm init
 ### 第三步：下载示例数据
 
 ```bash
-huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data
+huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data
 
 ```
 
@@ -65,7 +65,7 @@ self.vlm_serving = APIVLMServing_openai(
 ### 第五步：执行流水线
 
 ```bash
-python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json
+python api_pipelines/image_vqa.py 
 
 ```
 
@@ -80,7 +80,7 @@ python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json
 ```json
 [
     {
-        "image": ["./data/image_vqa/person.png"],
+        "image": ["./example_data/image_vqa/person.png"],
         "conversation": [
             {
                 "from": "human",
@@ -107,7 +107,7 @@ python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json
 ```json
 [
   {
-    "image": ["./data/image_vqa/person.png"],
+    "image": ["./example_data/image_vqa/person.png"],
     "vqa": "- Q: What is the title of the movie shown on the poster?\n  A: Nightmare Alley\n\n- Q: What color is the film’s title text?\n  A: Gold"
   }
 ]
@@ -120,63 +120,63 @@ python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json
 
 ```python
 import os
-import argparse
+
+# 设置 API Key 环境变量
+os.environ["DF_API_KEY"] = "sk-xxx"
+
 from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServingABC
 from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 from dataflow.operators.core_vision import PromptedVQAGenerator
 
-# 配置 API 环境
-os.environ["DF_API_KEY"] = "sk-xxx"
 
 class ImageVQAPipeline:
     """
-    一键式图片批量 VQA 生成流水线
+    一行命令即可完成图片批量 VQA 生成。
     """
 
-    def __init__(
-        self,
-        first_entry_file: str,
-        cache_path: str = "./cache_local_vqa",
-        file_name_prefix: str = "vqa_task",
-        cache_type: str = "json",
-    ):
-        # 1. 初始化存储：支持断点续传与多格式导出
+    def __init__(self, llm_serving: LLMServingABC = None):
+
+        # ---------- 1. Storage ----------
         self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
+            first_entry_file_name="./example_data/image_vqa/sample_data.json",
+            cache_path="./cache_local",
+            file_name_prefix="qa",
+            cache_type="json",
         )
 
-        # 2. 配置 VLM API 服务
+        # ---------- 2. Serving ----------
         self.vlm_serving = APIVLMServing_openai(
-            api_url="http://172.96.141.132:3001/v1",
-            key_name_of_api_key="DF_API_KEY",
+            api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format
+            key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4
             model_name="gpt-5-nano-2025-08-07",
-            max_workers=10
+            image_io=None,
+            send_request_stream=False,
+            max_workers=10,
+            timeout=1800
         )
 
-        # 3. 初始化 VQA 算子
+        # ---------- 3. Operator ----------
         self.vqa_generator = PromptedVQAGenerator(
             serving=self.vlm_serving,
-            system_prompt="You are a image question-answer generator. Your task is to generate a question-answer pair for the given image content."
+            system_prompt= "You are a image question-answer generator. Your task is to generate a question-answer pair for the given image content."
         )
 
+    # ------------------------------------------------------------------ #
     def forward(self):
-        # 执行推理任务
+        input_image_key = "image"
+        output_answer_key = "vqa"
+
         self.vqa_generator.run(
             storage=self.storage.step(),
             input_conversation_key="conversation",
-            input_image_key="image",
-            output_answer_key="vqa",
+            input_image_key=input_image_key,
+            output_answer_key=output_answer_key,
         )
 
+# ---------------------------- CLI 入口 -------------------------------- #
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch VQA generation")
-    parser.add_argument("--images_file", default="data/image_vqa/sample_data.json")
-    args = parser.parse_args()
-
-    pipe = ImageVQAPipeline(first_entry_file=args.images_file)
+    pipe = ImageVQAPipeline()
     pipe.forward()
 
 ```
\ No newline at end of file

From 9fcfff13382963e9b817fe1fecccd747b6c4a196 Mon Sep 17 00:00:00 2001
From: chawuciren11 <2216740116@qq.com>
Date: Wed, 25 Feb 2026 17:34:42 +0800
Subject: [PATCH 5/7] 11

---
 .../image_region_caption_pipeline_api.md      | 26 +++++++++----------
 .../image_region_caption_pipeline_api.md      | 24 ++++++++---------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index 66391423..7b905e97 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -26,17 +26,7 @@ The main process of the pipeline includes:
 
 ## 2. Quick Start
 
-### Step 1: Configure API Key
-
-Set your API Key environment variable in your script:
-
-```python
-import os
-os.environ["DF_API_KEY"] = "your_api_key"
-
-```
-
-### Step 2: Create a New DataFlow Working Directory
+### Step 1: Create a New DataFlow Working Directory
 
 ```bash
 mkdir run_dataflow
@@ -44,7 +34,7 @@ cd run_dataflow
 
 ```
 
-### Step 3: Initialize DataFlow-MM
+### Step 2: Initialize DataFlow-MM
 
 ```bash
 dataflowmm init
@@ -57,13 +47,23 @@ You will then see:
 api_pipelines/image_region_caption_api_pipeline.py
 ```
 
-### Step 4: Download Sample Data
+### Step 3: Download Sample Data
 
 ```bash
 huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 
 ```
 
+### Step 4: Configure API Key
+
+Set your API Key environment variable in `api_pipelines/image_region_caption_api_pipeline.py`:
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+
+```
+
 ### Step 5: Configure Parameters
 
 Configure the API service and input data paths in `api_pipelines/image_region_caption_api_pipeline.py`:
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index 8f465dc1..895ee281 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -26,22 +26,13 @@ permalink: /zh/mm_guide/image_region_caption_pipeline_api/
 
 ## 2. 快速开始
 
-### 第一步：配置 API Key
-
-在脚本中设置 API Key 环境变量：
-
-```python
-import os
-os.environ["DF_API_KEY"] = "your_api_key"
-```
-
-### 第二步：创建新的 DataFlow 工作文件夹
+### 第一步：创建新的 DataFlow 工作文件夹
 ```bash
 mkdir run_dataflow
 cd run_dataflow
 ```
 
-### 第三步：初始化 DataFlow-MM
+### 第二步：初始化 DataFlow-MM
 ```bash
 dataflowmm init
 ```
@@ -50,11 +41,20 @@ dataflowmm init
 api_pipelines/image_region_caption_api_pipeline.py
 ```
 
-### 第四步：下载示例数据
+### 第三步：下载示例数据
 ```bash
 huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data
 ```
 
+### 第四步：配置 API Key
+
+在 `api_pipelines/image_region_caption_api_pipeline.py` 中设置 API Key 环境变量：
+
+```python
+import os
+os.environ["DF_API_KEY"] = "your_api_key"
+```
+
 ### 第五步：配置参数
 
 在 `api_pipelines/image_region_caption_api_pipeline.py` 中配置 API 服务和输入数据路径：

From 484580ff80c57faa5543e92cb0cb2f443d18d4df Mon Sep 17 00:00:00 2001
From: chawuciren11 <2216740116@qq.com>
Date: Wed, 25 Feb 2026 18:57:37 +0800
Subject: [PATCH 6/7] 11

---
 .../image_region_caption_pipeline.md          | 98 +++++++++++--------
 .../image_region_caption_pipeline_api.md      | 45 +++------
 .../generate/image_bbox_generator.md          | 14 +--
 .../image_region_caption_pipeline.md          | 97 ++++++++++--------
 .../image_region_caption_pipeline_api.md      | 45 +++------
 .../generate/image_bbox_generator.md          | 14 +--
 6 files changed, 147 insertions(+), 166 deletions(-)

diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index a67175f4..4013ee2b 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -59,7 +59,7 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
         self,
         model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
         hf_cache_dir: str = "~/.cache/huggingface",
-        download_dir: str = "./ckpt/models",
+        download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct",
         first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
         cache_path: str = "../cache/image_region_caption",
         file_name_prefix: str = "region_caption",
@@ -70,6 +70,11 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
         output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
 ```
+> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):**
+> 
+> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it.
+> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading.
+
 
 ### Step 5: Run
 
@@ -77,6 +82,44 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
 cd gpu_pipelines
 python image_region_caption_pipeline.py
 ```
+> **🛠️ Troubleshooting**
+> 
+> **Issue 1:** If you encounter a CUDA library conflict error similar to the following:
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> 
+> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`:
+> ```bash
+> LD_LIBRARY_PATH="" python image_region_caption_pipeline.py
+> ```
+> 
+> **Issue 2:** If you are using **Qwen series models** and encounter the following error:
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> 
+> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`.
+> 
+> **Before modification:**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
+> 
+> **After modification:**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
 
 ---
 
@@ -93,7 +136,7 @@ The input data typically contains the image path and a list of corresponding bou
 
 ```json
 {
-    "image": "./data/image_region_caption/20.jpg",
+    "image": "../example_data/image_region_caption/20.jpg",
     "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
 }
 
@@ -138,10 +181,12 @@ The final generated output data includes the processed image path and the genera
 
 ```json
 {
-    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "image":"..\/example_data\/image_region_caption\/20.png",
+    "type":"with_bbox",
     "bbox":[[196,104,310,495]],
     "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
-    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "result_file":"..\/cache\/image_region_caption",
+    "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg",
     "valid_bboxes_num":1,
     "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
     "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
@@ -155,7 +200,6 @@ The final generated output data includes the processed image path and the genera
 Below is the complete `ImageRegionCaptionPipeline` code implementation.
 
 ```python
-import argparse
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
 from dataflow.operators.core_vision.generate.image_bbox_generator import (
     ImageBboxGenerator, 
@@ -170,19 +214,17 @@ from dataflow.utils.storage import FileStorage
 class ImageRegionCaptionPipeline:
     def __init__(
         self,
-        model_path: str,
-        *,
-        hf_cache_dir: str | None = None,
-        download_dir: str = "./ckpt/models",
-        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
-        cache_path: str = "./cache/image_region_caption",
+        model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir: str = "~/.cache/huggingface",
+        download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
         file_name_prefix: str = "region_caption",
         cache_type: str = "jsonl",
         input_image_key: str = "image",
         input_bbox_key: str = "bbox",
-        image_with_bbox_path: str = 'image_with_bbox',
         max_boxes: int = 10,
-        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
         self.bbox_storage = FileStorage(
             first_entry_file_name=first_entry_file,
@@ -216,7 +258,6 @@ class ImageRegionCaptionPipeline:
         self.caption_generator = PromptedVQAGenerator(serving=self.serving,)
         self.input_image_key = input_image_key
         self.input_bbox_key = input_bbox_key
-        self.image_with_bbox_path=image_with_bbox_path
         self.bbox_record=None
 
     def forward(self):
@@ -234,34 +275,7 @@ class ImageRegionCaptionPipeline:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-    parser.add_argument("--download_dir", default="./ckpt/models")
-    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
-    parser.add_argument("--cache_path", default="./cache/image_region_caption")
-    parser.add_argument("--file_name_prefix", default="region_caption")
-    parser.add_argument("--cache_type", default="jsonl")
-    parser.add_argument("--input_image_key", default="image")
-    parser.add_argument("--input_bbox_key", default="bbox")
-    parser.add_argument("--max_boxes", type=int, default=10)
-    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
-
-    args = parser.parse_args()
-
-    pipe = ImageRegionCaptionPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        first_entry_file=args.first_entry_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-        input_image_key=args.input_image_key,
-        input_bbox_key=args.input_bbox_key,
-        max_boxes=args.max_boxes,
-        output_image_with_bbox_path=args.output_image_with_bbox_path
-    )
+    pipe = ImageRegionCaptionPipeline()
     pipe.forward()
 
 ```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index 7b905e97..b146935c 100644
--- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -115,7 +115,7 @@ The input data typically contains the image path and a list of corresponding bou
 
 ```json
 {
-    "image": "./data/image_region_caption/20.jpg",
+    "image": "../example_data/image_region_caption/20.jpg",
     "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
 }
 
@@ -160,10 +160,12 @@ The final generated output data includes the processed image path and the genera
 
 ```json
 {
-    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "image":"..\/example_data\/image_region_caption\/20.png",
+    "type":"with_bbox",
     "bbox":[[196,104,310,495]],
     "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
-    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "result_file":"..\/cache\/image_region_caption",
+    "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg",
     "valid_bboxes_num":1,
     "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
     "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
@@ -178,9 +180,8 @@ Below is the complete `ImageRegionCaptionAPIPipeline` code implementation.
 
 ```python
 import os
-os.environ["DF_API_KEY"] = "sk-iaY19LU7WMT5QlK8LujFIG7RjI2omHLWYiCs4Do6imieLKOg"
+os.environ["DF_API_KEY"] = "sk-xxxx"
 
-import argparse
 from dataflow.operators.core_vision.generate.image_bbox_generator import (
     ImageBboxGenerator, 
     ExistingBBoxDataGenConfig
@@ -194,15 +195,14 @@ from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 class ImageRegionCaptionPipeline:
     def __init__(
         self,
-        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
-        cache_path: str = "./cache/image_region_caption",
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
         file_name_prefix: str = "region_caption",
         cache_type: str = "jsonl",
         input_image_key: str = "image",
         input_bbox_key: str = "bbox",
-        image_with_bbox_path: str = 'image_with_bbox',
         max_boxes: int = 10,
-        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
         self.bbox_storage = FileStorage(
             first_entry_file_name=first_entry_file,
@@ -224,7 +224,7 @@ class ImageRegionCaptionPipeline:
             cache_type=cache_type
         )
         self.vlm_serving = APIVLMServing_openai(
-            api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any OpenAI-compatible API endpoint; model_name below must be a model that this endpoint serves
             model_name="gpt-4o-mini",
             image_io=None,
             send_request_stream=False,
@@ -235,7 +235,6 @@ class ImageRegionCaptionPipeline:
         self.caption_generator = PromptedVQAGenerator(serving=self.vlm_serving,system_prompt="You are a helpful assistant.")
         self.input_image_key = input_image_key
         self.input_bbox_key = input_bbox_key
-        self.image_with_bbox_path=image_with_bbox_path
         self.bbox_record=None
 
     def forward(self):
@@ -253,29 +252,7 @@ class ImageRegionCaptionPipeline:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
-    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
-    parser.add_argument("--cache_path", default="./cache/image_region_caption")
-    parser.add_argument("--file_name_prefix", default="region_caption")
-    parser.add_argument("--cache_type", default="jsonl")
-    parser.add_argument("--input_image_key", default="image")
-    parser.add_argument("--input_bbox_key", default="bbox")
-
-    parser.add_argument("--max_boxes", type=int, default=10)
-    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
-
-    args = parser.parse_args()
-
-    pipe = ImageRegionCaptionPipeline(
-        first_entry_file=args.first_entry_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-        input_image_key=args.input_image_key,
-        input_bbox_key=args.input_bbox_key,
-        max_boxes=args.max_boxes,
-        output_image_with_bbox_path=args.output_image_with_bbox_path,
-    )
+    pipe = ImageRegionCaptionPipeline()
     pipe.forward()
 
 ```
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
index 0d6f0c57..b5f579be 100644
--- a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
+++ b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
@@ -98,14 +98,14 @@ from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig
 
 cfg = ExistingBBoxDataGenConfig(
     max_boxes=10,
-    input_jsonl_path="./data/image_region_caption/image_region_caption_demo.jsonl",
-    output_jsonl_path="./cache/image_region_caption/image_with_bbox_result.jsonl",
+    input_jsonl_path="../example_data/image_region_caption/image_region_caption_demo.jsonl",
+    output_jsonl_path="../cache/image_region_caption/image_with_bbox_result.jsonl",
 )
 generator = ImageBboxGenerator(config=config)
 
 storage = FileStorage(
-    first_entry_file_name="./data/image_region_caption/image_region_caption_demo.jsonl",
-    cache_path="./cache/image_region_caption",
+    first_entry_file_name="../example_data/image_region_caption/image_region_caption_demo.jsonl",
+    cache_path="../cache/image_region_caption",
     file_name_prefix="region_caption",
     cache_type="jsonl"
 )
@@ -123,12 +123,12 @@ Each line in the `image_with_bbox_result.jsonl` file contains:
 
 ```json
 {
-    "image": "./data/image_region_caption/20.png", 
+    "image": "../example_data/image_region_caption/20.png", 
     "type": "with_bbox", 
     "bbox": [[196, 104, 310, 495]], 
     "normalized_bbox": [[0.128, 0.125, 0.329, 0.72], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], 
-    "result_file": "./cache/image_region_caption", 
-    "image_with_bbox": "./cache/image_region_caption\\2_bbox_vis.jpg", 
+    "result_file": "../cache/image_region_caption", 
+    "image_with_bbox": "../cache/image_region_caption\\2_bbox_vis.jpg", 
     "valid_bboxes_num": 1, 
     "prompt": "Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>."
 }
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
index af575f54..2fe90ca0 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -52,7 +52,7 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
         self,
         model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
         hf_cache_dir: str = "~/.cache/huggingface",
-        download_dir: str = "./ckpt/models",
+        download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct",
         first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
         cache_path: str = "../cache/image_region_caption",
         file_name_prefix: str = "region_caption",
@@ -63,6 +63,10 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
         output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
 ```
+> **⚠️ 模型路径配置的重要提示（以 `Qwen2.5-VL-3B-Instruct` 为例）：**
+> 
+> * **如果您已经下载好了模型文件**：请将 `model_path` 修改为您的本地模型路径。**务必保证**模型存放的最终文件夹名称精确为 `Qwen2.5-VL-3B-Instruct`，否则底层解析时将无法正确匹配和识别该模型。
+> * **如果您还未下载模型（需要自动下载）**：请一定要指定 `download_dir` 参数，并且该目录路径**必须以** `Qwen2.5-VL-3B-Instruct` **结尾**（正如默认参数所示），否则下载完成后同样会导致框架无法识别模型。
 
 ### 第五步：一键运行
 
@@ -70,6 +74,44 @@ huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --loca
 cd gpu_pipelines
 python image_region_caption_pipeline.py
 ```
+> **🛠️ 常见问题排查 (Troubleshooting)**
+> 
+> **问题 1：** 如果遇到类似如下的动态链接库冲突报错：
+> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`
+> 
+> **解决方法：** 这通常是 `LD_LIBRARY_PATH` 环境变量指向了与 PyTorch 自带 CUDA 库不兼容的系统库导致的。请在运行命令时临时清空 `LD_LIBRARY_PATH`：
+> ```bash
+> LD_LIBRARY_PATH="" python image_region_caption_pipeline.py
+> ```
+> 
+> **问题 2：** 如果您使用的是 **Qwen 系列模型**，并且遇到以下报错：
+> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"`
+> 
+> **解决方法：** 打开模型文件夹下的 `config.json` 文件，找到 `rope_scaling` 配置块，将其中的键名 `"type"` 改为 `"rope_type"`（对应的值保持不变）即可。
+> 
+> **修改前：**
+> ```json
+> "rope_scaling": {
+>   "type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
+> 
+> **修改后：**
+> ```json
+> "rope_scaling": {
+>   "rope_type": "mrope",
+>   "mrope_section": [
+>     16,
+>     24,
+>     24
+>   ]
+> }
+> ```
 
 ---
 
@@ -86,7 +128,7 @@ python image_region_caption_pipeline.py
 
 ```json
 {
-    "image": "./data/image_region_caption/20.jpg",
+    "image": "../example_data/image_region_caption/20.jpg",
     "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
 }
 
@@ -130,10 +172,12 @@ python image_region_caption_pipeline.py
 
 ```json
 {
-    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "image":"..\/example_data\/image_region_caption\/20.png",
+    "type":"with_bbox",
     "bbox":[[196,104,310,495]],
     "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
-    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "result_file":"..\/cache\/image_region_caption",
+    "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg",
     "valid_bboxes_num":1,
     "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
     "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
@@ -148,7 +192,6 @@ python image_region_caption_pipeline.py
 以下是完整的 `ImageRegionCaptionPipeline` 代码实现。
 
 ```python
-import argparse
 from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
 from dataflow.operators.core_vision.generate.image_bbox_generator import (
     ImageBboxGenerator, 
@@ -163,19 +206,17 @@ from dataflow.utils.storage import FileStorage
 class ImageRegionCaptionPipeline:
     def __init__(
         self,
-        model_path: str,
-        *,
-        hf_cache_dir: str | None = None,
-        download_dir: str = "./ckpt/models",
-        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
-        cache_path: str = "./cache/image_region_caption",
+        model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+        hf_cache_dir: str = "~/.cache/huggingface",
+        download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct",
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
         file_name_prefix: str = "region_caption",
         cache_type: str = "jsonl",
         input_image_key: str = "image",
         input_bbox_key: str = "bbox",
-        image_with_bbox_path: str = 'image_with_bbox',
         max_boxes: int = 10,
-        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
         self.bbox_storage = FileStorage(
             first_entry_file_name=first_entry_file,
@@ -209,7 +250,6 @@ class ImageRegionCaptionPipeline:
         self.caption_generator = PromptedVQAGenerator(serving=self.serving,)
         self.input_image_key = input_image_key
         self.input_bbox_key = input_bbox_key
-        self.image_with_bbox_path=image_with_bbox_path
         self.bbox_record=None
 
     def forward(self):
@@ -227,34 +267,7 @@ class ImageRegionCaptionPipeline:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
-    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
-    parser.add_argument("--download_dir", default="./ckpt/models")
-    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
-    parser.add_argument("--cache_path", default="./cache/image_region_caption")
-    parser.add_argument("--file_name_prefix", default="region_caption")
-    parser.add_argument("--cache_type", default="jsonl")
-    parser.add_argument("--input_image_key", default="image")
-    parser.add_argument("--input_bbox_key", default="bbox")
-    parser.add_argument("--max_boxes", type=int, default=10)
-    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
-
-    args = parser.parse_args()
-
-    pipe = ImageRegionCaptionPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        first_entry_file=args.first_entry_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-        input_image_key=args.input_image_key,
-        input_bbox_key=args.input_bbox_key,
-        max_boxes=args.max_boxes,
-        output_image_with_bbox_path=args.output_image_with_bbox_path
-    )
+    pipe = ImageRegionCaptionPipeline()
     pipe.forward()
 
 ```
diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
index 895ee281..450a8c12 100644
--- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
+++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md
@@ -105,7 +105,7 @@ python image_region_caption_api_pipeline.py
 
 ```json
 {
-    "image": "./data/image_region_caption/20.jpg",
+    "image": "../example_data/image_region_caption/20.jpg",
     "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
 }
 
@@ -149,10 +149,12 @@ python image_region_caption_api_pipeline.py
 
 ```json
 {
-    "image":".\/data\/image_region_caption\/20.png","type":"with_bbox",
+    "image":"..\/example_data\/image_region_caption\/20.png",
+    "type":"with_bbox",
     "bbox":[[196,104,310,495]],
     "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]],
-    "result_file":".\/cache\/image_region_caption","image_with_bbox":".\/cache\/image_region_caption\\2_bbox_vis.jpg",
+    "result_file":"..\/cache\/image_region_caption",
+    "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg",
     "valid_bboxes_num":1,
     "prompt":"Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>.",
     "answer":"In <region1>, the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere."
@@ -168,9 +170,8 @@ python image_region_caption_api_pipeline.py
 
 ```python
 import os
-os.environ["DF_API_KEY"] = "sk-iaY19LU7WMT5QlK8LujFIG7RjI2omHLWYiCs4Do6imieLKOg"
+os.environ["DF_API_KEY"] = "sk-xxxx"
 
-import argparse
 from dataflow.operators.core_vision.generate.image_bbox_generator import (
     ImageBboxGenerator, 
     ExistingBBoxDataGenConfig
@@ -184,15 +185,14 @@ from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai
 class ImageRegionCaptionPipeline:
     def __init__(
         self,
-        first_entry_file: str = "./data/image_region_caption/image_region_caption_demo.jsonl",
-        cache_path: str = "./cache/image_region_caption",
+        first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl",
+        cache_path: str = "../cache/image_region_caption",
         file_name_prefix: str = "region_caption",
         cache_type: str = "jsonl",
         input_image_key: str = "image",
         input_bbox_key: str = "bbox",
-        image_with_bbox_path: str = 'image_with_bbox',
         max_boxes: int = 10,
-        output_image_with_bbox_path: str = "./cache/image_region_caption/image_with_bbox_result.jsonl",
+        output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl",
     ):
         self.bbox_storage = FileStorage(
             first_entry_file_name=first_entry_file,
@@ -214,7 +214,7 @@ class ImageRegionCaptionPipeline:
             cache_type=cache_type
         )
         self.vlm_serving = APIVLMServing_openai(
-            api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any OpenAI-compatible API endpoint; model_name below must be a model that this endpoint serves
             model_name="gpt-4o-mini",
             image_io=None,
             send_request_stream=False,
@@ -225,7 +225,6 @@ class ImageRegionCaptionPipeline:
         self.caption_generator = PromptedVQAGenerator(serving=self.vlm_serving,system_prompt="You are a helpful assistant.")
         self.input_image_key = input_image_key
         self.input_bbox_key = input_bbox_key
-        self.image_with_bbox_path=image_with_bbox_path
         self.bbox_record=None
 
     def forward(self):
@@ -243,29 +242,7 @@ class ImageRegionCaptionPipeline:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Image region caption with DataFlow")
-    parser.add_argument("--first_entry_file", default="./data/image_region_caption/image_region_caption_demo.jsonl")
-    parser.add_argument("--cache_path", default="./cache/image_region_caption")
-    parser.add_argument("--file_name_prefix", default="region_caption")
-    parser.add_argument("--cache_type", default="jsonl")
-    parser.add_argument("--input_image_key", default="image")
-    parser.add_argument("--input_bbox_key", default="bbox")
-
-    parser.add_argument("--max_boxes", type=int, default=10)
-    parser.add_argument("--output_image_with_bbox_path", default="./cache/image_region_caption/image_with_bbox_result.jsonl")
-
-    args = parser.parse_args()
-
-    pipe = ImageRegionCaptionPipeline(
-        first_entry_file=args.first_entry_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-        input_image_key=args.input_image_key,
-        input_bbox_key=args.input_bbox_key,
-        max_boxes=args.max_boxes,
-        output_image_with_bbox_path=args.output_image_with_bbox_path,
-    )
+    pipe = ImageRegionCaptionPipeline()
     pipe.forward()
 
 ```
diff --git a/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
index eab99df6..dd87c0e6 100644
--- a/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
+++ b/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
@@ -97,14 +97,14 @@ from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig
 
 cfg = ExistingBBoxDataGenConfig(
     max_boxes=10,
-    input_jsonl_path="./data/image_region_caption/image_region_caption_demo.jsonl",
-    output_jsonl_path="./cache/image_region_caption/image_with_bbox_result.jsonl",
+    input_jsonl_path="../example_data/image_region_caption/image_region_caption_demo.jsonl",
+    output_jsonl_path="../cache/image_region_caption/image_with_bbox_result.jsonl",
 )
 generator = ImageBboxGenerator(config=config)
 
 storage = FileStorage(
-    first_entry_file_name="./data/image_region_caption/image_region_caption_demo.jsonl",
-    cache_path="./cache/image_region_caption",
+    first_entry_file_name="../example_data/image_region_caption/image_region_caption_demo.jsonl",
+    cache_path="../cache/image_region_caption",
     file_name_prefix="region_caption",
     cache_type="jsonl"
 )
@@ -123,12 +123,12 @@ generator.run(
 
 ```json
 {
-    "image": "./data/image_region_caption/20.png", 
+    "image": "../example_data/image_region_caption/20.png", 
     "type": "with_bbox", 
     "bbox": [[196, 104, 310, 495]], 
     "normalized_bbox": [[0.128, 0.125, 0.329, 0.72], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], 
-    "result_file": "./cache/image_region_caption", 
-    "image_with_bbox": "./cache/image_region_caption\\2_bbox_vis.jpg", 
+    "result_file": "../cache/image_region_caption", 
+    "image_with_bbox": "../cache/image_region_caption\\2_bbox_vis.jpg", 
     "valid_bboxes_num": 1, 
     "prompt": "Describe the content of each marked region in the image. There are 1 regions: <region1> to <region1>."
 }

From 83d50249fa3f43696b1d632dc67dfb9ef1cada1a Mon Sep 17 00:00:00 2001
From: HankYang <hankyang428@gmail.com>
Date: Wed, 25 Feb 2026 23:44:38 +0800
Subject: [PATCH 7/7] fix doc 1

---
 docs/.vuepress/notes/en/mm_guide.ts           |   3 +-
 docs/.vuepress/notes/en/mm_operators.ts       |   3 +-
 docs/.vuepress/notes/zh/mm_guide.ts           |   3 +-
 docs/.vuepress/notes/zh/mm_operators.ts       |   3 +-
 .../multirole_videoqa_pipeline.md             | 288 ------------------
 .../generate/multirole_videoqa.md             | 140 ---------
 .../multirole_videoqa_pipeline.md             | 288 ------------------
 .../generate/multirole_videoqa.md             | 140 ---------
 8 files changed, 4 insertions(+), 864 deletions(-)
 delete mode 100644 docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
 delete mode 100644 docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
 delete mode 100644 docs/zh/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
 delete mode 100644 docs/zh/notes/mm_operators/image_understanding/generate/multirole_videoqa.md

diff --git a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts
index 298111d5..8a5469cc 100644
--- a/docs/.vuepress/notes/en/mm_guide.ts
+++ b/docs/.vuepress/notes/en/mm_guide.ts
@@ -46,8 +46,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({
                 'video_clip_and_filter',
                 'video_qa',
                 'video_cotqa',
-                'video_longvideo_cotqa_api',
-                'multirole_videoqa_pipeline'
+                'video_longvideo_cotqa_api'
             ],
         },
         {
diff --git a/docs/.vuepress/notes/en/mm_operators.ts b/docs/.vuepress/notes/en/mm_operators.ts
index 5e811093..98f53d6a 100644
--- a/docs/.vuepress/notes/en/mm_operators.ts
+++ b/docs/.vuepress/notes/en/mm_operators.ts
@@ -41,8 +41,7 @@ export const MMOperators: ThemeNote = defineNoteConfig({
                         // 'image_region_caption',
                         // 'image_scale_caption',
                         // 'image_gcot',
-                        // 'image_caprl',
-                        // 'multirole_videoqa',
+                        // 'image_caprl'
                     ]
                 },
                 {
diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts
index a7ebcbef..21bece4d 100644
--- a/docs/.vuepress/notes/zh/mm_guide.ts
+++ b/docs/.vuepress/notes/zh/mm_guide.ts
@@ -46,8 +46,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({
                 'video_clip_and_filter',
                 'video_qa',
                 'video_cotqa',
-                'video_longvideo_cotqa_api',
-                'multirole_videoqa_pipeline'
+                'video_longvideo_cotqa_api'
             ],
         },
         {
diff --git a/docs/.vuepress/notes/zh/mm_operators.ts b/docs/.vuepress/notes/zh/mm_operators.ts
index c35993a2..8451b255 100644
--- a/docs/.vuepress/notes/zh/mm_operators.ts
+++ b/docs/.vuepress/notes/zh/mm_operators.ts
@@ -42,8 +42,7 @@ export const MMOperators: ThemeNote = defineNoteConfig({
                         // 'image_region_caption',
                         // 'image_scale_caption',
                         // 'image_gcot',
-                        // 'image_caprl',
-                        // 'multirole_videoqa',
+                        // 'image_caprl'
                     ]
                 },
                 {
diff --git a/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md b/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
deleted file mode 100644
index bbe78d17..00000000
--- a/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
+++ /dev/null
@@ -1,288 +0,0 @@
----
-title: Multi-Role Video QA Pipeline
-createTime: 2026/01/11 22:15:28
-icon: mdi:image-text
-permalink: /en/mm_guide/multirole_videoqa_pipeline/
----
-## 1. Overview
-
-The **Multi-Role Video QA Pipeline** leverages Multimodal Large Models (VLMs) and a Multi-Agent collaboration mechanism to automatically generate high-quality, deep Question-Answer (QA) pairs from long videos or advertising footage.
-
-Unlike standard single-pass generation, this pipeline introduces a **Multi-Agent Iterative Refinement** phase. It first generates initial QAs, then refines them through multiple rounds of interaction simulating different agent roles (e.g., Questioner, Checker, Polisher), finally outputting logical and accurate QA data.
-
-We support the following application scenarios:
-
-* **Ad Video Understanding**: Extracting key selling points, emotional tone, and narrative logic from ads.
-* **Complex Video Reasoning**: Constructing deep QA datasets requiring reasoning across different time segments.
-* **Long Video Summarization & QA**: Handling video data containing rich Metadata (`Meta`) and multiple Clips (`Clips`).
-
-The main process of the pipeline includes:
-
-1. **Initial Generation**: Generates baseline QA pairs based on video metadata and clips.
-2. **Multi-Agent Refinement**: Critiques, corrects, and optimizes QA pairs through multiple iterations (default 3 rounds).
-3. **Final Generation**: Cleans the data and outputs the final QA set in a standard format.
-
----
-
-## 2. Quick Start
-
-### Step 1: Create a Working Directory
-
-```bash
-mkdir run_video_qa
-cd run_video_qa
-
-```
-
-### Step 2: Prepare the Script
-
-Save the code in the "Pipeline Example" section below as `multirole_videoqa_pipeline.py`.
-
-### Step 3: Configure Parameters
-
-Ensure the input data contains `Meta` and `Clips` fields.
-
-```bash
-# Install dependencies
-pip install open-dataflow vllm
-
-```
-
-### Step 4: Run
-
-```bash
-python multirole_videoqa_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-7B-Instruct" \
-  --images_file "data/adsQA.jsonl" \
-  --card_id "0"
-
-```
-
----
-
-## 3. Data Flow & Logic
-
-### 1. **Input Data**
-
-Input data is typically pre-processed video data containing global metadata and segment information:
-
-* **Meta**: Global description, title, or background info of the video.
-* **Clips**: List of video clips, where each clip contains audio text, frame image paths, and clip descriptions.
-
-**Input Data Example**:
-
-```json
-{
-    "Meta": "A commercial for a new sports car featuring dynamic driving scenes.",
-    "Clips": [
-        {
-            "Audio_Text": "Experience the speed.",
-            "Frames_Images": ["./frames/001.jpg", "./frames/002.jpg"],
-            "Description": "Car accelerating on a highway."
-        },
-        {
-            "Audio_Text": "Safety meets luxury.",
-            "Frames_Images": ["./frames/003.jpg"],
-            "Description": "Interior shot showing leather seats."
-        }
-    ]
-}
-
-```
-
-### 2. **Core Operator Logic**
-
-This pipeline executes through a chain of three specialized operators:
-
-#### A. **MultiroleVideoQAInitialGenerator**
-
-* **Function**: Acts as the "Draft Author", reading `Meta` and `Clips` to generate the first version of QA pairs using the VLM.
-* **Output**: A DataFrame containing preliminary QAs.
-
-#### B. **MultiroleVideoQAMultiAgentGenerator**
-
-* **Function**: Acts as the "Editorial Team", polishing the draft.
-* **Mechanism**: Configured with `max_iterations` (e.g., 3 rounds). During these rounds, the model may simulate different roles (e.g., a reviewer pointing out errors, a polisher improving wording) to progressively enhance QA quality.
-* **Input**: Initial DataFrame.
-* **Output**: Intermediate DataFrame after multiple rounds of correction.
-
-#### C. **MultiroleVideoQAFinalGenerator**
-
-* **Function**: Acts as the "Publisher", responsible for final formatting and cleaning.
-* **Output**: Standardized `QA` list.
-
-### 3. **Output Data**
-
-The output data adds a high-quality QA list to the original fields:
-
-* **QA**: List of generated QA pairs, including labels (e.g., question type), question text, and answer text.
-
-**Output Data Example**:
-
-```json
-{
-    "Meta": "...",
-    "Clips": [...],
-    "QA": [
-        {
-            "Label": "Feature Extraction",
-            "Question": "What specific features of the car are highlighted in the interior shots?",
-            "Answer": "The video highlights the luxury leather seats and the advanced dashboard interface."
-        },
-        {
-            "Label": "Narrative Analysis",
-            "Question": "How does the audio complement the visual transition?",
-            "Answer": "The narration 'Experience speed' coincides with the acceleration scene, reinforcing the dynamic visual."
-        }
-    ]
-}
-
-```
-
----
-
-## 4. Pipeline Example
-
-Below is the complete `MultiRoleVideoQAPipeline` code implementation.
-
-```python
-import argparse
-import os 
-from dataflow.serving import LocalModelVLMServing_vllm
-from dataflow.utils.storage import FileStorage
-from dataflow.operators.core_vision import (
-    MultiroleVideoQAInitialGenerator, 
-    MultiroleVideoQAMultiAgentGenerator, 
-    MultiroleVideoQAFinalGenerator
-)
-
-try:
-    import torch
-    # Set the multiprocessing start method to spawn to avoid CUDA initialization conflicts
-    if 'spawn' not in torch.multiprocessing.get_all_start_methods():
-        torch.multiprocessing.set_start_method('spawn', force=True)
-except ImportError:
-    pass
-
-
-class MultiRoleVideoQAPipeline():
-    def __init__(
-        self,
-        model_path: str,
-        *,
-        hf_cache_dir: str | None = None,
-        download_dir: str = "./ckpt",
-        first_entry_file: str = "/dataflow/example/ads_QA/adsQA.jsonl",
-        cache_path: str = "./cache_local",
-        file_name_prefix: str = "dataflow_cache_step",
-        cache_type: str = "jsonl",
-        # Keys Configuration
-        Meta_key: str = "Meta",
-        clips_key: str = "Clips", 
-        output_key: str = "QA"
-    ):
-        # 1. Storage initialization
-        self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
-        )
-        
-        # Force vLLM to use the spawn multiprocessing method
-        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "spawn"
-
-        # 2. VLM serving initialization
-        self.llm_serving = LocalModelVLMServing_vllm(
-            hf_model_name_or_path=model_path,
-            hf_cache_dir=hf_cache_dir,
-            hf_local_dir=download_dir,
-            vllm_tensor_parallel_size=1, 
-            vllm_temperature=0.7,
-            vllm_top_p=0.9,
-            vllm_max_tokens=6000, # Video QA usually needs a longer context
-        )
-
-        # 3. Operator chain initialization
-        # Stage 1: initial generation
-        self.initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving)
-        
-        # Stage 2: multi-agent iterative refinement (the key difference)
-        self.multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(
-            llm_serving = self.llm_serving, 
-            max_iterations = 3
-        )
-        
-        # Stage 3: final formatting
-        self.final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving)
-
-        self.input_meta_key = Meta_key
-        self.input_clips_key = clips_key
-        self.output_key = output_key
-
-    def forward(self):
-        print(">>> [Pipeline] Step 1: Initial QA Generation...")
-        init_df = self.initial_QA_generation.run(
-            storage = self.storage.step(),
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-        
-        print(">>> [Pipeline] Step 2: Multi-Agent Refinement (3 iterations)...")
-        # Note: this operator takes the previous stage's DataFrame (init_df) as input
-        middle_df = self.multiAgent_QA_generation.run(
-            df = init_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-        
-        print(">>> [Pipeline] Step 3: Finalizing QA Pairs...")
-        self.final_QA_generation.run(
-            storage = self.storage,
-            df = middle_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-        print(">>> [Pipeline] Done.")
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch video QA generation with DataFlow (Single GPU)")
-
-    parser.add_argument("--model_path", default="../../Models/Qwen2.5-VL-7B-Instruct",
-                                 help="Path to the local model or HuggingFace repo ID.")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface",
-                                 help="HuggingFace cache directory.")
-    parser.add_argument("--download_dir", default="./ckpt",
-                                 help="Local directory for downloading models.")
-    
-    parser.add_argument("--card_id", type=str, default="0",
-                                 help="The single CUDA device ID to use (e.g., '0' or '1').")
-    
-    parser.add_argument("--images_file", default="./dataflow/example/ads_QA/adsQA.jsonl",
-                                 help="Path to the first entry file for DataFlow.")
-    parser.add_argument("--cache_path", default="./cache_local",
-                                 help="Directory for caching DataFlow steps.")
-    parser.add_argument("--file_name_prefix", default="caption",
-                                 help="Prefix for cache file names.")
-    parser.add_argument("--cache_type", default="jsonl",
-                                 help="Type of cache file (e.g., jsonl).")
-
-    args = parser.parse_args()
-
-    os.environ['CUDA_VISIBLE_DEVICES'] = args.card_id.replace(' ', '')
-    
-    pipe = MultiRoleVideoQAPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
-    pipe.forward()
-
-```
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md b/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
deleted file mode 100644
index 2f45e04c..00000000
--- a/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
+++ /dev/null
@@ -1,140 +0,0 @@
----
-title: MultiRole Video QA Generation
-createTime: 2025/12/2 20:00:00
-icon: material-symbols-light:video
-permalink: /en/mm_operators/generate/multirole_videoqa/
----
-
-## 📘 Overview
-
-`MultiroleVideoQAGenerate` is a data generation operator for **automatically creating Question-Answer (QA) pairs from preprocessed video data**.  
-Given preprocessed video data as input, it constructs several QA pairs related to the video. This is suitable for advertisement video annotation, dataset construction, and video understanding tasks.
-
-**Features:**
-* Supports batch processing of multiple preprocessed video data.
-* Generates high-quality QA pairs using VLMs like Qwen2.5-VL.
-* Automatically handles video input and uses prompts to generate data.
-
----
-
-## 🏗️ `__init__` Function
-
-```python
-def __init__(
-    self,
-    llm_serving: VLMServingABC
-):
-    ...
-```
-## 🧾 `__init__` Parameters
-
-| Parameter     | Type            | Default | Description                                                     |
-| :------------ | :-------------- | :------ | :-------------------------------------------------------------- |
-| `llm_serving` | `VLMServingABC` | -       | **Model Serving Object** used to call the VLM for QA pairs generation |
-
------
-
-## ⚡ `run` Function
-
-```python
-def run(
-        self,
-        storage: DataFlowStorage,
-        input_meta_key: str = "Meta", 
-        input_clips_key: str = "Clips", 
-        output_key: str = "QA"
-):
-    ...
-```
-
-The `run` function executes the main QA pair generation workflow:
-read data paths → **validate DataFrame** → construct prompts → call the model → generate QA pairs → write results to output.
-
-## 🧾 `run` Parameters
-
-| Parameter         | Type              | Default     | Description                                           |
-| :---------------- | :---------------- | :---------- | :---------------------------------------------------- |
-| `storage`         | `DataFlowStorage` | -           | Dataflow storage object                               |
-| `input_meta_key`  | `str`             | `"Meta"`    | **Multimodal Input Field Name**                       |
-| `input_clips_key` | `str`             | `"Clips"`   | **Multimodal Input Field Name**                       |
-| `output_key`      | `str`             | `"QA"`      | **Model Output Field Name** (the generated QA pairs)  |
-
------
-
-## 🧠 Example Usage
-
-```python
-import os 
-import argparse
-from dataflow.serving import LocalModelVLMServing_vllm
-from dataflow.utils.storage import FileStorage
-from dataflow.operators.core_vision import MultiroleVideoQAInitialGenerator, MultiroleVideoQAMultiAgentGenerator, MultiroleVideoQAFinalGenerator
-
-# Step 1: Launch local model service
-llm_serving = LocalModelVLMServing_vllm(
-            hf_model_name_or_path=model_path,
-            hf_cache_dir=hf_cache_dir,
-            hf_local_dir=download_dir,
-            vllm_tensor_parallel_size=1, 
-            vllm_temperature=0.7,
-            vllm_top_p=0.9,
-            vllm_max_tokens=6000,
-        )
-
-# Step 2: Prepare input data
-storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
-        )
-
-# Step 3: Initialize and run the operator
-initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving)
-multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(llm_serving = self.llm_serving, max_iterations = 3)
-final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving)
-
-init_df = initial_QA_generation.run(
-            storage = self.storage.step(),
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-middle_df = multiAgent_QA_generation.run(
-            df = init_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-final_QA_generation.run(
-            storage = self.storage,
-            df = middle_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-```
-
------
-
-## 🧾 Default Output Format
-
-| Field     | Type         | Description                      |
-| :-------- | :----------- | :------------------------------- |
-| `Meta`    | `str`        | Meta information for video       |
-| `Clips`   | `List[Dict]` | Interleaved modality video Clips |
-| `QA`      | `List[Dict]` | QA pairs                         |
-
------
-
-### 📥 Example Input
-
-```jsonl
-{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}]}
-```
-
-### 📤 Example Output
-
-```jsonl
-{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}], "QA":[{"Label":"label1", "Question": "Question1", "Answer": "Answer1"},{"Label":"label2", "Question": "Question2", "Answer": "Answer2"}]}
-```
\ No newline at end of file
diff --git a/docs/zh/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md b/docs/zh/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
deleted file mode 100644
index 1d423513..00000000
--- a/docs/zh/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
+++ /dev/null
@@ -1,288 +0,0 @@
----
-title: 多角色视频问答生成流水线
-createTime: 2026/01/11 22:15:28
-icon: mdi:image-text
-permalink: /zh/mm_guide/multirole_videoqa_pipeline/
----
-## 1. 概述
-
-**多角色视频问答生成流水线 (MultiRole Video QA Pipeline)** 旨在利用多模态大模型（VLM）和多智能体（Multi-Agent）协作机制，自动从长视频或广告视频中生成高质量、深度的问答对（QA Pairs）。
-
-与普通的单次生成不同，该流水线引入了**多智能体迭代优化**环节。它首先生成初始问答，然后通过模拟不同角色的智能体（如提问者、检查者、润色者）进行多轮交互和修正，最终输出逻辑严密、信息准确的问答数据。
-
-我们支持以下应用场景：
-
-* **广告视频理解**：提取广告中的关键卖点、情感倾向和叙事逻辑。
-* **复杂视频推理**：构建需要跨时间段推理的深度问答数据集。
-* **长视频摘要与问答**：处理包含丰富元数据（Meta）和多个片段（Clips）的视频数据。
-
-流水线的主要流程包括：
-
-1. **初始生成 (Initial Generation)**：基于视频元数据和片段生成基础问答对。
-2. **多智能体协作 (Multi-Agent Refinement)**：通过多轮迭代（默认 3 轮），对问答对进行批判、修正和优化。
-3. **最终整合 (Final Generation)**：清洗数据，输出标准格式的最终问答集。
-
----
-
-## 2. 快速开始
-
-### 第一步：准备工作目录
-
-```bash
-mkdir run_video_qa
-cd run_video_qa
-
-```
-
-### 第二步：准备脚本
-
-将下文“流水线示例”中的代码保存为 `multirole_videoqa_pipeline.py`。
-
-### 第三步：配置运行参数
-
-确保输入数据包含 `Meta` 和 `Clips` 字段。
-
-```bash
-# 安装依赖
-pip install open-dataflow vllm
-
-```
-
-### 第四步：一键运行
-
-```bash
-python multirole_videoqa_pipeline.py \
-  --model_path "/path/to/Qwen2.5-VL-7B-Instruct" \
-  --images_file "data/adsQA.jsonl" \
-  --card_id "0"
-
-```
-
----
-
-## 3. 数据流与流水线逻辑
-
-### 1. **输入数据**
-
-输入数据通常是经过预处理的视频数据，包含全局元数据和分段信息：
-
-* **Meta**：视频的全局描述、标题或背景信息。
-* **Clips**：视频片段列表，每个片段包含音频文本、帧图像路径和片段描述。
-
-**输入数据示例**：
-
-```json
-{
-    "Meta": "A commercial for a new sports car featuring dynamic driving scenes.",
-    "Clips": [
-        {
-            "Audio_Text": "Experience the speed.",
-            "Frames_Images": ["./frames/001.jpg", "./frames/002.jpg"],
-            "Description": "Car accelerating on a highway."
-        },
-        {
-            "Audio_Text": "Safety meets luxury.",
-            "Frames_Images": ["./frames/003.jpg"],
-            "Description": "Interior shot showing leather seats."
-        }
-    ]
-}
-
-```
-
-### 2. **核心算子逻辑**
-
-该流水线通过三个专门的算子串联执行：
-
-#### A. **MultiroleVideoQAInitialGenerator（初始生成器）**
-
-* **功能**：作为“初稿作者”，它读取 `Meta` 和 `Clips`，利用 VLM 生成第一版问答对。
-* **输出**：包含初步 QA 的 DataFrame。
-
-#### B. **MultiroleVideoQAMultiAgentGenerator（多智能体优化器）**
-
-* **功能**：作为“编辑团队”，它对初稿进行打磨。
-* **机制**：设置 `max_iterations`（如 3 次），在多轮次中，模型可能扮演不同角色（如审核员指出错误、润色员优化措辞），逐步提升 QA 质量。
-* **输入**：初始 DataFrame。
-* **输出**：经过多轮修正后的中间态 DataFrame。
-
-#### C. **MultiroleVideoQAFinalGenerator（最终生成器）**
-
-* **功能**：作为“出版商”，它负责最终的格式化和清洗。
-* **输出**：标准化的 `QA` 列表。
-
-### 3. **输出数据**
-
-输出数据在原有字段基础上增加了高质量的问答列表：
-
-* **QA**：生成的问答对列表，包含标签（如问题类型）、问题文本和答案文本。
-
-**输出数据示例**：
-
-```json
-{
-    "Meta": "...",
-    "Clips": [...],
-    "QA": [
-        {
-            "Label": "Feature Extraction",
-            "Question": "What specific features of the car are highlighted in the interior shots?",
-            "Answer": "The video highlights the luxury leather seats and the advanced dashboard interface."
-        },
-        {
-            "Label": "Narrative Analysis",
-            "Question": "How does the audio complement the visual transition?",
-            "Answer": "The narration 'Experience speed' coincides with the acceleration scene, reinforcing the dynamic visual."
-        }
-    ]
-}
-
-```
-
----
-
-## 4. 流水线示例
-
-以下是完整的 `MultiRoleVideoQAPipeline` 代码实现。
-
-```python
-import argparse
-import os 
-from dataflow.serving import LocalModelVLMServing_vllm
-from dataflow.utils.storage import FileStorage
-from dataflow.operators.core_vision import (
-    MultiroleVideoQAInitialGenerator, 
-    MultiroleVideoQAMultiAgentGenerator, 
-    MultiroleVideoQAFinalGenerator
-)
-
-try:
-    import torch
-    # 多进程启动方式设置为 spawn，避免 CUDA 初始化冲突
-    if 'spawn' not in torch.multiprocessing.get_all_start_methods():
-        torch.multiprocessing.set_start_method('spawn', force=True)
-except ImportError:
-    pass
-
-
-class MultiRoleVideoQAPipeline():
-    def __init__(
-        self,
-        model_path: str,
-        *,
-        hf_cache_dir: str | None = None,
-        download_dir: str = "./ckpt",
-        first_entry_file: str = "/dataflow/example/ads_QA/adsQA.jsonl",
-        cache_path: str = "./cache_local",
-        file_name_prefix: str = "dataflow_cache_step",
-        cache_type: str = "jsonl",
-        # Keys Configuration
-        Meta_key: str = "Meta",
-        clips_key: str = "Clips", 
-        output_key: str = "QA"
-    ):
-        # 1. 存储初始化
-        self.storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
-        )
-        
-        # 强制设置 vLLM 的多进程方法
-        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "spawn"
-
-        # 2. VLM 服务初始化
-        self.llm_serving = LocalModelVLMServing_vllm(
-            hf_model_name_or_path=model_path,
-            hf_cache_dir=hf_cache_dir,
-            hf_local_dir=download_dir,
-            vllm_tensor_parallel_size=1, 
-            vllm_temperature=0.7,
-            vllm_top_p=0.9,
-            vllm_max_tokens=6000, # 视频问答通常需要较长的 Context
-        )
-
-        # 3. 算子链初始化
-        # 阶段一：初始生成
-        self.initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving)
-        
-        # 阶段二：多智能体迭代优化 (核心差异点)
-        self.multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(
-            llm_serving = self.llm_serving, 
-            max_iterations = 3
-        )
-        
-        # 阶段三：最终格式化
-        self.final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving)
-
-        self.input_meta_key = Meta_key
-        self.input_clips_key = clips_key
-        self.output_key = output_key
-
-    def forward(self):
-        print(">>> [Pipeline] Step 1: Initial QA Generation...")
-        init_df = self.initial_QA_generation.run(
-            storage = self.storage.step(),
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-        
-        print(">>> [Pipeline] Step 2: Multi-Agent Refinement (3 iterations)...")
-        # 注意：此算子接收上一阶段的 DataFrame (init_df) 作为输入
-        middle_df = self.multiAgent_QA_generation.run(
-            df = init_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-        
-        print(">>> [Pipeline] Step 3: Finalizing QA Pairs...")
-        self.final_QA_generation.run(
-            storage = self.storage,
-            df = middle_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-        print(">>> [Pipeline] Done.")
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Batch video QA generation with DataFlow (Single GPU)")
-
-    parser.add_argument("--model_path", default="../../Models/Qwen2.5-VL-7B-Instruct",
-                                 help="Path to the local model or HuggingFace repo ID.")
-    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface",
-                                 help="HuggingFace cache directory.")
-    parser.add_argument("--download_dir", default="./ckpt",
-                                 help="Local directory for downloading models.")
-    
-    parser.add_argument("--card_id", type=str, default="0",
-                                 help="The single CUDA device ID to use (e.g., '0' or '1').")
-    
-    parser.add_argument("--images_file", default="./dataflow/example/ads_QA/adsQA.jsonl",
-                                 help="Path to the first entry file for DataFlow.")
-    parser.add_argument("--cache_path", default="./cache_local",
-                                 help="Directory for caching DataFlow steps.")
-    parser.add_argument("--file_name_prefix", default="caption",
-                                 help="Prefix for cache file names.")
-    parser.add_argument("--cache_type", default="jsonl",
-                                 help="Type of cache file (e.g., jsonl).")
-
-    args = parser.parse_args()
-
-    os.environ['CUDA_VISIBLE_DEVICES'] = args.card_id.replace(' ', '')
-    
-    pipe = MultiRoleVideoQAPipeline(
-        model_path=args.model_path,
-        hf_cache_dir=args.hf_cache_dir,
-        download_dir=args.download_dir,
-        first_entry_file=args.images_file,
-        cache_path=args.cache_path,
-        file_name_prefix=args.file_name_prefix,
-        cache_type=args.cache_type,
-    )
-    pipe.forward()
-
-```
diff --git a/docs/zh/notes/mm_operators/image_understanding/generate/multirole_videoqa.md b/docs/zh/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
deleted file mode 100644
index e33cc836..00000000
--- a/docs/zh/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
+++ /dev/null
@@ -1,140 +0,0 @@
----
-title: 多角色视频问答生成(MultiRole Video QA Generation)
-createTime: 2025/12/2 20:00:00
-icon: material-symbols-light:video
-permalink: /zh/mm_operators/generate/multirole_videoqa/
----
-
-## 📘 概述
-
-`MultiroleVideoQAGenerate` 是一个数据生成算子，用于**基于预处理视频数据自动创建问答对（QA Pairs）**。 
-给定输入的预处理视频数据，它会构建多个与该视频相关的问答对。该算子适用于**广告视频标注**、**数据集构建**和**视频理解**任务。
-
-**功能特性：**
-* 支持**批量处理**多个预处理视频数据。
-* 使用 **VLM**（如 Qwen2.5-VL）生成**高质量**的问答对。
-* 自动处理视频输入并使用 Prompt 生成数据。
-
----
-
-## 🏗️ `__init__` 函数
-
-```python
-def __init__(
-    self,
-    llm_serving: VLMServingABC
-):
-    ...
-```
-## 🧾 `__init__` 参数
-
-| Parameter     | Type            | Default | Description                                                     |
-| :------------ | :-------------- | :------ | :-------------------------------------------------------------- |
-| `llm_serving` | `VLMServingABC` | -       | **Model Serving Object** used to call the VLM for QA pairs generation |
-
------
-
-## ⚡ `run` 函数
-
-```python
-def run(
-        self,
-        storage: DataFlowStorage,
-        input_meta_key: str = "Meta", 
-        input_clips_key: str = "Clips", 
-        output_key: str = "QA"
-):
-    ...
-```
-
-`run` 函数执行问答对生成的主流程：
-读取数据路径 → **校验 DataFrame** → 构建提示词 → 调用模型 → 生成问答对 → 将结果写入输出。
-
-## 🧾 `run` 参数
-
-| Parameter         | Type              | Default     | Description                                           |
-| :---------------- | :---------------- | :---------- | :---------------------------------------------------- |
-| `storage`         | `DataFlowStorage` | -           | Dataflow storage object                               |
-| `input_meta_key`  | `str`             | `"Meta"`    | **Multimodal Input Field Name**                       |
-| `input_clips_key` | `str`             | `"Clips"`   | **Multimodal Input Field Name**                       |
-| `output_key`      | `str`             | `"QA"`      | **Model Output Field Name** (the generated QA pairs)  |
-
------
-
-## 🧠 示例用法
-
-```python
-import os 
-import argparse
-from dataflow.serving import LocalModelVLMServing_vllm
-from dataflow.utils.storage import FileStorage
-from dataflow.operators.core_vision import MultiroleVideoQAInitialGenerator, MultiroleVideoQAMultiAgentGenerator, MultiroleVideoQAFinalGenerator
-
-# Step 1: Launch local model service
-llm_serving = LocalModelVLMServing_vllm(
-            hf_model_name_or_path=model_path,
-            hf_cache_dir=hf_cache_dir,
-            hf_local_dir=download_dir,
-            vllm_tensor_parallel_size=1, 
-            vllm_temperature=0.7,
-            vllm_top_p=0.9,
-            vllm_max_tokens=6000,
-        )
-
-# Step 2: Prepare input data
-storage = FileStorage(
-            first_entry_file_name=first_entry_file,
-            cache_path=cache_path,
-            file_name_prefix=file_name_prefix,
-            cache_type=cache_type,
-        )
-
-# Step 3: Initialize and run the operator
-initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving)
-multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(llm_serving = self.llm_serving, max_iterations = 3)
-final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving)
-
-init_df = initial_QA_generation.run(
-            storage = self.storage.step(),
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-middle_df = multiAgent_QA_generation.run(
-            df = init_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-final_QA_generation.run(
-            storage = self.storage,
-            df = middle_df,
-            input_meta_key = self.input_meta_key, 
-            input_clips_key = self.input_clips_key, 
-            output_key = self.output_key
-        )
-```
-
------
-
-## 🧾 默认输出格式
-
-| Field     | Type         | Description                      |
-| :-------- | :----------- | :------------------------------- |
-| `Meta`    | `str`        | Meta information for video       |
-| `Clips`   | `List[Dict]` | Interleaved modality video Clips |
-| `QA`      | `List[Dict]` | QA pairs                         |
-
------
-
-### 📥 示例输入
-
-```jsonl
-{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}]}
-```
-
-### 📤 示例输出
-
-```jsonl
-{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}], "QA":[{"Label":"label1", "Question": "Question1", "Answer": "Answer1"},{"Label":"label2", "Question": "Question2", "Answer": "Answer2"}]}
-```
\ No newline at end of file