diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index 66aa93bebc..6990d697c8 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -26,9 +26,8 @@ export const Guide: ThemeNote = defineNoteConfig({ 'second_pipeline', 'dataflow_init', 'df_ecosystem', - 'dataflow_webui' - - + 'dataflow_webui', + 'dataflow_skills' ], }, { @@ -112,21 +111,5 @@ export const Guide: ThemeNote = defineNoteConfig({ "funccall_operators" ] }, - { - text: "Agent for Dataflow", - collapsed: false, - icon: 'mdi:face-agent', - prefix: 'agent', - items: [ - "agent_for_data", - "DataFlow-AgentPipelineOrchestration", - "operator_assemble_line", - "operator_qa", - "operator_write", - "pipeline_prompt", - "pipeline_rec&refine", - "web_collection" - ] - }, ], }) diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index ab6dd90425..5ee4705847 100644 --- a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -26,8 +26,8 @@ export const Guide: ThemeNote = defineNoteConfig({ 'second_pipeline', 'dataflow_init', 'df_ecosystem', - 'dataflow_webui' - + 'dataflow_webui', + 'dataflow_skills' ], }, // { @@ -111,22 +111,6 @@ export const Guide: ThemeNote = defineNoteConfig({ // "video_process", ] }, - { - text: "Agent for Dataflow", - collapsed: false, - icon: 'mdi:face-agent', - prefix: 'agent', - items: [ - "agent_for_data", - "DataFlow-AgentPipelineOrchestration", - "operator_assemble_line", - "operator_qa", - "operator_write", - "pipeline_prompt", - "pipeline_rec&refine", - "web_collection" - ] - }, // { // text: '写作', // icon: 'fluent-mdl2:edit-create', diff --git a/docs/en/notes/guide/agent/DataFlow-AgentPipelineOrchestration.md b/docs/en/notes/guide/agent/DataFlow-AgentPipelineOrchestration.md deleted file mode 100644 index 98c42edc42..0000000000 --- a/docs/en/notes/guide/agent/DataFlow-AgentPipelineOrchestration.md +++ /dev/null @@ -1,795 +0,0 @@ ---- -title: Agent-Quick Start -icon: 
carbon:ibm-consulting-advantage-agent -createTime: 2025/06/19 10:29:31 -permalink: /en/guide/DataFlow-AgentPipelineOrchestration/ - ---- -# DataFlow Agent Quick Start Guide - -This guide will help you quickly get started with the 5 core functional modules of the DataFlow Agent platform. - -## Table of Contents - -1. [Pipeline Recommendation](#1-pipeline-recommendation) -2. [Operator Development](#2-operator-development) -3. [Manual Orchestration](#3-manual-orchestration) -4. [Operator Reuse/Prompt Optimization](#4-operator-reuseprompt-optimization) -5. [Web Search/Data Collection](#5-web-searchdata-collection) - ---- - -## 1. Pipeline Recommendation - -### Feature Overview -Automatically recommends and generates appropriate DataFlow Pipelines based on user's natural language descriptions, including operator selection, parameter configuration, and code generation. - -### Use Cases -- Quickly build data processing workflows -- Intelligent recommendations when unfamiliar with specific operators -- Automated Pipeline generation - -### Input Parameters - -#### Basic Configuration -- **Target Description** (Required) - - Describe the data processing goal you want to achieve - - Example: `"Give me 5 logically consistent operators, filter and deduplicate!"` - - Example: `"Clean, deduplicate, and classify text data"` - -- **Input JSONL File Path** (Required) - - Data file for testing the Pipeline - - Format: One JSON object per line - - Default: `{project_root}/tests/test.jsonl` - -- **Session ID** - - Session identifier for caching and tracking - - Default: `"default"` - -#### API Configuration - -**Primary Model Configuration** -- **Chat API URL**: LLM service address - - Default: `http://123.129.219.111:3000/v1/` -- **API Key**: Access key -- **Model Name**: e.g., `gpt-4o`, `qwen-max`, `llama3`, etc. 
- - Default: `gpt-4o` - -**Embedding Model Configuration** -- **Embedding API URL**: Embedding model service address (optional, uses primary API if empty) -- **Embedding Model Name**: e.g., `text-embedding-3-small` - -#### Debug Configuration -- **Enable Debug Mode**: Whether to enable automatic debugging and fixing -- **Debug Mode Execution Count**: 1-10 times, default 2 - -### Output Results - -#### 1. Pipeline Code (Generated Code) -```python -# Auto-generated Python code -# Contains complete Pipeline definition and execution logic -``` - -#### 2. Execution Log -- Detailed log of Pipeline execution process -- Contains execution status of each operator -- Error messages and debug information - -#### 3. Agent Results -```json -{ - "recommender": {...}, - "pipeline_builder": {...}, - "operator_executor": {...} -} -``` -- Detailed execution results of each Agent node -- Includes recommended operator list, build process, etc. - -### Usage Steps - -1. Enter your requirements in the "Target Description" box -2. Configure API information (URL, Key, Model) -3. (Optional) Configure embedding model and debug options -4. Click "Generate Pipeline" button -5. View generated code and execution results - ---- - -## 2. Operator Development - -### Feature Overview -Automatically generates new DataFlow operator code based on user requirements, including operator implementation, test code, and debugging. 
- -### Use Cases -- Create custom data processing operators -- Extend DataFlow functionality -- Rapid prototyping - -### Input Parameters - -#### Basic Configuration -- **Target Description** (Required) - - Describe the operator's functionality and purpose - - Example: `"Create an operator for sentiment analysis of text"` - - Example: `"Implement a data deduplication operator supporting multi-field combination deduplication"` - -- **Operator Category** - - Category the operator belongs to, used to match similar operators as reference - - Default: `"Default"` - - Options: `"filter"`, `"mapper"`, `"aggregator"`, etc. - -- **Test Data File Path (JSONL)** - - Data file for testing the operator - - Default: `{project_root}/tests/test.jsonl` - -#### API Configuration -- **Chat API URL**: LLM service address -- **API Key**: Access key (uses environment variable `DF_API_KEY` if empty) -- **Model Name**: Default `gpt-4o` - -#### Advanced Configuration -- **Output Language**: `en` (English) or `zh` (Chinese) -- **Enable Debug Mode**: Automatically execute and fix code errors -- **Maximum Debug Rounds**: 1-10 times, default 3 -- **Output File Path**: Location to save generated code (optional) - -### Output Results - -#### 1. Generated Code -```python -# Complete operator implementation code -class YourOperator(Operator): - def __init__(self, ...): - ... - - def run(self, dataset, ...): - ... -``` - -#### 2. Matched Operators -```json -[ - { - "op_name": "similar_operator_1", - "similarity": 0.85, - "description": "..." - } -] -``` -- List of similar operators matched by the system -- Used as reference and learning material - -#### 3. Execution Results -```json -{ - "success": true, - "output": {...}, - "stderr": "", - "stdout": "..." -} -``` -- Operator execution status -- Output data preview -- Error messages (if any) - -#### 4. Debug Information -```json -{ - "round": 2, - "input_key": "text", - "available_keys": ["text", "label"], - "stdout": "...", - "stderr": "..." 
-} -``` -- Detailed information of debug process -- Input/output of each debug round - -#### 5. Agent Results -- Execution details of each Agent node -- Includes matching, writing, execution, debugging phases - -#### 6. Execution Log -- Complete execution process log -- Contains detailed information of all phases - -### Usage Steps - -1. Describe operator functionality in detail in "Target Description" -2. Select appropriate operator category -3. Configure API information -4. (Optional) Enable debug mode to automatically fix errors -5. Click "Generate Operator" button -6. View generated code and test results -7. If modifications needed, adjust parameters and regenerate - ---- - -## 3. Manual Orchestration - -### Feature Overview -Manually select and assemble operators through visual interface to build custom Pipelines, supporting drag-and-drop sorting and parameter configuration. - -### Use Cases -- Precise control of Pipeline structure -- Reuse existing operators -- Rapid prototype validation -- Learn operator usage methods - -### Input Parameters - -#### API and File Configuration -- **Chat API URL**: LLM service address -- **API Key**: Access key -- **Model Name**: Default `gpt-4o` -- **Input JSONL File Path**: Test data file - -#### Operator Selection and Configuration - -**Step 1: Select Operator** -1. Select category from "Operator Category" dropdown - - e.g., `filter`, `mapper`, `deduplicator`, etc. -2. 
Select specific operator from "Operator" dropdown - - System automatically displays parameter description for the operator - -**Step 2: Configure Parameters** - -- **Prompt Template (Optional)** - - If operator supports Prompt templates, a dropdown selector will appear - - Automatically updates to `__init__()` parameters after selection - -- **`__init__()` Parameters (JSON Format)** - ```json - { - "param1": "value1", - "param2": 123, - "prompt_template": "module.PromptClass" - } - ``` - - Operator initialization parameters - - Must be valid JSON object - -- **`run()` Parameters (JSON Format)** - ```json - { - "input_key": "text", - "output_key": "processed_text", - "batch_size": 32 - } - ``` - - Operator runtime parameters - - Must be valid JSON object - -**Step 3: Add to Pipeline** -- Click "➕ Add Operator to Pipeline" button -- Operator will be added to Pipeline sequence - -**Step 4: Adjust Order** -- In Pipeline visualization area, drag operator cards to adjust order -- System automatically renumbers - -**Step 5: Auto-linking** -- System automatically analyzes input/output relationships between operators -- Displays link status: - - 🔗 **Linked**: Output key successfully matched to next operator's input - - ⚠️ **Pending**: Input is empty or unmatched - -### Output Results - -#### 1. Current Pipeline (Visual Display) -- Each operator displayed as a card, containing: - - Step number - - Operator name - - `__init__()` parameter preview - - `run()` parameter preview - - Connection status with previous step - -#### 2. Current Pipeline (JSON Format) -```json -[ - { - "op_name": "TextCleanerOperator", - "init_params": {...}, - "run_params": {...}, - "_incoming_links": [ - { - "input_key": "text", - "value": "raw_text", - "output_keys": ["output"] - } - ] - } -] -``` - -#### 3. 
Generated Code -```python -# Complete Pipeline execution code -from dataflow import Dataset -from dataflow.operators import * - -# Load data -dataset = Dataset.load("input.jsonl") - -# Execute Pipeline -dataset = TextCleanerOperator(...).run(dataset, ...) -dataset = DeduplicatorOperator(...).run(dataset, ...) -... - -# Save results -dataset.save("output.jsonl") -``` - -#### 4. Processing Result Data (First 100 Records) -```json -[ - {"text": "processed text 1", "label": "A"}, - {"text": "processed text 2", "label": "B"}, - ... -] -``` - -#### 5. Output File Path -- Location where processed data is saved - -### Usage Steps - -1. Configure API information and input file path -2. Select operator category and specific operator -3. Edit `__init__()` and `run()` parameters (JSON format) -4. Click "➕ Add Operator to Pipeline" -5. Repeat steps 2-4 to add more operators -6. Drag to adjust operator order (optional) -7. Check auto-link status, ensure parameters are correct -8. Click "🚀 Run Pipeline" -9. View generated code and execution results - -### Advanced Tips - -- **Clear Pipeline**: Click "🗑️ Clear Pipeline" button -- **Parameter Reuse**: System automatically links previous operator's output key to next operator's input -- **Debugging**: If execution fails, check error messages in log, adjust parameters and retry - ---- - -## 4. Operator Reuse/Prompt Optimization - -### Feature Overview -PromptAgent frontend for generating and optimizing operator Prompt templates, supporting multi-round conversational rewriting and testing. 
- -### Use Cases -- Create high-quality Prompt templates for operators -- Optimize existing Prompt effectiveness -- Rapid Prompt design iteration -- Generate test code and data - -### Input Parameters - -#### Runtime Configuration -- **Chat API Base URL**: LLM service address - - Default: `http://123.129.219.111:3000/v1/` -- **Chat API Key**: Access key -- **Model**: Model name, default `gpt-4o` -- **Language**: Prompt language, `zh` (Chinese) or `en` (English) - -#### Prompt Configuration -- **Task Description** (Required) - - Describe in detail the task the Prompt should complete - - Example: `"Perform sentiment analysis on user input text, determine if positive, negative, or neutral"` - - Example: `"Rewrite product descriptions into more attractive marketing copy"` - -- **Operator Name (op-name)** (Required) - - Name of the Prompt class - - Example: `SentimentAnalysisPrompt` - - Example: `MarketingCopywriterPrompt` - -- **Output Format** (Optional) - - Specify the format of Prompt output - - Example: - ``` - { - "sentiment": "positive/negative/neutral", - "confidence": 0.95 - } - ``` - -- **Parameter List** (Optional) - - Parameters needed by Prompt template, separated by comma, space, or newline - - Example: `text, language, style` - - Example: - ``` - input_text - target_audience - tone - ``` - -- **File Output Root Path** (Optional) - - Directory to save generated files - - Default: `./pa_cache` - -- **Delete Test Files After Generation** - - Whether to delete test files after generation (keep path placeholder) - - Default: Enabled - -### Output Results - -#### 1. Prompt File Path -- Location of generated Prompt template file -- Example: `./pa_cache/prompts/SentimentAnalysisPrompt.py` - -#### 2. Test Data File Path -- Auto-generated test data file -- Example: `./pa_cache/test_data/test_data.jsonl` - -#### 3. Test Code File Path -- Auto-generated test code -- Example: `./pa_cache/tests/test_prompt.py` - -#### 4. 
Test Data Preview -```json -[ - {"text": "This product is great!", "language": "en"}, - {"text": "Quality is terrible", "language": "en"}, - {"text": "It's okay", "language": "en"} -] -``` - -#### 5. Test Results Preview -```json -[ - { - "input": {"text": "This product is great!"}, - "output": { - "sentiment": "positive", - "confidence": 0.92 - } - } -] -``` - -#### 6. Prompt Code Preview -```python -from dataflow_agent.promptstemplates import PromptTemplate - -class SentimentAnalysisPrompt(PromptTemplate): - """Sentiment Analysis Prompt Template""" - - def __init__(self): - super().__init__() - self.system_prompt = "You are a sentiment analysis expert..." - self.user_prompt_template = "Please analyze the sentiment of the following text: {text}" - - def format(self, text: str, **kwargs) -> str: - return self.user_prompt_template.format(text=text) -``` - -#### 7. Test Code Preview -```python -import json -from your_prompt import SentimentAnalysisPrompt - -# Load test data -with open("test_data.jsonl") as f: - test_data = [json.loads(line) for line in f] - -# Test Prompt -prompt = SentimentAnalysisPrompt() -for item in test_data: - result = prompt.format(**item) - print(result) -``` - -### Multi-round Rewriting Feature - -In the right-side conversation area, you can: - -1. **View Initial Generation Results** - - Prompt code - - Test results - -2. **Propose Improvements** - - Describe how you want to modify in the conversation input box - - Examples: - - `"Add recognition of sarcastic tone"` - - `"Change output format to return only positive/negative/neutral string"` - - `"Add confidence threshold, return uncertain when below 0.7"` - -3. **Send Rewrite Instructions** - - Click "Send Rewrite Instruction" button - - System regenerates Prompt based on feedback - -4. **Iterative Optimization** - - View updated code and test results - - Continue proposing improvements - - Repeat until satisfied - -5. 
**Clear Session** - - Click "Clear Session" button to start over - -### Usage Steps - -#### Initial Generation -1. Configure API information (URL, Key, Model) -2. Fill in task description and operator name -3. (Optional) Specify output format and parameter list -4. Click "Generate Prompt Template" button -5. View generated Prompt code and test results - -#### Multi-round Optimization -1. Enter improvement suggestions in right-side dialog box -2. Click "Send Rewrite Instruction" -3. View updated code and test results -4. Repeat steps 1-3 until satisfied - -#### Using Generated Prompt -1. Get file location from "Prompt File Path" -2. Import Prompt class into your operator -3. Specify `prompt_template` in operator's `__init__()` - ---- - -## 5. Web Search/Data Collection - -### Feature Overview -Automatically collect datasets from the web (HuggingFace, Kaggle, and other platforms) and convert to unified format, supporting intelligent search, download, and data cleaning. - -### Use Cases -- Quickly build training datasets -- Collect domain-specific data -- Dataset format conversion -- Batch download and processing - -### Input Parameters - -#### Collection Configuration -- **Target Description** (Required) - - Describe the type of data you want to collect - - Example: `"Collect Python code example datasets"` - - Example: `"Collect Chinese conversation data for training chatbots"` - - Example: `"Collect image classification datasets with cat and dog pictures"` - -- **Data Category** - - `PT`: Pre-Training data - - `SFT`: Supervised Fine-Tuning data - - Default: `SFT` - -- **Dataset Quantity Limit (Per Keyword)** - - Number of datasets returned per search keyword - - Range: 1-50 - - Default: 5 - - Note: For reference only, actual quantity may vary based on search results - -- **Dataset Size Range** - - Filter datasets by size range - - Options: - - `n<1K`: Less than 1000 records - - `1K1M`: More than 1000000 records - - Default: `1K **Note: Explicit Configuration 
Requirements** Unlike the "automatic linking" in the UI, you must **explicitly configure** all parameters in script mode. You need to ensure that the `output_key` of the previous operator strictly matches the `input_key` of the next operator; the script will not automatically correct parameter names for you. - -#### 3. Run the Script - -```Bash -python script/run_dfa_op_assemble.py -``` - -#### 4. Result Output - -After the script is executed, the console will print: - -- **[Generation]**: Path of the generated Pipeline code. -- **[Code Preview]**: Preview of the first 20 lines of the generated code. -- **[Execution]**: Execution status. - -### 3.3 Practical Case: General Text Reasoning and Pseudo-Answer Generation - -You can refer to the following tutorials for learning, and also use the sample of [Google Colab](https://colab.research.google.com/drive/1W3Wb1sTyea1xDAGmVu3Tyn7fcvrsppAp?usp=sharing) we provide to run the program: - -We have a `tests/test.jsonl` file, where each line contains a `"raw_content"` field. Our goal is: based on the general English text content of this field, first invoke the large language model to generate reasoning-based answers for the text content, then generate pseudo-answers by generating candidate answers in multiple rounds and selecting the optimal one through statistics, and finally output key fields such as the list of candidate answers, optimal pseudo-answer, corresponding reasoning processes, and typical correct reasoning examples. Therefore, we select the `ReasoningAnswerGenerator` and `ReasoningPseudoAnswerGenerator` operators to orchestrate the Pipeline. 
- -The following is a complete configuration example: - -```Python -# [Pipeline definition] -PIPELINE_STEPS = [ - { - "op_name": "ReasoningAnswerGenerator", - "params": { - # __init__ parameters (note: in wf_df_op_usage they are merged into params) - "prompt_template": "dataflow.prompts.reasoning.general.GeneralAnswerGeneratorPrompt", - # run parameters - "input_key": "raw_content", - "output_key": "generated_cot" - } - }, - { - "op_name": "ReasoningPseudoAnswerGenerator", - "params": { - "max_times": 3, - "input_key": "generated_cot", - "output_key_answer": "pseudo_answers", - "output_key_answer_value": "pseudo_answer_value", - "output_key_solutions": "pseudo_solutions", - "output_key_correct_solution_example": "pseudo_correct_solution_example" - } - } -] -``` -After completing the configuration, execute the following command in the terminal: - -```Bash -python script/run_dfa_op_assemble.py -``` -The script will automatically perform the following actions: - -1. Build the graph: Parse your `PIPELINE_STEPS`. -2. Generate code: Convert the configuration into standard Python code and store it under `dataflow_cache/generated_pipelines/`. -3. Execute the task: Start a child process to run the generated Pipeline. -4. Output the report: The terminal will display `[Execution] Status: success` as well as a partial preview of the code. - -You can go directly to the `CACHE_DIR` directory to view the generated JSONL result file and verify whether the data meets expectations. diff --git a/docs/en/notes/guide/agent/operator_qa.md b/docs/en/notes/guide/agent/operator_qa.md deleted file mode 100644 index 2ab4cb1547..0000000000 --- a/docs/en/notes/guide/agent/operator_qa.md +++ /dev/null @@ -1,226 +0,0 @@ ---- -title: Operator QA -createTime: 2026/02/05 22:11:00 -permalink: /en/guide/agent/operator_qa/ ---- - -## 1. Overview - -**Operator QA** is a built-in vertical domain expert assistant within the DataFlow-Agent platform. 
Its core mission is to help users quickly navigate the extensive DataFlow operator library to find required tools, understand their usage, and inspect underlying source code. - -Unlike generic chatbots, Operator QA integrates **RAG (Retrieval-Augmented Generation)** technology. It is equipped with a complete operator index (FAISS) and a metadata knowledge base of the DataFlow project. When a user asks a question, the Agent autonomously decides whether to retrieve information from the knowledge base, which operators to inspect, and provides accurate technical details—including code snippets and parameter descriptions—back to the user. - -## 2. Core Features - -This module is driven by a frontend UI (`gradio_app/pages/operator_qa.py`), a backend execution workflow (`dataflow_agent/workflow/wf_operator_qa.py`), and a backend agent (`dataflow_agent/agentroles/data_agents/operator_qa_agent.py`). It possesses the following core capabilities: - -### 2.1 Intelligent Retrieval and Recommendation - -The Agent does more than simple keyword matching; it identifies user needs based on semantic understanding. - -* **Semantic Search**: If a user describes a need like "I want to filter out missing values," the Agent uses vector retrieval to find relevant operators such as `ContentNullFilter`. -* **On-Demand Invocation**: Based on the `BaseAgent` graph mode (`use_agent=True`), the Agent automatically determines whether to call the `search_operators` tool or respond directly based on the conversation context. - -### 2.2 Multi-turn Conversation - -Utilizing the `AdvancedMessageHistory` module, the system maintains a complete session context. - -* **Contextual Memory**: A user can ask, "Which operators can load data?" followed by "How do I fill in **its** parameters?" The Agent can recognize that "its" refers to the operator recommended in the previous turn. 
-* **State Persistence**: In both script interaction and UI modes, by reusing the same `state` and `graph` instances, the `messages` list accumulates across multiple turns, ensuring the LLM maintains a full memory. - -### 2.3 Visualization and Interaction - -* **Gradio UI**: Provides code previews, operator highlighting, and quick-question buttons. -* **Interaction**: Supports multi-turn Q&A, clearing history, and viewing history. - -## 3. Architectural Components - -### 3.1 OperatorQAAgent - -* Inherits from `BaseAgent` and is configured in ReAct/Graph mode. -* Possesses Post-Tools permissions to call RAG services for data retrieval. -* Responsible for parsing natural language, planning database queries, and generating final natural language responses. - -### 3.2 OperatorRAGService - -* A service layer decoupled from the Agent. -* Manages the FAISS vector index and `ops.json` metadata. -* Provides underlying capabilities such as `search` (vector search), `get_operator_info` (fetch details), and `get_operator_source` (fetch source code). - -## 4. User Guide - -This feature provides two modes of use: the **Graphical Interface (Gradio UI)** and **Command-line Scripts**. - -### 4.1 UI Operation - -The UI is ideal for interactive exploration and rapid validation. To launch the web interface: -```bash -python gradio_app/app.py -``` -Then open `http://127.0.0.1:7860` in your browser to start using it: - -1. **Configure Model**: In the "Configuration" panel on the right, verify the API URL and Key, and select a model (defaults to `gpt-4o`). -2. **Initiate Inquiry**: - 1. **Dialogue Box**: Type your question. - 2. **Quick Buttons**: Click "Quick Question" buttons, such as "Which operator filters missing values?" to start instantly. -3. **View Results**: - 1. **Chat Area**: Displays the Agent's response and citations. - 2. **Right Panel**: - * `Related Operators`: Lists operator names retrieved by the Agent. - * `Code Snippets`: Displays Python source code if specific implementations are involved. 
- -### 4.2 Script Invocation and Explicit Configuration - -In addition to the UI, the system provides the `script/run_dfa_operator_qa.py` script. This method is suitable for development and debugging, or for querying operator usage through code automation. - -#### 1. Modify the Configuration - -Open `script/run_dfa_operator_qa.py` and make modifications in the configuration section at the top of the file. - -**API and File Configuration** - -* **CHAT_API_URL**: URL of the LLM service. -* **API_KEY**: Model invocation key. The Agent needs to call the large model to understand your questions and summarize the answers. -* **MODEL**: Model name; the default is `gpt-4o`. -* **CACHE_DIR**: Cache directory. -* **TOP_K**: Retrieval depth. Specify the maximum number of candidate results the Agent returns when retrieving relevant operators from the knowledge base (5 by default). - -**Query and Interaction Mode Configuration** - -* **INTERACTIVE**: **Interaction control switch** (`True` / `False`). - * `True` (Interactive Mode): Launch the continuous conversation mode in the terminal. You can ask follow-up questions as in a chat, and enter `clear` to clear the history. - * `False` (One-time Mode): The script only executes one question specified by `QUERY` and exits immediately after outputting the result. -* **QUERY**: Question content for one-time query. Only effective when `INTERACTIVE = False`. -* **OUTPUT_JSON**: Result save path. - * Only effective in one-time mode. - * If a path is set, the Agent's answer, the retrieved list of operators, and the code snippets are all saved to a JSON file; if left blank, they are only printed to the console. - -#### 2. Run the Script - -After completing the configuration, execute the following command in the terminal: - -```bash -python script/run_dfa_operator_qa.py - -``` - -#### 3. 
Result Output - -After the script is executed, the console behaves differently depending on the mode: - -* **Interactive Mode**: The `🧑 You:` prompt appears in the terminal, waiting for input. - * Enter `exit` or `quit` to exit. - * Enter `clear` to clear conversation history. - * Enter `history` to view conversation history. -* **One-time Mode**: The console directly prints the Agent's thinking process, the retrieved list of operators, and the final answer. If `OUTPUT_JSON` is configured, a message confirming that the file was saved is also displayed. - -### 4.3 Practical Case: Find Operators for "Data Cleaning" - -You can follow the tutorial below, or run the example in the [Google Colab notebook](https://colab.research.google.com/drive/1maDKWp-3zEQNScmL_S7MHUdUC1xyCIcK?usp=sharing) we provide: - -Suppose you need to clean data when developing a Pipeline and want to know if there are ready-made operators in the DataFlow library for processing. - -**Scenario Configuration**: We use one-time query mode and save the results locally so that the detailed parameters can be reviewed later. - -Open the script and modify the configuration as follows: - -```python -# ===== Example config (edit here) ===== - -# 1. Disable interactive mode and execute a one-time query -INTERACTIVE = False - -# 2. Define your specific requirements -QUERY = "I want to clean data, which operator should I use?" - -# 3. Ensure the API configuration is correct -CHAT_API_URL = os.getenv("DF_API_URL", "http://123.129.219.111:3000/v1/") -API_KEY = os.getenv("DF_API_KEY", "") -MODEL = os.getenv("DF_MODEL", "gpt-4o") - -# 4. Specify the result save path -OUTPUT_JSON = "cache_local/operator_qa_result.json" - - -``` - -**Run**: - -After running the script, the Agent will perform RAG retrieval and generate an answer. 
Open the generated `script/cache_local/operator_qa_result.json` and you can see the data with the following structure: - -```json -{ - "success": true, - "query": "I want to clean data, which operator should I use?", - "answer": "在 DataFlow 中,有多个算子可以用于数据清洗。以下是一些推荐的算子:\n\n1. **KBCTextCleaner**: 适用于对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化。适合需要提升RAG知识库质量的场景。\n\n2. **KBCTextCleanerBatch**: 类似于 KBCTextCleaner,但支持批量处理。\n\n3. **ContentNullFilter**: 用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\n\n4. **HtmlUrlRemoverRefiner**: 去除文本中的URL链接和HTML标签,净化文本内容。\n\n5. **PresidioFilter**: 基于PresidioScorer打分器的得分对数据进行过滤,识别并处理文本中的私人实体(PII)。", - "related_operators": [ - "KBCTextCleaner", - "KBCTextCleanerBatch", - "ContentNullFilter", - "HtmlUrlRemoverRefiner", - "PresidioFilter" - ], - "code_snippet": null, - "follow_up_suggestions": [ - "了解如何配置这些算子的参数", - "查看某个算子的详细实现", - "询问特定数据清洗场景的最佳实践" - ], - "messages": [ - { - "type": "SystemMessage", - "content": "\n[角色]\n你是 DataFlow 算子库的智能问答助手。你的职责是帮助用户了解和使用 DataFlow 中的各种数据处理算子。\n\n[能力]\n1. 根据用户描述的需求,推荐合适的算子\n2. 解释算子的功能、用途和使用场景\n3. 详细说明算子的参数含义和配置方法\n4. 在需要时展示算子的源码实现\n5. 基于多轮对话理解用户的上下文需求\n\n[DataFlow 算子简介]\nDataFlow 是一个数据处理框架,提供了丰富的算子用于数据清洗、过滤、生成、评估等任务。\n每个算子都是一个 Python 类,通常包含:\n- `__init__` 方法:初始化算子,配置必要的参数(如 LLM 服务、提示词等)\n- `run` 方法:执行数据处理逻辑,接收输入数据并产出处理结果\n\n[可用工具]\n你可以调用以下工具来获取算子信息:\n\n1. **search_operators(query, top_k)** - 根据功能描述搜索相关算子\n - 当用户询问某类功能的算子时使用\n - 如果对话历史中已有相关算子信息,可以不调用直接回答\n\n2. **get_operator_info(operator_name)** - 获取指定算子的详细描述\n - 当用户询问特定算子的功能时使用\n\n3. **get_operator_source_code(operator_name)** - 获取算子的完整源代码\n - 当用户需要了解算子实现细节时使用\n\n4. **get_operator_parameters(operator_name)** - 获取算子的参数详情\n - 当用户询问算子如何配置、参数含义时使用\n\n[工具调用策略]\n- 如果是新问题且对话历史中没有相关信息 → 调用 search_operators 检索\n- 如果对话历史中已有相关算子信息 → 可以直接回答,无需重复检索\n- 如果用户追问某个算子的细节 → 调用 get_operator_info/get_operator_source_code/get_operator_parameters\n\n[回答风格]\n1. 清晰简洁,重点突出\n2. 使用中文回答(除非用户要求英文)\n3. 对于技术细节,提供具体的代码示例\n4. 
在解释参数时,说明参数类型、默认值和作用\n\n[输出格式]\n请以 JSON 格式返回,包含以下字段:\n{\n \"answer\": \"对用户问题的详细回答\",\n \"related_operators\": [\"相关算子名称列表\"],\n \"source_explanation\": \"说明答案的信息来源,例如:'通过search_operators检索到的XXX算子'、'基于对话历史中的算子信息'、'基于我的知识库'\",\n \"code_snippet\": \"如有必要,提供代码片段(可选)\",\n \"follow_up_suggestions\": [\"可能的后续问题建议(可选)\"]\n}\n\n\n请以JSON格式返回结果,不要包含其他文字说明!!!直接返回json内容,不要```json进行包裹!!", - "role": "", - "additional_kwargs": {}, - "metadata": {} - }, - { - "type": "HumanMessage", - "content": "\n[用户问题]\nI want to clean data, which operator should I use?\n\n[任务]\n请根据用户问题回答。对话历史会自动包含在消息中,你可以参考之前的对话。\n\n工具调用指南:\n1. 如果需要查找算子,调用 search_operators 工具\n2. 如果需要某个算子的详细信息,调用 get_operator_info 工具\n3. 如果需要源码,调用 get_operator_source_code 工具\n4. 如果需要参数详情,调用 get_operator_parameters 工具\n5. 如果之前的对话中已有相关信息,可以直接回答,无需重复调用工具\n\n回答要求:\n- 基于工具返回的信息或对话上下文中的信息回答\n- 在 source_explanation 中说明答案来源\n- 如果问题不明确,可以在 follow_up_suggestions 中给出澄清建议\n\n请以 JSON 格式返回你的回答。\n", - "role": "", - "additional_kwargs": {}, - "metadata": {} - }, - { - "type": "AIMessage", - "content": "", - "role": "", - "additional_kwargs": { - "tool_calls": [ - { - "id": "call_06xfRcedme8OAVBq33keXVdS", - "function": { - "arguments": "{\"query\":\"clean data\"}", - "name": "search_operators" - }, - "type": "function" - } - ], - "refusal": null - }, - "metadata": {} - }, - { - "content": "{\n \"query\": \"clean data\",\n \"matched_operators\": [\n \"KBCTextCleaner\",\n \"KBCTextCleanerBatch\",\n \"ContentNullFilter\",\n \"HtmlUrlRemoverRefiner\",\n \"PresidioFilter\"\n ],\n \"operator_details\": [\n {\n \"node\": 1,\n \"name\": \"KBCTextCleaner\",\n \"description\": \"知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\\n1. 移除冗余HTML标签但保留语义化标签\\n2. 标准化引号/破折号等特殊字符\\n3. 处理超链接同时保留文本\\n4. 保持原始段落结构和代码缩进\\n5. 确保事实性内容零修改\\n\\n输入格式示例:\\n
\\n

标题文本

\\n

正文段落,包括特殊符号,例如“弯引号”、–破折号等

\\n \\\"示意图\\\"\\n 链接文本\\n
代码片段
\\n ...\\n
\\n\\n输出格式示例:\\n标题文本\\n\\n正文段落,包括特殊符号,例如\\\"直引号\\\"、-破折号等\\n\\n[Image: 示例图 example.jpg]\\n\\n链接文本\\n\\n代码片段\\n\\n[结构保持,语义保留,敏感信息脱敏处理(如手机号、保密标记等)]\",\n \"category\": \"knowledge_cleaning\"\n },\n {\n \"node\": 2,\n \"name\": \"KBCTextCleanerBatch\",\n \"description\": \"知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\\n1. 移除冗余HTML标签但保留语义化标签\\n2. 标准化引号/破折号等特殊字符\\n3. 处理超链接同时保留文本\\n4. 保持原始段落结构和代码缩进\\n5. 确保事实性内容零修改\",\n \"category\": \"knowledge_cleaning\"\n },\n {\n \"node\": 3,\n \"name\": \"ContentNullFilter\",\n \"description\": \"该算子用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\\n初始化参数:\\n- 无\\n运行参数:\\n- storage:DataFlowStorage对象\\n- input_key:输入文本字段名\\n- output_key:输出标签字段名,默认为'content_null_filter_label'\\n返回值:\\n- 包含output_key的列表\",\n \"category\": \"general_text\"\n },\n {\n \"node\": 4,\n \"name\": \"HtmlUrlRemoverRefiner\",\n \"description\": \"去除文本中的URL链接和HTML标签,净化文本内容。使用正则表达式匹配并移除各种形式的URL和HTML标签。输入参数:\\n- input_key:输入文本字段名\\n输出参数:\\n- 包含净化后文本的DataFrame\\n- 返回输入字段名,用于后续算子引用\",\n \"category\": \"general_text\"\n },\n {\n \"node\": 5,\n \"name\": \"PresidioFilter\",\n \"description\": \"基于PresidioScorer打分器的得分对数据进行过滤。使用Microsoft Presidio模型识别文本中的私人实体(PII),返回PII信息个数。\\n支持识别姓名、邮箱、电话号码、身份证号等多种敏感信息类型,可用于数据隐私保护和合规性检查。\\n输入参数:\\n- min_score:保留样本的最小PII数量阈值,默认为0\\n- max_score:保留样本的最大PII数量阈值,默认为5\\n- lang:文本语言,默认为'en'\\n- device:模型运行设备,默认为'cuda'\\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\\n输出参数:\\n- 过滤后的DataFrame,仅保留PII数量在[min_score, max_score]范围内的样本\\n- 返回包含输出字段名的列表,用于后续算子引用\",\n \"category\": \"general_text\"\n }\n ]\n}", - "additional_kwargs": {}, - "response_metadata": {}, - "type": "tool", - "name": "search_operators", - "id": "72462389-4f54-4b33-9c31-0f662cc537cb", - "tool_call_id": "call_06xfRcedme8OAVBq33keXVdS", - "artifact": null, - "status": "success" - }, - { - "type": "AIMessage", - "content": "{\n \"answer\": \"在 DataFlow 中,有多个算子可以用于数据清洗。以下是一些推荐的算子:\\n\\n1. 
**KBCTextCleaner**: 适用于对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化。适合需要提升RAG知识库质量的场景。\\n\\n2. **KBCTextCleanerBatch**: 类似于 KBCTextCleaner,但支持批量处理。\\n\\n3. **ContentNullFilter**: 用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\\n\\n4. **HtmlUrlRemoverRefiner**: 去除文本中的URL链接和HTML标签,净化文本内容。\\n\\n5. **PresidioFilter**: 基于PresidioScorer打分器的得分对数据进行过滤,识别并处理文本中的私人实体(PII)。\",\n \"related_operators\": [\n \"KBCTextCleaner\",\n \"KBCTextCleanerBatch\",\n \"ContentNullFilter\",\n \"HtmlUrlRemoverRefiner\",\n \"PresidioFilter\"\n ],\n \"source_explanation\": \"通过search_operators检索到的相关算子\",\n \"code_snippet\": null,\n \"follow_up_suggestions\": [\n \"了解如何配置这些算子的参数\",\n \"查看某个算子的详细实现\",\n \"询问特定数据清洗场景的最佳实践\"\n ]\n}", - "role": "", - "additional_kwargs": { - "refusal": null - }, - "metadata": {} - } - ] -} - -``` \ No newline at end of file diff --git a/docs/en/notes/guide/agent/operator_write.md b/docs/en/notes/guide/agent/operator_write.md deleted file mode 100644 index 037bf15173..0000000000 --- a/docs/en/notes/guide/agent/operator_write.md +++ /dev/null @@ -1,296 +0,0 @@ ---- -title: Operator Write -createTime: 2026/02/05 22:11:00 -permalink: /en/guide/agent/operator_write/ ---- - -## 1. Overview - -**Operator Write** is the core productivity module of the DataFlow-Agent. It is not merely a tool for generating Python code based on user requirements but rather builds a closed-loop system for **generation, execution, and debugging**. - -This workflow enables: - -1. **Semantic Matching**: Understanding user intent (e.g., "filter missing values") and finding the best-matching base class within the existing operator library. -2. **Code Generation**: Writing executable operator code based on the base class and user data samples. -3. **Automatic Injection**: Automatically injecting LLM service capabilities into the operator if needed. -4. **Subprocess Execution**: Instantiating and running the generated operator in a controlled environment. -5. 
**Self-Healing**: Launching a Debugger to analyze stack traces if execution fails, automatically modifying the code, and retrying until success or the maximum retry limit is reached. - -## 2. Core Features - -### 2.1 Intelligent Code Generation - -* **Sample-Based Programming**: The Agent reads actual data samples (calling the pre-tool `local_tool_for_sample`) and the data Schema to ensure the generated code correctly handles real field names and data types. -* **Operator Reuse**: The system prioritizes retrieving existing operator libraries (calling the pre-tool `match_operator`) to generate code inherited from existing base classes rather than starting from scratch, ensuring code standardization and maintainability. - -### 2.2 Automatic Debugging Loop - -This is a system equipped with self-reflection capabilities. - -* **Execution Monitoring**: At the `llm_instantiate` node, the system attempts to execute the generated code (`exec(code_str)`) and captures standard output and standard errors. -* **Error Diagnosis**: If an exception occurs, the `code_debugger` Agent analyzes the error stack (`error_trace`) and the current code to generate repair suggestions (`debug_reason`). -* **Auto-Rewrite**: The `rewriter` Agent regenerates the code based on the repair suggestions, automatically updates the file, and enters the next round of testing. - -### 2.3 LLM Service Injection - -For complex operators requiring Large Model calls (e.g., "generate summary based on content"), the `llm_append_serving` node automatically injects standard LLM call interfaces (`self.llm_serving`) into the operator code, empowering it with AI capabilities. - -## 3. Workflow Architecture - -This feature is orchestrated by `dataflow_agent/workflow/wf_pipeline_write.py`, forming a directed graph containing conditional loops. - -1. **Match Node**: Retrieves reference operators. -2. **Write Node**: Writes the initial code. -3. **Append Serving Node**: Injects LLM capabilities. -4. 
**Instantiate Node**: Attempts to run the code. - 5. **Debugger Node** (Conditional Trigger): Analyzes errors. - 6. **Rewriter Node**: Fixes the code. - -## 4. User Guide - -This feature provides two modes of usage: **Graphical Interface (Gradio UI)** and **Command Line Script**. - -### 4.1 UI Operation - -The frontend page code is located in `gradio_app/pages/operator_write.py`, which provides a visual interactive experience. It is ideal for interactive exploration and rapid validation. To launch the web interface: -```bash -python gradio_app/app.py -``` -Visit `http://127.0.0.1:7860` to start using it. - -#### 1. Configure Inputs - -Configure the following in the left panel of the page: - -* **Target Description**: Describe in detail the function and purpose of the operator you want to create. - * Example: "Create an operator that performs sentiment analysis on text." -* **Operator Category**: The category the operator belongs to, used for matching similar operators as references. Defaults to `"Default"`. Options include `"filter"`, `"mapper"`, `"aggregator"`, etc. -* **Test Data File**: Specify the `.jsonl` file path used for testing the generated operator. Defaults to the project's built-in `tests/test.jsonl`. -* **Debug Settings**: - * `Enable Debug Mode`: If checked, the system automatically attempts to fix the code if an error occurs. - * `Max Debug Rounds`: Set the maximum number of automatic repair attempts (default is 3). -* **Output Path**: Specify the save path for the generated code (optional). - -#### 2. View Results - -After clicking the **"Generate Operator"** button, the right panel displays detailed results: - -* **Generated Code**: Final usable Python code, supporting syntax highlighting. -* **Matched Operators**: Displays the list of reference operators found by the system in the library (e.g., `"LangkitSampleEvaluator"`, `"LexicalDiversitySampleEvaluator"`, `"PresidioSampleEvaluator"`, `"PerspectiveSampleEvaluator"`, etc.). 
-* **Execution Result**: Shows `success: true/false` and specific log information `stdout`/`stderr`. -* **Debug Info**: If debugging was triggered, this displays the runtime captured `stdout`/`stderr` and the selected input field key (`input_key`). -* **Agent Results**: Detailed execution results for each Agent node. -* **Execution Log**: Complete execution log information, facilitating the troubleshooting of the Agent's thought process. - -### 4.2 Script Invocation and Explicit Configuration - -For developers, it is recommended to directly modify and run `script/run_dfa_operator_write.py`. This method can be more flexibly integrated into automated workflows and save the generated operator files. - -#### 1. Modify the Configuration - -Open `script/run_dfa_operator_write.py` and modify the parameters in the configuration section at the top of the file. - -**Task Configuration** - - * **`TARGET`**: Describe the function of the operator in natural language. The more specific the description, the more accurate the generated code. It is recommended to include descriptions of input fields and expected outputs. - - * Example: `"Create an operator for performing sentiment analysis on text"` - - * Example: `"Implement a data deduplication operator that supports deduplication based on a combination of multiple fields"` - - * **`CATEGORY`**: The category to which the operator belongs, used to match similar operators as references - - * Default: `"Default"` - - * Optional: `"reasoning"`, `"agentic_rag"`, `"knowledge_cleaning"`, etc. - - * **`JSON_FILE`**: Data file (`.jsonl` format) used to test the operator. - - * Default: If left blank, the project's built-in test data `tests/test.jsonl` will be used. - - * **`OUTPUT_PATH`**: Save path for the generated Python code. If left blank, the code will only be printed to the console and no file will be saved. 
- -**API and Debug Configuration** - - * **`CHAT_API_URL`**: URL of the LLM service - - * **`api_key`**: Access key (using the environment variable DF_API_KEY) - - * **`MODEL`**: Model name, default is gpt-4o - - * **`NEED_DEBUG`**: Whether to enable the automatic debugging loop (`True` / `False`) - - * `True`: If the generated code reports an error when running on `JSON_FILE`, the Agent will automatically analyze the error stack and attempt to rewrite the code - - * `False`: Generate and execute the code, then end immediately regardless of whether it runs successfully - - * **`MAX_DEBUG_ROUNDS`**: Maximum number of automatic repair attempts, default is 3 rounds - -#### 2. Run the Script - -After completing the configuration, execute the following command in the terminal: - -```bash -python script/run_dfa_operator_write.py -``` - -#### 3. Result Output - -During script execution, the following key information will be output: - -* **[Match Operator Result]**: Displays the "reference operators" found by the Agent in the existing operator library - -* **[Writer Result]**: Length of the generated code and its save location - -* **[Execution Result]**: Code execution result - - * `Success: True`: Indicates the code was generated successfully and ran without errors on the test data. - - * `Success: False`: Indicates the run failed. - -* **[Debug Runtime Preview]**: `stdout`/`stderr` captured during runtime, as well as the selected input field key name (`input_key`) - -### 4.3 Practical Case: Writing a Sentiment Analysis Operator - -You can refer to the following tutorials for learning, and also use the sample of [Google Colab](https://colab.research.google.com/drive/1oTkwMNwxMFGAe9rNtYCC47CQ9HxsA0uH?usp=sharing) we provide to run the program: - -We have a log file `tests/test.jsonl` containing the field `"raw_content"`. We want to create an operator to perform sentiment analysis on the text content of this field. 
- -**Configuration Example:** - -```python -# ===== Example config (edit here) ===== -# API KEY is passed in via the environment variable DF_API_KEY -CHAT_API_URL = os.getenv("DF_API_URL", "http://123.129.219.111:3000/v1/") -MODEL = os.getenv("DF_MODEL", "gpt-4o") -LANGUAGE = "en" - -# 1. Define specific requirements -TARGET = "Create an operator for performing sentiment analysis on text" -CATEGORY = "Default" -# 2. Specify the result save path -OUTPUT_PATH = "cache_local/my_operator.py" -# 3. Specify the test data path -JSON_FILE = "tests/test.jsonl" -# 4. Enable debugging -NEED_DEBUG = True -MAX_DEBUG_ROUNDS = 10 -``` - -**Run:** - -After running the script, the terminal will output the following: - -``` bash -==== Match Operator Result ==== -Matched ops: ['LangkitSampleEvaluator', 'LexicalDiversitySampleEvaluator', 'PresidioSampleEvaluator', 'PerspectiveSampleEvaluator'] - -==== Writer Result ==== -Code length: 3619 -Saved to: cache_local/my_operator.py - -==== Execution Result (instantiate) ==== -Success: True - -==== Debug Runtime Preview ==== -input_key: raw_content -available_keys: ['raw_content'] -[debug stdout] - [selected_input_key] raw_content - -[debug stderr] -Generating......: 100%|######### | 18/20 [00:08<00:00, 3.34it/s] -``` - -The generated code is saved to `script/cache_local/my_operator.py`. 
Open it to view the generated code: - -``` python -from dataflow.core import OperatorABC -from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.utils.storage import DataFlowStorage, FileStorage -from dataflow import get_logger -from dataflow.serving import APILLMServing_request -import pandas as pd - -@OPERATOR_REGISTRY.register() -class SentimentAnalysisOperator(OperatorABC): - def __init__(self, llm_serving=None): - self.logger = get_logger() - self.logger.info(f'Initializing {self.__class__.__name__}...') - self.llm_serving = llm_serving - self.score_name = 'SentimentScore' - self.logger.info(f'{self.__class__.__name__} initialized.') - - @staticmethod - def get_desc(lang: str = "zh"): - if lang == "zh": - return ( - "使用LLM进行文本情感分析,返回情感得分,得分越高表示情感越积极。\n" - "输入参数:\n" - "- llm_serving:LLM服务对象\n" - "- input_key:输入文本字段名\n" - "- output_key:输出得分字段名,默认'SentimentScore'\n" - "输出参数:\n" - "- 包含情感分析得分的DataFrame" - ) - else: - return ( - "Perform sentiment analysis on text using LLM, returning sentiment scores where higher scores indicate more positive sentiment.\n" - "Input Parameters:\n" - "- llm_serving: LLM serving object\n" - "- input_key: Field name for input text\n" - "- output_key: Field name for output score, default 'SentimentScore'\n" - "Output Parameters:\n" - "- DataFrame containing sentiment analysis scores" - ) - - def get_score(self, samples: list[dict], input_key: str) -> list[float]: - texts = [sample.get(input_key, '') or '' for sample in samples] - return self.llm_serving.generate_from_input(texts) - - def eval(self, dataframe: pd.DataFrame, input_key: str) -> list[float]: - self.logger.info(f"Evaluating {self.score_name}...") - samples = dataframe.to_dict(orient='records') - scores = self.get_score(samples, input_key) - self.logger.info("Evaluation complete!") - return scores - - def run(self, - storage: DataFlowStorage, - input_key: str | None = None, - output_key: str = 'SentimentScore'): - dataframe = storage.read("dataframe") - if 
input_key is None: - input_key = self._auto_select_input_key(dataframe) - dataframe[output_key] = self.eval(dataframe, input_key) - storage.write(dataframe) - - def _auto_select_input_key(self, dataframe: pd.DataFrame) -> str: - preferred_keys = ['raw_content', 'text', 'content', 'sentence', 'instruction', 'input', 'query', 'problem', 'prompt'] - for key in preferred_keys: - if key in dataframe.columns and dataframe[key].notnull().any(): - return key - return dataframe.columns[0] - -# Runnable entry code - -test_data_path = '/root/autodl-tmp/DataFlow-Agent/tests/test.jsonl' - -# Initialize FileStorage -storage = FileStorage(first_entry_file_name=test_data_path, cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl") -storage = storage.step() - -# Initialize llm_serving -llm_serving = APILLMServing_request(api_url="http://123.129.219.111:3000/v1/chat/completions", key_name_of_api_key="DF_API_KEY", model_name="gpt-4o") - -# Select input key -available_keys = ['raw_content'] -preselected_input_key = 'raw_content' -input_key = preselected_input_key if preselected_input_key in available_keys else available_keys[0] -print(f"[selected_input_key] {input_key}") - -# Instantiate and run the operator -operator = SentimentAnalysisOperator(llm_serving=llm_serving) -operator.run(storage=storage, input_key=input_key) -``` - diff --git a/docs/en/notes/guide/agent/pipeline_prompt.md b/docs/en/notes/guide/agent/pipeline_prompt.md deleted file mode 100644 index 9be0d1d8f7..0000000000 --- a/docs/en/notes/guide/agent/pipeline_prompt.md +++ /dev/null @@ -1,234 +0,0 @@ ---- -title: Operator Reuse / Prompt Optimization -createTime: 2026/02/05 22:11:00 -permalink: /en/guide/agent/pipeline_prompt/ ---- - -## 1. Overview - -**Prompt Optimization** is the core module of DataFlow-Agent designed for **Prompt-Engineering**. Its design goal is to solve the problem of "generic operator logic reuse". - -This module adopts a **single-node architecture**. 
When a user proposes a new data processing requirement, the Agent not only writes a Prompt that complies with operator specifications but also **automatically generates synthetic test data** and internally builds and runs test scripts. - - -## 2. Core Features - -### 2.1 Example-Based Generation - -* **Operator Code Analysis**: The Agent automatically reads the source code of the target operator (`OP_NAME`) and extracts its parameter definitions. -* **Prompt Migration**: The system retrieves existing Prompt cases from the operator library and uses them as context to guide the LLM in generating a new Prompt class that conforms to the operator's interface specifications (e.g., `init` parameter structure). - -### 2.2 Self-Verification Based on Synthetic Data - -The generated Prompt does not merely remain as text; the Agent immediately tests it. - -* **Data Synthesis**: The Agent **does not require** large-scale business data from the user for testing. Instead, it utilizes the LLM to analyze the operator logic and automatically generates a set of **synthetic test data** covering various edge cases, saving it as a temporary JSONL file. -* **Subprocess Execution**: The Agent internally and automatically builds a temporary Python test script and launches a subprocess to execute it, verifying whether the generated Prompt runs correctly and produces the expected results. - -### 2.3 Iterative Optimization - -* **Interactive Feedback**: The system does not perform blind automatic retries. Users input modification suggestions after viewing the test results, the generated Prompt, and data previews on the frontend. -* **Targeted Hot Update**: Upon receiving feedback, the backend `PromptWriter` calls the `revise_with_feedback` method to make targeted modifications to the Prompt code while maintaining the existing context, automatically triggering a new round of the testing loop. - -## 3. 
System Architecture - -This function is defined by `dataflow_agent/workflow/wf_pipeline_prompt.py`, featuring a core **single-node** workflow. All generation and verification logic are highly cohesive within the `PromptWriter` Agent. - -### 3.1 Core Node Process - -The **Prompt Writer Node** is the only node in the graph, executing the following internal logic in sequence: - -1. **Context Retrieval**: Calls Pre-tools to obtain the target operator's source code, the user's target, and Prompt examples. -2. **Prompt Generation**: Calls the LLM to generate the Prompt class code in Python form. It then saves the generated code to the `state` object via the `update_state_result` method and writes it to a local file to provide dependencies for subsequent test steps. -3. **Test Data Synthesis**: Calls the internal method `_build_test_data_by_llm` to generate synthetic test data based on the task description. -4. **Test Script Construction**: Calls the internal method `_build_test_code` to generate a temporary test script using string templates. -5. **Subprocess Execution**: Uses `subprocess` to run the test script and captures standard output (stdout) and standard error (stderr). -6. **Test Result Output**: Scans and reads the test result file generated by the subprocess execution, updates the test results into `state.temp_data`, and completes the process. - -### 3.2 Iterative Optimization Mechanism - -The optimization process depends on **frontend interaction**: - -1. The user views the execution results in the UI. -2. The user submits feedback. -3. The frontend calls `_on_chat_submit`, triggering the Agent's `revise_with_feedback` interface. -4. The Agent modifies the code based on the feedback and re-executes the validation phase described above (Test Data Synthesis -> Test Script Construction -> Subprocess Execution). - -## 4. User Guide - -This feature provides two modes of usage: **Graphical Interface (Gradio UI)** and **Command Line Script**. 
- -### 4.1 Graphical Interface - -The frontend code is located in `gradio_app/pages/PA_frontend.py`, which provides a visual interactive experience. It is ideal for interactive exploration and rapid validation. To launch the web interface: -```bash -python gradio_app/app.py -``` -Visit `http://127.0.0.1:7860` to start using it. - -**Initial Generation:** - -1. Configure API information (URL, Key, Model). -2. Fill in the task description and operator name. -3. (Optional) Specify the output format, argument list, and file output root path. -4. Click the "Generate Prompt Template" button. -5. View the test data, test results, Prompt code, and test code generated by the Agent. - -**Multi-turn Optimization:** - -1. If the results do not meet expectations, enter improvement suggestions in the dialogue box on the right. -2. Click "Send Rewrite Instruction". -3. View the updated code and test results. -4. Repeat steps 1-3 until a satisfactory result is obtained. - -**Using the Generated Prompt:** - -1. Get the generated Prompt file location from "Prompt File Path". -2. Import the Prompt class into your operator. -3. Specify `prompt_template` in the operator's `__init__()`. - -### 4.2 Script Invocation and Explicit Configuration - -For developers who need to integrate Prompt generation into automated pipelines or prefer code-based configuration, `script/run_dfa_pipeline_prompt.py` can be used. - -#### 1. Modify the Configuration - -Open `script/run_dfa_pipeline_prompt.py` and make modifications in the configuration section at the top of the file. 
- -**API Configuration** - * **`CHAT_API_URL`**: URL of the LLM service - * **`api_key`**: Access key (using the environment variable DF_API_KEY) - * **`MODEL`**: Model name, default is gpt-4o - -**Task Configuration** - * **`TASK_DESCRIPTION`**: Describe the task you want this Prompt to complete in natural language - * Example: `"I want to write a filter prompt suitable for financial questions."` - * **`OP_NAME`**: Specify which operator will load and use the generated Prompt - * **`OUTPUT_FORMAT`** (Optional): Specify the output format of the Prompt. If left blank, the Agent will generate it by imitating existing prompts - * **`ARGUMENTS`** (Optional): Parameters required by the Prompt template, separated by commas, spaces, or newlines - * Example: `["min_len=10", "drop_na=true"]` - -**Environment Configuration** - * **`CACHE_DIR`**: Result output directory. The generated Prompt files (`.py`), temporary test data, test code, etc., will all be saved here - * **`DELETE_TEST_FILES`**: Whether to automatically clean up temporary synthetic test data after running (`True`/`False`) - -#### 2. Run the Script - -After completing the configuration, execute the following command in the terminal: - -```bash -python script/run_dfa_pipeline_prompt.py -``` - -#### 3. Result Output - -After the script is executed, the console will print the generation process. You can find the generated files in the `CACHE_DIR` directory. - -### 4.3 Practical Case: Reuse the ReasoningQuestionFilter to Write a Filter Prompt for Financial Questions - -You can refer to the following tutorials for learning, and also use the sample of [Google Colab](https://colab.research.google.com/drive/1cU5Eg6tuc7WVDG33tU9Wplza52e54kts?usp=sharing) we provide to run the program: - -Suppose we want to reuse the `ReasoningQuestionFilter` operator in the system and turn it into a filter for financial domain questions. 
Open the script and modify the configuration as follows: - -```python -# ===== Example config (edit here) ===== - -# 1. Define the task -TASK_DESCRIPTION = "I want to write a filter prompt suitable for financial questions." - -# 2. Specify the operator to reuse (tell the Agent this Prompt is for ReasoningQuestionFilter) -OP_NAME = "ReasoningQuestionFilter" - -# These two items only need to be provided if the operator does not have any preset prompts; otherwise, it will be generated by imitating existing prompts -OUTPUT_FORMAT = "" # e.g. "Return JSON with keys: ..." -ARGUMENTS = [] # e.g. ["min_len=10", "drop_na=true"] - -# Cache directory for storing test data and prompts -CACHE_DIR = "./pa_cache" -DELETE_TEST_FILES = False -``` - -**Run:** - -After running the script, the terminal will output execution logs. You can find the generated Prompt file `finance_question_filter_prompt20260209143556.py`, test code `test_FinanceQuestionFilterPrompt.py`, and test data in the `CACHE_DIR` directory. The content of the generated Prompt is as follows: -``` python -__all__ = ['FinanceQuestionFilterPrompt'] - -from dataflow.core.prompt import DIYPromptABC - -class FinanceQuestionFilterPrompt(DIYPromptABC): - def __init__(self): - pass - - def build_prompt(self, question: str) -> str: - prompt = f""" - # 角色: - 你是一个金融问题的审核助手。 - # 任务 - 你的任务是检查给定的金融问题是否符合以下标准: - 0. 首先,确认输入仅包含一个明确的金融问题(没有额外的指令如“重写”、“翻译”或提供的答案);如果不符合,输出 judgement_test=false。 - 1. 检查拼写、语法和格式(例如货币符号、百分比表示),不解释语义。 - 2. 对于每个最小前提(无法进一步分解),验证其是否违反常识、金融领域事实或任务要求(例如,“负利率”在某些情况下可能无效);如果无效,则失败。 - 3. 检查前提之间或推理过程中的任何矛盾,或者最终结果是否明显不合理或不可解;如果是,则失败。 - 4. 
如果以上都通过,检查是否有足够的信息来完成任务;缺少必要条件 ⇒ 失败,冗余细节是可以接受的。 - - # 输出格式 - 完成这些步骤后,输出格式必须为: - {{ - "judgement_test": true/false, - "error_type": "<错误描述或null>" - }} - 你可以包括你的思维过程,但最终输出必须是上面的JSON格式。 - - 这里是需要评估的内容: - ------------------------------- - {question} - ------------------------------- - """ - return prompt -``` -The test code `test_FinanceQuestionFilterPrompt.py` generated by the Agent is as follows: -``` python -""" -Auto-generated by prompt_writer -""" -from dataflow.pipeline import PipelineABC -from dataflow.utils.storage import FileStorage -from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm - -try: - from dataflow.operators.reasoning.filter.reasoning_question_filter import ReasoningQuestionFilter -except Exception: - from dataflow.operators.reasoning import ReasoningQuestionFilter -from finance_question_filter_prompt20260209143556 import FinanceQuestionFilterPrompt - -class RecommendPipeline(PipelineABC): - def __init__(self): - super().__init__() - # -------- FileStorage -------- - self.storage = FileStorage( - first_entry_file_name="./pa_cache/prompt_test_data.jsonl", - cache_path="./pa_cache", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", - ) - # -------- LLM Serving (Remote) -------- - self.llm_serving = APILLMServing_request( - api_url="http://123.129.219.111:3000/v1/chat/completions", - key_name_of_api_key="DF_API_KEY", - model_name="gpt-4o", - max_workers=100, - ) - - self.reasoning_question_filter = ReasoningQuestionFilter(system_prompt='You are a helpful assistant.', llm_serving=self.llm_serving, prompt_template=FinanceQuestionFilterPrompt()) - - def forward(self): - self.reasoning_question_filter.run( - storage=self.storage.step(), input_key='math_problem' - ) - -if __name__ == "__main__": - pipeline = RecommendPipeline() - pipeline.compile() - pipeline.forward() -``` diff --git a/docs/en/notes/guide/agent/pipeline_rec&refine.md b/docs/en/notes/guide/agent/pipeline_rec&refine.md deleted file mode 100644 index 
f7199a0773..0000000000 --- a/docs/en/notes/guide/agent/pipeline_rec&refine.md +++ /dev/null @@ -1,326 +0,0 @@ ---- -title: Pipeline Recommendation & Refinement -createTime: 2026/02/05 22:11:00 -permalink: /en/guide/agent/pipeline_rec&refine/ ---- - -This module contains two closely collaborative core subsystems: - -1. **Pipeline Recommendation**: Responsible for the "0 to 1" process, transforming natural language requirements into complete executable Pipelines. -2. **Pipeline Refinement**: Responsible for the "1 to N" process, fine-tuning existing Pipeline structures based on user feedback. - - -## Part 1: Pipeline Recommendation - -### 1. Overview - -**Pipeline Recommendation** is the core orchestration engine of the DataFlow-Agent. It understands complex business requirements, automatically decomposes task steps, retrieves optimal components from the operator library, plans data flow, and generates executable Python code. - -The system possesses self-healing capabilities: when generated code fails to execute, the Agent proactively consults operator source code documentation, analyzes the cause of the error, and corrects the code until execution succeeds. - -### 2. System Architecture - -This function is orchestrated by `dataflow_agent/workflow/wf_pipeline_recommend_extract_json.py`, forming a directed graph containing multiple levels of intelligent agents. The detailed responsibilities of the nodes are as follows: - -#### 2.1 Analysis and Planning Phase - -1. **Classifier Node** - 1. **Responsibility**: Reads a small number of data samples to identify data types and business domains. This determines the tendency of subsequent operator recommendations. - 2. **Input**: `state.request.json_file` (data file path). - 3. **Output**: `state.category`. -2. **Target Parser Node** - 1. **Core Task (What it does)**: Acts as a business analyst. It does not directly generate code but translates vague user requirements into logically rigorous steps. - 2. 
**Input**: The user's natural language requirement (e.g., "Filter out texts shorter than 10 in the pdf, then deduplicate, and finally extract keywords"). - 3. **LLM Thinking**: Decomposes the requirement into a standard list of steps conforming to data processing logic (e.g., `["Read and parse pdf into plain text", "Filter out text data shorter than 10 characters", "Deduplicate text data to remove repetitive content", "Extract keywords from text data"]`). - 4. **Subsequent Action**: Uses the descriptions of the decomposed steps to retrieve the most similar physical operators from the operator vector database, forming a **candidate operator pool** for use in the next stage. -3. **Recommender Node** - 1. **Core Task**: Responsible for turning scattered candidate operators into an organized execution plan. - 2. **Input**: - * `target`: The user's original requirement. - * `sample`: Data samples (to understand data characteristics, such as field names and formats). - * `split_ops`: The list of candidate operators and their functional descriptions retrieved via RAG by the `target_parser` in the previous step. - 3. **LLM Thinking**: - * **Logical Sequencing**: Each stage is not limited to a single operator but follows the "requirement". - * **Data Compatibility**: If an operator requires field "X" but it does not exist in the sample data, it must ensure an operator creating that field precedes it. - * **Gap Filling**: Can existing operators meet the requirement? If not, a versatile `PromptedGenerator` needs to be inserted. - 4. 
**Output**: An ordered list of operator names and recommendation reasons, such as: - ```json - { - "ops": [ - "Text2SQLQuestionGenerator", - "SQLExecutionFilter", - "SQLConsistencyFilter", - "SQLVariationGenerator", - "Text2SQLQuestionGenerator", - "Text2SQLPromptGenerator", - "Text2SQLCoTGenerator", - "ReasoningQuestionSolvableSampleEvaluator", - "SQLComponentClassifier", - "PromptedGenerator" - ], - "reason": "This pipeline design aims to meet all user requirements. - 1. First, use Text2SQLQuestionGenerator to parse the SQL data file and extract SQL statements and corresponding natural language questions. - 2. Next, use SQLExecutionFilter to execute SQL statements in the database to verify their validity. - 3. Then, use SQLConsistencyFilter for consistency filtering to ensure SQL statements match their corresponding natural language questions. - 4. Next, use SQLVariationGenerator to augment valid SQL statements, including value replacement, increasing syntax difficulty, and changing writing styles. - 5. Subsequently, use Text2SQLQuestionGenerator to generate corresponding natural language questions based on the augmented SQL statements. - 6. Next, use Text2SQLPromptGenerator to generate prompt content, and generate the Chain of Thought reasoning process via Text2SQLCoTGenerator. - 7. Then, use ReasoningQuestionSolvableSampleEvaluator to classify the generated data, assessing the difficulty for large models to solve the problem, and use SQLComponentClassifier to assess the difficulty of SQL components. - 8. Finally, use PromptedGenerator to output synthetic SQL data and its corresponding natural language questions and reasoning processes to ensure all requirements are met." - } - - ``` - - -#### 2.2 Construction and Execution Phase - -1. **Builder Node** - 1. **Responsibility**: Converts the recommendation plan (JSON) into an actual Python code file and launches a subprocess to execute that code. - 2. 
**Mechanism**: Supports creating subprocesses to execute code, capturing standard output (stdout) and standard error (stderr). - 3. **Output**: `state.execution_result` (Success/Fail status and logs). - - - -#### 2.3 Automatic Repair Loop - -When the `builder` execution fails and `need_debug=True`, it enters this loop: - -1. **Debugger Node** - * **Responsibility**: Analyzes the error stack (`error_trace`) and current code to determine the error type (parameter error, logic error, etc.). - - -2. **Info Requester Node** - * **Responsibility**: This is an active learning node. If the Debugger deems information insufficient, it calls tools to read the **source code** or **documentation** of relevant operators to obtain context. - - -3. **Rewriter Node** - 1. **Responsibility**: Synthesizes error logs and source code knowledge found by the InfoRequester to generate the complete repaired code. - 2. **Flow**: The repaired code is sent back to the `builder` for testing until success or the maximum number of retries (`max_debug_rounds`) is reached. - - - -#### 2.4 Output Phase - -* **Exporter Node** - * **Responsibility**: After successful execution, organizes the final Pipeline information, code paths, and data samples, formatting the output for the user. - - - -### 3. User Guide - -This feature provides two modes of usage: **Graphical Interface (Gradio UI)** and **Command Line Script**. - -#### 3.1 Graphical Interface - -The code is located in `gradio_app/pages/pipeline_rec.py`. It is ideal for interactive exploration and rapid validation. To launch the web interface: -```bash -python gradio_app/app.py -``` -Visit `http://127.0.0.1:7860` to start using it. - -1. **Configure Inputs**: - 1. Enter your requirements in the "Target Description" box. - 2. Input the jsonl file to be processed. - 3. Configure API information (URL, Key, Model). - 4. (Optional) Configure embedding model and debug options. - 5. 
Select whether to automatically update the vector index (needs to be checked if operators are not in the registry). - 6. Select whether to use debug mode (debug mode automatically runs and repairs the generated Pipeline code until it succeeds or the maximum number of iteration rounds is reached). - - -2. **Generate Pipeline**: - - Click **"Generate Pipeline"**. - - -3. **View Results**: - 1. **Pipeline Code**: View the final generated pipeline code. - 2. **Execution Log**: View execution log information. - 3. **Agent Results**: Detailed execution results of each Agent node, including the recommended operator list, construction process, etc. - 4. **Pipeline JSON**: Generated Pipeline topology JSON, containing operator node lists and inter-node connection relationships. - - - -#### 3.2 Script Invocation - -For automated tasks or batch generation, it is recommended to directly modify and run `script/run_dfa_pipeline_recommend.py`. - -##### 1. Modify the Configuration - -Open `script/run_dfa_pipeline_recommend.py` and make modifications in the configuration section at the top of the file. - -**API Configuration** - - * **`CHAT_API_URL`**: URL of the LLM service - * **`api_key`**: Access key (using the environment variable DF_API_KEY) - * **`MODEL`**: Model name, default is gpt-4o - -**Task Configuration** - - * **`TARGET`**: Describe your data processing requirements in detail in natural language - * Example: `"Please help me orchestrate a pipeline specifically for large-scale pre-training data cleaning, covering the entire process from deduplication and rewriting to quality filtering"` - * **`TEST_JSON_REL_PATH`**: Relative path of the data file used to test the Pipeline - * Format: One JSON object per line - * Default: `{Project Root Directory}/tests/test.jsonl` - -**Debug Configuration** - - * **`NEED_DEBUG`**: Whether to enable automatic debugging and repair - * **`True`**: The Agent will attempt to run the generated code immediately. 
If an error is reported (e.g., `ImportError`, `KeyError`), it will start the Debugger Agent to analyze the error stack, automatically modify the code and retry - * **`False`**: End immediately after generating and running the code, without automatic debugging and repair - * **`MAX_DEBUG_ROUNDS`**: Maximum number of automatic repair attempts, default is 5 rounds - -**File Configuration** - - * **`CACHE_DIR`**: Result output directory. The generated pipeline code, execution logs, intermediate results, etc., will all be saved here - -##### 2. Run the Script - -```bash -python script/run_dfa_pipeline_recommend.py -``` - -##### 3. Result Output - -After the script is executed, the console will print the execution logs and the final execution status. After the script runs, `my_pipeline.py`, `final_state.json` and `graph.png` will be generated under `CACHE_DIR`. - -#### 3.3 Practical Case: Pre-training Data Cleaning Pipeline - -You can follow the tutorial below, or run the program with the [Google Colab](https://colab.research.google.com/drive/1MMJxRpfYi7Zd-jc_pyhvM1Y2WoQXOFcu?usp=sharing) sample we provide: - -Suppose we have pre-training data `tests/test.jsonl` containing dirty data, and we want to clean it to obtain high-quality data. Open the script and modify the configuration as follows: - -**Scenario Configuration:** - -```python -# ===== Example config (edit here) ===== - -# 1. Define the task flow -TARGET = """ -- 1. Please help me orchestrate a dedicated pipeline for large-scale pre-training data cleaning, covering the entire process from deduplication and rewriting to quality filtering. - 1. Please help me orchestrate a dedicated pipeline for large-scale pre-training data cleaning, covering the entire process from deduplication and rewriting to quality filtering. -- 2. 
In the pre-training phase, raw web data (such as Common Crawl) is often filled with a large amount of noise, advertisements, garbled characters, and duplicate content, resulting in uneven data quality. I need to first perform appropriate rewriting on the raw data, such as removing a large number of excessive spaces, HTML tags, etc. Then, rule-based heuristic filtering needs to be applied to eliminate obviously garbage text, incomplete text, and overly short invalid data. Meanwhile, considering the complexity of online content, I need to filter data in a specified language for training large models. Web data has a high duplication rate, so it is best to use a fuzzy deduplication algorithm to clean up similar documents, leaving only one copy. Finally, to ensure that the model learns high-quality knowledge, I hope to have a quality classification model to score the cleaned data and retain only the content with high educational value, thereby building a high-quality pre-training corpus. -- 3. I need an end-to-end pipeline specifically for processing massive pre-training corpora. First, you can perform basic normalization processing on the raw text, removing excessive spaces, HTML tags, and emojis. Then, use heuristic rules for initial filtering to screen out obviously low-quality text. 
These heuristic rules should cover a wide range, including filtering out text segments with an excessively high symbol/word ratio, text segments containing sensitive words, text segments with an abnormal number of words, incomplete text segments ending with colons/ellipses, text segments with an abnormal number of sentences, empty text, text segments with an abnormal average word length, text segments containing HTML tags, text segments without punctuation marks, text segments with special symbols or watermarks, text segments with an excessively high proportion of parentheses, text segments with an excessively high proportion of uppercase letters, text segments containing lorem ipsum (random dummy text), text segments with an excessively low proportion of independent words, text segments with a small number of characters, text segments starting with bullet points, and text segments containing an excessive amount of Javascript. On this basis, use MinHash or similar algorithms for document-level fuzzy deduplication to significantly reduce data redundancy. Subsequently, use a trained quality assessment model to score and filter the remaining data. Finally, a language identification step can be added to ensure that only high-quality and clean text in the target language is retained in the end. -""" - -# 2. Specify the test data path -TEST_JSON_REL_PATH = "tests/test.jsonl" - -# 3. Enable Debug -NEED_DEBUG = True -MAX_DEBUG_ROUNDS = 5 -``` - -**Run:** -After running the script, the workflow will execute in the following steps: - -1. **Analyze user data and intent**: Analyze the characteristics of the user's data. -2. **Decompose user tasks and recommend operators**: Decompose the user's intent into multiple tasks, retrieve and match operators related to the user's intent. -3. **Generate code**: Analyze the order of requirements, connect these operators in series, and write the pipeline code. -4. **Automatic testing**: Start a child process for trial operation. 
If an error occurs and debug mode is enabled, the Debugger Node will attempt to fix it. -5. **Final delivery**: End the workflow when execution is successful or the maximum number of debug rounds is reached. - -Users can find the generated Pipeline code files and execution log files in the `CACHE_DIR` directory. - - - -## Part 2: Pipeline Refinement - -### 1. Overview - -Pipeline Refinement allows users to fine-tune generated DataFlow Pipelines using natural language. Users do not need to manually modify complex JSON configurations or Python code; simply inputting instructions like "delete the intermediate filter node" allows the system to intelligently parse the intent and automatically adjust the Pipeline's topology. - -### 2. System Architecture - -This function is orchestrated by `dataflow_agent/workflow/wf_pipeline_refine.py`, adopting a three-stage architecture of **Analyzer -> Planner -> Refiner**: - -#### 2.1 Refine Target Analyzer - -* **Core Responsibilities**: - * **Intent Recognition**: Compares the current Pipeline structure (`state.pipeline_structure_code`) and the user's natural language requirement (`target`) to analyze the type of modification the user wishes to make (add, delete, modify). - * **Pre-emptive RAG**: This is a key feature. The Analyzer parses descriptions of sub-operations implied in user requirements and directly calls RAG search `_get_operators_by_rag_with_scores`. It calculates similarity scores, evaluates match quality, and packages the best-matching operator code `code_snippet` and warning messages into `op_contexts`. -* **Input**: `state.pipeline_structure_code` (current pipeline code), `state.request.target` (user modification instruction). -* **Output**: Intent analysis results containing `needed_operators_desc`, and `op_contexts` containing rich context (operator code, match scores). 
- -#### 2.2 Refine Planner - -* **Responsibility**: Based on the intent provided by the Analyzer and the pre-retrieved operator context, formulates a specific **modification plan**. It does not directly modify code but generates structured operational steps. -* **Input**: Analyzer's analysis results (`intent`), operator context (`op_context`), current node summary. -* **Output**: A list of structured operational steps, for example: - * `REMOVE_NODE: node_filter_1` - * `ADD_NODE: node_deduplicate (after node_loader)` - * `UPDATE_EDGE: node_loader -> node_deduplicate`. - - - -#### 2.3 JSON Pipeline Refiner - -* **Responsibility**: Executes the Planner's plan, directly manipulating the Nodes and Edges of the Pipeline's JSON data structure. -* **Tool Enhancement**: This Agent has `search_operator_by_description` and `get_operator_code_by_name` mounted as Post-Tools. Although the Analyzer has already provided `op_context`, if the Refiner finds information insufficient during execution, it can still proactively initiate a search to supplement operator information. -* **Output**: Updated `state.pipeline_structure_code`. - -### 3. User Guide - -This feature provides two modes of usage: **Graphical Interface (Gradio UI)** and **Command Line Script**. - -#### 3.1 Graphical Interface - -This feature is integrated in `gradio_app/pages/pipeline_rec.py`. It is ideal for interactive exploration and rapid validation. To launch the web interface: -```bash -python gradio_app/app.py -``` -Visit `http://127.0.0.1:7860` to start using it. - -1. **Prerequisite**: You must first click "Generate Pipeline" at the top of the page to generate initial pipeline code, at which point `pipeline_json_state` will be initialized. -2. **Input Optimization Instruction**: Enter instructions in the "Optimization Requirement" text box. -3. **Execute Optimization**: Click **"Refine Pipeline"**. The system will display the updated Python code, JSON structure, and Agent execution logs. -4. 
**History Backtracking**: Use "Previous Round" and "Next Round" buttons to switch between different optimization versions and view the code evolution process. -5. **Warning Prompts**: If RAG match quality is low, an `Optimization Warning` comment will be automatically added to the top of the code, alerting the user that the currently generated operator may not fully match the requirement. - -#### 3.2 Script Invocation - -Use `script/run_dfa_pipeline_refine.py` to fine-tune the structure of an existing Pipeline. - -##### 1. Modify the Configuration - -**API Configuration** - - * **`CHAT_API_URL`**: URL of the LLM service - * **`api_key`**: Access key (using the environment variable DF_API_KEY) - * **`MODEL`**: Model name, default is gpt-4o - -**Task Configuration** - - * **`INPUT_JSON`**: Path of the Pipeline structure file to be optimized - * **`OUTPUT_JSON`**: Save path for the optimized Pipeline JSON structure file - * **`TARGET`**: Describe how you want to modify the Pipeline in natural language - * Example: `"Please adjust the Pipeline to contain only 3 nodes and simplify the data flow"` - -##### 2. Run the Script - -```bash -python script/run_dfa_pipeline_refine.py -``` - -#### 3.3 Practical Case: Simplify the Pipeline - -You can refer to the following tutorials for learning, and also use the sample of [Google Colab](https://colab.research.google.com/drive/1MMJxRpfYi7Zd-jc_pyhvM1Y2WoQXOFcu?usp=sharing) we provide to run the program: - -Suppose the Pipeline generated in the previous step is too complex and contains redundant "cleaning" operators, and we want to remove them to simplify the Pipeline. - -**Scenario Configuration:** - -```python -# ===== Example config (edit here) ===== - -# 1. Specify the Pipeline structure file generated in the previous step -INPUT_JSON = "dataflow_agent/tmps/pipeline.json" - -# 2. Issue modification instructions -TARGET = "Please simplify the intermediate cleaning operators and streamline the data flow." - -# 3. 
Specify the result save location -OUTPUT_JSON = "cache_local/pipeline_refine_result.json" -``` - -**Run:** -The Agent will analyze the JSON topology structure of the current Pipeline, find the redundant cleaning nodes, and remove them. - diff --git a/docs/en/notes/guide/agent/web_collection.md b/docs/en/notes/guide/agent/web_collection.md deleted file mode 100644 index d0e17e66d7..0000000000 --- a/docs/en/notes/guide/agent/web_collection.md +++ /dev/null @@ -1,372 +0,0 @@ ---- -title: Web Data Collection -createTime: 2026/02/14 00:00:00 -permalink: /en/guide/agent/web_collection/ ---- - -## 1. Overview - -**Web Collection Agent** is the intelligent data collection module in DataFlow-Agent, designed to automatically collect, process, and format training datasets from the internet. The system supports two data types: - -- **PT (Pre-Training)**: Large-scale unlabeled corpora for model pre-training. -- **SFT (Supervised Fine-Tuning)**: Structured instruction-response pairs for model fine-tuning. - -The workflow is capable of: - -1. **Web Search & Exploration**: Multi-layer BFS forest exploration strategy with LLM-driven URL filtering to automatically discover and locate target datasets. -2. **Multi-Platform Download**: Supports HuggingFace, Kaggle, and direct web download, with LLM intelligently deciding the download priority order. -3. **Dual-Channel Parallel Collection**: WebSearch and WebCrawler pipelines run in parallel, providing richer data sources. -4. **Adaptive Data Mapping**: LLM generates Python mapping functions with a triple-verification mechanism to automatically convert heterogeneous data into standard Alpaca format. - -## 2. System Architecture - -This function is orchestrated by `dataflow_agent/workflow/wf_web_collection.py`, forming a directed graph with parallel branches and conditional loops. The overall process is divided into four phases: task analysis, data collection (parallel), data download, and data processing & mapping. 
- -### 2.1 Task Analysis Phase - -1. **Start Node** - 1. **Responsibility**: Initializes the workflow configuration, creates the download directory, and prepares the execution environment. - 2. **Input**: `state.request.target` (user's original requirement). - 3. **Output**: Initialized `user_query` and download directory. - -2. **Task Decomposer** - 1. **Responsibility**: Uses LLM to decompose complex user requirements into executable subtasks, with a maximum task limit (default 5). - 2. **Input**: User's original query. - 3. **LLM Thinking**: Analyzes the semantic meaning of the requirement and splits it into independent data collection subtasks. - 4. **Output**: `state.task_list`, for example: - - Subtask 1: Collect NLP Q&A datasets - - Subtask 2: Collect text classification datasets - - Subtask 3: Collect image classification datasets - -3. **Category Classifier** - 1. **Responsibility**: Determines whether the current task belongs to the PT or SFT type. - 2. **Input**: Current subtask name. - 3. **LLM Thinking**: Determines the data category based on the task description and generates a dataset background description. - 4. **Output**: `state.category` (`"PT"` or `"SFT"`) and `dataset_background`. - 5. **Fallback Mechanism**: When LLM cannot determine the category, keyword matching is used. SFT keywords include: `["sft", "fine-tuning", "qa", "instruction", "chat", "dialogue"]`. - -### 2.2 Data Collection Phase (Parallel Execution) - -After task analysis is complete, the system enters the `parallel_collection` parallel branch, simultaneously launching two collection pipelines: WebSearch and WebCrawler. - -#### 2.2.1 WebSearch Node - -WebSearch Node is the core data collection node of the system, implementing a complete web exploration and information extraction pipeline with the following core components: - -1. **QueryGenerator** - - **Responsibility**: Generates 3-5 diversified search queries based on the user's original requirement. 
- - **Example**: Input `"Collect Python code generation datasets"`, output: - - `"Python code generation dataset download"` - - `"Python programming instruction dataset HuggingFace"` - - `"code completion training data GitHub"` - -2. **WebTools** - - **search_web()**: Calls search engines (Tavily / DuckDuckGo / Jina) to obtain the initial URL list. - - **read_with_jina_reader()**: Uses Jina Reader to crawl web page content and return structured Markdown-formatted text. - -3. **Multi-Layer BFS Forest Exploration** - - **Algorithm**: Adopts a Breadth-First Search (BFS) strategy to explore web links layer by layer. In each layer, Jina Reader is used to crawl page content, extract candidate URLs, and then URLSelector filters the most relevant links for the next layer. - - **Key Parameters**: - - `max_depth`: Maximum exploration depth (default 2) - - `concurrent_limit`: Number of concurrent requests (default 10) - - `topk_urls`: Number of URLs filtered per layer (default 5) - - `url_timeout`: Request timeout (default 60 seconds) - -4. **URLSelector** - - **Responsibility**: Uses LLM to select the most relevant URLs from the candidate URL list based on the research objective. - - **Filtering Strategy**: Analyzes URL relevance to the research objective, domain credibility, avoids duplicate content, and filters blocked domains. - -5. **RAGManager** - - **Responsibility**: Stores crawled web content into a vector database, supporting subsequent semantic retrieval and providing context for the SummaryAgent. - -6. **SummaryAgent** - - **Responsibility**: Generates specific download subtasks based on RAG-retrieved content. 
- - **Output**: A structured subtask list, for example: - ```json - { - "type": "download", - "objective": "Download Spider Text2SQL dataset", - "search_keywords": ["spider dataset", "text2sql"], - "platform_hint": "huggingface", - "priority": 1 - } - ``` - -#### 2.2.2 WebCrawler Node - -WebCrawler Node specializes in extracting code blocks and technical content from web pages. It runs in parallel with WebSearch Node, providing richer data sources. - -1. **Generate Search Queries**: Creates specialized search queries targeting code/technical content. -2. **Search & Crawl**: Searches the web for URL lists and uses Jina Reader for concurrent page crawling. -3. **Code Block Extraction**: Calls `extract_code_blocks_from_markdown` to extract code blocks from Markdown content. -4. **Save Results**: Stores crawled results as `webcrawler_crawled.jsonl`. - -### 2.3 Data Download Phase - -**Download Node** performs the actual dataset download tasks, supporting three download methods with LLM intelligently deciding the download priority order. - -1. **DownloadMethodDecisionAgent (LLM Decision)** - - **Responsibility**: Analyzes the best download method based on the task objective and outputs a priority list, e.g., `["huggingface", "kaggle", "web"]`. - -2. **Try Each Download Method Sequentially**: - - **HuggingFace**: Searches HuggingFace Hub, LLM selects the best matching dataset, and downloads via API. - - **Kaggle**: Searches Kaggle datasets, LLM selects the best match, and downloads through the Kaggle API. - - **Web**: Uses WebAgent for intelligent web exploration and direct file download. - -3. **Record Download Results**: Updates `state.download_results` with the download status and path for each dataset. - -### 2.4 Data Processing & Mapping Phase - -#### Postprocess Node - -- **Responsibility**: Checks whether there are remaining incomplete subtasks (`check_more_tasks`). If so, loops back to the collection phase; otherwise, proceeds to the mapping phase. 
- -#### Mapping Node - -Mapping Node is responsible for converting collected intermediate-format data into standard Alpaca format, using LLM to generate adaptive Python mapping functions. - -1. **Read Intermediate Data**: Loads raw records from `intermediate.jsonl`. -2. **LLM Generates Mapping Function (Triple Verification)**: - 1. Generates the mapping function 3 times. - 2. Validates consistency on sample data. - 3. Uses the function after passing verification. -3. **Batch Processing**: Executes mapping transformation on all records. -4. **Quality Filtering**: Applies quality filters to remove low-quality data. -5. **Save Results**: Outputs in both `.jsonl` and `.json` formats. - -**Alpaca Format Definition**: - -```json -{ - "instruction": "Task instruction or question", - "input": "Optional input context (e.g., system prompt, SQL Schema)", - "output": "Expected answer or output" -} -``` - -**SFT Data Mapping Rules**: -- `system` role → `input` field -- `user` role → `instruction` field -- `assistant` role → `output` field - -**Mapping Example (Text2SQL)**: - -```json -// Input format -{ - "messages": [ - {"role": "system", "content": "CREATE TABLE farm (Id VARCHAR)"}, - {"role": "user", "content": "How many farms are there?"}, - {"role": "assistant", "content": "SELECT COUNT(*) FROM farm"} - ] -} - -// Output Alpaca format -{ - "instruction": "How many farms are there?", - "input": "CREATE TABLE farm (Id VARCHAR)", - "output": "SELECT COUNT(*) FROM farm" -} -``` - -## 3. 
State Management & Output - -### 3.1 WebCollectionState Core Fields - -```python -@dataclass -class WebCollectionState(MainState): - # Task related - user_query: str # User's original requirement - task_list: List[Dict] # Decomposed task list - current_task_index: int # Current task index - - # Search related - research_summary: str # Research summary - urls_visited: List[str] # Visited URLs - subtasks: List[Dict] # Download subtasks - - # Download related - download_results: Dict # Download result statistics - - # WebCrawler related - webcrawler_crawled_pages: List # Crawled pages - webcrawler_sft_records: List # SFT records - webcrawler_pt_records: List # PT records - - # Mapping related - mapping_results: Dict # Mapping results - intermediate_data_path: str # Intermediate data path -``` - -### 3.2 WebCollectionRequest Configuration - -```python -@dataclass -class WebCollectionRequest(MainRequest): - # Task configuration - category: str = "PT" # PT or SFT - output_format: str = "alpaca" - - # Search configuration - search_engine: str = "tavily" - max_depth: int = 2 - max_urls: int = 10 - concurrent_limit: int = 5 - topk_urls: int = 5 - - # WebCrawler configuration - enable_webcrawler: bool = True - webcrawler_num_queries: int = 5 - webcrawler_crawl_depth: int = 3 - webcrawler_concurrent_pages: int = 3 -``` - -### 3.3 Output File Structure - -``` -web_collection_output/ -├── rag_db/ # RAG vector database -├── hf_datasets/ # HuggingFace downloaded data -│ └── dataset_name/ -├── kaggle_datasets/ # Kaggle downloaded data -├── web_downloads/ # Direct web downloads -├── webcrawler_output/ # WebCrawler crawled results -│ └── webcrawler_crawled.jsonl -├── processed_output/ # Post-processing results -│ └── intermediate.jsonl -└── mapped_output/ # Final mapping results - ├── final_alpaca_sft.jsonl # Alpaca format (JSONL) - └── final_alpaca_sft.json # Alpaca format (JSON) -``` - -## 4. 
User Guide - -This feature provides two modes of usage: **Graphical Interface (Gradio UI)** and **Command Line Script**. - -### 4.1 Graphical Interface - -The front-end page code is located in `gradio_app/pages/web_collection.py`, providing a visual interactive experience. To launch the web interface: - -```bash -python gradio_app/app.py -``` - -Visit `http://127.0.0.1:7860` to start using - -![web_agent](/web_agent.png) - -1. `step1:` Describe the type of data you want to collect in the "Target Description" field -2. `step2:` Select the data category (PT or SFT) -3. `step3:` Configure dataset quantity and size limits -4. `step4:` Configure LLM API information (URL, Key, Model) -5. `step5:` (Optional) Configure Kaggle, Tavily, and other service keys -6. `step6:` Click the **"Start Web Collection & Conversion"** button -7. `step7:` Monitor the execution logs in real time -8. `step8:` Review the result summary after completion -9. `step9:` Check the collected data in the download directory - -**Advanced Usage**: Expand the "Advanced Configuration" section to adjust search engine selection, parallelism, caching strategy, data conversion parameters, etc. - -### 4.2 Script Invocation - -For automated tasks or batch collection, it is recommended to use the command line script `script/run_web_collection.py` directly. - -#### 1. Environment Variable Configuration - -```bash -export DF_API_URL="https://api.openai.com/v1" -export DF_API_KEY="your_api_key" -export TAVILY_API_KEY="your_tavily_key" -export KAGGLE_USERNAME="" -export KAGGLE_KEY="" -export RAG_API_URL="" -export RAG_API_KEY="" -``` - -#### 2. 
Run the Script - -```bash -# Basic usage -python script/run_web_collection.py --target "Collect machine learning Q&A datasets" - -# Full parameters -python script/run_web_collection.py \ - --target "Collect code generation datasets" \ - --category SFT \ - --max-urls 10 \ - --max-depth 2 \ - --download-dir ./my_output -``` - -**Main Parameter Description**: - -- **`--target`**: Data collection target description (required) -- **`--category`**: Data category, `PT` or `SFT` (default `SFT`) -- **`--max-urls`**: Maximum number of URLs (default 10) -- **`--max-depth`**: Maximum crawl depth (default 2) -- **`--output-format`**: Output format (default `alpaca`) - -#### 3. Python API Call - -```python -from dataflow_agent.workflow.wf_web_collection import run_web_collection - -result = await run_web_collection( - target="Collect machine learning code examples", - category="SFT", - output_format="alpaca", - download_dir="./my_output", - model="gpt-4o" -) -``` - -### 4.3 Practical Case: Collecting a Chinese Q&A Dataset - -Suppose we need to build a Chinese Q&A training dataset for a chatbot. Here is the complete workflow. - -**Scenario Configuration:** - -```bash -export DF_API_URL="https://api.openai.com/v1" -export DF_API_KEY="your_api_key" -export TAVILY_API_KEY="your_tavily_key" - -python script/run_web_collection.py \ - --target "Collect Chinese Q&A datasets for fine-tuning" \ - --category SFT \ - --max-urls 20 -``` - -**Run:** -After running the script, the workflow will execute in the following steps: - -1. **Task Decomposition**: LLM decomposes "Collect Chinese Q&A datasets for fine-tuning" into multiple subtasks (e.g., Chinese common knowledge Q&A, Chinese reading comprehension, etc.). -2. **Category Classification**: Based on the "fine-tuning" keyword, automatically classifies as SFT type. -3. **Parallel Collection**: WebSearch explores Chinese QA datasets on platforms such as HuggingFace and GitHub; WebCrawler simultaneously crawls Q&A content from technical blogs. 
-4. **Intelligent Download**: LLM decides to prioritize downloading matching datasets from HuggingFace, falling back to Kaggle and direct web download on failure. -5. **Format Mapping**: Converts the downloaded heterogeneous data into unified Alpaca format, outputting to the `mapped_output/` directory. - -Users can find the final `final_alpaca_sft.jsonl` file in the download directory, ready for direct use in model fine-tuning training. - -### 4.4 Notes - -1. **API Keys** - - Ensure that necessary API keys are configured - - Tavily is used for search; Kaggle is used for downloading Kaggle datasets - -2. **Network Environment** - - If located in China, it is recommended to use a HuggingFace mirror (set `HF_ENDPOINT`) - - Adjust the parallelism to match your network bandwidth - -3. **Storage Space** - - Ensure sufficient disk space is available - - Large datasets may require several GB of storage - -4. **Execution Time** - - The collection process may take a considerable amount of time (minutes to hours) - - You can control the duration by limiting the number of download tasks - -5. **Data Quality** - - Enabling RAG enhancement can improve data quality - - Adjust sampling parameters to balance quality and speed diff --git a/docs/en/notes/guide/quickstart/dataflow_skills.md b/docs/en/notes/guide/quickstart/dataflow_skills.md new file mode 100644 index 0000000000..9c779e5d20 --- /dev/null +++ b/docs/en/notes/guide/quickstart/dataflow_skills.md @@ -0,0 +1,278 @@ +--- +title: DataFlow Skills +icon: material-symbols:auto-awesome +createTime: 2026/05/22 12:45:39 +permalink: /en/guide/quickstart/dataflow_skills/ +--- + +# DataFlow Skills + +Reusable [Claude Code Skills](https://docs.anthropic.com/en/docs/claude-code/skills) for working with [DataFlow](https://github.com/OpenDCAI/DataFlow). 
Three skills are available: + +| Skill | What it does | Invoke with | +|---|---|---| +| **`generating-dataflow-pipeline`** | From a target description + a sample JSONL file, plan the operator chain and emit a runnable DataFlow pipeline. | `/generating-dataflow-pipeline` | +| **`dataflow-dev`** | DataFlow developer assistant. Routes intents (new operator / new pipeline / new prompt / diagnose error / code review / KB sync) into the right workflow. Run inside a DataFlow repo. | `/dataflow-dev` | +| **`core_text`** | Per-operator API reference (8 generators, 3 filters, 2 refiners, 5 evaluators). Consulted by the pipeline skill when it needs operators beyond the 6 core primitives. | _(not directly invoked)_ | + + +## Install + +**Prerequisite:** [Claude Code](https://docs.anthropic.com/en/docs/claude-code) CLI on your `PATH`. + +```bash +git clone https://github.com/haolpku/DataFlow-Skills.git +cd DataFlow-Skills +./install.sh +``` + +That copies all three skills into `~/.claude/skills/` (user-level — available in every project). Then in any Claude Code session: + +``` +/generating-dataflow-pipeline +``` + +If the slash command shows up in completion, you're done. + +### Install options + +```bash +./install.sh --project # install into ./.claude/skills/ instead +./install.sh dataflow-dev # install only the named skill(s) +./install.sh --force # overwrite existing skills (default: skip) +``` + +### Update + +```bash +cd DataFlow-Skills +git pull +./install.sh --force +``` + +## Generating DataFlow Pipeline + +> [Video Tutorial: Generate DataFlow Pipeline](https://github.com/user-attachments/assets/ca1fefbf-9bf7-469f-b856-b201952fb99b) + +A reasoning-guided pipeline planner. Given a **target** (what the pipeline should achieve) and a **sample JSONL file** (1–5 representative rows), it analyzes the data, selects operators, validates field dependencies, and generates a complete, runnable DataFlow pipeline in Python. + +### Quick Start + +#### 1. 
Prepare Your Data + +Create a JSONL file (one JSON object per line) with 1–5 representative rows: + +```jsonl +{"product_name": "Laptop", "category": "Electronics"} +{"product_name": "Coffee Maker", "category": "Appliances"} +``` + +#### 2. Run the Skill + +In Claude Code, invoke `/generating-dataflow-pipeline` and describe your target: + +``` +/generating-dataflow-pipeline +Target: Generate product descriptions and filter high-quality ones +Sample file: ./data/products.jsonl +Expected outputs: generated_description, quality_score +``` + +#### 3. Review the Output + +The skill returns a two-stage result: + +1. **Intermediate Operator Decision** — JSON with operator chain, field flow, and reasoning +2. **Complete 5-Section Response**: + - Field Mapping — which fields exist vs. need to be generated + - Ordered Operator List — operators in execution order with justification + - Reasoning Summary — why this design satisfies the target + - Complete Pipeline Code — full executable Python following standard structure + - Adjustable Parameters / Caveats — tunable knobs and debugging tips + +### Six Core Operators + +| Operator | Purpose | LLM? 
| +|----------|---------|------| +| `PromptedGenerator` | Single-field LLM generation | Yes | +| `FormatStrPromptedGenerator` | Multi-field template-based generation | Yes | +| `Text2MultiHopQAGenerator` | Multi-hop QA pair construction from text | Yes | +| `PromptedFilter` | LLM-based quality scoring & filtering | Yes | +| `GeneralFilter` | Rule-based deterministic filtering | No | +| **KBC Trio** (3 operators, always together in order) | File/URL → Markdown → chunks → clean text | Partial | + +### Generated Pipeline Structure + +All generated pipelines follow the same standard structure: + +```python +from dataflow.operators.core_text import PromptedGenerator, PromptedFilter +from dataflow.serving import APILLMServing_request +from dataflow.utils.storage import FileStorage + +class MyPipeline: + def __init__(self): + self.storage = FileStorage( + first_entry_file_name="./data/input.jsonl", + cache_path="./cache", + file_name_prefix="step", + cache_type="jsonl" + ) + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + max_workers=10 + ) + # Operator instances ... + + def forward(self): + # Sequential operator.run() calls, each with storage.step() + ... + +if __name__ == "__main__": + pipeline = MyPipeline() + pipeline.forward() +``` + +Key rules: +- `first_entry_file_name` is set to the exact user-provided JSONL path +- Each `operator.run()` call uses `storage=self.storage.step()` for checkpointing +- Fields propagate forward: a field must exist in the sample or be output by a prior step before it can be consumed + +## DataFlow Dev + +A developer assistant skill for the DataFlow repo. 
It loads architecture knowledge, probes git state, and routes by intent: + +| Say something like… | Workflow | +|---|---| +| "new filter operator that…" | Operator creation (duplicate check → spec → code + registration reminder) | +| "new pipeline that…" | Pipeline creation with the standard `storage.step()` pattern | +| "new prompt for X" | Prompt creation (`PromptABC` / `DIYPromptABC`, `@prompt_restrict` placement) | +| "I'm getting `KeyError: …`" | Diagnose against known issues (#001–#008) | +| "review this operator" | 14-point checklist (registry, `run()` signature, `get_desc`, etc.) | +| "the upstream repo has new operators" | Compare local files to knowledge base, emit update steps | + +### Operator Creation + +The skill runs a duplicate check first, then confirms the spec with you: +- Operator type (filter / generate / refine / eval) +- Module (general_text / text_sft / reasoning / code / other) +- Whether it depends on LLM +- Input/output column names + +Generated code follows a mandatory checklist: +- Inherits `OperatorABC`, calls `super().__init__()` +- `@OPERATOR_REGISTRY.register()` decorator +- `run()` parameters: `input_*` prefix, `output_*` prefix, `storage: DataFlowStorage` first +- `run()` returns list of output key names +- LLM-driven operators use `self.llm_serving` +- Includes `get_desc(lang)` supporting zh/en + +### Pipeline Creation + +Generated pipelines follow these rules: +- `storage` declared in `__init__`, not in `forward()` +- Each operator call passes `storage=self.storage.step()` +- `max_workers` set according to API capacity +- API keys via environment variables, never hardcoded +- Includes `if __name__ == "__main__":` entry point + +### Error Diagnosis + +Quick match table for common errors: + +| Error keyword | Root cause | +|---|---| +| `Unexpected key 'xxx' in operator` | Config param naming (warning only) | +| `No object named 'Xxx' found in 'operators' registry` | Missing `__init__.py` registration | +| `Key Matching Error` | 
Pipeline key inconsistency | +| `You must call storage.step() before` | Missing `storage.step()` | +| `DummyStorage` + `AttributeError` | DummyStorage doesn't support `get_keys_from_dataframe` | +| `ModuleNotFoundError` + `dataflow.operators.reasoning.refine` | LazyLoader path — import from parent module | + + +## Core Text Operator Reference + +Extended operator reference consulted by the pipeline skill. When the 6 core primitives don't cover your task, these operators are available: + +### Generate + +| Operator | Description | +|----------|-------------| +| `prompted-generator` | Basic single-field LLM generation | +| `format-str-prompted-generator` | Multi-field template-based generation | +| `chunked-prompted-generator` | Long document chunk-by-chunk processing | +| `embedding-generator` | Text vectorization using embedding APIs | +| `retrieval-generator` | Async RAG generation using LightRAG | +| `bench-answer-generator` | Benchmark answer generation with evaluation type variants | +| `text2multihopqa-generator` | Multi-hop QA pair construction from text | +| `random-domain-knowledge-row-generator` | Domain-specific row generation from seed data | + +### Filter + +| Operator | Description | +|----------|-------------| +| `prompted-filter` | LLM-based quality scoring and filtering | +| `general-filter` | Rule-based deterministic filtering | +| `kcentergreedy-filter` | Diversity-based filtering using k-Center Greedy | + +### Refine + +| Operator | Description | +|----------|-------------| +| `prompted-refiner` | LLM-based text rewriting and refinement | +| `pandas-operator` | Custom pandas DataFrame operations | + +### Eval + +| Operator | Description | +|----------|-------------| +| `prompted-evaluator` | LLM-based scoring and evaluation | +| `bench-dataset-evaluator` | Benchmark dataset evaluation | +| `bench-dataset-evaluator-question` | Benchmark question-level evaluation | +| `text2qa-sample-evaluator` | QA sample quality evaluation | +| 
`unified-bench-dataset-evaluator` | Unified benchmark evaluation across formats | + +Each operator folder follows the same layout: + +``` +<operator-name>/ +├── SKILL.md # English documentation +├── SKILL_zh.md # Chinese documentation +└── examples/ + ├── good.md # Correct usage examples + └── bad.md # Common mistakes +``` + + +## Adding a New Operator + +### As an Extended Operator + +1. Create an operator directory with skill definition: + +``` +core_text/<category>/<operator-name>/ +├── SKILL.md +├── SKILL_zh.md +└── examples/ + ├── good.md + └── bad.md +``` + +2. Register the operator in `generating-dataflow-pipeline/SKILL.md`'s **Extended Operator Reference** section. Without this entry, the pipeline generator won't discover your operator. + +### Promoting to a Core Primitive + +If the operator is used frequently enough: + +1. Add to the core primitives list in Preferred Operator Strategy +2. Add a decision table row in Operator Selection Priority Rule +3. Add full constructor and `run()` signatures in Operator Parameter Signature Rule +4. Add the import path in Correct Import Paths +5. Add input pattern matching in Input File Content Analysis Rule (if new data type) + + +## Repository + +GitHub: [https://github.com/OpenDCAI/DataFlow-Skills](https://github.com/OpenDCAI/DataFlow-Skills) \ No newline at end of file diff --git a/docs/zh/notes/guide/agent/DataFlow-AgentPipelineOrchestration.md b/docs/zh/notes/guide/agent/DataFlow-AgentPipelineOrchestration.md deleted file mode 100644 index bf25968db2..0000000000 --- a/docs/zh/notes/guide/agent/DataFlow-AgentPipelineOrchestration.md +++ /dev/null @@ -1,842 +0,0 @@ ---- -title: Agent-快速开始 -icon: carbon:ibm-consulting-advantage-agent -createTime: 2025/06/19 10:29:31 -permalink: /zh/guide/DataFlow-AgentPipelineOrchestration/ - ---- -# DataFlow Agent 快速入门指南 - -本指南将帮助您快速上手 DataFlow Agent 平台的5个核心功能模块。 - - -## 安装 - -```bash -git clone https://github.com/OpenDCAI/DataFlow-Agent.git -cd DataFlow-Agent -pip install -r requirements.txt -pip install -e . 
-``` - -## 启动Web界面 - -```bash -python gradio_app/app.py -``` - -访问 `http://127.0.0.1:7860` 开始使用 - -## 目录 - -1. [管线推荐](#1-管线推荐) -2. [算子编写](#2-算子编写) -3. [手动编排](#3-手动编排) -4. [算子复用/提示词优化](#4-算子复用提示词优化) -5. [Web Search/数据采集](#5-web-search数据采集) - ---- - -## 1. 管线推荐 - -### 功能概述 -根据用户的自然语言描述,自动推荐并生成合适的 DataFlow Pipeline,包括算子选择、参数配置和代码生成。 - -### 使用场景 -- 快速构建数据处理流程 -- 不熟悉具体算子时的智能推荐 -- 自动化 Pipeline 生成 - -### 输入参数 - -#### 基础配置 -- **目标描述** (必需) - - 描述您想要实现的数据处理目标 - - 示例:`"给我随意符合逻辑的5个算子,过滤,去重!"` - - 示例:`"对文本数据进行清洗、去重、分类"` - -- **输入 JSONL 文件路径** (必需) - - 用于测试 Pipeline 的数据文件 - - 格式:每行一个 JSON 对象 - - 默认:`{项目根目录}/tests/test.jsonl` - -- **Session ID** - - 会话标识符,用于缓存和追踪 - - 默认:`"default"` - -#### API 配置 - -**主要模型配置** -- **Chat API URL**: LLM 服务地址 - - 默认:`http://123.129.219.111:3000/v1/` -- **API Key**: 访问密钥 -- **模型名称**: 如 `gpt-4o`, `qwen-max`, `llama3` 等 - - 默认:`gpt-4o` - -**嵌入模型配置** -- **Embedding API URL**: 嵌入模型服务地址(可选,留空则使用主要 API) -- **Embedding 模型名称**: 如 `text-embedding-3-small` - -#### 调试配置 -- **启用调试模式**: 是否启用自动调试和修复 -- **调试模式执行次数**: 1-10 次,默认 2 次 - -### 输出结果 - -#### 1. Pipeline Code (生成的代码) -```python -# 自动生成的 Python 代码 -# 包含完整的 Pipeline 定义和执行逻辑 -``` - -#### 2. Execution Log (执行日志) -- Pipeline 执行过程的详细日志 -- 包含每个算子的执行状态 -- 错误信息和调试信息 - -#### 3. Agent Results (Agent 执行结果) -```json -{ - "recommender": {...}, - "pipeline_builder": {...}, - "operator_executor": {...} -} -``` -- 各个 Agent 节点的详细执行结果 -- 包含推荐的算子列表、构建过程等 - -### 使用步骤 - -![pipeline_rec](/pipeline_rec.png) - -1. `step1:`选择管线推荐子页面 -2. `step2:`在"目标描述"框中输入您的需求 -3. `step3:`输入需要处理jsonl文件 -4. `step4:`配置 API 信息(URL、Key、模型) -5. `step5:`(可选)配置嵌入模型和调试选项 -6. `step6:`选择是否需要自动更新向量索引(如果出现算子不在注册机里,则需要勾选) -7. `step7:`选择是否使用debug模式(debug模式会自动运行管线,直到自大迭代轮次) -8. `step8:`右侧 查看生成的代码和执行结果 - - ---- - -## 2. 
算子编写 - -### 功能概述 -根据用户需求自动生成新的 DataFlow 算子代码,包括算子实现、测试代码和调试。 - -### 使用场景 -- 创建自定义数据处理算子 -- 扩展 DataFlow 功能 -- 快速原型开发 - -### 输入参数 - -#### 基础配置 -- **目标描述** (必需) - - 描述算子的功能和用途 - - 示例:`"创建一个算子,用于对文本进行情感分析"` - - 示例:`"实现一个数据去重算子,支持多字段组合去重"` - -- **算子类别** - - 算子所属类别,用于匹配相似算子作为参考 - - 默认:`"Default"` - - 可选:`"filter"`, `"mapper"`, `"aggregator"` 等 - -- **测试数据文件路径 (JSONL)** - - 用于测试算子的数据文件 - - 默认:`{项目根目录}/tests/test.jsonl` - -#### API 配置 -- **Chat API URL**: LLM 服务地址 -- **API Key**: 访问密钥(留空则使用环境变量 `DF_API_KEY`) -- **模型名称**: 默认 `gpt-4o` - -#### 高级配置 -- **输出语言**: `en` (英文) 或 `zh` (中文) -- **启用调试模式**: 自动执行并修复代码错误 -- **最大调试轮次**: 1-10 次,默认 3 次 -- **输出文件路径**: 保存生成代码的位置(可选) - -### 输出结果 - -#### 1. 生成的代码 -```python -# 完整的算子实现代码 -class YourOperator(Operator): - def __init__(self, ...): - ... - - def run(self, dataset, ...): - ... -``` - -#### 2. 匹配的算子 -```json -[ - { - "op_name": "similar_operator_1", - "similarity": 0.85, - "description": "..." - } -] -``` -- 系统匹配到的相似算子列表 -- 用作参考和学习 - -#### 3. 执行结果 -```json -{ - "success": true, - "output": {...}, - "stderr": "", - "stdout": "..." -} -``` -- 算子的执行状态 -- 输出数据预览 -- 错误信息(如有) - -#### 4. 调试信息 -```json -{ - "round": 2, - "input_key": "text", - "available_keys": ["text", "label"], - "stdout": "...", - "stderr": "..." -} -``` -- 调试过程的详细信息 -- 每轮调试的输入输出 - -#### 5. Agent 结果 -- 各个 Agent 节点的执行详情 -- 包含匹配、编写、执行、调试等阶段 - -#### 6. 执行日志 -- 完整的执行过程日志 -- 包含所有阶段的详细信息 - -### 使用步骤 - -![op_write](/op_write.png) - -1. `step1:` 在"目标描述"中详细说明算子功能 -2. `step2:` 选择合适的算子类别,配置 API 信息 -3. `step3:` (可选)启用调试模式以自动修复错误 -4. `step4:` 设置debug轮次 -5. `step5:` 设置输出jsonl文件路径 - ---- - -## 3. 手动编排 - -### 功能概述 -通过可视化界面手动选择和组装算子,构建自定义 Pipeline,支持拖拽排序和参数配置。 - -### 使用场景 -- 精确控制 Pipeline 结构 -- 复用现有算子 -- 快速原型验证 -- 学习算子使用方法 - -### 输入参数 - -#### API 和文件配置 -- **Chat API URL**: LLM 服务地址 -- **API Key**: 访问密钥 -- **模型名称**: 默认 `gpt-4o` -- **输入 JSONL 文件路径**: 测试数据文件 - -#### 算子选择和配置 - -**步骤 1: 选择算子** -1. 从"算子分类"下拉框选择类别 - - 如:`filter`, `mapper`, `deduplicator` 等 -2. 
从"算子"下拉框选择具体算子 - - 系统会自动显示该算子的参数说明 - -**步骤 2: 配置参数** - -- **Prompt Template (可选)** - - 如果算子支持 Prompt 模板,会显示下拉选择器 - - 选择后自动更新到 `__init__()` 参数中 - -- **`__init__()` 参数 (JSON 格式)** - ```json - { - "param1": "value1", - "param2": 123, - "prompt_template": "module.PromptClass" - } - ``` - - 算子初始化参数 - - 必须是有效的 JSON 对象 - -- **`run()` 参数 (JSON 格式)** - ```json - { - "input_key": "text", - "output_key": "processed_text", - "batch_size": 32 - } - ``` - - 算子运行时参数 - - 必须是有效的 JSON 对象 - -**步骤 3: 添加到 Pipeline** -- 点击"➕ 添加算子到 Pipeline"按钮 -- 算子会被添加到 Pipeline 序列中 - -**步骤 4: 调整顺序** -- 在 Pipeline 可视化区域,可以检查算子前后key是否对其 -- 系统会自动重新编号 - -**步骤 5: 自动链接** -- 系统会自动分析算子间的输入输出关系 -- 显示链接状态: - - 🔗 **已链接**: 输出键成功匹配到下一个算子的输入 - - ⚠️ **待处理**: 输入为空或未匹配 - -### 输出结果 - -#### 1. 当前 Pipeline (可视化展示) -- 每个算子显示为卡片,包含: - - 步骤编号 - - 算子名称 - - `__init__()` 参数预览 - - `run()` 参数预览 - - 与上一步的连接状态 - -#### 2. 当前 Pipeline (JSON 格式) -```json -[ - { - "op_name": "TextCleanerOperator", - "init_params": {...}, - "run_params": {...}, - "_incoming_links": [ - { - "input_key": "text", - "value": "raw_text", - "output_keys": ["output"] - } - ] - } -] -``` - -#### 3. 
生成的代码 -```python -class RecommendPipeline(PipelineABC): - def __init__(self): - super().__init__() - # -------- FileStorage -------- - self.storage = FileStorage( - first_entry_file_name="/tmp/test_sample_10.jsonl", - cache_path="dataflow_cache", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", - ) - # -------- LLM Serving (Remote) -------- - self.llm_serving = APILLMServing_request( - api_url="http://123.129.219.111:3000/v1/chat/completions", - key_name_of_api_key="DF_API_KEY", - model_name="gpt-4o", - max_workers=100, - ) - # -------- Operators -------- - self.condor_generator = CondorGenerator(llm_serving=self.llm_serving, llm_serving=self.llm_serving, num_samples=15, use_task_diversity=True) - self.prompted_generator = PromptedGenerator(llm_serving=self.llm_serving, llm_serving=self.llm_serving, system_prompt='分析样本数据,识别与可再生能源相关的关键主题和趋势。', json_schema=None) - self.task2_vec_dataset_evaluator = Task2VecDatasetEvaluator(llm_serving=self.llm_serving, device='cuda', sample_nums=10, sample_size=1, method='montecarlo', model_cache_dir='./dataflow_cache') - - def forward(self): - self.condor_generator.run( - storage=self.storage.step(), - input_key='raw_content', - output_key='generated_content_1' - ) - self.prompted_generator.run( - storage=self.storage.step(), - input_key='generated_content_1', - output_key='generated_content_2' - ) - self.task2_vec_dataset_evaluator.run( - storage=self.storage.step(), - input_key='generated_content_2' - ) - -if __name__ == "__main__": - pipeline = RecommendPipeline() - pipeline.compile() - pipeline.forward() - -``` - -#### 4. 处理结果数据 (前 100 条) -```json -[ - {"text": "processed text 1", "label": "A"}, - {"text": "processed text 2", "label": "B"}, - ... -] -``` - -#### 5. 输出文件路径 -- 处理后数据的保存位置 - -### 使用步骤 - -![op_assemble](/op_assemble.png) - -1. `step1:` 配置 API 信息和输入文件路径 -2. `step2:` 配置APIKey -3. `step3:` 配置模型 -4. `step4:` 选择待处理文件路径 -5. `step5:` 选择要组合的算子类别 -6. `step6:` 选择要组合的算子 -7. `step7:` 如果算子提供了prompttemplate需要选择 -8. 
`step8:` 编辑算子输入和输出key!! -9. `step9:` 运行 -10. `step10:` 可以查看组装的代码,和处理结果数据,以及输出文件路径 - -### 高级技巧 - -- **清空 Pipeline**: 点击"🗑️ 清空 Pipeline"按钮 -- **参数复用**: 系统会自动将上一个算子的输出键链接到下一个算子的输入 -- **调试**: 如果执行失败,检查日志中的错误信息,调整参数后重试 - ---- - -## 4. 算子复用/提示词优化 - -### 功能概述 -PromptAgent 前端,用于生成和优化算子的 Prompt 模板,支持多轮对话式改写和测试。 - -### 使用场景 -- 为算子创建高质量的 Prompt 模板 -- 优化现有 Prompt 的效果 -- 快速迭代 Prompt 设计 -- 生成测试代码和数据 - -### 输入参数 - -#### 运行配置 -- **Chat API Base URL**: LLM 服务地址 - - 默认:`http://123.129.219.111:3000/v1/` -- **Chat API Key**: 访问密钥 -- **Model**: 模型名称,默认 `gpt-4o` -- **Language**: 提示词语言,`zh` (中文) 或 `en` (英文) - -#### Prompt 配置 -- **任务描述** (必需) - - 详细描述 Prompt 要完成的任务 - - 示例:`"对用户输入的文本进行情感分析,判断是正面、负面还是中性"` - - 示例:`"将产品描述改写为更吸引人的营销文案"` - -- **算子名称 (op-name)** (必需) - - Prompt 类的名称 - - 示例:`SentimentAnalysisPrompt` - -- **输出格式** (可选) - - 指定 Prompt 输出的格式 - - 示例: - ``` - { - "sentiment": "positive/negative/neutral", - "confidence": 0.95 - } - ``` - -- **参数列表** (可选) - - Prompt 模板需要的参数,用逗号、空格或换行分隔 - - 示例:`text, language, style` - - 示例: - ``` - input_text - target_audience - tone - ``` - -- **文件输出根路径** (可选) - - 保存生成文件的目录 - - 默认:`./pa_cache` - -- **生成后删除测试文件** - - 是否在生成后删除测试文件(保留路径占位) - - 默认:启用 - -### 输出结果 - -#### 1. Prompt 文件路径 -- 生成的 Prompt 模板文件位置 -- 示例:`./pa_cache/prompts/SentimentAnalysisPrompt.py` - -#### 2. 测试数据文件路径 -- 自动生成的测试数据文件 -- 示例:`./pa_cache/test_data/test_data.jsonl` - -#### 3. 测试代码文件路径 -- 自动生成的测试代码 -- 示例:`./pa_cache/tests/test_prompt.py` - -#### 4. 测试数据预览 -```json -[ - {"text": "这个产品真不错!", "language": "zh"}, - {"text": "质量太差了", "language": "zh"}, - {"text": "还可以吧", "language": "zh"} -] -``` - -#### 5. 测试结果预览 -```json -[ - { - "input": {"text": "这个产品真不错!"}, - "output": { - "sentiment": "positive", - "confidence": 0.92 - } - } -] -``` - -#### 6. Prompt 代码预览 -```python -class SentimentAnalysisPrompt(PromptTemplate): - """情感分析 Prompt 模板""" - - def __init__(self): - super().__init__() - self.system_prompt = "你是一个情感分析专家..." 
- self.user_prompt_template = "请分析以下文本的情感:{text}" - - def format(self, text: str, **kwargs) -> str: - return self.user_prompt_template.format(text=text) -``` - -### 多轮改写功能 - -在右侧对话区域,您可以: - -1. **查看初次生成结果** - - Prompt 代码 - - 测试结果 - -2. **提出改进建议** - - 在对话输入框中描述您希望如何修改 - - 示例: - - `"增加对讽刺语气的识别"` - - `"输出格式改为只返回 positive/negative/neutral 字符串"` - - `"添加置信度阈值,低于 0.7 时返回 uncertain"` - -3. **发送改写指令** - - 点击"发送改写指令"按钮 - - 系统会根据反馈重新生成 Prompt - -4. **迭代优化** - - 查看更新后的代码和测试结果 - - 继续提出改进建议 - - 重复直到满意 - -5. **清空会话** - - 点击"清空会话"按钮重新开始 - -### 使用步骤 - -![prompt_agent](/prompt_agent.png) - -1. `step1:` 选择你要复用的带有prompttemplate的算子名称 -2. `step2:` 输入你想修改的提示词内容 -3. `step3:` 点击“生成提示词模板” -4. `step4:` 右侧预览生成的“输出文件路径,测试数据,提示词模板代码,测试代码” - - -#### 初次生成 -1. 配置 API 信息(URL、Key、模型) -2. 填写任务描述、算子名称 -3. (可选)指定输出格式和参数列表 -4. 点击"生成 Prompt 模板"按钮 -5. 查看生成的 Prompt 代码和测试结果 - -#### 多轮优化 -1. 在右侧对话框中输入改进建议 -2. 点击"发送改写指令" -3. 查看更新后的代码和测试结果 -4. 重复步骤 1-3 直到满意 - -#### 使用生成的 Prompt -1. 从"Prompt 文件路径"获取文件位置 -2. 将 Prompt 类导入到您的算子中 -3. 在算子的 `__init__()` 中指定 `prompt_template` - ---- - -## 5. Web Search/数据采集 - -### 功能概述 -从网络(HuggingFace、Kaggle 等平台)自动采集数据集,并转换为统一格式,支持智能搜索、下载和数据清洗。 - -### 使用场景 -- 快速构建训练数据集 -- 收集特定领域的数据 -- 数据集格式转换 -- 批量下载和处理 - -### 输入参数 - -#### 采集配置 -- **目标描述** (必需) - - 描述您想要收集的数据类型 - - 示例:`"收集 Python 代码示例的数据集"` - - 示例:`"收集中文对话数据,用于训练聊天机器人"` - - 示例:`"收集图像分类数据集,包含猫和狗的图片"` - -- **数据类别** - - `PT`: 预训练数据(Pre-Training) - - `SFT`: 监督微调数据(Supervised Fine-Tuning) - - 默认:`SFT` - -- **数据集数量上限(每关键词)** - - 每个搜索关键词返回的数据集数量 - - 范围:1-50 - - 默认:5 - - 注意:仅用于参考,实际数量可能因搜索结果而异 - -- **数据集大小范围** - - 筛选数据集的大小范围 - - 选项: - - `n<1K`: 小于 1000 条 - - `1K1M`: 大于 1000000 条 - - 默认:`1K **注意:显式配置要求** 与 UI 的“自动链接”不同,脚本模式下您必须**显式配置**所有参数。您需要确保上一个算子的 `output_key` 与下一个算子的 `input_key` 严格匹配,脚本不会自动为您纠正参数名。 - -#### 3. 运行脚本 - -```Bash -python script/run_dfa_op_assemble.py -``` - -#### 4. 
结果输出 - -脚本执行后,控制台将打印: - -- **[Generation]**: 生成的 Pipeline 代码路径。 -- **[Code Preview]**: 生成代码的前 20 行预览。 -- **[Execution]**: 执行情况。 - -### 3.3 实战 Case:通用文本推理与伪答案生成 - -你可以参考以下教程学习,也可以参考我们提供的[Google Colab](https://colab.research.google.com/drive/1W3Wb1sTyea1xDAGmVu3Tyn7fcvrsppAp?usp=sharing)样例来运行: - -我们有一个 `tests/test.jsonl` 文件,里面每行都有一个 `"raw_content"` 字段。我们希望:基于该字段的通用英文文本内容,先调用大语言模型针对文本内容生成推理式答案,再通过多轮生成候选答案并统计选优的方式生成伪答案,最终输出候选答案列表、最优伪答案、对应推理过程及典型正确推理示例等关键字段。所以我们选择 `ReasoningAnswerGenerator` 和 `ReasoningPseudoAnswerGenerator` 两个算子来编排 Pipeline。 - -以下是完整的配置示例: - -```Python -# [Pipeline 定义] -PIPELINE_STEPS = [ - { - "op_name": "ReasoningAnswerGenerator", - "params": { - # __init__ 参数 (注意:在 wf_df_op_usage 中统一合并为 params) - "prompt_template": "dataflow.prompts.reasoning.general.GeneralAnswerGeneratorPrompt", - # run 参数 - "input_key": "raw_content", - "output_key": "generated_cot" - } - }, - { - "op_name": "ReasoningPseudoAnswerGenerator", - "params": { - "max_times": 3, - "input_key": "generated_cot", - "output_key_answer": "pseudo_answers", - "output_key_answer_value": "pseudo_answer_value", - "output_key_solutions": "pseudo_solutions", - "output_key_correct_solution_example": "pseudo_correct_solution_example" - } - } -] -``` -配置完成后,在终端执行: - -```Bash -python script/run_dfa_op_assemble.py -``` -脚本会自动完成以下动作: - -1. 构建图:解析您的 PIPELINE_STEPS。 -2. 生成代码:将配置转换为标准的 Python 代码,存储在 `dataflow_cache/generated_pipelines/` 下。 -3. 执行任务:启动子进程运行生成的 Pipeline。 -4. 输出报告:终端会显示 [Execution] Status: success 以及代码部分预览。 - -您可以直接去 `CACHE_DIR` 目录下查看生成的 JSONL 结果文件,验证数据是否符合预期。 \ No newline at end of file diff --git a/docs/zh/notes/guide/agent/operator_qa.md b/docs/zh/notes/guide/agent/operator_qa.md deleted file mode 100644 index 69f362aecb..0000000000 --- a/docs/zh/notes/guide/agent/operator_qa.md +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: 算子问答 -createTime: 2026/02/05 22:11:00 -permalink: /zh/guide/agent/operator_qa/ ---- - -## 1. 
概述 - -**算子智能问答 (Operator QA)** 是 DataFlow-Agent 平台内置的垂直领域专家助手。它的核心使命是帮助用户快速在海量的 DataFlow 算子库中找到所需的工具,理解其用法,并查看底层源码。 - -不同于通用的聊天机器人,Operator QA 集成了 **RAG(检索增强生成)** 技术。它挂载了 DataFlow 项目的完整算子索引(FAISS)和元数据知识库。当用户提问时,Agent 会自主决定是否需要检索知识库,检索哪些算子,并将准确的技术细节(包括代码片段、参数说明)反馈给用户。 - -## 2. 核心特性 - -该功能模块由前端 UI (`gradio_app/pages/operator_qa.py`)、执行工作流 (`dataflow_agent/workflow/wf_operator_qa.py`) 和后端智能体 (`dataflow_agent/agentroles/data_agents/operator_qa_agent.py`) 共同驱动,具备以下核心能力: - -### 2.1 智能检索与推荐 - -Agent 并非简单地进行关键词匹配,而是基于语义理解用户的需求。 - -- **语义搜索**:用户只需描述“我想过滤掉缺失值”,Agent 会通过向量检索找到 `ContentNullFilter`等相关算子。 -- **按需调用**:基于 `BaseAgent` 的图模式 (`use_agent=True`),Agent 会根据对话上下文自动判断是否需要调用 `search_operators` 工具,或者直接基于上下文回答。 - -### 2.2 多轮对话 - -利用 `AdvancedMessageHistory` 模块,系统维护了完整的会话上下文。 - -- **上下文记忆**:用户可以先问“有哪些加载数据的算子?”,接着问“**它**的参数怎么填?”。Agent 能识别“它”指的是上一轮推荐的算子。 -- **状态保持**:在脚本交互模式或 UI 中,通过复用同一个 `state` 和 `graph` 实例,`messages` 列表会在多轮对话中累积,确保 LLM 拥有完整记忆。 - -### 2.3 可视化与交互 - -- **Gradio UI**:提供代码预览、算子高亮和快捷提问按钮。 -- **交互**:支持多轮问答,支持清除历史、查看历史等。 - -## 3. 架构组件 - -### 3.1 OperatorQAAgent - -- 继承自 `BaseAgent`,配置为 ReAct/Graph 模式。 -- 它拥有后置工具(Post-Tools)权限,可以调用 RAG 服务获取数据。 -- 它负责解析用户的自然语言,规划是否查库,并生成最终的自然语言回复。 - -### 3.2 OperatorRAGService - -- 这是一个与 Agent 解耦的服务层。 -- 管理 FAISS 向量索引和 `ops.json` 元数据。 -- 提供 `search`(向量搜索)、`get_operator_info`(获取详情)、`get_operator_source`(获取源码)等底层能力。 - -## 4. 使用指南 - -本功能提供 **图形界面 (Gradio UI)** 和 **命令行脚本** 两种使用方式。 - -### 4.1 界面操作 - -适合交互式探索和快速验证。启动 Web 界面: -```python -python gradio_app/app.py -``` -访问 `http://127.0.0.1:7860` 开始使用 - -1. **配置模型**:在右侧“配置”面板确认 API URL 和 Key,并选择模型(默认使用 `gpt-4o`)。 -2. **发起提问**: - 1. **对话框**:输入你的问题。 - 2. **快捷按钮**:也可以点击下方的“快捷问题”按钮,如“过滤缺失值用什么算子?”快速开始。 -3. **查看结果**: - 1. **对话区**:显示 Agent 的回答和引用来源。 - 2. **右侧面板**: - - `相关算子`:列出 Agent 检索到的算子名称。 - - `代码片段`:如果涉及具体实现,这里会显示 Python 源码。 - -### 4.2 脚本调用与显式配置 - -除了 UI 界面,系统提供了 `script/run_dfa_operator_qa.py` 脚本。这种方式适合开发调试,或者通过代码自动化地查询算子用法。 - -#### 1. 
修改配置 - -打开 `script/run_dfa_operator_qa.py`,在文件顶部的配置区域进行修改。 - -**API 和文件配置** - -* **CHAT_API_URL**: LLM 服务地址。 -* **API_KEY**: 模型调用密钥。Agent 需要调用大模型来理解您的问题并总结答案。 -* **MODEL**: 模型名称,默认为 `gpt-4o`。 -* **CACHE_DIR**: 缓存目录。 -* **TOP_K**: 检索深度。指定 Agent 在知识库中检索相关算子时,最多返回多少个候选结果(默认 5 个)。 - -**查询与交互模式配置** - -* **INTERACTIVE**: **交互控制开关**(`True` / `False`)。 - * `True` (交互模式):启动终端内的连续对话模式,您可以像聊天一样不断追问,支持 `clear` 清除历史。 - * `False` (单次模式):脚本只执行 `QUERY` 指定的一个问题,输出结果后立即结束。 -* **QUERY**: 单次查询的问题内容。仅在 `INTERACTIVE = False` 时生效。 -* **OUTPUT_JSON**: 结果保存路径。 - * 仅在单次模式下生效。 - * 如果设置了路径,Agent 的回答、检索到的算子列表及代码片段会被完整保存为 JSON 文件;留空则只打印到控制台。 - - - -#### 2. 运行脚本 - -配置完成后,在终端执行: - -```bash -python script/run_dfa_operator_qa.py - -``` - -#### 3. 结果输出 - -脚本执行后,根据模式不同,控制台会有不同表现: - -* **交互模式**:终端会出现 `🧑 你:` 提示符,等待输入。 - * 输入 `exit` 或 `quit` 退出。 - * 输入 `clear` 清除对话历史 - * 输入 `history` 查看历史对话。 -* **单次模式**:控制台将直接打印 Agent 的思考过程、检索到的算子列表以及最终回答。如果配置了 `OUTPUT_JSON`,还会提示文件保存成功。 - -### 4.3 实战 Case:查找“清洗数据”的算子 - -你可以参考以下教程学习,也可以参考我们提供的[Google Colab](https://colab.research.google.com/drive/1maDKWp-3zEQNScmL_S7MHUdUC1xyCIcK?usp=sharing)样例来运行: - -假设您在开发 Pipeline 时遇到数据需要清洗,想知道 DataFlow 库里有没有现成的算子可以处理。 - -**场景配置:** 我们将其设置为单次查询模式,并指定将结果保存到本地,以便后续在代码中查看详细参数。 - -打开脚本修改如下配置: - -```python -# ===== Example config (edit here) ===== - -# 1. 关闭交互模式,执行单次查询 -INTERACTIVE = False - -# 2. 定义您的具体需求 -QUERY = "我想清洗数据,应该用哪个算子?" - -# 3. 确保 API 配置正确 -CHAT_API_URL = os.getenv("DF_API_URL", "http://123.129.219.111:3000/v1/") -API_KEY = os.getenv("DF_API_KEY", "") -MODEL = os.getenv("DF_MODEL", "gpt-4o") - -# 4. 指定结果保存位置 -OUTPUT_JSON = "cache_local/operator_qa_result.json" - - -``` - -**运行:** - -运行脚本后,Agent 会执行 RAG 检索并生成回答。打开生成的 `script/cache_local/operator_qa_result.json`,您可以看到如下结构的数据: - -```json -{ - "success": true, - "query": "我想清洗数据,应该用哪个算子?", - "answer": "对于数据清洗任务,您可以考虑使用以下算子:\n\n1. **KBCTextCleanerBatch**:用于对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。\n\n2. **KBCTextCleaner**:类似于KBCTextCleanerBatch,专注于知识内容的标准化处理。\n\n3. 
**HtmlUrlRemoverRefiner**:去除文本中的URL链接和HTML标签,净化文本内容。\n\n4. **RemoveNumberRefiner**:移除文本中的数字字符,保留纯文本内容。\n\n5. **ReferenceRemoverRefiner**:删除文本中未闭合的引用标签和引用链接,净化文本中的引用标记。", - "related_operators": [ - "KBCTextCleanerBatch", - "KBCTextCleaner", - "HtmlUrlRemoverRefiner", - "RemoveNumberRefiner", - "ReferenceRemoverRefiner" - ], - "code_snippet": null, - "follow_up_suggestions": [ - "您需要清洗哪种类型的数据?", - "是否需要了解某个算子的详细参数配置?", - "需要查看算子的源码实现吗?" - ], - "messages": [ - { - "type": "SystemMessage", - "content": "\n[角色]\n你是 DataFlow 算子库的智能问答助手。你的职责是帮助用户了解和使用 DataFlow 中的各种数据处理算子。\n\n[能力]\n1. 根据用户描述的需求,推荐合适的算子\n2. 解释算子的功能、用途和使用场景\n3. 详细说明算子的参数含义和配置方法\n4. 在需要时展示算子的源码实现\n5. 基于多轮对话理解用户的上下文需求\n\n[DataFlow 算子简介]\nDataFlow 是一个数据处理框架,提供了丰富的算子用于数据清洗、过滤、生成、评估等任务。\n每个算子都是一个 Python 类,通常包含:\n- `__init__` 方法:初始化算子,配置必要的参数(如 LLM 服务、提示词等)\n- `run` 方法:执行数据处理逻辑,接收输入数据并产出处理结果\n\n[可用工具]\n你可以调用以下工具来获取算子信息:\n\n1. **search_operators(query, top_k)** - 根据功能描述搜索相关算子\n - 当用户询问某类功能的算子时使用\n - 如果对话历史中已有相关算子信息,可以不调用直接回答\n\n2. **get_operator_info(operator_name)** - 获取指定算子的详细描述\n - 当用户询问特定算子的功能时使用\n\n3. **get_operator_source_code(operator_name)** - 获取算子的完整源代码\n - 当用户需要了解算子实现细节时使用\n\n4. **get_operator_parameters(operator_name)** - 获取算子的参数详情\n - 当用户询问算子如何配置、参数含义时使用\n\n[工具调用策略]\n- 如果是新问题且对话历史中没有相关信息 → 调用 search_operators 检索\n- 如果对话历史中已有相关算子信息 → 可以直接回答,无需重复检索\n- 如果用户追问某个算子的细节 → 调用 get_operator_info/get_operator_source_code/get_operator_parameters\n\n[回答风格]\n1. 清晰简洁,重点突出\n2. 使用中文回答(除非用户要求英文)\n3. 对于技术细节,提供具体的代码示例\n4. 
在解释参数时,说明参数类型、默认值和作用\n\n[输出格式]\n请以 JSON 格式返回,包含以下字段:\n{\n \"answer\": \"对用户问题的详细回答\",\n \"related_operators\": [\"相关算子名称列表\"],\n \"source_explanation\": \"说明答案的信息来源,例如:'通过search_operators检索到的XXX算子'、'基于对话历史中的算子信息'、'基于我的知识库'\",\n \"code_snippet\": \"如有必要,提供代码片段(可选)\",\n \"follow_up_suggestions\": [\"可能的后续问题建议(可选)\"]\n}\n\n\n请以JSON格式返回结果,不要包含其他文字说明!!!直接返回json内容,不要```json进行包裹!!", - "role": "", - "additional_kwargs": {}, - "metadata": {} - }, - { - "type": "HumanMessage", - "content": "\n[用户问题]\n我想清洗数据,应该用哪个算子?\n\n[任务]\n请根据用户问题回答。对话历史会自动包含在消息中,你可以参考之前的对话。\n\n工具调用指南:\n1. 如果需要查找算子,调用 search_operators 工具\n2. 如果需要某个算子的详细信息,调用 get_operator_info 工具\n3. 如果需要源码,调用 get_operator_source_code 工具\n4. 如果需要参数详情,调用 get_operator_parameters 工具\n5. 如果之前的对话中已有相关信息,可以直接回答,无需重复调用工具\n\n回答要求:\n- 基于工具返回的信息或对话上下文中的信息回答\n- 在 source_explanation 中说明答案来源\n- 如果问题不明确,可以在 follow_up_suggestions 中给出澄清建议\n\n请以 JSON 格式返回你的回答。\n", - "role": "", - "additional_kwargs": {}, - "metadata": {} - }, - { - "type": "AIMessage", - "content": "", - "role": "", - "additional_kwargs": { - "tool_calls": [ - { - "id": "call_DZer3f6W1WLsvDeHBktupSXw", - "function": { - "arguments": "{\"query\":\"数据清洗\"}", - "name": "search_operators" - }, - "type": "function" - } - ], - "refusal": null - }, - "metadata": {} - }, - { - "content": "{\n \"query\": \"数据清洗\",\n \"matched_operators\": [\n \"KBCTextCleanerBatch\",\n \"KBCTextCleaner\",\n \"HtmlUrlRemoverRefiner\",\n \"RemoveNumberRefiner\",\n \"ReferenceRemoverRefiner\"\n ],\n \"operator_details\": [\n {\n \"node\": 1,\n \"name\": \"KBCTextCleanerBatch\",\n \"description\": \"知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\\n1. 移除冗余HTML标签但保留语义化标签\\n2. 标准化引号/破折号等特殊字符\\n3. 处理超链接同时保留文本\\n4. 保持原始段落结构和代码缩进\\n5. 确保事实性内容零修改\",\n \"category\": \"knowledge_cleaning\"\n },\n {\n \"node\": 2,\n \"name\": \"KBCTextCleaner\",\n \"description\": \"知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\\n1. 移除冗余HTML标签但保留语义化标签\\n2. 标准化引号/破折号等特殊字符\\n3. 
处理超链接同时保留文本\\n4. 保持原始段落结构和代码缩进\\n5. 确保事实性内容零修改\\n\\n输入格式示例:\\n
\\n

标题文本

\\n

正文段落,包括特殊符号,例如“弯引号”、–破折号等

\\n \\\"示意图\\\"\\n 链接文本\\n
代码片段
\\n ...\\n
\\n\\n输出格式示例:\\n标题文本\\n\\n正文段落,包括特殊符号,例如\\\"直引号\\\"、-破折号等\\n\\n[Image: 示例图 example.jpg]\\n\\n链接文本\\n\\n代码片段\\n\\n[结构保持,语义保留,敏感信息脱敏处理(如手机号、保密标记等)]\",\n \"category\": \"knowledge_cleaning\"\n },\n {\n \"node\": 3,\n \"name\": \"HtmlUrlRemoverRefiner\",\n \"description\": \"去除文本中的URL链接和HTML标签,净化文本内容。使用正则表达式匹配并移除各种形式的URL和HTML标签。输入参数:\\n- input_key:输入文本字段名\\n输出参数:\\n- 包含净化后文本的DataFrame\\n- 返回输入字段名,用于后续算子引用\",\n \"category\": \"general_text\"\n },\n {\n \"node\": 4,\n \"name\": \"RemoveNumberRefiner\",\n \"description\": \"该算子用于移除文本中的数字字符,包括0-9的阿拉伯数字。\\n通过字符过滤实现数字移除,保留纯文本内容。\\n输入参数:\\n- 无初始化参数\\n运行参数:\\n- input_key:输入文本字段名\\n输出参数:\\n- 处理后的DataFrame,包含去除数字的文本\\n- 返回包含输入字段名的列表,用于后续算子引用\",\n \"category\": \"general_text\"\n },\n {\n \"node\": 5,\n \"name\": \"ReferenceRemoverRefiner\",\n \"description\": \"删除文本中未闭合的引用标签和引用链接,包括标签和{{cite}}模板的各种完整和不完整形式。净化文本中的引用标记。输入参数:\\n- input_key:输入文本字段名\\n输出参数:\\n- 包含移除引用标记后文本的DataFrame\\n- 返回输入字段名,用于后续算子引用\",\n \"category\": \"general_text\"\n }\n ]\n}", - "additional_kwargs": {}, - "response_metadata": {}, - "type": "tool", - "name": "search_operators", - "id": "63bb6bcd-5184-456f-91d8-72fb06b9cdc0", - "tool_call_id": "call_DZer3f6W1WLsvDeHBktupSXw", - "artifact": null, - "status": "success" - }, - { - "type": "AIMessage", - "content": "{\n \"answer\": \"对于数据清洗任务,您可以考虑使用以下算子:\\n\\n1. **KBCTextCleanerBatch**:用于对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。\\n\\n2. **KBCTextCleaner**:类似于KBCTextCleanerBatch,专注于知识内容的标准化处理。\\n\\n3. **HtmlUrlRemoverRefiner**:去除文本中的URL链接和HTML标签,净化文本内容。\\n\\n4. **RemoveNumberRefiner**:移除文本中的数字字符,保留纯文本内容。\\n\\n5. 
**ReferenceRemoverRefiner**:删除文本中未闭合的引用标签和引用链接,净化文本中的引用标记。\",\n \"related_operators\": [\n \"KBCTextCleanerBatch\",\n \"KBCTextCleaner\",\n \"HtmlUrlRemoverRefiner\",\n \"RemoveNumberRefiner\",\n \"ReferenceRemoverRefiner\"\n ],\n \"source_explanation\": \"通过search_operators检索到的相关算子\",\n \"code_snippet\": null,\n \"follow_up_suggestions\": [\n \"您需要清洗哪种类型的数据?\",\n \"是否需要了解某个算子的详细参数配置?\",\n \"需要查看算子的源码实现吗?\"\n ]\n}", - "role": "", - "additional_kwargs": { - "refusal": null - }, - "metadata": {} - } - ] -} - -``` - diff --git a/docs/zh/notes/guide/agent/operator_write.md b/docs/zh/notes/guide/agent/operator_write.md deleted file mode 100644 index 201629d7c8..0000000000 --- a/docs/zh/notes/guide/agent/operator_write.md +++ /dev/null @@ -1,272 +0,0 @@ ---- -title: 算子编写 -createTime: 2026/02/05 22:11:00 -permalink: /zh/guide/agent/operator_write/ ---- - -## 1. 概述 - -**算子编写 (Operator Write)** 是 DataFlow-Agent 的核心生产力模块。它不仅仅是根据用户需求生成一段 Python 代码,而是构建了一个闭环的**生成-执行-调试**系统。 - -该工作流能够: - -1. **语义匹配**:理解用户意图(如“过滤缺失值”),在现有算子库中寻找最匹配的基类。 -2. **代码生成**:基于基类和用户数据样例,编写可执行的算子代码。 -3. **自动注入**:如果需要,为算子注入 LLM 服务能力。 -4. **子进程执行**:在一个受控环境中实例化并运行生成的算子。 -5. **自我修复**:如果执行报错,启动 Debugger 分析堆栈信息,自动修改代码并重试,直到成功或达到最大重试次数。 - -## 2. 核心特性 - -### 2.1 智能代码生成 - -- **基于样例编程**:Agent 会读取实际的数据样例 (调用前置工具`local_tool_for_sample`) 和数据 Schema,确保生成的代码能够正确处理真实的字段名和数据类型。 -- **算子复用**:系统优先检索现有的算子库(调用前置工具`match_operator`),生成继承自现有基类的代码,而不是从零开始,保证了代码的规范性和可维护性。 - -### 2.2 自动调试闭环 - -这是一个具备自我反思能力的系统。 - -- **执行监控**:在 `llm_instantiate` 节点,系统尝试执行生成的代码 (`exec(code_str)`) 并捕获标准输出和标准错误。 -- **错误诊断**:如果发生异常,`code_debugger` Agent 会分析错误堆栈 (`error_trace`) 和当前代码,生成修复建议 (`debug_reason`)。 -- **自动重写**:`rewriter` Agent 根据修复建议重新生成代码,并自动更新文件,进入下一轮测试。 - -### 2.3 LLM 服务注入 - -对于需要调用大模型的复杂算子(如“根据内容生成摘要”),`llm_append_serving` 节点会自动在算子代码中注入标准的 LLM 调用接口 (`self.llm_serving`),使其具备 AI 能力。 - -## 3. 工作流架构 - -该功能由 `dataflow_agent/workflow/wf_pipeline_write.py` 编排,形成一个包含条件循环的有向图。 - -1. **Match Node**: 检索参考算子。 -2. **Write Node**: 编写初始代码。 -3. 
**Append Serving Node**: 注入 LLM 能力。 -4. **Instantiate Node**: 尝试运行代码。 -5. **Debugger Node** (条件触发): 分析错误。 -6. **Rewriter Node**: 修复代码。 - -## 4. 使用指南 - -本功能提供 **图形界面 (Gradio UI)** 和 **命令行脚本** 两种使用方式。 - -### 4.1 界面操作 - -前端页面代码位于 `gradio_app/pages/operator_write.py`,提供了可视化的交互体验,适合交互式探索和快速验证。启动 Web 界面: -```python -python gradio_app/app.py -``` -访问 `http://127.0.0.1:7860` 开始使用 - -#### 1. 配置输入 - -在页面的左侧面板进行配置: - -- **目标描述**: 详细描述您想要创建的算子功能和用途。 - - 示例: "创建一个算子,用于对文本进行情感分析。" -- **算子类别**: 算子所属类别,用于匹配相似算子作为参考,默认为 `"Default"`,可选:`"filter"`, `"mapper"`, `"aggregator"` 等。 -- **测试数据文件**: 指定用于测试生成的算子的 `.jsonl` 文件路径。默认为项目自带的 `tests/test.jsonl`。 -- **调试设置**: - - `启用调试模式 (Enable Debug Mode)`: 勾选后,如果代码报错,系统会自动尝试修复。 - - `最大调试轮次`: 设置自动修复的最大尝试次数(默认 3 次)。 -- **输出路径**: 指定生成代码的保存路径(可选)。 - -#### 2. 查看结果 - -点击 **"生成算子"** 按钮后,右侧面板会展示详细结果: - -- **生成的代码**: 最终可用的 Python 代码,支持语法高亮 -- **匹配的算子**: 显示系统在算子库中找到的参考算子列表(如 `"LangkitSampleEvaluator"`,`"LexicalDiversitySampleEvaluator"`,`"PresidioSampleEvaluator"`,`"PerspectiveSampleEvaluator"`等) -- **执行结果**: 显示 `success: true/false` 以及具体的日志信息`stdout`/`stderr`。 -- **调试信息**: 如果触发了调试,这里会显示运行时捕获的 `stdout`/`stderr` 以及选定的输入字段键名 (`input_key`) -- **Agent结果:** 各个 Agent 节点的详细执行结果 -- **执行日志**: 完整的执行日志信息,方便排查 Agent 的思考过程 - -### 4.2 脚本调用与显式配置 - -对于开发者,推荐直接修改并运行 `script/run_dfa_operator_write.py`。这种方式可以更灵活地集成到自动化流程中,并保存生成的算子文件。 - -#### 1. 
修改配置 - -打开 `script/run_dfa_operator_write.py`,在文件顶部的配置区域修改参数。 - -**任务配置** - * **`TARGET`**: 用自然语言描述算子的功能。描述越具体,生成的代码越准确。建议包含对输入字段和预期输出的描述。 - * 示例:`"创建一个算子,用于对文本进行情感分析"` - * 示例:`"实现一个数据去重算子,支持多字段组合去重"` - * **`CATEGORY`**: 算子所属类别,用于匹配相似算子作为参考 - * 默认:`"Default"` - * 可选:`"reasoning"`, `"agentic_rag"`, `"knowledge_cleaning"` 等 - * **`JSON_FILE`**: 用于测试算子的数据文件(`.jsonl` 格式)。 - * 默认:留空则使用项目自带的测试数据`tests/test.jsonl`。 - * **`OUTPUT_PATH`**: 生成的 Python 代码保存路径。如果留空,代码只会打印在控制台,不会保存文件。 - -**API 与 调试配置** - * **`CHAT_API_URL`**: LLM 服务地址 - * **`api_key`**: 访问密钥(使用环境变量 DF_API_KEY) - * **`MODEL`**: 模型名称,默认 gpt-4o - * **`NEED_DEBUG`**: 是否开启自动调试循环 (`True` / `False`) - * `True`:如果生成的代码在 `JSON_FILE` 上运行报错,Agent 会自动分析错误堆栈并尝试重写代码 - * `False`:生成代码并执行后立即结束,不管是否能运行成功 - * **`MAX_DEBUG_ROUNDS`**: 最大自动修复次数,默认 3 次 - -#### 2. 运行脚本 - -配置完成后,在终端执行: - -```bash -python script/run_dfa_operator_write.py - -``` - -#### 3. 结果输出 - -脚本执行过程中会输出以下关键信息: - -* **[Match Operator Result]**: 显示 Agent 在现有算子库中找到的“参考算子” -* **[Writer Result]**: 生成的代码长度和保存位置 -* **[Execution Result]**:代码执行结果 - * `Success: True`:表示代码生成成功,且在测试数据上运行无误。 - * `Success: False`:表示运行失败。 -* **[Debug Runtime Preview]**:运行时捕获的 `stdout`/`stderr` 以及选定的输入字段键名 (`input_key`) - -### 4.3 实战 Case:编写一个情感分析算子 - -你可以参考以下教程学习,也可以参考我们提供的[Google Colab](https://colab.research.google.com/drive/1oTkwMNwxMFGAe9rNtYCC47CQ9HxsA0uH?usp=sharing)样例来运行: - -我们有一个日志文件 `tests/test.jsonl`,其中包含字段 `"raw_content"`。我们希望创建一个算子,对该字段的文本内容进行情感分析。 - -**配置示例:** - -```python -# ===== Example config (edit here) =====、 -# API KEY 通过设置环境变量 DF_API_KEY 传入 -CHAT_API_URL = os.getenv("DF_API_URL", "http://123.129.219.111:3000/v1/") -MODEL = os.getenv("DF_MODEL", "gpt-4o") -LANGUAGE = "en" - -# 1. 定义具体需求 -TARGET = "创建一个算子,用于对文本进行情感分析" -CATEGORY = "Default" -# 2. 指定结果保存路径 -OUTPUT_PATH = "cache_local/my_operator.py" -# 3. 指定测试数据路径 -JSON_FILE = "tests/test.jsonl" -# 4. 
开启调试 -NEED_DEBUG = True -MAX_DEBUG_ROUNDS = 10 - -``` - -**运行:** -运行脚本后,终端会给出以下输出: -``` bash -==== Match Operator Result ==== -Matched ops: ['LangkitSampleEvaluator', 'LexicalDiversitySampleEvaluator', 'PresidioSampleEvaluator', 'PerspectiveSampleEvaluator'] - -==== Writer Result ==== -Code length: 3619 -Saved to: cache_local/my_operator.py - -==== Execution Result (instantiate) ==== -Success: True - -==== Debug Runtime Preview ==== -input_key: raw_content -available_keys: ['raw_content'] -[debug stdout] - [selected_input_key] raw_content - -[debug stderr] -Generating......: 100%|######### | 18/20 [00:08<00:00, 3.34it/s] -``` -生成的代码保存到 `script/cache_local/my_operator.py`中,打开可以查看生成的代码: -``` python -from dataflow.core import OperatorABC -from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.utils.storage import DataFlowStorage, FileStorage -from dataflow import get_logger -from dataflow.serving import APILLMServing_request -import pandas as pd - -@OPERATOR_REGISTRY.register() -class SentimentAnalysisOperator(OperatorABC): - def __init__(self, llm_serving=None): - self.logger = get_logger() - self.logger.info(f'Initializing {self.__class__.__name__}...') - self.llm_serving = llm_serving - self.score_name = 'SentimentScore' - self.logger.info(f'{self.__class__.__name__} initialized.') - - @staticmethod - def get_desc(lang: str = "zh"): - if lang == "zh": - return ( - "使用LLM进行文本情感分析,返回情感得分,得分越高表示情感越积极。\n" - "输入参数:\n" - "- llm_serving:LLM服务对象\n" - "- input_key:输入文本字段名\n" - "- output_key:输出得分字段名,默认'SentimentScore'\n" - "输出参数:\n" - "- 包含情感分析得分的DataFrame" - ) - else: - return ( - "Perform sentiment analysis on text using LLM, returning sentiment scores where higher scores indicate more positive sentiment.\n" - "Input Parameters:\n" - "- llm_serving: LLM serving object\n" - "- input_key: Field name for input text\n" - "- output_key: Field name for output score, default 'SentimentScore'\n" - "Output Parameters:\n" - "- DataFrame containing sentiment analysis 
scores" - ) - - def get_score(self, samples: list[dict], input_key: str) -> list[float]: - texts = [sample.get(input_key, '') or '' for sample in samples] - return self.llm_serving.generate_from_input(texts) - - def eval(self, dataframe: pd.DataFrame, input_key: str) -> list[float]: - self.logger.info(f"Evaluating {self.score_name}...") - samples = dataframe.to_dict(orient='records') - scores = self.get_score(samples, input_key) - self.logger.info("Evaluation complete!") - return scores - - def run(self, - storage: DataFlowStorage, - input_key: str | None = None, - output_key: str = 'SentimentScore'): - dataframe = storage.read("dataframe") - if input_key is None: - input_key = self._auto_select_input_key(dataframe) - dataframe[output_key] = self.eval(dataframe, input_key) - storage.write(dataframe) - - def _auto_select_input_key(self, dataframe: pd.DataFrame) -> str: - preferred_keys = ['raw_content', 'text', 'content', 'sentence', 'instruction', 'input', 'query', 'problem', 'prompt'] - for key in preferred_keys: - if key in dataframe.columns and dataframe[key].notnull().any(): - return key - return dataframe.columns[0] - -# Runnable entry code - -test_data_path = '/root/autodl-tmp/DataFlow-Agent/tests/test.jsonl' - -# Initialize FileStorage -storage = FileStorage(first_entry_file_name=test_data_path, cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl") -storage = storage.step() - -# Initialize llm_serving -llm_serving = APILLMServing_request(api_url="http://123.129.219.111:3000/v1/chat/completions", key_name_of_api_key="DF_API_KEY", model_name="gpt-4o") - -# Select input key -available_keys = ['raw_content'] -preselected_input_key = 'raw_content' -input_key = preselected_input_key if preselected_input_key in available_keys else available_keys[0] -print(f"[selected_input_key] {input_key}") - -# Instantiate and run the operator -operator = SentimentAnalysisOperator(llm_serving=llm_serving) -operator.run(storage=storage, 
input_key=input_key) -``` \ No newline at end of file diff --git a/docs/zh/notes/guide/agent/pipeline_prompt.md b/docs/zh/notes/guide/agent/pipeline_prompt.md deleted file mode 100644 index ca45d317c7..0000000000 --- a/docs/zh/notes/guide/agent/pipeline_prompt.md +++ /dev/null @@ -1,235 +0,0 @@ ---- -title: 算子复用/提示词优化 -createTime: 2026/02/05 22:11:00 -permalink: /zh/guide/agent/pipeline_prompt/ ---- - -## 1. 概述 - -**提示词优化**是 DataFlow-Agent 面向 **Prompt-Engineering** 的核心模块。它的设计目标是解决“通用算子逻辑复用”的问题。 - -该模块采用**单节点架构**。当用户提出一个新的数据处理需求时,Agent 不仅会编写符合算子规范的 Prompt,还会**自动生成合成测试数据**,并在内部构建和运行测试脚本。 - -## 2. 核心特性 - -### 2.1 基于范例的生成 - -- **算子代码分析**:Agent 会自动读取目标算子(`OP_NAME`)的源码,提取其参数定义。 -- **Prompt 迁移**:系统检索算子库中已有的 Prompt 案例,将其作为上下文,指导 LLM 生成符合该算子接口规范(如 `init` 参数结构)的新 Prompt 类。 - -### 2.2 基于合成数据的自我验证 - -生成的 Prompt 不会仅停留在文本层面,Agent 会立即对其进行测试。 - -- **数据合成**:Agent **不需要**用户的大规模业务数据进行测试,而是利用 LLM 分析算子逻辑,自动生成一组覆盖多种边界情况的**合成测试数据**,保存为临时的 JSONL 文件。 -- **子进程执行**:Agent 内部会自动构建一个临时 Python 测试脚本,并启动子进程执行该脚本,验证生成的 Prompt 是否能正确跑通并产生预期结果。 - -### 2.3 迭代优化 - -- **交互式反馈**:系统不进行盲目的自动重试。用户在前端查看测试结果、生成的 Prompt 和数据预览后,输入修改意见。 -- **定向热更新**:后端 `PromptWriter` 接收反馈后,调用 `revise_with_feedback` 方法,在保持现有上下文的基础上定向修改 Prompt 代码,并自动触发新一轮的测试循环。 - -## 3. 系统架构 - -该功能由 `dataflow_agent/workflow/wf_pipeline_prompt.py` 定义,核心为一个**单节点**工作流。所有的生成与验证逻辑均高度内聚在 `PromptWriter` Agent 中。 - -### 3.1 核心节点流程 - -**Prompt Writer Node** 是图中唯一的节点,它按顺序执行以下内部逻辑: - -1. **上下文检索**: 调用 Pre-tools 获取目标算子的源码、用户的target 和 Prompt 范例。 -2. **提示词生成**: 调用 LLM 生成 Python 形式的 Prompt 类代码。随后通过 `update_state_result` 方法将生成的代码保存到 `state` 对象并写入本地文件,为后续测试步骤提供依赖。 -3. **测试数据合成**: 调用内部方法 `_build_test_data_by_llm`,根据任务描述生成合成测试数据。 -4. **测试脚本构建**: 调用内部方法 `_build_test_code`,利用字符串模板生成临时的测试脚本。 -5. **子进程执行**: 使用 `subprocess` 运行测试脚本,捕获标准输出 (stdout) 和标准错误 (stderr)。 -6. **测试结果输出**: 扫描读取子进程运行生成的测试结果文件,将测试结果更新到 `state.temp_data` 中,完成流程。 - -### 3.2 迭代优化机制 - -优化过程依赖于**前端交互**: - -1. 用户在 UI 查看执行结果。 -2. 用户提交反馈。 -3. 前端调用 `_on_chat_submit`,触发 Agent 的 `revise_with_feedback` 接口。 -4. 
Agent 根据反馈修改代码并重新复用执行上述的验证阶段,即测试数据合成 -> 测试脚本构建 -> 子进程执行。 - -## 4. 使用指南 - -本功能提供 **图形界面 (Gradio UI)** 和 **命令行脚本** 两种使用方式。 - -### 4.1 图形界面 - -前端代码位于 `gradio_app/pages/PA_frontend.py`,提供了可视化的交互体验,适合交互式探索和快速验证。启动 Web 界面: -```python -python gradio_app/app.py -``` -访问 `http://127.0.0.1:7860` 开始使用 - -**初次生成:** - -1. 配置 API 信息(URL、Key、模型) -2. 填写任务描述、算子名称 -3. (可选)指定输出格式、参数列表和文件输出根路径 -4. 点击"生成 Prompt 模板"按钮 -5. 查看 Agent 生成的测试数据、测试结果、Prompt 代码和测试代码 - -**多轮优化:** - -1. 如果结果不符合预期,在右侧对话框中输入改进建议 -2. 点击"发送改写指令" -3. 查看更新后的代码和测试结果 -4. 重复步骤 1-3 直到获得满意结果 - -**使用生成的 Prompt:** - -1. 从"Prompt 文件路径"获取生成的 Prompt 文件位置 -2. 将 Prompt 类导入到您的算子中 -3. 在算子的 `init()` 中指定 `prompt_template` - -### 4.2 脚本调用与显式配置 - -对于需要将 Prompt 生成集成到自动化流水线,或者习惯代码配置的开发者,可以使用 `script/run_dfa_pipeline_prompt.py`。 - -#### 1. 修改配置 - -打开 `script/run_dfa_pipeline_prompt.py`,在文件顶部的配置区域进行修改。 - -**API 配置** - * **`CHAT_API_URL`**: LLM 服务地址 - * **`api_key`**: 访问密钥(使用环境变量 DF_API_KEY) - * **`MODEL`**: 模型名称,默认 gpt-4o - -**任务配置** - * **`TASK_DESCRIPTION`**: 用自然语言描述您希望这个 Prompt 完成什么任务 - * 示例:`"我想写一个适用于金融问题的过滤器提示词."` - * **`OP_NAME`**: 指定生成的 Prompt 将被哪个算子加载使用 - * **`OUTPUT_FORMAT`** (可选): 指定 Prompt 输出的格式。如果不填,Agent 会仿照已有提示词生成 - * **`ARGUMENTS`** (可选): Prompt 模板需要的参数,用逗号、空格或换行分隔 - * 示例:`["min_len=10", "drop_na=true"]` - -**环境配置** - * **`CACHE_DIR`**: 结果输出目录。生成的 Prompt 文件(`.py`)、临时的测试数据、测试代码等都会保存在这里 - * **`DELETE_TEST_FILES`**: 运行结束后是否自动清理临时的合成测试数据(`True`/`False`) - -#### 2. 运行脚本 - -配置完成后,在终端执行: - -```bash -python script/run_dfa_pipeline_prompt.py - -``` - -#### 3. 结果输出 - -脚本执行完毕后,控制台会打印生成过程。您可以在 `CACHE_DIR` 目录下找到生成的文件 - -### 4.3 实战 Case:复用ReasoningQuestionFilter过滤器,编写适用金融问题的过滤器提示词 - -你可以参考以下教程学习,也可以参考我们提供的[Google Colab](https://colab.research.google.com/drive/1cU5Eg6tuc7WVDG33tU9Wplza52e54kts?usp=sharing)样例来运行: - -假设我们想复用系统中的 `ReasoningQuestionFilter` 算子,让它变成为一个金融领域问题的过滤器。打开脚本修改如下配置: - -```python -# ===== Example config (edit here) ===== - -# 1. 定义任务 -TASK_DESCRIPTION = "我想写一个适用于金融问题的过滤器提示词" - -# 2. 
指定复用的算子 (告诉 Agent 这个 Prompt 是给 PromptedGenerator 用的) -OP_NAME = "ReasoningQuestionFilter" - -# 这两项在算子不拥有任何一个预置提示词时才需要提供,否则会仿照已有提示词生成 -OUTPUT_FORMAT = "" # e.g. "Return JSON with keys: ..." -ARGUMENTS = [] # e.g. ["min_len=10", "drop_na=true"] - -# 缓存目录,用于存储测试数据和提示词 -CACHE_DIR = "./pa_cache" -DELETE_TEST_FILES = False - -``` - -**运行:** - -运行脚本后,终端会输出执行的日志,您可以在 `CACHE_DIR` 目录下找到生成的 Prompt 文件`finance_question_filter_prompt20260209143556.py`、测试代码`test_FinanceQuestionFilterPrompt.py`以及测试数据,生成的 Prompt 内容如下: -``` python -__all__ = ['FinanceQuestionFilterPrompt'] - -from dataflow.core.prompt import DIYPromptABC - -class FinanceQuestionFilterPrompt(DIYPromptABC): - def __init__(self): - pass - - def build_prompt(self, question: str) -> str: - prompt = f""" - # 角色: - 你是一个金融问题的审核助手。 - # 任务 - 你的任务是检查给定的金融问题是否符合以下标准: - 0. 首先,确认输入仅包含一个明确的金融问题(没有额外的指令如“重写”、“翻译”或提供的答案);如果不符合,输出 judgement_test=false。 - 1. 检查拼写、语法和格式(例如货币符号、百分比表示),不解释语义。 - 2. 对于每个最小前提(无法进一步分解),验证其是否违反常识、金融领域事实或任务要求(例如,“负利率”在某些情况下可能无效);如果无效,则失败。 - 3. 检查前提之间或推理过程中的任何矛盾,或者最终结果是否明显不合理或不可解;如果是,则失败。 - 4. 
如果以上都通过,检查是否有足够的信息来完成任务;缺少必要条件 ⇒ 失败,冗余细节是可以接受的。 - - # 输出格式 - 完成这些步骤后,输出格式必须为: - {{ - "judgement_test": true/false, - "error_type": "<错误描述或null>" - }} - 你可以包括你的思维过程,但最终输出必须是上面的JSON格式。 - - 这里是需要评估的内容: - ------------------------------- - {question} - ------------------------------- - """ - return prompt -``` -Agent 生成的测试代码`test_FinanceQuestionFilterPrompt.py`如下: -``` python -""" -Auto-generated by prompt_writer -""" -from dataflow.pipeline import PipelineABC -from dataflow.utils.storage import FileStorage -from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm - -try: - from dataflow.operators.reasoning.filter.reasoning_question_filter import ReasoningQuestionFilter -except Exception: - from dataflow.operators.reasoning import ReasoningQuestionFilter -from finance_question_filter_prompt20260209143556 import FinanceQuestionFilterPrompt - -class RecommendPipeline(PipelineABC): - def __init__(self): - super().__init__() - # -------- FileStorage -------- - self.storage = FileStorage( - first_entry_file_name="./pa_cache/prompt_test_data.jsonl", - cache_path="./pa_cache", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", - ) - # -------- LLM Serving (Remote) -------- - self.llm_serving = APILLMServing_request( - api_url="http://123.129.219.111:3000/v1/chat/completions", - key_name_of_api_key="DF_API_KEY", - model_name="gpt-4o", - max_workers=100, - ) - - self.reasoning_question_filter = ReasoningQuestionFilter(system_prompt='You are a helpful assistant.', llm_serving=self.llm_serving, prompt_template=FinanceQuestionFilterPrompt()) - - def forward(self): - self.reasoning_question_filter.run( - storage=self.storage.step(), input_key='math_problem' - ) - -if __name__ == "__main__": - pipeline = RecommendPipeline() - pipeline.compile() - pipeline.forward() -``` \ No newline at end of file diff --git a/docs/zh/notes/guide/agent/pipeline_rec&refine.md b/docs/zh/notes/guide/agent/pipeline_rec&refine.md deleted file mode 100644 index 
2c566c2a42..0000000000 --- a/docs/zh/notes/guide/agent/pipeline_rec&refine.md +++ /dev/null @@ -1,311 +0,0 @@ ---- -title: 智能 Pipeline 推荐与优化 -createTime: 2026/02/05 22:11:00 -permalink: /zh/guide/agent/pipeline_rec&refine/ ---- - - -本模块包含两个紧密协作的核心子系统: - -1. **智能 Pipeline 推荐 (Recommendation)**:负责“从 0 到 1”,将自然语言需求转化为完整的可执行 Pipeline。 -2. **Pipeline 迭代优化 (Refinement)**:负责“从 1 到 N”,基于用户反馈对现有 Pipeline 结构进行微调。 - -## 第一部分:Pipeline 推荐 (Pipeline Recommendation) - -### 1. 概述 - -**Pipeline 推荐** 是 DataFlow-Agent 的核心编排引擎。它能够理解复杂的业务需求,自动拆解任务步骤,从算子库中检索最佳组件,规划数据流向,并生成可执行的 Python 代码。 - -该系统具备自我修复能力:在生成代码执行失败时,Agent 会主动查阅算子源码文档,分析错误原因并修正代码,直至执行成功。 - -### 2. 系统架构 - -该功能由 `dataflow_agent/workflow/wf_pipeline_recommend_extract_json.py` 编排,形成一个包含多级智能体的有向图。以下是详细的节点职责说明: - -#### 2.1 分析与规划阶段 - -1. **Classifier Node** - 1. **职责**: 读取少量数据样例,识别数据类型和业务领域。这决定了后续推荐算子的倾向性。 - 2. **输入**: `state.request.json_file` (数据文件路径)。 - 3. **输出**: `state.category`。 -2. **Target Parser Node** - 1. **核心任务 (What it does)**: 充当业务分析的角色。它不直接生成代码,而是将用户模糊的需求转化为逻辑严密的步骤。 - 2. **输入**: 用户的自然语言需求(例如:“过滤掉pdf中长度小于10的文本,然后去重,最后提取关键词”)。 - 3. **LLM 思考**: 将需求拆解为标准的、符合数据处理逻辑的步骤列表(如 `["读取解析pdf成纯文本", "过滤掉长度小于10个字符的文本数据", "对文本数据进行去重处理,移除重复内容","从文本数据中提取关键词"]`)。 - 4. **后续动作**: 利用拆解出的步骤描述,去算子向量数据库中检索最相似的物理算子,形成**候选算子池**,供下一阶段使用。 -3. **Recommender Node** - 1. **核心任务**: 负责将散乱的候选算子变成有序的执行方案。 - 2. **输入**: - - `target`: 用户的原始需求。 - - `sample`: 数据样本(了解数据特征,如字段名、格式)。 - - `split_ops`: 上一步 `target_parser` 通过 RAG 检索出来的候选算子列表及其功能描述。 - 3. **LLM** **思考**: - - **逻辑排序**: 每个阶段不是只能有一个算子,而是遵循 “需求” - - **数据兼容性**: 若某算子需要字段“X”但样例数据中不存在,必须确保在它之前有算子创建该字段 - - **查漏补缺**: 现有算子能满足需求吗?如果不行,需要插入一个万能的 `PromptedGenerator` - 4. 
**输出**: 一个有序的算子名称列表以及推荐理由,如 - ```JSON - { - "ops": [ - "Text2SQLQuestionGenerator", - "SQLExecutionFilter", - "SQLConsistencyFilter", - "SQLVariationGenerator", - "Text2SQLQuestionGenerator", - "Text2SQLPromptGenerator", - "Text2SQLCoTGenerator", - "ReasoningQuestionSolvableSampleEvaluator", - "SQLComponentClassifier", - "PromptedGenerator" - ], - "reason": "该流水线设计旨在满足用户的所有需求。 - 1. 首先,通过 Text2SQLQuestionGenerator 解析 SQL 数据文件并提取 SQL 语句和对应的自然语言问题。 - 2. 接着,使用 SQLExecutionFilter 在数据库中执行 SQL 语句以验证其有效性。 - 3. 然后,使用 SQLConsistencyFilter 进行一致性过滤,确保 SQL 语句与其对应的自然语言问题一致。 - 4. 接下来,使用 SQLVariationGenerator 对有效的 SQL 语句进行扩增,包括替换数值、提高语法难度和更改书写方式。 - 5. 随后,使用 Text2SQLQuestionGenerator 基于扩增后的 SQL 语句生成对应的自然语言问题。 - 6. 接着,使用 Text2SQLPromptGenerator 生成 Prompt 提示词内容,并通过 Text2SQLCoTGenerator 生成思维链推理过程。 - 7. 然后,使用 ReasoningQuestionSolvableSampleEvaluator 对生成的数据进行分类,评估大模型解决问题的难度,并使用 SQLComponentClassifier 评估 SQL 组成部分的难度。 - 8. 最后,使用 PromptedGenerator 输出合成的 SQL 数据及其对应的自然语言问题和推理过程,以确保所有需求得到满足。" - } - ``` - -#### 2.2 构建与执行阶段 - -1. **Builder Node** - 1. **职责**: 将推荐方案(JSON)转化为实际的 Python 代码文件,并启动子进程执行该代码。 - 2. **机制**: 支持创建子进程执行代码,捕获标准输出 (stdout) 和标准错误 (stderr)。 - 3. **输出**: `state.execution_result` (Success/Fail 状态及日志)。 - -#### 2.3 自动修复闭环 - -当 `builder` 执行失败且 `need_debug=True` 时,进入此循环: - -1. **Debugger Node** - - - **职责**: 分析错误堆栈 (`error_trace`) 和当前代码,判断错误类型(参数错误、逻辑错误等)。 -2. **Info Requester Node** - - - **职责**: 这是一个主动学习节点。如果 Debugger 认为信息不足,它会调用工具读取相关算子的**源代码**或**文档**,获取上下文信息。 -3. **Rewriter Node** - 1. **职责**: 综合错误日志和 InfoRequester 查到的源码知识,生成修复后的完整代码。 - 2. **流转**: 修复后的代码会再次送入 `builder` 进行测试,直到成功或达到最大重试次数 (`max_debug_rounds`)。 - -#### 2.4 输出阶段 - -- **Exporter Node** - - - **职责**: 执行成功后,整理最终的 Pipeline 信息、代码路径及数据样例,格式化输出给用户。 - -### 3. 使用指南 - -本功能提供 **图形界面 (Gradio UI)** 和 **命令行脚本** 两种使用方式。 - -#### 3.1 图形界面 - -代码位于 `gradio_app/pages/pipeline_rec.py`,适合交互式探索和快速验证。启动 Web 界面: -```python -python gradio_app/app.py -``` -访问 `http://127.0.0.1:7860` 开始使用 - -1. **配置输入**: - 1. 在"目标描述"框中输入您的需求 - 2. 
输入需要处理jsonl文件 - 3. 配置 API 信息(URL、Key、模型) - 4. (可选)配置嵌入模型和调试选项 - 5. 选择是否需要自动更新向量索引(如果出现算子不在注册机里,则需要勾选) - 6. 选择是否使用debug模式(debug模式会自动运行生成的 Pipeline 代码,直到最大迭代轮次) -2. **生成pipeline**: - - 点击 **" Generate Pipeline"**。 -3. **结果查看**: - 1. **Pipeline Code**: 查看最终生成的pipeline 代码 - 2. **Execution Log**: 查看执行的日志信息 - 3. **Agent Results:** 各个 Agent 节点的详细执行结果,包含推荐的算子列表、构建过程等 - 4. **Pipeline JSON:** 生成的Pipeline拓扑结构JSON,包含算子节点列表和节点间连接关系 - -#### 3.2 脚本调用 - -对于自动化任务或批量生成,推荐直接修改并运行 `script/run_dfa_pipeline_recommend.py`。 -##### 1. 修改配置 - -打开 `script/run_dfa_pipeline_recommend.py`,在文件顶部的配置区域进行修改。 - -**API 配置** - - * **`CHAT_API_URL`**: LLM 服务地址 - * **`api_key`**: 访问密钥(使用环境变量 DF_API_KEY) - * **`MODEL`**: 模型名称,默认 gpt-4o - -**任务配置** - - * **`TARGET`**: 用自然语言详细描述您的数据处理需求 - * 示例:`"请帮我编排一个专门用于大规模预训练数据清洗的流水线,涵盖从去重、改写到质量过滤的全过程"` - * **`TEST_JSON_REL_PATH`**: 用于测试 Pipeline 的数据文件的相对路径 - * 格式:每行一个 JSON 对象 - * 默认:`{项目根目录}/tests/test.jsonl` - -**调试配置** - - * **`NEED_DEBUG`**: 是否启用自动调试和修复 - * **`True`**: Agent 生成代码后会立即尝试运行。如果报错(如 `ImportError`, `KeyError`),它会启动 Debugger Agent 分析错误堆栈,自动修改代码并重试 - * **`False`**:生成代码运行后立即结束,不进行自动调试和修复 - * **`MAX_DEBUG_ROUNDS`**: 最大自动修复次数,默认 5 次 - -**文件配置** - - * **`CACHE_DIR`**: 结果输出目录。生成的 pipeline 代码、执行的日志、中间结果等都会保存在这里 - -##### 2. 运行脚本 - -```bash -python script/run_dfa_pipeline_recommend.py - -``` - -##### 3. 结果输出 - -脚本执行完毕后,控制台会打印执行的日志和最终执行状态,脚本运行后会在 `CACHE_DIR` 下生成 `my_pipeline.py`, `final_state.json` 和 `graph.png`。 - -#### 3.3 实战 Case:预训练数据清洗流水线 - -你可以参考以下教程学习,也可以参考我们提供的[Google Colab](https://colab.research.google.com/drive/1MMJxRpfYi7Zd-jc_pyhvM1Y2WoQXOFcu?usp=sharing)样例来运行: - -假设我们有一个包含脏数据的预训练数据 `tests/test.jsonl`,我们希望清洗出一份高质量数据。打开脚本修改如下配置: - -**场景配置:** - -```python -# ===== Example config (edit here) ===== - -# 1. 定义任务流程 -TARGET = """ -- 1.请帮我编排一个专门用于大规模预训练数据清洗的流水线,涵盖从去重、改写到质量过滤的全过程。 - 1. 请帮我编排一个专门用于大规模预训练数据清洗的流水线,涵盖从去重、改写到质量过滤的全过程。 -- 2. 
在预训练阶段,原始的网页数据(如Common Crawl)往往充斥着大量的噪声、广告、乱码以及重复内容,数据质量参差不齐。我需要先做对原始数据做适当的改写,比如删除大量多余空格、html标签等。接着,需要通过基于规则的启发式过滤,把那些显而易见的垃圾文本、不完整文本和过短的无效数据剔除掉。同时,考虑到网络上内容复杂,我需要筛选指定语言的数据来训练大模型。网络数据的重复率很高,最好能通过模糊去重算法把相似的文档都清理掉,只保留一份。最后,为了保证模型学到的是高质量知识,我希望还能有一个质量分类模型,对清洗后的数据打分,只留下那些高教育价值的内容,从而构建一个高质量的预训练语料库。 -- 3. 我需要一个专门处理海量预训练语料的端到端流水线。首先,你可以对原始文本进行基础的规范化处理,删除多余空格、html标签和表情符号。接着,利用启发式规则进行初步过滤,筛掉显着的低质量文本。这些启发式规则覆盖广泛,需要过滤掉符号/单词比例过高的文段、含敏感词的文段、单词数量异常的文段、以冒号/省略号结尾的不完整文段、语句数量异常的文段、空文本、平均单词长度异常的文段、含html标签的文段、无标点符号的文段、含特殊符号或水印的文段、括号比例过高的文段、大写字母比例过高的文段、含lorem ipsum(随机假文)的文段、独立单词比例过小的文段、字符数量较少的文段、以项目符号开头的文段和含有Javascript数量过多的文段。在此基础上,使用MinHash或类似算法进行文档级的模糊去重,大幅降低数据冗余。随后,利用训练好的质量评估模型对剩余数据进行打分和筛选。最后,还可以加入一个语言识别步骤,确保最终留下的都是目标语言的高质量纯净文本。 -""" - -# 2. 指定测试数据路径 -TEST_JSON_REL_PATH = "tests/test.jsonl" - -# 3. 开启 Debug -NEED_DEBUG = True -MAX_DEBUG_ROUNDS = 5 - -``` - -**运行:** -运行脚本后,工作流会按以下步骤执行: - -1. **分析用户的数据和意图**:分析用户的数据的特征。 -2. **拆解用户任务,推荐算子**:将用户的意图拆解成多个任务,检索匹配出与用户意图相关的算子。 -3. **生成代码**:分析需求顺序,串联这些算子,编写 pipeline 代码。 -4. **自动测试**:启动子进程试运行。如果出现了错误并启动了调试模式,Debugger Node 会尝试修复。 -5. **最终交付**:在成功执行或者达到最大调试轮数时结束工作流。 - -用户可以在`CACHE_DIR`目录下找到生成的 Pipeline 代码文件和执行的日志文件。 - - - -## 第二部分:Pipeline 迭代优化 (Pipeline Refinement) - -### 1. 概述 - -Pipeline 迭代优化 (Refinement) 允许用户通过自然语言对已生成的 DataFlow Pipeline 进行微调。用户无需手动修改复杂的 JSON 配置或 Python 代码,只需输入如“删除中间的过滤节点”等指令,系统便会智能解析意图并自动调整 Pipeline 的拓扑结构。 - -### 2. 
系统架构 - -该功能由 `dataflow_agent/workflow/wf_pipeline_refine.py` 编排,采用 **Analyzer -> Planner -> Refiner** 的三段式架构: - -#### 2.1 Refine Target Analyzer - -- **核心职责**: - - **意图识别**: 比较当前的 Pipeline 结构(`state.pipeline_structure_code`)和用户的自然语言需求(`target`),分析用户希望进行的修改类型(增、删、改)。 - - **RAG 预检索 (Pre-emptive RAG)**: 这是关键特性。Analyzer 会解析出用户需求中隐含的子操作描述,并直接调用 RAG 搜索 `_get_operators_by_rag_with_scores`。它会计算相似度分数、评估匹配质量,并将最佳匹配的算子代码`code_snippet`和警告信息打包进 `op_contexts`。 -- **输入**: `state.pipeline_structure_code` (当前 pipeline 代码), `state.request.target` (用户修改指令)。 -- **输出**: 包含 `needed_operators_desc` 的意图分析结果,以及包含丰富上下文的 `op_contexts`(算子代码、匹配度评分)。 - -#### 2.2 Refine Planner - -- **职责**: 基于 Analyzer 提供的意图和预检索到的算子上下文,制定具体的**修改计划**。它不直接修改代码,而是生成结构化的操作步骤。 -- **输入**: Analyzer 的分析结果 (`intent`)、算子上下文 (`op_context`)、当前节点摘要。 -- **输出**: 结构化的操作步骤列表,例如: - - `REMOVE_NODE: node_filter_1` - - `ADD_NODE: node_deduplicate (after node_loader)` - - `UPDATE_EDGE: node_loader -> node_deduplicate`。 - -#### 2.3 JSON Pipeline Refiner - -- **职责**: 执行 Planner 的计划,直接操作 Pipeline 的 JSON 数据结构 Nodes 和 Edges。 -- **工具增强**: 该 Agent 挂载了 `search_operator_by_description` 和 `get_operator_code_by_name` 作为后置工具。虽然 Analyzer 已经提供了 `op_context`,但如果 Refiner 在执行过程中发现信息不足,它仍可以主动发起搜索来补充算子信息。 -- **输出**: 更新后的 `state.pipeline_structure_code`。 - -### 3. 使用指南 - -本功能提供 **图形界面 (Gradio UI)** 和 **命令行脚本** 两种使用方式。 - -#### 3.1 图形界面 - -集成在 `gradio_app/pages/pipeline_rec.py`,适合交互式探索和快速验证。启动 Web 界面: -```python -python gradio_app/app.py -``` -访问 `http://127.0.0.1:7860` 开始使用 - -1. **前提**:必须先在页面上方点击 "Generate Pipeline" 生成初始 pipeline 代码,此时 `pipeline_json_state` 会被初始化。 -2. **输入优化指令**:在 "优化需求" 文本框中输入指令。 -3. **执行优化**:点击 **"Refine Pipeline"**。系统将显示更新后的 Python 代码、JSON 结构以及 Agent 的执行日志。 -4. **历史回溯**:使用 "上一轮" 和 "下一轮" 按钮在不同的优化版本间切换,查看代码演进过程。 -5. **警告提示**: 如果 RAG 匹配度较低,代码顶部会自动添加 `优化警告` 注释,提示用户当前生成的算子可能未完全匹配需求。 - -#### 3.2 脚本调用 - -使用 `script/run_dfa_pipeline_refine.py` 对已有的 Pipeline 结构进行微调。 - -##### 1. 
修改配置 - -**API 配置** - - * **`CHAT_API_URL`**: LLM 服务地址 - * **`api_key`**: 访问密钥(使用环境变量 DF_API_KEY) - * **`MODEL`**: 模型名称,默认 gpt-4o - -**任务配置** - - * **`INPUT_JSON`**: 待优化的 Pipeline 结构文件路径 - * **`OUTPUT_JSON`**: 优化后的 Pipeline JSON 结构文件保存路径 - * **`TARGET`**: 用自然语言描述您希望如何修改 Pipeline - * 示例:`"请将Pipeline调整为只包含3个节点,简化数据流"` - -##### 2. 运行脚本 - -```bash -python script/run_dfa_pipeline_refine.py - -``` - -#### 3.3 实战 Case:简化流水线 - -你可以参考以下教程学习,也可以参考我们提供的[Google Colab](https://colab.research.google.com/drive/1MMJxRpfYi7Zd-jc_pyhvM1Y2WoQXOFcu?usp=sharing)样例来运行: - -假设上一步生成的流水线太复杂,包含了多余的“清洗”算子,我们希望将其移除来简化 Pipeline。 - -**场景配置:** - -```python -# ===== Example config (edit here) ===== - -# 1. 指定上一步生成的 Pipeline 结构文件 -INPUT_JSON = "dataflow_agent/tmps/pipeline.json" - -# 2. 下达修改指令 -TARGET = "请简化中间的清洗算子,简化数据流。" - -# 3. 指定结果保存位置 -OUTPUT_JSON = "cache_local/pipeline_refine_result.json" - -``` - -**运行:** -Agent 会分析当前 Pipeline 的 JSON 拓扑结构,找到对应的清洗节点,将其移除。 \ No newline at end of file diff --git a/docs/zh/notes/guide/agent/web_collection.md b/docs/zh/notes/guide/agent/web_collection.md deleted file mode 100644 index 995175e8ca..0000000000 --- a/docs/zh/notes/guide/agent/web_collection.md +++ /dev/null @@ -1,372 +0,0 @@ ---- -title: Web 数据采集 -createTime: 2026/02/14 00:00:00 -permalink: /zh/guide/agent/web_collection/ ---- - -## 1. 概述 - -**Web Collection Agent** 是 DataFlow-Agent 中的智能数据收集模块,专门用于从互联网自动收集、处理和格式化训练数据集。该系统支持两种数据类型: - -- **PT(Pre-Training,预训练)**:大规模无标注语料,用于模型预训练。 -- **SFT(Supervised Fine-Tuning,监督微调)**:结构化的指令-回答对,用于模型微调。 - -该工作流能够: - -1. **网页搜索与探索**:基于多层 BFS 森林探索策略,由 LLM 驱动 URL 筛选,自动发现和定位目标数据集。 -2. **多平台数据下载**:支持 HuggingFace、Kaggle、Web 直接下载三种方式,LLM 智能决策下载优先顺序。 -3. **双通道并行采集**:WebSearch 和 WebCrawler 两条采集流程并行执行,提供更丰富的数据来源。 -4. **自适应数据映射**:LLM 生成 Python 映射函数,通过三重验证机制,自动将异构数据转换为标准 Alpaca 格式。 - -## 2. 系统架构 - -该功能由 `dataflow_agent/workflow/wf_web_collection.py` 编排,形成一个包含并行分支和条件循环的有向图。整体流程分为四个阶段:任务分析、数据采集(并行)、数据下载、数据处理与映射。 - -### 2.1 任务分析阶段 - -1. **Start Node(初始化节点)** - 1. 
**职责**: 初始化工作流配置,创建下载目录,准备执行环境。 - 2. **输入**: `state.request.target`(用户原始需求)。 - 3. **输出**: 初始化后的 `user_query` 和下载目录。 - -2. **Task Decomposer(任务分解节点)** - 1. **职责**: 使用 LLM 将复杂的用户需求分解为可执行的子任务,限制最大任务数量(默认 5 个)。 - 2. **输入**: 用户原始查询。 - 3. **LLM 思考**: 分析需求语义,拆分为独立的数据收集子任务。 - 4. **输出**: `state.task_list`,例如: - - 子任务 1:收集 NLP 问答数据集 - - 子任务 2:收集文本分类数据集 - - 子任务 3:收集图像分类数据集 - -3. **Category Classifier(分类节点)** - 1. **职责**: 判断当前任务属于 PT 还是 SFT 类型。 - 2. **输入**: 当前子任务名称。 - 3. **LLM 思考**: 结合任务描述判断数据类别,生成数据集背景描述。 - 4. **输出**: `state.category`(`"PT"` 或 `"SFT"`)以及 `dataset_background`。 - 5. **后备机制**: 当 LLM 无法判断时,使用关键词匹配。SFT 关键词包括:`["sft", "微调", "问答", "qa", "instruction", "fine-tuning"]`。 - -### 2.2 数据采集阶段(并行执行) - -任务分析完成后,系统进入 `parallel_collection` 并行分支,同时启动 WebSearch 和 WebCrawler 两条采集流程。 - -#### 2.2.1 WebSearch Node(网页搜索节点) - -WebSearch Node 是系统的核心数据收集节点,实现了完整的网页探索和信息提取流程,包含以下核心组件: - -1. **QueryGenerator(查询生成器)** - - **职责**: 基于用户原始需求,生成 3-5 个多样化的搜索查询。 - - **示例**: 输入 `"收集 Python 代码生成数据集"`,输出: - - `"Python code generation dataset download"` - - `"Python programming instruction dataset HuggingFace"` - - `"code completion training data GitHub"` - -2. **WebTools(网页工具集)** - - **search_web()**: 调用搜索引擎(Tavily / DuckDuckGo / Jina)获取初始 URL 列表。 - - **read_with_jina_reader()**: 使用 Jina Reader/MinerU-HTML 爬取网页内容,返回结构化的 Markdown 格式文本。 - -3. **多层 BFS 森林探索** - - **算法**: 采用广度优先搜索(BFS)策略,逐层探索网页链接。每一层中,使用 Jina Reader/MinerU-HTML 爬取页面内容,提取候选 URL,再由 URLSelector 筛选最相关的链接进入下一层。 - - **关键参数**: - - `max_depth`: 最大探索深度(默认 2) - - `concurrent_limit`: 并发请求数(默认 10) - - `topk_urls`: 每层筛选的 URL 数量(默认 5) - - `url_timeout`: 请求超时时间(默认 60 秒) - -4. **URLSelector(URL 筛选器)** - - **职责**: 使用 LLM 从候选 URL 列表中选择与研究目标最相关的 URL。 - - **筛选策略**: 分析 URL 与研究目标的相关性、域名可信度,避免重复内容,过滤被阻止的域名。 - -5. **RAGManager(RAG 管理器)** - - **职责**: 将爬取的网页内容存储到向量数据库中,支持后续的语义检索,为 SummaryAgent 提供上下文。 - -6. 
**SummaryAgent(摘要代理)** - - **职责**: 基于 RAG 检索的内容,生成具体的下载子任务。 - - **输出**: 结构化的子任务列表,例如: - ```json - { - "type": "download", - "objective": "下载 Spider Text2SQL 数据集", - "search_keywords": ["spider dataset", "text2sql"], - "platform_hint": "huggingface", - "priority": 1 - } - ``` - -#### 2.2.2 WebCrawler Node(网页爬虫节点) - -WebCrawler Node 专门用于从网页中提取代码块和技术内容,与 WebSearch Node 并行执行,提供更丰富的数据来源。 - -1. **生成搜索查询**: 针对代码/技术内容生成专用搜索查询。 -2. **搜索与爬取**: 搜索网页获取 URL 列表,使用 Jina Reader 并发爬取页面内容。 -3. **代码块提取**: 调用 `extract_code_blocks_from_markdown` 从 Markdown 内容中提取代码块。 -4. **结果保存**: 将爬取结果保存为 `webcrawler_crawled.jsonl`。 - -### 2.3 数据下载阶段 - -**Download Node(下载节点)** 执行实际的数据集下载任务,支持三种下载方式,并使用 LLM 智能决策下载优先顺序。 - -1. **DownloadMethodDecisionAgent(LLM 决策)** - - **职责**: 根据任务目标分析最佳下载方式,输出优先顺序列表,例如 `["huggingface", "kaggle", "web"]`。 - -2. **依次尝试每种下载方式**: - - **HuggingFace**: 搜索 HuggingFace Hub,LLM 选择最佳匹配数据集,调用 API 下载。 - - **Kaggle**: 搜索 Kaggle 数据集,LLM 选择最佳匹配,通过 Kaggle API 下载。 - - **Web**: 使用 WebAgent 智能探索网页,直接下载文件。 - -3. **记录下载结果**: 更新 `state.download_results`,包含每个数据集的下载状态和路径。 - -### 2.4 数据处理与映射阶段 - -#### Postprocess Node(后处理节点) - -- **职责**: 检查是否还有未完成的子任务(`check_more_tasks`),如果有则循环回到采集阶段;否则进入映射阶段。 - -#### Mapping Node(数据映射节点) - -Mapping Node 负责将收集到的中间格式数据转换为标准的 Alpaca 格式,使用 LLM 生成自适应的 Python 映射函数。 - -1. **读取中间数据**: 加载 `intermediate.jsonl` 中的原始记录。 -2. **LLM 生成映射函数(三重验证)**: - 1. 生成映射函数 3 次。 - 2. 在样本数据上验证一致性。 - 3. 通过验证后使用。 -3. **批量处理**: 对所有记录执行映射转换。 -4. **质量过滤**: 应用质量过滤器剔除低质量数据。 -5. 
**保存结果**: 输出为 `.jsonl` 和 `.json` 两种格式。 - -**Alpaca 格式定义**: - -```json -{ - "instruction": "任务指令或问题", - "input": "可选的输入上下文(如系统提示、SQL Schema)", - "output": "期望的回答或输出" -} -``` - -**SFT 数据映射规则**: -- `system` 角色 → `input` 字段 -- `user` 角色 → `instruction` 字段 -- `assistant` 角色 → `output` 字段 - -**映射示例(Text2SQL)**: - -```json -// 输入格式 -{ - "messages": [ - {"role": "system", "content": "CREATE TABLE farm (Id VARCHAR)"}, - {"role": "user", "content": "How many farms are there?"}, - {"role": "assistant", "content": "SELECT COUNT(*) FROM farm"} - ] -} - -// 输出 Alpaca 格式 -{ - "instruction": "How many farms are there?", - "input": "CREATE TABLE farm (Id VARCHAR)", - "output": "SELECT COUNT(*) FROM farm" -} -``` - -## 3. 状态管理与输出 - -### 3.1 WebCollectionState 核心字段 - -```python -@dataclass -class WebCollectionState(MainState): - # 任务相关 - user_query: str # 用户原始需求 - task_list: List[Dict] # 分解后的任务列表 - current_task_index: int # 当前任务索引 - - # 搜索相关 - research_summary: str # 调研总结 - urls_visited: List[str] # 已访问 URL - subtasks: List[Dict] # 下载子任务 - - # 下载相关 - download_results: Dict # 下载结果统计 - - # WebCrawler 相关 - webcrawler_crawled_pages: List # 爬取的页面 - webcrawler_sft_records: List # SFT 记录 - webcrawler_pt_records: List # PT 记录 - - # 映射相关 - mapping_results: Dict # 映射结果 - intermediate_data_path: str # 中间数据路径 -``` - -### 3.2 WebCollectionRequest 配置 - -```python -@dataclass -class WebCollectionRequest(MainRequest): - # 任务配置 - category: str = "PT" # PT 或 SFT - output_format: str = "alpaca" - - # 搜索配置 - search_engine: str = "tavily" - max_depth: int = 2 - max_urls: int = 10 - concurrent_limit: int = 5 - topk_urls: int = 5 - - # WebCrawler 配置 - enable_webcrawler: bool = True - webcrawler_num_queries: int = 5 - webcrawler_crawl_depth: int = 3 - webcrawler_concurrent_pages: int = 3 -``` - -### 3.3 输出文件结构 - -``` -web_collection_output/ -├── rag_db/ # RAG 向量数据库 -├── hf_datasets/ # HuggingFace 下载数据 -│ └── dataset_name/ -├── kaggle_datasets/ # Kaggle 下载数据 -├── web_downloads/ # Web 直接下载 -├── 
webcrawler_output/ # WebCrawler 爬取结果 -│ └── webcrawler_crawled.jsonl -├── processed_output/ # 后处理结果 -│ └── intermediate.jsonl -└── mapped_output/ # 最终映射结果 - ├── final_alpaca_sft.jsonl # Alpaca 格式(JSONL) - └── final_alpaca_sft.json # Alpaca 格式(JSON) -``` - -## 4. 使用指南 - -本功能提供 **图形界面 (Gradio UI)** 和 **命令行脚本** 两种使用方式。 - -### 4.1 图形界面 - -前端页面代码位于 `gradio_app/pages/web_collection.py`,提供了可视化的交互体验。启动 Web 界面: - -```bash -python gradio_app/app.py -``` - -访问 `http://127.0.0.1:7860` 开始使用 - -![web_agent](/web_agent.png) - -1. `step1:` 在"目标描述"中详细说明要收集的数据类型 -2. `step2:` 选择数据类别(PT 或 SFT) -3. `step3:` 配置数据集数量和大小限制 -4. `step4:` 配置 LLM API 信息(URL、Key、模型) -5. `step5:`(可选)配置 Kaggle、Tavily 等服务的密钥 -6. `step6:` 点击 **"开始网页采集与转换"** 按钮 -7. `step7:` 实时查看执行日志 -8. `step8:` 等待完成后查看结果摘要 -9. `step9:` 在下载目录中查看采集的数据 - -**高级使用**:展开"高级配置"区域,可调整搜索引擎选择、并行处理数量、缓存策略、数据转换参数等。 - -### 4.2 脚本调用 - -对于自动化任务或批量采集,推荐直接使用命令行脚本 `script/run_web_collection.py`。 - -#### 1. 环境变量配置 - -```bash -export DF_API_URL="https://api.openai.com/v1" -export DF_API_KEY="your_api_key" -export TAVILY_API_KEY="your_tavily_key" -export KAGGLE_USERNAME="" -export KAGGLE_KEY="" -export RAG_API_URL="" -export RAG_API_KEY="" -``` - -#### 2. 运行脚本 - -```bash -# 基本用法 -python script/run_web_collection.py --target "收集机器学习问答数据集" - -# 完整参数 -python script/run_web_collection.py \ - --target "收集代码生成数据集" \ - --category SFT \ - --max-urls 10 \ - --max-depth 2 \ - --download-dir ./my_output -``` - -**主要参数说明**: - -- **`--target`**: 数据收集目标描述(必填) -- **`--category`**: 数据类别,`PT` 或 `SFT`(默认 `SFT`) -- **`--max-urls`**: 最大 URL 数量(默认 10) -- **`--max-depth`**: 最大爬取深度(默认 2) -- **`--output-format`**: 输出格式(默认 `alpaca`) - -#### 3. 
Python API 调用 - -```python -from dataflow_agent.workflow.wf_web_collection import run_web_collection - -result = await run_web_collection( - target="收集机器学习代码示例", - category="SFT", - output_format="alpaca", - download_dir="./my_output", - model="gpt-4o" -) -``` - -### 4.3 实战 Case:收集中文问答数据集 - -假设我们需要为聊天机器人构建一份中文问答训练数据集,以下是完整的操作流程。 - -**场景配置:** - -```bash -export DF_API_URL="https://api.openai.com/v1" -export DF_API_KEY="your_api_key" -export TAVILY_API_KEY="your_tavily_key" - -python script/run_web_collection.py \ - --target "收集中文问答数据集用于微调" \ - --category SFT \ - --max-urls 20 -``` - -**运行:** -运行脚本后,工作流会按以下步骤执行: - -1. **任务分解**: LLM 将"收集中文问答数据集用于微调"拆解为多个子任务(如中文常识问答、中文阅读理解等)。 -2. **分类判定**: 根据"微调"关键词,自动判定为 SFT 类型。 -3. **并行采集**: WebSearch 探索 HuggingFace、GitHub 等平台上的中文 QA 数据集;WebCrawler 同步抓取技术博客中的问答内容。 -4. **智能下载**: LLM 决策优先从 HuggingFace 下载匹配数据集,失败后回退到 Kaggle 和 Web 直接下载。 -5. **格式映射**: 将下载的异构数据统一转换为 Alpaca 格式,输出到 `mapped_output/` 目录。 - -用户可以在下载目录下找到最终的 `final_alpaca_sft.jsonl` 文件,直接用于模型微调训练。 - -### 4.4 注意事项 - -1. **API 密钥** - - 确保配置了必要的 API 密钥 - - Tavily 用于搜索,Kaggle 用于下载 Kaggle 数据集 - -2. **网络环境** - - 如果在国内,建议使用 HuggingFace 镜像(设置 `HF_ENDPOINT`) - - 调整并行数量以适应网络带宽 - -3. **存储空间** - - 确保有足够的磁盘空间 - - 大型数据集可能需要数 GB 空间 - -4. **执行时间** - - 采集过程可能需要较长时间(几分钟到几小时) - - 可以通过限制下载任务数量来控制时间 - -5. 
**数据质量** - - 启用 RAG 增强可以提高数据质量 - - 调整采样参数以平衡质量和速度 diff --git a/docs/zh/notes/guide/quickstart/dataflow_skills.md b/docs/zh/notes/guide/quickstart/dataflow_skills.md new file mode 100644 index 0000000000..e20539fdfd --- /dev/null +++ b/docs/zh/notes/guide/quickstart/dataflow_skills.md @@ -0,0 +1,281 @@ +--- +title: DataFlow Skills +icon: material-symbols:auto-awesome +createTime: 2026/05/22 12:45:39 +permalink: /zh/guide/quickstart/dataflow_skills/ +--- + +# DataFlow Skills + +为 [DataFlow](https://github.com/OpenDCAI/DataFlow) 数据处理框架准备的可复用 [Claude Code Skills](https://docs.anthropic.com/en/docs/claude-code/skills)。共三个 skill: + +| Skill | 功能 | 调用方式 | +|---|---|---| +| **`generating-dataflow-pipeline`** | 给一个任务目标 + 一个 JSONL 样本文件,规划算子链并生成可运行的 DataFlow pipeline 代码。 | `/generating-dataflow-pipeline` | +| **`dataflow-dev`** | DataFlow 开发助手。按意图路由(新建算子 / 新建 pipeline / 新建 prompt / 诊断报错 / 规范审查 / 知识库同步)。在 DataFlow 仓库里使用。 | `/dataflow-dev` | +| **`core_text`** | 算子级 API 参考(8 个 generator、3 个 filter、2 个 refiner、5 个 evaluator)。当 pipeline skill 需要超出 6 个核心算子之外的扩展算子时会查阅它。 | _(不直接调用)_ | + + +## 安装 + +**前置条件:** [Claude Code](https://docs.anthropic.com/en/docs/claude-code) CLI 已安装并在 `PATH` 中。 + +```bash +git clone https://github.com/haolpku/DataFlow-Skills.git +cd DataFlow-Skills +./install.sh +``` + +脚本会把三个 skill 都拷到 `~/.claude/skills/`(用户级——所有项目都能用)。然后在任意 Claude Code 会话里: + +``` +/generating-dataflow-pipeline +``` + +补全里能看到这个斜杠命令就装好了。 + +### 安装选项 + +```bash +./install.sh --project # 装到当前项目的 ./.claude/skills/ +./install.sh dataflow-dev # 只装指定的 skill +./install.sh --force # 覆盖已存在的 skill(默认是跳过) +``` + +### 更新 + +```bash +cd DataFlow-Skills +git pull +./install.sh --force +``` + + +## Generating DataFlow Pipeline + +> [视频教程:生成 DataFlow Pipeline](https://github.com/user-attachments/assets/ca1fefbf-9bf7-469f-b856-b201952fb99b) + +推理引导的 Pipeline 规划器。给它一个**目标**(pipeline 要达成什么)和一个**样本 JSONL 文件**(1–5 行代表性数据),它会分析数据、选择算子、校验字段依赖,最终生成完整可运行的 Python pipeline 代码。 + +### 快速开始 + +#### 1. 
准备数据 + +创建一个 JSONL 文件(每行一个 JSON 对象),包含 1–5 行代表性样本: + +```jsonl +{"product_name": "笔记本电脑", "category": "电子产品"} +{"product_name": "咖啡机", "category": "家电"} +``` + +#### 2. 运行 Skill + +在 Claude Code 中调用 `/generating-dataflow-pipeline` 并描述目标: + +``` +/generating-dataflow-pipeline +目标:生成商品描述并筛选优质内容 +样本文件:./data/products.jsonl +预期输出字段:generated_description, quality_score +``` + +#### 3. 查看输出 + +Skill 返回两阶段结果: + +1. **中间算子决策** — JSON,包含算子链、字段流、推理过程 +2. **完整 5 段式响应**: + - 字段映射 — 哪些字段已存在 vs. 需要生成 + - 有序算子列表 — 按执行顺序排列,附理由 + - 推理摘要 — 为什么这个设计能满足目标 + - 完整 Pipeline 代码 — 可直接执行的 Python 代码 + - 可调参数 / 注意事项 — 可调节的旋钮和调试建议 + +### 六个核心算子 + +| 算子 | 用途 | 需要 LLM? | +|------|------|-----------| +| `PromptedGenerator` | 单字段 LLM 生成 | 是 | +| `FormatStrPromptedGenerator` | 多字段模板生成 | 是 | +| `Text2MultiHopQAGenerator` | 从文本构建多跳 QA 对 | 是 | +| `PromptedFilter` | 基于 LLM 的质量打分与过滤 | 是 | +| `GeneralFilter` | 基于规则的确定性过滤 | 否 | +| **KBC 三件套**(3 个算子,始终按序一起使用) | 文件/URL → Markdown → 分块 → 清洗文本 | 部分 | + +### 生成的 Pipeline 结构 + +所有生成的 pipeline 遵循相同的标准结构: + +```python +from dataflow.operators.core_text import PromptedGenerator, PromptedFilter +from dataflow.serving import APILLMServing_request +from dataflow.utils.storage import FileStorage + +class MyPipeline: + def __init__(self): + self.storage = FileStorage( + first_entry_file_name="./data/input.jsonl", + cache_path="./cache", + file_name_prefix="step", + cache_type="jsonl" + ) + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + max_workers=10 + ) + # 算子实例 ... + + def forward(self): + # 按序调用 operator.run(),每次传 storage.step() + ... 
+ +if __name__ == "__main__": + pipeline = MyPipeline() + pipeline.forward() +``` + +关键规则: +- `first_entry_file_name` 设为用户提供的 JSONL 路径 +- 每个 `operator.run()` 使用 `storage=self.storage.step()` 做断点续传 +- 字段向前传播:字段必须在样本中存在或由前序步骤输出,后续步骤才能消费 + + + +## DataFlow Dev + +DataFlow 仓库内的开发助手 skill。它加载架构知识库,探测 git 状态,然后按你的意图路由: + +| 你说类似这样的话…… | 执行流程 | +|---|---| +| "新建一个 filter 算子……" | 算子创建(重复检查 → 规格确认 → 生成代码 + 注册提醒) | +| "新建一个 pipeline……" | Pipeline 创建(标准 `storage.step()` 模式) | +| "给 X 写个 prompt" | Prompt 创建(`PromptABC` / `DIYPromptABC`,`@prompt_restrict` 位置) | +| "我遇到 `KeyError: …`" | 诊断:匹配已知 Issue #001–#008 | +| "审查这个算子" | 14 项检查清单(注册、`run()` 签名、`get_desc` 等) | +| "上游仓库有新算子了" | 对照知识库找差异,输出更新步骤 | + +### 算子创建 + +Skill 会先执行防重复检查,然后向你确认规格: +- 算子类型(filter / generate / refine / eval) +- 所属模块(general_text / text_sft / reasoning / code / 其他) +- 是否依赖 LLM +- 输入/输出列名 + +生成的代码必须满足: +- 继承 `OperatorABC`,调用 `super().__init__()` +- 类上方有 `@OPERATOR_REGISTRY.register()` 装饰器 +- `run()` 参数:`input_*` 前缀、`output_*` 前缀、第一个参数为 `storage: DataFlowStorage` +- `run()` 返回输出 key 列表 +- LLM 驱动算子使用 `self.llm_serving` +- 包含 `get_desc(lang)` 支持 zh/en + +### Pipeline 创建 + +生成的 pipeline 遵循以下规则: +- `storage` 在 `__init__` 中声明,不在 `forward()` 里临时创建 +- 每个算子调用传 `storage=self.storage.step()` +- `max_workers` 根据 API 能力设置 +- API key 通过环境变量注入,不硬编码 +- 包含 `if __name__ == "__main__":` 入口 + +### 报错诊断 + +常见错误快速匹配表: + +| 报错关键词 | 根因 | +|---|---| +| `Unexpected key 'xxx' in operator` | 配置参数命名(仅警告非错误) | +| `No object named 'Xxx' found in 'operators' registry` | `__init__.py` 未注册 | +| `Key Matching Error` | Pipeline key 不一致 | +| `You must call storage.step() before` | 缺少 `storage.step()` | +| `DummyStorage` + `AttributeError` | DummyStorage 不支持 `get_keys_from_dataframe` | +| `ModuleNotFoundError` + `dataflow.operators.reasoning.refine` | LazyLoader 路径,应从父模块 import | + + +## Core Text 扩展算子参考 + +供 pipeline skill 查阅的扩展算子参考。当 6 个核心算子不够用时,可以使用以下算子: + +### Generate + +| 算子 | 描述 | +|------|------| +| `prompted-generator` | 基础单字段 LLM 生成 | 
+| `format-str-prompted-generator` | 多字段模板生成 | +| `chunked-prompted-generator` | 长文档逐块处理 | +| `embedding-generator` | 使用 Embedding API 向量化文本 | +| `retrieval-generator` | 使用 LightRAG 的异步 RAG 生成 | +| `bench-answer-generator` | 基准测试答案生成 | +| `text2multihopqa-generator` | 从文本构建多跳 QA 对 | +| `random-domain-knowledge-row-generator` | 从种子数据生成领域知识行 | + +### Filter + +| 算子 | 描述 | +|------|------| +| `prompted-filter` | 基于 LLM 的质量打分与过滤 | +| `general-filter` | 基于规则的确定性过滤 | +| `kcentergreedy-filter` | 基于 k-Center Greedy 的多样性过滤 | + +### Refine + +| 算子 | 描述 | +|------|------| +| `prompted-refiner` | 基于 LLM 的文本改写和精炼 | +| `pandas-operator` | 自定义 pandas DataFrame 操作 | + +### Eval + +| 算子 | 描述 | +|------|------| +| `prompted-evaluator` | 基于 LLM 的打分和评估 | +| `bench-dataset-evaluator` | 基准数据集评估 | +| `bench-dataset-evaluator-question` | 基准题目级评估 | +| `text2qa-sample-evaluator` | QA 样本质量评估 | +| `unified-bench-dataset-evaluator` | 跨格式统一基准评估 | + +每个算子目录结构如下: + +``` +<算子名>/ +├── SKILL.md # 英文文档 +├── SKILL_zh.md # 中文文档 +└── examples/ + ├── good.md # 正确用法示例 + └── bad.md # 常见错误 +``` + + +## 添加新算子 + +### 作为扩展算子 + +1. 在 `core_text/<分类>/<你的算子>/` 下创建 skill 定义: + +``` +core_text/<分类>/<你的算子>/ +├── SKILL.md +├── SKILL_zh.md +└── examples/ + ├── good.md + └── bad.md +``` + +2. 在 `generating-dataflow-pipeline/SKILL.md` 的 **Extended Operator Reference** 对应分类表格里加一行。**不加这行 pipeline 规划器就发现不了你的算子**。 + +### 升级为核心算子 + +如果算子高频到需要优先选择: + +1. 加入 Preferred Operator Strategy 核心算子列表 +2. 在 Operator Selection Priority Rule 加决策表行 +3. 在 Operator Parameter Signature Rule 加完整构造函数和 `run()` 签名 +4. 在 Correct Import Paths 加导入路径 +5. 在 Input File Content Analysis Rule 加输入模式匹配(如涉及新数据类型) + + +## 仓库地址 + +GitHub: [https://github.com/OpenDCAI/DataFlow-Skills](https://github.com/OpenDCAI/DataFlow-Skills)