From d36e7c3e4ae02aa6df0d5c7722bc65dfe885527c Mon Sep 17 00:00:00 2001 From: snova-fengluh Date: Tue, 21 Apr 2026 15:15:32 -0700 Subject: [PATCH 1/2] Add task-specific prompts feature for Generator, Reflector, and Curator - Add PromptConfig dataclass (ace/prompts/config.py) to hold all agent prompts - Add load_prompts() function (ace/prompts/loader.py) to load task-specific prompts with fallback to defaults - Modify Generator, Reflector, and Curator agents to accept optional prompt parameters in __init__ - Add prompt_config parameter to ACE class to pass custom prompts to agents - Add --task_prompts_dir CLI argument to all task run.py files (finance, mind2web, mind2web2) - Update README.md with new argument and Custom Task-Specific Prompts section - Update EXTENDING_ACE.md with corrected repository structure and comprehensive documentation for the new prompts feature Co-Authored-By: Claude Opus 4.5 --- EXTENDING_ACE.md | 304 +++++++++++++++++++++++++++++++--------- README.md | 21 +++ ace/__init__.py | 3 +- ace/ace.py | 30 +++- ace/core/curator.py | 16 ++- ace/core/generator.py | 11 +- ace/core/reflector.py | 16 ++- ace/prompts/__init__.py | 10 +- ace/prompts/config.py | 16 +++ ace/prompts/loader.py | 113 +++++++++++++++ eval/finance/run.py | 16 ++- eval/mind2web/run.py | 12 +- eval/mind2web2/run.py | 12 +- 13 files changed, 485 insertions(+), 95 deletions(-) create mode 100644 ace/prompts/config.py create mode 100644 ace/prompts/loader.py diff --git a/EXTENDING_ACE.md b/EXTENDING_ACE.md index f7dab7b1..900ae4d7 100644 --- a/EXTENDING_ACE.md +++ b/EXTENDING_ACE.md @@ -7,31 +7,44 @@ This guide provides detailed instructions for adding new tasks to the ACE framew Understanding the codebase structure will help you navigate and extend ACE effectively: ``` -ACE-pre-release/ +ace/ ├── ace/ # Core ACE framework │ ├── core/ # Agent implementations │ │ ├── __init__.py │ │ ├── generator.py # Generator agent │ │ ├── reflector.py # Reflector agent │ │ ├── curator.py # Curator 
agent -│ │ └── bulletpoint_analyzer.py # Bulletpoint analyzer for playbook de-duplication +│ │ └── bulletpoint_analyzer.py # Bulletpoint analyzer for playbook de-duplication │ ├── prompts/ # Prompt templates │ │ ├── __init__.py -│ │ ├── generator.py # Generator prompts -│ │ ├── reflector.py # Reflector prompts -│ │ └── curator.py # Curator prompts +│ │ ├── config.py # PromptConfig dataclass +│ │ ├── loader.py # Prompt loader with task-specific override support +│ │ ├── generator.py # Default generator prompts +│ │ ├── reflector.py # Default reflector prompts +│ │ └── curator.py # Default curator prompts │ ├── __init__.py │ └── ace.py # Main ACE orchestrator │ -├── finance/ # Finance domain implementation (reference example) -│ ├── data_processor.py # Finance data processing -│ └── run.py # Unified training and evaluation script +├── eval/ # Evaluation tasks +│ ├── finance/ # Finance domain implementation (reference example) +│ │ ├── data/ # Task data and config +│ │ ├── data_processor.py # Finance data processing +│ │ ├── run.py # Unified training and evaluation script +│ │ └── prompts/ # (Optional) Task-specific prompts +│ ├── mind2web/ # Mind2Web task implementation +│ │ ├── data/ +│ │ ├── data_processor.py +│ │ └── run.py +│ └── mind2web2/ # Mind2Web variant (50 candidates) +│ ├── data/ +│ ├── data_processor.py +│ └── run.py │ ├── llm.py # LLM utilities ├── logger.py # Logging utilities -├── utils.py # General utilities +├── utils.py # General utilities ├── playbook_utils.py # Playbook operations -├── requirements.txt # Dependencies +├── pyproject.toml # Project dependencies (uv) ├── .env.example # Environment template ├── README.md # Main documentation └── EXTENDING_ACE.md # This file @@ -54,23 +67,23 @@ Or with custom field names: {"input": "question text", "output": "answer", "metadata": {...}} ``` -Create a configuration file (e.g., `your_task/data/task_config.json`): +Create a configuration file (e.g., `eval/your_task/data/sample_config.json`): ```json { 
"your_task_name": { - "train_data": "./your_task/data/train.jsonl", - "val_data": "./your_task/data/val.jsonl", - "test_data": "./your_task/data/test.jsonl" + "train_data": "./eval/your_task/data/train.jsonl", + "val_data": "./eval/your_task/data/val.jsonl", + "test_data": "./eval/your_task/data/test.jsonl" } } ``` ### Step 2: Create a Data Processor -Create `your_task/data_processor.py` with a `DataProcessor` class. You only need to implement **3 simple methods**: +Create `eval/your_task/data_processor.py` with a `DataProcessor` class. You only need to implement **3 simple methods**: ```python -# your_task/data_processor.py +# eval/your_task/data_processor.py import os import json from typing import List, Dict, Any, Tuple @@ -185,7 +198,7 @@ class DataProcessor: ### Step 3: Create a Training Script -Create `your_task/run.py`: +Create `eval/your_task/run.py`: ```python #!/usr/bin/env python3 @@ -196,6 +209,7 @@ from datetime import datetime from .data_processor import DataProcessor, load_data from ace import ACE +from ace.prompts import load_prompts from utils import initialize_clients @@ -209,47 +223,49 @@ def parse_args(): "'online' for online training, 'eval_only' for evaluation only") parser.add_argument("--save_path", type=str, required=True) parser.add_argument("--initial_playbook_path", type=str, default=None) - parser.add_argument("--config_path", type=str, default="./your_task/data/task_config.json") - # Add other arguments as needed (see finance/run.py for full list) + parser.add_argument("--task_prompts_dir", type=str, default=None, + help="Path to task-specific prompts directory. 
" + "If not specified, uses default prompts.") + # Add other arguments as needed (see eval/finance/run.py for full list) return parser.parse_args() def preprocess_data(task_name, config, mode): """Load and preprocess data.""" processor = DataProcessor(task_name=task_name) - + # For online and eval_only modes, only load test data if mode in ["online", "eval_only"]: train_samples = None val_samples = None - + if "test_data" in config: test_samples = load_data(config["test_data"]) test_samples = processor.process_task_data(test_samples) else: raise ValueError(f"{mode} mode requires test data in config.") - + if mode == "online": print(f"Online mode: Training and testing on {len(test_samples)} examples") else: print(f"Eval only mode: Testing on {len(test_samples)} examples") - + # For offline mode, load train, val, and optionally test data else: train_samples = load_data(config["train_data"]) val_samples = load_data(config["val_data"]) train_samples = processor.process_task_data(train_samples) val_samples = processor.process_task_data(val_samples) - + if "test_data" in config: test_samples = load_data(config["test_data"]) test_samples = processor.process_task_data(test_samples) else: test_samples = [] - + print(f"Offline mode: Training on {len(train_samples)} examples, " f"validating on {len(val_samples)}, testing on {len(test_samples)}") - + return train_samples, val_samples, test_samples, processor @@ -263,33 +279,37 @@ def load_initial_playbook(path): def main(): args = parse_args() - + # Load task configuration - with open(args.config_path, 'r') as f: + with open("./eval/your_task/data/sample_config.json", 'r') as f: task_config = json.load(f) - + # Preprocess data train_samples, val_samples, test_samples, data_processor = \ preprocess_data(args.task_name, task_config[args.task_name], args.mode) - + # Load initial playbook (or use empty if None provided) initial_playbook = load_initial_playbook(args.initial_playbook_path) if initial_playbook: print(f"Loaded initial 
playbook from {args.initial_playbook_path}\n") else: print("Using empty playbook as initial playbook\n") - + + # Load prompts - uses task-specific if provided, otherwise defaults + prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir) + # Initialize ACE - api_provider = "sambanova" # or "together", "openai", "commonstack" + api_provider = "sambanova" # or "together", "openai", "commonstack" ace_system = ACE( api_provider=api_provider, generator_model="DeepSeek-V3.1", # Or your preferred model reflector_model="DeepSeek-V3.1", curator_model="DeepSeek-V3.1", max_tokens=4096, - initial_playbook=initial_playbook + initial_playbook=initial_playbook, + prompt_config=prompt_config ) - + # Configure config = { 'num_epochs': 1, @@ -306,10 +326,10 @@ def main(): 'save_dir': args.save_path, 'test_workers': 20, 'initial_playbook_path': args.initial_playbook_path, - 'use_bulletpoint_analyzer': false, # Turn on for playbook bulletpoints de-duplication and merging + 'use_bulletpoint_analyzer': False, # Turn on for playbook bulletpoints de-duplication and merging 'api_provider': api_provider } - + # Run using the unified interface results = ace_system.run( mode=args.mode, @@ -319,7 +339,7 @@ def main(): data_processor=data_processor, config=config ) - + if __name__ == "__main__": main() @@ -329,26 +349,30 @@ if __name__ == "__main__": ```bash # Offline training (with automatic initial and final testing) -python -m your_task.run \ +uv run python -m eval.your_task.run \ --task_name your_task_name \ --mode offline \ - --save_path results \ - --config_path ./your_task/data/task_config.json + --save_path results # Online training and testing -python -m your_task.run \ +uv run python -m eval.your_task.run \ --task_name your_task_name \ --mode online \ - --save_path results \ - --config_path ./your_task/data/task_config.json + --save_path results # Evaluation only (test a pre-trained playbook) -python -m your_task.run \ +uv run python -m eval.your_task.run \ --task_name 
your_task_name \ --mode eval_only \ --initial_playbook_path results/ace_run_timestamp/best_playbook.txt \ - --save_path test_results \ - --config_path ./your_task/data/task_config.json + --save_path test_results + +# With custom task-specific prompts +uv run python -m eval.your_task.run \ + --task_name your_task_name \ + --mode offline \ + --save_path results \ + --task_prompts_dir ./eval/your_task/prompts ``` ## Key Implementation Notes @@ -393,47 +417,191 @@ You can use any OpenAI-compatible model by changing the model names in the train ## Customizing Prompts -To adapt ACE's prompts to your domain, modify the prompt templates in `ace/prompts/`: +ACE supports **task-specific prompts** that override the default prompts without modifying the core framework. This allows you to customize prompts for different domains while keeping the base ACE code unchanged. + +### Task-Specific Prompts Directory + +Create a `prompts/` directory under your task folder: + +``` +eval/your_task/ +├── data/ +│ └── sample_config.json +├── data_processor.py +├── run.py +└── prompts/ # Optional - only create if customizing + ├── generator.py # Optional - define GENERATOR_PROMPT + ├── reflector.py # Optional - define REFLECTOR_PROMPT, REFLECTOR_PROMPT_NO_GT + └── curator.py # Optional - define CURATOR_PROMPT, CURATOR_PROMPT_NO_GT +``` + +**Important**: You only need to create files for the prompts you want to override. Missing prompts will automatically fall back to the defaults in `ace/prompts/`. 
+ +### Using Task-Specific Prompts + +Pass the `--task_prompts_dir` argument when running your task: + +```bash +# Use custom prompts +uv run python -m eval.your_task.run \ + --task_name your_task_name \ + --mode offline \ + --save_path results \ + --task_prompts_dir ./eval/your_task/prompts +``` + +Or load prompts programmatically: ```python -# ace/prompts/generator.py -# Customize the generator system prompt for your domain +from ace import ACE +from ace.prompts import load_prompts + +# Load task-specific prompts (with fallback to defaults) +prompt_config = load_prompts(task_prompts_dir="./eval/your_task/prompts") + +# Initialize ACE with custom prompts +ace_system = ACE( + api_provider="sambanova", + generator_model="DeepSeek-V3.1", + reflector_model="DeepSeek-V3.1", + curator_model="DeepSeek-V3.1", + prompt_config=prompt_config +) +``` + +### Example: Custom Generator Prompt -# ace/prompts/reflector.py -# Customize the reflector's evaluation criteria +Create `eval/your_task/prompts/generator.py`: -# ace/prompts/curator.py -# Customize how insights are curated into the playbook +```python +# eval/your_task/prompts/generator.py + +GENERATOR_PROMPT = """You are a medical AI assistant specializing in clinical decision support. 
+ +**Instructions:** +- Always prioritize patient safety +- Cite medical evidence when available +- Acknowledge uncertainty when appropriate +- Consider differential diagnoses +- Apply relevant strategies from the playbook + +Your output should be a json object with these fields: +- reasoning: your detailed analysis +- bullet_ids: relevant playbook bullet IDs used +- final_answer: your concise final answer + +**Playbook:** +{} + +**Reflection:** +{} + +**Question:** +{} + +**Context:** +{} + +**Answer in JSON format:** +{{ + "reasoning": "[Your analysis]", + "bullet_ids": ["med-00001"], + "final_answer": "[Your answer]" +}} +""" ``` -### Example: Domain-Specific Generator Prompt +### Example: Custom Reflector Prompt + +Create `eval/your_task/prompts/reflector.py`: ```python -# In ace/prompts/generator.py +# eval/your_task/prompts/reflector.py + +REFLECTOR_PROMPT = """You are a medical expert reviewing AI-generated diagnoses. + +Analyze the model's reasoning and identify: +- Clinical reasoning errors +- Missed differential diagnoses +- Safety concerns +- Evidence quality issues + +**Question:** +{} + +**Model's Reasoning:** +{} -MEDICAL_GENERATOR_PROMPT = """ -You are a medical AI assistant specializing in clinical decision support. -When answering questions: -1. Always prioritize patient safety -2. Cite medical evidence when available -3. Acknowledge uncertainty when appropriate -4. 
Consider differential diagnoses +**Model's Answer:** +{} -{playbook} +**Ground Truth:** +{} -Question: {question} -Context: {context} +**Environment Feedback:** +{} + +**Playbook Bullets Used:** +{} + +**Your Analysis (JSON):** +{{ + "reasoning": "[Your analysis]", + "error_identification": "[What went wrong]", + "root_cause_analysis": "[Why it went wrong]", + "correct_approach": "[Better approach]", + "key_insight": "[Lesson learned]", + "bullet_tags": [{{"id": "med-00001", "tag": "helpful"}}] +}} """ + +# Only define this if you also need a no-ground-truth variant +REFLECTOR_PROMPT_NO_GT = """...""" +``` + +### Prompt Variables + +The default prompts use `{}` placeholders that get filled via `.format()`. Ensure your custom prompts have the same number and order of placeholders: + +| Prompt | Placeholders (in order) | +|--------|------------------------| +| `GENERATOR_PROMPT` | playbook, reflection, question, context | +| `REFLECTOR_PROMPT` | question, reasoning_trace, predicted_answer, ground_truth, environment_feedback, bullets_used | +| `REFLECTOR_PROMPT_NO_GT` | question, reasoning_trace, predicted_answer, environment_feedback, bullets_used | +| `CURATOR_PROMPT` | Uses named placeholders: `{current_step}`, `{total_samples}`, `{token_budget}`, `{playbook_stats}`, `{recent_reflection}`, `{current_playbook}`, `{question_context}` | +| `CURATOR_PROMPT_NO_GT` | Same as `CURATOR_PROMPT` | + +### A/B Testing Prompts + +The task-specific prompts feature makes it easy to A/B test different prompt versions: + +```bash +# Test prompt version 1 +uv run python -m eval.your_task.run \ + --task_name your_task_name \ + --save_path results_v1 \ + --task_prompts_dir ./prompts_v1 + +# Test prompt version 2 +uv run python -m eval.your_task.run \ + --task_name your_task_name \ + --save_path results_v2 \ + --task_prompts_dir ./prompts_v2 ``` ## Reference Implementation -The `finance/` directory contains a complete working example of a custom task implementation. 
Use it as a reference for: +The `eval/finance/` directory contains a complete working example of a custom task implementation. Use it as a reference for: - Data preprocessing with multiple parsing strategies (`parse_instruction_and_input`, `parse_context_and_question_formula`) - Task-specific evaluation logic (`_finer_answer_is_correct`, `_formula_answer_is_correct`) - Handling different data formats and answer types - Using the unified `run()` interface with different modes +- Task-specific prompts configuration (via `--task_prompts_dir`) + +Other reference implementations: +- `eval/mind2web/` - Web navigation task with element selection +- `eval/mind2web2/` - Variant with larger candidate pool (50 candidates) ## Troubleshooting @@ -443,12 +611,14 @@ The `finance/` directory contains a complete working example of a custom task im 2. **Data format mismatches**: Verify your `process_task_data` returns the correct dictionary structure 3. **Evaluation errors**: Check that `answer_is_correct` handles edge cases (empty strings, None values, etc.) 4. **Memory issues**: Reduce `test_workers` parameter if running into memory constraints +5. **Custom prompts not loading**: Verify the `--task_prompts_dir` path exists and contains properly named Python files (`generator.py`, `reflector.py`, `curator.py`) with the correct variable names (`GENERATOR_PROMPT`, etc.) +6. 
**Prompt placeholder errors**: Ensure your custom prompts use the same placeholders as the default prompts (same number and order of positional `{}` for the generator and reflector, same `{named}` fields for the curator), and write any literal braces (e.g. in JSON examples) as `{{` and `}}`, because prompts are filled via `str.format()` ### Getting Help - **Issues**: Open an issue on GitHub with details about your task and error messages - **Discussions**: Join the [GitHub Discussions](../../discussions) for implementation questions -- **Examples**: Check the `finance/` directory for working reference implementations +- **Examples**: Check the `eval/finance/` directory for working reference implementations --- diff --git a/README.md b/README.md index c9bf961d..455b5de9 100644 --- a/README.md +++ b/README.md @@ -218,9 +218,30 @@ uv run python -m eval.finance.run \ | `--no_ground_truth` | Don't use ground truth in reflection | False | | `--use_bulletpoint_analyzer` | Enable bulletpoint analyzer for playbook deduplication and merging | False | | `--bulletpoint_analyzer_threshold` | Similarity threshold for bulletpoint analyzer (0-1) | 0.9 | +| `--task_prompts_dir` | Path to task-specific prompts directory (see [Custom Prompts](#custom-task-specific-prompts)) | None | +### Custom Task-Specific Prompts + +ACE supports task-specific prompts that override the default prompts. This allows you to customize the Generator, Reflector, and Curator behavior for different domains without modifying the core ACE code. + +```bash +# Use custom prompts for a specific task +uv run python -m eval.finance.run \ + --task_name finer \ + --mode offline \ + --save_path results \ + --task_prompts_dir ./eval/finance/prompts +``` + +To create custom prompts, create a `prompts/` directory under your task folder with any of these files: +- `generator.py` - Define `GENERATOR_PROMPT` +- `reflector.py` - Define `REFLECTOR_PROMPT` and/or `REFLECTOR_PROMPT_NO_GT` +- `curator.py` - Define `CURATOR_PROMPT` and/or `CURATOR_PROMPT_NO_GT` + +Only the prompts you define will override the defaults; missing prompts fall back to the built-in defaults. 
+ ## 📈 Results and Outputs Using offline training as an example, after training, ACE generates: diff --git a/ace/__init__.py b/ace/__init__.py index a2e108cb..699647ce 100644 --- a/ace/__init__.py +++ b/ace/__init__.py @@ -47,7 +47,8 @@ from .ace import ACE from .core import Generator, Reflector, Curator, BulletpointAnalyzer +from .prompts import PromptConfig, load_prompts -__all__ = ['ACE', 'Generator', 'Reflector', 'Curator', 'BulletpointAnalyzer'] +__all__ = ['ACE', 'Generator', 'Reflector', 'Curator', 'BulletpointAnalyzer', 'PromptConfig', 'load_prompts'] __version__ = "1.0.0" \ No newline at end of file diff --git a/ace/ace.py b/ace/ace.py index 2d662adc..16b319f0 100644 --- a/ace/ace.py +++ b/ace/ace.py @@ -14,6 +14,7 @@ from typing import Dict, List, Tuple, Optional, Any from .core import Generator, Reflector, Curator, BulletpointAnalyzer +from .prompts import PromptConfig, load_prompts from playbook_utils import * from logger import * from utils import * @@ -39,11 +40,12 @@ def __init__( max_tokens: int = 4096, initial_playbook: Optional[str] = None, use_bulletpoint_analyzer: bool = False, - bulletpoint_analyzer_threshold: float = 0.90 + bulletpoint_analyzer_threshold: float = 0.90, + prompt_config: Optional[PromptConfig] = None ): """ Initialize the ACE system. 
- + Args: api_provider: API provider for LLM calls generator_model: Model name for generator @@ -53,14 +55,30 @@ def __init__( initial_playbook: Initial playbook content (optional) use_bulletpoint_analyzer: Whether to use bulletpoint analyzer for deduplication bulletpoint_analyzer_threshold: Similarity threshold for bulletpoint analyzer (0-1) + prompt_config: PromptConfig with custom prompts (optional, uses defaults if None) """ + # Load default prompts if none provided + if prompt_config is None: + prompt_config = load_prompts() + # Initialize API clients generator_client, reflector_client, curator_client = initialize_clients(api_provider) - # Initialize the three agents - self.generator = Generator(generator_client, api_provider, generator_model, max_tokens) - self.reflector = Reflector(reflector_client, api_provider, reflector_model, max_tokens) - self.curator = Curator(curator_client, api_provider, curator_model, max_tokens) + # Initialize the three agents with prompts from config + self.generator = Generator( + generator_client, api_provider, generator_model, max_tokens, + generator_prompt=prompt_config.generator_prompt + ) + self.reflector = Reflector( + reflector_client, api_provider, reflector_model, max_tokens, + reflector_prompt=prompt_config.reflector_prompt, + reflector_prompt_no_gt=prompt_config.reflector_prompt_no_gt + ) + self.curator = Curator( + curator_client, api_provider, curator_model, max_tokens, + curator_prompt=prompt_config.curator_prompt, + curator_prompt_no_gt=prompt_config.curator_prompt_no_gt + ) # Initialize bulletpoint analyzer if requested and available self.use_bulletpoint_analyzer = use_bulletpoint_analyzer diff --git a/ace/core/curator.py b/ace/core/curator.py index d1e4cf7d..5a71c216 100644 --- a/ace/core/curator.py +++ b/ace/core/curator.py @@ -16,21 +16,27 @@ class Curator: Curator agent that manages the playbook by adding, updating, merging, and deleting bullets based on reflection feedback. 
""" - - def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096): + + def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096, + curator_prompt: Optional[str] = None, + curator_prompt_no_gt: Optional[str] = None): """ Initialize the Curator agent. - + Args: api_client: OpenAI client for LLM calls api_provider: API provider for LLM calls model: Model name to use for curation max_tokens: Maximum tokens for curation + curator_prompt: Custom curator prompt (optional, uses default if None) + curator_prompt_no_gt: Custom curator prompt without ground truth (optional) """ self.api_client = api_client self.api_provider = api_provider self.model = model self.max_tokens = max_tokens + self.curator_prompt = curator_prompt or CURATOR_PROMPT + self.curator_prompt_no_gt = curator_prompt_no_gt or CURATOR_PROMPT_NO_GT def curate( self, @@ -72,7 +78,7 @@ def curate( # Select the appropriate prompt if use_ground_truth: - prompt = CURATOR_PROMPT.format( + prompt = self.curator_prompt.format( current_step=current_step, total_samples=total_samples, token_budget=token_budget, @@ -82,7 +88,7 @@ def curate( question_context=question_context ) else: - prompt = CURATOR_PROMPT_NO_GT.format( + prompt = self.curator_prompt_no_gt.format( current_step=current_step, total_samples=total_samples, token_budget=token_budget, diff --git a/ace/core/generator.py b/ace/core/generator.py index 3ceebecc..63bf0217 100644 --- a/ace/core/generator.py +++ b/ace/core/generator.py @@ -14,21 +14,24 @@ class Generator: Generator agent that produces answers to questions using knowledge from a playbook and previous reflections. """ - - def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096): + + def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096, + generator_prompt: Optional[str] = None): """ Initialize the Generator agent. 
- + Args: api_client: OpenAI client for LLM calls api_provider: API provider for LLM calls model: Model name to use for generation max_tokens: Maximum tokens for generation + generator_prompt: Custom generator prompt (optional, uses default if None) """ self.api_client = api_client self.api_provider = api_provider self.model = model self.max_tokens = max_tokens + self.generator_prompt = generator_prompt or GENERATOR_PROMPT def generate( self, @@ -56,7 +59,7 @@ def generate( Tuple of (full_response, bullet_ids_used, call_info) """ # Format the prompt - prompt = GENERATOR_PROMPT.format(playbook, reflection, question, context) + prompt = self.generator_prompt.format(playbook, reflection, question, context) response, call_info = timed_llm_call( self.api_client, diff --git a/ace/core/reflector.py b/ace/core/reflector.py index 134ea4c3..372a2f25 100644 --- a/ace/core/reflector.py +++ b/ace/core/reflector.py @@ -14,21 +14,27 @@ class Reflector: Reflector agent that analyzes the generator's reasoning and tags bullets as helpful, harmful, or neutral. """ - - def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096): + + def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096, + reflector_prompt: Optional[str] = None, + reflector_prompt_no_gt: Optional[str] = None): """ Initialize the Reflector agent. 
- + Args: api_client: OpenAI client for LLM calls api_provider: API provider for LLM calls model: Model name to use for reflection max_tokens: Maximum tokens for reflection + reflector_prompt: Custom reflector prompt (optional, uses default if None) + reflector_prompt_no_gt: Custom reflector prompt without ground truth (optional) """ self.api_client = api_client self.api_provider = api_provider self.model = model self.max_tokens = max_tokens + self.reflector_prompt = reflector_prompt or REFLECTOR_PROMPT + self.reflector_prompt_no_gt = reflector_prompt_no_gt or REFLECTOR_PROMPT_NO_GT def reflect( self, @@ -63,7 +69,7 @@ def reflect( """ # Select the appropriate prompt if use_ground_truth and ground_truth: - prompt = REFLECTOR_PROMPT.format( + prompt = self.reflector_prompt.format( question, reasoning_trace, predicted_answer, @@ -72,7 +78,7 @@ def reflect( bullets_used ) else: - prompt = REFLECTOR_PROMPT_NO_GT.format( + prompt = self.reflector_prompt_no_gt.format( question, reasoning_trace, predicted_answer, diff --git a/ace/prompts/__init__.py b/ace/prompts/__init__.py index d292ffb0..a3351bcf 100644 --- a/ace/prompts/__init__.py +++ b/ace/prompts/__init__.py @@ -6,16 +6,22 @@ from .generator import * from .reflector import * from .curator import * +from .config import PromptConfig +from .loader import load_prompts __all__ = [ # Generator prompts 'GENERATOR_PROMPT', - + # Reflector prompts 'REFLECTOR_PROMPT', 'REFLECTOR_PROMPT_NO_GT', - + # Curator prompts 'CURATOR_PROMPT', 'CURATOR_PROMPT_NO_GT', + + # Prompt configuration + 'PromptConfig', + 'load_prompts', ] \ No newline at end of file diff --git a/ace/prompts/config.py b/ace/prompts/config.py new file mode 100644 index 00000000..070dd700 --- /dev/null +++ b/ace/prompts/config.py @@ -0,0 +1,16 @@ +""" +PromptConfig dataclass for ACE system. +Holds all agent prompts in a single configuration object. 
+""" + +from dataclasses import dataclass + + +@dataclass +class PromptConfig: + """Configuration dataclass holding all agent prompts.""" + generator_prompt: str + reflector_prompt: str + reflector_prompt_no_gt: str + curator_prompt: str + curator_prompt_no_gt: str diff --git a/ace/prompts/loader.py b/ace/prompts/loader.py new file mode 100644 index 00000000..67ffc8e6 --- /dev/null +++ b/ace/prompts/loader.py @@ -0,0 +1,113 @@ +""" +Prompt loader for ACE system. +Provides functionality to load task-specific prompts with fallback to defaults. +""" + +import os +import importlib.util +from typing import Optional + +from .config import PromptConfig +from .generator import GENERATOR_PROMPT +from .reflector import REFLECTOR_PROMPT, REFLECTOR_PROMPT_NO_GT +from .curator import CURATOR_PROMPT, CURATOR_PROMPT_NO_GT + + +def _load_prompt_from_file(file_path: str, variable_name: str) -> Optional[str]: + """ + Load a prompt variable from a Python file. + + Args: + file_path: Path to the Python file containing the prompt + variable_name: Name of the variable to load (e.g., 'GENERATOR_PROMPT') + + Returns: + The prompt string if found, None otherwise + """ + if not os.path.exists(file_path): + return None + + try: + spec = importlib.util.spec_from_file_location("prompt_module", file_path) + if spec is None or spec.loader is None: + return None + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + if hasattr(module, variable_name): + return getattr(module, variable_name) + return None + except Exception as e: + print(f"Warning: Failed to load {variable_name} from {file_path}: {e}") + return None + + +def load_prompts(task_prompts_dir: Optional[str] = None) -> PromptConfig: + """ + Load prompts with task-specific overrides. + + Checks if task-specific prompts exist in the given directory and loads them. + Falls back to default prompts for any missing prompts. 
+ + Args: + task_prompts_dir: Path to task's prompts directory (e.g., "eval/my_task/prompts") + If None or doesn't exist, uses all defaults. + + Returns: + PromptConfig with loaded prompts (custom where available, defaults otherwise) + + Example: + # Load with task-specific prompts + config = load_prompts("eval/finance/prompts") + + # Load with all defaults + config = load_prompts() + """ + # Start with default prompts + generator_prompt = GENERATOR_PROMPT + reflector_prompt = REFLECTOR_PROMPT + reflector_prompt_no_gt = REFLECTOR_PROMPT_NO_GT + curator_prompt = CURATOR_PROMPT + curator_prompt_no_gt = CURATOR_PROMPT_NO_GT + + # If task_prompts_dir is provided and exists, try to load custom prompts + if task_prompts_dir and os.path.isdir(task_prompts_dir): + # Try to load generator prompt + generator_file = os.path.join(task_prompts_dir, "generator.py") + custom_generator = _load_prompt_from_file(generator_file, "GENERATOR_PROMPT") + if custom_generator is not None: + generator_prompt = custom_generator + print(f"Loaded custom GENERATOR_PROMPT from {generator_file}") + + # Try to load reflector prompts + reflector_file = os.path.join(task_prompts_dir, "reflector.py") + custom_reflector = _load_prompt_from_file(reflector_file, "REFLECTOR_PROMPT") + if custom_reflector is not None: + reflector_prompt = custom_reflector + print(f"Loaded custom REFLECTOR_PROMPT from {reflector_file}") + + custom_reflector_no_gt = _load_prompt_from_file(reflector_file, "REFLECTOR_PROMPT_NO_GT") + if custom_reflector_no_gt is not None: + reflector_prompt_no_gt = custom_reflector_no_gt + print(f"Loaded custom REFLECTOR_PROMPT_NO_GT from {reflector_file}") + + # Try to load curator prompts + curator_file = os.path.join(task_prompts_dir, "curator.py") + custom_curator = _load_prompt_from_file(curator_file, "CURATOR_PROMPT") + if custom_curator is not None: + curator_prompt = custom_curator + print(f"Loaded custom CURATOR_PROMPT from {curator_file}") + + custom_curator_no_gt = 
_load_prompt_from_file(curator_file, "CURATOR_PROMPT_NO_GT") + if custom_curator_no_gt is not None: + curator_prompt_no_gt = custom_curator_no_gt + print(f"Loaded custom CURATOR_PROMPT_NO_GT from {curator_file}") + + return PromptConfig( + generator_prompt=generator_prompt, + reflector_prompt=reflector_prompt, + reflector_prompt_no_gt=reflector_prompt_no_gt, + curator_prompt=curator_prompt, + curator_prompt_no_gt=curator_prompt_no_gt + ) diff --git a/eval/finance/run.py b/eval/finance/run.py index a263c871..231b40c1 100644 --- a/eval/finance/run.py +++ b/eval/finance/run.py @@ -11,6 +11,7 @@ from .data_processor import DataProcessor from ace import ACE +from ace.prompts import load_prompts from utils import initialize_clients def parse_args(): @@ -78,7 +79,12 @@ def parse_args(): # Output configuration parser.add_argument("--save_path", type=str, required=True, help="Directory to save results") - + + # Task-specific prompts configuration + parser.add_argument("--task_prompts_dir", type=str, default=None, + help="Path to task-specific prompts directory. 
" + "If not specified, uses default prompts.") + return parser.parse_args() def load_data(data_path: str): @@ -192,7 +198,10 @@ def main(): print(f"Loaded initial playbook from {args.initial_playbook_path}\n") else: print("Using empty playbook as initial playbook\n") - + + # Load prompts - uses task-specific if provided, otherwise defaults + prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir) + # Create ACE system ace_system = ACE( api_provider=args.api_provider, @@ -202,7 +211,8 @@ def main(): max_tokens=args.max_tokens, initial_playbook=initial_playbook, use_bulletpoint_analyzer=args.use_bulletpoint_analyzer, - bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold + bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold, + prompt_config=prompt_config ) # Prepare configuration diff --git a/eval/mind2web/run.py b/eval/mind2web/run.py index c30e0880..77395699 100644 --- a/eval/mind2web/run.py +++ b/eval/mind2web/run.py @@ -4,6 +4,7 @@ from .data_processor import DataProcessor, load_data from ace import ACE +from ace.prompts import load_prompts def parse_args(): @@ -76,6 +77,11 @@ def parse_args(): parser.add_argument("--save_path", type=str, required=True, help="Directory to save results") + # Task-specific prompts configuration + parser.add_argument("--task_prompts_dir", type=str, default=None, + help="Path to task-specific prompts directory. 
" + "If not specified, uses default prompts.") + return parser.parse_args() @@ -169,6 +175,9 @@ def main(): else: print("Using empty playbook as initial playbook\n") + # Load prompts - uses task-specific if provided, otherwise defaults + prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir) + # Create ACE system ace_system = ACE( api_provider=args.api_provider, @@ -178,7 +187,8 @@ def main(): max_tokens=args.max_tokens, initial_playbook=initial_playbook, use_bulletpoint_analyzer=args.use_bulletpoint_analyzer, - bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold + bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold, + prompt_config=prompt_config ) # Prepare configuration diff --git a/eval/mind2web2/run.py b/eval/mind2web2/run.py index 73be68be..b2dd5729 100644 --- a/eval/mind2web2/run.py +++ b/eval/mind2web2/run.py @@ -4,6 +4,7 @@ from .data_processor import DataProcessor, load_data from ace import ACE +from ace.prompts import load_prompts def parse_args(): @@ -76,6 +77,11 @@ def parse_args(): parser.add_argument("--save_path", type=str, required=True, help="Directory to save results") + # Task-specific prompts configuration + parser.add_argument("--task_prompts_dir", type=str, default=None, + help="Path to task-specific prompts directory. 
" + "If not specified, uses default prompts.") + return parser.parse_args() @@ -169,6 +175,9 @@ def main(): else: print("Using empty playbook as initial playbook\n") + # Load prompts - uses task-specific if provided, otherwise defaults + prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir) + # Create ACE system ace_system = ACE( api_provider=args.api_provider, @@ -178,7 +187,8 @@ def main(): max_tokens=args.max_tokens, initial_playbook=initial_playbook, use_bulletpoint_analyzer=args.use_bulletpoint_analyzer, - bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold + bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold, + prompt_config=prompt_config ) # Prepare configuration From f2a1043a0495ed92ff2b9bbdb027789794c17267 Mon Sep 17 00:00:00 2001 From: snova-fengluh Date: Tue, 21 Apr 2026 16:18:59 -0700 Subject: [PATCH 2/2] use named placeholders --- EXTENDING_ACE.md | 36 +++++++++++++++++++----------------- ace/core/generator.py | 7 ++++++- ace/core/reflector.py | 22 +++++++++++----------- ace/prompts/generator.py | 8 ++++---- ace/prompts/reflector.py | 22 +++++++++++----------- 5 files changed, 51 insertions(+), 44 deletions(-) diff --git a/EXTENDING_ACE.md b/EXTENDING_ACE.md index 900ae4d7..81ad6288 100644 --- a/EXTENDING_ACE.md +++ b/EXTENDING_ACE.md @@ -491,16 +491,16 @@ Your output should be a json object with these fields: - final_answer: your concise final answer **Playbook:** -{} +{playbook} **Reflection:** -{} +{reflection} **Question:** -{} +{question} **Context:** -{} +{context} **Answer in JSON format:** {{ @@ -527,22 +527,22 @@ Analyze the model's reasoning and identify: - Evidence quality issues **Question:** -{} +{question} **Model's Reasoning:** -{} +{reasoning_trace} **Model's Answer:** -{} +{predicted_answer} **Ground Truth:** -{} +{ground_truth} **Environment Feedback:** -{} +{environment_feedback} **Playbook Bullets Used:** -{} +{bullets_used} **Your Analysis (JSON):** {{ @@ -561,16 +561,18 @@ 
REFLECTOR_PROMPT_NO_GT = """...""" ### Prompt Variables -The default prompts use `{}` placeholders that get filled via `.format()`. Ensure your custom prompts have the same number and order of placeholders: +All prompts use **named placeholders** (e.g., `{playbook}`, `{question}`) that get filled via `.format()`. Your custom prompts can use any subset of the available variables in any order. -| Prompt | Placeholders (in order) | -|--------|------------------------| -| `GENERATOR_PROMPT` | playbook, reflection, question, context | -| `REFLECTOR_PROMPT` | question, reasoning_trace, predicted_answer, ground_truth, environment_feedback, bullets_used | -| `REFLECTOR_PROMPT_NO_GT` | question, reasoning_trace, predicted_answer, environment_feedback, bullets_used | -| `CURATOR_PROMPT` | Uses named placeholders: `{current_step}`, `{total_samples}`, `{token_budget}`, `{playbook_stats}`, `{recent_reflection}`, `{current_playbook}`, `{question_context}` | +| Prompt | Available Variables | +|--------|---------------------| +| `GENERATOR_PROMPT` | `{playbook}`, `{reflection}`, `{question}`, `{context}` | +| `REFLECTOR_PROMPT` | `{question}`, `{reasoning_trace}`, `{predicted_answer}`, `{ground_truth}`, `{environment_feedback}`, `{bullets_used}` | +| `REFLECTOR_PROMPT_NO_GT` | `{question}`, `{reasoning_trace}`, `{predicted_answer}`, `{environment_feedback}`, `{bullets_used}` | +| `CURATOR_PROMPT` | `{current_step}`, `{total_samples}`, `{token_budget}`, `{playbook_stats}`, `{recent_reflection}`, `{current_playbook}`, `{question_context}` | | `CURATOR_PROMPT_NO_GT` | Same as `CURATOR_PROMPT` | +**Note**: You only need to include the variables you want to use. For example, a minimal generator prompt could just use `{question}` and `{playbook}`. Because prompts are filled via `.format()`, any literal braces (for example, in a JSON output template) must be escaped as `{{` and `}}`, and a placeholder name that is not listed above raises a `KeyError`. 
+ ### A/B Testing Prompts The task-specific prompts feature makes it easy to A/B test different prompt versions: diff --git a/ace/core/generator.py b/ace/core/generator.py index 63bf0217..16ee7b3f 100644 --- a/ace/core/generator.py +++ b/ace/core/generator.py @@ -59,7 +59,12 @@ def generate( Tuple of (full_response, bullet_ids_used, call_info) """ # Format the prompt - prompt = self.generator_prompt.format(playbook, reflection, question, context) + prompt = self.generator_prompt.format( + playbook=playbook, + reflection=reflection, + question=question, + context=context + ) response, call_info = timed_llm_call( self.api_client, diff --git a/ace/core/reflector.py b/ace/core/reflector.py index 372a2f25..74712ca4 100644 --- a/ace/core/reflector.py +++ b/ace/core/reflector.py @@ -70,20 +70,20 @@ def reflect( # Select the appropriate prompt if use_ground_truth and ground_truth: prompt = self.reflector_prompt.format( - question, - reasoning_trace, - predicted_answer, - ground_truth, - environment_feedback, - bullets_used + question=question, + reasoning_trace=reasoning_trace, + predicted_answer=predicted_answer, + ground_truth=ground_truth, + environment_feedback=environment_feedback, + bullets_used=bullets_used ) else: prompt = self.reflector_prompt_no_gt.format( - question, - reasoning_trace, - predicted_answer, - environment_feedback, - bullets_used + question=question, + reasoning_trace=reasoning_trace, + predicted_answer=predicted_answer, + environment_feedback=environment_feedback, + bullets_used=bullets_used ) response, call_info = timed_llm_call( diff --git a/ace/prompts/generator.py b/ace/prompts/generator.py index 3304ca76..0b2c47b5 100644 --- a/ace/prompts/generator.py +++ b/ace/prompts/generator.py @@ -20,16 +20,16 @@ **Playbook:** -{} +{playbook} **Reflection:** -{} +{reflection} **Question:** -{} +{question} **Context:** -{} +{context} **Answer in this exact JSON format:** {{ diff --git a/ace/prompts/reflector.py b/ace/prompts/reflector.py index 
7b9f841b..ab4b93ea 100644 --- a/ace/prompts/reflector.py +++ b/ace/prompts/reflector.py @@ -27,22 +27,22 @@ **Question:** -{} +{question} **Model's Reasoning Trace:** -{} +{reasoning_trace} **Model's Predicted Answer:** -{} +{predicted_answer} **Ground Truth Answer:** -{} +{ground_truth} **Environment Feedback:** -{} +{environment_feedback} **Part of Playbook that's used by the generator to answer the question:** -{} +{bullets_used} **Answer in this exact JSON format:** {{ @@ -84,19 +84,19 @@ **Question:** -{} +{question} **Model's Reasoning Trace:** -{} +{reasoning_trace} **Model's Predicted Answer:** -{} +{predicted_answer} **Environment Feedback:** -{} +{environment_feedback} **Part of Playbook that's used by the generator to answer the question:** -{} +{bullets_used} **Answer in this exact JSON format:** {{