From d36e7c3e4ae02aa6df0d5c7722bc65dfe885527c Mon Sep 17 00:00:00 2001
From: snova-fengluh <fenglu.hong@sambanovasystems.com>
Date: Tue, 21 Apr 2026 15:15:32 -0700
Subject: [PATCH 1/2] Add task-specific prompts feature for Generator,
 Reflector, and Curator

- Add PromptConfig dataclass (ace/prompts/config.py) to hold all agent prompts
- Add load_prompts() function (ace/prompts/loader.py) to load task-specific
  prompts with fallback to defaults
- Modify Generator, Reflector, and Curator agents to accept optional prompt
  parameters in __init__
- Add prompt_config parameter to ACE class to pass custom prompts to agents
- Add --task_prompts_dir CLI argument to all task run.py files (finance,
  mind2web, mind2web2)
- Update README.md with new argument and Custom Task-Specific Prompts section
- Update EXTENDING_ACE.md with corrected repository structure and comprehensive
  documentation for the new prompts feature

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 EXTENDING_ACE.md        | 304 +++++++++++++++++++++++++++++++---------
 README.md               |  21 +++
 ace/__init__.py         |   3 +-
 ace/ace.py              |  30 +++-
 ace/core/curator.py     |  16 ++-
 ace/core/generator.py   |  11 +-
 ace/core/reflector.py   |  16 ++-
 ace/prompts/__init__.py |  10 +-
 ace/prompts/config.py   |  16 +++
 ace/prompts/loader.py   | 113 +++++++++++++++
 eval/finance/run.py     |  16 ++-
 eval/mind2web/run.py    |  12 +-
 eval/mind2web2/run.py   |  12 +-
 13 files changed, 485 insertions(+), 95 deletions(-)
 create mode 100644 ace/prompts/config.py
 create mode 100644 ace/prompts/loader.py

diff --git a/EXTENDING_ACE.md b/EXTENDING_ACE.md
index f7dab7b1..900ae4d7 100644
--- a/EXTENDING_ACE.md
+++ b/EXTENDING_ACE.md
@@ -7,31 +7,44 @@ This guide provides detailed instructions for adding new tasks to the ACE framew
 Understanding the codebase structure will help you navigate and extend ACE effectively:
 
 ```
-ACE-pre-release/
+ace/
 ├── ace/                         # Core ACE framework
 │   ├── core/                    # Agent implementations
 │   │   ├── __init__.py
 │   │   ├── generator.py         # Generator agent
 │   │   ├── reflector.py         # Reflector agent
 │   │   ├── curator.py           # Curator agent
-│   │   └── bulletpoint_analyzer.py       # Bulletpoint analyzer for playbook de-duplication
+│   │   └── bulletpoint_analyzer.py  # Bulletpoint analyzer for playbook de-duplication
 │   ├── prompts/                 # Prompt templates
 │   │   ├── __init__.py
-│   │   ├── generator.py         # Generator prompts
-│   │   ├── reflector.py         # Reflector prompts
-│   │   └── curator.py           # Curator prompts
+│   │   ├── config.py            # PromptConfig dataclass
+│   │   ├── loader.py            # Prompt loader with task-specific override support
+│   │   ├── generator.py         # Default generator prompts
+│   │   ├── reflector.py         # Default reflector prompts
+│   │   └── curator.py           # Default curator prompts
 │   ├── __init__.py
 │   └── ace.py                   # Main ACE orchestrator
 │
-├── finance/                     # Finance domain implementation (reference example)
-│   ├── data_processor.py        # Finance data processing
-│   └── run.py                   # Unified training and evaluation script
+├── eval/                        # Evaluation tasks
+│   ├── finance/                 # Finance domain implementation (reference example)
+│   │   ├── data/                # Task data and config
+│   │   ├── data_processor.py    # Finance data processing
+│   │   ├── run.py               # Unified training and evaluation script
+│   │   └── prompts/             # (Optional) Task-specific prompts
+│   ├── mind2web/                # Mind2Web task implementation
+│   │   ├── data/
+│   │   ├── data_processor.py
+│   │   └── run.py
+│   └── mind2web2/               # Mind2Web variant (50 candidates)
+│       ├── data/
+│       ├── data_processor.py
+│       └── run.py
 │
 ├── llm.py                       # LLM utilities
 ├── logger.py                    # Logging utilities
-├── utils.py                     # General utilities 
+├── utils.py                     # General utilities
 ├── playbook_utils.py            # Playbook operations
-├── requirements.txt             # Dependencies
+├── pyproject.toml               # Project dependencies (uv)
 ├── .env.example                 # Environment template
 ├── README.md                    # Main documentation
 └── EXTENDING_ACE.md             # This file
@@ -54,23 +67,23 @@ Or with custom field names:
 {"input": "question text", "output": "answer", "metadata": {...}}
 ```
 
-Create a configuration file (e.g., `your_task/data/task_config.json`):
+Create a configuration file (e.g., `eval/your_task/data/sample_config.json`):
 ```json
 {
     "your_task_name": {
-        "train_data": "./your_task/data/train.jsonl",
-        "val_data": "./your_task/data/val.jsonl",
-        "test_data": "./your_task/data/test.jsonl"
+        "train_data": "./eval/your_task/data/train.jsonl",
+        "val_data": "./eval/your_task/data/val.jsonl",
+        "test_data": "./eval/your_task/data/test.jsonl"
     }
 }
 ```
 
 ### Step 2: Create a Data Processor
 
-Create `your_task/data_processor.py` with a `DataProcessor` class. You only need to implement **3 simple methods**:
+Create `eval/your_task/data_processor.py` with a `DataProcessor` class. You only need to implement **3 simple methods**:
 
 ```python
-# your_task/data_processor.py
+# eval/your_task/data_processor.py
 import os
 import json
 from typing import List, Dict, Any, Tuple
@@ -185,7 +198,7 @@ class DataProcessor:
 
 ### Step 3: Create a Training Script
 
-Create `your_task/run.py`:
+Create `eval/your_task/run.py`:
 
 ```python
 #!/usr/bin/env python3
@@ -196,6 +209,7 @@ from datetime import datetime
 from .data_processor import DataProcessor, load_data
 
 from ace import ACE
+from ace.prompts import load_prompts
 from utils import initialize_clients
 
 
@@ -209,47 +223,49 @@ def parse_args():
                              "'online' for online training, 'eval_only' for evaluation only")
     parser.add_argument("--save_path", type=str, required=True)
     parser.add_argument("--initial_playbook_path", type=str, default=None)
-    parser.add_argument("--config_path", type=str, default="./your_task/data/task_config.json")
-    # Add other arguments as needed (see finance/run.py for full list)
+    parser.add_argument("--task_prompts_dir", type=str, default=None,
+                        help="Path to task-specific prompts directory. "
+                             "If not specified, uses default prompts.")
+    # Add other arguments as needed (see eval/finance/run.py for full list)
     return parser.parse_args()
 
 
 def preprocess_data(task_name, config, mode):
     """Load and preprocess data."""
     processor = DataProcessor(task_name=task_name)
-    
+
     # For online and eval_only modes, only load test data
     if mode in ["online", "eval_only"]:
         train_samples = None
         val_samples = None
-        
+
         if "test_data" in config:
             test_samples = load_data(config["test_data"])
             test_samples = processor.process_task_data(test_samples)
         else:
             raise ValueError(f"{mode} mode requires test data in config.")
-        
+
         if mode == "online":
             print(f"Online mode: Training and testing on {len(test_samples)} examples")
         else:
             print(f"Eval only mode: Testing on {len(test_samples)} examples")
-    
+
     # For offline mode, load train, val, and optionally test data
     else:
         train_samples = load_data(config["train_data"])
         val_samples = load_data(config["val_data"])
         train_samples = processor.process_task_data(train_samples)
         val_samples = processor.process_task_data(val_samples)
-        
+
         if "test_data" in config:
             test_samples = load_data(config["test_data"])
             test_samples = processor.process_task_data(test_samples)
         else:
             test_samples = []
-        
+
         print(f"Offline mode: Training on {len(train_samples)} examples, "
               f"validating on {len(val_samples)}, testing on {len(test_samples)}")
-    
+
     return train_samples, val_samples, test_samples, processor
 
 
@@ -263,33 +279,37 @@ def load_initial_playbook(path):
 
 def main():
     args = parse_args()
-    
+
     # Load task configuration
-    with open(args.config_path, 'r') as f:
+    with open("./eval/your_task/data/sample_config.json", 'r') as f:
         task_config = json.load(f)
-    
+
     # Preprocess data
     train_samples, val_samples, test_samples, data_processor = \
         preprocess_data(args.task_name, task_config[args.task_name], args.mode)
-    
+
     # Load initial playbook (or use empty if None provided)
     initial_playbook = load_initial_playbook(args.initial_playbook_path)
     if initial_playbook:
         print(f"Loaded initial playbook from {args.initial_playbook_path}\n")
     else:
         print("Using empty playbook as initial playbook\n")
-    
+
+    # Load prompts - uses task-specific if provided, otherwise defaults
+    prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir)
+
     # Initialize ACE
-    api_provider = "sambanova" # or "together", "openai", "commonstack"
+    api_provider = "sambanova"  # or "together", "openai", "commonstack"
     ace_system = ACE(
         api_provider=api_provider,
         generator_model="DeepSeek-V3.1",  # Or your preferred model
         reflector_model="DeepSeek-V3.1",
         curator_model="DeepSeek-V3.1",
         max_tokens=4096,
-        initial_playbook=initial_playbook
+        initial_playbook=initial_playbook,
+        prompt_config=prompt_config
     )
-    
+
     # Configure
     config = {
         'num_epochs': 1,
@@ -306,10 +326,10 @@ def main():
         'save_dir': args.save_path,
         'test_workers': 20,
         'initial_playbook_path': args.initial_playbook_path,
-        'use_bulletpoint_analyzer': false,   # Turn on for playbook bulletpoints de-duplication and merging
+        'use_bulletpoint_analyzer': False,  # Turn on for playbook bulletpoints de-duplication and merging
         'api_provider': api_provider
     }
-    
+
     # Run using the unified interface
     results = ace_system.run(
         mode=args.mode,
@@ -319,7 +339,7 @@ def main():
         data_processor=data_processor,
         config=config
     )
-   
+
 
 if __name__ == "__main__":
     main()
@@ -329,26 +349,30 @@ if __name__ == "__main__":
 
 ```bash
 # Offline training (with automatic initial and final testing)
-python -m your_task.run \
+uv run python -m eval.your_task.run \
     --task_name your_task_name \
     --mode offline \
-    --save_path results \
-    --config_path ./your_task/data/task_config.json
+    --save_path results
 
 # Online training and testing
-python -m your_task.run \
+uv run python -m eval.your_task.run \
     --task_name your_task_name \
     --mode online \
-    --save_path results \
-    --config_path ./your_task/data/task_config.json
+    --save_path results
 
 # Evaluation only (test a pre-trained playbook)
-python -m your_task.run \
+uv run python -m eval.your_task.run \
     --task_name your_task_name \
     --mode eval_only \
     --initial_playbook_path results/ace_run_timestamp/best_playbook.txt \
-    --save_path test_results \
-    --config_path ./your_task/data/task_config.json
+    --save_path test_results
+
+# With custom task-specific prompts
+uv run python -m eval.your_task.run \
+    --task_name your_task_name \
+    --mode offline \
+    --save_path results \
+    --task_prompts_dir ./eval/your_task/prompts
 ```
 
 ## Key Implementation Notes
@@ -393,47 +417,191 @@ You can use any OpenAI-compatible model by changing the model names in the train
 
 ## Customizing Prompts
 
-To adapt ACE's prompts to your domain, modify the prompt templates in `ace/prompts/`:
+ACE supports **task-specific prompts** that override the default prompts without modifying the core framework. This allows you to customize prompts for different domains while keeping the base ACE code unchanged.
+
+### Task-Specific Prompts Directory
+
+Create a `prompts/` directory under your task folder:
+
+```
+eval/your_task/
+├── data/
+│   └── sample_config.json
+├── data_processor.py
+├── run.py
+└── prompts/                    # Optional - only create if customizing
+    ├── generator.py            # Optional - define GENERATOR_PROMPT
+    ├── reflector.py            # Optional - define REFLECTOR_PROMPT, REFLECTOR_PROMPT_NO_GT
+    └── curator.py              # Optional - define CURATOR_PROMPT, CURATOR_PROMPT_NO_GT
+```
+
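+**Important**: You only need to create files for the prompts you want to override; any prompt you do not define automatically falls back to the default in `ace/prompts/`. Prompt files are imported and executed as Python modules, so only point `--task_prompts_dir` at directories you trust.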
+
+### Using Task-Specific Prompts
+
+Pass the `--task_prompts_dir` argument when running your task:
+
+```bash
+# Use custom prompts
+uv run python -m eval.your_task.run \
+    --task_name your_task_name \
+    --mode offline \
+    --save_path results \
+    --task_prompts_dir ./eval/your_task/prompts
+```
+
+Or load prompts programmatically:
 
 ```python
-# ace/prompts/generator.py
-# Customize the generator system prompt for your domain
+from ace import ACE
+from ace.prompts import load_prompts
+
+# Load task-specific prompts (with fallback to defaults)
+prompt_config = load_prompts(task_prompts_dir="./eval/your_task/prompts")
+
+# Initialize ACE with custom prompts
+ace_system = ACE(
+    api_provider="sambanova",
+    generator_model="DeepSeek-V3.1",
+    reflector_model="DeepSeek-V3.1",
+    curator_model="DeepSeek-V3.1",
+    prompt_config=prompt_config
+)
+```
+
+### Example: Custom Generator Prompt
 
-# ace/prompts/reflector.py  
-# Customize the reflector's evaluation criteria
+Create `eval/your_task/prompts/generator.py`:
 
-# ace/prompts/curator.py
-# Customize how insights are curated into the playbook
+```python
+# eval/your_task/prompts/generator.py
+
+GENERATOR_PROMPT = """You are a medical AI assistant specializing in clinical decision support.
+
+**Instructions:**
+- Always prioritize patient safety
+- Cite medical evidence when available
+- Acknowledge uncertainty when appropriate
+- Consider differential diagnoses
+- Apply relevant strategies from the playbook
+
+Your output should be a json object with these fields:
+- reasoning: your detailed analysis
+- bullet_ids: relevant playbook bullet IDs used
+- final_answer: your concise final answer
+
+**Playbook:**
+{}
+
+**Reflection:**
+{}
+
+**Question:**
+{}
+
+**Context:**
+{}
+
+**Answer in JSON format:**
+{{
+  "reasoning": "[Your analysis]",
+  "bullet_ids": ["med-00001"],
+  "final_answer": "[Your answer]"
+}}
+"""
 ```
 
-### Example: Domain-Specific Generator Prompt
+### Example: Custom Reflector Prompt
+
+Create `eval/your_task/prompts/reflector.py`:
 
 ```python
-# In ace/prompts/generator.py
+# eval/your_task/prompts/reflector.py
+
+REFLECTOR_PROMPT = """You are a medical expert reviewing AI-generated diagnoses.
+
+Analyze the model's reasoning and identify:
+- Clinical reasoning errors
+- Missed differential diagnoses
+- Safety concerns
+- Evidence quality issues
+
+**Question:**
+{}
+
+**Model's Reasoning:**
+{}
 
-MEDICAL_GENERATOR_PROMPT = """
-You are a medical AI assistant specializing in clinical decision support.
-When answering questions:
-1. Always prioritize patient safety
-2. Cite medical evidence when available
-3. Acknowledge uncertainty when appropriate
-4. Consider differential diagnoses
+**Model's Answer:**
+{}
 
-{playbook}
+**Ground Truth:**
+{}
 
-Question: {question}
-Context: {context}
+**Environment Feedback:**
+{}
+
+**Playbook Bullets Used:**
+{}
+
+**Your Analysis (JSON):**
+{{
+  "reasoning": "[Your analysis]",
+  "error_identification": "[What went wrong]",
+  "root_cause_analysis": "[Why it went wrong]",
+  "correct_approach": "[Better approach]",
+  "key_insight": "[Lesson learned]",
+  "bullet_tags": [{{"id": "med-00001", "tag": "helpful"}}]
+}}
 """
+
+# Optional: delete the line below to keep the default no-ground-truth prompt; a replacement takes five {} placeholders (no ground-truth slot)
+REFLECTOR_PROMPT_NO_GT = """..."""
+```
+
+### Prompt Variables
+
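+Prompts are filled via `str.format()`: the generator and reflector prompts use positional `{}` placeholders, while the curator prompts use named placeholders. Ensure your custom prompts keep the same number and order of positional placeholders (and the same names for the curator), and escape any literal braces — such as those in a JSON example — as `{{` and `}}`: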
+
+| Prompt | Placeholders (in order) |
+|--------|------------------------|
+| `GENERATOR_PROMPT` | playbook, reflection, question, context |
+| `REFLECTOR_PROMPT` | question, reasoning_trace, predicted_answer, ground_truth, environment_feedback, bullets_used |
+| `REFLECTOR_PROMPT_NO_GT` | question, reasoning_trace, predicted_answer, environment_feedback, bullets_used |
+| `CURATOR_PROMPT` | Uses named placeholders: `{current_step}`, `{total_samples}`, `{token_budget}`, `{playbook_stats}`, `{recent_reflection}`, `{current_playbook}`, `{question_context}` |
+| `CURATOR_PROMPT_NO_GT` | Same as `CURATOR_PROMPT` |
+
+### A/B Testing Prompts
+
+The task-specific prompts feature makes it easy to A/B test different prompt versions:
+
+```bash
+# Test prompt version 1
+uv run python -m eval.your_task.run \
+    --task_name your_task_name \
+    --save_path results_v1 \
+    --task_prompts_dir ./prompts_v1
+
+# Test prompt version 2
+uv run python -m eval.your_task.run \
+    --task_name your_task_name \
+    --save_path results_v2 \
+    --task_prompts_dir ./prompts_v2
 ```
 
 ## Reference Implementation
 
-The `finance/` directory contains a complete working example of a custom task implementation. Use it as a reference for:
+The `eval/finance/` directory contains a complete working example of a custom task implementation. Use it as a reference for:
 
 - Data preprocessing with multiple parsing strategies (`parse_instruction_and_input`, `parse_context_and_question_formula`)
 - Task-specific evaluation logic (`_finer_answer_is_correct`, `_formula_answer_is_correct`)
 - Handling different data formats and answer types
 - Using the unified `run()` interface with different modes
+- Task-specific prompts configuration (via `--task_prompts_dir`)
+
+Other reference implementations:
+- `eval/mind2web/` - Web navigation task with element selection
+- `eval/mind2web2/` - Variant with larger candidate pool (50 candidates)
 
 ## Troubleshooting
 
@@ -443,12 +611,14 @@ The `finance/` directory contains a complete working example of a custom task im
 2. **Data format mismatches**: Verify your `process_task_data` returns the correct dictionary structure
 3. **Evaluation errors**: Check that `answer_is_correct` handles edge cases (empty strings, None values, etc.)
 4. **Memory issues**: Reduce `test_workers` parameter if running into memory constraints
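+5. **Custom prompts not loading**: Verify the `--task_prompts_dir` path exists and contains properly named Python files (`generator.py`, `reflector.py`, `curator.py`) with the correct variable names (`GENERATOR_PROMPT`, etc.). A prompt file that raises an error while being loaded is skipped with a `Warning: Failed to load ...` message on the console, and the default prompt is used instead.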
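+6. **Prompt placeholder errors**: A `KeyError`, `IndexError`, or `ValueError` raised from `.format()` usually indicates a placeholder mismatch. Ensure your custom prompts have the same number and order of `{}` placeholders (or the same named placeholders, for the curator prompts) as the default prompts, and that literal braces are escaped as `{{` and `}}`.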
 
 ### Getting Help
 
 - **Issues**: Open an issue on GitHub with details about your task and error messages
 - **Discussions**: Join the [GitHub Discussions](../../discussions) for implementation questions
-- **Examples**: Check the `finance/` directory for working reference implementations
+- **Examples**: Check the `eval/finance/` directory for working reference implementations
 
 
 ---
diff --git a/README.md b/README.md
index c9bf961d..455b5de9 100644
--- a/README.md
+++ b/README.md
@@ -218,9 +218,30 @@ uv run python -m eval.finance.run \
 | `--no_ground_truth` | Don't use ground truth in reflection | False |
 | `--use_bulletpoint_analyzer` | Enable bulletpoint analyzer for playbook deduplication and merging | False |
 | `--bulletpoint_analyzer_threshold` | Similarity threshold for bulletpoint analyzer (0-1) | 0.9 |
+| `--task_prompts_dir` | Path to task-specific prompts directory (see [Custom Prompts](#custom-task-specific-prompts)) | None |
 
 </details>
 
+### Custom Task-Specific Prompts
+
+ACE supports task-specific prompts that override the default prompts. This allows you to customize the Generator, Reflector, and Curator behavior for different domains without modifying the core ACE code.
+
+```bash
+# Use custom prompts for a specific task
+uv run python -m eval.finance.run \
+    --task_name finer \
+    --mode offline \
+    --save_path results \
+    --task_prompts_dir ./eval/finance/prompts
+```
+
+To create custom prompts, create a `prompts/` directory under your task folder with any of these files:
+- `generator.py` - Define `GENERATOR_PROMPT`
+- `reflector.py` - Define `REFLECTOR_PROMPT` and/or `REFLECTOR_PROMPT_NO_GT`
+- `curator.py` - Define `CURATOR_PROMPT` and/or `CURATOR_PROMPT_NO_GT`
+
+Only the prompts you define will override the defaults; missing prompts fall back to the built-in defaults.
+
 ## 📈 Results and Outputs
 
 Using offline training as an example, after training, ACE generates:
diff --git a/ace/__init__.py b/ace/__init__.py
index a2e108cb..699647ce 100644
--- a/ace/__init__.py
+++ b/ace/__init__.py
@@ -47,7 +47,8 @@
 
 from .ace import ACE
 from .core import Generator, Reflector, Curator, BulletpointAnalyzer
+from .prompts import PromptConfig, load_prompts
 
-__all__ = ['ACE', 'Generator', 'Reflector', 'Curator', 'BulletpointAnalyzer']
+__all__ = ['ACE', 'Generator', 'Reflector', 'Curator', 'BulletpointAnalyzer', 'PromptConfig', 'load_prompts']
 
 __version__ = "1.0.0"
\ No newline at end of file
diff --git a/ace/ace.py b/ace/ace.py
index 2d662adc..16b319f0 100644
--- a/ace/ace.py
+++ b/ace/ace.py
@@ -14,6 +14,7 @@
 from typing import Dict, List, Tuple, Optional, Any
 
 from .core import Generator, Reflector, Curator, BulletpointAnalyzer
+from .prompts import PromptConfig, load_prompts
 from playbook_utils import *
 from logger import *
 from utils import *
@@ -39,11 +40,12 @@ def __init__(
         max_tokens: int = 4096,
         initial_playbook: Optional[str] = None,
         use_bulletpoint_analyzer: bool = False,
-        bulletpoint_analyzer_threshold: float = 0.90
+        bulletpoint_analyzer_threshold: float = 0.90,
+        prompt_config: Optional[PromptConfig] = None
     ):
         """
         Initialize the ACE system.
-        
+
         Args:
             api_provider: API provider for LLM calls
             generator_model: Model name for generator
@@ -53,14 +55,30 @@ def __init__(
             initial_playbook: Initial playbook content (optional)
             use_bulletpoint_analyzer: Whether to use bulletpoint analyzer for deduplication
             bulletpoint_analyzer_threshold: Similarity threshold for bulletpoint analyzer (0-1)
+            prompt_config: PromptConfig with custom prompts (optional, uses defaults if None)
         """
+        # Load default prompts if none provided
+        if prompt_config is None:
+            prompt_config = load_prompts()
+
         # Initialize API clients
         generator_client, reflector_client, curator_client = initialize_clients(api_provider)
 
-        # Initialize the three agents
-        self.generator = Generator(generator_client, api_provider, generator_model, max_tokens)
-        self.reflector = Reflector(reflector_client, api_provider, reflector_model, max_tokens)
-        self.curator = Curator(curator_client, api_provider, curator_model, max_tokens)
+        # Initialize the three agents with prompts from config
+        self.generator = Generator(
+            generator_client, api_provider, generator_model, max_tokens,
+            generator_prompt=prompt_config.generator_prompt
+        )
+        self.reflector = Reflector(
+            reflector_client, api_provider, reflector_model, max_tokens,
+            reflector_prompt=prompt_config.reflector_prompt,
+            reflector_prompt_no_gt=prompt_config.reflector_prompt_no_gt
+        )
+        self.curator = Curator(
+            curator_client, api_provider, curator_model, max_tokens,
+            curator_prompt=prompt_config.curator_prompt,
+            curator_prompt_no_gt=prompt_config.curator_prompt_no_gt
+        )
         
         # Initialize bulletpoint analyzer if requested and available
         self.use_bulletpoint_analyzer = use_bulletpoint_analyzer
diff --git a/ace/core/curator.py b/ace/core/curator.py
index d1e4cf7d..5a71c216 100644
--- a/ace/core/curator.py
+++ b/ace/core/curator.py
@@ -16,21 +16,27 @@ class Curator:
     Curator agent that manages the playbook by adding, updating,
     merging, and deleting bullets based on reflection feedback.
     """
-    
-    def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096):
+
+    def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096,
+                 curator_prompt: Optional[str] = None,
+                 curator_prompt_no_gt: Optional[str] = None):
         """
         Initialize the Curator agent.
-        
+
         Args:
             api_client: OpenAI client for LLM calls
             api_provider: API provider for LLM calls
             model: Model name to use for curation
             max_tokens: Maximum tokens for curation
+            curator_prompt: Custom curator prompt (optional, uses default if None)
+            curator_prompt_no_gt: Custom curator prompt without ground truth (optional)
         """
         self.api_client = api_client
         self.api_provider = api_provider
         self.model = model
         self.max_tokens = max_tokens
+        self.curator_prompt = curator_prompt or CURATOR_PROMPT
+        self.curator_prompt_no_gt = curator_prompt_no_gt or CURATOR_PROMPT_NO_GT
     
     def curate(
         self,
@@ -72,7 +78,7 @@ def curate(
         
         # Select the appropriate prompt
         if use_ground_truth:
-            prompt = CURATOR_PROMPT.format(
+            prompt = self.curator_prompt.format(
                 current_step=current_step,
                 total_samples=total_samples,
                 token_budget=token_budget,
@@ -82,7 +88,7 @@ def curate(
                 question_context=question_context
             )
         else:
-            prompt = CURATOR_PROMPT_NO_GT.format(
+            prompt = self.curator_prompt_no_gt.format(
                 current_step=current_step,
                 total_samples=total_samples,
                 token_budget=token_budget,
diff --git a/ace/core/generator.py b/ace/core/generator.py
index 3ceebecc..63bf0217 100644
--- a/ace/core/generator.py
+++ b/ace/core/generator.py
@@ -14,21 +14,24 @@ class Generator:
     Generator agent that produces answers to questions using knowledge
     from a playbook and previous reflections.
     """
-    
-    def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096):
+
+    def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096,
+                 generator_prompt: Optional[str] = None):
         """
         Initialize the Generator agent.
-        
+
         Args:
             api_client: OpenAI client for LLM calls
             api_provider: API provider for LLM calls
             model: Model name to use for generation
             max_tokens: Maximum tokens for generation
+            generator_prompt: Custom generator prompt (optional, uses default if None)
         """
         self.api_client = api_client
         self.api_provider = api_provider
         self.model = model
         self.max_tokens = max_tokens
+        self.generator_prompt = generator_prompt or GENERATOR_PROMPT
     
     def generate(
         self,
@@ -56,7 +59,7 @@ def generate(
             Tuple of (full_response, bullet_ids_used, call_info)
         """
         # Format the prompt
-        prompt = GENERATOR_PROMPT.format(playbook, reflection, question, context)
+        prompt = self.generator_prompt.format(playbook, reflection, question, context)
         
         response, call_info = timed_llm_call(
             self.api_client,
diff --git a/ace/core/reflector.py b/ace/core/reflector.py
index 134ea4c3..372a2f25 100644
--- a/ace/core/reflector.py
+++ b/ace/core/reflector.py
@@ -14,21 +14,27 @@ class Reflector:
     Reflector agent that analyzes the generator's reasoning and tags
     bullets as helpful, harmful, or neutral.
     """
-    
-    def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096):
+
+    def __init__(self, api_client, api_provider, model: str, max_tokens: int = 4096,
+                 reflector_prompt: Optional[str] = None,
+                 reflector_prompt_no_gt: Optional[str] = None):
         """
         Initialize the Reflector agent.
-        
+
         Args:
             api_client: OpenAI client for LLM calls
             api_provider: API provider for LLM calls
             model: Model name to use for reflection
             max_tokens: Maximum tokens for reflection
+            reflector_prompt: Custom reflector prompt (optional, uses default if None)
+            reflector_prompt_no_gt: Custom reflector prompt without ground truth (optional)
         """
         self.api_client = api_client
         self.api_provider = api_provider
         self.model = model
         self.max_tokens = max_tokens
+        self.reflector_prompt = reflector_prompt or REFLECTOR_PROMPT
+        self.reflector_prompt_no_gt = reflector_prompt_no_gt or REFLECTOR_PROMPT_NO_GT
     
     def reflect(
         self,
@@ -63,7 +69,7 @@ def reflect(
         """
         # Select the appropriate prompt
         if use_ground_truth and ground_truth:
-            prompt = REFLECTOR_PROMPT.format(
+            prompt = self.reflector_prompt.format(
                 question,
                 reasoning_trace,
                 predicted_answer,
@@ -72,7 +78,7 @@ def reflect(
                 bullets_used
             )
         else:
-            prompt = REFLECTOR_PROMPT_NO_GT.format(
+            prompt = self.reflector_prompt_no_gt.format(
                 question,
                 reasoning_trace,
                 predicted_answer,
diff --git a/ace/prompts/__init__.py b/ace/prompts/__init__.py
index d292ffb0..a3351bcf 100644
--- a/ace/prompts/__init__.py
+++ b/ace/prompts/__init__.py
@@ -6,16 +6,22 @@
 from .generator import *
 from .reflector import *
 from .curator import *
+from .config import PromptConfig
+from .loader import load_prompts
 
 __all__ = [
     # Generator prompts
     'GENERATOR_PROMPT',
-    
+
     # Reflector prompts
     'REFLECTOR_PROMPT',
     'REFLECTOR_PROMPT_NO_GT',
-    
+
     # Curator prompts
     'CURATOR_PROMPT',
     'CURATOR_PROMPT_NO_GT',
+
+    # Prompt configuration
+    'PromptConfig',
+    'load_prompts',
 ]
\ No newline at end of file
diff --git a/ace/prompts/config.py b/ace/prompts/config.py
new file mode 100644
index 00000000..070dd700
--- /dev/null
+++ b/ace/prompts/config.py
@@ -0,0 +1,16 @@
+"""
+PromptConfig dataclass for ACE system.
+Holds all agent prompts in a single configuration object.
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class PromptConfig:
+    """Configuration dataclass holding all agent prompts."""
+    generator_prompt: str
+    reflector_prompt: str
+    reflector_prompt_no_gt: str
+    curator_prompt: str
+    curator_prompt_no_gt: str
diff --git a/ace/prompts/loader.py b/ace/prompts/loader.py
new file mode 100644
index 00000000..67ffc8e6
--- /dev/null
+++ b/ace/prompts/loader.py
@@ -0,0 +1,113 @@
+"""
+Prompt loader for ACE system.
+Provides functionality to load task-specific prompts with fallback to defaults.
+"""
+
+import os
+import importlib.util
+from typing import Optional
+
+from .config import PromptConfig
+from .generator import GENERATOR_PROMPT
+from .reflector import REFLECTOR_PROMPT, REFLECTOR_PROMPT_NO_GT
+from .curator import CURATOR_PROMPT, CURATOR_PROMPT_NO_GT
+
+
+def _load_prompt_from_file(file_path: str, variable_name: str) -> Optional[str]:
+    """
+    Load a prompt variable by executing a Python file as a throwaway module.
+
+    Args:
+        file_path: Path to the Python file containing the prompt
+        variable_name: Name of the variable to load (e.g., 'GENERATOR_PROMPT')
+
+    Returns:
+        The prompt string, or None if the file or variable is missing, the
+        file fails to execute, or the variable is not a string.
+    """
+    if not os.path.exists(file_path):
+        return None
+    try:
+        spec = importlib.util.spec_from_file_location("prompt_module", file_path)
+        if spec is None or spec.loader is None:
+            return None
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)  # runs the file's top-level code
+    except Exception as e:  # best-effort: any failure falls back to defaults
+        print(f"Warning: Failed to load {variable_name} from {file_path}: {e}")
+        return None
+    value = getattr(module, variable_name, None)
+    if value is not None and not isinstance(value, str):
+        print(f"Warning: {variable_name} in {file_path} is not a string; ignoring")
+        return None
+    return value
+
+
+def load_prompts(task_prompts_dir: Optional[str] = None) -> PromptConfig:
+    """
+    Load prompts with task-specific overrides.
+
+    Each override is read from a fixed file/variable pair inside the directory:
+        generator.py: GENERATOR_PROMPT
+        reflector.py: REFLECTOR_PROMPT, REFLECTOR_PROMPT_NO_GT
+        curator.py:   CURATOR_PROMPT, CURATOR_PROMPT_NO_GT
+    Falls back to default prompts for any missing prompts.
+
+    Note:
+        Prompt files are executed as Python code, so only point this at
+        directories you trust.
+
+    Args:
+        task_prompts_dir: Path to task's prompts directory (e.g., "eval/my_task/prompts")
+                         If None, uses all defaults. If given but not an existing
+                         directory, prints a warning and uses all defaults.
+
+    Returns:
+        PromptConfig with loaded prompts (custom where available, defaults otherwise)
+
+    Example:
+        # Load with task-specific prompts
+        config = load_prompts("eval/finance/prompts")
+
+        # Load with all defaults
+        config = load_prompts()
+    """
+    # Start with default prompts, keyed by PromptConfig field name
+    prompts = {
+        "generator_prompt": GENERATOR_PROMPT,
+        "reflector_prompt": REFLECTOR_PROMPT,
+        "reflector_prompt_no_gt": REFLECTOR_PROMPT_NO_GT,
+        "curator_prompt": CURATOR_PROMPT,
+        "curator_prompt_no_gt": CURATOR_PROMPT_NO_GT,
+    }
+
+    # (PromptConfig field, file name, variable name) for each overridable prompt
+    overrides = (
+        ("generator_prompt", "generator.py", "GENERATOR_PROMPT"),
+        ("reflector_prompt", "reflector.py", "REFLECTOR_PROMPT"),
+        ("reflector_prompt_no_gt", "reflector.py", "REFLECTOR_PROMPT_NO_GT"),
+        ("curator_prompt", "curator.py", "CURATOR_PROMPT"),
+        ("curator_prompt_no_gt", "curator.py", "CURATOR_PROMPT_NO_GT"),
+    )
+
+    if task_prompts_dir and not os.path.isdir(task_prompts_dir):
+        # A mistyped directory would otherwise silently run with the defaults
+        print(f"Warning: task prompts directory not found: {task_prompts_dir}; "
+              "using default prompts")
+    elif task_prompts_dir:
+        # reflector.py and curator.py appear twice above: one lookup (and one
+        # execution of the file) per variable
+        for field_name, file_name, variable_name in overrides:
+            prompt_file = os.path.join(task_prompts_dir, file_name)
+            custom_prompt = _load_prompt_from_file(prompt_file, variable_name)
+            if custom_prompt is not None:
+                prompts[field_name] = custom_prompt
+                print(f"Loaded custom {variable_name} from {prompt_file}")
+
+    return PromptConfig(
+        generator_prompt=prompts["generator_prompt"],
+        reflector_prompt=prompts["reflector_prompt"],
+        reflector_prompt_no_gt=prompts["reflector_prompt_no_gt"],
+        curator_prompt=prompts["curator_prompt"],
+        curator_prompt_no_gt=prompts["curator_prompt_no_gt"],
+    )
diff --git a/eval/finance/run.py b/eval/finance/run.py
index a263c871..231b40c1 100644
--- a/eval/finance/run.py
+++ b/eval/finance/run.py
@@ -11,6 +11,7 @@
 from .data_processor import DataProcessor
 
 from ace import ACE
+from ace.prompts import load_prompts
 from utils import initialize_clients
 
 def parse_args():
@@ -78,7 +79,12 @@ def parse_args():
     # Output configuration
     parser.add_argument("--save_path", type=str, required=True,
                         help="Directory to save results")
-    
+
+    # Task-specific prompts configuration
+    parser.add_argument("--task_prompts_dir", type=str, default=None,
+                        help="Path to task-specific prompts directory. "
+                             "If not specified, uses default prompts.")
+
     return parser.parse_args()
 
 def load_data(data_path: str):
@@ -192,7 +198,10 @@ def main():
         print(f"Loaded initial playbook from {args.initial_playbook_path}\n")
     else:
         print("Using empty playbook as initial playbook\n")
-    
+
+    # Load prompts - uses task-specific if provided, otherwise defaults
+    prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir)
+
     # Create ACE system
     ace_system = ACE(
         api_provider=args.api_provider,
@@ -202,7 +211,8 @@ def main():
         max_tokens=args.max_tokens,
         initial_playbook=initial_playbook,
         use_bulletpoint_analyzer=args.use_bulletpoint_analyzer,
-        bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold
+        bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold,
+        prompt_config=prompt_config
     )
     
     # Prepare configuration
diff --git a/eval/mind2web/run.py b/eval/mind2web/run.py
index c30e0880..77395699 100644
--- a/eval/mind2web/run.py
+++ b/eval/mind2web/run.py
@@ -4,6 +4,7 @@
 from .data_processor import DataProcessor, load_data
 
 from ace import ACE
+from ace.prompts import load_prompts
 
 
 def parse_args():
@@ -76,6 +77,11 @@ def parse_args():
     parser.add_argument("--save_path", type=str, required=True,
                         help="Directory to save results")
 
+    # Task-specific prompts configuration
+    parser.add_argument("--task_prompts_dir", type=str, default=None,
+                        help="Path to task-specific prompts directory. "
+                             "If not specified, uses default prompts.")
+
     return parser.parse_args()
 
 
@@ -169,6 +175,9 @@ def main():
     else:
         print("Using empty playbook as initial playbook\n")
 
+    # Load prompts - uses task-specific if provided, otherwise defaults
+    prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir)
+
     # Create ACE system
     ace_system = ACE(
         api_provider=args.api_provider,
@@ -178,7 +187,8 @@ def main():
         max_tokens=args.max_tokens,
         initial_playbook=initial_playbook,
         use_bulletpoint_analyzer=args.use_bulletpoint_analyzer,
-        bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold
+        bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold,
+        prompt_config=prompt_config
     )
 
     # Prepare configuration
diff --git a/eval/mind2web2/run.py b/eval/mind2web2/run.py
index 73be68be..b2dd5729 100644
--- a/eval/mind2web2/run.py
+++ b/eval/mind2web2/run.py
@@ -4,6 +4,7 @@
 from .data_processor import DataProcessor, load_data
 
 from ace import ACE
+from ace.prompts import load_prompts
 
 
 def parse_args():
@@ -76,6 +77,11 @@ def parse_args():
     parser.add_argument("--save_path", type=str, required=True,
                         help="Directory to save results")
 
+    # Task-specific prompts configuration
+    parser.add_argument("--task_prompts_dir", type=str, default=None,
+                        help="Path to task-specific prompts directory. "
+                             "If not specified, uses default prompts.")
+
     return parser.parse_args()
 
 
@@ -169,6 +175,9 @@ def main():
     else:
         print("Using empty playbook as initial playbook\n")
 
+    # Load prompts - uses task-specific if provided, otherwise defaults
+    prompt_config = load_prompts(task_prompts_dir=args.task_prompts_dir)
+
     # Create ACE system
     ace_system = ACE(
         api_provider=args.api_provider,
@@ -178,7 +187,8 @@ def main():
         max_tokens=args.max_tokens,
         initial_playbook=initial_playbook,
         use_bulletpoint_analyzer=args.use_bulletpoint_analyzer,
-        bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold
+        bulletpoint_analyzer_threshold=args.bulletpoint_analyzer_threshold,
+        prompt_config=prompt_config
     )
 
     # Prepare configuration

From f2a1043a0495ed92ff2b9bbdb027789794c17267 Mon Sep 17 00:00:00 2001
From: snova-fengluh <fenglu.hong@sambanovasystems.com>
Date: Tue, 21 Apr 2026 16:18:59 -0700
Subject: [PATCH 2/2] Use named placeholders in generator and reflector prompts

---
 EXTENDING_ACE.md         | 36 +++++++++++++++++++-----------------
 ace/core/generator.py    |  7 ++++++-
 ace/core/reflector.py    | 22 +++++++++++-----------
 ace/prompts/generator.py |  8 ++++----
 ace/prompts/reflector.py | 22 +++++++++++-----------
 5 files changed, 51 insertions(+), 44 deletions(-)

diff --git a/EXTENDING_ACE.md b/EXTENDING_ACE.md
index 900ae4d7..81ad6288 100644
--- a/EXTENDING_ACE.md
+++ b/EXTENDING_ACE.md
@@ -491,16 +491,16 @@ Your output should be a json object with these fields:
 - final_answer: your concise final answer
 
 **Playbook:**
-{}
+{playbook}
 
 **Reflection:**
-{}
+{reflection}
 
 **Question:**
-{}
+{question}
 
 **Context:**
-{}
+{context}
 
 **Answer in JSON format:**
 {{
@@ -527,22 +527,22 @@ Analyze the model's reasoning and identify:
 - Evidence quality issues
 
 **Question:**
-{}
+{question}
 
 **Model's Reasoning:**
-{}
+{reasoning_trace}
 
 **Model's Answer:**
-{}
+{predicted_answer}
 
 **Ground Truth:**
-{}
+{ground_truth}
 
 **Environment Feedback:**
-{}
+{environment_feedback}
 
 **Playbook Bullets Used:**
-{}
+{bullets_used}
 
 **Your Analysis (JSON):**
 {{
@@ -561,16 +561,18 @@ REFLECTOR_PROMPT_NO_GT = """..."""
 
 ### Prompt Variables
 
-The default prompts use `{}` placeholders that get filled via `.format()`. Ensure your custom prompts have the same number and order of placeholders:
+All prompts use **named placeholders** (e.g., `{playbook}`, `{question}`) that get filled via `.format()`. Your custom prompts can use any subset of the available variables in any order.
 
-| Prompt | Placeholders (in order) |
-|--------|------------------------|
-| `GENERATOR_PROMPT` | playbook, reflection, question, context |
-| `REFLECTOR_PROMPT` | question, reasoning_trace, predicted_answer, ground_truth, environment_feedback, bullets_used |
-| `REFLECTOR_PROMPT_NO_GT` | question, reasoning_trace, predicted_answer, environment_feedback, bullets_used |
-| `CURATOR_PROMPT` | Uses named placeholders: `{current_step}`, `{total_samples}`, `{token_budget}`, `{playbook_stats}`, `{recent_reflection}`, `{current_playbook}`, `{question_context}` |
+| Prompt | Available Variables |
+|--------|---------------------|
+| `GENERATOR_PROMPT` | `{playbook}`, `{reflection}`, `{question}`, `{context}` |
+| `REFLECTOR_PROMPT` | `{question}`, `{reasoning_trace}`, `{predicted_answer}`, `{ground_truth}`, `{environment_feedback}`, `{bullets_used}` |
+| `REFLECTOR_PROMPT_NO_GT` | `{question}`, `{reasoning_trace}`, `{predicted_answer}`, `{environment_feedback}`, `{bullets_used}` |
+| `CURATOR_PROMPT` | `{current_step}`, `{total_samples}`, `{token_budget}`, `{playbook_stats}`, `{recent_reflection}`, `{current_playbook}`, `{question_context}` |
 | `CURATOR_PROMPT_NO_GT` | Same as `CURATOR_PROMPT` |
 
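+**Note**: You only need to include the variables you want to use. For example, a minimal generator prompt could just use `{question}` and `{playbook}`. Positional `{}` placeholders are no longer supported (the prompts are filled with keyword arguments only), and literal braces — for example in a JSON output template — must be escaped as `{{` and `}}`.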
+
 ### A/B Testing Prompts
 
 The task-specific prompts feature makes it easy to A/B test different prompt versions:
diff --git a/ace/core/generator.py b/ace/core/generator.py
index 63bf0217..16ee7b3f 100644
--- a/ace/core/generator.py
+++ b/ace/core/generator.py
@@ -59,7 +59,12 @@ def generate(
             Tuple of (full_response, bullet_ids_used, call_info)
         """
         # Format the prompt
-        prompt = self.generator_prompt.format(playbook, reflection, question, context)
+        prompt = self.generator_prompt.format(
+            playbook=playbook,
+            reflection=reflection,
+            question=question,
+            context=context
+        )
         
         response, call_info = timed_llm_call(
             self.api_client,
diff --git a/ace/core/reflector.py b/ace/core/reflector.py
index 372a2f25..74712ca4 100644
--- a/ace/core/reflector.py
+++ b/ace/core/reflector.py
@@ -70,20 +70,20 @@ def reflect(
         # Select the appropriate prompt
         if use_ground_truth and ground_truth:
             prompt = self.reflector_prompt.format(
-                question,
-                reasoning_trace,
-                predicted_answer,
-                ground_truth,
-                environment_feedback,
-                bullets_used
+                question=question,
+                reasoning_trace=reasoning_trace,
+                predicted_answer=predicted_answer,
+                ground_truth=ground_truth,
+                environment_feedback=environment_feedback,
+                bullets_used=bullets_used
             )
         else:
             prompt = self.reflector_prompt_no_gt.format(
-                question,
-                reasoning_trace,
-                predicted_answer,
-                environment_feedback,
-                bullets_used
+                question=question,
+                reasoning_trace=reasoning_trace,
+                predicted_answer=predicted_answer,
+                environment_feedback=environment_feedback,
+                bullets_used=bullets_used
             )
         
         response, call_info = timed_llm_call(
diff --git a/ace/prompts/generator.py b/ace/prompts/generator.py
index 3304ca76..0b2c47b5 100644
--- a/ace/prompts/generator.py
+++ b/ace/prompts/generator.py
@@ -20,16 +20,16 @@
 
 
 **Playbook:**
-{}
+{playbook}
 
 **Reflection:**
-{}
+{reflection}
 
 **Question:**
-{}
+{question}
 
 **Context:**
-{}
+{context}
 
 **Answer in this exact JSON format:**
 {{
diff --git a/ace/prompts/reflector.py b/ace/prompts/reflector.py
index 7b9f841b..ab4b93ea 100644
--- a/ace/prompts/reflector.py
+++ b/ace/prompts/reflector.py
@@ -27,22 +27,22 @@
 
 
 **Question:**
-{}
+{question}
 
 **Model's Reasoning Trace:**
-{}
+{reasoning_trace}
 
 **Model's Predicted Answer:**
-{}
+{predicted_answer}
 
 **Ground Truth Answer:**
-{}
+{ground_truth}
 
 **Environment Feedback:**
-{}
+{environment_feedback}
 
 **Part of Playbook that's used by the generator to answer the question:**
-{}
+{bullets_used}
 
 **Answer in this exact JSON format:**
 {{
@@ -84,19 +84,19 @@
 
 
 **Question:**
-{}
+{question}
 
 **Model's Reasoning Trace:**
-{}
+{reasoning_trace}
 
 **Model's Predicted Answer:**
-{}
+{predicted_answer}
 
 **Environment Feedback:**
-{}
+{environment_feedback}
 
 **Part of Playbook that's used by the generator to answer the question:**
-{}
+{bullets_used}
 
 **Answer in this exact JSON format:**
 {{