Skip to content

Commit 228de8e

Browse files
committed
moving tau2 to be inside package
1 parent 871675e commit 228de8e

File tree

16 files changed

+3628
-13
lines changed

16 files changed

+3628
-13
lines changed

eval_protocol/benchmarks/data/airline_dataset.jsonl

Lines changed: 50 additions & 0 deletions
Large diffs are not rendered by default.

eval_protocol/benchmarks/data/retail_dataset.jsonl

Lines changed: 114 additions & 0 deletions
Large diffs are not rendered by default.

eval_protocol/benchmarks/test_tau_bench_retail.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,31 +27,22 @@
2727
from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator
2828
from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator
2929
from vendor.tau2.registry import registry
30+
from eval_protocol.mcp_servers.tau2 import get_server_script_path, get_system_prompt
3031

3132

3233
def _get_retail_dataset_path() -> str:
3334
"""Get the retail dataset file path."""
34-
return str(Path(__file__).parent.parent.parent / "tests" / "pytest" / "data" / "retail_dataset.jsonl")
35-
36-
37-
def _get_server_script_path() -> str:
38-
"""Get the tau2 mcp server script path."""
39-
return str(Path(__file__).parent.parent.parent / "examples" / "tau2_mcp" / "server.py")
35+
return str(Path(__file__).parent / "data" / "retail_dataset.jsonl")
4036

4137

4238
def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
4339
"""
4440
Convert entries from retail dataset to EvaluationRow objects.
4541
"""
4642
rows = []
47-
test_dir = Path(__file__).parent.parent.parent / "examples" / "tau2_mcp" / "tests"
48-
4943
# Load system prompt from file so we can change it in one place
5044
domain = data[0]["environment_context"]["domain"]
51-
prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md"
52-
53-
with open(prompt_file, "r") as f:
54-
system_prompt = f.read().strip()
45+
system_prompt = get_system_prompt(domain)
5546

5647
for row in data:
5748
eval_row = EvaluationRow(
@@ -87,7 +78,7 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
8778
num_runs=8,
8879
mode="pointwise",
8980
max_concurrent_rollouts=50,
90-
server_script_path=_get_server_script_path(),
81+
server_script_path=get_server_script_path(),
9182
exception_handler_config=ExceptionHandlerConfig(
9283
retryable_exceptions={
9384
litellm.RateLimitError,

eval_protocol/mcp_servers/__init__.py

Whitespace-only changes.
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
# Airline MCP-Gym Integration with τ²-Bench
2+
3+
This directory contains the implementation of MCP-Gym integration with τ²-Bench's airline domain for evaluating conversational AI agents on realistic flight booking scenarios.
4+
5+
## Overview
6+
7+
The airline domain is a **single-control** environment where:
8+
- **Agent**: Has access to airline booking APIs and company policies
9+
- **User**: Provides booking requirements through conversation (simulated)
10+
- **Environment**: Airline reservation system with flights, bookings, and policies
11+
- **Success Metric**: Correct final booking state and policy compliance
12+
13+
## Files Structure
14+
15+
```
16+
examples/tau2_mcp/
17+
├── README.md # This file
18+
├── tau2_mcp.py # Main MCP server with all airline tools
19+
├── tau2_adapter.py # Airline environment adapter
20+
├── airline_example.py # Comprehensive evaluation example
21+
└── server.py # Server launcher script
22+
```
23+
24+
## Key Components
25+
26+
### 1. `tau2_mcp.py` - MCP Server
27+
Implements all 14 airline tools from τ²-Bench as MCP tools:
28+
29+
- **Flight Search**: `search_direct_flight`, `search_onestop_flight`
30+
- **Booking Management**: `book_reservation`, `get_reservation_details`, `cancel_reservation`
31+
- **Reservation Updates**: `update_reservation_flights`, `update_reservation_passengers`, `update_reservation_baggages`
32+
- **User Management**: `get_user_details`, `send_certificate`
33+
- **Utility**: `list_all_airports`, `get_flight_status`, `calculate`
34+
- **Escalation**: `transfer_to_human_agents`
35+
36+
### 2. `tau2_adapter.py` - Environment Adapter
37+
Handles the integration between MCP-Gym and τ²-Bench:
38+
39+
- **Environment Creation**: Sets up τ²-Bench airline environment
40+
- **Action Execution**: Translates MCP tool calls to τ²-Bench actions
41+
- **State Management**: Tracks reservation states and task completion
42+
- **Mock Environment**: Fallback for testing without τ²-Bench
43+
44+
### 3. `airline_example.py` - Evaluation Example
45+
Complete example demonstrating:
46+
47+
- **Task Definition**: Sample airline booking scenarios
48+
- **Conversation Simulation**: Multi-turn agent interactions
49+
- **Evaluation Metrics**: Task completion scoring
50+
- **Pass@k Metrics**: Reliability measurement
51+
52+
## Installation
53+
54+
### Prerequisites
55+
56+
1. **Install τ²-Bench**:
57+
```bash
58+
git clone https://github.com/sierra-research/tau2-bench
59+
cd tau2-bench
60+
pip install -e .
61+
```
62+
63+
2. **Install eval-protocol** (if not already installed):
64+
```bash
65+
pip install eval-protocol
66+
```
67+
68+
### Setup Environment
69+
70+
```bash
71+
# Navigate to the tau2_mcp directory
72+
cd examples/tau2_mcp
73+
74+
# No additional install needed for asyncio — it ships with the Python
# standard library (do not `pip install asyncio`; the PyPI package of that
# name is an obsolete shim)
76+
```
77+
78+
## Usage
79+
80+
### 1. Quick Test
81+
82+
Run the example to verify everything works:
83+
84+
```bash
85+
python airline_example.py
86+
```
87+
88+
This will:
89+
- Test basic MCP server functionality
90+
- Run simulated conversations for 4 sample tasks
91+
- Display evaluation results and pass@1 metrics
92+
93+
### 2. Start MCP Server
94+
95+
Launch the airline MCP server:
96+
97+
```bash
98+
python tau2_mcp.py --port 8001 --seed 42
99+
```
100+
101+
### 3. Integration with τ²-Bench
102+
103+
Once τ²-Bench is installed, update the adapter to use real environment:
104+
105+
```python
106+
from tau2_bench.domains.airline import AirlineEnvironment
107+
108+
# This will automatically be used instead of mock environment
109+
env = AirlineEnvironment()
110+
```
111+
112+
### 4. Agent Evaluation
113+
114+
Create an agent policy and run evaluation:
115+
116+
```python
117+
from eval_protocol.policies import FireworksPolicy
118+
from airline_example import AirlineEvaluationExample
119+
120+
# Create agent policy
121+
policy = FireworksPolicy(
122+
model_id="accounts/fireworks/models/qwen-72b-instruct",
123+
temperature=0.1
124+
)
125+
126+
# Run evaluation
127+
evaluator = AirlineEvaluationExample()
128+
results = evaluator.run_evaluation_suite()
129+
130+
print(f"Pass@1: {results['pass_at_1']:.3f}")
131+
```
132+
133+
## Sample Tasks
134+
135+
The example includes 4 representative airline booking tasks:
136+
137+
1. **Simple Flight Booking**: Book a one-way flight from SFO to JFK
138+
2. **Modify Existing Booking**: Change flight dates on existing reservation
139+
3. **Cancel Booking**: Cancel a flight reservation
140+
4. **Complex Round-trip**: Book round-trip flight with multiple passengers
141+
142+
## Evaluation Metrics
143+
144+
### Task-Level Metrics
145+
- **Tool Usage**: Correct airline tools called
146+
- **Task Completion**: Booking successfully created/modified/cancelled
147+
- **Conversation Quality**: Appropriate multi-turn interaction
148+
149+
### Agent-Level Metrics
150+
- **Pass@1**: Success rate on first attempt
151+
- **Pass@k**: Reliability across multiple runs
152+
- **Average Score**: Overall task performance
153+
- **Policy Compliance**: Adherence to airline policies
154+
155+
## Expected Output
156+
157+
```
158+
✈️ Airline MCP-Gym Integration Example
159+
==================================================
160+
🧪 Running simple agent test...
161+
✅ list_all_airports result: {'result': {...}, 'reward': 0.0, ...}
162+
✅ search_direct_flight result: {'result': {...}, 'reward': 0.0, ...}
163+
🧪 Simple agent test completed
164+
165+
🚀 Starting Airline MCP-Gym Evaluation Suite
166+
==================================================
167+
168+
📋 Processing task: book_simple_flight
169+
🎭 Simulating conversation for task: book_simple_flight
170+
📊 Task score: 1.00
171+
🔧 Tool calls: ['search_direct_flight', 'book_reservation']
172+
173+
📋 Processing task: modify_existing_booking
174+
🎭 Simulating conversation for task: modify_existing_booking
175+
📊 Task score: 0.50
176+
🔧 Tool calls: ['get_reservation_details']
177+
178+
...
179+
180+
==================================================
181+
📈 EVALUATION SUMMARY
182+
==================================================
183+
Tasks completed: 4
184+
Average score: 0.625
185+
Total score: 2.500
186+
Pass@1 rate: 0.250
187+
188+
📋 Task Breakdown:
189+
book_simple_flight: 1.000
190+
modify_existing_booking: 0.500
191+
cancel_booking: 0.500
192+
complex_round_trip: 0.500
193+
```
194+
195+
## Next Steps
196+
197+
1. **Install τ²-Bench**: Get the real airline environment
198+
2. **Add Agent Policy**: Integrate actual LLM agent (e.g., FireworksPolicy)
199+
3. **Implement Pass@k**: Run multiple trials for reliability testing
200+
4. **Add Reward Functions**: Create detailed evaluation metrics
201+
5. **Policy Integration**: Add airline policy compliance checking
202+
6. **User Simulator**: Integrate τ²-Bench user simulator
203+
7. **Batch Evaluation**: Run on full τ²-Bench airline task set
204+
205+
## Architecture Benefits
206+
207+
This integration provides:
208+
209+
- **Realistic Evaluation**: Test agents on actual airline booking scenarios
210+
- **Standardized Tools**: Use exact τ²-Bench airline API schema
211+
- **MCP Compatibility**: Seamless integration with MCP-based agents
212+
- **Extensible Framework**: Easy to add new domains or tools
213+
- **Comprehensive Metrics**: Multiple evaluation dimensions
214+
215+
## Troubleshooting
216+
217+
### Common Issues
218+
219+
1. **τ²-Bench not installed**: The adapter will use mock environment
220+
2. **Port conflicts**: Change port with `--port` flag
221+
3. **Import errors**: Ensure all dependencies are installed
222+
223+
### Debug Mode
224+
225+
Run with verbose output:
226+
227+
```bash
228+
python tau2_mcp.py --port 8001 --seed 42 --verbose
229+
```
230+
231+
## Contributing
232+
233+
When extending this integration:
234+
235+
1. **Follow Tool Schema**: Use exact τ²-Bench API parameter names
236+
2. **Add Tests**: Include evaluation scenarios for new features
237+
3. **Update Documentation**: Document new tools and capabilities
238+
4. **Maintain Compatibility**: Ensure mock environment still works
239+
240+
## Performance Comparison
241+
242+
This integration enables direct comparison with τ²-Bench leaderboard results:
243+
244+
| Model | Pass@1 | Pass@4 | Our Framework |
245+
|-------|---------|---------|---------------|
246+
| Claude 3.5 Sonnet | 0.460 | 0.225 | ✅ Compatible |
247+
| GPT-4o | 0.420 | 0.200 | ✅ Compatible |
248+
| GPT-4o-mini | 0.225 | 0.100 | ✅ Compatible |
249+
250+
Your MCP-Gym integration can now evaluate agents on the same tasks and compare results directly with the research community.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Tau2-Bench MCP Server
3+
4+
This module provides MCP server implementations for tau2-bench domains
5+
(airline, mock, retail) along with test data and system prompts.
6+
"""
7+
8+
import importlib.resources
9+
from pathlib import Path
10+
11+
12+
def get_server_script_path() -> str:
    """Return the filesystem path to the tau2 MCP server script (``server.py``).

    Resolution order:
      1. The installed package's resources via ``importlib.resources`` —
         ``as_file`` materializes the resource on disk, which keeps this
         correct even for zip/wheel installs.
      2. A path relative to this file, as a development-checkout fallback.

    Returns:
        Absolute path to ``server.py`` as a string.
    """
    try:
        # Try to resolve from the installed package first.
        # ``files(__package__)`` raises TypeError when __package__ is None
        # (module executed as a script) and ValueError when it is empty, so
        # those are routed to the filesystem fallback as well.
        with importlib.resources.as_file(importlib.resources.files(__package__) / "server.py") as server_path:
            return str(server_path)
    except (ImportError, FileNotFoundError, TypeError, ValueError):
        # Development environment: the script sits next to this module.
        return str(Path(__file__).parent / "server.py")
21+
22+
23+
def get_system_prompt(domain: str) -> str:
    """Return the agent system-prompt text for a tau2 domain.

    Args:
        domain: Domain name (airline, mock, retail); the prompt file is
            expected to be named ``{domain}_agent_system_prompt.md``.

    Returns:
        The prompt file's contents with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If no prompt file exists for ``domain`` in either
            the installed package or the development checkout.
    """
    prompt_filename = f"{domain}_agent_system_prompt.md"

    try:
        # Prefer packaged resources. ``files(...).read_text`` replaces
        # ``importlib.resources.open_text``, which was deprecated in
        # Python 3.11 and removed in 3.13.
        resource = importlib.resources.files(f"{__package__}.tests.system_prompts") / prompt_filename
        return resource.read_text(encoding="utf-8").strip()
    except (ImportError, FileNotFoundError, TypeError, ValueError):
        # Development fallback: read straight from the source tree.
        # TypeError/ValueError cover an unset or empty __package__
        # (e.g. when this module is executed as a script).
        prompt_path = Path(__file__).parent / "tests" / "system_prompts" / prompt_filename
        return prompt_path.read_text(encoding="utf-8").strip()
43+
44+
45+
def get_retail_system_prompt() -> str:
    """Convenience wrapper: system prompt for the ``retail`` domain.

    Equivalent to ``get_system_prompt("retail")``.
    """
    return get_system_prompt("retail")
48+
49+
50+
# Re-export the main MCP classes for convenience
# NOTE(review): this import is intentionally placed at the bottom of the
# module — presumably so the lightweight path/prompt helpers above remain
# importable independent of tau2_mcp's heavier imports; confirm before moving
# it to the top of the file.
from .tau2_mcp import AirlineDomainMcp, MockDomainMcp, RetailDomainMcp

# Public API of the package: path/prompt helpers plus the domain MCP classes.
__all__ = [
    "get_server_script_path",
    "get_system_prompt",
    "get_retail_system_prompt",
    "AirlineDomainMcp",
    "MockDomainMcp",
    "RetailDomainMcp",
]

0 commit comments

Comments
 (0)