Skip to content

Commit 829051e

Browse files
committed
adding tests
1 parent 840b7b4 commit 829051e

13 files changed

Lines changed: 369 additions & 6 deletions

File tree

.github/workflows/ci.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,13 @@ jobs:
8585
FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
8686
PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
8787
run: |
88-
# Run most tests in parallel, but explicitly ignore tests that manage their own servers
88+
# Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
8989
uv run pytest \
9090
-n auto \
9191
--ignore=tests/test_batch_evaluation.py \
92+
--ignore=tests/pytest/test_frozen_lake.py \
93+
--ignore=tests/pytest/test_lunar_lander.py \
94+
--ignore=tests/pytest/test_tau_bench_airline.py \
9295
--cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
9396
9497
- name: Store coverage file

eval_protocol/mcp/execution/manager.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,22 @@ async def _execute_with_semaphore(idx):
173173
# Convert trajectories to unified EvaluationRow format
174174
evaluation_rows = []
175175
for trajectory in trajectories:
176-
messages = [Message.model_validate(msg) for msg in trajectory.conversation_history]
176+
# Handle multimodal content by extracting text from complex content structures
177+
messages = []
178+
for msg in trajectory.conversation_history:
179+
# Create a copy to avoid modifying the original
180+
msg_dict = dict(msg)
181+
182+
# Handle multimodal content (list of content blocks) by extracting text
183+
if isinstance(msg_dict.get("content"), list):
184+
text_content = None
185+
for content_block in msg_dict["content"]:
186+
if isinstance(content_block, dict) and content_block.get("type") == "text":
187+
text_content = content_block.get("text")
188+
break
189+
msg_dict["content"] = text_content or ""
190+
191+
messages.append(Message.model_validate(msg_dict))
177192

178193
input_metadata = InputMetadata(
179194
row_id=trajectory.session.dataset_row.id if trajectory.session.dataset_row else None,

eval_protocol/mcp_env.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def make(
125125
sessions = []
126126

127127
# Handle evaluation_rows vs legacy approaches
128-
if internal_dataset is not None:
128+
if internal_dataset:
129129
# New evaluation_rows approach
130130
dataset_rows = []
131131
sessions = []

examples/frozen_lake_mcp/server.py

100644100755
File mode changed.

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ openevals = [
9595
fireworks = [
9696
"fireworks-ai>=0.19.10",
9797
]
98+
box2d = [
99+
"swig",
100+
"gymnasium[box2d]>=0.29.0",
101+
]
98102

99103
[project.scripts]
100104
fireworks-reward = "eval_protocol.cli:main"
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
[
2+
{
3+
"id": "airline_task_1",
4+
"user_prompt_template": "{observation}",
5+
"environment_context": {"domain": "airline"},
6+
"user_simulation": {
7+
"enabled": true,
8+
"llm": "gpt-4.1",
9+
"system_prompt": "Instructions:\n\tDomain: airline\nReason for call: You want to cancel reservation EHGLP3.\nKnown info: You are Emma Kim (user id emma_kim_9957).\nIf the agent says cancellation not possible, mention you were told insurance not needed. You don't want to cancel if no refund."
10+
},
11+
"evaluation_criteria": {
12+
"actions": [],
13+
"communicate_info": [],
14+
"nl_assertions": [
15+
"Agent should refuse to proceed with the cancellation."
16+
]
17+
}
18+
},
19+
{
20+
"id": "airline_task_2",
21+
"user_prompt_template": "{observation}",
22+
"environment_context": {"domain": "airline"},
23+
"user_simulation": {
24+
"enabled": true,
25+
"llm": "gpt-4.1",
26+
"system_prompt": "Instructions:\n\tDomain: airline\nReason for call: You recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info: You are Raj Sanchez. Your user id is raj_sanchez_7340.\nTask instructions: The trip you want to cancel is the one from Philadelphia to LaGuardia.\nIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it. You don't want to go ahead with the cancellation if you don't get a refund."
27+
},
28+
"evaluation_criteria": {
29+
"actions": [
30+
{
31+
"action_id": "1_0",
32+
"name": "get_user_details",
33+
"arguments": {
34+
"user_id": "raj_sanchez_7340"
35+
},
36+
"info": null
37+
},
38+
{
39+
"action_id": "1_1",
40+
"name": "get_reservation_details",
41+
"arguments": {
42+
"reservation_id": "Q69X3R"
43+
},
44+
"info": null
45+
}
46+
],
47+
"communicate_info": [],
48+
"nl_assertions": [
49+
"Agent should not approve the cancellation."
50+
]
51+
}
52+
},
53+
{
54+
"id": "airline_task_3",
55+
"user_prompt_template": "{observation}",
56+
"environment_context": {"domain": "airline"},
57+
"user_simulation": {
58+
"enabled": true,
59+
"llm": "gpt-4.1",
60+
"system_prompt": "Instructions:\n\tDomain: airline\nReason for call: First, try to book a flight from sf to ny. You will have 3 passengers. Halfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info: You are Noah Muller. Your user id is noah_muller_9847."
61+
},
62+
"evaluation_criteria": {
63+
"actions": [
64+
{
65+
"action_id": "2_0",
66+
"name": "get_user_details",
67+
"arguments": {
68+
"user_id": "noah_muller_9847"
69+
},
70+
"info": null
71+
},
72+
{
73+
"action_id": "2_1",
74+
"name": "get_reservation_details",
75+
"arguments": {
76+
"reservation_id": "SDZQKO"
77+
},
78+
"info": null
79+
},
80+
{
81+
"action_id": "2_2",
82+
"name": "get_reservation_details",
83+
"arguments": {
84+
"reservation_id": "4OG6T3"
85+
},
86+
"info": null
87+
},
88+
{
89+
"action_id": "2_3",
90+
"name": "send_certificate",
91+
"arguments": {
92+
"user_id": "noah_muller_9847",
93+
"amount": 50
94+
},
95+
"info": null
96+
}
97+
],
98+
"communicate_info": [],
99+
"nl_assertions": [
100+
"Agent should not offer compensation unless the user asks for it.",
101+
"Agent should check that the flight was indeed delayed.",
102+
"Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.",
103+
"Agent should offer a certificate of $50."
104+
]
105+
}
106+
}
107+
]
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"id": "run_001", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 42}}
2+
{"id": "run_002", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 123}}
3+
{"id": "run_003", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 456}}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"id": "multi_env_test_001", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -10.0, "enable_wind": false, "seed": 42}}
2+
{"id": "multi_env_test_002", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -8.0, "enable_wind": false, "seed": 123}}
3+
{"id": "multi_env_test_003", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -12.0, "enable_wind": false, "seed": 456}}

tests/pytest/test_frozen_lake.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
2+
Pytest test for frozen lake evaluation using the evaluation_test decorator.
3+
4+
This test demonstrates how to use frozen lake environments within the pytest framework,
5+
similar to the test_frozen_lake_e2e test but integrated with the pytest evaluation system.
6+
"""
7+
8+
9+
from typing import Any, Dict, List
10+
11+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams, MetricResult
12+
from eval_protocol.pytest import evaluation_test
13+
from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
14+
15+
16+
def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Build one EvaluationRow per frozen lake dataset entry.

    Each row starts with a single system message taken from the entry's
    "system_prompt" (empty string when the key is absent). The entry's "id"
    becomes the row id, and its "environment_context" and
    "user_prompt_template" are carried along in ``dataset_info``.

    Args:
        data: Parsed dataset entries, one dict per entry

    Returns:
        EvaluationRow objects in the same order as the input entries
    """
    return [
        EvaluationRow(
            messages=[Message(role="system", content=record.get("system_prompt", ""))],
            input_metadata=InputMetadata(
                row_id=record.get("id"),
                # Placeholder model name; this gets populated by the rollout processor
                completion_params=CompletionParams(model="placeholder"),
                dataset_info={
                    "environment_context": record.get("environment_context", {}),
                    "user_prompt_template": record.get("user_prompt_template", ""),
                },
            ),
        )
        for record in data
    ]
38+
39+
@evaluation_test(
    input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
    dataset_adapter=frozen_lake_to_evaluation_row,
    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
    rollout_processor=default_mcp_gym_rollout_processor,
    threshold_of_success=0.66,
    num_runs=1,
    max_concurrent_rollouts=3,
    mode="pointwise",
    server_script_path="examples/frozen_lake_mcp/server.py",
)
def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Score a frozen lake rollout by its total reward.

    The row's total reward is used directly as the evaluation score; the
    reason text reports whether that reward equals 1.0, which this test
    treats as the agent having reached the goal.

    Args:
        row: EvaluationRow object from frozen lake dataset

    Returns:
        The same EvaluationRow with ``evaluation_result`` populated
    """
    total_reward = row.get_total_reward()

    # Only a total reward of exactly 1.0 is reported as reaching the goal.
    reached_goal = total_reward == 1.0

    row.evaluation_result = EvaluateResult(
        score=total_reward,
        reason="Agent reached the goal" if reached_goal else "Agent did not reach the goal",
    )
    return row

tests/pytest/test_lunar_lander.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""
2+
Pytest test for lunar lander evaluation using the evaluation_test decorator.
3+
4+
This test demonstrates how to use lunar lander environments within the pytest framework,
5+
similar to the test_lunar_lander_e2e test but integrated with the pytest evaluation system.
6+
"""
7+
8+
from typing import Any, Dict, List
9+
10+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams
11+
from eval_protocol.pytest import evaluation_test
12+
from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
13+
14+
15+
def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Build one EvaluationRow per lunar lander dataset entry.

    Each row starts with a single system message taken from the entry's
    "system_prompt" (empty string when the key is absent). The entry's "id"
    becomes the row id, and its "environment_context" and
    "user_prompt_template" are carried along in ``dataset_info``.

    Args:
        data: Parsed dataset entries, one dict per entry

    Returns:
        EvaluationRow objects in the same order as the input entries
    """
    return [
        EvaluationRow(
            messages=[Message(role="system", content=record.get("system_prompt", ""))],
            input_metadata=InputMetadata(
                row_id=record.get("id"),
                # Placeholder model name; this gets populated by the rollout processor
                completion_params=CompletionParams(model="placeholder"),
                dataset_info={
                    "environment_context": record.get("environment_context", {}),
                    "user_prompt_template": record.get("user_prompt_template", ""),
                },
            ),
        )
        for record in data
    ]
37+
38+
39+
@evaluation_test(
    input_dataset=["tests/pytest/data/lunar_lander_dataset.jsonl"],
    dataset_adapter=lunar_lander_to_evaluation_row,
    model=["gpt-4.1"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
    rollout_processor=default_mcp_gym_rollout_processor,
    threshold_of_success=0.0,
    num_runs=1,
    mode="pointwise",
    max_concurrent_rollouts=3,
    steps=15,
    server_script_path="examples/lunar_lander_mcp/server.py",
)
def test_lunar_lander_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Score a lunar lander rollout as pass/fail from its total reward.

    A total reward of at least 200 is reported as a successful landing
    (score 1.0); anything lower is reported as a failed landing (score 0.0).
    Only the total reward is inspected here.

    Args:
        row: EvaluationRow object from lunar lander dataset

    Returns:
        The same EvaluationRow with ``evaluation_result`` populated
    """
    total_reward = row.get_total_reward()

    # Single pass/fail decision shared by the score and the reason text.
    landed = total_reward >= 200

    if landed:
        outcome = f"✅ Successful landing with reward {total_reward:.2f}"
    else:
        outcome = f"❌ Failed landing with reward {total_reward:.2f}"

    row.evaluation_result = EvaluateResult(
        score=1.0 if landed else 0.0,
        reason=outcome,
    )
    return row

0 commit comments

Comments
 (0)