adding tests

xzrderek · xzrderek · commit 829051e31ca8 · 2025-08-03T20:38:11.000-07:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -85,10 +85,13 @@ jobs:
           FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
           PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
         run: |
-          # Run most tests in parallel, but explicitly ignore tests that manage their own servers
+          # Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
           uv run pytest \
             -n auto \
             --ignore=tests/test_batch_evaluation.py \
+            --ignore=tests/pytest/test_frozen_lake.py \
+            --ignore=tests/pytest/test_lunar_lander.py \
+            --ignore=tests/pytest/test_tau_bench_airline.py \
             --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
 
       - name: Store coverage file
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
@@ -173,7 +173,22 @@ async def _execute_with_semaphore(idx):
         # Convert trajectories to unified EvaluationRow format
         evaluation_rows = []
         for trajectory in trajectories:
-            messages = [Message.model_validate(msg) for msg in trajectory.conversation_history]
+            # Handle multimodal content by extracting text from complex content structures
+            messages = []
+            for msg in trajectory.conversation_history:
+                # Create a copy to avoid modifying the original
+                msg_dict = dict(msg)
+                
+                # Handle multimodal content (list of content blocks): keep the text of the first "text" block, or "" if there is none
+                if isinstance(msg_dict.get("content"), list):
+                    text_content = None
+                    for content_block in msg_dict["content"]:
+                        if isinstance(content_block, dict) and content_block.get("type") == "text":
+                            text_content = content_block.get("text")
+                            break
+                    msg_dict["content"] = text_content or ""
+                
+                messages.append(Message.model_validate(msg_dict))
 
             input_metadata = InputMetadata(
                 row_id=trajectory.session.dataset_row.id if trajectory.session.dataset_row else None,
diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py
@@ -125,7 +125,7 @@ def make(
     sessions = []
 
     # Handle evaluation_rows vs legacy approaches
-    if internal_dataset is not None:
+    if internal_dataset:
         # New evaluation_rows approach
         dataset_rows = []
         sessions = []
diff --git a/examples/frozen_lake_mcp/server.py b/examples/frozen_lake_mcp/server.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -95,6 +95,10 @@ openevals = [
 fireworks = [
     "fireworks-ai>=0.19.10",
 ]
+box2d = [
+    "swig",
+    "gymnasium[box2d]>=0.29.0",
+]
 
 [project.scripts]
 fireworks-reward = "eval_protocol.cli:main"
diff --git a/tests/pytest/data/airline_dataset.jsonl b/tests/pytest/data/airline_dataset.jsonl
@@ -0,0 +1,107 @@
+[
+  {
+    "id": "airline_task_1",
+    "user_prompt_template": "{observation}",
+    "environment_context": {"domain": "airline"},
+    "user_simulation": {
+      "enabled": true,
+      "llm": "gpt-4.1",
+      "system_prompt": "Instructions:\n\tDomain: airline\nReason for call: You want to cancel reservation EHGLP3.\nKnown info: You are Emma Kim (user id emma_kim_9957).\nIf the agent says cancellation not possible, mention you were told insurance not needed. You don't want to cancel if no refund."
+    },
+    "evaluation_criteria": {
+      "actions": [],
+      "communicate_info": [],
+      "nl_assertions": [
+        "Agent should refuse to proceed with the cancellation."
+      ]
+    }
+  },
+  {
+    "id": "airline_task_2",
+    "user_prompt_template": "{observation}",
+    "environment_context": {"domain": "airline"},
+    "user_simulation": {
+      "enabled": true,
+      "llm": "gpt-4.1",
+      "system_prompt": "Instructions:\n\tDomain: airline\nReason for call: You recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info: You are Raj Sanchez. Your user id is raj_sanchez_7340.\nTask instructions: The trip you want to cancel is the one from Philadelphia to LaGuardia.\nIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it. You don't want to go ahead with the cancellation if you don't get a refund."
+    },
+    "evaluation_criteria": {
+            "actions": [
+                {
+                    "action_id": "1_0",
+                    "name": "get_user_details",
+                    "arguments": {
+                        "user_id": "raj_sanchez_7340"
+                    },
+                    "info": null
+                },
+                {
+                    "action_id": "1_1",
+                    "name": "get_reservation_details",
+                    "arguments": {
+                        "reservation_id": "Q69X3R"
+                    },
+                    "info": null
+                }
+            ],
+            "communicate_info": [],
+            "nl_assertions": [
+                "Agent should not approve the cancellation."
+            ]
+        }
+  },
+  {
+    "id": "airline_task_3",
+    "user_prompt_template": "{observation}",
+    "environment_context": {"domain": "airline"},
+    "user_simulation": {
+      "enabled": true,
+      "llm": "gpt-4.1",
+      "system_prompt": "Instructions:\n\tDomain: airline\nReason for call: First, try to book a flight from sf to ny. You will have 3 passengers. Halfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info: You are Noah Muller. Your user id is noah_muller_9847."
+    },
+    "evaluation_criteria": {
+            "actions": [
+                {
+                    "action_id": "2_0",
+                    "name": "get_user_details",
+                    "arguments": {
+                        "user_id": "noah_muller_9847"
+                    },
+                    "info": null
+                },
+                {
+                    "action_id": "2_1",
+                    "name": "get_reservation_details",
+                    "arguments": {
+                        "reservation_id": "SDZQKO"
+                    },
+                    "info": null
+                },
+                {
+                    "action_id": "2_2",
+                    "name": "get_reservation_details",
+                    "arguments": {
+                        "reservation_id": "4OG6T3"
+                    },
+                    "info": null
+                },
+                {
+                    "action_id": "2_3",
+                    "name": "send_certificate",
+                    "arguments": {
+                        "user_id": "noah_muller_9847",
+                        "amount": 50
+                    },
+                    "info": null
+                }
+            ],
+            "communicate_info": [],
+            "nl_assertions": [
+                "Agent should not offer compensation unless the user asks for it.",
+                "Agent should check that the flight was indeed delayed.",
+                "Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.",
+                "Agent should offer a certificate of $50."
+            ]
+        }
+  }
+]
diff --git a/tests/pytest/data/frozen_lake_dataset.jsonl b/tests/pytest/data/frozen_lake_dataset.jsonl
@@ -0,0 +1,3 @@
+{"id": "run_001", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the highlighted position will change on the grid. If you step on H, the episode ends with failure.  Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 42}}
+{"id": "run_002", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the highlighted position will change on the grid. If you step on H, the episode ends with failure.  Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 123}}
+{"id": "run_003", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the highlighted position will change on the grid. If you step on H, the episode ends with failure.  Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 456}}
diff --git a/tests/pytest/data/lunar_lander_dataset.jsonl b/tests/pytest/data/lunar_lander_dataset.jsonl
@@ -0,0 +1,3 @@
+{"id": "multi_env_test_001", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -10.0, "enable_wind": false, "seed": 42}}
+{"id": "multi_env_test_002", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -8.0, "enable_wind": false, "seed": 123}}
+{"id": "multi_env_test_003", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -12.0, "enable_wind": false, "seed": 456}} 
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
@@ -0,0 +1,76 @@
+"""
+Pytest test for frozen lake evaluation using the evaluation_test decorator.
+
+This test demonstrates how to use frozen lake environments within the pytest framework,
+similar to the test_frozen_lake_e2e test but integrated with the pytest evaluation system.
+"""
+
+
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams, MetricResult
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
+
+
+def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Convert frozen lake dataset entries into EvaluationRow objects, one row per entry and in input order.
+    """
+    rows = []
+    # Each row starts with only the system prompt; environment settings and the user prompt template ride along in dataset_info.
+    for entry in data:
+        row = EvaluationRow(
+            messages=[Message(role="system", content=entry.get("system_prompt", ""))],
+            input_metadata=InputMetadata(
+                row_id=entry.get("id"),
+                completion_params=CompletionParams(model="placeholder"), # This gets populated by the rollout processor
+                dataset_info={
+                    "environment_context": entry.get("environment_context", {}),
+                    "user_prompt_template": entry.get("user_prompt_template", ""),
+                }
+            )
+        )
+        
+        rows.append(row)
+    
+    return rows
+
+@evaluation_test(
+    input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
+    dataset_adapter=frozen_lake_to_evaluation_row,
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
+    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    rollout_processor=default_mcp_gym_rollout_processor,
+    threshold_of_success=0.66,
+    num_runs=1,
+    max_concurrent_rollouts=3,
+    mode="pointwise",
+    server_script_path="examples/frozen_lake_mcp/server.py",
+)
+def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow:
+    """
+    Test frozen lake evaluation using the pytest framework.
+    
+    This test evaluates how well the model can navigate the FrozenLake environment
+    by treating a total reward of exactly 1.0 as having reached the goal.
+    
+    Args:
+        row: EvaluationRow object from frozen lake dataset
+        
+    Returns:
+        The same EvaluationRow with evaluation_result set (score is the raw total reward)
+    """
+    score = row.get_total_reward()
+    # Only the wording of the reason depends on the comparison; the score itself is passed through unchanged.
+    if score == 1.0:
+        reason = "Agent reached the goal"
+    else:
+        reason = "Agent did not reach the goal"
+
+    row.evaluation_result = EvaluateResult(
+        score=score,
+        reason=reason,
+    )
+    
+    return row
diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py
@@ -0,0 +1,75 @@
+"""
+Pytest test for lunar lander evaluation using the evaluation_test decorator.
+
+This test demonstrates how to use lunar lander environments within the pytest framework,
+similar to the test_lunar_lander_e2e test but integrated with the pytest evaluation system.
+"""
+
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
+
+
+def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Convert lunar lander dataset entries into EvaluationRow objects, one row per entry and in input order.
+    """
+    rows = []
+    # Each row starts with only the system prompt; environment settings and the user prompt template ride along in dataset_info.
+    for entry in data:
+        row = EvaluationRow(
+            messages=[Message(role="system", content=entry.get("system_prompt", ""))],
+            input_metadata=InputMetadata(
+                row_id=entry.get("id"),
+                completion_params=CompletionParams(model="placeholder"), # This gets populated by the rollout processor
+                dataset_info={
+                    "environment_context": entry.get("environment_context", {}),
+                    "user_prompt_template": entry.get("user_prompt_template", ""),
+                }
+            )
+        )
+        
+        rows.append(row)
+    
+    return rows
+
+
+@evaluation_test(
+    input_dataset=["tests/pytest/data/lunar_lander_dataset.jsonl"],
+    dataset_adapter=lunar_lander_to_evaluation_row,
+    model=["gpt-4.1"],
+    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    rollout_processor=default_mcp_gym_rollout_processor,
+    threshold_of_success=0.0,  # NOTE(review): with a 0.0 threshold any score presumably passes — confirm this is intended
+    num_runs=1,
+    mode="pointwise",
+    max_concurrent_rollouts=3,
+    steps=15,
+    server_script_path="examples/lunar_lander_mcp/server.py",
+)
+def test_lunar_lander_evaluation(row: EvaluationRow) -> EvaluationRow:
+    """
+    Test lunar lander evaluation using the pytest framework.
+    
+    This test evaluates how well the model can control the lunar lander to achieve
+    a successful landing by checking whether the total reward reaches 200.
+    
+    Args:
+        row: EvaluationRow object from lunar lander dataset
+        
+    Returns:
+        The same EvaluationRow with evaluation_result set (score is 1.0 or 0.0)
+    """
+    score = row.get_total_reward()
+    # Binarize the outcome: a total reward of at least 200 counts as a successful landing, anything lower as a failure.
+    evaluation_score = 1.0 if score >= 200 else 0.0
+    reason = f"✅ Successful landing with reward {score:.2f}" if score >= 200 else f"❌ Failed landing with reward {score:.2f}"
+
+    row.evaluation_result = EvaluateResult(
+        score=evaluation_score,
+        reason=reason,
+    )
+    
+    return row 
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
@@ -53,7 +53,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
             messages=messages,
             input_metadata=InputMetadata(
                 row_id=entry.get("id"),
-                completion_params=CompletionParams(model="placeholder"),
+                completion_params=CompletionParams(model="placeholder"), # This gets populated by the rollout processor
                 dataset_info={
                     "environment_context": entry.get("environment_context"),
                     "user_simulation": user_simulation,
@@ -77,7 +77,6 @@ def save_single_trajectory(trajectory_record: Dict, row_id: str, output_dir: str
     safe_model_id = trajectory_record["model_id"].replace("/", "_").replace("\\", "_")
     
     # Use row_id if provided, otherwise fall back to scenario_id
-
     filename = f"{safe_model_id}_{row_id}_trajectory.json"
     filepath = output_path / filename
 
diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py
@@ -508,6 +508,7 @@ async def test_rollout_creates_envs_from_url(self):
 
             mock_make.assert_called_once_with(
                 "http://localhost:1234/mcp/",
+                evaluation_rows=None,
                 dataset=dataset,
                 model_id="test_model",
             )
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+{"id": "run_001", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the highlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 42}}
	`2`	+{"id": "run_002", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the highlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 123}}
	`3`	+{"id": "run_003", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the highlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 456}}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+{"id": "multi_env_test_001", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -10.0, "enable_wind": false, "seed": 42}}
	`2`	+{"id": "multi_env_test_002", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -8.0, "enable_wind": false, "seed": 123}}
	`3`	+{"id": "multi_env_test_003", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -12.0, "enable_wind": false, "seed": 456}}