Fix tests: make the no-op rollout processor an async generator and consume rollouts with `async for`

xzrderek · xzrderek · commit edf99acb6ce5 · 2025-08-13T00:19:17.000-07:00
diff --git a/eval_protocol/pytest/default_no_op_rollout_process.py b/eval_protocol/pytest/default_no_op_rollout_process.py
@@ -1,12 +1,15 @@
-from typing import List
+from typing import AsyncIterator, List
 
 from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest.types import RolloutProcessorConfig
 
 
-def default_no_op_rollout_processor(rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[EvaluationRow]:
+async def default_no_op_rollout_processor(
+    rows: List[EvaluationRow], config: RolloutProcessorConfig
+) -> AsyncIterator[EvaluationRow]:
     """
     Simply passes input dataset through to the test function. This can be useful
     if you want to run the rollout yourself.
     """
-    return rows
+    for row in rows:
+        yield row
diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py
@@ -19,7 +19,7 @@ def read(self):
         return list(self._rows.values())
 
 
-def test_evaluation_test_decorator(monkeypatch):
+async def test_evaluation_test_decorator(monkeypatch):
     from eval_protocol.pytest.evaluation_test import evaluation_test
 
     logger = InMemoryLogger()
@@ -45,13 +45,13 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
 
     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
-        eval_fn(model="dummy/local-model", dataset_path=[ds_path])
+        await eval_fn(model="dummy/local-model", dataset_path=[ds_path])
 
     # Assertions on IDs generated by the decorator logic
     assert len(logger.read()) == 38
 
 
-def test_evaluation_test_decorator_ids_single(monkeypatch):
+async def test_evaluation_test_decorator_ids_single(monkeypatch):
     in_memory_logger = InMemoryLogger()
     unique_run_ids = set()
     unique_experiment_ids = set()
@@ -92,7 +92,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
         for params in input_params_list:
-            eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params)
+            await eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params)
 
     # Assertions on IDs generated by the decorator logic
     assert len(unique_invocation_ids) == 1
diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py
@@ -239,7 +239,9 @@ def mock_step_side_effect(env_index, tool_call):
             policy = MockPolicy(["right", "down", "right"])
 
             # Execute rollout
-            evaluation_rows = await self.execution_manager.execute_rollouts(mock_env, policy, steps=10)
+            evaluation_rows = []
+            async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=10):
+                evaluation_rows.append(row)
 
             # Validate results
             assert len(evaluation_rows) == 1, "Should have one evaluation row"
@@ -457,7 +459,9 @@ async def test_rollout_handles_control_plane_failure_gracefully(self):
 
             # Execute rollout with control plane failure
             policy = MockPolicy(["right"])
-            evaluation_rows = await self.execution_manager.execute_rollouts(mock_env, policy, steps=1)
+            evaluation_rows = []
+            async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=1):
+                evaluation_rows.append(row)
 
             # Should still work, but without control plane info
             assert len(evaluation_rows) == 1
@@ -500,15 +504,26 @@ async def test_rollout_creates_envs_from_url(self):
             mock_make.return_value = mock_env
 
             manager_instance = MockManager.return_value
-            manager_instance.execute_rollouts = AsyncMock(return_value=["ok"])
 
-            result = await ep.rollout(
+            # Mock execute_rollouts to return an async generator and track calls
+            call_args = []
+
+            async def mock_execute_rollouts(*args, **kwargs):
+                call_args.append((args, kwargs))
+                for item in ["ok"]:
+                    yield item
+
+            manager_instance.execute_rollouts = mock_execute_rollouts
+
+            result = []
+            async for row in ep.rollout(
                 "http://localhost:1234/mcp/",
                 policy,
                 dataset=dataset,
                 model_id="test_model",
                 steps=5,
-            )
+            ):
+                result.append(row)
 
             mock_make.assert_called_once_with(
                 "http://localhost:1234/mcp/",
@@ -517,14 +532,12 @@ async def test_rollout_creates_envs_from_url(self):
                 model_id="test_model",
             )
 
-            manager_instance.execute_rollouts.assert_called_once_with(
-                mock_make.return_value,
-                policy,
-                5,
-                None,
-                8,
-                None,
-            )
+            # Verify execute_rollouts was called with correct arguments
+            assert len(call_args) == 1, "execute_rollouts should be called once"
+            args, kwargs = call_args[0]
+            assert args[0] == mock_make.return_value, "First arg should be mock env"
+            assert args[1] == policy, "Second arg should be policy"
+            assert args[2] == 5, "Third arg should be steps"
 
             assert result == ["ok"]