Address comments: rollout processors return asyncio tasks instead of async iterators

xzrderek · xzrderek · commit 062e44877856 · 2025-08-14T00:39:02.000-07:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -92,6 +92,7 @@ jobs:
             --ignore=tests/pytest/test_frozen_lake.py \
             --ignore=tests/pytest/test_lunar_lander.py \
             --ignore=tests/pytest/test_tau_bench_airline.py \
+            --ignore=tests/pytest/test_apps_coding.py \
             --ignore=tests/test_tau_bench_airline_smoke.py \
             --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
 
diff --git a/eval_protocol/benchmarks/suites/gpqa.py b/eval_protocol/benchmarks/suites/gpqa.py
@@ -1,3 +1,4 @@
+import asyncio
 import csv
 import io
 import re
@@ -60,7 +61,7 @@ def _strip_gt_messages(msgs: List[Message]) -> List[Message]:
     return [m for m in msgs if not (m.role == "system" and (m.content or "").startswith("__GT__:"))]
 
 
-async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) -> List[EvaluationRow]:
+def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) -> List[asyncio.Task[EvaluationRow]]:
     """Preprocess rows to set ground_truth and remove __GT__ messages, then delegate to default processor."""
     processed: List[EvaluationRow] = []
     for r in rows:
@@ -72,7 +73,7 @@ async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) ->
                 m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))
             ]
         processed.append(r)
-    return await default_single_turn_rollout_processor(processed, config)
+    return default_single_turn_rollout_processor(processed, config)
 
 
 @export_benchmark("gpqa")
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
@@ -35,15 +35,15 @@ class ExecutionManager:
     Manage rollout for MCP environments.
     """
 
-    async def execute_rollouts(
+    def execute_rollouts(
         self,
         envs: "GeneralMCPVectorEnv",
         policy: Union["LLMBasePolicy", Callable],
         steps: int = 512,
         openai_format_log_file: Optional[str] = None,
         max_concurrent_rollouts: int = 8,
         evaluation_rows: Optional[List[EvaluationRow]] = None,
-    ) -> AsyncIterator[EvaluationRow]:
+    ) -> List[asyncio.Task[EvaluationRow]]:
         """
         Execute general rollouts using tool calling interface with automatic record/playback.
 
@@ -66,7 +66,7 @@ async def execute_rollouts(
             - Set and file exists: Playback mode (uses recorded data)
 
         Returns:
-            AsyncIterator of EvaluationRow objects with unified evaluation data format
+            List of asyncio.Task objects (one per index in range(envs.n)), each resolving to an EvaluationRow; the caller is responsible for awaiting them
         """
         start_time = time.time()
 
@@ -151,18 +151,7 @@ async def _execute_with_semaphore(idx):
 
         # Create all tasks
         tasks = [asyncio.create_task(_execute_with_semaphore(i)) for i in range(envs.n)]
-
-        # Yield results as they complete (note that they're not necessarily in original order)
-        try:
-            for task in asyncio.as_completed(tasks):
-                try:
-                    yield await task
-                except Exception:
-                    logger.exception("Error processing rollout")
-        finally:
-            for t in tasks:
-                t.cancel()
-            await asyncio.gather(*tasks, return_exceptions=True)
+        return tasks
 
     async def _execute_rollout(
         self,
diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py
@@ -236,7 +236,7 @@ def make(
     return mcp_envs
 
 
-async def rollout(
+def rollout(
     envs: GeneralMCPVectorEnv,
     policy: Union[FireworksPolicy, LLMBasePolicy, Callable],
     *,
@@ -246,7 +246,7 @@ async def rollout(
     steps: int = 512,
     openai_format_log_file: Optional[str] = None,
     max_concurrent_rollouts: int = 8,
-) -> AsyncIterator[EvaluationRow]:
+) -> List[asyncio.Task[EvaluationRow]]:
     """
     Execute general rollouts using tool calling interface with automatic record/playback.
 
@@ -274,14 +274,14 @@ async def rollout(
         - Set and file exists: Playback mode (uses recorded data)
 
     Returns:
-        List of EvaluationRow objects
+        List of asyncio.Task objects, each resolving to an EvaluationRow; the caller is responsible for awaiting them
 
     Example:
         # Live mode
-        evaluation_rows = await ep.rollout(envs, policy)
+        tasks = ep.rollout(envs, policy)  # plain call, not awaited; await the returned tasks to obtain rows
 
         # Create environments automatically
-        trajectories = await ep.rollout(
+        tasks = ep.rollout(
             "http://localhost:8000/mcp/",
             policy,
             evaluation_rows=my_evaluation_rows,
@@ -290,26 +290,26 @@ async def rollout(
 
         # Recording mode
         os.environ["EP_PLAYBACK_FILE"] = "record.jsonl"
-        evaluation_rows = await ep.rollout(envs, policy, openai_format_log_file="sft_data.jsonl")
+        tasks = ep.rollout(envs, policy, openai_format_log_file="sft_data.jsonl")
 
         # Playback mode (after recording file exists)
-        evaluation_rows = await ep.rollout(envs, policy)
+        tasks = ep.rollout(envs, policy)
     """
     # Automatically create environments if a base URL is provided
     if isinstance(envs, str):
         if evaluation_rows is None and dataset is None:
             raise ValueError("Either 'evaluation_rows' or 'dataset' must be provided when envs is a URL")
 
         auto_model_id = model_id or getattr(policy, "model_id", "unknown")
-        envs = await make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
+        envs = make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
 
     # Use the new ExecutionManager for execution
     execution_manager = ExecutionManager()
 
-    async for evaluation_row in execution_manager.execute_rollouts(
+    tasks = execution_manager.execute_rollouts(
         envs, policy, steps, openai_format_log_file, max_concurrent_rollouts, evaluation_rows
-    ):
-        yield evaluation_row
+    )
+    return tasks
 
 
 async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]:
@@ -336,7 +336,7 @@ async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]:
             policy = FireworksPolicy("test-model")
 
             # Run short rollout
-            evaluation_rows = await rollout(envs, policy=policy, steps=10)
+            evaluation_rows = rollout(envs, policy=policy, steps=10)
 
             if evaluation_rows and len(evaluation_rows[0].messages) > 1:
                 results["successful"] += 1
diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
@@ -115,10 +115,10 @@ def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[Tex
         return tool_result.content
 
 
-async def default_agent_rollout_processor(
+def default_agent_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> AsyncIterator[EvaluationRow]:
-    """Process agent rollouts with bounded concurrency and yield as they complete."""
+) -> List[asyncio.Task[EvaluationRow]]:
+    """Create agent rollout tasks and return them for external handling."""
 
     max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
     semaphore = asyncio.Semaphore(max_concurrent)
@@ -138,24 +138,9 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
     async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
         async with semaphore:
-            try:
-                return await process_row(r)
-            except Exception as e:
-                r.rollout_status.status = "error"
-                r.rollout_status.termination_reason = str(e)
-                return r
-
-    # Create all tasks
-    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
+            result = await process_row(r)
+            return result
 
-    # Yield results as they complete (note that they're not necessarily in original order)
-    try:
-        for task in asyncio.as_completed(tasks):
-            try:
-                yield await task
-            except Exception:
-                logger.exception("Error processing row")
-    finally:
-        for t in tasks:
-            t.cancel()
-        await asyncio.gather(*tasks, return_exceptions=True)
+    # Create and return tasks for external handling
+    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
+    return tasks
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -194,14 +194,14 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         return False  # Don't suppress exceptions
 
 
-async def default_mcp_gym_rollout_processor(
+def default_mcp_gym_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> AsyncIterator[EvaluationRow]:
+) -> List[asyncio.Task[EvaluationRow]]:
     """
     Rollout processor for tau bench environments.
 
-    This processor starts an MCP server, creates tau bench environments, and runs rollouts
-    using the eval_protocol framework, yielding results as they complete.
+    This processor starts an MCP server, creates tau bench environments, and returns the
+    rollout tasks created by ep.rollout without awaiting them.
 
     Args:
         rows: List of EvaluationRow objects containing messages and dataset info in input_metadata
@@ -210,7 +210,7 @@ async def default_mcp_gym_rollout_processor(
                 - start_server (bool): If True, create fresh server and environments. If False, reuse existing ones. Default: True.
 
     Returns:
-        AsyncIterator of EvaluationRow objects with completed conversations
+        List of asyncio.Task objects, each resolving to an EvaluationRow; the caller is responsible for awaiting them
     """
     start_server = config.kwargs.get("start_server", True) if config.kwargs else True
     if start_server:
@@ -260,15 +260,15 @@ async def default_mcp_gym_rollout_processor(
         envs = CURRENT_RUN_STATE["envs"]
         policy = CURRENT_RUN_STATE["policy"]
 
-    # Run rollout with environments and policy (automatically resets environments)
-    async for evaluation_row in ep.rollout(
+    # Get rollout tasks from ep.rollout
+    tasks = ep.rollout(
         envs,
         policy=policy,
         evaluation_rows=rows,
         steps=config.steps,
         max_concurrent_rollouts=config.max_concurrent_rollouts,
-    ):
-        yield evaluation_row
+    )
+    return tasks
 
 
 # Add cleanup method directly to the function object
diff --git a/eval_protocol/pytest/default_no_op_rollout_process.py b/eval_protocol/pytest/default_no_op_rollout_process.py
@@ -1,15 +1,21 @@
-from typing import AsyncIterator, List
+import asyncio
+from typing import List
 
 from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest.types import RolloutProcessorConfig
 
 
-async def default_no_op_rollout_processor(
+def default_no_op_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> AsyncIterator[EvaluationRow]:
+) -> List[asyncio.Task[EvaluationRow]]:
     """
     Simply passes input dataset through to the test function. This can be useful
     if you want to run the rollout yourself.
     """
-    for row in rows:
-        yield row
+
+    async def return_row(row: EvaluationRow) -> EvaluationRow:
+        return row
+
+    # Create tasks that immediately return the rows (no-op)
+    tasks = [asyncio.create_task(return_row(row)) for row in rows]
+    return tasks
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -15,10 +15,10 @@
 logger = logging.getLogger(__name__)
 
 
-async def default_single_turn_rollout_processor(
+def default_single_turn_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> AsyncIterator[EvaluationRow]:
-    """Generate a single response from any supported model provider using LiteLLM."""
+) -> List[asyncio.Task[EvaluationRow]]:
+    """Generate single turn rollout tasks and return them for external handling."""
 
     # Quiet LiteLLM logs in test runs unless user overrode
     try:
@@ -103,30 +103,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
         default_logger.log(row)
         return row
 
-    # Process rows with bounded concurrency and yield as they complete
+    # Process rows with bounded concurrency
     max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
         async with semaphore:
-            try:
-                return await process_row(r)
-            except Exception as e:
-                r.rollout_status.status = "error"
-                r.rollout_status.termination_reason = str(e)
-                return r
-
-    # Create all tasks
-    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
+            result = await process_row(r)
+            return result
 
-    # Yield results as they complete (note that they're not necessarily in original order)
-    try:
-        for task in asyncio.as_completed(tasks):
-            try:
-                yield await task
-            except Exception:
-                logger.exception("Error processing row")
-    finally:
-        for t in tasks:
-            t.cancel()
-        await asyncio.gather(*tasks, return_exceptions=True)
+    # Create and return tasks for external handling
+    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
+    return tasks
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -304,21 +304,37 @@ async def retry_handler(failed_row: EvaluationRow):
                     # add kwargs start_server=False to config so we don't start new MCP server
                     retry_config = replace(config, kwargs={**(config.kwargs or {}), "start_server": False})
 
-                    retry_call = rollout_processor([failed_row], retry_config)
+                    retry_tasks = rollout_processor([failed_row], retry_config)
 
-                    retry_result = await anext(retry_call)
-                    if retry_result.rollout_status and retry_result.rollout_status.status == "finished":
+                    try:
+                        retry_result = await retry_tasks[0]
+                        retry_result.rollout_status.status = "finished"
                         await queue.put(retry_result)
-                    else:
-                        asyncio.create_task(retry_handler(retry_result))  # retry failed, spawn another retry
+                    except Exception as e:
+                        failed_row.rollout_status.status = "error"
+                        failed_row.rollout_status.termination_reason = str(e)
+                        asyncio.create_task(retry_handler(failed_row))  # retry failed, spawn another retry
 
                 async def initial_processor():
                     """Process initial batch and spawn retries for failures"""
-                    async for initial_row in rollout_processor(fresh_dataset, config):
-                        if initial_row.rollout_status and initial_row.rollout_status.status == "finished":
-                            await queue.put(initial_row)  # rollout succeeded, put on queue
-                        else:
-                            asyncio.create_task(retry_handler(initial_row))  # rollout errored, spawn retry task
+                    base_tasks = rollout_processor(fresh_dataset, config)
+                    pending = set(base_tasks)
+
+                    while pending:
+                        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
+
+                        for task in done:
+                            task_index = base_tasks.index(task)
+
+                            try:
+                                result = await task
+                                result.rollout_status.status = "finished"
+                                await queue.put(result)
+                            except Exception as e:
+                                failed_row = fresh_dataset[task_index]
+                                failed_row.rollout_status.status = "error"
+                                failed_row.rollout_status.termination_reason = str(e)
+                                asyncio.create_task(retry_handler(failed_row))  # rollout errored, spawn retry task
 
                 processor_task = asyncio.create_task(initial_processor())
 
@@ -606,7 +622,7 @@ async def _execute_with_semaphore(row):
                     for result in all_results:
                         for r in result:
                             if r.eval_metadata is not None:
-                                r.eval_metadata.status = "finished"
+                                r.eval_metadata.status = "finished"  # TODO: might not be needed
                                 r.eval_metadata.passed = passed
                             active_logger.log(r)
 
diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py
@@ -2,8 +2,9 @@
 Parameter types
 """
 
+import asyncio
 from dataclasses import dataclass, field
-from typing import Any, AsyncIterator, Callable, Dict, List, Literal, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
@@ -51,4 +52,4 @@ class RolloutProcessorConfig:
     kwargs: Dict[str, Any] = field(default_factory=dict)  # any additional kwargs to pass to the rollout processor
 
 
-RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], AsyncIterator[EvaluationRow]]
+RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[asyncio.Task[EvaluationRow]]]
diff --git a/tests/test_retry_mechanism.py b/tests/test_retry_mechanism.py
diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py
diff --git a/tests/test_rollout_error_handling.py b/tests/test_rollout_error_handling.py