WIP: vibe-coded as an MVP

xzrderek · xzrderek · commit 3cf966abb28f · 2025-08-10T03:36:53.000-07:00
diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
@@ -1,14 +1,16 @@
 from .default_agent_rollout_processor import default_agent_rollout_processor
+from .default_dataset_adapter import default_dataset_adapter
+from .default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 from .default_no_op_rollout_process import default_no_op_rollout_processor
 from .default_single_turn_rollout_process import default_single_turn_rollout_processor
 from .evaluation_test import evaluation_test
 from .types import RolloutProcessor, RolloutProcessorConfig
-from .default_dataset_adapter import default_dataset_adapter
 
 __all__ = [
     "default_agent_rollout_processor",
     "default_no_op_rollout_processor",
     "default_single_turn_rollout_processor",
+    "default_mcp_gym_rollout_processor",
     "default_dataset_adapter",
     "RolloutProcessor",
     "RolloutProcessorConfig",
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -1,18 +1,22 @@
 import asyncio
-from typing import List
+import logging
+import time
+from typing import AsyncIterator, List
 
-from litellm import acompletion
 import litellm
+from litellm import acompletion
 from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest.types import RolloutProcessorConfig
 
+logger = logging.getLogger(__name__)
+
 
 async def default_single_turn_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> List[EvaluationRow]:
+) -> AsyncIterator[EvaluationRow]:
     """Generate a single response from any supported model provider using LiteLLM."""
 
     # Explicitly disable LiteLLM caching to avoid reused responses across runs
@@ -70,17 +74,45 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
         row.messages = messages
         default_logger.log(row)
+        logger.info(f"FINISHED PROCESSING ROW: {row.input_metadata.row_id} at time {time.time()}")
         return row
 
-    # Process rows with bounded concurrency if configured
+    # Process rows with bounded concurrency and yield as they complete
     max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
         async with semaphore:
             return await process_row(r)
 
-    tasks = [_sem_wrapper(row) for row in rows]
-    dataset = list(await asyncio.gather(*tasks))
+    # Create all tasks
+    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
 
-    return dataset
+    # Yield results as they complete (not in original order)
+    try:
+        while tasks:
+            # Wait for at least one task to complete
+            done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+
+            # Yield completed results
+            for task in done:
+                try:
+                    result = await task
+                    yield result
+                except Exception as e:
+                    # Report the error (printed to stdout) and keep processing the remaining tasks
+                    print(f"Error processing row: {e}")
+                    # NOTE: the failed row is skipped (never yielded); yielding an error row is a possible alternative
+
+            # Update tasks list to only pending tasks
+            tasks = list(pending)
+
+    finally:
+        # Clean up any remaining tasks
+        for task in tasks:
+            if not task.done():
+                task.cancel()
+                try:
+                    await task
+                except asyncio.CancelledError:
+                    pass
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -1,7 +1,8 @@
-import inspect
-import os
+import asyncio
 import copy
+import inspect
 import math
+import os
 import statistics
 from typing import Any, Callable, Dict, List, Optional
 
@@ -33,7 +34,7 @@
 from ..common_utils import load_jsonl
 
 
-def evaluation_test(
+def evaluation_test(  # noqa: C901
     *,
     model: List[ModelParam],
     input_messages: Optional[List[InputMessagesParam]] = None,
@@ -221,7 +222,7 @@ def generate_combinations():
         # Create wrapper function with exact signature that pytest expects
         def create_wrapper_with_signature() -> Callable:
             # Create the function body that will be used
-            def wrapper_body(**kwargs):
+            async def wrapper_body(**kwargs):
                 model_name = kwargs["model"]
                 eval_metadata = None
                 all_results: List[EvaluationRow] = []
@@ -300,10 +301,14 @@ def wrapper_body(**kwargs):
                         # Regenerate outputs each run by deep-copying the pristine dataset
                         # so model responses are not reused across runs.
                         fresh_rows = [copy.deepcopy(r) for r in data]
-                        input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config)
+
+                        # All rollout processors now return AsyncIterator for pipelining
+                        rollout_result = rollout_processor(fresh_rows, config)
+
                         if mode == "pointwise":
-                            # Pointwise mode: apply the evaluator function to each row
-                            for row in input_dataset:
+                            # Pointwise mode: true pipelining with concurrent evaluations
+                            async def process_evaluation(row):
+                                """Process a single evaluation and return the result."""
                                 result = execute_with_params(
                                     test_func,
                                     row=row,
@@ -313,8 +318,25 @@ def wrapper_body(**kwargs):
                                     raise ValueError(
                                         f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                                     )
-                                all_results.append(result)
+                                return result
+
+                            # Start evaluations as rollouts complete - true pipelining
+                            eval_tasks = []
+                            async for row in rollout_result:
+                                # Start evaluation immediately when rollout completes
+                                eval_task = asyncio.create_task(process_evaluation(row))
+                                eval_tasks.append(eval_task)
+
+                            # Collect all evaluation results
+                            if eval_tasks:
+                                eval_results = await asyncio.gather(*eval_tasks)
+                                all_results.extend(eval_results)
                         else:
+                            # Batch mode: drain the rollout iterator into a list first, then evaluate
+                            input_dataset = []
+                            async for row in rollout_result:
+                                input_dataset.append(row)
+
                             # Batch mode: call the test function with the full dataset
                             results = execute_with_params(
                                 test_func,
@@ -353,8 +375,12 @@ def wrapper_body(**kwargs):
                                 sample_std = statistics.stdev(scores)
                                 se = sample_std / math.sqrt(n)
                                 margin = 1.96 * se
-                                ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None
-                                ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None
+                                ci_low = (
+                                    float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None
+                                )
+                                ci_high = (
+                                    float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None
+                                )
                             except Exception:
                                 ci_low = None
                                 ci_high = None
@@ -392,6 +418,7 @@ def wrapper_body(**kwargs):
                         # Aggregate per-metric mean and 95% CI when available
                         metrics_summary: Dict[str, Dict[str, float]] = {}
                         from collections import defaultdict
+
                         metric_scores: Dict[str, list] = defaultdict(list)
                         for r in all_results:
                             if r.evaluation_result and r.evaluation_result.metrics:
@@ -435,12 +462,16 @@ def wrapper_body(**kwargs):
                                 parts = []
                                 for m_name, entry in metrics_summary.items():
                                     if "ci_low" in entry and "ci_high" in entry:
-                                        parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]")
+                                        parts.append(
+                                            f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]"
+                                        )
                                     else:
                                         parts.append(f"{m_name}={entry['mean']:.3f}")
                                 print(f"EP Metrics | " + ", ".join(parts))
                         if summary_path:
-                            import json, pathlib, time
+                            import json
+                            import pathlib
+                            import time
 
                             p = pathlib.Path(summary_path)
                             p.parent.mkdir(parents=True, exist_ok=True)
@@ -483,6 +514,7 @@ def wrapper_body(**kwargs):
         # Create the pytest wrapper
         pytest_wrapper = create_wrapper_with_signature()
         pytest_wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(pytest_wrapper)
+        pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)
 
         def create_dual_mode_wrapper() -> Callable:
             """
@@ -500,17 +532,21 @@ def create_dual_mode_wrapper() -> Callable:
             """
             import asyncio
 
-            # Check if the test function is async
-            is_async = asyncio.iscoroutinefunction(test_func)
+            # Check if the pytest wrapper is async (it should be now)
+            is_pytest_wrapper_async = asyncio.iscoroutinefunction(pytest_wrapper)
+            is_test_func_async = asyncio.iscoroutinefunction(test_func)
 
-            if is_async:
+            if is_pytest_wrapper_async:
 
                 async def dual_mode_wrapper(*args, **kwargs):
                     # Check if this is a direct call with the expected signature
                     if mode == "pointwise":
                         # For pointwise mode, check if called with a single row argument
                         if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs:
-                            return await test_func(row=args[0])
+                            if is_test_func_async:
+                                return await test_func(row=args[0])
+                            else:
+                                return test_func(row=args[0])
                     else:
                         # For batch mode, check if called with rows argument
                         if (
@@ -519,18 +555,24 @@ async def dual_mode_wrapper(*args, **kwargs):
                             and all(isinstance(r, EvaluationRow) for r in args[0])
                             and not kwargs
                         ):
-                            return await test_func(rows=args[0])
+                            if is_test_func_async:
+                                return await test_func(rows=args[0])
+                            else:
+                                return test_func(rows=args[0])
                         # Also check if called with keyword argument 'rows'
                         if (
                             len(args) == 0
                             and "rows" in kwargs
                             and isinstance(kwargs["rows"], list)
                             and all(isinstance(r, EvaluationRow) for r in kwargs["rows"])
                         ):
-                            return await test_func(**kwargs)
+                            if is_test_func_async:
+                                return await test_func(**kwargs)
+                            else:
+                                return test_func(**kwargs)
 
                     # If not a direct call, use the pytest wrapper
-                    return pytest_wrapper(*args, **kwargs)
+                    return await pytest_wrapper(*args, **kwargs)
 
             else:
 
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
@@ -84,9 +84,18 @@ def create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param
     """
     from functools import wraps
 
-    @wraps(test_func)
-    def wrapper(**kwargs):
-        return wrapper_body(**kwargs)
+    # Check if wrapper_body is async and create appropriate wrapper
+    if asyncio.iscoroutinefunction(wrapper_body):
+
+        @wraps(test_func)
+        async def wrapper(**kwargs):
+            return await wrapper_body(**kwargs)
+
+    else:
+
+        @wraps(test_func)
+        def wrapper(**kwargs):
+            return wrapper_body(**kwargs)
 
     parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names]
     wrapper.__signature__ = inspect.Signature(parameters)
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
@@ -5,11 +5,15 @@
 and comparing the output against expected results in a pointwise manner.
 """
 
+import logging
+import time
 from typing import Any, Dict, List
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
-from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
+from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
+
+logger = logging.getLogger(__name__)
 
 
 def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -18,8 +22,8 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     """
     return [
         EvaluationRow(
-            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")], 
-            ground_truth=row["expected_output"]
+            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
+            ground_truth=row["expected_output"],
         )
         for row in data
     ]
@@ -38,55 +42,52 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
 def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Evaluation function that tests code correctness by executing it locally.
-    
+
     This function:
     1. Extracts Python code from the assistant's response
     2. Executes the code locally with timeout=10
     3. Compares the output to ground_truth
     4. Returns a score of 1.0 if output matches, 0.0 otherwise
-    
+
     Args:
         row: EvaluationRow containing the conversation messages and expected_output in ground_truth
-        
+
     Returns:
         EvaluationRow with the evaluation result
     """
+    logger.info(f"STARTING TO EVALUATE ROW: {row.input_metadata.row_id} at time {time.time()}")
     # Check if we have an assistant response
     if len(row.messages) < 2 or row.messages[-1].role != "assistant":
         row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
         return row
-    
+
     assistant_content = row.messages[-1].content or ""
     expected_output = (row.ground_truth or "").strip()
-    
+
     # Extract Python code blocks
     code_blocks = extract_code_blocks(assistant_content, language="python")
     if not code_blocks:
         row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
         return row
-    
+
     code = code_blocks[0]["code"]
-    
+
     # Execute the code locally
     execution_result = execute_python_code(code, timeout=10)
-    
+
     if not execution_result.get("success", False):
         error_msg = execution_result.get("error", "Code execution failed")
         row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
         return row
-    
+
     # Compare output with expected
     actual_output = (execution_result.get("output", "") or "").strip()
-    
+
     if actual_output == expected_output:
-        row.evaluation_result = EvaluateResult(
-            score=1.0, 
-            reason=f"✅ Output matches: '{actual_output}'"
-        )
+        row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
     else:
         row.evaluation_result = EvaluateResult(
-            score=0.0, 
-            reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
+            score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
         )
-    
+
     return row