move decorator into its own file

Dylan Huang · Dylan Huang · commit 54d9b11aa3cf · 2025-08-03T12:42:20.000-07:00
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -1,9 +1,9 @@
-import asyncio
 import inspect
 from typing import Any, Callable, Dict, List, Optional
 
 import pytest
 
+from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
 from eval_protocol.pytest.types import (
     Dataset,
@@ -16,100 +16,9 @@
     RolloutProcessorConfig,
     TestFunction,
 )
+from eval_protocol.pytest.utils import aggregate, create_dynamically_parameterized_wrapper, execute_function
 
 from ..common_utils import load_jsonl
-from ..models import EvaluateResult, EvaluationRow
-
-
-def _execute_function(func: Callable, **kwargs) -> Any:
-    """
-    Execute a function with proper async handling.
-
-    This is a pure function that handles both async and non-async function execution
-    with proper event loop management for async functions.
-
-    Args:
-        func: The function to execute
-        **kwargs: Arguments to pass to the function
-
-    Returns:
-        The result of the function execution
-    """
-    is_async = asyncio.iscoroutinefunction(func)
-    if is_async:
-        # Handle async functions with proper event loop management
-        try:
-            loop = asyncio.get_event_loop()
-            if not loop.is_closed():
-                # Use existing loop
-                task = loop.create_task(func(**kwargs))
-                results = loop.run_until_complete(task)
-            else:
-                # Loop is closed, create a new one
-                results = asyncio.run(func(**kwargs))
-        except RuntimeError:
-            # No event loop or other issues, create a new one
-            results = asyncio.run(func(**kwargs))
-    else:
-        results = func(**kwargs)
-    return results
-
-
-def evaluate(
-    rows: List[EvaluationRow], reward_fn: Callable[..., EvaluateResult], **kwargs: Any
-) -> List[EvaluationRow]:
-    """Apply a reward function to each row and attach the result."""
-    evaluated: List[EvaluationRow] = []
-    for row in rows:
-        result = reward_fn(messages=row.messages, ground_truth=row.ground_truth, **kwargs)
-        row.evaluation_result = result
-        evaluated.append(row)
-    return evaluated
-
-
-def _aggregate(scores: List[float], method: str) -> float:
-    if not scores:
-        return 0.0
-    if method == "mean":
-        return sum(scores) / len(scores)
-    if method == "max":
-        return max(scores)
-    if method == "min":
-        return min(scores)
-    raise ValueError(f"Unknown aggregation method: {method}")
-
-
-def _create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names):
-    """
-    Creates a wrapper function with dynamic parameters for pytest parameterization.
-
-    This function takes a test function and creates a wrapper that:
-    1. Preserves the original function's metadata using functools.wraps
-    2. Creates a new function signature with the specified parameter names that maps to pytest.mark.parametrize decorator
-    3. Returns a callable that can be used with pytest.mark.parametrize
-
-    The function signature is dynamically created to match the parameter names expected by
-    pytest.mark.parametrize, ensuring that pytest can properly map the test parameters
-    to the function arguments.
-
-    Args:
-        test_func: The original test function to wrap
-        wrapper_body: The function body that contains the actual test logic
-        test_param_names: List of parameter names for the dynamic signature
-
-    Returns:
-        A wrapper function with the specified parameter signature that calls wrapper_body
-    """
-    from functools import wraps
-
-    @wraps(test_func)
-    def wrapper(**kwargs):
-        return wrapper_body(**kwargs)
-
-    parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names]
-    wrapper.__signature__ = inspect.Signature(parameters)
-
-    return wrapper
 
 
 def evaluation_test(
@@ -193,9 +102,6 @@ def test_func(model_name: str, input_messages: List[List[Message]]):
     def decorator(
         test_func: TestFunction,
     ):
-        # Check if the function is async
-        is_async = inspect.iscoroutinefunction(test_func)
-
         sig = inspect.signature(test_func)
 
         # For pointwise/rowwise mode, we expect a different signature
@@ -240,7 +146,7 @@ def execute_with_params(
                 kwargs["model"] = model
             if row is not None:
                 kwargs["row"] = row
-            return _execute_function(test_func, **kwargs)
+            return execute_function(test_func, **kwargs)
 
         # Calculate all possible combinations of parameters
         def generate_combinations():
@@ -315,7 +221,7 @@ def wrapper_body(**kwargs):
                     initial_messages=kwargs.get("input_messages") if "input_messages" in kwargs else [],
                 )
                 for row in data:
-                    processed: List[EvaluationRow] = _execute_function(rollout_processor, row=row, config=config)
+                    processed: List[EvaluationRow] = execute_function(rollout_processor, row=row, config=config)
                     input_dataset.extend(processed)
 
                 all_results: List[EvaluationRow] = []
@@ -361,13 +267,13 @@ def wrapper_body(**kwargs):
                         all_results.extend(results)
 
                 scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
-                agg_score = _aggregate(scores, aggregation_method)
+                agg_score = aggregate(scores, aggregation_method)
                 if threshold_of_success is not None:
                     assert (
                         agg_score >= threshold_of_success
                     ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
 
-            return _create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
+            return create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
 
         wrapper = create_wrapper_with_signature()
         wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(wrapper)
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
@@ -0,0 +1,96 @@
+import asyncio
+import inspect
+from typing import Any, Callable, List
+
+from ..models import EvaluateResult, EvaluationRow
+
+
+def execute_function(func: Callable, **kwargs) -> Any:
+    """
+    Execute a function with proper async handling.
+
+    This is a pure function that handles both async and non-async function execution
+    with proper event loop management for async functions.
+
+    Args:
+        func: The function to execute
+        **kwargs: Arguments to pass to the function
+
+    Returns:
+        The result of the function execution
+    """
+    is_async = asyncio.iscoroutinefunction(func)
+    if is_async:
+        # Handle async functions with proper event loop management
+        try:
+            loop = asyncio.get_event_loop()
+            if not loop.is_closed():
+                # Use existing loop
+                task = loop.create_task(func(**kwargs))
+                results = loop.run_until_complete(task)
+            else:
+                # Loop is closed, create a new one
+                results = asyncio.run(func(**kwargs))
+        except RuntimeError:
+            # No event loop or other issues, create a new one
+            results = asyncio.run(func(**kwargs))
+    else:
+        results = func(**kwargs)
+    return results
+
+
+def evaluate(
+    rows: List[EvaluationRow], reward_fn: Callable[..., EvaluateResult], **kwargs: Any
+) -> List[EvaluationRow]:
+    """Apply a reward function to each row and attach the result."""
+    evaluated: List[EvaluationRow] = []
+    for row in rows:
+        result = reward_fn(messages=row.messages, ground_truth=row.ground_truth, **kwargs)
+        row.evaluation_result = result
+        evaluated.append(row)
+    return evaluated
+
+
+def aggregate(scores: List[float], method: str) -> float:
+    if not scores:
+        return 0.0
+    if method == "mean":
+        return sum(scores) / len(scores)
+    if method == "max":
+        return max(scores)
+    if method == "min":
+        return min(scores)
+    raise ValueError(f"Unknown aggregation method: {method}")
+
+
+def create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names):
+    """
+    Creates a wrapper function with dynamic parameters for pytest parameterization.
+
+    This function takes a test function and creates a wrapper that:
+    1. Preserves the original function's metadata using functools.wraps
+    2. Creates a new function signature with the specified parameter names that maps to pytest.mark.parametrize decorator
+    3. Returns a callable that can be used with pytest.mark.parametrize
+
+    The function signature is dynamically created to match the parameter names expected by
+    pytest.mark.parametrize, ensuring that pytest can properly map the test parameters
+    to the function arguments.
+
+    Args:
+        test_func: The original test function to wrap
+        wrapper_body: The function body that contains the actual test logic
+        test_param_names: List of parameter names for the dynamic signature
+
+    Returns:
+        A wrapper function with the specified parameter signature that calls wrapper_body
+    """
+    from functools import wraps
+
+    @wraps(test_func)
+    def wrapper(**kwargs):
+        return wrapper_body(**kwargs)
+
+    parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names]
+    wrapper.__signature__ = inspect.Signature(parameters)
+
+    return wrapper