Skip to content

Commit 56bf3ab

Browse files
author
Dylan Huang
committed
fix
1 parent b95909b commit 56bf3ab

3 files changed

Lines changed: 18 additions & 53 deletions

File tree

eval_protocol/pytest/evaluation_test.py

Lines changed: 8 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@
1616
RolloutProcessorConfig,
1717
TestFunction,
1818
)
19-
from eval_protocol.pytest.utils import aggregate, create_dynamically_parameterized_wrapper, execute_function
19+
from eval_protocol.pytest.utils import (
20+
AggregationMethod,
21+
aggregate,
22+
create_dynamically_parameterized_wrapper,
23+
execute_function,
24+
)
2025

2126
from ..common_utils import load_jsonl
2227

@@ -29,7 +34,7 @@ def evaluation_test(
2934
dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x,
3035
input_params: Optional[List[InputParam]] = None,
3136
rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
32-
aggregation_method: str = "mean",
37+
aggregation_method: AggregationMethod = "mean",
3338
threshold_of_success: Optional[float] = None,
3439
num_runs: int = 1,
3540
max_dataset_rows: Optional[int] = None,
@@ -58,45 +63,10 @@ def evaluation_test(
5863
below this threshold.
5964
num_runs: Number of times to repeat the evaluation.
6065
max_dataset_rows: Limit dataset to the first N rows.
66+
mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
6167
mode: Evaluation mode. "batch" (default) expects test function to handle
6268
full dataset. "pointwise" applies test function to each row. If your evaluation requires
6369
the full rollout of all rows to compute the score, use
64-
65-
Usage:
66-
With an input dataset and input params, the test function will be called with the following arguments:
67-
68-
```python
69-
@evaluation_test(
70-
model=["gpt-4o", "gpt-4o-mini"],
71-
input_dataset=["data/test.jsonl"],
72-
input_params=[{"temperature": 0.5}],
73-
rollout_processor=default_rollout_processor,
74-
aggregation_method="mean",
75-
)
76-
def test_func(dataset_path: str, model_name: str, input_params: Dict[str, Any]):
77-
pass
78-
```
79-
80-
Without an input dataset and input params, the test function will be called with the following arguments:
81-
82-
```python
83-
@evaluation_test(
84-
model=["gpt-4o", "gpt-4o-mini"],
85-
)
86-
def test_func(model_name: str):
87-
pass
88-
```
89-
90-
With model and input_messages, the test function will be called with the following arguments:
91-
92-
```python
93-
@evaluation_test(
94-
model=["gpt-4o", "gpt-4o-mini"],
95-
input_messages=[{"role": "user", "content": "Hello, how are you?"}],
96-
)
97-
def test_func(model_name: str, input_messages: List[List[Message]]):
98-
pass
99-
```
10070
"""
10171

10272
def decorator(
@@ -132,18 +102,12 @@ def decorator(
132102

133103
def execute_with_params(
134104
test_func: TestFunction,
135-
model: str,
136105
row: EvaluationRow | None = None,
137106
input_dataset: List[EvaluationRow] | None = None,
138-
input_params: InputParam | None = None,
139107
):
140108
kwargs = {}
141109
if input_dataset is not None:
142110
kwargs["rows"] = input_dataset
143-
if input_params is not None:
144-
kwargs["input_params"] = input_params
145-
if model is not None:
146-
kwargs["model"] = model
147111
if row is not None:
148112
kwargs["row"] = row
149113
return execute_function(test_func, **kwargs)
@@ -231,9 +195,7 @@ def wrapper_body(**kwargs):
231195
for row in input_dataset:
232196
result = execute_with_params(
233197
test_func,
234-
model=model_name,
235198
row=row,
236-
input_params=kwargs.get("input_params") if "input_params" in kwargs else None,
237199
)
238200
if result is None or not isinstance(result, EvaluationRow):
239201
raise ValueError(
@@ -244,9 +206,7 @@ def wrapper_body(**kwargs):
244206
# Batch mode: call the test function with the full dataset
245207
results = execute_with_params(
246208
test_func,
247-
model=model_name,
248209
input_dataset=input_dataset,
249-
input_params=kwargs.get("input_params") if "input_params" in kwargs else None,
250210
)
251211
if results is None:
252212
raise ValueError(

eval_protocol/pytest/utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import asyncio
22
import inspect
3-
from typing import Any, Callable, List
3+
from typing import Any, Callable, List, Literal
44

55
from ..models import EvaluateResult, EvaluationRow
66

@@ -51,7 +51,10 @@ def evaluate(
5151
return evaluated
5252

5353

54-
def aggregate(scores: List[float], method: str) -> float:
54+
AggregationMethod = Literal["mean", "max", "min"]
55+
56+
57+
def aggregate(scores: List[float], method: AggregationMethod) -> float:
5558
if not scores:
5659
return 0.0
5760
if method == "mean":

tests/pytest/test_pytest_word_count_example.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
from haikus import haikus
2+
3+
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
14
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
2-
from eval_protocol.models import EvaluateResult, MetricResult, EvaluationRow
35
from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row
4-
from haikus import haikus
56

67

78
@evaluation_test(
@@ -74,8 +75,9 @@ def test_word_count_evaluate(row: EvaluationRow) -> EvaluationRow:
7475
),
7576
}
7677

77-
return EvaluateResult(
78+
row.evaluation_result = EvaluateResult(
7879
score=word_count_score,
7980
reason=f"Word count: {word_count}. {haiku_metric_reason}",
8081
metrics=metrics,
8182
)
83+
return row

0 commit comments

Comments (0)