eval-protocol
diff --git a/‎README.md‎
Lines changed: 7 additions & 9 deletions b/‎README.md‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎eval_protocol/benchmarks/test_aime25.py‎
Lines changed: 1 addition & 1 deletion b/‎eval_protocol/benchmarks/test_aime25.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎eval_protocol/benchmarks/test_gpqa.py‎
Lines changed: 1 addition & 1 deletion b/‎eval_protocol/benchmarks/test_gpqa.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎eval_protocol/pytest/evaluation_test.py‎
Lines changed: 82 additions & 29 deletions b/‎eval_protocol/pytest/evaluation_test.py‎
Lines changed: 82 additions & 29 deletions
diff --git a/‎eval_protocol/pytest/types.py‎
Lines changed: 1 addition & 0 deletions b/‎eval_protocol/pytest/types.py‎
Lines changed: 1 addition & 0 deletions
@@ -2,15 +2,13 @@
 
 [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
 
-EP is an open specification, Python SDK, pytest wrapper, and suite of tools that
-provides a standardized way to write evaluations for large language model (LLM)
-applications. Start with simple single-turn evals for model selection and prompt
-engineering, then scale up to complex multi-turn reinforcement learning (RL) for
-agents using Model Context Protocol (MCP). EP ensures consistent patterns for
-writing evals, storing traces, and saving results—enabling you to build
-sophisticated agent evaluations that work across real-world scenarios, from
-markdown generation tasks to customer service agents with tool calling
-capabilities.
+**Eval Protocol (EP) is the open-source standard and toolkit for practicing Eval-Driven Development.**
+
+Building with AI is different. Traditional software is deterministic, but AI systems are probabilistic. How do you ship new features without causing silent regressions? How do you prove a new prompt is actually better?
+
+The answer is a new engineering discipline: **Eval-Driven Development (EDD)**. It adapts the rigor of Test-Driven Development for the uncertain world of AI. With EDD, you define your AI's desired behavior as a suite of executable tests, creating a safety net that allows you to innovate with confidence.
+
+EP provides a consistent way to write evals, store traces, and analyze results.
 
 <p align="center">
 	<img src="https://raw.githubusercontent.com/eval-protocol/python-sdk/refs/heads/main/assets/ui.png" alt="UI" />
 
@@ -72,7 +72,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
-    passed_threshold=None,
+    passed_threshold=0.8,
     num_runs=8,
     max_dataset_rows=2,
     max_concurrent_rollouts=4,
 
@@ -96,7 +96,7 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
     ],
     rollout_processor=GPQAStripGTRolloutProcessor(),
     aggregation_method="mean",
-    passed_threshold=None,
+    passed_threshold=0.6,
     num_runs=8,
     mode="pointwise",
 )
 
@@ -11,7 +11,8 @@
 from dataclasses import replace
 from typing import Any, Callable, Dict, List, Literal, Optional, Union
 from collections import defaultdict
-
+import hashlib
+import ast
 from mcp.types import Completion
 import pytest
 
@@ -35,6 +36,7 @@
     EvaluationInputParam,
     EvaluationTestMode,
     InputMessagesParam,
+    InputRowsParam,
     ModelParam,
     RolloutProcessorConfig,
     RolloutProcessorInputParam,
@@ -81,14 +83,16 @@ def postprocess(
     if aggregation_method == "mean":
         try:
             result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
-            _, mu_ci_low, mu_ci_high, standard_error = result_ci
-            if mu_ci_low is not None and mu_ci_high is not None:
+            _, mu_ci_low, mu_ci_high, se = result_ci
+            if mu_ci_low is not None and mu_ci_high is not None and se is not None:
                 ci_low = float(mu_ci_low)
                 ci_high = float(mu_ci_high)
+                standard_error = float(se)
                 # Keep agg_score as-is (mean over scores). For equal repeats per question these match.
         except Exception:
             ci_low = None
             ci_high = None
+            standard_error = None
 
     # Determine if the evaluation passed based on threshold
     passed = None
@@ -127,9 +131,10 @@ def postprocess(
             "num_runs": num_runs,
             "rows": total_rows,
         }
-        if ci_low is not None and ci_high is not None:
+        if ci_low is not None and ci_high is not None and standard_error is not None:
             summary_obj["agg_ci_low"] = ci_low
             summary_obj["agg_ci_high"] = ci_high
+            summary_obj["standard_error"] = standard_error
 
         # Aggregate per-metric mean and 95% CI when available
         metrics_summary: Dict[str, Dict[str, float]] = {}
@@ -164,9 +169,9 @@ def postprocess(
         if metrics_summary:
             summary_obj["metrics_agg"] = metrics_summary
         if should_print:
-            if ci_low is not None and ci_high is not None:
+            if ci_low is not None and ci_high is not None and standard_error is not None:
                 print(
-                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
+                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} se={summary_obj['standard_error']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
                 )
             else:
                 print(
@@ -235,6 +240,7 @@ def evaluation_test(  # noqa: C901
     completion_params: List[CompletionParams],
     input_messages: Optional[List[InputMessagesParam]] = None,
     input_dataset: Optional[List[DatasetPathParam]] = None,
+    input_rows: Optional[List[InputRowsParam]] = None,
     dataset_adapter: Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter,
     rollout_processor: RolloutProcessor = NoOpRolloutProcessor(),
     evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
@@ -245,6 +251,7 @@ def evaluation_test(  # noqa: C901
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
     max_concurrent_rollouts: int = 8,
+    max_concurrent_evaluations: int = 64,
     server_script_path: Optional[str] = None,
     steps: int = 30,
     mode: EvaluationTestMode = "pointwise",
@@ -295,6 +302,9 @@ def evaluation_test(  # noqa: C901
         input_dataset: Paths to JSONL datasets. This is useful if you have a
             dataset already. Provide a dataset_adapter to convert the input dataset
             to a list of EvaluationRows if you have a custom dataset format.
+        input_rows: Pre-constructed EvaluationRow objects to use directly. This is useful
+            when you want to provide EvaluationRow objects with custom metadata, input_messages,
+            or other fields already populated. Exposed to the parametrized test wrapper as "input_rows".
         dataset_adapter: Function to convert the input dataset to a list of
             EvaluationRows. This is useful if you have a custom dataset format.
         completion_params: Generation parameters for the rollout.
@@ -309,6 +319,7 @@ def evaluation_test(  # noqa: C901
         max_dataset_rows: Limit dataset to the first N rows.
         mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
         max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel.
+        max_concurrent_evaluations: Maximum number of concurrent evaluations to run in parallel.
         server_script_path: Path to the MCP server script to run (default: "examples/tau2_mcp/server.py").
         steps: Number of rollout steps to execute (default: 30).
         mode: Evaluation mode. "pointwise" (default) applies test function to each row (rollout result).
@@ -408,33 +419,42 @@ async def execute_with_params(
         # Calculate all possible combinations of parameters
         if mode == "groupwise":
             combinations = generate_parameter_combinations(
-                input_dataset, None, input_messages, evaluation_test_kwargs, max_dataset_rows, combine_datasets
+                input_dataset,
+                None,
+                input_messages,
+                input_rows,
+                evaluation_test_kwargs,
+                max_dataset_rows,
+                combine_datasets,
             )
         else:
             combinations = generate_parameter_combinations(
                 input_dataset,
                 completion_params,
                 input_messages,
+                input_rows,
                 evaluation_test_kwargs,
                 max_dataset_rows,
                 combine_datasets,
             )
         if len(combinations) == 0:
             raise ValueError(
-                "No combinations of parameters were found. Please provide at least a model and one of input_dataset or input_messages."
+                "No combinations of parameters were found. Please provide at least a model and one of input_dataset, input_messages, or input_rows."
             )
 
         # Create parameter tuples for pytest.mark.parametrize
         param_tuples = []
         for combo in combinations:
-            dataset, cp, messages, etk = combo
+            dataset, cp, messages, rows, etk = combo
             param_tuple = []
             if input_dataset is not None:
                 param_tuple.append(dataset)
             if completion_params is not None:
                 param_tuple.append(cp)
             if input_messages is not None:
                 param_tuple.append(messages)
+            if input_rows is not None:
+                param_tuple.append(rows)
             if evaluation_test_kwargs is not None:
                 param_tuple.append(etk)
             param_tuples.append(tuple(param_tuple))
@@ -447,6 +467,8 @@ async def execute_with_params(
             test_param_names.append("completion_params")
         if input_messages is not None:
             test_param_names.append("input_messages")
+        if input_rows is not None:
+            test_param_names.append("input_rows")
         if evaluation_test_kwargs is not None:
             test_param_names.append("evaluation_test_kwargs")
 
@@ -472,6 +494,8 @@ def _log_eval_error(
                 try:
                     # Handle dataset loading
                     data: List[EvaluationRow] = []
+                    # Track all rows processed in the current run for error logging
+                    processed_rows_in_run: List[EvaluationRow] = []
                     if "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
                         ds_arg = kwargs["dataset_path"]
                         # Support either a single path or a list of paths; if a list is provided,
@@ -496,8 +520,11 @@ def _log_eval_error(
                         else:
                             # Multiple rows: list of List[Message]
                             data = [EvaluationRow(messages=m) for m in im]
+                    elif "input_rows" in kwargs and kwargs["input_rows"] is not None:
+                        # Use pre-constructed EvaluationRow objects directly
+                        data = kwargs["input_rows"]
                     else:
-                        raise ValueError("No input dataset or input messages provided")
+                        raise ValueError("No input dataset, input messages, or input rows provided")
 
                     for row in data:
                         # generate a stable row_id for each row
@@ -585,30 +612,44 @@ def _log_eval_error(
                         # log the fresh_dataset
                         for row in fresh_dataset:
                             active_logger.log(row)
+                            processed_rows_in_run.append(row)
 
-                        if mode == "pointwise":
-                            # Pointwise mode, rollouts will return as they complete so we can pipeline evaluation_test execution
-                            semaphore = asyncio.Semaphore(max_concurrent_rollouts)
-                            tasks = []
+                        # prepare parallel eval helper function
+                        semaphore = asyncio.Semaphore(max_concurrent_evaluations)
 
-                            async def _execute_with_semaphore(row):
-                                async with semaphore:
-                                    # NOTE: we will still evaluate errored rows (give users control over this)
-                                    # i.e., they can choose to give EvaluateResult.score = 0 for errored rows in their test_func
+                        async def _execute_eval_with_semaphore(**inner_kwargs):
+                            async with semaphore:
+                                # NOTE: we will still evaluate errored rows (give users control over this)
+                                # i.e., they can choose to give EvaluateResult.score = 0 for errored rows in their test_func
+                                if "row" in inner_kwargs:
                                     result = await execute_with_params(
                                         test_func,
-                                        processed_row=row,
+                                        processed_row=inner_kwargs["row"],
                                         evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
                                     )
                                     if result is None or not isinstance(result, EvaluationRow):
                                         raise ValueError(
                                             f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                                         )
                                     return result
+                                if "rows" in inner_kwargs:
+                                    results = await execute_with_params(
+                                        test_func,
+                                        processed_dataset=inner_kwargs["rows"],
+                                        evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
+                                    )
+                                    if results is None or not isinstance(results, list):
+                                        raise ValueError(
+                                            f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                                        )
+                                    return results
 
+                        if mode == "pointwise":
+                            # Pointwise mode, rollouts will return as they complete so we can pipeline evaluation_test execution
+                            tasks = []
                             # Use wrapper that handles retry logic internally
                             async for row in rollout_processor_with_retry(rollout_processor, fresh_dataset, config):
-                                tasks.append(asyncio.create_task(_execute_with_semaphore(row)))
+                                tasks.append(asyncio.create_task(_execute_eval_with_semaphore(row=row)))
 
                             results = await asyncio.gather(*tasks)
 
@@ -649,14 +690,13 @@ async def _collect_result(config, lst):
                             for result in rollout_results:
                                 for row in result:
                                     row_groups[row.input_metadata.row_id].append(row)
-                            results = []
+                            tasks = []
                             for row_id, rows in row_groups.items():
-                                result = await execute_with_params(
-                                    test_func,
-                                    processed_dataset=rows,
-                                    evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
-                                )
-                                results.extend(result)
+                                tasks.append(asyncio.create_task(_execute_eval_with_semaphore(rows=rows)))
+                            results = []
+                            for task in tasks:
+                                res = await task
+                                results.extend(res)
                             all_results[i] = results
                         else:
                             # Batch mode: collect all results first, then evaluate (no pipelining)
@@ -728,10 +768,16 @@ async def _collect_result(config, lst):
                         )
 
                 except AssertionError:
-                    _log_eval_error("finished", data if "data" in locals() else None, passed=False)
+                    _log_eval_error(
+                        "finished",
+                        processed_rows_in_run if "processed_rows_in_run" in locals() else None,
+                        passed=False,
+                    )
                     raise
                 except Exception:
-                    _log_eval_error("error", data if "data" in locals() else None, passed=False)
+                    _log_eval_error(
+                        "error", processed_rows_in_run if "processed_rows_in_run" in locals() else None, passed=False
+                    )
                     raise
 
             return create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
@@ -794,6 +840,13 @@ async def dual_mode_wrapper(*args, **kwargs):
                 # If not a direct call, use the pytest wrapper
                 return await pytest_wrapper(*args, **kwargs)
 
+            dual_mode_wrapper._origin_func = test_func
+            dual_mode_wrapper._metainfo = {
+                "mode": mode,
+                "max_rollout_concurrency": max_concurrent_rollouts,
+                "max_evaluation_concurrency": max_concurrent_evaluations,
+            }
+
             # Copy all attributes from the pytest wrapper to our dual mode wrapper
             import functools
 
 
@@ -15,6 +15,7 @@
 ModelParam = str  # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
 DatasetPathParam = str
 InputMessagesParam = List[Message]
+InputRowsParam = List[EvaluationRow]
 EvaluationInputParam = Dict[str, Any]
 RolloutProcessorInputParam = Dict[str, Any]
Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,7 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->`
`96`	`96`	`],`
`97`	`97`	`rollout_processor=GPQAStripGTRolloutProcessor(),`
`98`	`98`	`aggregation_method="mean",`
`99`		`- passed_threshold=None,`
	`99`	`+ passed_threshold=0.6,`
`100`	`100`	`num_runs=8,`
`101`	`101`	`mode="pointwise",`
`102`	`102`	`)`