Skip to content

Commit 2ceaf72

Browse files
author
Dylan Huang
committed
square up all the id madness and add a test
1 parent 87c3dcb commit 2ceaf72

File tree

3 files changed

+89
-11
lines changed

3 files changed

+89
-11
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,11 @@ def evaluation_test( # noqa: C901
6363
6464
Here are some key concepts to understand the terminology in EP:
6565
66-
- "cohort" is a group of runs with a static set of parameters. A single
66+
- "invocation" is a single execution of a test function. An invocation can
67+
generate 1 or more cohorts. Grouping by invocation might be useful to
68+
aggregate eval scores across multiple invocations, for example when you want to combine
69+
scores across multiple datasets.
70+
- "cohort" is a group of runs for a combination of parameters. A single
6771
cohort will have multiple runs if num_runs > 1.
6872
1. If your evaluation_test has combinations of parameters, it will generate
6973
multiple cohorts per combination of parameters.
@@ -85,8 +89,8 @@ def evaluation_test( # noqa: C901
8589
decorated test. It simply produces a score from 0 to 1 and attaches it
8690
to the row as the "evaluation_result" field.
8791
88-
A "cohort", "run", "rollout", and "row" each have a unique ID which can be
89-
used to easily group and identify them.
92+
"invocation", "cohort", "run", "rollout", and "row" each have a unique ID
93+
which can be used to easily group and identify the rows of your dataset.
9094
9195
Args:
9296
model: Model identifiers to query.
@@ -205,7 +209,7 @@ def generate_combinations():
205209
datasets = [[input_dataset]] # type: ignore
206210
else:
207211
datasets = [None]
208-
params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore
212+
rips: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore
209213
# Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over
210214
# each row. Instead, pass the entire sliced list through in a single test run
211215
# so summaries aggregate all rows together (AIME-style behavior).
@@ -224,15 +228,15 @@ def generate_combinations():
224228
# Generate all combinations
225229
for m in model:
226230
for ds in datasets:
227-
for ip in params:
231+
for rip in rips:
228232
for im in messages:
229233
for etk in kwargs:
230234
# if no dataset and no messages, raise an error
231235
if ds is None and im is None:
232236
raise ValueError(
233237
"No dataset or messages provided. Please provide at least one of input_dataset or input_messages."
234238
)
235-
combinations.append((m, ds, ip, im, etk))
239+
combinations.append((m, ds, rip, im, etk))
236240

237241
return combinations
238242

@@ -245,12 +249,12 @@ def generate_combinations():
245249
# Create parameter tuples for pytest.mark.parametrize
246250
param_tuples = []
247251
for combo in combinations:
248-
model_name, dataset, params, messages, etk = combo
252+
model_name, dataset, rip, messages, etk = combo
249253
param_tuple = [model_name]
250254
if input_dataset is not None:
251255
param_tuple.append(dataset)
252256
if rollout_input_params is not None:
253-
param_tuple.append(params)
257+
param_tuple.append(rip)
254258
if input_messages is not None:
255259
param_tuple.append(messages)
256260
if evaluation_test_kwargs is not None:
@@ -271,13 +275,15 @@ def generate_combinations():
271275
# Create wrapper function with exact signature that pytest expects
272276
def create_wrapper_with_signature() -> Callable:
273277
# Create the function body that will be used
274-
cohort_id = generate_id()
278+
invocation_id = generate_id()
275279

276280
def wrapper_body(**kwargs):
277281
model_name = kwargs["model"]
278282
eval_metadata = None
279283
all_results: List[EvaluationRow] = []
280284

285+
cohort_id = generate_id()
286+
281287
def _log_eval_error(
282288
status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
283289
) -> None:
@@ -358,6 +364,7 @@ def _log_eval_error(
358364
# Initialize eval_metadata for each row
359365
row.eval_metadata = eval_metadata
360366
row.cohort_id = cohort_id
367+
row.invocation_id = invocation_id
361368

362369
# has to be done in the pytest main process since it's
363370
# used to determine whether this eval has stopped

tests/pytest/test_markdown_highlighting.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import re
88
from typing import Any, Dict, List
99

10-
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
10+
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
1111
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
1212

1313

@@ -16,7 +16,11 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
1616
Convert entries from markdown dataset to EvaluationRow objects.
1717
"""
1818
return [
19-
EvaluationRow(messages=[Message(role="user", content=row["prompt"])], ground_truth=str(row["num_highlights"]))
19+
EvaluationRow(
20+
messages=[Message(role="user", content=row["prompt"])],
21+
ground_truth=str(row["num_highlights"]),
22+
input_metadata=InputMetadata(row_id=str(row["key"])),
23+
)
2024
for row in data
2125
]
2226

tests/pytest/test_pytest_ids.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import eval_protocol.pytest.evaluation_test as evaluation_test_module
2+
from eval_protocol.models import EvaluationRow
3+
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
4+
from eval_protocol.pytest.evaluation_test import evaluation_test as evaluation_decorator
5+
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
6+
7+
8+
class InMemoryLogger:
9+
def __init__(self):
10+
self._rows = []
11+
12+
def log(self, row):
13+
self._rows.append(row)
14+
15+
def read(self):
16+
return list(self._rows)
17+
18+
19+
def test_evaluation_test_decorator_ids_single(monkeypatch):
20+
# Use an in-memory logger to avoid sqlite side effects
21+
in_memory_logger = InMemoryLogger()
22+
monkeypatch.setattr(evaluation_test_module, "default_logger", in_memory_logger, raising=False)
23+
24+
unique_run_ids = set()
25+
unique_cohort_ids = set()
26+
unique_rollout_ids = set()
27+
unique_invocation_ids = set()
28+
unique_row_ids = set()
29+
30+
@evaluation_decorator(
31+
input_dataset=[
32+
"tests/pytest/data/markdown_dataset.jsonl",
33+
"tests/pytest/data/markdown_dataset.jsonl",
34+
],
35+
rollout_input_params=[{"temperature": 0.0}, {"temperature": 1.0}],
36+
model=["dummy/local-model"],
37+
dataset_adapter=markdown_dataset_to_evaluation_row,
38+
rollout_processor=default_no_op_rollout_processor,
39+
mode="pointwise",
40+
combine_datasets=False,
41+
num_runs=5,
42+
)
43+
def eval_fn(row: EvaluationRow) -> EvaluationRow:
44+
unique_run_ids.add(row.run_id)
45+
unique_cohort_ids.add(row.cohort_id)
46+
unique_rollout_ids.add(row.rollout_id)
47+
unique_invocation_ids.add(row.invocation_id)
48+
unique_row_ids.add(row.input_metadata.row_id)
49+
return row
50+
51+
dataset_paths = [
52+
"tests/pytest/data/markdown_dataset.jsonl",
53+
"tests/pytest/data/markdown_dataset.jsonl",
54+
]
55+
input_params_list = [{"temperature": 0.0}, {"temperature": 1.0}]
56+
57+
# Manually invoke all parameter combinations within a single test
58+
for ds_path in dataset_paths:
59+
for params in input_params_list:
60+
eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params)
61+
62+
# Assertions on IDs generated by the decorator logic
63+
assert len(unique_invocation_ids) == 1
64+
assert len(unique_run_ids) == 20 # 4 combinations * 5 runs each
65+
assert len(unique_cohort_ids) == 2 * 2 # 2 datasets * 2 param sets
66+
assert len(unique_row_ids) == 19 # from the markdown dataset
67+
assert len(unique_rollout_ids) == 19 * 5 * 2 * 2 # rows * runs * datasets * params

0 commit comments

Comments
 (0)