Skip to content

Commit 3ad780b

Browse files
author
Dylan Huang
committed
Refactor EvalMetadata and EvaluationRow models; add cohort_id, rollout_id, and run_id fields. Update evaluation_test to handle new identifiers and improve documentation on evaluation concepts.
1 parent b28fa2b commit 3ad780b

File tree

3 files changed

+85
-27
lines changed

3 files changed

+85
-27
lines changed

eval_protocol/models.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -214,14 +214,6 @@ class EvalMetadata(BaseModel):
214214
status: Optional[Literal["running", "finished", "error", "stopped"]] = Field(
215215
None, description="Status of the evaluation"
216216
)
217-
run_id: Optional[str] = Field(
218-
None,
219-
description=(
220-
"Unique identifier for the run. A 'run' is a group of rows"
221-
"that were evaluated together in single configuration of a @evaluation_test."
222-
" This means that running the save @evaluation_test with "
223-
),
224-
)
225217
num_runs: int = Field(..., description="Number of times the evaluation was repeated")
226218
aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
227219
threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
@@ -253,8 +245,8 @@ class EvaluationRow(BaseModel):
253245
supporting both row-wise batch evaluation and trajectory-based RL evaluation.
254246
"""
255247

256-
# Core conversation data
257-
messages: List[Message] = Field(description="List of messages in the conversation/trajectory.")
248+
# Core OpenAI ChatCompletion compatible conversation data
249+
messages: List[Message] = Field(description="List of messages in the conversation. Also known as a trajectory.")
258250

259251
# Tool and function call information
260252
tools: Optional[List[Dict[str, Any]]] = Field(
@@ -272,6 +264,21 @@ class EvaluationRow(BaseModel):
272264
description="The status of the rollout.",
273265
)
274266

267+
cohort_id: Optional[str] = Field(
268+
default_factory=generate_id,
269+
description="The ID of the cohort that this row belongs to.",
270+
)
271+
272+
rollout_id: Optional[str] = Field(
273+
default_factory=generate_id,
274+
description="The ID of the rollout that this row belongs to.",
275+
)
276+
277+
run_id: Optional[str] = Field(
278+
None,
279+
description=("The ID of the run that this row belongs to."),
280+
)
281+
275282
# Ground truth reference (moved from EvaluateResult to top level)
276283
ground_truth: Optional[str] = Field(
277284
default=None, description="Optional ground truth reference for this evaluation."

eval_protocol/pytest/evaluation_test.py

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,33 @@ def evaluation_test( # noqa: C901
6161
]:
6262
"""Decorator to create pytest-based evaluation tests.
6363
64+
Here are some key concepts to understand the terminology in EP:
65+
66+
- "cohort" is a group of runs with a static set of parameters. A single
67+
cohort will have multiple runs if num_runs > 1.
68+
1. If your evaluation_test has combinations of parameters, it will generate
69+
multiple cohorts per combination of parameters.
70+
2. A new execution of a test function will generate a new cohort.
71+
- "run" is a group of rollouts. When num_runs > 1, there will be
72+
multiple "run_id"s.
73+
- "rollout" is the execution/process that produces a "trajectory". You
74+
"execute" multiple rollouts to generate a dataset of trajectories.
75+
- "trajectory" is the result produced by a rollout — a list of OpenAI Chat
76+
Completion messages (e.g. the "messages" field in EvaluationRow).
77+
- "row" is both the input and output of an evaluation. For example, in
78+
tau-bench, a row is a task within the dataset that can be identified as
79+
"airline_task_0" or "airline_task_1" etc. The "row_id" can be populated from
80+
the dataset itself to identify a particular task you want to evaluate. If
81+
not provided, EP will generate a "row_id" for each row whenever you call the
82+
evaluation test.
83+
- "dataset" is a collection of rows (e.g. List[EvaluationRow])
84+
- "eval" is a rubric implemented in the body of an @evaluation_test
85+
decorated test. It simply produces a score from 0 to 1 and attaches it
86+
to the row as the "evaluation_result" field.
87+
88+
A "cohort", "run", "rollout", and "row" each have a unique ID which can be
89+
used to easily group and identify them.
90+
6491
Args:
6592
model: Model identifiers to query.
6693
input_messages: Messages to send to the model. This is useful if you
@@ -121,15 +148,15 @@ def decorator(
121148

122149
def execute_with_params(
123150
test_func: TestFunction,
124-
row: EvaluationRow | None = None,
125-
input_dataset: List[EvaluationRow] | None = None,
151+
processed_row: EvaluationRow | None = None,
152+
processed_dataset: List[EvaluationRow] | None = None,
126153
evaluation_test_kwargs: Optional[EvaluationInputParam] = None,
127154
):
128155
kwargs = {}
129-
if input_dataset is not None:
130-
kwargs["rows"] = input_dataset
131-
if row is not None:
132-
kwargs["row"] = row
156+
if processed_dataset is not None:
157+
kwargs["rows"] = processed_dataset
158+
if processed_row is not None:
159+
kwargs["row"] = processed_row
133160
if evaluation_test_kwargs is not None:
134161
if "row" in evaluation_test_kwargs:
135162
raise ValueError("'row' is a reserved parameter for the evaluation function")
@@ -244,7 +271,7 @@ def generate_combinations():
244271
# Create wrapper function with exact signature that pytest expects
245272
def create_wrapper_with_signature() -> Callable:
246273
# Create the function body that will be used
247-
run_id = generate_id()
274+
cohort_id = generate_id()
248275

249276
def wrapper_body(**kwargs):
250277
model_name = kwargs["model"]
@@ -310,7 +337,6 @@ def _log_eval_error(
310337
aggregation_method=aggregation_method,
311338
threshold_of_success=threshold_of_success,
312339
passed=None,
313-
run_id=run_id,
314340
)
315341

316342
# Populate completion_params in input_metadata for all rows and initialize eval_metadata BEFORE rollouts
@@ -331,6 +357,7 @@ def _log_eval_error(
331357
row.input_metadata.session_data["mode"] = mode
332358
# Initialize eval_metadata for each row
333359
row.eval_metadata = eval_metadata
360+
row.cohort_id = cohort_id
334361

335362
# has to be done in the pytest main process since it's
336363
# used to determine whether this eval has stopped
@@ -350,14 +377,25 @@ def _log_eval_error(
350377
for _ in range(num_runs):
351378
# Regenerate outputs each run by deep-copying the pristine dataset
352379
# so model responses are not reused across runs.
353-
fresh_rows = [copy.deepcopy(r) for r in data]
354-
input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config)
380+
run_id = generate_id()
381+
fresh_dataset = [copy.deepcopy(r) for r in data]
382+
383+
# apply new run_id to fresh_dataset
384+
for row in fresh_dataset:
385+
row.run_id = run_id
386+
387+
# generate new rollout_id for each row
388+
for row in fresh_dataset:
389+
row.rollout_id = generate_id()
390+
391+
processed_dataset = execute_function(rollout_processor, rows=fresh_dataset, config=config)
392+
355393
if mode == "pointwise":
356394
# Pointwise mode: apply the evaluator function to each row
357-
for row in input_dataset:
395+
for row in processed_dataset:
358396
result = execute_with_params(
359397
test_func,
360-
row=row,
398+
processed_row=row,
361399
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
362400
)
363401
if result is None or not isinstance(result, EvaluationRow):
@@ -369,7 +407,7 @@ def _log_eval_error(
369407
# Batch mode: call the test function with the full dataset
370408
results = execute_with_params(
371409
test_func,
372-
input_dataset=input_dataset,
410+
processed_dataset=processed_dataset,
373411
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
374412
)
375413
if results is None:

vite-app/src/types/eval-protocol.ts

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ export const CompletionParamsSchema = z.object({
6262
});
6363

6464
export const InputMetadataSchema = z.object({
65-
row_id: z.string().describe('Unique string to ID the row'),
65+
row_id: z.string().optional().describe('Unique string to ID the row'),
6666
completion_params: CompletionParamsSchema.optional().describe('Completion endpoint parameters used'),
6767
dataset_info: z.record(z.string(), z.any()).optional().describe('Dataset row details: seed, system_prompt, environment_context, etc'),
6868
session_data: z.record(z.string(), z.any()).optional().describe('Session metadata like timestamp (input only, no duration/usage)')
@@ -78,18 +78,30 @@ export const EvalMetadataSchema = z.object({
7878
name: z.string().describe('Name of the evaluation'),
7979
description: z.string().optional().describe('Description of the evaluation'),
8080
version: z.string().describe('Version of the evaluation. By default, we will populate this with the current commit hash.'),
81-
status: z.enum(['running', 'finished', 'error', 'stopped']).default('running').describe('Status of the evaluation'),
81+
status: z.enum(['running', 'finished', 'error', 'stopped']).optional().describe('Status of the evaluation'),
8282
num_runs: z.number().int().describe('Number of times the evaluation was repeated'),
8383
aggregation_method: z.string().describe('Method used to aggregate scores across runs'),
8484
threshold_of_success: z.number().optional().describe('Threshold score for test success'),
85-
passed: z.boolean().optional().describe('Whether the evaluation passed based on the threshold'),
86-
run_id: z.string().optional().describe('Unique identifier for the run. A "run" is a group of rows that were evaluated together in single configuration of a @evaluation_test.')
85+
passed: z.boolean().optional().describe('Whether the evaluation passed based on the threshold')
86+
});
87+
88+
// Rollout status model (matches Python RolloutStatus)
89+
export const RolloutStatusSchema = z.object({
90+
status: z
91+
.enum(['running', 'finished', 'error', 'stopped'])
92+
.default('finished')
93+
.describe('Status of the rollout.'),
94+
error_message: z.string().optional().describe('Error message if the rollout failed.')
8795
});
8896

8997
export const EvaluationRowSchema = z.object({
9098
messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
9199
tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
92100
input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
101+
rollout_status: RolloutStatusSchema.default({ status: 'finished' }).describe('The status of the rollout.'),
102+
cohort_id: z.string().optional().describe('The ID of the cohort that this row belongs to.'),
103+
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
104+
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
93105
ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
94106
evaluation_result: EvaluateResultSchema.optional().describe('The evaluation result for this row/trajectory.'),
95107
usage: CompletionUsageSchema.optional().describe('Token usage statistics from LLM calls during execution.'),
@@ -158,6 +170,7 @@ export type InputMetadata = z.infer<typeof InputMetadataSchema>;
158170
export type CompletionUsage = z.infer<typeof CompletionUsageSchema>;
159171
export type EvalMetadata = z.infer<typeof EvalMetadataSchema>;
160172
export type EvaluationRow = z.infer<typeof EvaluationRowSchema>;
173+
export type RolloutStatus = z.infer<typeof RolloutStatusSchema>;
161174
export type ResourceServerConfig = z.infer<typeof ResourceServerConfigSchema>;
162175
export type EvaluationCriteriaModel = z.infer<typeof EvaluationCriteriaModelSchema>;
163176
export type TaskDefinitionModel = z.infer<typeof TaskDefinitionModelSchema>;

0 commit comments

Comments
 (0)