Skip to content

Commit 6dca6e3

Browse files
author
Dylan Huang
committed
first pass eval metadata
1 parent 1159eae commit 6dca6e3

File tree

3 files changed

+40
-4
lines changed

3 files changed

+40
-4
lines changed

eval_protocol/models.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,17 @@ class InputMetadata(BaseModel):
200200
)
201201

202202

203+
class EvalMetadata(BaseModel):
    """Metadata describing a single evaluation run.

    Captures the evaluation's identity (name and optional description),
    the code version it ran against, and its lifecycle status.
    """

    # Human-readable identifier of the evaluation (required).
    name: str = Field(..., description="Name of the evaluation")

    # Optional free-form description; absent by default.
    description: Optional[str] = Field(default=None, description="Description of the evaluation")

    # Required version string; callers typically fill this with the
    # current commit hash (see the docstring of the field description).
    version: str = Field(
        ...,
        description="Version of the evaluation. By default, we will populate this with the current commit hash.",
    )

    # Lifecycle state; a freshly created record starts out as "running".
    status: Literal["running", "finished", "error"] = Field(
        default="running", description="Status of the evaluation"
    )
203214
class EvaluationRow(BaseModel):
204215
"""
205216
Unified data structure for a single evaluation unit that contains messages,
@@ -241,6 +252,10 @@ class EvaluationRow(BaseModel):
241252

242253
created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the row was created.")
243254

255+
eval_metadata: Optional[EvalMetadata] = Field(
256+
default=None, description="Metadata about the evaluation that was run."
257+
)
258+
244259
def is_trajectory_evaluation(self) -> bool:
245260
"""
246261
Returns True if this represents a trajectory evaluation (has step_outputs),

eval_protocol/pytest/evaluation_test.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import inspect
22
from typing import Any, Callable, Dict, List, Optional
33

4-
from eval_protocol.dataset_logger import default_logger
5-
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
64
import pytest
75

8-
from eval_protocol.models import EvaluationRow
6+
# Import versioneer for getting version information
7+
import versioneer
8+
from eval_protocol.dataset_logger import default_logger
9+
from eval_protocol.models import EvalMetadata, EvaluationRow
10+
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
911
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
1012
from eval_protocol.pytest.types import (
1113
Dataset,
@@ -255,7 +257,17 @@ def wrapper_body(**kwargs):
255257
)
256258
all_results.extend(results)
257259

260+
# Create eval metadata with test function info and current commit hash
261+
eval_metadata = EvalMetadata(
262+
name=test_func.__name__,
263+
description=test_func.__doc__,
264+
version=versioneer.get_version(),
265+
status="finished",
266+
)
267+
268+
# Add metadata to all results before logging
258269
for r in all_results:
270+
r.eval_metadata = eval_metadata
259271
default_logger.log(r)
260272

261273
scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]

vite-app/src/types/eval-protocol.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,13 @@ export const CompletionUsageSchema = z.object({
7474
total_tokens: z.number()
7575
});
7676

77+
export const EvalMetadataSchema = z.object({
78+
name: z.string().describe('Name of the evaluation'),
79+
description: z.string().optional().describe('Description of the evaluation'),
80+
version: z.string().describe('Version of the evaluation. By default, we will populate this with the current commit hash.'),
81+
status: z.enum(['running', 'finished', 'error']).default('running').describe('Status of the evaluation')
82+
});
83+
7784
export const EvaluationRowSchema = z.object({
7885
messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
7986
tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
@@ -84,7 +91,8 @@ export const EvaluationRowSchema = z.object({
8491
created_at: z.preprocess(
8592
(val) => typeof val === "string" ? new Date(val) : val,
8693
z.date()
87-
).describe('The timestamp when the row was created. Accepts string and parses to Date.')
94+
).describe('The timestamp when the row was created. Accepts string and parses to Date.'),
95+
eval_metadata: EvalMetadataSchema.optional().describe('Metadata about the evaluation that was run.')
8896
});
8997

9098
// Agent Evaluation Framework (V2) schemas
@@ -142,6 +150,7 @@ export type EvaluateResult = z.infer<typeof EvaluateResultSchema>;
142150
export type CompletionParams = z.infer<typeof CompletionParamsSchema>;
143151
export type InputMetadata = z.infer<typeof InputMetadataSchema>;
144152
export type CompletionUsage = z.infer<typeof CompletionUsageSchema>;
153+
export type EvalMetadata = z.infer<typeof EvalMetadataSchema>;
145154
export type EvaluationRow = z.infer<typeof EvaluationRowSchema>;
146155
export type ResourceServerConfig = z.infer<typeof ResourceServerConfigSchema>;
147156
export type EvaluationCriteriaModel = z.infer<typeof EvaluationCriteriaModelSchema>;

0 commit comments

Comments (0)