Skip to content

Commit 6dca6e3

Browse files
author
Dylan Huang
committed
first pass eval metadata
1 parent 1159eae commit 6dca6e3

File tree

3 files changed

+40
-4
lines changed

3 files changed

+40
-4
lines changed

eval_protocol/models.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,17 @@ class InputMetadata(BaseModel):
200200
)
201201

202202

203+
class EvalMetadata(BaseModel):
    """Metadata describing a single evaluation run.

    Captures the evaluation's identity (name and optional description),
    the code version it ran against, and its lifecycle status.
    """

    # Human-readable identifier of the evaluation (required).
    name: str = Field(..., description="Name of the evaluation")

    # Optional free-form description; absent by default.
    description: Optional[str] = Field(default=None, description="Description of the evaluation")

    # Required version string; callers typically fill this with the
    # current commit hash (see the docstring of the field description).
    version: str = Field(
        ...,
        description="Version of the evaluation. By default, we will populate this with the current commit hash.",
    )

    # Lifecycle state; a freshly created record starts out as "running".
    status: Literal["running", "finished", "error"] = Field(
        default="running", description="Status of the evaluation"
    )
203214
class EvaluationRow(BaseModel):
204215
"""
205216
Unified data structure for a single evaluation unit that contains messages,
@@ -241,6 +252,10 @@ class EvaluationRow(BaseModel):
241252

242253
created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the row was created.")
243254

255+
eval_metadata: Optional[EvalMetadata] = Field(
256+
default=None, description="Metadata about the evaluation that was run."
257+
)
258+
244259
def is_trajectory_evaluation(self) -> bool:
245260
"""
246261
Returns True if this represents a trajectory evaluation (has step_outputs),

eval_protocol/pytest/evaluation_test.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import inspect
22
from typing import Any, Callable, Dict, List, Optional
33

4-
from eval_protocol.dataset_logger import default_logger
5-
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
64
import pytest
75

8-
from eval_protocol.models import EvaluationRow
6+
# Import versioneer for getting version information
7+
import versioneer
8+
from eval_protocol.dataset_logger import default_logger
9+
from eval_protocol.models import EvalMetadata, EvaluationRow
10+
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
911
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
1012
from eval_protocol.pytest.types import (
1113
Dataset,
@@ -255,7 +257,17 @@ def wrapper_body(**kwargs):
255257
)
256258
all_results.extend(results)
257259

260+
# Create eval metadata with test function info and current commit hash
261+
eval_metadata = EvalMetadata(
262+
name=test_func.__name__,
263+
description=test_func.__doc__,
264+
version=versioneer.get_version(),
265+
status="finished",
266+
)
267+
268+
# Add metadata to all results before logging
258269
for r in all_results:
270+
r.eval_metadata = eval_metadata
259271
default_logger.log(r)
260272

261273
scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]

vite-app/src/types/eval-protocol.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,13 @@ export const CompletionUsageSchema = z.object({
7474
total_tokens: z.number()
7575
});
7676

77+
export const EvalMetadataSchema = z.object({
78+
name: z.string().describe('Name of the evaluation'),
79+
description: z.string().optional().describe('Description of the evaluation'),
80+
version: z.string().describe('Version of the evaluation. By default, we will populate this with the current commit hash.'),
81+
status: z.enum(['running', 'finished', 'error']).default('running').describe('Status of the evaluation')
82+
});
83+
7784
export const EvaluationRowSchema = z.object({
7885
messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
7986
tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
@@ -84,7 +91,8 @@ export const EvaluationRowSchema = z.object({
8491
created_at: z.preprocess(
8592
(val) => typeof val === "string" ? new Date(val) : val,
8693
z.date()
87-
).describe('The timestamp when the row was created. Accepts string and parses to Date.')
94+
).describe('The timestamp when the row was created. Accepts string and parses to Date.'),
95+
eval_metadata: EvalMetadataSchema.optional().describe('Metadata about the evaluation that was run.')
8896
});
8997

9098
// Agent Evaluation Framework (V2) schemas
@@ -142,6 +150,7 @@ export type EvaluateResult = z.infer<typeof EvaluateResultSchema>;
142150
export type CompletionParams = z.infer<typeof CompletionParamsSchema>;
143151
export type InputMetadata = z.infer<typeof InputMetadataSchema>;
144152
export type CompletionUsage = z.infer<typeof CompletionUsageSchema>;
153+
export type EvalMetadata = z.infer<typeof EvalMetadataSchema>;
145154
export type EvaluationRow = z.infer<typeof EvaluationRowSchema>;
146155
export type ResourceServerConfig = z.infer<typeof ResourceServerConfigSchema>;
147156
export type EvaluationCriteriaModel = z.infer<typeof EvaluationCriteriaModelSchema>;

0 commit comments

Comments (0)