You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Refactor EvalMetadata and EvaluationRow models; add cohort_id, rollout_id, and run_id fields. Update evaluation_test to handle new identifiers and improve documentation on evaluation concepts.
name: z.string().describe('Name of the evaluation'),
79
79
description: z.string().optional().describe('Description of the evaluation'),
80
80
version: z.string().describe('Version of the evaluation. By default, we will populate this with the current commit hash.'),
81
-
status: z.enum(['running','finished','error','stopped']).default('running').describe('Status of the evaluation'),
81
+
status: z.enum(['running','finished','error','stopped']).optional().describe('Status of the evaluation'),
82
82
num_runs: z.number().int().describe('Number of times the evaluation was repeated'),
83
83
aggregation_method: z.string().describe('Method used to aggregate scores across runs'),
84
84
threshold_of_success: z.number().optional().describe('Threshold score for test success'),
85
-
passed: z.boolean().optional().describe('Whether the evaluation passed based on the threshold'),
86
-
run_id: z.string().optional().describe('Unique identifier for the run. A "run" is a group of rows that were evaluated together in single configuration of a @evaluation_test.')
85
+
passed: z.boolean().optional().describe('Whether the evaluation passed based on the threshold')
86
+
});
87
+
88
+
// Rollout status model (matches Python RolloutStatus)
89
+
export const RolloutStatusSchema = z.object({
90
+
status: z
91
+
.enum(['running','finished','error','stopped'])
92
+
.default('finished')
93
+
.describe('Status of the rollout.'),
94
+
error_message: z.string().optional().describe('Error message if the rollout failed.')
87
95
});
88
96
89
97
export const EvaluationRowSchema = z.object({
90
98
messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
91
99
tools: z.array(z.record(z.string(),z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
92
100
input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
101
+
rollout_status: RolloutStatusSchema.default({status: 'finished'}).describe('The status of the rollout.'),
102
+
cohort_id: z.string().optional().describe('The ID of the cohort that this row belongs to.'),
103
+
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
104
+
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
93
105
ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
94
106
evaluation_result: EvaluateResultSchema.optional().describe('The evaluation result for this row/trajectory.'),
95
107
usage: CompletionUsageSchema.optional().describe('Token usage statistics from LLM calls during execution.'),
@@ -158,6 +170,7 @@ export type InputMetadata = z.infer<typeof InputMetadataSchema>;
0 commit comments