Skip to content

Commit c0c437b

Browse files
author
Dylan Huang
committed
add zod types for eval protocol
1 parent 9b1e528 commit c0c437b

4 files changed

Lines changed: 329 additions & 1 deletion

File tree

vite-app/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
"dependencies": {
1313
"react": "^19.1.0",
1414
"react-dom": "^19.1.0",
15-
"react-router-dom": "^7.7.1"
15+
"react-router-dom": "^7.7.1",
16+
"zod": "^4.0.14"
1617
},
1718
"devDependencies": {
1819
"@eslint/js": "^9.30.1",

vite-app/pnpm-lock.yaml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/src/types/README.md

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
# Evaluation Protocol Types
2+
3+
This directory contains Zod schemas and TypeScript types that mirror the Pydantic models from the Python `eval_protocol/models.py` file.
4+
5+
## Files
6+
7+
- `eval-protocol.ts` - Main Zod schemas and TypeScript types
8+
- `eval-protocol-utils.ts` - Utility functions for working with evaluation data
9+
- `index.ts` - Re-exports for easier importing
10+
11+
## Usage
12+
13+
### Basic Import
14+
15+
```typescript
16+
import { EvaluationRow, Message, EvaluateResult } from '@/types';
17+
```
18+
19+
### Using Zod Schemas for Validation
20+
21+
```typescript
22+
import { EvaluationRowSchema, MessageSchema } from '@/types';
23+
24+
// Validate incoming data
25+
const validateEvaluationRow = (data: unknown) => {
26+
return EvaluationRowSchema.parse(data);
27+
};
28+
29+
// Safe parsing with error handling
30+
const safeParseMessage = (data: unknown) => {
31+
const result = MessageSchema.safeParse(data);
32+
if (!result.success) {
33+
console.error('Validation failed:', result.error);
34+
return null;
35+
}
36+
return result.data;
37+
};
38+
```
39+
40+
### Using Utility Functions
41+
42+
```typescript
43+
import { evalRowUtils, messageUtils, evaluateResultUtils } from '@/types/eval-protocol-utils';
44+
45+
// Check if evaluation is trajectory-based
46+
const isTrajectory = evalRowUtils.isTrajectoryEvaluation(evaluationRow);
47+
48+
// Get conversation length
49+
const length = evalRowUtils.getConversationLength(evaluationRow);
50+
51+
// Get system message
52+
const systemMsg = evalRowUtils.getSystemMessage(evaluationRow);
53+
54+
// Check if message has tool calls
55+
const hasTools = messageUtils.hasToolCalls(message);
56+
57+
// Get message content as string
58+
const content = messageUtils.getContentAsString(message);
59+
```
60+
61+
## Key Types
62+
63+
### Core Types
64+
65+
- `EvaluationRow` - Main data structure for evaluation results
66+
- `Message` - Chat message with role, content, and optional metadata
67+
- `EvaluateResult` - Evaluation results with scores and metrics
68+
- `MetricResult` - Individual metric results
69+
- `StepOutput` - Per-step evaluation output for RL scenarios
70+
71+
### Agent Evaluation Framework (V2)
72+
73+
- `TaskDefinitionModel` - Task configuration for agent evaluation
74+
- `ResourceServerConfig` - Server configuration for tasks
75+
- `EvaluationCriteriaModel` - Criteria for evaluating task success
76+
77+
### MCP Configuration
78+
79+
- `MCPMultiClientConfiguration` - MCP server configuration
80+
- `MCPConfigurationServerStdio` - Stdio-based MCP server
81+
- `MCPConfigurationServerUrl` - URL-based MCP server
82+
83+
## Validation Examples
84+
85+
### Validating API Responses
86+
87+
```typescript
88+
import { EvaluationRowSchema, EvaluationRow } from '@/types';
89+
90+
async function fetchEvaluationData(): Promise<EvaluationRow> {
91+
const response = await fetch('/api/evaluation');
92+
const data = await response.json();
93+
94+
// Validate the response
95+
return EvaluationRowSchema.parse(data);
96+
}
97+
```
98+
99+
### Creating New Evaluation Data
100+
101+
```typescript
102+
import { EvaluationRowSchema, MessageSchema, Message } from '@/types';
103+
104+
const newMessage: Message = {
105+
role: 'user',
106+
content: 'Hello, how are you?'
107+
};
108+
109+
// Validate the message
110+
const validatedMessage = MessageSchema.parse(newMessage);
111+
112+
const newEvaluationRow = {
113+
messages: [validatedMessage],
114+
input_metadata: {
115+
row_id: 'unique-id-123'
116+
},
117+
created_at: new Date()
118+
};
119+
120+
// Validate the evaluation row
121+
const validatedRow = EvaluationRowSchema.parse(newEvaluationRow);
122+
```
123+
124+
## Type Safety
125+
126+
All types are derived from Zod schemas, ensuring runtime validation and compile-time type safety. The schemas include:
127+
128+
- Field validation (e.g., score ranges, required fields)
129+
- Default values
130+
- Optional fields
131+
- Union types for flexible content
132+
- Descriptive error messages
133+
134+
## Migration from Python
135+
136+
The TypeScript types closely mirror the Python Pydantic models:
137+
138+
- `BaseModel` → `z.object()`
139+
- `Field()` → `z.string().describe()`
140+
- `Optional[T]` → `z.optional()`
141+
- `List[T]` → `z.array()`
142+
- `Dict[str, Any]` → `z.record(z.string(), z.any())`
143+
- `extra="allow"` → `.passthrough()`
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
import { EvaluationRow, Message } from './eval-protocol';
2+
3+
/**
 * Reads a single field from a message's `control_plane_step` payload.
 *
 * The payload is treated as opaque data: anything that is not a non-null
 * object yields `undefined`, so callers can narrow the result with `typeof`
 * instead of casting to `any`.
 */
const controlPlaneField = (step: unknown, field: string): unknown => {
  if (typeof step !== 'object' || step === null) {
    return undefined;
  }
  return (step as Record<string, unknown>)[field];
};

/**
 * Utility functions for working with EvaluationRow data.
 * These mirror the methods from the Python EvaluationRow class.
 *
 * Optional fields are checked against both `undefined` and `null`, so a row
 * carrying explicit `null` values does not cause a runtime TypeError.
 */
export const evalRowUtils = {
  /**
   * Returns true if this represents a trajectory evaluation (the evaluation
   * result has at least one entry in `step_outputs`), false otherwise.
   *
   * A missing or `null` `evaluation_result` / `step_outputs` counts as
   * "no step outputs" rather than throwing.
   */
  isTrajectoryEvaluation: (row: EvaluationRow): boolean => {
    return (row.evaluation_result?.step_outputs?.length ?? 0) > 0;
  },

  /**
   * Returns the number of messages in the conversation.
   */
  getConversationLength: (row: EvaluationRow): number => {
    return row.messages.length;
  },

  /**
   * Returns the first message whose role is `system`.
   * Returns an empty system message (`content: ''`) if none is found.
   */
  getSystemMessage: (row: EvaluationRow): Message => {
    return row.messages.find(msg => msg.role === 'system') ?? { role: 'system', content: '' };
  },

  /**
   * Returns only the assistant messages from the conversation.
   */
  getAssistantMessages: (row: EvaluationRow): Message[] => {
    return row.messages.filter(msg => msg.role === 'assistant');
  },

  /**
   * Returns only the user messages from the conversation.
   */
  getUserMessages: (row: EvaluationRow): Message[] => {
    return row.messages.filter(msg => msg.role === 'user');
  },

  /**
   * Looks up `key` in `input_metadata`.
   *
   * @param row - The evaluation row to read from.
   * @param key - Name of the metadata field.
   * @param defaultValue - Returned when the metadata is absent or the field
   *   is `undefined`/`null`.
   * @returns The stored value, or `defaultValue`.
   */
  getInputMetadata: (row: EvaluationRow, key: string, defaultValue?: any): any => {
    const metadata: unknown = row.input_metadata;
    if (typeof metadata !== 'object' || metadata === null) {
      return defaultValue;
    }
    return (metadata as Record<string, unknown>)[key] ?? defaultValue;
  },

  /**
   * Returns the number of messages that carry a (truthy) `control_plane_step`.
   */
  getSteps: (row: EvaluationRow): number => {
    return row.messages.filter(msg => Boolean(msg.control_plane_step)).length;
  },

  /**
   * Sums the numeric `reward` values found in the messages'
   * `control_plane_step` data. Non-numeric or missing rewards contribute 0.
   */
  getTotalReward: (row: EvaluationRow): number => {
    let total = 0;
    for (const msg of row.messages) {
      const reward = controlPlaneField(msg.control_plane_step, 'reward');
      if (typeof reward === 'number') {
        total += reward;
      }
    }
    return total;
  },

  /**
   * Returns true if any message's `control_plane_step` has
   * `terminated === true`.
   */
  getTerminated: (row: EvaluationRow): boolean => {
    return row.messages.some(
      msg => controlPlaneField(msg.control_plane_step, 'terminated') === true
    );
  },

  /**
   * Returns the `termination_reason` of the last message whose
   * `control_plane_step` carries a non-empty string reason, or `'unknown'`
   * when there is none.
   */
  getTerminationReason: (row: EvaluationRow): string => {
    // Walk backwards so the most recent reason wins.
    for (let i = row.messages.length - 1; i >= 0; i--) {
      const reason = controlPlaneField(row.messages[i]?.control_plane_step, 'termination_reason');
      // Only strings are returned so the declared return type stays truthful.
      if (typeof reason === 'string' && reason.length > 0) {
        return reason;
      }
    }
    return 'unknown';
  }
};
111+
112+
/**
113+
* Utility functions for working with Message data
114+
*/
115+
export const messageUtils = {
116+
/**
117+
* Check if a message has tool calls
118+
*/
119+
hasToolCalls: (message: Message): boolean => {
120+
return message.tool_calls !== undefined && message.tool_calls.length > 0;
121+
},
122+
123+
/**
124+
* Check if a message has function calls
125+
*/
126+
hasFunctionCall: (message: Message): boolean => {
127+
return message.function_call !== undefined;
128+
},
129+
130+
/**
131+
* Get the content as a string, handling both string and array content types
132+
*/
133+
getContentAsString: (message: Message): string => {
134+
if (typeof message.content === 'string') {
135+
return message.content;
136+
}
137+
if (Array.isArray(message.content)) {
138+
return message.content
139+
.filter(part => part.type === 'text')
140+
.map(part => part.text)
141+
.join('');
142+
}
143+
return '';
144+
}
145+
};
146+
147+
/**
148+
* Utility functions for working with EvaluateResult data
149+
*/
150+
export const evaluateResultUtils = {
151+
/**
152+
* Check if the evaluation result has step outputs (trajectory evaluation)
153+
*/
154+
hasStepOutputs: (result: any): boolean => {
155+
return result.step_outputs !== undefined && result.step_outputs.length > 0;
156+
},
157+
158+
/**
159+
* Get the total base reward from step outputs
160+
*/
161+
getTotalBaseReward: (result: any): number => {
162+
if (!result.step_outputs) {
163+
return 0.0;
164+
}
165+
return result.step_outputs.reduce((total: number, step: any) => {
166+
return total + (step.base_reward || 0);
167+
}, 0.0);
168+
},
169+
170+
/**
171+
* Get the number of steps from step outputs
172+
*/
173+
getStepCount: (result: any): number => {
174+
return result.step_outputs?.length || 0;
175+
}
176+
};

0 commit comments

Comments
 (0)