add zod types for eval protocol

Dylan Huang · Dylan Huang · commit c0c437bd9010 · 2025-08-05T11:15:07.000-07:00
diff --git a/vite-app/package.json b/vite-app/package.json
@@ -12,7 +12,8 @@
   "dependencies": {
     "react": "^19.1.0",
     "react-dom": "^19.1.0",
-    "react-router-dom": "^7.7.1"
+    "react-router-dom": "^7.7.1",
+    "zod": "^4.0.14"
   },
   "devDependencies": {
     "@eslint/js": "^9.30.1",
diff --git a/vite-app/pnpm-lock.yaml b/vite-app/pnpm-lock.yaml
diff --git a/vite-app/src/types/README.md b/vite-app/src/types/README.md
@@ -0,0 +1,143 @@
+# Evaluation Protocol Types
+
+This directory contains Zod schemas and TypeScript types that mirror the Pydantic models from the Python `eval_protocol/models.py` file.
+
+## Files
+
+- `eval-protocol.ts` - Main Zod schemas and TypeScript types
+- `eval-protocol-utils.ts` - Utility functions for working with evaluation data
+- `index.ts` - Re-exports for easier importing
+
+## Usage
+
+### Basic Import
+
+```typescript
+import type { EvaluationRow, Message, EvaluateResult } from '@/types';
+```
+
+### Using Zod Schemas for Validation
+
+```typescript
+import { EvaluationRowSchema, MessageSchema } from '@/types';
+
+// Validate incoming data
+const validateEvaluationRow = (data: unknown) => {
+  return EvaluationRowSchema.parse(data);
+};
+
+// Safe parsing with error handling
+const safeParseMessage = (data: unknown) => {
+  const result = MessageSchema.safeParse(data);
+  if (!result.success) {
+    console.error('Validation failed:', result.error);
+    return null;
+  }
+  return result.data;
+};
+```
+
+### Using Utility Functions
+
+```typescript
+import { evalRowUtils, messageUtils, evaluateResultUtils } from '@/types/eval-protocol-utils';
+
+// Check if evaluation is trajectory-based
+const isTrajectory = evalRowUtils.isTrajectoryEvaluation(evaluationRow);
+
+// Get conversation length
+const length = evalRowUtils.getConversationLength(evaluationRow);
+
+// Get system message
+const systemMsg = evalRowUtils.getSystemMessage(evaluationRow);
+
+// Check if message has tool calls
+const hasTools = messageUtils.hasToolCalls(message);
+
+// Get message content as string
+const content = messageUtils.getContentAsString(message);
+```
+
+## Key Types
+
+### Core Types
+
+- `EvaluationRow` - Main data structure for evaluation results
+- `Message` - Chat message with role, content, and optional metadata
+- `EvaluateResult` - Evaluation results with scores and metrics
+- `MetricResult` - Individual metric results
+- `StepOutput` - Per-step evaluation output for RL scenarios
+
+### Agent Evaluation Framework (V2)
+
+- `TaskDefinitionModel` - Task configuration for agent evaluation
+- `ResourceServerConfig` - Server configuration for tasks
+- `EvaluationCriteriaModel` - Criteria for evaluating task success
+
+### MCP Configuration
+
+- `MCPMultiClientConfiguration` - MCP server configuration
+- `MCPConfigurationServerStdio` - Stdio-based MCP server
+- `MCPConfigurationServerUrl` - URL-based MCP server
+
+## Validation Examples
+
+### Validating API Responses
+
+```typescript
+import { EvaluationRowSchema, type EvaluationRow } from '@/types';
+
+async function fetchEvaluationData(): Promise<EvaluationRow> {
+  const response = await fetch('/api/evaluation');
+  const data = await response.json();
+  
+  // Validate the response
+  return EvaluationRowSchema.parse(data);
+}
+```
+
+### Creating New Evaluation Data
+
+```typescript
+import { EvaluationRowSchema, MessageSchema, type Message } from '@/types';
+
+const newMessage: Message = {
+  role: 'user',
+  content: 'Hello, how are you?'
+};
+
+// Validate the message
+const validatedMessage = MessageSchema.parse(newMessage);
+
+const newEvaluationRow = {
+  messages: [validatedMessage],
+  input_metadata: {
+    row_id: 'unique-id-123'
+  },
+  created_at: new Date()
+};
+
+// Validate the evaluation row
+const validatedRow = EvaluationRowSchema.parse(newEvaluationRow);
+```
+
+## Type Safety
+
+All types are derived from Zod schemas, ensuring runtime validation and compile-time type safety. The schemas include:
+
+- Field validation (e.g., score ranges, required fields)
+- Default values
+- Optional fields
+- Union types for flexible content
+- Descriptive error messages
+
+## Migration from Python
+
+The TypeScript types closely mirror the Python Pydantic models:
+
+- `BaseModel` → `z.object()`
+- `Field()` → `z.string().describe()`
+- `Optional[T]` → `z.optional()`
+- `List[T]` → `z.array()`
+- `Dict[str, Any]` → `z.record(z.string(), z.any())`
+- `extra="allow"` → `.passthrough()` 
diff --git a/vite-app/src/types/eval-protocol-utils.ts b/vite-app/src/types/eval-protocol-utils.ts
@@ -0,0 +1,176 @@
+import { EvaluationRow, Message } from './eval-protocol';
+
+/**
+ * Utility functions for working with EvaluationRow data
+ * These mirror the methods from the Python EvaluationRow class
+ */
+
+export const evalRowUtils = {
+  /**
+   * Returns true if this row is a trajectory evaluation, i.e. its evaluation_result has a
+   * non-empty step_outputs array, and false for a single-turn evaluation. A missing
+   * (undefined or null) evaluation_result or step_outputs counts as single-turn.
+   */
+  isTrajectoryEvaluation: (row: EvaluationRow): boolean => {
+    // Optional chaining plus `!= null` accepts both undefined and null, so a row
+    // deserialized from JSON with a null field does not throw here.
+    const stepOutputs = row.evaluation_result?.step_outputs;
+    return stepOutputs != null && stepOutputs.length > 0;
+  },
+
+  /**
+   * Returns the number of messages in the conversation.
+   */
+  getConversationLength: (row: EvaluationRow): number => {
+    return row.messages.length;
+  },
+
+  /**
+   * Returns the first message whose role is 'system'.
+   * When the conversation has none, returns an empty system Message
+   * (`{ role: 'system', content: '' }`) instead of undefined.
+   */
+  getSystemMessage: (row: EvaluationRow): Message => {
+    const systemMessage = row.messages.find(msg => msg.role === 'system');
+    // Placeholder fallback so callers never have to handle undefined.
+    return systemMessage ?? { role: 'system', content: '' };
+  },
+
+  /**
+   * Returns only the assistant messages from the conversation.
+   */
+  getAssistantMessages: (row: EvaluationRow): Message[] => {
+    return row.messages.filter(msg => msg.role === 'assistant');
+  },
+
+  /**
+   * Returns only the user messages from the conversation.
+   */
+  getUserMessages: (row: EvaluationRow): Message[] => {
+    return row.messages.filter(msg => msg.role === 'user');
+  },
+
+  /**
+   * Returns input_metadata[key], or defaultValue when input_metadata or the value is missing.
+   */
+  getInputMetadata: (row: EvaluationRow, key: string, defaultValue?: any): any => {
+    if (!row.input_metadata) {
+      return defaultValue;
+    }
+    return (row.input_metadata as any)[key] ?? defaultValue;
+  },
+
+  /**
+   * Returns the number of messages that carry control_plane_step data.
+   */
+  getSteps: (row: EvaluationRow): number => {
+    return row.messages.filter(msg => msg.control_plane_step).length;
+  },
+
+  /**
+   * Sums the numeric `reward` of every message that has control_plane_step data.
+   * Missing or non-numeric rewards contribute 0; returns 0 when no message has one.
+   */
+  getTotalReward: (row: EvaluationRow): number => {
+    // reduce() with an initial value already yields 0 for an empty list.
+    return row.messages
+      .filter(msg => msg.control_plane_step)
+      .reduce((total, msg) => {
+        const reward = (msg.control_plane_step as any)?.reward;
+        return total + (typeof reward === 'number' ? reward : 0);
+      }, 0.0);
+  },
+
+  /**
+   * Returns true if any message's control_plane_step has `terminated === true`.
+   * Returns false otherwise, including when no message has control_plane_step
+   * data at all.
+   */
+  getTerminated: (row: EvaluationRow): boolean => {
+    // some() is false for an empty list, so no separate empty check is needed;
+    // optional chaining skips messages without control_plane_step data.
+    return row.messages.some(msg => {
+      return (msg.control_plane_step as any)?.terminated === true;
+    });
+  },
+
+  /**
+   * Scans messages from last to first and returns the first truthy
+   * control_plane_step.termination_reason found, or 'unknown' if there is none.
+   */
+  getTerminationReason: (row: EvaluationRow): string => {
+    for (let i = row.messages.length - 1; i >= 0; i--) {
+      const reason = (row.messages[i].control_plane_step as any)?.termination_reason;
+      if (reason) {
+        return reason;
+      }
+    }
+    return 'unknown';
+  }
+};
+
+/**
+ * Utility functions for working with Message data
+ */
+export const messageUtils = {
+  /**
+   * Returns true if tool_calls is a non-empty list; false when it is undefined, null, or empty.
+   */
+  hasToolCalls: (message: Message): boolean => {
+    return message.tool_calls != null && message.tool_calls.length > 0;
+  },
+
+  /**
+   * Returns true if function_call is present; undefined and null both count as absent.
+   */
+  hasFunctionCall: (message: Message): boolean => {
+    return message.function_call != null;
+  },
+
+  /**
+   * Returns string content as-is, array content as its 'text' parts joined, otherwise ''.
+   */
+  getContentAsString: (message: Message): string => {
+    if (typeof message.content === 'string') {
+      return message.content;
+    }
+    if (Array.isArray(message.content)) {
+      return message.content
+        .filter(part => part.type === 'text')
+        .map(part => part.text)
+        .join('');
+    }
+    return '';
+  }
+};
+
+/**
+ * Utility functions for working with EvaluateResult data
+ */
+export const evaluateResultUtils = {
+  /**
+   * Returns true if result.step_outputs is non-empty; false if result or the list is null/undefined.
+   */
+  hasStepOutputs: (result: any): boolean => {
+    return (result?.step_outputs?.length ?? 0) > 0;
+  },
+
+  /**
+   * Sums the numeric base_reward of each step output; returns 0 if there are no step outputs.
+   */
+  getTotalBaseReward: (result: any): number => {
+    const stepOutputs: any[] = result?.step_outputs ?? [];
+    return stepOutputs.reduce((total: number, step: any) => {
+      const reward = step?.base_reward;
+      // Same rule as evalRowUtils.getTotalReward: only numeric rewards are summed.
+      return total + (typeof reward === 'number' ? reward : 0);
+    }, 0.0);
+  },
+
+  /**
+   * Returns the number of step outputs, or 0 if result or step_outputs is null/undefined.
+   */
+  getStepCount: (result: any): number => {
+    return result?.step_outputs?.length ?? 0;
+  }
+};