diff --git a/docs/plans/1052-conversation-mode.md b/docs/plans/1052-conversation-mode.md new file mode 100644 index 000000000..2c50ab9e3 --- /dev/null +++ b/docs/plans/1052-conversation-mode.md @@ -0,0 +1,55 @@ +# Issue #1052: Multi-turn Conversational Test Case — Live Turn-by-Turn Evaluation + +## Problem + +Today, multi-turn evals script all intermediate assistant responses in `input` — the LLM only generates the last response. This means conversation context retention, progressive reasoning, and turn-by-turn quality cannot be measured independently. + +## Solution + +Add `mode: conversation` with a `turns` array that drives turn-by-turn LLM evaluation with per-turn and conversation-level grading. + +### New Schema Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `mode` | `'conversation'` | - | Enables conversation evaluation mode | +| `turns` | `ConversationTurn[]` | - | Ordered user messages; each generates an LLM call | +| `aggregation` | `'mean' \| 'min' \| 'max'` | `'mean'` | How turn scores combine into final score | +| `on_turn_failure` | `'continue' \| 'stop'` | `'continue'` | What to do when a turn's assertions fail | +| `window_size` | `number` | all turns | Sliding window for context passed to graders | + +### How It Works + +1. `input` provides system prompt and initial context (same as today) +2. For each entry in `turns`: + a. Append the user message to accumulated history + b. Call the provider with full history — LLM generates assistant response + c. Grade the response against turn's `assertions` and `expected_output` + d. Append actual LLM response (not expected_output) to history +3. After all turns: run top-level `assertions` over full transcript +4. 
Final score = aggregation of per-turn + conversation assertion scores + +### Validation Rules + +- `turns` requires `mode: conversation` +- `mode: conversation` requires `turns` +- `turns` incompatible with top-level `expected_output` +- `aggregation` only valid with `mode: conversation` +- Each turn must have non-empty `input` + +### Files Modified + +| File | Change | +|------|--------| +| `packages/core/src/evaluation/types.ts` | ConversationTurn, mode, turns, etc. on EvalTest | +| `packages/core/src/evaluation/validation/eval-file.schema.ts` | Zod schema for new fields | +| `packages/core/src/evaluation/yaml-parser.ts` | Parse conversation fields | +| `packages/core/src/evaluation/orchestrator.ts` | Conversation runner in runEvalCase | +| `packages/core/test/evaluation/conversation-mode.test.ts` | Unit tests | +| `examples/features/multi-turn-conversation-live/` | UAT example | + +## References + +- Issue: #1052 +- Research: agentevals-research PR #57 +- Prior art: #505 / PR #507 (scripted multi-turn), #331 / PR #1051 (depends_on) diff --git a/examples/features/multi-turn-conversation-live/README.md b/examples/features/multi-turn-conversation-live/README.md new file mode 100644 index 000000000..db0a9bd28 --- /dev/null +++ b/examples/features/multi-turn-conversation-live/README.md @@ -0,0 +1,22 @@ +# Multi-Turn Conversation (Live) + +This example demonstrates **live turn-by-turn conversation evaluation** where the LLM generates each assistant response (unlike `multi-turn-conversation/` which scripts intermediate turns). 
+ +## Features Shown + +- `mode: conversation` — enables live turn-by-turn evaluation +- `turns[]` — each entry is a user message that generates an LLM call +- Per-turn `assertions` — string shorthand (rubric) and structured evaluators +- `aggregation: mean | min | max` — how turn scores combine +- `on_turn_failure: stop | continue` — behavior on assertion failure +- Top-level `assertions` — conversation-level grading after all turns + +## Running + +```bash +# With default target +bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml + +# With specific test +bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml --test-id context-retention +``` diff --git a/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml new file mode 100644 index 000000000..831f6597c --- /dev/null +++ b/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml @@ -0,0 +1,105 @@ +# Multi-turn conversation evaluation (live turn-by-turn) +# Each turn generates a fresh LLM call; per-turn assertions grade each response. +# This is different from multi-turn-conversation/ which scripts intermediate turns. + +description: Live multi-turn conversation evaluation with per-turn grading + +execution: + target: llm + +tests: + # Test 1: Basic context retention across turns + - id: context-retention + mode: conversation + criteria: Agent maintains context and provides relevant responses across turns + aggregation: mean + input: + - role: system + content: |- + You are a helpful math tutor. Be concise and accurate. + Always show your work step by step. + turns: + - input: What is 15% of 200? + assertions: + - Correctly calculates 15% of 200 as 30 + - Shows the calculation steps + - input: Now double that result. 
+ assertions: + - References the previous answer of 30 + - Correctly calculates double as 60 + - input: What were the original numbers I asked about? + assertions: + - Recalls that the user asked about 15% and 200 + - Demonstrates memory of the conversation context + + # Test 2: With aggregation: min (weakest-link scoring) + - id: weakest-link-scoring + mode: conversation + criteria: Agent provides accurate, well-structured responses + aggregation: min + input: + - role: system + content: You are a concise geography expert. Answer in 1-2 sentences. + turns: + - input: What is the capital of France? + assertions: + - Correctly identifies Paris as the capital of France + - input: What country is it in? + assertions: + - Recognizes the question refers to Paris from the previous turn + - Confirms Paris is in France + + # Test 3: With on_turn_failure: stop + - id: stop-on-failure + mode: conversation + on_turn_failure: stop + criteria: Agent follows instructions precisely + input: + - role: system + content: You are a helpful assistant. Be precise and accurate. + turns: + - input: What is 2 + 2? + assertions: + - Answers with 4 + - input: Multiply that by 3. + assertions: + - References the previous answer + - Calculates 12 correctly + + # Test 4: Mixed string and structured assertions + - id: mixed-assertions + mode: conversation + criteria: Agent writes correct, well-formed Python code + input: + - role: system + content: You are a helpful coding assistant. + turns: + - input: Write a Python function that adds two numbers. + assertions: + - Contains a Python function definition + - type: contains + value: def + - input: Now add type hints to the function. 
+ assertions: + - Includes type hints (int, float, or similar) + - type: contains + value: "->" + + # Test 5: Conversation-level assertions + - id: conversation-coherence + mode: conversation + criteria: Agent maintains a coherent, helpful conversation + input: + - role: system + content: You are a helpful travel advisor. Be concise. + turns: + - input: I want to visit somewhere warm in December. + assertions: + - Suggests at least one warm destination + - input: I prefer beaches over cities. + assertions: + - Adjusts recommendations toward beach destinations + - Does not suggest purely urban destinations + assertions: + - Agent maintains consistency — later suggestions align with earlier preferences + - Agent does not contradict its own prior recommendations diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index ebedc1e00..7430bc029 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -24,6 +24,8 @@ import { resolveTargetDefinition, } from './providers/targets.js'; import type { + ChatMessage, + ChatMessageRole, EnvLookup, Message, Provider, @@ -47,6 +49,8 @@ import { import { aggregateTrials } from './trials.js'; import type { AssertionEntry, + ConversationAggregation, + ConversationTurn, DependencyResult, EvalTest, EvaluationResult, @@ -60,6 +64,8 @@ import type { JsonObject, JsonValue, LlmGraderEvaluatorConfig, + TestMessage, + TestMessageRole, TrialResult, TrialsConfig, WorkspaceHookConfig, @@ -1889,6 +1895,42 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise {}); + } + } + + return conversationResult; + } + const caseStartMs = Date.now(); const attemptBudget = (maxRetries ?? 
0) + 1; let attempt = 0; @@ -2887,6 +2929,382 @@ function buildEvaluatorRegistry( }; } +// --------------------------------------------------------------------------- +// Conversation mode: turn-by-turn evaluation +// --------------------------------------------------------------------------- + +/** + * Run a multi-turn conversation evaluation. + * For each turn: append user message → call provider → grade turn → append LLM response. + * After all turns, run conversation-level assertions on the full transcript. + * Final score is aggregated from turn scores + conversation scores. + */ +async function runConversationMode(options: { + readonly evalCase: EvalTest; + readonly provider: Provider; + readonly target: ResolvedTarget; + readonly evaluators: Partial> & { readonly 'llm-grader': Evaluator }; + readonly typeRegistry: import('./registry/evaluator-registry.js').EvaluatorRegistry; + readonly graderProvider?: Provider; + readonly promptInputs: PromptInputs; + readonly nowFn: () => Date; + readonly signal?: AbortSignal; + readonly workspacePath?: string; + readonly caseWorkspaceFile?: string; + readonly agentTimeoutMs?: number; + readonly streamCallbacks?: ProviderStreamCallbacks; + readonly verbose?: boolean; + readonly threshold?: number; + readonly targetResolver?: (name: string) => Provider | undefined; + readonly availableTargets?: readonly string[]; +}): Promise { + const { + evalCase, + provider, + target, + evaluators, + typeRegistry, + graderProvider, + promptInputs, + nowFn, + signal, + workspacePath, + caseWorkspaceFile, + agentTimeoutMs, + streamCallbacks, + verbose, + threshold, + targetResolver, + availableTargets, + } = options; + + // biome-ignore lint/style/noNonNullAssertion: turns is guaranteed by the caller (conversation mode gate) + const turns = evalCase.turns!; + const aggregation = evalCase.aggregation ?? 'mean'; + const onTurnFailure = evalCase.on_turn_failure ?? 
'continue'; + const windowSize = evalCase.window_size; + + // Build initial message history from evalCase.input (system prompt + any context) + const history: ChatMessage[] = []; + for (const msg of evalCase.input) { + const content = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content); + history.push({ role: msg.role as ChatMessageRole, content }); + } + + const turnScores: EvaluatorResult[] = []; + const allTurnScoreValues: number[] = []; + let stopped = false; + const caseStartMs = Date.now(); + + for (let i = 0; i < turns.length; i++) { + const turn = turns[i]; + const turnIndex = i + 1; + + if (stopped) { + // Turn skipped due to on_turn_failure: stop + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: 0, + verdict: 'skip' as EvaluationVerdict, + assertions: [{ text: 'Skipped due to previous turn failure', passed: false }], + }); + allTurnScoreValues.push(0); + continue; + } + + // Append user message to history + const userContent = typeof turn.input === 'string' ? turn.input : JSON.stringify(turn.input); + history.push({ role: 'user', content: userContent }); + + // Build chatPrompt for provider call (with optional window_size) + const chatPromptForProvider = windowSize + ? buildWindowedHistory(history, windowSize) + : [...history]; + + // Call provider with accumulated history + let response: ProviderResponse; + try { + response = await provider.invoke({ + question: userContent, + chatPrompt: chatPromptForProvider, + evalCaseId: `${evalCase.id}/turn-${turnIndex}`, + signal, + cwd: workspacePath, + workspaceFile: caseWorkspaceFile, + streamCallbacks, + }); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: 0, + verdict: 'fail' as EvaluationVerdict, + assertions: [{ text: `Provider error: ${message}`, passed: false }], + }); + allTurnScoreValues.push(0); + if (onTurnFailure === 'stop') stopped = true; + continue; + } + + // Extract assistant response + const assistantContent = extractLastAssistantContent(response.output); + + // Append actual LLM response (NOT expected_output) to history + history.push({ role: 'assistant', content: assistantContent }); + + // Grade this turn + if (!turn.assertions?.length && !turn.expected_output) { + // No assertions or expected_output — turn scores 1.0 + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: 1.0, + verdict: 'pass' as EvaluationVerdict, + assertions: [], + }); + allTurnScoreValues.push(1.0); + continue; + } + + // Build assertions for this turn + const turnAssertions = buildTurnAssertions(turn); + + // Create a synthetic EvalTest for this turn's grading + const turnEvalCase: EvalTest = { + ...evalCase, + id: `${evalCase.id}/turn-${turnIndex}`, + assertions: turnAssertions, + input: buildTurnGraderInput(history, windowSize), + expected_output: turn.expected_output + ? [ + typeof turn.expected_output === 'string' + ? ({ content: turn.expected_output } as JsonObject) + : (turn.expected_output as JsonObject), + ] + : [], + // Clear conversation fields to prevent recursion + mode: undefined, + turns: undefined, + }; + + const turnResult = await evaluateCandidate({ + evalCase: turnEvalCase, + candidate: assistantContent, + target, + provider, + evaluators, + typeRegistry, + promptInputs: { + question: buildConversationContext(history, windowSize), + chatPrompt: windowSize ? 
buildWindowedHistory(history, windowSize) : [...history], + }, + nowFn, + attempt: 0, + graderProvider, + agentTimeoutMs, + output: response.output, + verbose, + threshold, + targetResolver, + availableTargets, + }); + + const turnScore = turnResult.score; + allTurnScoreValues.push(turnScore); + + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: turnScore, + verdict: scoreToVerdict(turnScore, threshold ?? DEFAULT_THRESHOLD) as EvaluationVerdict, + assertions: turnResult.assertions ? [...turnResult.assertions] : [], + scores: turnResult.scores, + }); + + // Check if we should stop on failure + if (onTurnFailure === 'stop' && turnScore < (threshold ?? DEFAULT_THRESHOLD)) { + stopped = true; + } + } + + // Run conversation-level assertions (top-level assertions on full transcript) + let conversationScores: EvaluatorResult[] = []; + if (evalCase.assertions?.length) { + const conversationEvalCase: EvalTest = { + ...evalCase, + id: `${evalCase.id}/conversation`, + input: history.map((m) => ({ + role: m.role as TestMessageRole, + content: m.content, + })), + expected_output: [], + mode: undefined, + turns: undefined, + }; + + const fullTranscript = history + .map((m) => { + const content = typeof m.content === 'string' ? m.content : JSON.stringify(m.content); + return `${m.role}: ${content}`; + }) + .join('\n\n'); + + const conversationResult = await evaluateCandidate({ + evalCase: conversationEvalCase, + candidate: fullTranscript, + target, + provider, + evaluators, + typeRegistry, + promptInputs: { + question: fullTranscript, + chatPrompt: [...history], + }, + nowFn, + attempt: 0, + graderProvider, + agentTimeoutMs, + verbose, + threshold, + targetResolver, + availableTargets, + }); + + conversationScores = [ + { + name: 'conversation', + type: 'rubrics' as EvaluatorKind, + score: conversationResult.score, + verdict: scoreToVerdict( + conversationResult.score, + threshold ?? 
/**
 * Build a windowed view of a conversation history.
 *
 * All messages whose role is `'system'` are kept and placed first (in their original
 * relative order), followed by the last `windowSize * 2` non-system messages.
 *
 * @param history    Messages in chronological order. Not mutated.
 * @param windowSize Window size; the number of non-system messages kept is
 *                   `windowSize * 2`, truncated to an integer. Zero, negative, or NaN
 *                   values keep no non-system messages at all.
 * @returns A new array containing the system messages followed by the windowed tail.
 */
function buildWindowedHistory<T extends { readonly role: string }>(
  history: readonly T[],
  windowSize: number,
): T[] {
  const systemMessages: T[] = [];
  const nonSystem: T[] = [];
  for (const message of history) {
    (message.role === 'system' ? systemMessages : nonSystem).push(message);
  }

  // `slice(-0)` and `slice(NaN)` both return the WHOLE array, and a negative window would
  // turn into a positive start index that drops the oldest messages instead of the newest.
  // Clamp explicitly so a degenerate window can never silently disable windowing.
  const keepCount = Number.isNaN(windowSize) ? 0 : Math.max(0, Math.floor(windowSize * 2));
  const windowed = keepCount === 0 ? [] : nonSystem.slice(-keepCount);

  return [...systemMessages, ...windowed];
}
/**
 * Convert conversation history into `TestMessage` entries for the synthetic per-turn
 * EvalTest `input`.
 *
 * When `windowSize` is truthy the history is first narrowed with `buildWindowedHistory`;
 * a falsy value (`undefined` or `0`) uses the history unchanged.
 */
function buildTurnGraderInput(history: readonly ChatMessage[], windowSize?: number): TestMessage[] {
  const visibleMessages = windowSize ? buildWindowedHistory(history, windowSize) : history;

  const graderInput: TestMessage[] = [];
  for (const { role, content } of visibleMessages) {
    graderInput.push({ role: role as TestMessageRole, content });
  }
  return graderInput;
}
/**
 * Collapse a list of scores into one final score using the configured strategy.
 *
 * - `'min'`  — lowest score (weakest-link)
 * - `'max'`  — highest score
 * - `'mean'` — arithmetic mean
 *
 * @param scores      Scores to combine. Not mutated.
 * @param aggregation Aggregation strategy.
 * @returns The aggregated score. An empty `scores` list yields `1.0`. A value outside the
 *          `ConversationAggregation` union (possible only via untyped callers) falls back
 *          to the mean.
 */
function aggregateConversationScores(
  scores: readonly number[],
  aggregation: ConversationAggregation,
): number {
  if (scores.length === 0) return 1.0;

  const mean = (): number => scores.reduce((sum, s) => sum + s, 0) / scores.length;

  switch (aggregation) {
    case 'min':
      // reduce instead of Math.min(...scores): spreading is limited by the engine's
      // maximum argument count, reduce is not.
      return scores.reduce((lowest, s) => Math.min(lowest, s), Number.POSITIVE_INFINITY);
    case 'max':
      return scores.reduce((highest, s) => Math.max(highest, s), Number.NEGATIVE_INFINITY);
    case 'mean':
      return mean();
    default: {
      // Compile-time exhaustiveness guard: adding a member to ConversationAggregation
      // without handling it above becomes a type error here. At runtime, unknown values
      // keep the historical behaviour of falling back to the mean.
      const unhandled: never = aggregation;
      void unhandled;
      return mean();
    }
  }
}
+ * - undefined: standard single-response evaluation (default, backward-compatible) + * - 'conversation': multi-turn evaluation where the LLM generates each assistant turn + */ +export type ConversationMode = 'conversation'; + +/** + * Score aggregation strategy for multi-turn conversation evaluation. + * - 'mean': average of all turn scores (default) + * - 'min': weakest-link scoring — final score = lowest turn score + * - 'max': best turn score + */ +export type ConversationAggregation = 'mean' | 'min' | 'max'; + +/** + * Behavior when a turn's assertions fail. + * - 'continue': run all remaining turns regardless (default) + * - 'stop': skip remaining turns, score them as 0 + */ +export type TurnFailurePolicy = 'continue' | 'stop'; + /** * Eval test definition sourced from AgentV specs. */ @@ -884,6 +919,16 @@ export interface EvalTest { readonly targets?: readonly string[]; /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */ readonly threshold?: number; + /** Conversation evaluation mode. When 'conversation', turns[] drives turn-by-turn LLM evaluation. */ + readonly mode?: ConversationMode; + /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */ + readonly turns?: readonly ConversationTurn[]; + /** Score aggregation for conversation turns: mean (default), min (weakest-link), max */ + readonly aggregation?: ConversationAggregation; + /** Behavior on turn assertion failure: continue (default) or stop */ + readonly on_turn_failure?: TurnFailurePolicy; + /** Sliding window size for context passed to per-turn graders. Default: all turns. */ + readonly window_size?: number; /** Test IDs this test depends on. Dependent tests wait for all dependencies to complete before running. */ readonly depends_on?: readonly string[]; /** What to do when a dependency fails: skip (default), fail, or run anyway. 
*/ diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 5de36a1a8..e35f46287 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -355,6 +355,16 @@ const ExecutionSchema = z.object({ threshold: z.number().min(0).max(1).optional(), }); +/** Per-turn assertion: string shorthand (becomes rubric) or full evaluator config */ +const TurnAssertionSchema = z.union([z.string(), EvaluatorSchema]); + +/** A single turn in a multi-turn conversation */ +const ConversationTurnSchema = z.object({ + input: z.union([z.string(), MessageContentSchema]), + expected_output: z.union([z.string(), MessageContentSchema]).optional(), + assertions: z.array(TurnAssertionSchema).optional(), +}); + // --------------------------------------------------------------------------- // Test case // --------------------------------------------------------------------------- @@ -375,6 +385,11 @@ const EvalTestSchema = z.object({ note: z.string().optional(), depends_on: z.array(z.string()).optional(), on_dependency_failure: z.enum(['skip', 'fail', 'run']).optional(), + mode: z.enum(['conversation']).optional(), + turns: z.array(ConversationTurnSchema).min(1).optional(), + aggregation: z.enum(['mean', 'min', 'max']).optional(), + on_turn_failure: z.enum(['continue', 'stop']).optional(), + window_size: z.number().int().min(1).optional(), }); // --------------------------------------------------------------------------- diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 4506acc50..4ecc79fa4 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -67,6 +67,13 @@ const KNOWN_TEST_FIELDS = new Set([ 'conversation_id', 'suite', 'note', + 'depends_on', + 
'on_dependency_failure', + 'mode', + 'turns', + 'aggregation', + 'on_turn_failure', + 'window_size', ]); /** Name field pattern: lowercase alphanumeric with hyphens. */ @@ -328,6 +335,9 @@ export async function validateEvalFile(filePath: string): Promise 0 || testCaseConfig.assertions !== undefined || - testCaseConfig.assert !== undefined; + testCaseConfig.assert !== undefined || + (Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0); if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { logError( - `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`, + `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`, ); continue; } @@ -522,6 +529,26 @@ async function loadTestsFromYaml( ? (onDependencyFailureRaw as import('./types.js').DependencyFailurePolicy) : undefined; + // Extract conversation mode fields + const modeRaw = asString(testCaseConfig.mode); + const mode: ConversationMode | undefined = + modeRaw === 'conversation' ? 'conversation' : undefined; + const turns = Array.isArray(testCaseConfig.turns) + ? parseTurns(testCaseConfig.turns as readonly unknown[]) + : undefined; + const aggregationRaw = asString(testCaseConfig.aggregation); + const aggregation: ConversationAggregation | undefined = + aggregationRaw === 'mean' || aggregationRaw === 'min' || aggregationRaw === 'max' + ? aggregationRaw + : undefined; + const onTurnFailureRaw = asString(testCaseConfig.on_turn_failure); + const onTurnFailure: TurnFailurePolicy | undefined = + onTurnFailureRaw === 'continue' || onTurnFailureRaw === 'stop' ? onTurnFailureRaw : undefined; + const windowSize = + typeof testCaseConfig.window_size === 'number' && testCaseConfig.window_size >= 1 + ? 
/**
 * Parse raw turn data from YAML into typed ConversationTurn objects.
 *
 * - `input` and `expected_output` are passed through as-is.
 * - `assertions` entries (string shorthand or structured evaluator config objects) are
 *   copied through unchanged; an empty or non-array `assertions` value is omitted.
 *
 * @param rawTurns Raw `turns` entries as read from the eval file.
 * @returns One ConversationTurn per entry, in the same order.
 * @throws Error if an entry is not a mapping (e.g. `null`, a scalar, or a list). Such an
 *         entry cannot carry an `input` field, so it is reported with its index instead
 *         of failing later with an opaque TypeError or producing a turn with no input.
 */
function parseTurns(rawTurns: readonly unknown[]): ConversationTurn[] {
  return rawTurns.map((rawTurn, index) => {
    if (typeof rawTurn !== 'object' || rawTurn === null || Array.isArray(rawTurn)) {
      throw new Error(
        `Invalid conversation turn at index ${index}: expected a mapping with an 'input' field`,
      );
    }

    const turn = rawTurn as Record<string, unknown>;
    const input = turn.input as TestMessageContent;
    const expectedOutput = turn.expected_output as TestMessageContent | undefined;

    // Per-turn assertions: strings stay strings, structured configs pass through by
    // reference. Only the containing array is copied.
    const assertions = Array.isArray(turn.assertions)
      ? [...(turn.assertions as readonly (string | EvaluatorConfig)[])]
      : undefined;

    return {
      input,
      ...(expectedOutput !== undefined ? { expected_output: expectedOutput } : {}),
      ...(assertions && assertions.length > 0 ? { assertions } : {}),
    };
  });
}
* Accepts a string (split on whitespace) or an array of strings. diff --git a/packages/core/test/evaluation/conversation-mode.test.ts b/packages/core/test/evaluation/conversation-mode.test.ts new file mode 100644 index 000000000..2eeb8eee4 --- /dev/null +++ b/packages/core/test/evaluation/conversation-mode.test.ts @@ -0,0 +1,927 @@ +/** + * Unit tests for the multi-turn conversation mode feature. + * + * Covers: + * - Orchestrator: runEvalCase with mode: conversation + * - Validation: validateEvalFile with conversation mode fields + * - Score aggregation strategies (mean, min, max) + * - Turn failure policies (continue, stop) + * - Window size behaviour + */ + +import { afterAll, beforeAll, describe, expect, it } from 'bun:test'; +import { mkdir, rm, writeFile } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; + +import { runEvalCase } from '../../src/evaluation/orchestrator.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { + Provider, + ProviderRequest, + ProviderResponse, +} from '../../src/evaluation/providers/types.js'; +import type { EvalTest } from '../../src/evaluation/types.js'; +import { validateEvalFile } from '../../src/evaluation/validation/eval-validator.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +class SequenceProvider implements Provider { + readonly id: string; + readonly kind = 'mock' as const; + readonly targetName: string; + readonly requests: ProviderRequest[] = []; + private readonly responses: ProviderResponse[]; + private index = 0; + + constructor(targetName: string, responses: ProviderResponse[]) { + this.id = `mock:${targetName}`; + this.targetName = targetName; + this.responses = responses; + } + + async invoke(request: ProviderRequest): Promise { + this.requests.push(request); + if (this.index >= 
/**
 * Test helper: build an evaluator map whose single `'llm-grader'` entry always reports
 * the given score. The verdict is `'pass'` when the score is at least 0.5 and `'fail'`
 * otherwise, with one `'graded'` assertion mirroring that outcome.
 */
function makeEvaluatorRegistry(score = 1.0) {
  const passed = score >= 0.5;
  const verdict = passed ? ('pass' as const) : ('fail' as const);

  const grader = {
    kind: 'llm-grader' as const,
    evaluate: async () => ({
      score,
      verdict,
      assertions: [{ text: 'graded', passed }],
      expectedAspectCount: 1,
    }),
  };

  return { 'llm-grader': grader };
}
input: 'Turn 2 message' }], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + expect(result.score).toBe(1.0); + expect(provider.requests).toHaveLength(2); + expect(result.executionStatus).toBe('ok'); + }); + + it('per-turn string assertions are evaluated and affect score', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('Paris'), + assistantResponse('Berlin'), + ]); + + const evalCase: EvalTest = { + id: 'conv-string-assertions', + question: 'Geography', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Correct answers', + mode: 'conversation', + turns: [ + { input: 'Capital of France?', assertions: ['Response mentions Paris'] }, + { input: 'Capital of Germany?', assertions: ['Response mentions Berlin'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(1.0), + now: nowFn, + }); + + expect(result.score).toBeGreaterThan(0); + expect(provider.requests).toHaveLength(2); + }); + + it('per-turn structured assertions are evaluated', async () => { + const provider = new SequenceProvider('mock', [assistantResponse('42')]); + + const evalCase: EvalTest = { + id: 'conv-struct-assertions', + question: 'Math', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Correct', + mode: 'conversation', + turns: [ + { + input: 'What is 6 * 7?', + assertions: [{ type: 'llm-grader', criteria: 'Answer is 42' }], + }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(1.0), + now: nowFn, + }); + + expect(result.score).toBeGreaterThan(0); + expect(provider.requests).toHaveLength(1); + }); + + it('conversation-level assertions are evaluated against full transcript', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('Yes'), + 
assistantResponse('No'), + ]); + + const evalCase: EvalTest = { + id: 'conv-top-level', + question: 'Consistency check', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Consistent throughout', + mode: 'conversation', + turns: [{ input: 'Turn 1' }, { input: 'Turn 2' }], + assertions: [{ type: 'llm-grader', criteria: 'Conversation was coherent' }], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(0.9), + now: nowFn, + }); + + // Should have per-turn scores plus a conversation-level score + expect(result.scores).toBeDefined(); + const hasConversationScore = result.scores?.some((s) => s.name === 'conversation'); + expect(hasConversationScore).toBe(true); + }); + + it('aggregation: mean — averages all turn scores', async () => { + // 3 turns, no per-turn assertions → each scores 1.0 + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-mean', + question: 'mean test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + aggregation: 'mean', + turns: [{ input: 'T1' }, { input: 'T2' }, { input: 'T3' }], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + expect(result.score).toBeCloseTo(1.0, 5); + }); + + it('aggregation: min — uses lowest turn score', async () => { + // Use per-turn assertions so scores are driven by the grader + // Turn 1: grader returns 1.0, Turn 2: 0.5, Turn 3: 0.8 + let callCount = 0; + const scores = [1.0, 0.5, 0.8]; + + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + const s = scores[callCount++] ?? 1.0; + return { + score: s, + verdict: s >= 0.5 ? 
('pass' as const) : ('fail' as const), + assertions: [{ text: 'graded', passed: s >= 0.5 }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-min', + question: 'min test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + aggregation: 'min', + turns: [ + { input: 'T1', assertions: ['Criterion A'] }, + { input: 'T2', assertions: ['Criterion B'] }, + { input: 'T3', assertions: ['Criterion C'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + expect(result.score).toBeCloseTo(0.5, 5); + }); + + it('aggregation: max — uses highest turn score', async () => { + let callCount = 0; + const scores = [1.0, 0.5, 0.8]; + + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + const s = scores[callCount++] ?? 1.0; + return { + score: s, + verdict: s >= 0.5 ? 
('pass' as const) : ('fail' as const), + assertions: [{ text: 'graded', passed: s >= 0.5 }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-max', + question: 'max test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + aggregation: 'max', + turns: [ + { input: 'T1', assertions: ['Criterion A'] }, + { input: 'T2', assertions: ['Criterion B'] }, + { input: 'T3', assertions: ['Criterion C'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + expect(result.score).toBeCloseTo(1.0, 5); + }); + + it('on_turn_failure: stop — skips remaining turns after first failure', async () => { + let callCount = 0; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + callCount++; + // First grader call fails + return { + score: 0.0, + verdict: 'fail' as const, + assertions: [{ text: 'failed', passed: false }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('Turn 1 response'), + assistantResponse('Turn 2 response'), + assistantResponse('Turn 3 response'), + ]); + + const evalCase: EvalTest = { + id: 'conv-stop', + question: 'stop test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + on_turn_failure: 'stop', + turns: [ + { input: 'T1', assertions: ['Criterion'] }, + { input: 'T2', assertions: ['Criterion'] }, + { input: 'T3', assertions: ['Criterion'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // Provider should only be called once (first turn) + 
expect(provider.requests).toHaveLength(1); + + // Skipped turns should have score 0 with skip verdict + const skippedScores = result.scores?.filter((s) => s.verdict === 'skip') ?? []; + expect(skippedScores.length).toBeGreaterThanOrEqual(2); + }); + + it('on_turn_failure: continue (default) — all turns run even after failure', async () => { + let callCount = 0; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + callCount++; + return { + score: callCount === 1 ? 0.0 : 1.0, + verdict: callCount === 1 ? ('fail' as const) : ('pass' as const), + assertions: [{ text: 'graded', passed: callCount !== 1 }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-continue', + question: 'continue test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + on_turn_failure: 'continue', + turns: [ + { input: 'T1', assertions: ['Criterion'] }, + { input: 'T2', assertions: ['Criterion'] }, + { input: 'T3', assertions: ['Criterion'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // All 3 turns must run + expect(provider.requests).toHaveLength(3); + // No skipped turns + const skippedScores = result.scores?.filter((s) => s.verdict === 'skip') ?? 
[]; + expect(skippedScores).toHaveLength(0); + }); + + it('window_size — chatPrompt passed to provider is limited to system + last N*2 messages', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('R1'), + assistantResponse('R2'), + assistantResponse('R3'), + ]); + + const evalCase: EvalTest = { + id: 'conv-window', + question: 'window test', + input: [{ role: 'system', content: 'System prompt' }], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + window_size: 1, // keep system + last 1 user+assistant pair + turns: [{ input: 'T1' }, { input: 'T2' }, { input: 'T3' }], + }; + + await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + // Provider called 3 times + expect(provider.requests).toHaveLength(3); + + // Third call chatPrompt should not include T1's messages (windowed) + const thirdRequest = provider.requests[2]; + const chatPrompt = thirdRequest?.chatPrompt ?? []; + // System prompt should always be present + expect(chatPrompt.some((m) => m.role === 'system')).toBe(true); + // With window_size=1: system + last 2 messages (T2 user + T2 assistant). 
+ // T1 user message should NOT be in the windowed prompt + const userMessages = chatPrompt.filter((m) => m.role === 'user'); + expect(userMessages.length).toBeLessThanOrEqual(1); + }); + + it('provider error on a turn — turn scores 0 and execution continues', async () => { + const provider = new ErrorOnFirstProvider(assistantResponse('Turn 2 response')); + + const evalCase: EvalTest = { + id: 'conv-provider-error', + question: 'error test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + turns: [{ input: 'T1' }, { input: 'T2' }], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + // Turn 1 should score 0 + const turn1Score = result.scores?.find((s) => s.name === 'turn-1'); + expect(turn1Score?.score).toBe(0); + + // Turn 2 should still run (continue is default) + const turn2Score = result.scores?.find((s) => s.name === 'turn-2'); + expect(turn2Score).toBeDefined(); + expect(turn2Score?.score).toBe(1.0); + }); + + it('output contains full conversation transcript with all user and assistant messages', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('Answer 1'), + assistantResponse('Answer 2'), + ]); + + const evalCase: EvalTest = { + id: 'conv-transcript', + question: 'transcript test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Full transcript', + mode: 'conversation', + turns: [{ input: 'Question 1' }, { input: 'Question 2' }], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + // Output should have all messages from the conversation + const output = result.output ?? 
[]; + const userMessages = output.filter((m) => m.role === 'user'); + const assistantMessages = output.filter((m) => m.role === 'assistant'); + + expect(userMessages.length).toBe(2); + expect(assistantMessages.length).toBe(2); + expect(assistantMessages[0]?.content).toBe('Answer 1'); + expect(assistantMessages[1]?.content).toBe('Answer 2'); + }); + + it('top-level assertions are NOT applied per-turn — only at conversation level', async () => { + let graderCallCount = 0; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + graderCallCount++; + return { + score: 0.8, + verdict: 'pass' as const, + assertions: [{ text: 'graded', passed: true }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [assistantResponse('A'), assistantResponse('B')]); + + const evalCase: EvalTest = { + id: 'conv-no-double-count', + question: 'double count test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + turns: [ + { input: 'T1' }, // no per-turn assertions → scores 1.0 without grader + { input: 'T2' }, // no per-turn assertions → scores 1.0 without grader + ], + assertions: [{ type: 'llm-grader', criteria: 'Conversation was coherent' }], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // Grader should be called exactly once — for the conversation-level pass only + expect(graderCallCount).toBe(1); + + // Should have 2 turn scores (1.0 each) + 1 conversation score + const turnScores = result.scores?.filter((s) => s.name.startsWith('turn-')) ?? 
[]; + const convScore = result.scores?.find((s) => s.name === 'conversation'); + expect(turnScores).toHaveLength(2); + expect(turnScores[0]?.score).toBe(1.0); + expect(turnScores[1]?.score).toBe(1.0); + expect(convScore).toBeDefined(); + expect(convScore?.score).toBe(0.8); + }); + + it('conversation-level assertions grade the full transcript, not just last reply', async () => { + let graderCandidate = ''; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate(ctx: { candidate: string }) { + graderCandidate = ctx.candidate; + return { + score: 1.0, + verdict: 'pass' as const, + assertions: [{ text: 'graded', passed: true }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('First answer'), + assistantResponse('Second answer'), + ]); + + const evalCase: EvalTest = { + id: 'conv-transcript-candidate', + question: 'transcript candidate test', + input: [{ role: 'system', content: 'Be helpful' }], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + turns: [{ input: 'Question 1' }, { input: 'Question 2' }], + assertions: [{ type: 'llm-grader', criteria: 'Full transcript is coherent' }], + }; + + await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // The candidate passed to the grader should contain the full transcript, not just "Second answer" + expect(graderCandidate).toContain('First answer'); + expect(graderCandidate).toContain('Second answer'); + expect(graderCandidate).toContain('Question 1'); + expect(graderCandidate).toContain('Question 2'); + }); + + it('no regression — non-conversation test behaves as before', async () => { + const provider = new SequenceProvider('mock', [assistantResponse('Standard response')]); + + const evalCase: EvalTest = { + id: 'standard-test', + question: 'Standard test', + input: [{ role: 'user', content: 'Hello' }], + 
expected_output: [], + file_paths: [], + criteria: 'Helpful', + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(0.8), + now: nowFn, + }); + + expect(result.score).toBeGreaterThan(0); + expect(result.executionStatus).toBe('ok'); + // Should not have turn-level scores + const hasTurnScores = result.scores?.some((s) => s.name.startsWith('turn-')); + expect(hasTurnScores).toBeFalsy(); + }); +}); + +// --------------------------------------------------------------------------- +// Validation tests +// --------------------------------------------------------------------------- + +describe('validateEvalFile — conversation mode', () => { + let tempDir: string; + + beforeAll(async () => { + tempDir = path.join(os.tmpdir(), `agentv-conv-test-${Date.now()}`); + await mkdir(tempDir, { recursive: true }); + }); + + afterAll(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('rejects turns without mode: conversation', async () => { + const filePath = path.join(tempDir, 'turns-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + turns: + - input: Turn 1 +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect( + result.errors.some((e) => e.message.includes("'turns' requires mode: conversation")), + ).toBe(true); + }); + + it('rejects mode: conversation without turns', async () => { + const filePath = path.join(tempDir, 'mode-no-turns.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("non-empty 'turns' array"))).toBe(true); + }); + + it('rejects mode: conversation with empty turns array', async () => { + const filePath = path.join(tempDir, 'mode-empty-turns.yaml'); + 
await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: [] +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("non-empty 'turns' array"))).toBe(true); + }); + + it('rejects turns + top-level expected_output', async () => { + const filePath = path.join(tempDir, 'turns-expected-output.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: + - input: Turn 1 + expected_output: "some output" +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect( + result.errors.some((e) => + e.message.includes("'expected_output' is not allowed with mode: conversation"), + ), + ).toBe(true); + }); + + it('rejects aggregation without mode: conversation', async () => { + const filePath = path.join(tempDir, 'aggregation-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + aggregation: mean +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect( + result.errors.some((e) => e.message.includes("'aggregation' requires mode: conversation")), + ).toBe(true); + }); + + it('rejects on_turn_failure without mode: conversation', async () => { + const filePath = path.join(tempDir, 'on-turn-failure-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + on_turn_failure: stop +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect( + result.errors.some((e) => + e.message.includes("'on_turn_failure' requires mode: conversation"), + ), + ).toBe(true); + }); + + it('rejects window_size without mode: conversation', async () => { + const filePath = path.join(tempDir, 'window-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + 
criteria: Goal + input: hello + window_size: 3 +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect( + result.errors.some((e) => e.message.includes("'window_size' requires mode: conversation")), + ).toBe(true); + }); + + it('rejects a turn missing input', async () => { + const filePath = path.join(tempDir, 'turn-missing-input.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: + - expected_output: "something" +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes('non-empty input'))).toBe(true); + }); + + it('rejects a turn with whitespace-only input', async () => { + const filePath = path.join(tempDir, 'turn-whitespace-input.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: + - input: " " +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes('non-empty input'))).toBe(true); + }); + + it('accepts a valid conversation mode eval file', async () => { + const filePath = path.join(tempDir, 'valid-conversation.yaml'); + await writeFile( + filePath, + `tests: + - id: conv-valid + criteria: Be helpful + input: "System: you are a helpful assistant" + mode: conversation + aggregation: mean + on_turn_failure: continue + window_size: 5 + turns: + - input: "What is 2+2?" + expected_output: "4" + - input: "And 3+3?" 
+ assertions: + - "Response mentions 6" +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(true); + expect(result.errors).toHaveLength(0); + }); +}); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 69d694bbe..80dc2ebd8 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -5114,46 +5114,22 @@ "on_dependency_failure": { "type": "string", "enum": ["skip", "fail", "run"] - } - }, - "required": ["id"], - "additionalProperties": false - } - }, - { - "type": "string" - } - ] - }, - "eval_cases": { - "anyOf": [ - { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string", - "minLength": 1 }, - "criteria": { - "type": "string" + "mode": { + "type": "string", + "enum": ["conversation"] }, - "input": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { + "type": "string" }, - "content": { + { "anyOf": [ { "type": "string" @@ -5177,38 +5153,14 @@ } ] } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "input_files": { - "type": "array", - "items": { - "type": "string" - } - }, - "expected_output": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" }, - "content": { + { "anyOf": [ { "type": "string" @@ 
-5232,648 +5184,3076 @@ } ] } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - 
"type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + ] + }, + "script": { + "anyOf": [ + { "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false } - ] + } }, - "config": { - "type": "object", - 
"additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "required": ["type", "command"], + "additionalProperties": false }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } }, - { - "type": "integer", + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, "minimum": 0, - "maximum": 10 + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 + }, + "additionalProperties": false } }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { + "model": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - 
"minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - 
"minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" + "target": { + "type": "string" }, - { + "config": { "type": "object", "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] }, - { + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false } } - ] + }, + "required": ["type"], + "additionalProperties": false }, - "argsMatch": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "include": { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": 
"array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { "type": "array", "items": { - 
"type": "string" + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - 
"exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + 
"type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + 
"type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + 
"maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + ] + } + } + }, + "required": ["input"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["mean", "min", "max"] + }, + "on_turn_failure": { + "type": "string", + "enum": ["continue", "stop"] + }, + "window_size": { + "type": "integer", + "minimum": 1 + } + }, + "required": ["id"], + "additionalProperties": false + } + }, + { + "type": "string" + } + ] + }, + "eval_cases": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "criteria": { + "type": "string" + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + } + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + 
}, + { + "type": "object", + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": 
"string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { 
+ "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", 
"superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + 
"type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + 
"type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + 
"maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + 
] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" 
+ }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + 
"type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + 
"items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 }, "relative": { "type": "boolean" @@ -6340,1201 +8720,1201 @@ "items": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, 
+ "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "execution": { + "type": "object", + "properties": { + "target": { + "type": "string" + }, + "targets": { + "type": "array", + "items": { + "type": "string" + } + }, + "workers": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": 
"number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] }, - "outcome": { - "type": "string", - "minLength": 1 + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} } }, - "required": ["score_range", "outcome"], "additionalProperties": false } - } - }, - "additionalProperties": false - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "evaluators": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" + ] }, - { + "rubrics": { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + 
"weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false } - } - ] - }, - "script": { - "anyOf": [ - { + }, + "model": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" + "target": { + "type": "string" }, - { + "config": { "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 }, - { - "type": "array", - "items": { - "type": "string" - } + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] } - ] + }, + "required": ["type", "command"], + "additionalProperties": false } - }, - "required": ["type", "command"], - 
"additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false }, - "weight": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" }, - { - "type": "array", - "items": { - "type": "string" + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" } } - ] + }, + "required": ["type"], + "additionalProperties": false }, - "script": { - "anyOf": [ - { - "type": "string" + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" }, - { - "type": 
"array", - "items": { - "type": "string" - } + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 } - ] + }, + "required": ["type", "threshold"], + "additionalProperties": false }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { + { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { + "type": { "type": "string", - "minLength": 1 + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" } }, - "required": ["score_range", "outcome"], + "required": ["type", "path"], "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { "type": "string" } - } - ] - } 
- }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false + "minimum": 0 }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" + "required": { + "anyOf": [ + { + "type": "boolean" }, - "threshold": { + { "type": "number", + "exclusiveMinimum": true, "minimum": 0, "maximum": 1 } - }, - "required": ["type", "threshold"], - "additionalProperties": false + ] }, - { + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", 
"tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } }, - "path": { - "type": "string" + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, - "cwd": { - "type": "string" + { + "type": "array", + "items": { + "type": "string" + } } - }, - "required": ["type", "path"], - "additionalProperties": false + ] }, - { - "type": "object", - "properties": { - "type": { + "argsMatch": { + "anyOf": [ + { "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" + "enum": ["exact", "ignore", "subset", "superset"] }, - "model": { - "type": "string" + { + "type": "array", + "items": { + "type": "string" + } } - }, - "required": ["type"], - "additionalProperties": false + ] } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - 
"type": "number", - "minimum": 0 + }, + "required": ["type", "mode"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { + "match": { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": ["exact", "numeric_tolerance", "date"] }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "required": { + "type": "boolean" }, - 
{ + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { "type": "array", "items": { "type": "string" } } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" + }, + "required": ["path", "match"], + "additionalProperties": false }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } + "minItems": 1 }, - "required": ["path", "match"], - "additionalProperties": false + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - 
"enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + "required": ["type", "fields"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type", "threshold"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + 
"type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type", "budget"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": 
"boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + }, + "negate": { "type": "boolean" }, - { + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - 
"exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type", "value"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] }, - { + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + 
"type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type", "value"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + 
"min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" + }, + "required": ["type", "value"], + "additionalProperties": false }, - "criteria": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { 
+ "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 + "outcome": { + "type": "string", + "minLength": 1 } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 + }, + "required": ["score_range", "outcome"], + "additionalProperties": false } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false + } + }, + "additionalProperties": false + }, + "minItems": 1 + } }, - "minItems": 1 + "required": ["type", "criteria"], + "additionalProperties": false } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "execution": { - "type": "object", - "properties": { - "target": { - "type": "string" - }, - "targets": { - "type": "array", - "items": { - "type": "string" + ] } }, - "workers": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "assertions": { + "evaluators": { "type": "array", "items": { "anyOf": [ @@ -8394,12 +10774,181 @@ "minimum": 0, "maximum": 1 }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": 
"object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" } }, - "required": ["type"], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8436,342 +10985,537 @@ }, "type": { "type": "string", - "const": "contains" + "const": "rubrics" }, - "value": { + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + 
"outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "skip_defaults": { + "type": "boolean" + }, + "cache": { + "type": "boolean" + }, + "trials": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_at_k", "mean", "confidence_interval"] + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + }, + "costLimitUsd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "total_budget_usd": { + "type": "number", + "minimum": 0 + }, + "totalBudgetUsd": { + "type": "number", + "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "additionalProperties": false + }, + "workspace": { + "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": ["shared", "per_test"] + }, + "repos": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "source": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + 
"type": "string", + "const": "git" + }, + "url": { + "type": "string", + "format": "uri" + } + }, + "required": ["type", "url"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "local" + }, + "path": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + } + ] + }, + "checkout": { + "type": "object", + "properties": { + "ref": { + "type": "string" + }, + "base_commit": { + "type": "string", + "minLength": 1 + }, + "resolve": { + "type": "string", + "enum": ["remote", "local"] + }, + "ancestor": { + "type": "integer", + "minimum": 0 + } + }, + "additionalProperties": false + }, + "clone": { + "type": "object", + "properties": { + "depth": { + "type": "integer", + "minimum": 1 + }, + "filter": { + "type": "string" + }, + "sparse": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "hooks": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "before_all": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { + "type": "string" + } + }, + "script": { + "type": "array", + "items": { "type": "string" } }, - "required": ["type", "value"], - "additionalProperties": false + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - { - "type": "object", - "properties": { - "name": { + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - 
"type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { + } + }, + "script": { + "type": "array", + "items": { "type": "string" } }, - "required": ["type", "value"], - "additionalProperties": false + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - { - "type": "object", - "properties": { - "name": { + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] } }, - "required": ["type"], - "additionalProperties": false + "script": { + "type": "array", + "items": { + "type": "string" + } + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - { - "type": "object", - "properties": { - "name": { + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - 
"maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { + } + }, + "script": { + "type": "array", + "items": { "type": "string" } }, - "required": ["type", "value"], - "additionalProperties": false + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "mode": { + "type": "string", + "enum": ["pooled", "temp", "static"] + }, + "path": { + "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": ["image"], + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "metadata": { + "type": "object", + "additionalProperties": {} + }, + "conversation_id": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "note": { + "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] + }, + "mode": { + "type": "string", + "enum": ["conversation"] + }, + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { + "type": "string" }, { - "type": "object", - "properties": { - "name": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - 
"type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { + { "type": "array", "items": { "type": "object", "properties": { - "id": { - "type": "string" + "type": { + "type": "string", + "enum": ["text", "file", "image"] }, - "outcome": { + "value": { "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } } }, + "required": ["type", "value"], "additionalProperties": false - }, - "minItems": 1 + } } - }, - "required": ["type", "criteria"], - "additionalProperties": false + ] } ] - } - }, - "evaluators": { - "type": "array", - "items": { + }, + "expected_output": { "anyOf": [ { - "type": "object", - "properties": { - "name": { + "type": "string" + }, + { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + { + "type": "array", + "items": { + "type": 
"object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { "type": "string" } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { "type": "object", "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, "type": { "type": "string", - "minLength": 1 + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] }, - "command": { + "script": { "anyOf": [ { "type": "string" @@ -8783,408 +11527,492 @@ } } ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + 
"required": ["type", "command"], + "additionalProperties": false + } } }, "required": ["type", "command"], "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" 
- } - } - ] - }, - "script": { - "anyOf": [ - { + "outcome": { "type": "string" }, - { + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false } } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} + }, + "additionalProperties": false } }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { + "model": { "type": "string" }, - "outcome": { + "target": { "type": "string" }, - "weight": { - "type": "number" + "config": { + "type": "object", + "additionalProperties": {} }, - "required": { - "type": "boolean" + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 }, - "min_score": { + "temperature": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, - "maximum": 1 + "maximum": 2 }, - "score_ranges": { + "preprocessors": { "type": "array", "items": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] - }, - "outcome": { - "type": "string", - "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": ["type", 
"command"], "additionalProperties": false } } }, + "required": ["type"], "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { "type": "object", "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, "type": { "type": "string", - "minLength": 1 + "const": "composite" }, - "command": { + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { "anyOf": [ { - "type": "string" + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false }, { - "type": "array", - "items": { - "type": "string" - } + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" 
+ }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false } ] } }, - "required": ["type", "command"], + "required": ["type", "aggregator"], "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": 
{ - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { + }, + { "type": "object", "properties": { - "tool": { + "name": { "type": "string" }, - "args": { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "anyOf": [ { - "type": "string", - "const": "any" + "type": "boolean" }, { - "type": "object", - "additionalProperties": {} + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 } ] }, - "max_duration_ms": { + "min_score": { "type": "number", - "minimum": 0 + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + 
"additionalProperties": { + "type": "integer", + "minimum": 0 + } }, - "maxDurationMs": { - "type": "number", - "minimum": 0 + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } }, "args_match": { "anyOf": [ @@ -9215,935 +12043,589 @@ ] } }, - "required": ["tool"], + "required": ["type", "mode"], "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - 
"enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { + }, + { "type": "object", "properties": { - "path": { + "name": { "type": "string" }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "weight": { + "type": "number", + "minimum": 0 }, "required": { - "type": "boolean" - }, - "weight": { - "type": "number" + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] }, - "tolerance": { + "min_score": { "type": "number", - "minimum": 0 + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 }, - "relative": { + "negate": { "type": "boolean" }, - "formats": { + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, 
- "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", 
+ "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + 
"maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - 
"weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 
true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { + "required": ["type", "value"], + "additionalProperties": false + }, + { "type": "object", "properties": { - "id": { + "name": { "type": "string" }, - "outcome": { + "weight": { + "type": 
"number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { - "type": "number" + "type": "number", + "minimum": 0 }, "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { "type": "boolean" }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 }, - "score_ranges": { + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { "type": "array", "items": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] + "id": { + "type": "string" }, "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": 
false - } - } - }, - "additionalProperties": false - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "skip_defaults": { - "type": "boolean" - }, - "cache": { - "type": "boolean" - }, - "trials": { - "type": "object", - "properties": { - "count": { - "type": "integer", - "minimum": 1 - }, - "strategy": { - "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] - }, - "cost_limit_usd": { - "type": "number", - "minimum": 0 - }, - "costLimitUsd": { - "type": "number", - "minimum": 0 - } - }, - "required": ["count"], - "additionalProperties": false - }, - "total_budget_usd": { - "type": "number", - "minimum": 0 - }, - "totalBudgetUsd": { - "type": "number", - "minimum": 0 - }, - "fail_on_error": { - "type": "boolean" - }, - "failOnError": { - "type": "boolean" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "additionalProperties": false - }, - "workspace": { - "type": "object", - "properties": { - "template": { - "type": "string" - }, - "isolation": { - "type": "string", - "enum": ["shared", "per_test"] - }, - "repos": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "source": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "git" - }, - "url": { - "type": "string", - "format": "uri" - } - }, - "required": ["type", "url"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "local" + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": 
"integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } }, - "path": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - } - ] - }, - "checkout": { - "type": "object", - "properties": { - "ref": { - "type": "string" - }, - "base_commit": { - "type": "string", - "minLength": 1 - }, - "resolve": { - "type": "string", - "enum": ["remote", "local"] - }, - "ancestor": { - "type": "integer", - "minimum": 0 - } - }, - "additionalProperties": false - }, - "clone": { - "type": "object", - "properties": { - "depth": { - "type": "integer", - "minimum": 1 - }, - "filter": { - "type": "string" - }, - "sparse": { - "type": "array", - "items": { - "type": "string" + "required": ["type", "criteria"], + "additionalProperties": false } - } - }, - "additionalProperties": false - } - }, - "additionalProperties": false - } - }, - "hooks": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - }, - "before_all": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "before_each": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - 
"type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "after_each": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "after_all": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] + ] } - }, - "additionalProperties": false + ] } - }, - "additionalProperties": false - }, - "mode": { - "type": "string", - "enum": ["pooled", "temp", "static"] - }, - "path": { - "type": "string" + } }, - "docker": { - "type": "object", - "properties": { - "image": { - "type": "string" - }, - "timeout": { - "type": "integer", - "minimum": 1 - }, - "memory": { - "type": "string" - }, - "cpus": { - "type": "number", - "minimum": 0.1 - } - }, - "required": ["image"], - "additionalProperties": false - } + "required": ["input"], + "additionalProperties": false }, - "additionalProperties": false - }, - "metadata": { - "type": "object", - "additionalProperties": {} - }, - "conversation_id": { - "type": "string" - }, - "suite": { - "type": "string" - }, - "note": { - "type": "string" + "minItems": 1 }, - "depends_on": { - "type": "array", - "items": { - "type": "string" - } + "aggregation": { + "type": "string", + "enum": ["mean", "min", "max"] }, - "on_dependency_failure": { + "on_turn_failure": { "type": "string", - "enum": ["skip", "fail", "run"] + 
"enum": ["continue", "stop"] + }, + "window_size": { + "type": "integer", + "minimum": 1 } }, "required": ["id"],