From f4d79601b4724ca564bd894a11e2fd42b23be81e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 10:30:46 +0000 Subject: [PATCH 1/4] feat(eval): add multi-turn conversation mode with turn-by-turn evaluation Implements issue #1052: support for evaluating multi-turn conversations where the agent generates each assistant turn with per-turn grading. - Add ConversationTurn type, mode, turns, aggregation, on_turn_failure, window_size to EvalTest - Zod schema and YAML parser updates for new fields - Turn-by-turn loop in orchestrator: accumulate messages, call provider, grade, repeat - Conversation assertions run after all turns - Aggregation: mean (default), min (weakest-link), max - String shorthand in per-turn assertions works identically to top-level - Cross-field validation (turns requires mode:conversation, etc.) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/plans/1052-conversation-mode.md | 55 + packages/core/src/evaluation/orchestrator.ts | 392 + packages/core/src/evaluation/types.ts | 45 + .../evaluation/validation/eval-file.schema.ts | 15 + .../evaluation/validation/eval-validator.ts | 113 + packages/core/src/evaluation/yaml-parser.ts | 61 + .../references/eval-schema.json | 11399 +++++++++++----- 7 files changed, 8533 insertions(+), 3547 deletions(-) create mode 100644 docs/plans/1052-conversation-mode.md diff --git a/docs/plans/1052-conversation-mode.md b/docs/plans/1052-conversation-mode.md new file mode 100644 index 000000000..2c50ab9e3 --- /dev/null +++ b/docs/plans/1052-conversation-mode.md @@ -0,0 +1,55 @@ +# Issue #1052: Multi-turn Conversational Test Case — Live Turn-by-Turn Evaluation + +## Problem + +Today, multi-turn evals script all intermediate assistant responses in `input` — the LLM only generates the last response. This means conversation context retention, progressive reasoning, and turn-by-turn quality cannot be measured independently. 
+ +## Solution + +Add `mode: conversation` with a `turns` array that drives turn-by-turn LLM evaluation with per-turn and conversation-level grading. + +### New Schema Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `mode` | `'conversation'` | - | Enables conversation evaluation mode | +| `turns` | `ConversationTurn[]` | - | Ordered user messages; each generates an LLM call | +| `aggregation` | `'mean' \| 'min' \| 'max'` | `'mean'` | How turn scores combine into the final score | +| `on_turn_failure` | `'continue' \| 'stop'` | `'continue'` | What to do when a turn's assertions fail | +| `window_size` | `number` | all turns | Sliding window (in turns) over the history passed to the provider and to per-turn graders; system messages are always kept | + +### How It Works + +1. `input` provides system prompt and initial context (same as today) +2. For each entry in `turns`: + a. Append the user message to accumulated history + b. Call the provider with the accumulated history (windowed when `window_size` is set) — LLM generates assistant response + c. Grade the response against the turn's `assertions` and `expected_output` + d. Append actual LLM response (not expected_output) to history +3. After all turns: run top-level `assertions` over full transcript +4. Final score = aggregation of per-turn + conversation assertion scores + +### Validation Rules + +- `turns` requires `mode: conversation` +- `mode: conversation` requires `turns` +- `turns` incompatible with top-level `expected_output` +- `aggregation` only valid with `mode: conversation` +- Each turn must have non-empty `input` + +### Files Modified + +| File | Change | +|------|--------| +| `packages/core/src/evaluation/types.ts` | ConversationTurn, mode, turns, etc. 
on EvalTest | +| `packages/core/src/evaluation/validation/eval-file.schema.ts` | Zod schema for new fields | +| `packages/core/src/evaluation/yaml-parser.ts` | Parse conversation fields | +| `packages/core/src/evaluation/orchestrator.ts` | Conversation runner in runEvalCase | +| `packages/core/test/evaluation/conversation-mode.test.ts` | Unit tests | +| `examples/features/multi-turn-conversation-live/` | UAT example | + +## References + +- Issue: #1052 +- Research: agentevals-research PR #57 +- Prior art: #505 / PR #507 (scripted multi-turn), #331 / PR #1051 (depends_on) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index ebedc1e00..809787351 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -24,6 +24,8 @@ import { resolveTargetDefinition, } from './providers/targets.js'; import type { + ChatMessage, + ChatMessageRole, EnvLookup, Message, Provider, @@ -47,6 +49,8 @@ import { import { aggregateTrials } from './trials.js'; import type { AssertionEntry, + ConversationAggregation, + ConversationTurn, DependencyResult, EvalTest, EvaluationResult, @@ -60,6 +64,8 @@ import type { JsonObject, JsonValue, LlmGraderEvaluatorConfig, + TestMessage, + TestMessageRole, TrialResult, TrialsConfig, WorkspaceHookConfig, @@ -1889,6 +1895,42 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise {}); + } + } + + return conversationResult; + } + const caseStartMs = Date.now(); const attemptBudget = (maxRetries ?? 0) + 1; let attempt = 0; @@ -2887,6 +2929,356 @@ function buildEvaluatorRegistry( }; } +// --------------------------------------------------------------------------- +// Conversation mode: turn-by-turn evaluation +// --------------------------------------------------------------------------- + +/** + * Run a multi-turn conversation evaluation. + * For each turn: append user message → call provider → grade turn → append LLM response. 
+ * After all turns, run conversation-level assertions on the full transcript. + * Final score is aggregated from turn scores + conversation scores. + */ +async function runConversationMode(options: { + readonly evalCase: EvalTest; + readonly provider: Provider; + readonly target: ResolvedTarget; + readonly evaluators: Partial> & { readonly 'llm-grader': Evaluator }; + readonly typeRegistry: import('./registry/evaluator-registry.js').EvaluatorRegistry; + readonly graderProvider?: Provider; + readonly promptInputs: PromptInputs; + readonly nowFn: () => Date; + readonly signal?: AbortSignal; + readonly workspacePath?: string; + readonly caseWorkspaceFile?: string; + readonly agentTimeoutMs?: number; + readonly streamCallbacks?: ProviderStreamCallbacks; + readonly verbose?: boolean; + readonly threshold?: number; + readonly targetResolver?: (name: string) => Provider | undefined; + readonly availableTargets?: readonly string[]; +}): Promise { + const { + evalCase, provider, target, evaluators, typeRegistry, + graderProvider, promptInputs, nowFn, signal, + workspacePath, caseWorkspaceFile, agentTimeoutMs, + streamCallbacks, verbose, threshold, targetResolver, availableTargets, + } = options; + + const turns = evalCase.turns!; + const aggregation = evalCase.aggregation ?? 'mean'; + const onTurnFailure = evalCase.on_turn_failure ?? 'continue'; + const windowSize = evalCase.window_size; + + // Build initial message history from evalCase.input (system prompt + any context) + const history: ChatMessage[] = []; + for (const msg of evalCase.input) { + const content = typeof msg.content === 'string' + ? 
msg.content + : JSON.stringify(msg.content); + history.push({ role: msg.role as ChatMessageRole, content }); + } + + const turnScores: EvaluatorResult[] = []; + const allTurnScoreValues: number[] = []; + let stopped = false; + const caseStartMs = Date.now(); + + for (let i = 0; i < turns.length; i++) { + const turn = turns[i]; + const turnIndex = i + 1; + + if (stopped) { + // Turn skipped due to on_turn_failure: stop + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: 0, + verdict: 'skip' as EvaluationVerdict, + assertions: [{ text: 'Skipped due to previous turn failure', passed: false }], + }); + allTurnScoreValues.push(0); + continue; + } + + // Append user message to history + const userContent = typeof turn.input === 'string' + ? turn.input + : JSON.stringify(turn.input); + history.push({ role: 'user', content: userContent }); + + // Build chatPrompt for provider call (with optional window_size) + const chatPromptForProvider = windowSize + ? buildWindowedHistory(history, windowSize) + : [...history]; + + // Call provider with accumulated history + let response: ProviderResponse; + try { + response = await provider.invoke({ + question: userContent, + chatPrompt: chatPromptForProvider, + evalCaseId: `${evalCase.id}/turn-${turnIndex}`, + signal, + cwd: workspacePath, + workspaceFile: caseWorkspaceFile, + streamCallbacks, + }); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: 0, + verdict: 'fail' as EvaluationVerdict, + assertions: [{ text: `Provider error: ${message}`, passed: false }], + }); + allTurnScoreValues.push(0); + if (onTurnFailure === 'stop') stopped = true; + continue; + } + + // Extract assistant response + const assistantContent = extractLastAssistantContent(response.output); + + // Append actual LLM response (NOT expected_output) to history + history.push({ role: 'assistant', content: assistantContent }); + + // Grade this turn + if (!turn.assertions?.length && !turn.expected_output) { + // No assertions or expected_output — turn scores 1.0 + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: 1.0, + verdict: 'pass' as EvaluationVerdict, + assertions: [], + }); + allTurnScoreValues.push(1.0); + continue; + } + + // Build assertions for this turn + const turnAssertions = buildTurnAssertions(turn); + + // Create a synthetic EvalTest for this turn's grading + const turnEvalCase: EvalTest = { + ...evalCase, + id: `${evalCase.id}/turn-${turnIndex}`, + assertions: turnAssertions.length > 0 ? turnAssertions : evalCase.assertions, + input: buildTurnGraderInput(history, windowSize), + expected_output: turn.expected_output + ? [typeof turn.expected_output === 'string' + ? { content: turn.expected_output } as JsonObject + : turn.expected_output as JsonObject] + : [], + // Clear conversation fields to prevent recursion + mode: undefined, + turns: undefined, + }; + + const turnResult = await evaluateCandidate({ + evalCase: turnEvalCase, + candidate: assistantContent, + target, + provider, + evaluators, + typeRegistry, + promptInputs: { + question: buildConversationContext(history, windowSize), + chatPrompt: windowSize ? 
buildWindowedHistory(history, windowSize) : [...history], + }, + nowFn, + attempt: 0, + graderProvider, + agentTimeoutMs, + output: response.output, + verbose, + threshold, + targetResolver, + availableTargets, + }); + + const turnScore = turnResult.score; + allTurnScoreValues.push(turnScore); + + turnScores.push({ + name: `turn-${turnIndex}`, + type: 'rubrics' as EvaluatorKind, + score: turnScore, + verdict: scoreToVerdict(turnScore, threshold ?? DEFAULT_THRESHOLD) as EvaluationVerdict, + assertions: turnResult.assertions ? [...turnResult.assertions] : [], + scores: turnResult.scores, + }); + + // Check if we should stop on failure + if (onTurnFailure === 'stop' && turnScore < (threshold ?? DEFAULT_THRESHOLD)) { + stopped = true; + } + } + + // Run conversation-level assertions (top-level assertions on full transcript) + let conversationScores: EvaluatorResult[] = []; + if (evalCase.assertions?.length) { + const lastAssistantContent = history.filter(m => m.role === 'assistant').pop()?.content ?? ''; + + const conversationEvalCase: EvalTest = { + ...evalCase, + id: `${evalCase.id}/conversation`, + input: history.map(m => ({ + role: m.role as TestMessageRole, + content: m.content, + })), + expected_output: [], + mode: undefined, + turns: undefined, + }; + + const fullTranscript = history.map(m => `${m.role}: ${m.content}`).join('\n\n'); + + const conversationResult = await evaluateCandidate({ + evalCase: conversationEvalCase, + candidate: lastAssistantContent, + target, + provider, + evaluators, + typeRegistry, + promptInputs: { + question: fullTranscript, + chatPrompt: [...history], + }, + nowFn, + attempt: 0, + graderProvider, + agentTimeoutMs, + verbose, + threshold, + targetResolver, + availableTargets, + }); + + conversationScores = [{ + name: 'conversation', + type: 'rubrics' as EvaluatorKind, + score: conversationResult.score, + verdict: scoreToVerdict(conversationResult.score, threshold ?? 
DEFAULT_THRESHOLD) as EvaluationVerdict, + assertions: conversationResult.assertions ? [...conversationResult.assertions] : [], + scores: conversationResult.scores, + }]; + } + + // Aggregate final score + const allScoreValues = [ + ...allTurnScoreValues, + ...conversationScores.map(s => s.score), + ]; + + const finalScore = aggregateConversationScores(allScoreValues, aggregation); + const allResultScores = [...turnScores, ...conversationScores]; + + // Build output as full conversation transcript + const outputMessages: Message[] = history.map(m => ({ + role: m.role, + content: m.content, + })); + + const flatAssertions: AssertionEntry[] = allResultScores.flatMap(s => [...s.assertions]); + const totalDurationMs = Date.now() - caseStartMs; + + return { + timestamp: nowFn().toISOString(), + testId: evalCase.id, + suite: evalCase.suite, + category: evalCase.category, + score: finalScore, + assertions: flatAssertions, + target: target.name, + output: outputMessages, + scores: allResultScores, + executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD), + input: evalCase.input.map(m => ({ + role: m.role, + content: typeof m.content === 'string' ? m.content : JSON.stringify(m.content), + })), + evalRun: { durationMs: totalDurationMs }, + }; +} + +/** Include system messages + last windowSize*2 non-system messages */ +function buildWindowedHistory(history: readonly ChatMessage[], windowSize: number): ChatMessage[] { + const systemMessages = history.filter(m => m.role === 'system'); + const nonSystem = history.filter(m => m.role !== 'system'); + const windowed = nonSystem.slice(-windowSize * 2); + return [...systemMessages, ...windowed]; +} + +/** Build a text representation of the conversation for grader context */ +function buildConversationContext(history: readonly ChatMessage[], windowSize?: number): string { + const msgs = windowSize ? 
buildWindowedHistory(history, windowSize) : history; + return msgs.map(m => `${m.role}: ${m.content}`).join('\n\n'); +} + +/** Build TestMessage[] from history for synthetic EvalTest input */ +function buildTurnGraderInput(history: readonly ChatMessage[], windowSize?: number): TestMessage[] { + const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history; + return msgs.map(m => ({ + role: m.role as TestMessageRole, + content: m.content, + })); +} + +/** + * Convert per-turn assertions to EvaluatorConfig[]. + * String assertions are grouped into a single rubrics evaluator. + * Structured assertions pass through as-is. + */ +function buildTurnAssertions(turn: ConversationTurn): EvaluatorConfig[] { + if (!turn.assertions?.length) return []; + + const stringCriteria: string[] = []; + const structured: EvaluatorConfig[] = []; + + for (const a of turn.assertions) { + if (typeof a === 'string') { + stringCriteria.push(a); + } else { + structured.push(a); + } + } + + const result: EvaluatorConfig[] = []; + + // Group string assertions into a single rubrics evaluator + if (stringCriteria.length > 0) { + result.push({ + name: 'turn-rubrics', + type: 'rubrics' as EvaluatorKind, + criteria: stringCriteria.map((text, idx) => ({ + id: `criterion-${idx + 1}`, + outcome: text, + weight: 1, + })), + } as unknown as EvaluatorConfig); + } + + result.push(...structured); + return result; +} + +/** Aggregate turn scores using the configured strategy */ +function aggregateConversationScores(scores: readonly number[], aggregation: ConversationAggregation): number { + if (scores.length === 0) return 1.0; + switch (aggregation) { + case 'min': + return Math.min(...scores); + case 'max': + return Math.max(...scores); + case 'mean': + default: + return scores.reduce((sum, s) => sum + s, 0) / scores.length; + } +} + async function invokeProvider( provider: Provider, options: { diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 
3b4adbaec..9a3705fbd 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -858,6 +858,41 @@ export type EvaluatorConfig = | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig; +/** + * A single turn in a multi-turn conversation evaluation. + * Each turn is a user message. The runner generates the assistant response. + */ +export interface ConversationTurn { + /** User message for this turn */ + readonly input: TestMessageContent; + /** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */ + readonly expected_output?: TestMessageContent; + /** Per-turn assertions. Strings become rubric criteria via shorthand. */ + readonly assertions?: readonly (string | EvaluatorConfig)[]; +} + +/** + * Conversation evaluation mode. + * - undefined: standard single-response evaluation (default, backward-compatible) + * - 'conversation': multi-turn evaluation where the LLM generates each assistant turn + */ +export type ConversationMode = 'conversation'; + +/** + * Score aggregation strategy for multi-turn conversation evaluation. + * - 'mean': average of all turn scores (default) + * - 'min': weakest-link scoring — final score = lowest turn score + * - 'max': best turn score + */ +export type ConversationAggregation = 'mean' | 'min' | 'max'; + +/** + * Behavior when a turn's assertions fail. + * - 'continue': run all remaining turns regardless (default) + * - 'stop': skip remaining turns, score them as 0 + */ +export type TurnFailurePolicy = 'continue' | 'stop'; + /** * Eval test definition sourced from AgentV specs. */ @@ -884,6 +919,16 @@ export interface EvalTest { readonly targets?: readonly string[]; /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */ readonly threshold?: number; + /** Conversation evaluation mode. When 'conversation', turns[] drives turn-by-turn LLM evaluation. 
*/ + readonly mode?: ConversationMode; + /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */ + readonly turns?: readonly ConversationTurn[]; + /** Score aggregation for conversation turns: mean (default), min (weakest-link), max */ + readonly aggregation?: ConversationAggregation; + /** Behavior on turn assertion failure: continue (default) or stop */ + readonly on_turn_failure?: TurnFailurePolicy; + /** Sliding window size for context passed to per-turn graders. Default: all turns. */ + readonly window_size?: number; /** Test IDs this test depends on. Dependent tests wait for all dependencies to complete before running. */ readonly depends_on?: readonly string[]; /** What to do when a dependency fails: skip (default), fail, or run anyway. */ diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 5de36a1a8..e35f46287 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -355,6 +355,16 @@ const ExecutionSchema = z.object({ threshold: z.number().min(0).max(1).optional(), }); +/** Per-turn assertion: string shorthand (becomes rubric) or full evaluator config */ +const TurnAssertionSchema = z.union([z.string(), EvaluatorSchema]); + +/** A single turn in a multi-turn conversation */ +const ConversationTurnSchema = z.object({ + input: z.union([z.string(), MessageContentSchema]), + expected_output: z.union([z.string(), MessageContentSchema]).optional(), + assertions: z.array(TurnAssertionSchema).optional(), +}); + // --------------------------------------------------------------------------- // Test case // --------------------------------------------------------------------------- @@ -375,6 +385,11 @@ const EvalTestSchema = z.object({ note: z.string().optional(), depends_on: z.array(z.string()).optional(), on_dependency_failure: z.enum(['skip', 'fail', 
'run']).optional(), + mode: z.enum(['conversation']).optional(), + turns: z.array(ConversationTurnSchema).min(1).optional(), + aggregation: z.enum(['mean', 'min', 'max']).optional(), + on_turn_failure: z.enum(['continue', 'stop']).optional(), + window_size: z.number().int().min(1).optional(), }); // --------------------------------------------------------------------------- diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 4506acc50..4acad75e6 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -67,6 +67,13 @@ const KNOWN_TEST_FIELDS = new Set([ 'conversation_id', 'suite', 'note', + 'depends_on', + 'on_dependency_failure', + 'mode', + 'turns', + 'aggregation', + 'on_turn_failure', + 'window_size', ]); /** Name field pattern: lowercase alphanumeric with hyphens. */ @@ -328,6 +335,9 @@ export async function validateEvalFile(filePath: string): Promise= 1 + ? (testCaseConfig.window_size as number) + : undefined; + const testCase: EvalTest = { id, suite: suiteName, @@ -540,6 +567,11 @@ async function loadTestsFromYaml( metadata, targets: caseTargets, ...(caseThreshold !== undefined ? { threshold: caseThreshold } : {}), + ...(mode ? { mode } : {}), + ...(turns && turns.length > 0 ? { turns } : {}), + ...(aggregation ? { aggregation } : {}), + ...(onTurnFailure ? { on_turn_failure: onTurnFailure } : {}), + ...(windowSize !== undefined ? { window_size: windowSize } : {}), ...(dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {}), ...(onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}), }; @@ -571,6 +603,35 @@ export async function loadTestById( /** @deprecated Use `loadTestById` instead */ export const loadEvalCaseById = loadTestById; +/** + * Parse raw turn data from YAML into typed ConversationTurn objects. 
+ * String assertions are preserved as-is — they become rubric criteria at runtime. + * Structured assertion objects pass through unchanged. + */ +function parseTurns(rawTurns: readonly unknown[]): ConversationTurn[] { + return rawTurns.map((rawTurn) => { + const turn = rawTurn as Record; + const input = turn.input as TestMessageContent; + const expectedOutput = turn.expected_output as TestMessageContent | undefined; + + // Parse per-turn assertions (string shorthand or structured evaluator config) + let assertions: (string | EvaluatorConfig)[] | undefined; + if (Array.isArray(turn.assertions)) { + assertions = turn.assertions.map((a: unknown) => { + if (typeof a === 'string') return a; + // Structured evaluator config — pass through as-is (validated by Zod schema) + return a as EvaluatorConfig; + }); + } + + return { + input, + ...(expectedOutput !== undefined ? { expected_output: expectedOutput } : {}), + ...(assertions && assertions.length > 0 ? { assertions } : {}), + }; + }); +} + /** * Normalize a command value from YAML into a string array. * Accepts a string (split on whitespace) or an array of strings. 
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 69d694bbe..a5ae6f2a5 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -56,7 +56,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -70,20 +75,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -121,7 +136,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -135,20 +155,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -176,7 +206,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -190,20 +225,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], 
+ "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -247,7 +292,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -321,12 +369,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -363,7 +417,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -458,7 +515,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -509,12 +569,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -525,7 +590,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -588,7 +655,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -604,7 +673,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -621,7 +693,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -638,13 +713,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ 
-681,11 +761,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -726,7 +815,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -740,7 +834,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -751,7 +850,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -759,7 +860,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -773,7 +879,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -784,7 +895,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -821,7 +935,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -833,7 +950,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -855,17 +976,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] 
+ "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -909,7 +1039,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -953,7 +1086,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -990,7 +1126,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -1005,7 +1144,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1042,7 +1183,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -1074,7 +1218,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1117,7 +1263,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1160,7 +1309,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1197,10 +1349,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1243,7 +1400,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1332,7 +1492,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1342,7 
+1505,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -1386,7 +1552,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -1460,12 +1629,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -1502,7 +1677,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -1597,7 +1775,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1648,12 +1829,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1664,7 +1850,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -1727,7 +1915,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1743,7 +1933,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -1760,7 +1953,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -1777,13 +1973,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -1820,11 +2021,20 @@ }, 
"type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -1865,7 +2075,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1879,7 +2094,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1890,7 +2110,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -1898,7 +2120,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1912,7 +2139,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1923,7 +2155,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -1960,7 +2195,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -1972,7 +2210,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -1994,17 +2236,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + 
"enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -2048,7 +2299,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2092,7 +2346,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -2129,7 +2386,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -2144,7 +2404,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2181,7 +2443,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -2213,7 +2478,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2256,7 +2523,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2299,7 +2569,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2336,10 +2609,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2382,7 +2660,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2471,7 +2752,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2481,7 
+2765,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -2542,7 +2829,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -2616,12 +2906,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -2658,7 +2954,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -2753,7 +3052,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2804,12 +3106,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2820,7 +3127,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -2883,7 +3192,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2899,7 +3210,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2916,7 +3230,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -2933,13 +3250,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -2976,11 +3298,20 @@ }, 
"type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -3021,7 +3352,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3035,7 +3371,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3046,7 +3387,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -3054,7 +3397,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3068,7 +3416,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3079,7 +3432,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -3116,7 +3472,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -3128,7 +3487,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -3150,17 +3513,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + 
"enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -3204,7 +3576,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -3248,7 +3623,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -3285,7 +3663,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -3300,7 +3681,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3337,7 +3720,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -3369,7 +3755,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3412,7 +3800,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3455,7 +3846,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3492,10 +3886,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3538,7 +3937,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3627,7 +4029,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3637,7 
+4042,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -3681,7 +4089,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -3755,12 +4166,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -3797,7 +4214,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -3892,7 +4312,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3943,12 +4366,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3959,7 +4387,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -4022,7 +4452,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4038,7 +4470,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4055,7 +4490,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -4072,13 +4510,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -4115,11 +4558,20 @@ }, 
"type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -4160,7 +4612,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4174,7 +4631,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4185,7 +4647,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -4193,7 +4657,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4207,7 +4676,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4218,7 +4692,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -4255,7 +4732,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -4267,7 +4747,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -4289,17 +4773,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + 
"enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -4343,7 +4836,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4387,7 +4883,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -4424,7 +4923,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -4439,7 +4941,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4476,7 +4980,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -4508,7 +5015,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4551,7 +5060,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4594,7 +5106,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4631,10 +5146,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4677,7 +5197,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4766,7 +5289,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4776,7 
+5302,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -4797,7 +5326,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -4808,7 +5341,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -4841,7 +5376,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -4865,7 +5403,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -4879,7 +5420,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -4896,7 +5440,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -4960,7 +5507,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -4991,7 +5542,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -5022,7 +5577,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -5053,7 +5612,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -5063,7 +5626,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": 
"string" @@ -5086,7 +5653,9 @@ "minimum": 0.1 } }, - "required": ["image"], + "required": [ + "image" + ], "additionalProperties": false } }, @@ -5113,47 +5682,29 @@ }, "on_dependency_failure": { "type": "string", - "enum": ["skip", "fail", "run"] - } - }, - "required": ["id"], - "additionalProperties": false - } - }, - { - "type": "string" - } - ] - }, - "eval_cases": { - "anyOf": [ - { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string", - "minLength": 1 + "enum": [ + "skip", + "fail", + "run" + ] }, - "criteria": { - "type": "string" + "mode": { + "type": "string", + "enum": [ + "conversation" + ] }, - "input": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { + "type": "string" }, - "content": { + { "anyOf": [ { "type": "string" @@ -5165,50 +5716,33 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "input_files": { - "type": "array", - "items": { - "type": "string" - } - }, - "expected_output": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" }, - "content": { + { "anyOf": [ { "type": "string" @@ -5220,708 +5754,3505 @@ "properties": { "type": { "type": "string", - "enum": 
["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - 
"additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "code-grader", + "code_grader" + ] + }, + "command": { + "anyOf": [ + { "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - 
"minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } + } + ] }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { + "cwd": { "type": "string" }, - { + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { "type": "array", "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - 
"minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + } } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - 
"exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" + "required": [ + "type", + "command" + ], + "additionalProperties": false }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, - { + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { "type": "array", "items": { - "type": "string" - } - } - ] - }, - 
"argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } }, - { + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false } } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - 
} - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" + }, + "required": [ + "type" + ], + "additionalProperties": false }, - "tolerance": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "include" + ], + "additionalProperties": false }, - "relative": { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + 
"additionalProperties": { + "type": "number" + } + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": [ + "type", + "path" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": [ + "type" + ], + "additionalProperties": false + } + ] + } + }, + "required": [ + "type", + "aggregator" + ], + "additionalProperties": false }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + 
"maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "tool" + ], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "mode" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": 
{ + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "field-accuracy", + "field_accuracy" + ] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "path", + "match" + ], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": [ + "weighted_average", + "all_or_nothing" + ] + } + }, + "required": [ + "type", + "fields" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": 
{ + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type", + "budget" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "token-usage", + "token_usage" + ] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "execution-metrics", + "execution_metrics" + ] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type" + ], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "is-json", + "is_json" + ] + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" 
+ }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": [ + "type", + "criteria" + ], + "additionalProperties": false + } + ] + } + ] + } + } + }, + "required": [ + "input" + ], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + 
"enum": [ + "mean", + "min", + "max" + ] + }, + "on_turn_failure": { + "type": "string", + "enum": [ + "continue", + "stop" + ] + }, + "window_size": { + "type": "integer", + "minimum": 1 + } + }, + "required": [ + "id" + ], + "additionalProperties": false + } + }, + { + "type": "string" + } + ] + }, + "eval_cases": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "criteria": { + "type": "string" + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "system", + "user", + "assistant", + "tool" + ] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "text", + "file", + "image" + ] + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + } + } + ] + } + }, + "required": [ + "role", + "content" + ], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + } + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "system", + "user", + "assistant", + "tool" + ] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "text", + "file", + "image" + ] + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + } + } + ] + } + }, + "required": [ + "role", + "content" + ], + "additionalProperties": false + } + } + ] + }, + "assertions": { + 
"type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "code-grader", + "code_grader" + ] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + } + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": 
"string", + "enum": [ + "llm-grader", + "llm_grader" + ] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "command" + ], + 
"additionalProperties": false + } + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "include" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": [ + "type", + "path" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": [ + "type" + ], + "additionalProperties": false + } + ] + } + }, + "required": [ + "type", + 
"aggregator" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "tool" + ], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + 
"items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "mode" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "field-accuracy", + "field_accuracy" + ] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "path", + "match" + ], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": [ + "weighted_average", + "all_or_nothing" + ] + } + }, + "required": [ + "type", + "fields" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type", + "threshold" + ], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type", + "budget" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "token-usage", + "token_usage" + ] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "execution-metrics", + "execution_metrics" + ] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + 
"max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "is-json", + "is_json" + ] + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + 
"type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": [ + "type", + "criteria" + ], + "additionalProperties": false + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "code-grader", + "code_grader" + ] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + } + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + 
"required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "llm-grader", + "llm_grader" + ] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": 
{ + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + } + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "include" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": [ + "type", + "path" + ], + "additionalProperties": false + 
}, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": [ + "type" + ], + "additionalProperties": false + } + ] + } + }, + "required": [ + "type", + "aggregator" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "tool" + ], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + 
"enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "mode" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "field-accuracy", + "field_accuracy" + ] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "path", + "match" + ], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": [ + "weighted_average", + "all_or_nothing" + ] + } + }, + "required": [ + "type", + "fields" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": 
"number", "exclusiveMinimum": true, "minimum": 0, @@ -5939,7 +9270,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -5983,7 +9317,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -6020,7 +9357,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -6035,7 +9375,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6072,7 +9414,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -6104,7 +9449,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6147,7 +9494,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6190,7 +9540,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6227,10 +9580,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6273,7 +9631,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6340,1201 +9701,1328 @@ "items": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + 
{ + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": [ + "type", + "criteria" + ], + "additionalProperties": false + } + ] + } + }, + "execution": { + "type": "object", + "properties": { + "target": { + "type": "string" + }, + "targets": { + "type": "array", + "items": { + "type": "string" + } + }, + "workers": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "code-grader", + "code_grader" + ] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + 
} + } + ] + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + } + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "llm-grader", + "llm_grader" + ] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] }, - "outcome": { - "type": "string", - "minLength": 1 + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} } }, - "required": ["score_range", "outcome"], "additionalProperties": false } - } - }, - "additionalProperties": false - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "evaluators": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - 
"type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" + ] }, - { + "rubrics": { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false } - } - ] - }, - "script": { - "anyOf": [ - { + }, + "model": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" + "target": { + "type": "string" }, - { + "config": { "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 }, - { - 
"type": "array", - "items": { - "type": "string" - } + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] } - ] + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" + } + }, + "required": [ + "type" + ], + "additionalProperties": false }, - "weight": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "include" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" }, - { - "type": "array", - "items": { - "type": "string" + 
"weights": { + "type": "object", + "additionalProperties": { + "type": "number" } } - ] + }, + "required": [ + "type" + ], + "additionalProperties": false }, - "script": { - "anyOf": [ - { - "type": "string" + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" }, - { - "type": "array", - "items": { - "type": "string" - } + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 } - ] + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { + { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { + "type": { "type": "string", - "minLength": 1 + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" } }, - "required": ["score_range", "outcome"], + "required": [ + "type", + "path" + ], "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": 
{ - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { "type": "string" } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type" + ], + "additionalProperties": false + } + ] + } + }, + "required": [ + "type", + "aggregator" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false + "minimum": 0 }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" + "required": { + "anyOf": [ + { + "type": "boolean" }, - "threshold": { + { "type": "number", + 
"exclusiveMinimum": true, "minimum": 0, "maximum": 1 } - }, - "required": ["type", "threshold"], - "additionalProperties": false + ] }, - { + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } }, - "path": { - "type": "string" + "required": [ + "tool" + ], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, - "cwd": { - "type": "string" + { + "type": "array", + "items": { + "type": "string" + } } - }, - "required": ["type", "path"], - "additionalProperties": false + ] }, - { - "type": "object", - "properties": { - "type": { + "argsMatch": { + "anyOf": [ + { "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": 
"string" + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, - "model": { - "type": "string" + { + "type": "array", + "items": { + "type": "string" + } } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + ] + } + }, + "required": [ + "type", + "mode" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "field-accuracy", + "field_accuracy" + ] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - 
"maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { + "match": { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "required": { + "type": "boolean" }, - { + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { "type": "array", "items": { "type": "string" } } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + }, + "required": [ + "path", + "match" + ], + "additionalProperties": false + }, + "minItems": 1 }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { + "aggregation": { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "enum": [ + "weighted_average", + "all_or_nothing" + ] } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": 
"string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + "required": [ + "type", + "fields" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + 
"properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type", + "budget" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "token-usage", + "token_usage" + ] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - 
"max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { "type": "boolean" }, - { + "type": { + "type": "string", + "enum": [ + "execution-metrics", + "execution_metrics" + ] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - 
"minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + 
}, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] }, - { + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "is-json", + "is_json" + ] } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + 
"min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 + }, + "negate": { + "type": "boolean" + 
}, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 + "outcome": { + "type": "string", + "minLength": 1 } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false + } + }, + "additionalProperties": false + }, + "minItems": 1 + } }, - "minItems": 1 + "required": [ + "type", + "criteria" + ], + "additionalProperties": false } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "execution": { - "type": "object", - "properties": { - "target": { - "type": "string" - }, - "targets": { - "type": "array", - "items": { - "type": "string" + ] } }, - "workers": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "assertions": { + "evaluators": { "type": "array", "items": { "anyOf": [ @@ -7572,7 +11060,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -7646,12 +11137,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": 
["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -7688,7 +11185,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -7783,7 +11283,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -7834,12 +11337,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7850,7 +11358,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -7913,7 +11423,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7929,7 +11441,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -7946,7 +11461,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -7963,13 +11481,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -8006,11 +11529,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -8051,7 +11583,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + 
"exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8065,7 +11602,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8076,7 +11618,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -8084,7 +11628,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8098,7 +11647,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8109,7 +11663,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -8146,7 +11703,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -8158,7 +11718,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -8180,17 +11744,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -8234,7 +11807,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -8278,7 +11854,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], 
"additionalProperties": false }, { @@ -8315,7 +11894,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -8330,7 +11912,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8367,7 +11951,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -8399,7 +11986,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8442,7 +12031,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8485,7 +12077,55 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "is-json", + "is_json" + ] + } + }, + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8522,10 +12162,16 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "const": "equals" + }, + "value": { + "type": "string" } }, - "required": ["type"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8557,337 +12203,580 @@ "minimum": 0, "maximum": 1 }, - "negate": { - "type": "boolean" + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": 
"rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": [ + "type", + "criteria" + ], + "additionalProperties": false + } + ] + } + }, + "skip_defaults": { + "type": "boolean" + }, + "cache": { + "type": "boolean" + }, + "trials": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + }, + "costLimitUsd": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "count" + ], + "additionalProperties": false + }, + "total_budget_usd": { + "type": "number", + "minimum": 0 + }, + "totalBudgetUsd": { + "type": "number", + "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "additionalProperties": false + }, + "workspace": { + "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": [ + "shared", + "per_test" + ] + }, + "repos": { + "type": 
"array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "source": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "git" + }, + "url": { + "type": "string", + "format": "uri" + } + }, + "required": [ + "type", + "url" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "local" + }, + "path": { + "type": "string" + } + }, + "required": [ + "type", + "path" + ], + "additionalProperties": false + } + ] + }, + "checkout": { + "type": "object", + "properties": { + "ref": { + "type": "string" + }, + "base_commit": { + "type": "string", + "minLength": 1 }, - "type": { + "resolve": { "type": "string", - "const": "equals" + "enum": [ + "remote", + "local" + ] }, - "value": { - "type": "string" + "ancestor": { + "type": "integer", + "minimum": 0 } }, - "required": ["type", "value"], "additionalProperties": false }, - { + "clone": { "type": "object", "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" + "depth": { + "type": "integer", + "minimum": 1 }, - "type": { - "type": "string", - "const": "rubrics" + "filter": { + "type": "string" }, - "criteria": { + "sparse": { "type": "array", "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - 
"score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - }, - "minItems": 1 + "type": "string" + } } }, - "required": ["type", "criteria"], "additionalProperties": false } - ] + }, + "additionalProperties": false } }, - "evaluators": { - "type": "array", - "items": { + "hooks": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "before_all": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { + "type": "string" + } + }, + "script": { + "type": "array", + "items": { + "type": "string" + } + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": [ + "none", + "fast", + "strict" + ] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { + "type": "string" + } + }, + "script": { + "type": "array", + "items": { + "type": "string" + } + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": [ + "none", + "fast", + "strict" + ] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { + "type": "string" + } + }, + "script": { + "type": "array", + "items": { + "type": "string" + } + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": [ + "none", + "fast", + "strict" + 
] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": { + "type": "string" + } + }, + "script": { + "type": "array", + "items": { + "type": "string" + } + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": [ + "none", + "fast", + "strict" + ] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "mode": { + "type": "string", + "enum": [ + "pooled", + "temp", + "static" + ] + }, + "path": { + "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": [ + "image" + ], + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "metadata": { + "type": "object", + "additionalProperties": {} + }, + "conversation_id": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "note": { + "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": [ + "skip", + "fail", + "run" + ] + }, + "mode": { + "type": "string", + "enum": [ + "conversation" + ] + }, + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { "anyOf": [ { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": 
"string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { + "type": "string" + }, + { + "anyOf": [ + { "type": "string" }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { + { "type": "array", "items": { "type": "object", "properties": { "type": { "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } + "enum": [ + "text", + "file", + "image" ] + }, + "value": { + "type": "string" } }, - "required": ["type", "command"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } - }, - "required": ["type", "command"], - "additionalProperties": false + ] + } + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" }, { - "type": "object", - "properties": { - "name": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - 
"anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { + { "type": "array", "items": { "type": "object", "properties": { - "id": { - "type": "string" + "type": { + "type": "string", + "enum": [ + "text", + "file", + "image" + ] }, - "outcome": { + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + } + } + ] + } + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { - "type": "number" + "type": "number", + "minimum": 0 }, "required": { - "type": "boolean" + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] }, "min_score": { "type": "number", @@ -8895,302 +12784,585 @@ "minimum": 0, "maximum": 1 }, - "score_ranges": { + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "code-grader", + "code_grader" + ] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { "type": "array", "items": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + "type": { + "type": "string", + "minLength": 1 + }, + 
"command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] - }, - "outcome": { - "type": "string", - "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, + "required": [ + "type", + "command" + ], "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { + }, + { "type": "object", "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, "type": { "type": "string", - "minLength": 1 + "enum": [ + "llm-grader", + "llm_grader" + ] }, - "command": { + "prompt": { "anyOf": [ { "type": "string" }, { - "type": "array", - "items": { - "type": "string" - } + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false } ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - 
"additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { + }, + "rubrics": { + "type": "array", + "items": { "type": "object", - "additionalProperties": { - "type": "number" - } + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false } }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": 
"threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } + "model": { + "type": "string" }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } + "target": { + "type": "string" }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } + "config": { + "type": "object", + "additionalProperties": {} }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + } + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - 
"minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { + "required": [ + "type" + ], + "additionalProperties": false + }, + { "type": "object", "properties": { - "tool": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "include" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, - "args": { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "anyOf": [ { - "type": "string", - "const": "any" + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": [ + "type" + ], + "additionalProperties": false }, { "type": "object", - "additionalProperties": {} + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": [ + "type", + "path" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": 
{ + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": [ + "type" + ], + "additionalProperties": false } ] + } + }, + "required": [ + "type", + "aggregator" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - "max_duration_ms": { + "weight": { "type": "number", "minimum": 0 }, - "maxDurationMs": { + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { "type": "number", - "minimum": 0 + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "tool" + ], + "additionalProperties": false + } }, "args_match": { "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + 
"ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9204,7 +13376,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9215,938 +13392,658 @@ ] } }, - "required": ["tool"], + "required": [ + "type", + "mode" + ], "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { + }, + { "type": "object", "properties": { - "path": { + "name": { "type": "string" }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, "weight": { - "type": "number" - }, - "tolerance": { "type": "number", "minimum": 0 }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - 
"required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": 
["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "field-accuracy", + "field_accuracy" + ] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "path", + "match" + ], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": [ + "weighted_average", + "all_or_nothing" + ] + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - 
"maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": [ + "type", + "fields" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, 
+ "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": [ + "type", + "budget" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "token-usage", + "token_usage" + ] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": 
"number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "execution-metrics", + "execution_metrics" + ] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": 
"number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { "type": "object", "properties": { - "id": { + "name": { "type": "string" }, - "outcome": { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "is-json", + "is_json" + ] + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": 
"string" }, "weight": { - "type": "number" + "type": "number", + "minimum": 0 }, "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { "type": "boolean" }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 }, - "score_ranges": { + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { "type": "array", "items": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] + "id": { + "type": "string" }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "skip_defaults": { - "type": "boolean" - }, - "cache": { - "type": "boolean" - }, - "trials": { - "type": "object", - "properties": { - "count": { - "type": "integer", - "minimum": 1 - }, - "strategy": { - "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] - }, - "cost_limit_usd": { - "type": "number", - "minimum": 0 - 
}, - "costLimitUsd": { - "type": "number", - "minimum": 0 - } - }, - "required": ["count"], - "additionalProperties": false - }, - "total_budget_usd": { - "type": "number", - "minimum": 0 - }, - "totalBudgetUsd": { - "type": "number", - "minimum": 0 - }, - "fail_on_error": { - "type": "boolean" - }, - "failOnError": { - "type": "boolean" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "additionalProperties": false - }, - "workspace": { - "type": "object", - "properties": { - "template": { - "type": "string" - }, - "isolation": { - "type": "string", - "enum": ["shared", "per_test"] - }, - "repos": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "source": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "git" - }, - "url": { - "type": "string", - "format": "uri" - } - }, - "required": ["type", "url"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "local" - }, - "path": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - } - ] - }, - "checkout": { - "type": "object", - "properties": { - "ref": { - "type": "string" - }, - "base_commit": { - "type": "string", - "minLength": 1 - }, - "resolve": { - "type": "string", - "enum": ["remote", "local"] - }, - "ancestor": { - "type": "integer", - "minimum": 0 - } - }, - "additionalProperties": false - }, - "clone": { - "type": "object", - "properties": { - "depth": { - "type": "integer", - "minimum": 1 - }, - "filter": { - "type": "string" - }, - "sparse": { - "type": "array", - "items": { - "type": "string" + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + 
"items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": [ + "type", + "criteria" + ], + "additionalProperties": false } - } - }, - "additionalProperties": false - } - }, - "additionalProperties": false - } - }, - "hooks": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - }, - "before_all": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "before_each": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "after_each": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", 
"fast", "strict"] - } - }, - "additionalProperties": false - }, - "after_all": { - "type": "object", - "properties": { - "command": { - "type": "array", - "items": { - "type": "string" - } - }, - "script": { - "type": "array", - "items": { - "type": "string" - } - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] + ] } - }, - "additionalProperties": false + ] } - }, - "additionalProperties": false - }, - "mode": { - "type": "string", - "enum": ["pooled", "temp", "static"] - }, - "path": { - "type": "string" + } }, - "docker": { - "type": "object", - "properties": { - "image": { - "type": "string" - }, - "timeout": { - "type": "integer", - "minimum": 1 - }, - "memory": { - "type": "string" - }, - "cpus": { - "type": "number", - "minimum": 0.1 - } - }, - "required": ["image"], - "additionalProperties": false - } + "required": [ + "input" + ], + "additionalProperties": false }, - "additionalProperties": false - }, - "metadata": { - "type": "object", - "additionalProperties": {} - }, - "conversation_id": { - "type": "string" - }, - "suite": { - "type": "string" - }, - "note": { - "type": "string" + "minItems": 1 }, - "depends_on": { - "type": "array", - "items": { - "type": "string" - } + "aggregation": { + "type": "string", + "enum": [ + "mean", + "min", + "max" + ] }, - "on_dependency_failure": { + "on_turn_failure": { "type": "string", - "enum": ["skip", "fail", "run"] + "enum": [ + "continue", + "stop" + ] + }, + "window_size": { + "type": "integer", + "minimum": 1 } }, - "required": ["id"], + "required": [ + "id" + ], "additionalProperties": false } }, @@ -10213,7 +14110,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -10287,12 +14187,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" 
+ ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -10329,7 +14235,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -10424,7 +14333,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10475,12 +14387,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10491,7 +14408,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -10554,7 +14473,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10570,7 +14491,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -10587,7 +14511,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -10604,13 +14531,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -10647,11 +14579,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -10692,7 +14633,12 @@ "anyOf": [ { "type": "string", - 
"enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10706,7 +14652,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10717,7 +14668,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -10725,7 +14678,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10739,7 +14697,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10750,7 +14713,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -10787,7 +14753,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -10799,7 +14768,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -10821,17 +14794,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -10875,7 +14857,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -10919,7 +14904,10 @@ "minimum": 0 } }, - "required": 
["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -10956,7 +14944,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -10971,7 +14962,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11008,7 +15001,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -11040,7 +15036,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11083,7 +15081,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11126,7 +15127,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11163,10 +15167,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11209,7 +15218,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11298,7 +15310,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11308,7 +15323,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -11352,7 +15370,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -11426,12 +15447,18 @@ ] } }, - "required": 
["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -11468,7 +15495,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -11563,7 +15593,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11614,12 +15647,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11630,7 +15668,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -11693,7 +15733,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11709,7 +15751,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -11726,7 +15771,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -11743,13 +15791,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -11786,11 +15839,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", 
@@ -11831,7 +15893,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11845,7 +15912,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11856,7 +15928,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -11864,7 +15938,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11878,7 +15957,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11889,7 +15973,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -11926,7 +16013,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -11938,7 +16028,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -11960,17 +16054,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -12014,7 +16117,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { 
@@ -12058,7 +16164,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -12095,7 +16204,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -12110,7 +16222,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12147,7 +16261,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -12179,7 +16296,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12222,7 +16341,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12265,7 +16387,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12302,10 +16427,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12348,7 +16478,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12437,7 +16570,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12447,7 +16583,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -12468,7 +16607,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + 
"confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -12479,7 +16622,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -12542,7 +16687,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -12616,12 +16764,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -12658,7 +16812,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -12753,7 +16910,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12804,12 +16964,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12820,7 +16985,9 @@ "minLength": 1 } }, - "required": ["include"], + "required": [ + "include" + ], "additionalProperties": false }, { @@ -12883,7 +17050,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12899,7 +17068,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -12916,7 +17088,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -12933,13 +17108,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + 
"type", + "aggregator" + ], "additionalProperties": false }, { @@ -12976,11 +17156,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -13021,7 +17210,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -13035,7 +17229,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -13046,7 +17245,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -13054,7 +17255,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -13068,7 +17274,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -13079,7 +17290,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -13116,7 +17330,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -13128,7 +17345,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -13150,17 +17371,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": 
false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -13204,7 +17434,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -13248,7 +17481,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -13285,7 +17521,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -13300,7 +17539,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13337,7 +17578,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -13369,7 +17613,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13412,7 +17658,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13455,7 +17704,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13492,10 +17744,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13538,7 +17795,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13627,7 +17887,10 @@ "minLength": 1 } }, - 
"required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -13637,7 +17900,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -13666,7 +17932,10 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } }, @@ -13680,7 +17949,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -13704,7 +17976,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -13718,7 +17993,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -13735,7 +18013,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -13799,7 +18080,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13830,7 +18115,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13861,7 +18150,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13892,7 +18185,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13902,7 +18199,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" @@ -13925,7 +18226,9 @@ "minimum": 0.1 } }, - "required": 
["image"], + "required": [ + "image" + ], "additionalProperties": false } }, @@ -13937,7 +18240,9 @@ ] } }, - "required": ["tests"], + "required": [ + "tests" + ], "additionalProperties": false } } From 3ecf2a8f8f3bdd6861636b841b0d58897f81307d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 10:36:18 +0000 Subject: [PATCH 2/4] feat(eval): add multi-turn conversation-live example for UAT Adds examples/features/multi-turn-conversation-live/ with 5 test cases exercising conversation mode features: context retention, aggregation modes, on_turn_failure, mixed assertions, and conversation-level assertions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../multi-turn-conversation-live/README.md | 22 ++++ .../evals/dataset.eval.yaml | 105 ++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 examples/features/multi-turn-conversation-live/README.md create mode 100644 examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml diff --git a/examples/features/multi-turn-conversation-live/README.md b/examples/features/multi-turn-conversation-live/README.md new file mode 100644 index 000000000..db0a9bd28 --- /dev/null +++ b/examples/features/multi-turn-conversation-live/README.md @@ -0,0 +1,22 @@ +# Multi-Turn Conversation (Live) + +This example demonstrates **live turn-by-turn conversation evaluation** where the LLM generates each assistant response (unlike `multi-turn-conversation/` which scripts intermediate turns). 
+ +## Features Shown + +- `mode: conversation` — enables live turn-by-turn evaluation +- `turns[]` — each entry is a user message that generates an LLM call +- Per-turn `assertions` — string shorthand (rubric) and structured evaluators +- `aggregation: mean | min | max` — how turn scores combine +- `on_turn_failure: stop | continue` — behavior on assertion failure +- Top-level `assertions` — conversation-level grading after all turns + +## Running + +```bash +# With default target +bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml + +# With specific test +bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml --test-id context-retention +``` diff --git a/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml new file mode 100644 index 000000000..831f6597c --- /dev/null +++ b/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml @@ -0,0 +1,105 @@ +# Multi-turn conversation evaluation (live turn-by-turn) +# Each turn generates a fresh LLM call; per-turn assertions grade each response. +# This is different from multi-turn-conversation/ which scripts intermediate turns. + +description: Live multi-turn conversation evaluation with per-turn grading + +execution: + target: llm + +tests: + # Test 1: Basic context retention across turns + - id: context-retention + mode: conversation + criteria: Agent maintains context and provides relevant responses across turns + aggregation: mean + input: + - role: system + content: |- + You are a helpful math tutor. Be concise and accurate. + Always show your work step by step. + turns: + - input: What is 15% of 200? + assertions: + - Correctly calculates 15% of 200 as 30 + - Shows the calculation steps + - input: Now double that result. 
+ assertions: + - References the previous answer of 30 + - Correctly calculates double as 60 + - input: What were the original numbers I asked about? + assertions: + - Recalls that the user asked about 15% and 200 + - Demonstrates memory of the conversation context + + # Test 2: With aggregation: min (weakest-link scoring) + - id: weakest-link-scoring + mode: conversation + criteria: Agent provides accurate, well-structured responses + aggregation: min + input: + - role: system + content: You are a concise geography expert. Answer in 1-2 sentences. + turns: + - input: What is the capital of France? + assertions: + - Correctly identifies Paris as the capital of France + - input: What country is it in? + assertions: + - Recognizes the question refers to Paris from the previous turn + - Confirms Paris is in France + + # Test 3: With on_turn_failure: stop + - id: stop-on-failure + mode: conversation + on_turn_failure: stop + criteria: Agent follows instructions precisely + input: + - role: system + content: You are a helpful assistant. Be precise and accurate. + turns: + - input: What is 2 + 2? + assertions: + - Answers with 4 + - input: Multiply that by 3. + assertions: + - References the previous answer + - Calculates 12 correctly + + # Test 4: Mixed string and structured assertions + - id: mixed-assertions + mode: conversation + criteria: Agent writes correct, well-formed Python code + input: + - role: system + content: You are a helpful coding assistant. + turns: + - input: Write a Python function that adds two numbers. + assertions: + - Contains a Python function definition + - type: contains + value: def + - input: Now add type hints to the function. 
+ assertions: + - Includes type hints (int, float, or similar) + - type: contains + value: "->" + + # Test 5: Conversation-level assertions + - id: conversation-coherence + mode: conversation + criteria: Agent maintains a coherent, helpful conversation + input: + - role: system + content: You are a helpful travel advisor. Be concise. + turns: + - input: I want to visit somewhere warm in December. + assertions: + - Suggests at least one warm destination + - input: I prefer beaches over cities. + assertions: + - Adjusts recommendations toward beach destinations + - Does not suggest purely urban destinations + assertions: + - Agent maintains consistency — later suggestions align with earlier preferences + - Agent does not contradict its own prior recommendations From 36752d0b85b2b07dceff1194890f1a5bd9b474c9 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 10:41:01 +0000 Subject: [PATCH 3/4] test(eval): add unit tests for multi-turn conversation mode Tests for conversation-mode orchestrator, validation rules, and score aggregation (mean/min/max). Also fixes buildTurnAssertions to emit type: 'llm-grader' with rubrics instead of type: 'rubrics' (which is not registered in the builtin registry). The evaluator-parser uses the same pattern for YAML-sourced rubrics. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- packages/core/src/evaluation/orchestrator.ts | 8 +- .../test/evaluation/conversation-mode.test.ts | 811 ++++++++++++++++++ 2 files changed, 816 insertions(+), 3 deletions(-) create mode 100644 packages/core/test/evaluation/conversation-mode.test.ts diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 809787351..326ca44c5 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -3248,12 +3248,14 @@ function buildTurnAssertions(turn: ConversationTurn): EvaluatorConfig[] { const result: EvaluatorConfig[] = []; - // Group string assertions into a single rubrics evaluator + // Group string assertions into a single llm-grader evaluator with rubrics. + // Uses llm-grader (not rubrics) because 'rubrics' is a YAML shorthand resolved by + // the evaluator-parser — at runtime we always dispatch through 'llm-grader'. if (stringCriteria.length > 0) { result.push({ name: 'turn-rubrics', - type: 'rubrics' as EvaluatorKind, - criteria: stringCriteria.map((text, idx) => ({ + type: 'llm-grader' as EvaluatorKind, + rubrics: stringCriteria.map((text, idx) => ({ id: `criterion-${idx + 1}`, outcome: text, weight: 1, diff --git a/packages/core/test/evaluation/conversation-mode.test.ts b/packages/core/test/evaluation/conversation-mode.test.ts new file mode 100644 index 000000000..fde4c5c99 --- /dev/null +++ b/packages/core/test/evaluation/conversation-mode.test.ts @@ -0,0 +1,811 @@ +/** + * Unit tests for the multi-turn conversation mode feature. 
+ * + * Covers: + * - Orchestrator: runEvalCase with mode: conversation + * - Validation: validateEvalFile with conversation mode fields + * - Score aggregation strategies (mean, min, max) + * - Turn failure policies (continue, stop) + * - Window size behaviour + */ + +import { afterAll, beforeAll, describe, expect, it } from 'bun:test'; +import { mkdir, rm, writeFile } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; + +import { runEvalCase } from '../../src/evaluation/orchestrator.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { Provider, ProviderRequest, ProviderResponse } from '../../src/evaluation/providers/types.js'; +import type { EvalTest } from '../../src/evaluation/types.js'; +import { validateEvalFile } from '../../src/evaluation/validation/eval-validator.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +class SequenceProvider implements Provider { + readonly id: string; + readonly kind = 'mock' as const; + readonly targetName: string; + readonly requests: ProviderRequest[] = []; + private readonly responses: ProviderResponse[]; + private index = 0; + + constructor(targetName: string, responses: ProviderResponse[]) { + this.id = `mock:${targetName}`; + this.targetName = targetName; + this.responses = responses; + } + + async invoke(request: ProviderRequest): Promise { + this.requests.push(request); + if (this.index >= this.responses.length) { + throw new Error(`SequenceProvider: no more responses (called ${this.index + 1} times)`); + } + return this.responses[this.index++]; + } +} + +class ErrorOnFirstProvider implements Provider { + readonly id = 'error-first'; + readonly kind = 'mock' as const; + readonly targetName = 'error-first'; + private called = false; + private readonly fallbackResponse: ProviderResponse; + + constructor(fallback: 
ProviderResponse) { + this.fallbackResponse = fallback; + } + + async invoke(): Promise { + if (!this.called) { + this.called = true; + throw new Error('Simulated provider error'); + } + return this.fallbackResponse; + } +} + +const baseTarget: ResolvedTarget = { + kind: 'mock', + name: 'mock', + config: { response: '{}' }, +}; + +function makeEvaluatorRegistry(score = 1.0) { + return { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + return { + score, + verdict: score >= 0.5 ? ('pass' as const) : ('fail' as const), + assertions: [{ text: 'graded', passed: score >= 0.5 }], + expectedAspectCount: 1, + }; + }, + }, + }; +} + +function assistantResponse(content: string): ProviderResponse { + return { output: [{ role: 'assistant', content }] }; +} + +const nowFn = () => new Date('2024-01-01T00:00:00Z'); + +// --------------------------------------------------------------------------- +// Orchestrator — conversation mode +// --------------------------------------------------------------------------- + +describe('runEvalCase — conversation mode', () => { + it('basic 2-turn conversation with no assertions scores 1.0 and calls provider twice', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('Hello!'), + assistantResponse('Goodbye!'), + ]); + + const evalCase: EvalTest = { + id: 'conv-basic', + question: 'Chat test', + input: [{ role: 'user', content: 'Hi' }], + expected_output: [], + file_paths: [], + criteria: 'Be helpful', + mode: 'conversation', + turns: [ + { input: 'Turn 1 message' }, + { input: 'Turn 2 message' }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + expect(result.score).toBe(1.0); + expect(provider.requests).toHaveLength(2); + expect(result.executionStatus).toBe('ok'); + }); + + it('per-turn string assertions are evaluated and affect score', async () => { + const provider = new 
SequenceProvider('mock', [ + assistantResponse('Paris'), + assistantResponse('Berlin'), + ]); + + const evalCase: EvalTest = { + id: 'conv-string-assertions', + question: 'Geography', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Correct answers', + mode: 'conversation', + turns: [ + { input: 'Capital of France?', assertions: ['Response mentions Paris'] }, + { input: 'Capital of Germany?', assertions: ['Response mentions Berlin'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(1.0), + now: nowFn, + }); + + expect(result.score).toBeGreaterThan(0); + expect(provider.requests).toHaveLength(2); + }); + + it('per-turn structured assertions are evaluated', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('42'), + ]); + + const evalCase: EvalTest = { + id: 'conv-struct-assertions', + question: 'Math', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Correct', + mode: 'conversation', + turns: [ + { + input: 'What is 6 * 7?', + assertions: [{ type: 'llm-grader', criteria: 'Answer is 42' }], + }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(1.0), + now: nowFn, + }); + + expect(result.score).toBeGreaterThan(0); + expect(provider.requests).toHaveLength(1); + }); + + it('conversation-level assertions are evaluated against full transcript', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('Yes'), + assistantResponse('No'), + ]); + + const evalCase: EvalTest = { + id: 'conv-top-level', + question: 'Consistency check', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Consistent throughout', + mode: 'conversation', + turns: [ + { input: 'Turn 1' }, + { input: 'Turn 2' }, + ], + assertions: [{ type: 'llm-grader', criteria: 'Conversation was coherent' }], + }; + + const result = await 
runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(0.9), + now: nowFn, + }); + + // Should have per-turn scores plus a conversation-level score + expect(result.scores).toBeDefined(); + const hasConversationScore = result.scores?.some((s) => s.name === 'conversation'); + expect(hasConversationScore).toBe(true); + }); + + it('aggregation: mean — averages all turn scores', async () => { + // 3 turns, no per-turn assertions → each scores 1.0 + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-mean', + question: 'mean test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + aggregation: 'mean', + turns: [ + { input: 'T1' }, + { input: 'T2' }, + { input: 'T3' }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + expect(result.score).toBeCloseTo(1.0, 5); + }); + + it('aggregation: min — uses lowest turn score', async () => { + // Use per-turn assertions so scores are driven by the grader + // Turn 1: grader returns 1.0, Turn 2: 0.5, Turn 3: 0.8 + let callCount = 0; + const scores = [1.0, 0.5, 0.8]; + + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + const s = scores[callCount++] ?? 1.0; + return { + score: s, + verdict: s >= 0.5 ? 
('pass' as const) : ('fail' as const), + assertions: [{ text: 'graded', passed: s >= 0.5 }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-min', + question: 'min test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + aggregation: 'min', + turns: [ + { input: 'T1', assertions: ['Criterion A'] }, + { input: 'T2', assertions: ['Criterion B'] }, + { input: 'T3', assertions: ['Criterion C'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + expect(result.score).toBeCloseTo(0.5, 5); + }); + + it('aggregation: max — uses highest turn score', async () => { + let callCount = 0; + const scores = [1.0, 0.5, 0.8]; + + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + const s = scores[callCount++] ?? 1.0; + return { + score: s, + verdict: s >= 0.5 ? 
('pass' as const) : ('fail' as const), + assertions: [{ text: 'graded', passed: s >= 0.5 }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-max', + question: 'max test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + aggregation: 'max', + turns: [ + { input: 'T1', assertions: ['Criterion A'] }, + { input: 'T2', assertions: ['Criterion B'] }, + { input: 'T3', assertions: ['Criterion C'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + expect(result.score).toBeCloseTo(1.0, 5); + }); + + it('on_turn_failure: stop — skips remaining turns after first failure', async () => { + let callCount = 0; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + callCount++; + // First grader call fails + return { + score: 0.0, + verdict: 'fail' as const, + assertions: [{ text: 'failed', passed: false }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('Turn 1 response'), + assistantResponse('Turn 2 response'), + assistantResponse('Turn 3 response'), + ]); + + const evalCase: EvalTest = { + id: 'conv-stop', + question: 'stop test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + on_turn_failure: 'stop', + turns: [ + { input: 'T1', assertions: ['Criterion'] }, + { input: 'T2', assertions: ['Criterion'] }, + { input: 'T3', assertions: ['Criterion'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // Provider should only be called once (first turn) + 
expect(provider.requests).toHaveLength(1); + + // Skipped turns should have score 0 with skip verdict + const skippedScores = result.scores?.filter((s) => s.verdict === 'skip') ?? []; + expect(skippedScores.length).toBeGreaterThanOrEqual(2); + }); + + it('on_turn_failure: continue (default) — all turns run even after failure', async () => { + let callCount = 0; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + callCount++; + return { + score: callCount === 1 ? 0.0 : 1.0, + verdict: callCount === 1 ? ('fail' as const) : ('pass' as const), + assertions: [{ text: 'graded', passed: callCount !== 1 }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new SequenceProvider('mock', [ + assistantResponse('A'), + assistantResponse('B'), + assistantResponse('C'), + ]); + + const evalCase: EvalTest = { + id: 'conv-continue', + question: 'continue test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + on_turn_failure: 'continue', + turns: [ + { input: 'T1', assertions: ['Criterion'] }, + { input: 'T2', assertions: ['Criterion'] }, + { input: 'T3', assertions: ['Criterion'] }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // All 3 turns must run + expect(provider.requests).toHaveLength(3); + // No skipped turns + const skippedScores = result.scores?.filter((s) => s.verdict === 'skip') ?? 
[]; + expect(skippedScores).toHaveLength(0); + }); + + it('window_size — chatPrompt passed to provider is limited to system + last N*2 messages', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('R1'), + assistantResponse('R2'), + assistantResponse('R3'), + ]); + + const evalCase: EvalTest = { + id: 'conv-window', + question: 'window test', + input: [{ role: 'system', content: 'System prompt' }], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + window_size: 1, // keep system + last 1 user+assistant pair + turns: [ + { input: 'T1' }, + { input: 'T2' }, + { input: 'T3' }, + ], + }; + + await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + // Provider called 3 times + expect(provider.requests).toHaveLength(3); + + // Third call chatPrompt should not include T1's messages (windowed) + const thirdRequest = provider.requests[2]; + const chatPrompt = thirdRequest?.chatPrompt ?? []; + // System prompt should always be present + expect(chatPrompt.some((m) => m.role === 'system')).toBe(true); + // With window_size=1: system + last 2 messages (T2 user + T2 assistant). 
+ // T1 user message should NOT be in the windowed prompt + const userMessages = chatPrompt.filter((m) => m.role === 'user'); + expect(userMessages.length).toBeLessThanOrEqual(1); + }); + + it('provider error on a turn — turn scores 0 and execution continues', async () => { + const provider = new ErrorOnFirstProvider(assistantResponse('Turn 2 response')); + + const evalCase: EvalTest = { + id: 'conv-provider-error', + question: 'error test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + turns: [ + { input: 'T1' }, + { input: 'T2' }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + // Turn 1 should score 0 + const turn1Score = result.scores?.find((s) => s.name === 'turn-1'); + expect(turn1Score?.score).toBe(0); + + // Turn 2 should still run (continue is default) + const turn2Score = result.scores?.find((s) => s.name === 'turn-2'); + expect(turn2Score).toBeDefined(); + expect(turn2Score?.score).toBe(1.0); + }); + + it('output contains full conversation transcript with all user and assistant messages', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('Answer 1'), + assistantResponse('Answer 2'), + ]); + + const evalCase: EvalTest = { + id: 'conv-transcript', + question: 'transcript test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Full transcript', + mode: 'conversation', + turns: [ + { input: 'Question 1' }, + { input: 'Question 2' }, + ], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(), + now: nowFn, + }); + + // Output should have all messages from the conversation + const output = result.output ?? 
[]; + const userMessages = output.filter((m) => m.role === 'user'); + const assistantMessages = output.filter((m) => m.role === 'assistant'); + + expect(userMessages.length).toBe(2); + expect(assistantMessages.length).toBe(2); + expect(assistantMessages[0]?.content).toBe('Answer 1'); + expect(assistantMessages[1]?.content).toBe('Answer 2'); + }); + + it('no regression — non-conversation test behaves as before', async () => { + const provider = new SequenceProvider('mock', [ + assistantResponse('Standard response'), + ]); + + const evalCase: EvalTest = { + id: 'standard-test', + question: 'Standard test', + input: [{ role: 'user', content: 'Hello' }], + expected_output: [], + file_paths: [], + criteria: 'Helpful', + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: makeEvaluatorRegistry(0.8), + now: nowFn, + }); + + expect(result.score).toBeGreaterThan(0); + expect(result.executionStatus).toBe('ok'); + // Should not have turn-level scores + const hasTurnScores = result.scores?.some((s) => s.name.startsWith('turn-')); + expect(hasTurnScores).toBeFalsy(); + }); +}); + +// --------------------------------------------------------------------------- +// Validation tests +// --------------------------------------------------------------------------- + +describe('validateEvalFile — conversation mode', () => { + let tempDir: string; + + beforeAll(async () => { + tempDir = path.join(os.tmpdir(), `agentv-conv-test-${Date.now()}`); + await mkdir(tempDir, { recursive: true }); + }); + + afterAll(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('rejects turns without mode: conversation', async () => { + const filePath = path.join(tempDir, 'turns-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + turns: + - input: Turn 1 +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) 
=> e.message.includes("'turns' requires mode: conversation"))).toBe(true); + }); + + it('rejects mode: conversation without turns', async () => { + const filePath = path.join(tempDir, 'mode-no-turns.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("non-empty 'turns' array"))).toBe(true); + }); + + it('rejects mode: conversation with empty turns array', async () => { + const filePath = path.join(tempDir, 'mode-empty-turns.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: [] +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("non-empty 'turns' array"))).toBe(true); + }); + + it('rejects turns + top-level expected_output', async () => { + const filePath = path.join(tempDir, 'turns-expected-output.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: + - input: Turn 1 + expected_output: "some output" +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("'expected_output' is not allowed with mode: conversation"))).toBe(true); + }); + + it('rejects aggregation without mode: conversation', async () => { + const filePath = path.join(tempDir, 'aggregation-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + aggregation: mean +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("'aggregation' requires mode: conversation"))).toBe(true); + }); + + it('rejects on_turn_failure without mode: 
conversation', async () => { + const filePath = path.join(tempDir, 'on-turn-failure-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + on_turn_failure: stop +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("'on_turn_failure' requires mode: conversation"))).toBe(true); + }); + + it('rejects window_size without mode: conversation', async () => { + const filePath = path.join(tempDir, 'window-no-mode.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + window_size: 3 +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes("'window_size' requires mode: conversation"))).toBe(true); + }); + + it('rejects a turn missing input', async () => { + const filePath = path.join(tempDir, 'turn-missing-input.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: + - expected_output: "something" +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes('non-empty input'))).toBe(true); + }); + + it('accepts a valid conversation mode eval file', async () => { + const filePath = path.join(tempDir, 'valid-conversation.yaml'); + await writeFile( + filePath, + `tests: + - id: conv-valid + criteria: Be helpful + input: "System: you are a helpful assistant" + mode: conversation + aggregation: mean + on_turn_failure: continue + window_size: 5 + turns: + - input: "What is 2+2?" + expected_output: "4" + - input: "And 3+3?" 
+ assertions: + - "Response mentions 6" +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(true); + expect(result.errors).toHaveLength(0); + }); +}); From 8affcd016631d3fadbe62af9c13e90ca0291c146 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 12:21:41 +0000 Subject: [PATCH 4/4] fix(eval): correct conversation mode scoring, loader, and serialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - YAML loader: include `turns` in completeness gate so conversation-only cases (no top-level criteria/assertions) are not silently skipped - Orchestrator: stop falling back to evalCase.assertions per-turn — turns without own assertions score 1.0 instead of double-counting top-level - Orchestrator: pass full transcript as candidate for conversation-level grading instead of only the last assistant reply - Orchestrator: serialize structured message content with JSON.stringify instead of producing [object Object] in transcript strings - Validator: reject whitespace-only and empty-array turn inputs - Tests: add regression coverage for double-counting, transcript candidate, and whitespace input validation Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/orchestrator.ts | 104 +- .../evaluation/validation/eval-validator.ts | 11 +- packages/core/src/evaluation/yaml-parser.ts | 14 +- .../test/evaluation/conversation-mode.test.ts | 190 +- .../references/eval-schema.json | 2989 ++++------------- 5 files changed, 816 insertions(+), 2492 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 326ca44c5..7430bc029 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2959,12 +2959,26 @@ async function runConversationMode(options: { readonly availableTargets?: readonly string[]; }): Promise { const { - evalCase, provider, target, evaluators, 
typeRegistry, - graderProvider, promptInputs, nowFn, signal, - workspacePath, caseWorkspaceFile, agentTimeoutMs, - streamCallbacks, verbose, threshold, targetResolver, availableTargets, + evalCase, + provider, + target, + evaluators, + typeRegistry, + graderProvider, + promptInputs, + nowFn, + signal, + workspacePath, + caseWorkspaceFile, + agentTimeoutMs, + streamCallbacks, + verbose, + threshold, + targetResolver, + availableTargets, } = options; + // biome-ignore lint/style/noNonNullAssertion: turns is guaranteed by the caller (conversation mode gate) const turns = evalCase.turns!; const aggregation = evalCase.aggregation ?? 'mean'; const onTurnFailure = evalCase.on_turn_failure ?? 'continue'; @@ -2973,9 +2987,7 @@ async function runConversationMode(options: { // Build initial message history from evalCase.input (system prompt + any context) const history: ChatMessage[] = []; for (const msg of evalCase.input) { - const content = typeof msg.content === 'string' - ? msg.content - : JSON.stringify(msg.content); + const content = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content); history.push({ role: msg.role as ChatMessageRole, content }); } @@ -3002,9 +3014,7 @@ async function runConversationMode(options: { } // Append user message to history - const userContent = typeof turn.input === 'string' - ? turn.input - : JSON.stringify(turn.input); + const userContent = typeof turn.input === 'string' ? turn.input : JSON.stringify(turn.input); history.push({ role: 'user', content: userContent }); // Build chatPrompt for provider call (with optional window_size) @@ -3065,12 +3075,14 @@ async function runConversationMode(options: { const turnEvalCase: EvalTest = { ...evalCase, id: `${evalCase.id}/turn-${turnIndex}`, - assertions: turnAssertions.length > 0 ? turnAssertions : evalCase.assertions, + assertions: turnAssertions, input: buildTurnGraderInput(history, windowSize), expected_output: turn.expected_output - ? 
[typeof turn.expected_output === 'string' - ? { content: turn.expected_output } as JsonObject - : turn.expected_output as JsonObject] + ? [ + typeof turn.expected_output === 'string' + ? ({ content: turn.expected_output } as JsonObject) + : (turn.expected_output as JsonObject), + ] : [], // Clear conversation fields to prevent recursion mode: undefined, @@ -3120,12 +3132,10 @@ async function runConversationMode(options: { // Run conversation-level assertions (top-level assertions on full transcript) let conversationScores: EvaluatorResult[] = []; if (evalCase.assertions?.length) { - const lastAssistantContent = history.filter(m => m.role === 'assistant').pop()?.content ?? ''; - const conversationEvalCase: EvalTest = { ...evalCase, id: `${evalCase.id}/conversation`, - input: history.map(m => ({ + input: history.map((m) => ({ role: m.role as TestMessageRole, content: m.content, })), @@ -3134,11 +3144,16 @@ async function runConversationMode(options: { turns: undefined, }; - const fullTranscript = history.map(m => `${m.role}: ${m.content}`).join('\n\n'); + const fullTranscript = history + .map((m) => { + const content = typeof m.content === 'string' ? m.content : JSON.stringify(m.content); + return `${m.role}: ${content}`; + }) + .join('\n\n'); const conversationResult = await evaluateCandidate({ evalCase: conversationEvalCase, - candidate: lastAssistantContent, + candidate: fullTranscript, target, provider, evaluators, @@ -3157,32 +3172,34 @@ async function runConversationMode(options: { availableTargets, }); - conversationScores = [{ - name: 'conversation', - type: 'rubrics' as EvaluatorKind, - score: conversationResult.score, - verdict: scoreToVerdict(conversationResult.score, threshold ?? DEFAULT_THRESHOLD) as EvaluationVerdict, - assertions: conversationResult.assertions ? 
[...conversationResult.assertions] : [], - scores: conversationResult.scores, - }]; + conversationScores = [ + { + name: 'conversation', + type: 'rubrics' as EvaluatorKind, + score: conversationResult.score, + verdict: scoreToVerdict( + conversationResult.score, + threshold ?? DEFAULT_THRESHOLD, + ) as EvaluationVerdict, + assertions: conversationResult.assertions ? [...conversationResult.assertions] : [], + scores: conversationResult.scores, + }, + ]; } // Aggregate final score - const allScoreValues = [ - ...allTurnScoreValues, - ...conversationScores.map(s => s.score), - ]; + const allScoreValues = [...allTurnScoreValues, ...conversationScores.map((s) => s.score)]; const finalScore = aggregateConversationScores(allScoreValues, aggregation); const allResultScores = [...turnScores, ...conversationScores]; // Build output as full conversation transcript - const outputMessages: Message[] = history.map(m => ({ + const outputMessages: Message[] = history.map((m) => ({ role: m.role, content: m.content, })); - const flatAssertions: AssertionEntry[] = allResultScores.flatMap(s => [...s.assertions]); + const flatAssertions: AssertionEntry[] = allResultScores.flatMap((s) => [...s.assertions]); const totalDurationMs = Date.now() - caseStartMs; return { @@ -3196,7 +3213,7 @@ async function runConversationMode(options: { output: outputMessages, scores: allResultScores, executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD), - input: evalCase.input.map(m => ({ + input: evalCase.input.map((m) => ({ role: m.role, content: typeof m.content === 'string' ? 
m.content : JSON.stringify(m.content), })), @@ -3206,8 +3223,8 @@ async function runConversationMode(options: { /** Include system messages + last windowSize*2 non-system messages */ function buildWindowedHistory(history: readonly ChatMessage[], windowSize: number): ChatMessage[] { - const systemMessages = history.filter(m => m.role === 'system'); - const nonSystem = history.filter(m => m.role !== 'system'); + const systemMessages = history.filter((m) => m.role === 'system'); + const nonSystem = history.filter((m) => m.role !== 'system'); const windowed = nonSystem.slice(-windowSize * 2); return [...systemMessages, ...windowed]; } @@ -3215,13 +3232,18 @@ function buildWindowedHistory(history: readonly ChatMessage[], windowSize: numbe /** Build a text representation of the conversation for grader context */ function buildConversationContext(history: readonly ChatMessage[], windowSize?: number): string { const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history; - return msgs.map(m => `${m.role}: ${m.content}`).join('\n\n'); + return msgs + .map((m) => { + const content = typeof m.content === 'string' ? m.content : JSON.stringify(m.content); + return `${m.role}: ${content}`; + }) + .join('\n\n'); } /** Build TestMessage[] from history for synthetic EvalTest input */ function buildTurnGraderInput(history: readonly ChatMessage[], windowSize?: number): TestMessage[] { const msgs = windowSize ? 
buildWindowedHistory(history, windowSize) : history; - return msgs.map(m => ({ + return msgs.map((m) => ({ role: m.role as TestMessageRole, content: m.content, })); @@ -3268,14 +3290,16 @@ function buildTurnAssertions(turn: ConversationTurn): EvaluatorConfig[] { } /** Aggregate turn scores using the configured strategy */ -function aggregateConversationScores(scores: readonly number[], aggregation: ConversationAggregation): number { +function aggregateConversationScores( + scores: readonly number[], + aggregation: ConversationAggregation, +): number { if (scores.length === 0) return 1.0; switch (aggregation) { case 'min': return Math.min(...scores); case 'max': return Math.max(...scores); - case 'mean': default: return scores.reduce((sum, s) => sum + s, 0) / scores.length; } diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 4acad75e6..4ecc79fa4 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -833,7 +833,8 @@ function validateConversationMode( severity: 'error', filePath, location: `${location}.expected_output`, - message: "Top-level 'expected_output' is not allowed with mode: conversation (use per-turn expected_output instead)", + message: + "Top-level 'expected_output' is not allowed with mode: conversation (use per-turn expected_output instead)", }); } @@ -880,7 +881,13 @@ function validateConversationMode( }); continue; } - if (turn.input === undefined || turn.input === '') { + const turnInput = turn.input; + const isEmpty = + turnInput === undefined || + turnInput === '' || + (typeof turnInput === 'string' && turnInput.trim() === '') || + (Array.isArray(turnInput) && turnInput.length === 0); + if (isEmpty) { errors.push({ severity: 'error', filePath, diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 25887ee28..377c719c3 100644 
--- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -391,15 +391,16 @@ async function loadTestsFromYaml( // Resolve expected_output with shorthand support const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? []; - // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assertions + // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode) const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== undefined || - testCaseConfig.assert !== undefined; + testCaseConfig.assert !== undefined || + (Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0); if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { logError( - `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`, + `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`, ); continue; } @@ -530,7 +531,8 @@ async function loadTestsFromYaml( // Extract conversation mode fields const modeRaw = asString(testCaseConfig.mode); - const mode: ConversationMode | undefined = modeRaw === 'conversation' ? 'conversation' : undefined; + const mode: ConversationMode | undefined = + modeRaw === 'conversation' ? 'conversation' : undefined; const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns as readonly unknown[]) : undefined; @@ -541,9 +543,7 @@ async function loadTestsFromYaml( : undefined; const onTurnFailureRaw = asString(testCaseConfig.on_turn_failure); const onTurnFailure: TurnFailurePolicy | undefined = - onTurnFailureRaw === 'continue' || onTurnFailureRaw === 'stop' - ? 
onTurnFailureRaw - : undefined; + onTurnFailureRaw === 'continue' || onTurnFailureRaw === 'stop' ? onTurnFailureRaw : undefined; const windowSize = typeof testCaseConfig.window_size === 'number' && testCaseConfig.window_size >= 1 ? (testCaseConfig.window_size as number) diff --git a/packages/core/test/evaluation/conversation-mode.test.ts b/packages/core/test/evaluation/conversation-mode.test.ts index fde4c5c99..2eeb8eee4 100644 --- a/packages/core/test/evaluation/conversation-mode.test.ts +++ b/packages/core/test/evaluation/conversation-mode.test.ts @@ -16,7 +16,11 @@ import path from 'node:path'; import { runEvalCase } from '../../src/evaluation/orchestrator.js'; import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; -import type { Provider, ProviderRequest, ProviderResponse } from '../../src/evaluation/providers/types.js'; +import type { + Provider, + ProviderRequest, + ProviderResponse, +} from '../../src/evaluation/providers/types.js'; import type { EvalTest } from '../../src/evaluation/types.js'; import { validateEvalFile } from '../../src/evaluation/validation/eval-validator.js'; @@ -114,10 +118,7 @@ describe('runEvalCase — conversation mode', () => { file_paths: [], criteria: 'Be helpful', mode: 'conversation', - turns: [ - { input: 'Turn 1 message' }, - { input: 'Turn 2 message' }, - ], + turns: [{ input: 'Turn 1 message' }, { input: 'Turn 2 message' }], }; const result = await runEvalCase({ @@ -166,9 +167,7 @@ describe('runEvalCase — conversation mode', () => { }); it('per-turn structured assertions are evaluated', async () => { - const provider = new SequenceProvider('mock', [ - assistantResponse('42'), - ]); + const provider = new SequenceProvider('mock', [assistantResponse('42')]); const evalCase: EvalTest = { id: 'conv-struct-assertions', @@ -212,10 +211,7 @@ describe('runEvalCase — conversation mode', () => { file_paths: [], criteria: 'Consistent throughout', mode: 'conversation', - turns: [ - { input: 'Turn 1' }, - { input: 
'Turn 2' }, - ], + turns: [{ input: 'Turn 1' }, { input: 'Turn 2' }], assertions: [{ type: 'llm-grader', criteria: 'Conversation was coherent' }], }; @@ -250,11 +246,7 @@ describe('runEvalCase — conversation mode', () => { criteria: 'Anything', mode: 'conversation', aggregation: 'mean', - turns: [ - { input: 'T1' }, - { input: 'T2' }, - { input: 'T3' }, - ], + turns: [{ input: 'T1' }, { input: 'T2' }, { input: 'T3' }], }; const result = await runEvalCase({ @@ -500,11 +492,7 @@ describe('runEvalCase — conversation mode', () => { criteria: 'Anything', mode: 'conversation', window_size: 1, // keep system + last 1 user+assistant pair - turns: [ - { input: 'T1' }, - { input: 'T2' }, - { input: 'T3' }, - ], + turns: [{ input: 'T1' }, { input: 'T2' }, { input: 'T3' }], }; await runEvalCase({ @@ -540,10 +528,7 @@ describe('runEvalCase — conversation mode', () => { file_paths: [], criteria: 'Anything', mode: 'conversation', - turns: [ - { input: 'T1' }, - { input: 'T2' }, - ], + turns: [{ input: 'T1' }, { input: 'T2' }], }; const result = await runEvalCase({ @@ -578,10 +563,7 @@ describe('runEvalCase — conversation mode', () => { file_paths: [], criteria: 'Full transcript', mode: 'conversation', - turns: [ - { input: 'Question 1' }, - { input: 'Question 2' }, - ], + turns: [{ input: 'Question 1' }, { input: 'Question 2' }], }; const result = await runEvalCase({ @@ -603,11 +585,113 @@ describe('runEvalCase — conversation mode', () => { expect(assistantMessages[1]?.content).toBe('Answer 2'); }); - it('no regression — non-conversation test behaves as before', async () => { + it('top-level assertions are NOT applied per-turn — only at conversation level', async () => { + let graderCallCount = 0; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate() { + graderCallCount++; + return { + score: 0.8, + verdict: 'pass' as const, + assertions: [{ text: 'graded', passed: true }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = 
new SequenceProvider('mock', [assistantResponse('A'), assistantResponse('B')]); + + const evalCase: EvalTest = { + id: 'conv-no-double-count', + question: 'double count test', + input: [], + expected_output: [], + file_paths: [], + criteria: 'Anything', + mode: 'conversation', + turns: [ + { input: 'T1' }, // no per-turn assertions → scores 1.0 without grader + { input: 'T2' }, // no per-turn assertions → scores 1.0 without grader + ], + assertions: [{ type: 'llm-grader', criteria: 'Conversation was coherent' }], + }; + + const result = await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // Grader should be called exactly once — for the conversation-level pass only + expect(graderCallCount).toBe(1); + + // Should have 2 turn scores (1.0 each) + 1 conversation score + const turnScores = result.scores?.filter((s) => s.name.startsWith('turn-')) ?? []; + const convScore = result.scores?.find((s) => s.name === 'conversation'); + expect(turnScores).toHaveLength(2); + expect(turnScores[0]?.score).toBe(1.0); + expect(turnScores[1]?.score).toBe(1.0); + expect(convScore).toBeDefined(); + expect(convScore?.score).toBe(0.8); + }); + + it('conversation-level assertions grade the full transcript, not just last reply', async () => { + let graderCandidate = ''; + const customRegistry = { + 'llm-grader': { + kind: 'llm-grader' as const, + async evaluate(ctx: { candidate: string }) { + graderCandidate = ctx.candidate; + return { + score: 1.0, + verdict: 'pass' as const, + assertions: [{ text: 'graded', passed: true }], + expectedAspectCount: 1, + }; + }, + }, + }; + const provider = new SequenceProvider('mock', [ - assistantResponse('Standard response'), + assistantResponse('First answer'), + assistantResponse('Second answer'), ]); + const evalCase: EvalTest = { + id: 'conv-transcript-candidate', + question: 'transcript candidate test', + input: [{ role: 'system', content: 'Be helpful' }], + expected_output: [], + 
file_paths: [], + criteria: 'Anything', + mode: 'conversation', + turns: [{ input: 'Question 1' }, { input: 'Question 2' }], + assertions: [{ type: 'llm-grader', criteria: 'Full transcript is coherent' }], + }; + + await runEvalCase({ + evalCase, + provider, + target: baseTarget, + evaluators: customRegistry, + now: nowFn, + }); + + // The candidate passed to the grader should contain the full transcript, not just "Second answer" + expect(graderCandidate).toContain('First answer'); + expect(graderCandidate).toContain('Second answer'); + expect(graderCandidate).toContain('Question 1'); + expect(graderCandidate).toContain('Question 2'); + }); + + it('no regression — non-conversation test behaves as before', async () => { + const provider = new SequenceProvider('mock', [assistantResponse('Standard response')]); + const evalCase: EvalTest = { id: 'standard-test', question: 'Standard test', @@ -663,7 +747,9 @@ describe('validateEvalFile — conversation mode', () => { ); const result = await validateEvalFile(filePath); expect(result.valid).toBe(false); - expect(result.errors.some((e) => e.message.includes("'turns' requires mode: conversation"))).toBe(true); + expect( + result.errors.some((e) => e.message.includes("'turns' requires mode: conversation")), + ).toBe(true); }); it('rejects mode: conversation without turns', async () => { @@ -715,7 +801,11 @@ describe('validateEvalFile — conversation mode', () => { ); const result = await validateEvalFile(filePath); expect(result.valid).toBe(false); - expect(result.errors.some((e) => e.message.includes("'expected_output' is not allowed with mode: conversation"))).toBe(true); + expect( + result.errors.some((e) => + e.message.includes("'expected_output' is not allowed with mode: conversation"), + ), + ).toBe(true); }); it('rejects aggregation without mode: conversation', async () => { @@ -731,7 +821,9 @@ describe('validateEvalFile — conversation mode', () => { ); const result = await validateEvalFile(filePath); 
expect(result.valid).toBe(false); - expect(result.errors.some((e) => e.message.includes("'aggregation' requires mode: conversation"))).toBe(true); + expect( + result.errors.some((e) => e.message.includes("'aggregation' requires mode: conversation")), + ).toBe(true); }); it('rejects on_turn_failure without mode: conversation', async () => { @@ -747,7 +839,11 @@ describe('validateEvalFile — conversation mode', () => { ); const result = await validateEvalFile(filePath); expect(result.valid).toBe(false); - expect(result.errors.some((e) => e.message.includes("'on_turn_failure' requires mode: conversation"))).toBe(true); + expect( + result.errors.some((e) => + e.message.includes("'on_turn_failure' requires mode: conversation"), + ), + ).toBe(true); }); it('rejects window_size without mode: conversation', async () => { @@ -763,7 +859,9 @@ describe('validateEvalFile — conversation mode', () => { ); const result = await validateEvalFile(filePath); expect(result.valid).toBe(false); - expect(result.errors.some((e) => e.message.includes("'window_size' requires mode: conversation"))).toBe(true); + expect( + result.errors.some((e) => e.message.includes("'window_size' requires mode: conversation")), + ).toBe(true); }); it('rejects a turn missing input', async () => { @@ -784,6 +882,24 @@ describe('validateEvalFile — conversation mode', () => { expect(result.errors.some((e) => e.message.includes('non-empty input'))).toBe(true); }); + it('rejects a turn with whitespace-only input', async () => { + const filePath = path.join(tempDir, 'turn-whitespace-input.yaml'); + await writeFile( + filePath, + `tests: + - id: t1 + criteria: Goal + input: hello + mode: conversation + turns: + - input: " " +`, + ); + const result = await validateEvalFile(filePath); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.message.includes('non-empty input'))).toBe(true); + }); + it('accepts a valid conversation mode eval file', async () => { const filePath = path.join(tempDir, 
'valid-conversation.yaml'); await writeFile( diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index a5ae6f2a5..80dc2ebd8 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -56,12 +56,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -75,30 +70,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -136,12 +121,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -155,30 +135,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -206,12 +176,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -225,30 +190,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": 
"string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -292,10 +247,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -369,18 +321,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -417,10 +363,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -515,10 +458,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -569,17 +509,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -590,9 +525,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -655,9 +588,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -673,10 +604,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -693,10 +621,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -713,18 +638,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", 
"aggregator"], "additionalProperties": false }, { @@ -761,20 +681,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -815,12 +726,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -834,12 +740,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -850,9 +751,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -860,12 +759,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -879,12 +773,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -895,10 +784,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -935,10 +821,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -950,11 +833,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -976,26 +855,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": 
"string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -1039,10 +909,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -1086,10 +953,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -1126,10 +990,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -1144,9 +1005,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1183,10 +1042,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -1218,9 +1074,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1263,10 +1117,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1309,10 +1160,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1349,15 +1197,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1400,10 +1243,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1492,10 +1332,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", 
"outcome"], "additionalProperties": false } } @@ -1505,10 +1342,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -1552,10 +1386,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -1629,18 +1460,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -1677,10 +1502,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -1775,10 +1597,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1829,17 +1648,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1850,9 +1664,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -1915,9 +1727,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1933,10 +1743,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -1953,10 +1760,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -1973,18 +1777,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], 
"additionalProperties": false }, { @@ -2021,20 +1820,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -2075,12 +1865,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2094,12 +1879,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2110,9 +1890,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -2120,12 +1898,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2139,12 +1912,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2155,10 +1923,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -2195,10 +1960,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -2210,11 +1972,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -2236,26 +1994,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": 
"string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -2299,10 +2048,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -2346,10 +2092,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -2386,10 +2129,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -2404,9 +2144,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2443,10 +2181,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -2478,9 +2213,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2523,10 +2256,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2569,10 +2299,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2609,15 +2336,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2660,10 +2382,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2752,10 +2471,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", 
"outcome"], "additionalProperties": false } } @@ -2765,10 +2481,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -2829,10 +2542,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -2906,18 +2616,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -2954,10 +2658,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -3052,10 +2753,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -3106,17 +2804,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3127,9 +2820,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -3192,9 +2883,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3210,10 +2899,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -3230,10 +2916,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -3250,18 +2933,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], 
"additionalProperties": false }, { @@ -3298,20 +2976,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -3352,12 +3021,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3371,12 +3035,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3387,9 +3046,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -3397,12 +3054,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3416,12 +3068,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3432,10 +3079,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -3472,10 +3116,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -3487,11 +3128,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -3513,26 +3150,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": 
"string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -3576,10 +3204,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -3623,10 +3248,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -3663,10 +3285,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -3681,9 +3300,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3720,10 +3337,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -3755,9 +3369,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3800,10 +3412,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3846,10 +3455,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3886,15 +3492,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3937,10 +3538,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -4029,10 +3627,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", 
"outcome"], "additionalProperties": false } } @@ -4042,10 +3637,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -4089,10 +3681,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -4166,18 +3755,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -4214,10 +3797,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -4312,10 +3892,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -4366,17 +3943,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4387,9 +3959,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -4452,9 +4022,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4470,10 +4038,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4490,10 +4055,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -4510,18 +4072,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], 
"additionalProperties": false }, { @@ -4558,20 +4115,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -4612,12 +4160,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4631,12 +4174,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4647,9 +4185,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -4657,12 +4193,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4676,12 +4207,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4692,10 +4218,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -4732,10 +4255,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -4747,11 +4267,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -4773,26 +4289,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": 
"string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -4836,10 +4343,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4883,10 +4387,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -4923,10 +4424,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -4941,9 +4439,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4980,10 +4476,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -5015,9 +4508,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5060,10 +4551,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5106,10 +4594,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5146,15 +4631,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5197,10 +4677,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5289,10 +4766,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", 
"outcome"], "additionalProperties": false } } @@ -5302,10 +4776,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -5326,11 +4797,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -5341,9 +4808,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -5376,10 +4841,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -5403,10 +4865,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -5420,10 +4879,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -5440,10 +4896,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -5507,11 +4960,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5542,11 +4991,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5577,11 +5022,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5612,11 +5053,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5626,11 +5063,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + 
"enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -5653,9 +5086,7 @@ "minimum": 0.1 } }, - "required": [ - "image" - ], + "required": ["image"], "additionalProperties": false } }, @@ -5682,17 +5113,11 @@ }, "on_dependency_failure": { "type": "string", - "enum": [ - "skip", - "fail", - "run" - ] + "enum": ["skip", "fail", "run"] }, "mode": { "type": "string", - "enum": [ - "conversation" - ] + "enum": ["conversation"] }, "turns": { "type": "array", @@ -5716,20 +5141,13 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } @@ -5754,20 +5172,13 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } @@ -5818,10 +5229,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -5895,18 +5303,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -5943,10 +5345,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -6041,10 +5440,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -6095,17 +5491,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - 
], + "required": ["type"], "additionalProperties": false }, { @@ -6116,9 +5507,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -6181,9 +5570,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6199,10 +5586,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6219,10 +5603,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -6239,18 +5620,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -6287,10 +5663,7 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", @@ -6341,12 +5714,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6360,12 +5728,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6376,9 +5739,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -6386,12 +5747,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6405,12 +5761,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6421,10 +5772,7 @@ ] } }, - 
"required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -6461,10 +5809,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -6476,11 +5821,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -6502,26 +5843,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -6565,10 +5897,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6612,10 +5941,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -6652,10 +5978,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -6670,9 +5993,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6709,10 +6030,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -6744,9 +6062,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6789,10 +6105,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], 
"additionalProperties": false }, { @@ -6835,10 +6148,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6875,15 +6185,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6926,10 +6231,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -7018,10 +6320,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -7031,10 +6330,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -7043,36 +6339,25 @@ } } }, - "required": [ - "input" - ], + "required": ["input"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "mean", - "min", - "max" - ] + "enum": ["mean", "min", "max"] }, "on_turn_failure": { "type": "string", - "enum": [ - "continue", - "stop" - ] + "enum": ["continue", "stop"] }, "window_size": { "type": "integer", "minimum": 1 } }, - "required": [ - "id" - ], + "required": ["id"], "additionalProperties": false } }, @@ -7107,12 +6392,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -7126,30 +6406,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], 
"additionalProperties": false } } @@ -7177,12 +6447,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -7196,30 +6461,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -7263,10 +6518,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -7340,18 +6592,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -7388,10 +6634,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -7486,10 +6729,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -7540,17 +6780,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7561,9 +6796,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -7626,9 +6859,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7644,10 +6875,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + 
"required": ["type", "threshold"], "additionalProperties": false }, { @@ -7664,10 +6892,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -7684,18 +6909,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -7732,20 +6952,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -7786,12 +6997,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7805,12 +7011,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7821,9 +7022,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -7831,12 +7030,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7850,12 +7044,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7866,10 +7055,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -7906,10 +7092,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": 
["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -7921,11 +7104,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -7947,26 +7126,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -8010,10 +7180,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -8057,10 +7224,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -8097,10 +7261,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -8115,9 +7276,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8154,10 +7313,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -8189,9 +7345,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8234,10 +7388,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8280,10 +7431,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8320,15 +7468,10 @@ }, "type": { "type": "string", 
- "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8371,10 +7514,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8463,10 +7603,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8476,10 +7613,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -8523,10 +7657,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -8600,18 +7731,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -8648,10 +7773,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -8746,10 +7868,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8800,17 +7919,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8821,9 +7935,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -8886,9 +7998,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8904,10 +8014,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + 
"required": ["type", "threshold"], "additionalProperties": false }, { @@ -8924,10 +8031,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -8944,18 +8048,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -8992,20 +8091,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -9046,12 +8136,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9065,12 +8150,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9081,9 +8161,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -9091,12 +8169,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9110,12 +8183,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9126,10 +8194,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -9166,10 +8231,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": 
["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -9181,11 +8243,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -9207,26 +8265,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -9270,10 +8319,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -9317,10 +8363,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -9357,10 +8400,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -9375,9 +8415,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9414,10 +8452,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -9449,9 +8484,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9494,10 +8527,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9540,10 +8570,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9580,15 +8607,10 @@ }, "type": { "type": "string", 
- "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9631,10 +8653,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9723,10 +8742,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -9736,10 +8752,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -9800,10 +8813,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -9877,18 +8887,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -9925,10 +8929,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -10023,10 +9024,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -10077,17 +9075,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10098,9 +9091,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -10163,9 +9154,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10181,10 +9170,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + 
"required": ["type", "threshold"], "additionalProperties": false }, { @@ -10201,10 +9187,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -10221,18 +9204,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -10269,20 +9247,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -10323,12 +9292,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10342,12 +9306,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10358,9 +9317,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -10368,12 +9325,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10387,12 +9339,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10403,10 +9350,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -10443,10 +9387,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + 
"enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -10458,11 +9399,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -10484,26 +9421,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -10547,10 +9475,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -10594,10 +9519,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -10634,10 +9556,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -10652,9 +9571,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10691,10 +9608,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -10726,9 +9640,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10771,10 +9683,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10817,10 +9726,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10857,15 +9763,10 @@ }, "type": { 
"type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10908,10 +9809,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -11000,10 +9898,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11013,10 +9908,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -11060,10 +9952,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -11137,18 +10026,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -11185,10 +10068,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -11283,10 +10163,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11337,17 +10214,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11358,9 +10230,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -11423,9 +10293,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11441,10 +10309,7 @@ "maximum": 1 } }, - "required": [ - 
"type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -11461,10 +10326,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -11481,18 +10343,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -11529,20 +10386,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -11583,12 +10431,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11602,12 +10445,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11618,9 +10456,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -11628,12 +10464,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11647,12 +10478,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11663,10 +10489,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -11703,10 +10526,7 @@ }, "type": { "type": "string", - "enum": [ - 
"field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -11718,11 +10538,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -11744,26 +10560,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -11807,10 +10614,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -11854,10 +10658,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -11894,10 +10695,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -11912,9 +10710,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11951,10 +10747,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -11986,9 +10779,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12031,10 +10822,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12077,10 +10865,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], 
"additionalProperties": false }, { @@ -12117,15 +10902,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12168,10 +10948,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12260,10 +11037,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12273,10 +11047,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -12297,11 +11068,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -12312,9 +11079,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -12347,10 +11112,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -12374,10 +11136,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -12391,10 +11150,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -12411,10 +11167,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -12478,11 +11231,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -12513,11 +11262,7 @@ }, "reset": { "type": "string", - "enum": [ - 
"none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -12548,11 +11293,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -12583,11 +11324,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -12597,11 +11334,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -12624,9 +11357,7 @@ "minimum": 0.1 } }, - "required": [ - "image" - ], + "required": ["image"], "additionalProperties": false } }, @@ -12653,17 +11384,11 @@ }, "on_dependency_failure": { "type": "string", - "enum": [ - "skip", - "fail", - "run" - ] + "enum": ["skip", "fail", "run"] }, "mode": { "type": "string", - "enum": [ - "conversation" - ] + "enum": ["conversation"] }, "turns": { "type": "array", @@ -12687,20 +11412,13 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } @@ -12725,20 +11443,13 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } @@ -12789,10 +11500,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -12866,18 +11574,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + 
"required": ["type", "command"], "additionalProperties": false }, { @@ -12914,10 +11616,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -13012,10 +11711,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -13066,17 +11762,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13087,9 +11778,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -13152,9 +11841,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13170,10 +11857,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -13190,10 +11874,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -13210,18 +11891,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -13258,10 +11934,7 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", @@ -13312,12 +11985,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13331,12 +11999,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": 
["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13347,9 +12010,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -13357,12 +12018,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13376,12 +12032,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13392,10 +12043,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -13432,10 +12080,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -13447,11 +12092,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -13473,26 +12114,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -13536,10 +12168,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -13583,10 +12212,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -13623,10 +12249,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { 
"type": "number", @@ -13641,9 +12264,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13680,10 +12301,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -13715,9 +12333,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13760,10 +12376,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13806,10 +12419,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13846,15 +12456,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13897,10 +12502,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13989,10 +12591,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -14002,10 +12601,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -14014,36 +12610,25 @@ } } }, - "required": [ - "input" - ], + "required": ["input"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "mean", - "min", - "max" - ] + "enum": ["mean", "min", "max"] }, "on_turn_failure": { "type": "string", - "enum": [ - "continue", - "stop" - ] + "enum": ["continue", "stop"] }, "window_size": { "type": "integer", "minimum": 1 } }, - "required": [ - "id" - ], + "required": ["id"], 
"additionalProperties": false } }, @@ -14110,10 +12695,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -14187,18 +12769,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -14235,10 +12811,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -14333,10 +12906,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -14387,17 +12957,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14408,9 +12973,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -14473,9 +13036,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14491,10 +13052,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14511,10 +13069,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -14531,18 +13086,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -14579,20 +13129,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + 
"enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -14633,12 +13174,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14652,12 +13188,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14668,9 +13199,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -14678,12 +13207,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14697,12 +13221,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14713,10 +13232,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -14753,10 +13269,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -14768,11 +13281,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -14794,26 +13303,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - 
"required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -14857,10 +13357,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14904,10 +13401,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -14944,10 +13438,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -14962,9 +13453,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15001,10 +13490,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -15036,9 +13522,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15081,10 +13565,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -15127,10 +13608,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -15167,15 +13645,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15218,10 +13691,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -15310,10 +13780,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -15323,10 +13790,7 @@ "minItems": 1 } }, - "required": 
[ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -15370,10 +13834,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -15447,18 +13908,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -15495,10 +13950,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -15593,10 +14045,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -15647,17 +14096,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15668,9 +14112,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -15733,9 +14175,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15751,10 +14191,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -15771,10 +14208,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -15791,18 +14225,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -15839,20 +14268,11 @@ }, "type": { "type": "string", 
- "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -15893,12 +14313,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15912,12 +14327,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15928,9 +14338,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -15938,12 +14346,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15957,12 +14360,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15973,10 +14371,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -16013,10 +14408,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -16028,11 +14420,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -16054,26 +14442,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + 
"enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -16117,10 +14496,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -16164,10 +14540,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -16204,10 +14577,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -16222,9 +14592,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16261,10 +14629,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -16296,9 +14661,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16341,10 +14704,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16387,10 +14747,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16427,15 +14784,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16478,10 +14830,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16570,10 +14919,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ 
-16583,10 +14929,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -16607,11 +14950,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -16622,9 +14961,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -16687,10 +15024,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -16764,18 +15098,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -16812,10 +15140,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -16910,10 +15235,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -16964,17 +15286,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16985,9 +15302,7 @@ "minLength": 1 } }, - "required": [ - "include" - ], + "required": ["include"], "additionalProperties": false }, { @@ -17050,9 +15365,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17068,10 +15381,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -17088,10 +15398,7 @@ "type": "string" 
} }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -17108,18 +15415,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -17156,20 +15458,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -17210,12 +15503,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17229,12 +15517,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17245,9 +15528,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -17255,12 +15536,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17274,12 +15550,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17290,10 +15561,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -17330,10 +15598,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -17345,11 +15610,7 @@ 
}, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -17371,26 +15632,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -17434,10 +15686,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -17481,10 +15730,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -17521,10 +15767,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -17539,9 +15782,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17578,10 +15819,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -17613,9 +15851,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17658,10 +15894,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17704,10 +15937,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17744,15 +15974,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", 
"is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17795,10 +16020,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17887,10 +16109,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -17900,10 +16119,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -17932,10 +16148,7 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } }, @@ -17949,10 +16162,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -17976,10 +16186,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -17993,10 +16200,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -18013,10 +16217,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -18080,11 +16281,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -18115,11 +16312,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -18150,11 +16343,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -18185,11 +16374,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] 
+ "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -18199,11 +16384,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -18226,9 +16407,7 @@ "minimum": 0.1 } }, - "required": [ - "image" - ], + "required": ["image"], "additionalProperties": false } }, @@ -18240,9 +16419,7 @@ ] } }, - "required": [ - "tests" - ], + "required": ["tests"], "additionalProperties": false } }