diff --git a/docs/plans/1052-conversation-mode.md b/docs/plans/1052-conversation-mode.md
new file mode 100644
index 000000000..2c50ab9e3
--- /dev/null
+++ b/docs/plans/1052-conversation-mode.md
@@ -0,0 +1,55 @@
+# Issue #1052: Multi-turn Conversational Test Case — Live Turn-by-Turn Evaluation
+
+## Problem
+
+Today, multi-turn evals script all intermediate assistant responses in `input` — the LLM only generates the last response. This means conversation context retention, progressive reasoning, and turn-by-turn quality cannot be measured independently.
+
+## Solution
+
+Add `mode: conversation` with a `turns` array that drives turn-by-turn LLM evaluation with per-turn and conversation-level grading.
+
+### New Schema Fields
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `mode` | `'conversation'` | - | Enables conversation evaluation mode |
+| `turns` | `ConversationTurn[]` | - | Ordered user messages; each generates an LLM call |
+| `aggregation` | `'mean' \| 'min' \| 'max'` | `'mean'` | How turn scores combine into final score |
+| `on_turn_failure` | `'continue' \| 'stop'` | `'continue'` | What to do when a turn's assertions fail |
+| `window_size` | `number` | all turns | Number of most recent turns (user + assistant pairs) passed to graders as context; system messages are always included |
+
+### How It Works
+
+1. `input` provides system prompt and initial context (same as today)
+2. For each entry in `turns`:
+   a. Append the user message to accumulated history
+   b. Call the provider with full history — LLM generates assistant response
+   c. Grade the response against turn's `assertions` and `expected_output`
+   d. Append actual LLM response (not expected_output) to history
+3. After all turns: run top-level `assertions` over full transcript
+4. Final score = `aggregation` applied to all per-turn scores plus the conversation-level score (when top-level `assertions` are present)
+
+### Validation Rules
+
+- `turns` requires `mode: conversation`
+- `mode: conversation` requires `turns`
+- `turns` incompatible with top-level `expected_output`
+- `aggregation`, `on_turn_failure`, and `window_size` are only valid with `mode: conversation`
+- Each turn must have non-empty `input`
+
+### Files Modified
+
+| File | Change |
+|------|--------|
+| `packages/core/src/evaluation/types.ts` | ConversationTurn, mode, turns, etc. on EvalTest |
+| `packages/core/src/evaluation/validation/eval-file.schema.ts` | Zod schema for new fields |
+| `packages/core/src/evaluation/yaml-parser.ts` | Parse conversation fields |
+| `packages/core/src/evaluation/orchestrator.ts` | Conversation runner in runEvalCase |
+| `packages/core/test/evaluation/conversation-mode.test.ts` | Unit tests |
+| `examples/features/multi-turn-conversation-live/` | UAT example |
+
+## References
+
+- Issue: #1052
+- Research: agentevals-research PR #57
+- Prior art: #505 / PR #507 (scripted multi-turn), #331 / PR #1051 (depends_on)
diff --git a/examples/features/multi-turn-conversation-live/README.md b/examples/features/multi-turn-conversation-live/README.md
new file mode 100644
index 000000000..db0a9bd28
--- /dev/null
+++ b/examples/features/multi-turn-conversation-live/README.md
@@ -0,0 +1,22 @@
+# Multi-Turn Conversation (Live)
+
+This example demonstrates **live turn-by-turn conversation evaluation**, where the LLM generates every assistant response (unlike `multi-turn-conversation/`, which scripts the intermediate turns).
+
+## Features Shown
+
+- `mode: conversation` — enables live turn-by-turn evaluation
+- `turns[]` — each entry is a user message that generates an LLM call
+- Per-turn `assertions` — string shorthand (rubric) and structured evaluators
+- `aggregation: mean | min | max` — how turn scores combine
+- `on_turn_failure: stop | continue` — behavior on assertion failure
+- Top-level `assertions` — conversation-level grading after all turns
+
+## Running
+
+```bash
+# With default target
+bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml
+
+# With specific test
+bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml --test-id context-retention
+```
diff --git a/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml
new file mode 100644
index 000000000..831f6597c
--- /dev/null
+++ b/examples/features/multi-turn-conversation-live/evals/dataset.eval.yaml
@@ -0,0 +1,105 @@
+# Multi-turn conversation evaluation (live turn-by-turn)
+# Each turn generates a fresh LLM call; per-turn assertions grade each response.
+# This is different from multi-turn-conversation/ which scripts intermediate turns.
+
+description: Live multi-turn conversation evaluation with per-turn grading
+
+execution:
+  target: llm
+
+tests:
+  # Test 1: Basic context retention across turns
+  - id: context-retention
+    mode: conversation
+    criteria: Agent maintains context and provides relevant responses across turns
+    aggregation: mean
+    input:
+      - role: system
+        content: |-
+          You are a helpful math tutor. Be concise and accurate.
+          Always show your work step by step.
+    turns:
+      - input: What is 15% of 200?
+        assertions:
+          - Correctly calculates 15% of 200 as 30
+          - Shows the calculation steps
+      - input: Now double that result.
+        assertions:
+          - References the previous answer of 30
+          - Correctly calculates double as 60
+      - input: What were the original numbers I asked about?
+        assertions:
+          - Recalls that the user asked about 15% and 200
+          - Demonstrates memory of the conversation context
+
+  # Test 2: With aggregation: min (weakest-link scoring)
+  - id: weakest-link-scoring
+    mode: conversation
+    criteria: Agent provides accurate, well-structured responses
+    aggregation: min
+    input:
+      - role: system
+        content: You are a concise geography expert. Answer in 1-2 sentences.
+    turns:
+      - input: What is the capital of France?
+        assertions:
+          - Correctly identifies Paris as the capital of France
+      - input: What country is it in?
+        assertions:
+          - Recognizes the question refers to Paris from the previous turn
+          - Confirms Paris is in France
+
+  # Test 3: With on_turn_failure: stop (once a turn fails, remaining turns are skipped and scored 0)
+  - id: stop-on-failure
+    mode: conversation
+    on_turn_failure: stop
+    criteria: Agent follows instructions precisely
+    input:
+      - role: system
+        content: You are a helpful assistant. Be precise and accurate.
+    turns:
+      - input: What is 2 + 2?
+        assertions:
+          - Answers with 4
+      - input: Multiply that by 3.
+        assertions:
+          - References the previous answer
+          - Calculates 12 correctly
+
+  # Test 4: Mixed string and structured assertions
+  - id: mixed-assertions
+    mode: conversation
+    criteria: Agent writes correct, well-formed Python code
+    input:
+      - role: system
+        content: You are a helpful coding assistant.
+    turns:
+      - input: Write a Python function that adds two numbers.
+        assertions:
+          - Contains a Python function definition
+          - type: contains
+            value: def
+      - input: Now add type hints to the function.
+        assertions:
+          - Includes type hints (int, float, or similar)
+          - type: contains
+            value: "->"
+
+  # Test 5: Conversation-level assertions
+  - id: conversation-coherence
+    mode: conversation
+    criteria: Agent maintains a coherent, helpful conversation
+    input:
+      - role: system
+        content: You are a helpful travel advisor. Be concise.
+    turns:
+      - input: I want to visit somewhere warm in December.
+        assertions:
+          - Suggests at least one warm destination
+      - input: I prefer beaches over cities.
+        assertions:
+          - Adjusts recommendations toward beach destinations
+          - Does not suggest purely urban destinations
+    assertions:
+      - Agent maintains consistency — later suggestions align with earlier preferences
+      - Agent does not contradict its own prior recommendations
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index ebedc1e00..7430bc029 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -24,6 +24,8 @@ import {
   resolveTargetDefinition,
 } from './providers/targets.js';
 import type {
+  ChatMessage,
+  ChatMessageRole,
   EnvLookup,
   Message,
   Provider,
@@ -47,6 +49,8 @@ import {
 import { aggregateTrials } from './trials.js';
 import type {
   AssertionEntry,
+  ConversationAggregation,
+  ConversationTurn,
   DependencyResult,
   EvalTest,
   EvaluationResult,
@@ -60,6 +64,8 @@ import type {
   JsonObject,
   JsonValue,
   LlmGraderEvaluatorConfig,
+  TestMessage,
+  TestMessageRole,
   TrialResult,
   TrialsConfig,
   WorkspaceHookConfig,
@@ -1889,6 +1895,42 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
     }
   }
 
+  // Conversation mode: turn-by-turn evaluation
+  if (evalCase.mode === 'conversation' && evalCase.turns?.length) {
+    const conversationResult = await runConversationMode({
+      evalCase,
+      provider,
+      target,
+      evaluators,
+      typeRegistry,
+      graderProvider,
+      promptInputs,
+      nowFn,
+      signal,
+      workspacePath,
+      caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
+      agentTimeoutMs,
+      streamCallbacks: options.streamCallbacks,
+      verbose,
+      threshold: evalCase.threshold ?? caseThreshold,
+      targetResolver,
+      availableTargets,
+    });
+
+    // Cleanup workspace (same logic as standard path)
+    if (workspacePath && !isSharedWorkspace) {
+      const shouldRetain =
+        conversationResult.executionStatus === 'ok'
+          ? retainOnSuccess === 'keep' || keepWorkspaces
+          : retainOnFailure === 'keep' || (!forceCleanup && !keepWorkspaces);
+      if (!shouldRetain) {
+        await cleanupWorkspace(workspacePath).catch(() => {});
+      }
+    }
+
+    return conversationResult;
+  }
+
   const caseStartMs = Date.now();
   const attemptBudget = (maxRetries ?? 0) + 1;
   let attempt = 0;
@@ -2887,6 +2929,382 @@ function buildEvaluatorRegistry(
   };
 }
 
+// ---------------------------------------------------------------------------
+// Conversation mode: turn-by-turn evaluation
+// ---------------------------------------------------------------------------
+
+/**
+ * Runs one eval case in conversation mode and returns a single aggregated EvaluationResult.
+ * Per turn: append the user message → invoke the provider with the full history → append the
+ * actual reply → grade it (a turn with no assertions/expected_output scores 1.0; a provider
+ * error scores 0). Top-level assertions then grade the whole transcript; all scores are aggregated.
+ */
+async function runConversationMode(options: {
+  readonly evalCase: EvalTest;
+  readonly provider: Provider;
+  readonly target: ResolvedTarget;
+  readonly evaluators: Partial<Record<string, Evaluator>> & { readonly 'llm-grader': Evaluator };
+  readonly typeRegistry: import('./registry/evaluator-registry.js').EvaluatorRegistry;
+  readonly graderProvider?: Provider;
+  readonly promptInputs: PromptInputs;
+  readonly nowFn: () => Date;
+  readonly signal?: AbortSignal;
+  readonly workspacePath?: string;
+  readonly caseWorkspaceFile?: string;
+  readonly agentTimeoutMs?: number;
+  readonly streamCallbacks?: ProviderStreamCallbacks;
+  readonly verbose?: boolean;
+  readonly threshold?: number;
+  readonly targetResolver?: (name: string) => Provider | undefined;
+  readonly availableTargets?: readonly string[];
+}): Promise<EvaluationResult> {
+  const {
+    evalCase,
+    provider,
+    target,
+    evaluators,
+    typeRegistry,
+    graderProvider,
+    promptInputs,
+    nowFn,
+    signal,
+    workspacePath,
+    caseWorkspaceFile,
+    agentTimeoutMs,
+    streamCallbacks,
+    verbose,
+    threshold,
+    targetResolver,
+    availableTargets,
+  } = options;
+
+  // biome-ignore lint/style/noNonNullAssertion: turns is guaranteed by the caller (conversation mode gate)
+  const turns = evalCase.turns!;
+  const aggregation = evalCase.aggregation ?? 'mean';
+  const onTurnFailure = evalCase.on_turn_failure ?? 'continue';
+  const windowSize = evalCase.window_size;
+
+  // Seed the history from evalCase.input (system prompt + any context); non-string content is stringified
+  const history: ChatMessage[] = [];
+  for (const msg of evalCase.input) {
+    const content = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content);
+    history.push({ role: msg.role as ChatMessageRole, content });
+  }
+
+  const turnScores: EvaluatorResult[] = [];
+  const allTurnScoreValues: number[] = [];
+  let stopped = false;
+  const caseStartMs = Date.now();
+
+  for (let i = 0; i < turns.length; i++) {
+    const turn = turns[i];
+    const turnIndex = i + 1;
+
+    if (stopped) {
+      // Turn skipped due to on_turn_failure: stop — recorded as score 0 so it still counts in aggregation
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: 'rubrics' as EvaluatorKind,
+        score: 0,
+        verdict: 'skip' as EvaluationVerdict,
+        assertions: [{ text: 'Skipped due to previous turn failure', passed: false }],
+      });
+      allTurnScoreValues.push(0);
+      continue;
+    }
+
+    // Append user message to history
+    const userContent = typeof turn.input === 'string' ? turn.input : JSON.stringify(turn.input);
+    history.push({ role: 'user', content: userContent });
+
+    // Always send the target the full history: window_size is documented as limiting grader
+    // context only, and slicing an even-sized window here (history currently ends on a user
+    // message) could hand the provider a prompt whose first non-system message is an assistant reply.
+    const chatPromptForProvider = [...history];
+
+    // Call provider with accumulated history
+    let response: ProviderResponse;
+    try {
+      response = await provider.invoke({
+        question: userContent,
+        chatPrompt: chatPromptForProvider,
+        evalCaseId: `${evalCase.id}/turn-${turnIndex}`,
+        signal,
+        cwd: workspacePath,
+        workspaceFile: caseWorkspaceFile,
+        streamCallbacks,
+      });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: 'rubrics' as EvaluatorKind,
+        score: 0,
+        verdict: 'fail' as EvaluationVerdict,
+        assertions: [{ text: `Provider error: ${message}`, passed: false }],
+      });
+      allTurnScoreValues.push(0);
+      if (onTurnFailure === 'stop') stopped = true;
+      continue;
+    }
+
+    // Extract assistant response
+    const assistantContent = extractLastAssistantContent(response.output);
+
+    // Append actual LLM response (NOT expected_output) to history
+    history.push({ role: 'assistant', content: assistantContent });
+
+    // Grade this turn
+    if (!turn.assertions?.length && !turn.expected_output) {
+      // No assertions or expected_output — turn scores 1.0
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: 'rubrics' as EvaluatorKind,
+        score: 1.0,
+        verdict: 'pass' as EvaluationVerdict,
+        assertions: [],
+      });
+      allTurnScoreValues.push(1.0);
+      continue;
+    }
+
+    // Build assertions for this turn
+    const turnAssertions = buildTurnAssertions(turn);
+
+    // Create a synthetic EvalTest for this turn's grading
+    const turnEvalCase: EvalTest = {
+      ...evalCase,
+      id: `${evalCase.id}/turn-${turnIndex}`,
+      assertions: turnAssertions,
+      input: buildTurnGraderInput(history, windowSize),
+      expected_output: turn.expected_output
+        ? [
+            typeof turn.expected_output === 'string'
+              ? ({ content: turn.expected_output } as JsonObject)
+              : (turn.expected_output as JsonObject),
+          ]
+        : [],
+      // Clear conversation fields to prevent recursion
+      mode: undefined,
+      turns: undefined,
+    };
+
+    const turnResult = await evaluateCandidate({
+      evalCase: turnEvalCase,
+      candidate: assistantContent,
+      target,
+      provider,
+      evaluators,
+      typeRegistry,
+      promptInputs: {
+        question: buildConversationContext(history, windowSize),
+        chatPrompt: windowSize ? buildWindowedHistory(history, windowSize) : [...history],
+      },
+      nowFn,
+      attempt: 0,
+      graderProvider,
+      agentTimeoutMs,
+      output: response.output,
+      verbose,
+      threshold,
+      targetResolver,
+      availableTargets,
+    });
+
+    const turnScore = turnResult.score;
+    allTurnScoreValues.push(turnScore);
+
+    turnScores.push({
+      name: `turn-${turnIndex}`,
+      type: 'rubrics' as EvaluatorKind,
+      score: turnScore,
+      verdict: scoreToVerdict(turnScore, threshold ?? DEFAULT_THRESHOLD) as EvaluationVerdict,
+      assertions: turnResult.assertions ? [...turnResult.assertions] : [],
+      scores: turnResult.scores,
+    });
+
+    // Check if we should stop on failure
+    if (onTurnFailure === 'stop' && turnScore < (threshold ?? DEFAULT_THRESHOLD)) {
+      stopped = true;
+    }
+  }
+
+  // Run conversation-level assertions (top-level assertions on full, un-windowed transcript)
+  let conversationScores: EvaluatorResult[] = [];
+  if (evalCase.assertions?.length) {
+    const conversationEvalCase: EvalTest = {
+      ...evalCase,
+      id: `${evalCase.id}/conversation`,
+      input: history.map((m) => ({
+        role: m.role as TestMessageRole,
+        content: m.content,
+      })),
+      expected_output: [],
+      mode: undefined,
+      turns: undefined,
+    };
+
+    const fullTranscript = history
+      .map((m) => {
+        const content = typeof m.content === 'string' ? m.content : JSON.stringify(m.content);
+        return `${m.role}: ${content}`;
+      })
+      .join('\n\n');
+
+    const conversationResult = await evaluateCandidate({
+      evalCase: conversationEvalCase,
+      candidate: fullTranscript,
+      target,
+      provider,
+      evaluators,
+      typeRegistry,
+      promptInputs: {
+        question: fullTranscript,
+        chatPrompt: [...history],
+      },
+      nowFn,
+      attempt: 0,
+      graderProvider,
+      agentTimeoutMs,
+      verbose,
+      threshold,
+      targetResolver,
+      availableTargets,
+    });
+
+    conversationScores = [
+      {
+        name: 'conversation',
+        type: 'rubrics' as EvaluatorKind,
+        score: conversationResult.score,
+        verdict: scoreToVerdict(
+          conversationResult.score,
+          threshold ?? DEFAULT_THRESHOLD,
+        ) as EvaluationVerdict,
+        assertions: conversationResult.assertions ? [...conversationResult.assertions] : [],
+        scores: conversationResult.scores,
+      },
+    ];
+  }
+
+  // Aggregate final score: every turn score plus the conversation-level score (if any)
+  const allScoreValues = [...allTurnScoreValues, ...conversationScores.map((s) => s.score)];
+
+  const finalScore = aggregateConversationScores(allScoreValues, aggregation);
+  const allResultScores = [...turnScores, ...conversationScores];
+
+  // Build output as full conversation transcript
+  const outputMessages: Message[] = history.map((m) => ({
+    role: m.role,
+    content: m.content,
+  }));
+
+  const flatAssertions: AssertionEntry[] = allResultScores.flatMap((s) => [...s.assertions]);
+  const totalDurationMs = Date.now() - caseStartMs;
+
+  return {
+    timestamp: nowFn().toISOString(),
+    testId: evalCase.id,
+    suite: evalCase.suite,
+    category: evalCase.category,
+    score: finalScore,
+    assertions: flatAssertions,
+    target: target.name,
+    output: outputMessages,
+    scores: allResultScores,
+    executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD),
+    input: evalCase.input.map((m) => ({
+      role: m.role,
+      content: typeof m.content === 'string' ? m.content : JSON.stringify(m.content),
+    })),
+    evalRun: { durationMs: totalDurationMs },
+  };
+}
+
+/** Keeps every system message, followed by the trailing `windowSize * 2` non-system messages. */
+function buildWindowedHistory(history: readonly ChatMessage[], windowSize: number): ChatMessage[] {
+  const isSystem = (msg: ChatMessage): boolean => msg.role === 'system';
+  const pinned = history.filter(isSystem);
+  const recent = history.filter((msg) => !isSystem(msg)).slice(-windowSize * 2);
+  return pinned.concat(recent);
+}
+
+/** Renders the (optionally windowed) history as "role: content" paragraphs for the grader. */
+function buildConversationContext(history: readonly ChatMessage[], windowSize?: number): string {
+  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
+  const paragraphs: string[] = [];
+  for (const entry of visible) {
+    const body = typeof entry.content === 'string' ? entry.content : JSON.stringify(entry.content);
+    paragraphs.push(`${entry.role}: ${body}`);
+  }
+  return paragraphs.join('\n\n');
+}
+
+/** Converts the (optionally windowed) history into TestMessage[] for a synthetic EvalTest input. */
+function buildTurnGraderInput(history: readonly ChatMessage[], windowSize?: number): TestMessage[] {
+  const toTestMessage = ({ role, content }: ChatMessage): TestMessage => ({
+    role: role as TestMessageRole,
+    content,
+  });
+  return (windowSize ? buildWindowedHistory(history, windowSize) : history).map(toTestMessage);
+}
+
+/**
+ * Translate a turn's assertions into EvaluatorConfig[].
+ *
+ * Plain-string entries are collected into one rubric-backed llm-grader evaluator, which is
+ * emitted first; object entries are forwarded untouched, in their original order.
+ */
+function buildTurnAssertions(turn: ConversationTurn): EvaluatorConfig[] {
+  const entries = turn.assertions ?? [];
+  if (entries.length === 0) {
+    return [];
+  }
+
+  // Partition: free-text criteria vs. ready-made evaluator configs.
+  const rubricTexts = entries.filter((entry): entry is string => typeof entry === 'string');
+  const passthrough = entries.filter(
+    (entry): entry is EvaluatorConfig => typeof entry !== 'string',
+  );
+
+  if (rubricTexts.length === 0) {
+    return [...passthrough];
+  }
+
+  // The grouped criteria are dispatched through 'llm-grader' rather than 'rubrics':
+  // 'rubrics' is a YAML shorthand that the evaluator-parser resolves, so at runtime
+  // evaluation always goes through 'llm-grader'.
+  const rubricItems = rubricTexts.map((text, idx) => ({
+    id: `criterion-${idx + 1}`,
+    outcome: text,
+    weight: 1,
+  }));
+
+  const turnRubrics = {
+    name: 'turn-rubrics',
+    type: 'llm-grader' as EvaluatorKind,
+    rubrics: rubricItems,
+  } as unknown as EvaluatorConfig;
+
+  return [turnRubrics, ...passthrough];
+}
+
+/** Collapses the collected scores into one value; an empty list counts as a perfect 1.0. */
+function aggregateConversationScores(
+  scores: readonly number[],
+  aggregation: ConversationAggregation,
+): number {
+  if (scores.length === 0) return 1.0;
+  if (aggregation === 'min') return Math.min(...scores);
+  if (aggregation === 'max') return Math.max(...scores);
+  // 'mean' — also the fallback for any unrecognised strategy value.
+  let total = 0;
+  for (const value of scores) {
+    total += value;
+  }
+  return total / scores.length;
+}
+
 async function invokeProvider(
   provider: Provider,
   options: {
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 3b4adbaec..9a3705fbd 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -858,6 +858,41 @@ export type EvaluatorConfig =
   | RubricsEvaluatorConfig
   | InlineAssertEvaluatorConfig;
 
+/**
+ * A single turn in a multi-turn conversation evaluation: one user message plus optional grading
+ * inputs. The runner generates the assistant reply; a turn with no grading inputs scores 1.0.
+ */
+export interface ConversationTurn {
+  /** User message for this turn. Non-string content is JSON-stringified before it is sent. */
+  readonly input: TestMessageContent;
+  /** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
+  readonly expected_output?: TestMessageContent;
+  /** Per-turn assertions. Strings are grouped into one llm-grader rubric; configs run as-is. */
+  readonly assertions?: readonly (string | EvaluatorConfig)[];
+}
+
+/**
+ * Conversation evaluation mode.
+ * - undefined: standard single-response evaluation (default, backward-compatible)
+ * - 'conversation': multi-turn evaluation where the LLM generates each assistant turn (needs non-empty `turns`)
+ */
+export type ConversationMode = 'conversation';
+
+/**
+ * How per-turn scores (plus the conversation-level score, when top-level assertions exist) combine.
+ * - 'mean': arithmetic mean of all scores (default)
+ * - 'min': weakest-link scoring — final score = lowest score
+ * - 'max': best score
+ */
+export type ConversationAggregation = 'mean' | 'min' | 'max';
+
+/**
+ * Behavior when a turn fails (score below the threshold, or the provider call throws).
+ * - 'continue': run all remaining turns regardless (default)
+ * - 'stop': skip remaining turns; each skipped turn is recorded with score 0 and verdict 'skip'
+ */
+export type TurnFailurePolicy = 'continue' | 'stop';
+
 /**
  * Eval test definition sourced from AgentV specs.
  */
@@ -884,6 +919,16 @@ export interface EvalTest {
   readonly targets?: readonly string[];
   /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
   readonly threshold?: number;
+  /** Conversation evaluation mode. When 'conversation', turns[] drives turn-by-turn LLM evaluation. */
+  readonly mode?: ConversationMode;
+  /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
+  readonly turns?: readonly ConversationTurn[];
+  /** Score aggregation for conversation turns: mean (default), min (weakest-link), max */
+  readonly aggregation?: ConversationAggregation;
+  /** Behavior on turn assertion failure: continue (default) or stop */
+  readonly on_turn_failure?: TurnFailurePolicy;
+  /** Sliding window size for context passed to per-turn graders. Default: all turns. */
+  readonly window_size?: number;
   /** Test IDs this test depends on. Dependent tests wait for all dependencies to complete before running. */
   readonly depends_on?: readonly string[];
   /** What to do when a dependency fails: skip (default), fail, or run anyway. */
diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts
index 5de36a1a8..e35f46287 100644
--- a/packages/core/src/evaluation/validation/eval-file.schema.ts
+++ b/packages/core/src/evaluation/validation/eval-file.schema.ts
@@ -355,6 +355,16 @@ const ExecutionSchema = z.object({
   threshold: z.number().min(0).max(1).optional(),
 });
 
+/** Per-turn assertion: a plain string (rubric criterion shorthand) or a full evaluator config object */
+const TurnAssertionSchema = z.union([z.string(), EvaluatorSchema]);
+
+/** One conversation turn: required `input`; optional `expected_output` and `assertions`. Strings are not length-checked here. */
+const ConversationTurnSchema = z.object({
+  input: z.union([z.string(), MessageContentSchema]),
+  expected_output: z.union([z.string(), MessageContentSchema]).optional(),
+  assertions: z.array(TurnAssertionSchema).optional(),
+});
+
 // ---------------------------------------------------------------------------
 // Test case
 // ---------------------------------------------------------------------------
@@ -375,6 +385,11 @@ const EvalTestSchema = z.object({
   note: z.string().optional(),
   depends_on: z.array(z.string()).optional(),
   on_dependency_failure: z.enum(['skip', 'fail', 'run']).optional(),
+  mode: z.enum(['conversation']).optional(),
+  turns: z.array(ConversationTurnSchema).min(1).optional(),
+  aggregation: z.enum(['mean', 'min', 'max']).optional(),
+  on_turn_failure: z.enum(['continue', 'stop']).optional(),
+  window_size: z.number().int().min(1).optional(),
 });
 
 // ---------------------------------------------------------------------------
diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts
index 4506acc50..4ecc79fa4 100644
--- a/packages/core/src/evaluation/validation/eval-validator.ts
+++ b/packages/core/src/evaluation/validation/eval-validator.ts
@@ -67,6 +67,13 @@ const KNOWN_TEST_FIELDS = new Set([
   'conversation_id',
   'suite',
   'note',
+  'depends_on',
+  'on_dependency_failure',
+  'mode',
+  'turns',
+  'aggregation',
+  'on_turn_failure',
+  'window_size',
 ]);
 
 /** Name field pattern: lowercase alphanumeric with hyphens. */
@@ -328,6 +335,9 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
       validateAssertArray(assertField, location, absolutePath, errors);
     }
 
+    // Cross-field validation for conversation mode
+    validateConversationMode(evalCase, location, absolutePath, errors);
+
     await validateWorkspaceConfig(
       evalCase.workspace,
       absolutePath,
@@ -778,3 +788,113 @@ function validateContentForRoleMarkers(
     }
   }
 }
+
+/**
+ * Cross-field validation for conversation mode (mode, turns, aggregation, on_turn_failure, window_size,
+ * expected_output). Problems are pushed onto `errors`; a missing, null, or blank turn input is rejected.
+ */
+function validateConversationMode(
+  evalCase: JsonObject,
+  location: string,
+  filePath: string,
+  errors: ValidationError[],
+): void {
+  const mode = evalCase.mode;
+  const turns = evalCase.turns;
+  const aggregation = evalCase.aggregation;
+  const onTurnFailure = evalCase.on_turn_failure;
+  const windowSize = evalCase.window_size;
+
+  const isConversationMode = mode === 'conversation';
+
+  // turns present without mode: conversation
+  if (turns !== undefined && !isConversationMode) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.turns`,
+      message: "'turns' requires mode: conversation",
+    });
+  }
+
+  // mode: conversation without turns or empty turns
+  if (isConversationMode && (!Array.isArray(turns) || turns.length === 0)) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.mode`,
+      message: "mode: conversation requires a non-empty 'turns' array",
+    });
+  }
+
+  // turns + top-level expected_output
+  if (isConversationMode && Array.isArray(turns) && evalCase.expected_output !== undefined) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.expected_output`,
+      message:
+        "Top-level 'expected_output' is not allowed with mode: conversation (use per-turn expected_output instead)",
+    });
+  }
+
+  // aggregation without mode: conversation
+  if (aggregation !== undefined && !isConversationMode) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.aggregation`,
+      message: "'aggregation' requires mode: conversation",
+    });
+  }
+
+  // on_turn_failure without mode: conversation
+  if (onTurnFailure !== undefined && !isConversationMode) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.on_turn_failure`,
+      message: "'on_turn_failure' requires mode: conversation",
+    });
+  }
+
+  // window_size without mode: conversation
+  if (windowSize !== undefined && !isConversationMode) {
+    errors.push({
+      severity: 'error',
+      filePath,
+      location: `${location}.window_size`,
+      message: "'window_size' requires mode: conversation",
+    });
+  }
+
+  // Validate each turn has non-empty input (a bare YAML `input:` parses to null and must be rejected too)
+  if (isConversationMode && Array.isArray(turns)) {
+    for (let i = 0; i < turns.length; i++) {
+      const turn = turns[i];
+      if (!isObject(turn)) {
+        errors.push({
+          severity: 'error',
+          filePath,
+          location: `${location}.turns[${i}]`,
+          message: 'Turn must be an object',
+        });
+        continue;
+      }
+      const turnInput = turn.input;
+      const isEmpty =
+        turnInput === undefined ||
+        turnInput === null ||
+        (typeof turnInput === 'string' && turnInput.trim() === '') ||
+        (Array.isArray(turnInput) && turnInput.length === 0);
+      if (isEmpty) {
+        errors.push({
+          severity: 'error',
+          filePath,
+          location: `${location}.turns[${i}].input`,
+          message: 'Each turn must have a non-empty input',
+        });
+      }
+    }
+  }
+}
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index f9ce28eb3..377c719c3 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -36,13 +36,19 @@ import {
 } from './loaders/shorthand-expansion.js';
 import { parseMetadata } from './metadata.js';
 import type {
+  ConversationAggregation,
+  ConversationMode,
+  ConversationTurn,
   DockerWorkspaceConfig,
   EvalTest,
+  EvaluatorConfig,
   JsonObject,
   JsonValue,
   RepoConfig,
   TestMessage,
+  TestMessageContent,
   TrialsConfig,
+  TurnFailurePolicy,
   WorkspaceConfig,
   WorkspaceHookConfig,
   WorkspaceHooksConfig,
@@ -385,15 +391,16 @@ async function loadTestsFromYaml(
     // Resolve expected_output with shorthand support
     const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
 
-    // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assertions
+    // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode)
     const hasEvaluationSpec =
       !!outcome ||
       expectedMessages.length > 0 ||
       testCaseConfig.assertions !== undefined ||
-      testCaseConfig.assert !== undefined;
+      testCaseConfig.assert !== undefined ||
+      (Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0);
     if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
       logError(
-        `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`,
+        `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`,
       );
       continue;
     }
@@ -522,6 +529,26 @@ async function loadTestsFromYaml(
         ? (onDependencyFailureRaw as import('./types.js').DependencyFailurePolicy)
         : undefined;
 
+    // Extract conversation mode fields
+    const modeRaw = asString(testCaseConfig.mode);
+    const mode: ConversationMode | undefined =
+      modeRaw === 'conversation' ? 'conversation' : undefined;
+    const turns = Array.isArray(testCaseConfig.turns)
+      ? parseTurns(testCaseConfig.turns as readonly unknown[])
+      : undefined;
+    const aggregationRaw = asString(testCaseConfig.aggregation);
+    const aggregation: ConversationAggregation | undefined =
+      aggregationRaw === 'mean' || aggregationRaw === 'min' || aggregationRaw === 'max'
+        ? aggregationRaw
+        : undefined;
+    const onTurnFailureRaw = asString(testCaseConfig.on_turn_failure);
+    const onTurnFailure: TurnFailurePolicy | undefined =
+      onTurnFailureRaw === 'continue' || onTurnFailureRaw === 'stop' ? onTurnFailureRaw : undefined;
+    const windowSize =
+      typeof testCaseConfig.window_size === 'number' && testCaseConfig.window_size >= 1
+        ? (testCaseConfig.window_size as number)
+        : undefined;
+
     const testCase: EvalTest = {
       id,
       suite: suiteName,
@@ -540,6 +567,11 @@ async function loadTestsFromYaml(
       metadata,
       targets: caseTargets,
       ...(caseThreshold !== undefined ? { threshold: caseThreshold } : {}),
+      ...(mode ? { mode } : {}),
+      ...(turns && turns.length > 0 ? { turns } : {}),
+      ...(aggregation ? { aggregation } : {}),
+      ...(onTurnFailure ? { on_turn_failure: onTurnFailure } : {}),
+      ...(windowSize !== undefined ? { window_size: windowSize } : {}),
       ...(dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {}),
       ...(onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}),
     };
@@ -571,6 +603,35 @@ export async function loadTestById(
 /** @deprecated Use `loadTestById` instead */
 export const loadEvalCaseById = loadTestById;
 
+/**
+ * Convert raw `turns` entries from YAML into ConversationTurn objects.
+ * `expected_output` and `assertions` are copied only when present (assertions
+ * also only when non-empty); assertion entries — string shorthand or structured
+ * evaluator configs — pass through unchanged.
+ * Entries that are not objects (e.g. an empty list item, which YAML loads as
+ * null) are read as empty turns instead of throwing on property access.
+ */
+function parseTurns(rawTurns: readonly unknown[]): ConversationTurn[] {
+  return rawTurns.map((rawTurn) => {
+    // Reading `.input` off null would throw a TypeError, so fall back to {}.
+    const turn = (
+      typeof rawTurn === 'object' && rawTurn !== null ? rawTurn : {}
+    ) as Record<string, unknown>;
+    const input = turn.input as TestMessageContent;
+    const expectedOutput = turn.expected_output as TestMessageContent | undefined;
+    // Copy the list; strings and config objects themselves are kept as-is.
+    const assertions = Array.isArray(turn.assertions)
+      ? (turn.assertions as (string | EvaluatorConfig)[]).slice()
+      : undefined;
+
+    return {
+      input,
+      ...(expectedOutput !== undefined ? { expected_output: expectedOutput } : {}),
+      ...(assertions && assertions.length > 0 ? { assertions } : {}),
+    };
+  });
+}
+
 /**
  * Normalize a command value from YAML into a string array.
  * Accepts a string (split on whitespace) or an array of strings.
diff --git a/packages/core/test/evaluation/conversation-mode.test.ts b/packages/core/test/evaluation/conversation-mode.test.ts
new file mode 100644
index 000000000..2eeb8eee4
--- /dev/null
+++ b/packages/core/test/evaluation/conversation-mode.test.ts
@@ -0,0 +1,927 @@
+/**
+ * Unit tests for the multi-turn conversation mode feature.
+ *
+ * Covers:
+ * - Orchestrator: runEvalCase with mode: conversation
+ * - Validation: validateEvalFile with conversation mode fields
+ * - Score aggregation strategies (mean, min, max)
+ * - Turn failure policies (continue, stop)
+ * - Window size behaviour
+ */
+
+import { afterAll, beforeAll, describe, expect, it } from 'bun:test';
+import { mkdir, rm, writeFile } from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+
+import { runEvalCase } from '../../src/evaluation/orchestrator.js';
+import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js';
+import type {
+  Provider,
+  ProviderRequest,
+  ProviderResponse,
+} from '../../src/evaluation/providers/types.js';
+import type { EvalTest } from '../../src/evaluation/types.js';
+import { validateEvalFile } from '../../src/evaluation/validation/eval-validator.js';
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/** Mock provider that records every request and replays canned responses in order. */
+class SequenceProvider implements Provider {
+  readonly id: string;
+  readonly kind = 'mock' as const;
+  readonly targetName: string;
+  readonly requests: ProviderRequest[] = [];
+  private readonly queue: ProviderResponse[];
+  private cursor = 0;
+
+  constructor(targetName: string, responses: ProviderResponse[]) {
+    this.id = `mock:${targetName}`;
+    this.targetName = targetName;
+    this.queue = responses;
+  }
+
+  async invoke(request: ProviderRequest): Promise<ProviderResponse> {
+    this.requests.push(request);
+    if (this.cursor < this.queue.length) return this.queue[this.cursor++];
+    // Exhausted — the reported count includes this failing call.
+    throw new Error(`SequenceProvider: no more responses (called ${this.cursor + 1} times)`);
+  }
+}
+
+/** Mock provider whose first `invoke` throws; every later call returns the fallback. */
+class ErrorOnFirstProvider implements Provider {
+  readonly id = 'error-first';
+  readonly kind = 'mock' as const;
+  readonly targetName = 'error-first';
+  private hasFailed = false;
+  private readonly fallbackResponse: ProviderResponse;
+
+  constructor(fallback: ProviderResponse) {
+    this.fallbackResponse = fallback;
+  }
+
+  async invoke(): Promise<ProviderResponse> {
+    if (this.hasFailed) return this.fallbackResponse;
+    // Flip the flag before throwing so only this first call errors.
+    this.hasFailed = true;
+    throw new Error('Simulated provider error');
+  }
+}
+
+const baseTarget: ResolvedTarget = {
+  kind: 'mock',
+  name: 'mock',
+  config: { response: '{}' },
+};
+
+/** Registry with a stub `llm-grader` that always reports `score` (pass when >= 0.5). */
+function makeEvaluatorRegistry(score = 1.0) {
+  const passed = score >= 0.5;
+  const verdict = passed ? ('pass' as const) : ('fail' as const);
+  return {
+    'llm-grader': {
+      kind: 'llm-grader' as const,
+      async evaluate() {
+        const assertions = [{ text: 'graded', passed }];
+        return { score, verdict, assertions, expectedAspectCount: 1 };
+      },
+    },
+  };
+}
+
+/** Builds a provider response holding a single assistant message with `content`. */
+function assistantResponse(content: string): ProviderResponse {
+  return { output: [{ role: 'assistant', content }] };
+}
+
+const nowFn = () => new Date('2024-01-01T00:00:00Z');
+
+// ---------------------------------------------------------------------------
+// Orchestrator — conversation mode
+// ---------------------------------------------------------------------------
+
+describe('runEvalCase — conversation mode', () => {
+  it('basic 2-turn conversation with no assertions scores 1.0 and calls provider twice', async () => {
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('Hello!'),
+      assistantResponse('Goodbye!'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-basic',
+      question: 'Chat test',
+      input: [{ role: 'user', content: 'Hi' }],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Be helpful',
+      mode: 'conversation',
+      turns: [{ input: 'Turn 1 message' }, { input: 'Turn 2 message' }],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(),
+      now: nowFn,
+    });
+
+    expect(result.score).toBe(1.0);
+    expect(provider.requests).toHaveLength(2);
+    expect(result.executionStatus).toBe('ok');
+  });
+
+  it('per-turn string assertions are evaluated and affect score', async () => {
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('Paris'),
+      assistantResponse('Berlin'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-string-assertions',
+      question: 'Geography',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Correct answers',
+      mode: 'conversation',
+      turns: [
+        { input: 'Capital of France?', assertions: ['Response mentions Paris'] },
+        { input: 'Capital of Germany?', assertions: ['Response mentions Berlin'] },
+      ],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(1.0),
+      now: nowFn,
+    });
+
+    expect(result.score).toBeGreaterThan(0);
+    expect(provider.requests).toHaveLength(2);
+  });
+
+  it('per-turn structured assertions are evaluated', async () => {
+    const provider = new SequenceProvider('mock', [assistantResponse('42')]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-struct-assertions',
+      question: 'Math',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Correct',
+      mode: 'conversation',
+      turns: [
+        {
+          input: 'What is 6 * 7?',
+          assertions: [{ type: 'llm-grader', criteria: 'Answer is 42' }],
+        },
+      ],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(1.0),
+      now: nowFn,
+    });
+
+    expect(result.score).toBeGreaterThan(0);
+    expect(provider.requests).toHaveLength(1);
+  });
+
+  it('conversation-level assertions are evaluated against full transcript', async () => {
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('Yes'),
+      assistantResponse('No'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-top-level',
+      question: 'Consistency check',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Consistent throughout',
+      mode: 'conversation',
+      turns: [{ input: 'Turn 1' }, { input: 'Turn 2' }],
+      assertions: [{ type: 'llm-grader', criteria: 'Conversation was coherent' }],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(0.9),
+      now: nowFn,
+    });
+
+    // Should have per-turn scores plus a conversation-level score
+    expect(result.scores).toBeDefined();
+    const hasConversationScore = result.scores?.some((s) => s.name === 'conversation');
+    expect(hasConversationScore).toBe(true);
+  });
+
+  it('aggregation: mean — averages all turn scores', async () => {
+    // 3 turns, no per-turn assertions → each scores 1.0
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('A'),
+      assistantResponse('B'),
+      assistantResponse('C'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-mean',
+      question: 'mean test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      aggregation: 'mean',
+      turns: [{ input: 'T1' }, { input: 'T2' }, { input: 'T3' }],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(),
+      now: nowFn,
+    });
+
+    expect(result.score).toBeCloseTo(1.0, 5);
+  });
+
+  it('aggregation: min — uses lowest turn score', async () => {
+    // Use per-turn assertions so scores are driven by the grader
+    // Turn 1: grader returns 1.0, Turn 2: 0.5, Turn 3: 0.8
+    let callCount = 0;
+    const scores = [1.0, 0.5, 0.8];
+
+    const customRegistry = {
+      'llm-grader': {
+        kind: 'llm-grader' as const,
+        async evaluate() {
+          const s = scores[callCount++] ?? 1.0;
+          return {
+            score: s,
+            verdict: s >= 0.5 ? ('pass' as const) : ('fail' as const),
+            assertions: [{ text: 'graded', passed: s >= 0.5 }],
+            expectedAspectCount: 1,
+          };
+        },
+      },
+    };
+
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('A'),
+      assistantResponse('B'),
+      assistantResponse('C'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-min',
+      question: 'min test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      aggregation: 'min',
+      turns: [
+        { input: 'T1', assertions: ['Criterion A'] },
+        { input: 'T2', assertions: ['Criterion B'] },
+        { input: 'T3', assertions: ['Criterion C'] },
+      ],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: customRegistry,
+      now: nowFn,
+    });
+
+    expect(result.score).toBeCloseTo(0.5, 5);
+  });
+
+  it('aggregation: max — uses highest turn score', async () => {
+    let callCount = 0;
+    const scores = [1.0, 0.5, 0.8];
+
+    const customRegistry = {
+      'llm-grader': {
+        kind: 'llm-grader' as const,
+        async evaluate() {
+          const s = scores[callCount++] ?? 1.0;
+          return {
+            score: s,
+            verdict: s >= 0.5 ? ('pass' as const) : ('fail' as const),
+            assertions: [{ text: 'graded', passed: s >= 0.5 }],
+            expectedAspectCount: 1,
+          };
+        },
+      },
+    };
+
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('A'),
+      assistantResponse('B'),
+      assistantResponse('C'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-max',
+      question: 'max test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      aggregation: 'max',
+      turns: [
+        { input: 'T1', assertions: ['Criterion A'] },
+        { input: 'T2', assertions: ['Criterion B'] },
+        { input: 'T3', assertions: ['Criterion C'] },
+      ],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: customRegistry,
+      now: nowFn,
+    });
+
+    expect(result.score).toBeCloseTo(1.0, 5);
+  });
+
+  it('on_turn_failure: stop — skips remaining turns after first failure', async () => {
+    // Grader stub that fails every assertion it is asked to grade.
+    const customRegistry = {
+      'llm-grader': {
+        kind: 'llm-grader' as const,
+        async evaluate() {
+          // Unconditional failure: turn 1 therefore fails, which is what should
+          // trigger the 'stop' policy configured on the eval case below.
+          return {
+            score: 0.0,
+            verdict: 'fail' as const,
+            assertions: [{ text: 'failed', passed: false }],
+            expectedAspectCount: 1,
+          };
+        },
+      },
+    };
+
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('Turn 1 response'),
+      assistantResponse('Turn 2 response'),
+      assistantResponse('Turn 3 response'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-stop',
+      question: 'stop test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      on_turn_failure: 'stop',
+      turns: [
+        { input: 'T1', assertions: ['Criterion'] },
+        { input: 'T2', assertions: ['Criterion'] },
+        { input: 'T3', assertions: ['Criterion'] },
+      ],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: customRegistry,
+      now: nowFn,
+    });
+
+    // Provider should only be called once (first turn)
+    expect(provider.requests).toHaveLength(1);
+
+    // The turns after the failure should be reported with a 'skip' verdict
+    const skippedScores = result.scores?.filter((s) => s.verdict === 'skip') ?? [];
+    expect(skippedScores.length).toBeGreaterThanOrEqual(2);
+  });
+
+  it('on_turn_failure: continue (default) — all turns run even after failure', async () => {
+    let callCount = 0;
+    const customRegistry = {
+      'llm-grader': {
+        kind: 'llm-grader' as const,
+        async evaluate() {
+          callCount++;
+          return {
+            score: callCount === 1 ? 0.0 : 1.0,
+            verdict: callCount === 1 ? ('fail' as const) : ('pass' as const),
+            assertions: [{ text: 'graded', passed: callCount !== 1 }],
+            expectedAspectCount: 1,
+          };
+        },
+      },
+    };
+
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('A'),
+      assistantResponse('B'),
+      assistantResponse('C'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-continue',
+      question: 'continue test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      on_turn_failure: 'continue',
+      turns: [
+        { input: 'T1', assertions: ['Criterion'] },
+        { input: 'T2', assertions: ['Criterion'] },
+        { input: 'T3', assertions: ['Criterion'] },
+      ],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: customRegistry,
+      now: nowFn,
+    });
+
+    // All 3 turns must run
+    expect(provider.requests).toHaveLength(3);
+    // No skipped turns
+    const skippedScores = result.scores?.filter((s) => s.verdict === 'skip') ?? [];
+    expect(skippedScores).toHaveLength(0);
+  });
+
+  it('window_size — chatPrompt passed to provider is limited to system + last N*2 messages', async () => {
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('R1'),
+      assistantResponse('R2'),
+      assistantResponse('R3'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-window',
+      question: 'window test',
+      input: [{ role: 'system', content: 'System prompt' }],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      window_size: 1, // keep system + last 1 user+assistant pair
+      turns: [{ input: 'T1' }, { input: 'T2' }, { input: 'T3' }],
+    };
+
+    await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(),
+      now: nowFn,
+    });
+
+    // Provider called 3 times
+    expect(provider.requests).toHaveLength(3);
+
+    // Third call chatPrompt should not include T1's messages (windowed)
+    const thirdRequest = provider.requests[2];
+    const chatPrompt = thirdRequest?.chatPrompt ?? [];
+    // System prompt should always be present
+    expect(chatPrompt.some((m) => m.role === 'system')).toBe(true);
+    // Intent: with window_size=1 the T1 user message is not in the third prompt. The check
+    // is only an upper bound (<= 1 user message). NOTE(review): confirm exact window contents.
+    const userMessages = chatPrompt.filter((m) => m.role === 'user');
+    expect(userMessages.length).toBeLessThanOrEqual(1);
+  });
+
+  it('provider error on a turn — turn scores 0 and execution continues', async () => {
+    const provider = new ErrorOnFirstProvider(assistantResponse('Turn 2 response'));
+
+    const evalCase: EvalTest = {
+      id: 'conv-provider-error',
+      question: 'error test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      turns: [{ input: 'T1' }, { input: 'T2' }],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(),
+      now: nowFn,
+    });
+
+    // Turn 1 should score 0
+    const turn1Score = result.scores?.find((s) => s.name === 'turn-1');
+    expect(turn1Score?.score).toBe(0);
+
+    // Turn 2 should still run (continue is default)
+    const turn2Score = result.scores?.find((s) => s.name === 'turn-2');
+    expect(turn2Score).toBeDefined();
+    expect(turn2Score?.score).toBe(1.0);
+  });
+
+  it('output contains full conversation transcript with all user and assistant messages', async () => {
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('Answer 1'),
+      assistantResponse('Answer 2'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-transcript',
+      question: 'transcript test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Full transcript',
+      mode: 'conversation',
+      turns: [{ input: 'Question 1' }, { input: 'Question 2' }],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(),
+      now: nowFn,
+    });
+
+    // Output should have all messages from the conversation
+    const output = result.output ?? [];
+    const userMessages = output.filter((m) => m.role === 'user');
+    const assistantMessages = output.filter((m) => m.role === 'assistant');
+
+    expect(userMessages.length).toBe(2);
+    expect(assistantMessages.length).toBe(2);
+    expect(assistantMessages[0]?.content).toBe('Answer 1');
+    expect(assistantMessages[1]?.content).toBe('Answer 2');
+  });
+
+  it('top-level assertions are NOT applied per-turn — only at conversation level', async () => {
+    let graderCallCount = 0;
+    const customRegistry = {
+      'llm-grader': {
+        kind: 'llm-grader' as const,
+        async evaluate() {
+          graderCallCount++;
+          return {
+            score: 0.8,
+            verdict: 'pass' as const,
+            assertions: [{ text: 'graded', passed: true }],
+            expectedAspectCount: 1,
+          };
+        },
+      },
+    };
+
+    const provider = new SequenceProvider('mock', [assistantResponse('A'), assistantResponse('B')]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-no-double-count',
+      question: 'double count test',
+      input: [],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      turns: [
+        { input: 'T1' }, // no per-turn assertions → scores 1.0 without grader
+        { input: 'T2' }, // no per-turn assertions → scores 1.0 without grader
+      ],
+      assertions: [{ type: 'llm-grader', criteria: 'Conversation was coherent' }],
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: customRegistry,
+      now: nowFn,
+    });
+
+    // Grader should be called exactly once — for the conversation-level pass only
+    expect(graderCallCount).toBe(1);
+
+    // Should have 2 turn scores (1.0 each) + 1 conversation score
+    const turnScores = result.scores?.filter((s) => s.name.startsWith('turn-')) ?? [];
+    const convScore = result.scores?.find((s) => s.name === 'conversation');
+    expect(turnScores).toHaveLength(2);
+    expect(turnScores[0]?.score).toBe(1.0);
+    expect(turnScores[1]?.score).toBe(1.0);
+    expect(convScore).toBeDefined();
+    expect(convScore?.score).toBe(0.8);
+  });
+
+  it('conversation-level assertions grade the full transcript, not just last reply', async () => {
+    let graderCandidate = '';
+    const customRegistry = {
+      'llm-grader': {
+        kind: 'llm-grader' as const,
+        async evaluate(ctx: { candidate: string }) {
+          graderCandidate = ctx.candidate;
+          return {
+            score: 1.0,
+            verdict: 'pass' as const,
+            assertions: [{ text: 'graded', passed: true }],
+            expectedAspectCount: 1,
+          };
+        },
+      },
+    };
+
+    const provider = new SequenceProvider('mock', [
+      assistantResponse('First answer'),
+      assistantResponse('Second answer'),
+    ]);
+
+    const evalCase: EvalTest = {
+      id: 'conv-transcript-candidate',
+      question: 'transcript candidate test',
+      input: [{ role: 'system', content: 'Be helpful' }],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Anything',
+      mode: 'conversation',
+      turns: [{ input: 'Question 1' }, { input: 'Question 2' }],
+      assertions: [{ type: 'llm-grader', criteria: 'Full transcript is coherent' }],
+    };
+
+    await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: customRegistry,
+      now: nowFn,
+    });
+
+    // The candidate passed to the grader should contain the full transcript, not just "Second answer"
+    expect(graderCandidate).toContain('First answer');
+    expect(graderCandidate).toContain('Second answer');
+    expect(graderCandidate).toContain('Question 1');
+    expect(graderCandidate).toContain('Question 2');
+  });
+
+  it('no regression — non-conversation test behaves as before', async () => {
+    const provider = new SequenceProvider('mock', [assistantResponse('Standard response')]);
+
+    const evalCase: EvalTest = {
+      id: 'standard-test',
+      question: 'Standard test',
+      input: [{ role: 'user', content: 'Hello' }],
+      expected_output: [],
+      file_paths: [],
+      criteria: 'Helpful',
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: makeEvaluatorRegistry(0.8),
+      now: nowFn,
+    });
+
+    expect(result.score).toBeGreaterThan(0);
+    expect(result.executionStatus).toBe('ok');
+    // Should not have turn-level scores
+    const hasTurnScores = result.scores?.some((s) => s.name.startsWith('turn-'));
+    expect(hasTurnScores).toBeFalsy();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Validation tests
+// ---------------------------------------------------------------------------
+
+describe('validateEvalFile — conversation mode', () => {
+  let tempDir: string;
+
+  beforeAll(async () => {
+    tempDir = path.join(os.tmpdir(), `agentv-conv-test-${Date.now()}`);
+    await mkdir(tempDir, { recursive: true });
+  });
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('rejects turns without mode: conversation', async () => {
+    const filePath = path.join(tempDir, 'turns-no-mode.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    turns:
+      - input: Turn 1
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(
+      result.errors.some((e) => e.message.includes("'turns' requires mode: conversation")),
+    ).toBe(true);
+  });
+
+  it('rejects mode: conversation without turns', async () => {
+    const filePath = path.join(tempDir, 'mode-no-turns.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    mode: conversation
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(result.errors.some((e) => e.message.includes("non-empty 'turns' array"))).toBe(true);
+  });
+
+  it('rejects mode: conversation with empty turns array', async () => {
+    const filePath = path.join(tempDir, 'mode-empty-turns.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    mode: conversation
+    turns: []
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(result.errors.some((e) => e.message.includes("non-empty 'turns' array"))).toBe(true);
+  });
+
+  it('rejects turns + top-level expected_output', async () => {
+    const filePath = path.join(tempDir, 'turns-expected-output.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    mode: conversation
+    turns:
+      - input: Turn 1
+    expected_output: "some output"
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(
+      result.errors.some((e) =>
+        e.message.includes("'expected_output' is not allowed with mode: conversation"),
+      ),
+    ).toBe(true);
+  });
+
+  it('rejects aggregation without mode: conversation', async () => {
+    const filePath = path.join(tempDir, 'aggregation-no-mode.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    aggregation: mean
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(
+      result.errors.some((e) => e.message.includes("'aggregation' requires mode: conversation")),
+    ).toBe(true);
+  });
+
+  it('rejects on_turn_failure without mode: conversation', async () => {
+    const filePath = path.join(tempDir, 'on-turn-failure-no-mode.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    on_turn_failure: stop
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(
+      result.errors.some((e) =>
+        e.message.includes("'on_turn_failure' requires mode: conversation"),
+      ),
+    ).toBe(true);
+  });
+
+  it('rejects window_size without mode: conversation', async () => {
+    const filePath = path.join(tempDir, 'window-no-mode.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    window_size: 3
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(
+      result.errors.some((e) => e.message.includes("'window_size' requires mode: conversation")),
+    ).toBe(true);
+  });
+
+  it('rejects a turn missing input', async () => {
+    const filePath = path.join(tempDir, 'turn-missing-input.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    mode: conversation
+    turns:
+      - expected_output: "something"
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(result.errors.some((e) => e.message.includes('non-empty input'))).toBe(true);
+  });
+
+  it('rejects a turn with whitespace-only input', async () => {
+    const filePath = path.join(tempDir, 'turn-whitespace-input.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: t1
+    criteria: Goal
+    input: hello
+    mode: conversation
+    turns:
+      - input: "   "
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(false);
+    expect(result.errors.some((e) => e.message.includes('non-empty input'))).toBe(true);
+  });
+
+  it('accepts a valid conversation mode eval file', async () => {
+    const filePath = path.join(tempDir, 'valid-conversation.yaml');
+    await writeFile(
+      filePath,
+      `tests:
+  - id: conv-valid
+    criteria: Be helpful
+    input: "System: you are a helpful assistant"
+    mode: conversation
+    aggregation: mean
+    on_turn_failure: continue
+    window_size: 5
+    turns:
+      - input: "What is 2+2?"
+        expected_output: "4"
+      - input: "And 3+3?"
+        assertions:
+          - "Response mentions 6"
+`,
+    );
+    const result = await validateEvalFile(filePath);
+    expect(result.valid).toBe(true);
+    expect(result.errors).toHaveLength(0);
+  });
+});
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
index 69d694bbe..80dc2ebd8 100644
--- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
+++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
@@ -5114,46 +5114,22 @@
                   "on_dependency_failure": {
                     "type": "string",
                     "enum": ["skip", "fail", "run"]
-                  }
-                },
-                "required": ["id"],
-                "additionalProperties": false
-              }
-            },
-            {
-              "type": "string"
-            }
-          ]
-        },
-        "eval_cases": {
-          "anyOf": [
-            {
-              "type": "array",
-              "items": {
-                "type": "object",
-                "properties": {
-                  "id": {
-                    "type": "string",
-                    "minLength": 1
                   },
-                  "criteria": {
-                    "type": "string"
+                  "mode": {
+                    "type": "string",
+                    "enum": ["conversation"]
                   },
-                  "input": {
-                    "anyOf": [
-                      {
-                        "type": "string"
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "object",
-                          "properties": {
-                            "role": {
-                              "type": "string",
-                              "enum": ["system", "user", "assistant", "tool"]
+                  "turns": {
+                    "type": "array",
+                    "items": {
+                      "type": "object",
+                      "properties": {
+                        "input": {
+                          "anyOf": [
+                            {
+                              "type": "string"
                             },
-                            "content": {
+                            {
                               "anyOf": [
                                 {
                                   "type": "string"
@@ -5177,38 +5153,14 @@
                                 }
                               ]
                             }
-                          },
-                          "required": ["role", "content"],
-                          "additionalProperties": false
-                        }
-                      }
-                    ]
-                  },
-                  "input_files": {
-                    "type": "array",
-                    "items": {
-                      "type": "string"
-                    }
-                  },
-                  "expected_output": {
-                    "anyOf": [
-                      {
-                        "type": "string"
-                      },
-                      {
-                        "type": "object",
-                        "additionalProperties": {}
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "object",
-                          "properties": {
-                            "role": {
-                              "type": "string",
-                              "enum": ["system", "user", "assistant", "tool"]
+                          ]
+                        },
+                        "expected_output": {
+                          "anyOf": [
+                            {
+                              "type": "string"
                             },
-                            "content": {
+                            {
                               "anyOf": [
                                 {
                                   "type": "string"
@@ -5232,648 +5184,3076 @@
                                 }
                               ]
                             }
-                          },
-                          "required": ["role", "content"],
-                          "additionalProperties": false
-                        }
-                      }
-                    ]
-                  },
-                  "assertions": {
-                    "type": "array",
-                    "items": {
-                      "anyOf": [
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["code-grader", "code_grader"]
-                            },
-                            "command": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "script": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "cwd": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "max_calls": {
-                                      "type": "number"
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
+                          ]
+                        },
+                        "assertions": {
+                          "type": "array",
+                          "items": {
+                            "anyOf": [
+                              {
+                                "type": "string"
+                              },
+                              {
+                                "anyOf": [
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
                                         "type": "string"
                                       },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type", "command"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["llm-grader", "llm_grader"]
-                            },
-                            "prompt": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "command": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
-                                        },
-                                        {
-                                          "type": "array",
-                                          "items": {
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["code-grader", "code_grader"]
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
                                             "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
                                           }
-                                        }
-                                      ]
-                                    },
-                                    "script": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
-                                        },
-                                        {
-                                          "type": "array",
-                                          "items": {
+                                        ]
+                                      },
+                                      "script": {
+                                        "anyOf": [
+                                          {
                                             "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      },
+                                      "cwd": {
+                                        "type": "string"
+                                      },
+                                      "target": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "max_calls": {
+                                                "type": "number"
+                                              }
+                                            },
+                                            "additionalProperties": false
                                           }
+                                        ]
+                                      },
+                                      "config": {
+                                        "type": "object",
+                                        "additionalProperties": {}
+                                      },
+                                      "preprocessors": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            },
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            }
+                                          },
+                                          "required": ["type", "command"],
+                                          "additionalProperties": false
                                         }
-                                      ]
+                                      }
                                     },
-                                    "config": {
-                                      "type": "object",
-                                      "additionalProperties": {}
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "rubrics": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
                                   },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["llm-grader", "llm_grader"]
+                                      },
+                                      "prompt": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "command": {
+                                                "anyOf": [
+                                                  {
+                                                    "type": "string"
+                                                  },
+                                                  {
+                                                    "type": "array",
+                                                    "items": {
+                                                      "type": "string"
+                                                    }
+                                                  }
+                                                ]
+                                              },
+                                              "script": {
+                                                "anyOf": [
+                                                  {
+                                                    "type": "string"
+                                                  },
+                                                  {
+                                                    "type": "array",
+                                                    "items": {
+                                                      "type": "string"
+                                                    }
+                                                  }
+                                                ]
+                                              },
+                                              "config": {
+                                                "type": "object",
+                                                "additionalProperties": {}
+                                              }
                                             },
-                                            {
-                                              "type": "integer",
+                                            "additionalProperties": false
+                                          }
+                                        ]
+                                      },
+                                      "rubrics": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "id": {
+                                              "type": "string"
+                                            },
+                                            "outcome": {
+                                              "type": "string"
+                                            },
+                                            "weight": {
+                                              "type": "number"
+                                            },
+                                            "required": {
+                                              "type": "boolean"
+                                            },
+                                            "min_score": {
+                                              "type": "number",
+                                              "exclusiveMinimum": true,
                                               "minimum": 0,
-                                              "maximum": 10
+                                              "maximum": 1
+                                            },
+                                            "score_ranges": {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "object",
+                                                "properties": {
+                                                  "score_range": {
+                                                    "type": "array",
+                                                    "minItems": 2,
+                                                    "maxItems": 2,
+                                                    "items": [
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      },
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      }
+                                                    ]
+                                                  },
+                                                  "outcome": {
+                                                    "type": "string",
+                                                    "minLength": 1
+                                                  }
+                                                },
+                                                "required": ["score_range", "outcome"],
+                                                "additionalProperties": false
+                                              }
                                             }
-                                          ]
-                                        },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
+                                          },
+                                          "additionalProperties": false
                                         }
                                       },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
-                                    }
-                                  }
-                                },
-                                "additionalProperties": false
-                              }
-                            },
-                            "model": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "type": "string"
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "max_steps": {
-                              "type": "integer",
-                              "minimum": 1,
-                              "maximum": 50
-                            },
-                            "temperature": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 2
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
+                                      "model": {
                                         "type": "string"
                                       },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "include": {
-                              "type": "string",
-                              "minLength": 1
-                            }
-                          },
-                          "required": ["include"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "composite"
-                            },
-                            "assertions": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "evaluators": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "aggregator": {
-                              "anyOf": [
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "weighted_average"
-                                    },
-                                    "weights": {
-                                      "type": "object",
-                                      "additionalProperties": {
-                                        "type": "number"
-                                      }
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "threshold"
-                                    },
-                                    "threshold": {
-                                      "type": "number",
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  },
-                                  "required": ["type", "threshold"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "code-grader"
-                                    },
-                                    "path": {
-                                      "type": "string"
-                                    },
-                                    "cwd": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type", "path"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "llm-grader"
-                                    },
-                                    "prompt": {
-                                      "type": "string"
-                                    },
-                                    "model": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "aggregator"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["tool-trajectory", "tool_trajectory"]
-                            },
-                            "mode": {
-                              "type": "string",
-                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
-                            },
-                            "minimums": {
-                              "type": "object",
-                              "additionalProperties": {
-                                "type": "integer",
-                                "minimum": 0
-                              }
-                            },
-                            "expected": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "tool": {
-                                    "type": "string"
-                                  },
-                                  "args": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "const": "any"
+                                      "target": {
+                                        "type": "string"
                                       },
-                                      {
+                                      "config": {
                                         "type": "object",
                                         "additionalProperties": {}
-                                      }
-                                    ]
-                                  },
-                                  "max_duration_ms": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "maxDurationMs": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "args_match": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
-                                      {
+                                      "max_steps": {
+                                        "type": "integer",
+                                        "minimum": 1,
+                                        "maximum": 50
+                                      },
+                                      "temperature": {
+                                        "type": "number",
+                                        "minimum": 0,
+                                        "maximum": 2
+                                      },
+                                      "preprocessors": {
                                         "type": "array",
                                         "items": {
-                                          "type": "string"
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            },
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            }
+                                          },
+                                          "required": ["type", "command"],
+                                          "additionalProperties": false
                                         }
                                       }
-                                    ]
+                                    },
+                                    "required": ["type"],
+                                    "additionalProperties": false
                                   },
-                                  "argsMatch": {
-                                    "anyOf": [
-                                      {
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "include": {
                                         "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
-                                      },
-                                      {
+                                        "minLength": 1
+                                      }
+                                    },
+                                    "required": ["include"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "composite"
+                                      },
+                                      "assertions": {
+                                        "type": "array",
+                                        "items": {}
+                                      },
+                                      "evaluators": {
+                                        "type": "array",
+                                        "items": {}
+                                      },
+                                      "aggregator": {
+                                        "anyOf": [
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "weighted_average"
+                                              },
+                                              "weights": {
+                                                "type": "object",
+                                                "additionalProperties": {
+                                                  "type": "number"
+                                                }
+                                              }
+                                            },
+                                            "required": ["type"],
+                                            "additionalProperties": false
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "threshold"
+                                              },
+                                              "threshold": {
+                                                "type": "number",
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            },
+                                            "required": ["type", "threshold"],
+                                            "additionalProperties": false
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "code-grader"
+                                              },
+                                              "path": {
+                                                "type": "string"
+                                              },
+                                              "cwd": {
+                                                "type": "string"
+                                              }
+                                            },
+                                            "required": ["type", "path"],
+                                            "additionalProperties": false
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "llm-grader"
+                                              },
+                                              "prompt": {
+                                                "type": "string"
+                                              },
+                                              "model": {
+                                                "type": "string"
+                                              }
+                                            },
+                                            "required": ["type"],
+                                            "additionalProperties": false
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["type", "aggregator"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["tool-trajectory", "tool_trajectory"]
+                                      },
+                                      "mode": {
+                                        "type": "string",
+                                        "enum": [
+                                          "any_order",
+                                          "in_order",
+                                          "exact",
+                                          "subset",
+                                          "superset"
+                                        ]
+                                      },
+                                      "minimums": {
+                                        "type": "object",
+                                        "additionalProperties": {
+                                          "type": "integer",
+                                          "minimum": 0
+                                        }
+                                      },
+                                      "expected": {
                                         "type": "array",
                                         "items": {
-                                          "type": "string"
+                                          "type": "object",
+                                          "properties": {
+                                            "tool": {
+                                              "type": "string"
+                                            },
+                                            "args": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string",
+                                                  "const": "any"
+                                                },
+                                                {
+                                                  "type": "object",
+                                                  "additionalProperties": {}
+                                                }
+                                              ]
+                                            },
+                                            "max_duration_ms": {
+                                              "type": "number",
+                                              "minimum": 0
+                                            },
+                                            "maxDurationMs": {
+                                              "type": "number",
+                                              "minimum": 0
+                                            },
+                                            "args_match": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string",
+                                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "argsMatch": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string",
+                                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            }
+                                          },
+                                          "required": ["tool"],
+                                          "additionalProperties": false
                                         }
+                                      },
+                                      "args_match": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      },
+                                      "argsMatch": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
                                       }
-                                    ]
-                                  }
-                                },
-                                "required": ["tool"],
-                                "additionalProperties": false
-                              }
-                            },
-                            "args_match": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "argsMatch": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "mode"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["field-accuracy", "field_accuracy"]
-                            },
-                            "fields": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "path": {
-                                    "type": "string"
-                                  },
-                                  "match": {
-                                    "type": "string",
-                                    "enum": ["exact", "numeric_tolerance", "date"]
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "tolerance": {
-                                    "type": "number",
-                                    "minimum": 0
+                                    },
+                                    "required": ["type", "mode"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["field-accuracy", "field_accuracy"]
+                                      },
+                                      "fields": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "path": {
+                                              "type": "string"
+                                            },
+                                            "match": {
+                                              "type": "string",
+                                              "enum": ["exact", "numeric_tolerance", "date"]
+                                            },
+                                            "required": {
+                                              "type": "boolean"
+                                            },
+                                            "weight": {
+                                              "type": "number"
+                                            },
+                                            "tolerance": {
+                                              "type": "number",
+                                              "minimum": 0
+                                            },
+                                            "relative": {
+                                              "type": "boolean"
+                                            },
+                                            "formats": {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
+                                            }
+                                          },
+                                          "required": ["path", "match"],
+                                          "additionalProperties": false
+                                        },
+                                        "minItems": 1
+                                      },
+                                      "aggregation": {
+                                        "type": "string",
+                                        "enum": ["weighted_average", "all_or_nothing"]
+                                      }
+                                    },
+                                    "required": ["type", "fields"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "latency"
+                                      },
+                                      "threshold": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
+                                    },
+                                    "required": ["type", "threshold"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "cost"
+                                      },
+                                      "budget": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
+                                    },
+                                    "required": ["type", "budget"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["token-usage", "token_usage"]
+                                      },
+                                      "max_total": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_input": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_output": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
+                                    },
+                                    "required": ["type"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["execution-metrics", "execution_metrics"]
+                                      },
+                                      "max_tool_calls": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_llm_calls": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_tokens": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_cost_usd": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_duration_ms": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "target_exploration_ratio": {
+                                        "type": "number",
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "exploration_tolerance": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
+                                    },
+                                    "required": ["type"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "contains"
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
+                                    },
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "regex"
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
+                                    },
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["is-json", "is_json"]
+                                      }
+                                    },
+                                    "required": ["type"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "equals"
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
+                                    },
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "rubrics"
+                                      },
+                                      "criteria": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "id": {
+                                              "type": "string"
+                                            },
+                                            "outcome": {
+                                              "type": "string"
+                                            },
+                                            "weight": {
+                                              "type": "number"
+                                            },
+                                            "required": {
+                                              "type": "boolean"
+                                            },
+                                            "min_score": {
+                                              "type": "number",
+                                              "exclusiveMinimum": true,
+                                              "minimum": 0,
+                                              "maximum": 1
+                                            },
+                                            "score_ranges": {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "object",
+                                                "properties": {
+                                                  "score_range": {
+                                                    "type": "array",
+                                                    "minItems": 2,
+                                                    "maxItems": 2,
+                                                    "items": [
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      },
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      }
+                                                    ]
+                                                  },
+                                                  "outcome": {
+                                                    "type": "string",
+                                                    "minLength": 1
+                                                  }
+                                                },
+                                                "required": ["score_range", "outcome"],
+                                                "additionalProperties": false
+                                              }
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        },
+                                        "minItems": 1
+                                      }
+                                    },
+                                    "required": ["type", "criteria"],
+                                    "additionalProperties": false
+                                  }
+                                ]
+                              }
+                            ]
+                          }
+                        }
+                      },
+                      "required": ["input"],
+                      "additionalProperties": false
+                    },
+                    "minItems": 1
+                  },
+                  "aggregation": {
+                    "type": "string",
+                    "enum": ["mean", "min", "max"]
+                  },
+                  "on_turn_failure": {
+                    "type": "string",
+                    "enum": ["continue", "stop"]
+                  },
+                  "window_size": {
+                    "type": "integer",
+                    "minimum": 1
+                  }
+                },
+                "required": ["id"],
+                "additionalProperties": false
+              }
+            },
+            {
+              "type": "string"
+            }
+          ]
+        },
+        "eval_cases": {
+          "anyOf": [
+            {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "id": {
+                    "type": "string",
+                    "minLength": 1
+                  },
+                  "criteria": {
+                    "type": "string"
+                  },
+                  "input": {
+                    "anyOf": [
+                      {
+                        "type": "string"
+                      },
+                      {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "role": {
+                              "type": "string",
+                              "enum": ["system", "user", "assistant", "tool"]
+                            },
+                            "content": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["text", "file", "image"]
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
+                                    },
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["role", "content"],
+                          "additionalProperties": false
+                        }
+                      }
+                    ]
+                  },
+                  "input_files": {
+                    "type": "array",
+                    "items": {
+                      "type": "string"
+                    }
+                  },
+                  "expected_output": {
+                    "anyOf": [
+                      {
+                        "type": "string"
+                      },
+                      {
+                        "type": "object",
+                        "additionalProperties": {}
+                      },
+                      {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "role": {
+                              "type": "string",
+                              "enum": ["system", "user", "assistant", "tool"]
+                            },
+                            "content": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["text", "file", "image"]
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
+                                    },
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["role", "content"],
+                          "additionalProperties": false
+                        }
+                      }
+                    ]
+                  },
+                  "assertions": {
+                    "type": "array",
+                    "items": {
+                      "anyOf": [
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["code-grader", "code_grader"]
+                            },
+                            "command": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "script": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "cwd": {
+                              "type": "string"
+                            },
+                            "target": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "max_calls": {
+                                      "type": "number"
+                                    }
+                                  },
+                                  "additionalProperties": false
+                                }
+                              ]
+                            },
+                            "config": {
+                              "type": "object",
+                              "additionalProperties": {}
+                            },
+                            "preprocessors": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "type": {
+                                    "type": "string",
+                                    "minLength": 1
+                                  },
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  }
+                                },
+                                "required": ["type", "command"],
+                                "additionalProperties": false
+                              }
+                            }
+                          },
+                          "required": ["type", "command"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["llm-grader", "llm_grader"]
+                            },
+                            "prompt": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "command": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "script": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    }
+                                  },
+                                  "additionalProperties": false
+                                }
+                              ]
+                            },
+                            "rubrics": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "id": {
+                                    "type": "string"
+                                  },
+                                  "outcome": {
+                                    "type": "string"
+                                  },
+                                  "weight": {
+                                    "type": "number"
+                                  },
+                                  "required": {
+                                    "type": "boolean"
+                                  },
+                                  "min_score": {
+                                    "type": "number",
+                                    "exclusiveMinimum": true,
+                                    "minimum": 0,
+                                    "maximum": 1
+                                  },
+                                  "score_ranges": {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "object",
+                                      "properties": {
+                                        "score_range": {
+                                          "type": "array",
+                                          "minItems": 2,
+                                          "maxItems": 2,
+                                          "items": [
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            },
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            }
+                                          ]
+                                        },
+                                        "outcome": {
+                                          "type": "string",
+                                          "minLength": 1
+                                        }
+                                      },
+                                      "required": ["score_range", "outcome"],
+                                      "additionalProperties": false
+                                    }
+                                  }
+                                },
+                                "additionalProperties": false
+                              }
+                            },
+                            "model": {
+                              "type": "string"
+                            },
+                            "target": {
+                              "type": "string"
+                            },
+                            "config": {
+                              "type": "object",
+                              "additionalProperties": {}
+                            },
+                            "max_steps": {
+                              "type": "integer",
+                              "minimum": 1,
+                              "maximum": 50
+                            },
+                            "temperature": {
+                              "type": "number",
+                              "minimum": 0,
+                              "maximum": 2
+                            },
+                            "preprocessors": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "type": {
+                                    "type": "string",
+                                    "minLength": 1
+                                  },
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  }
+                                },
+                                "required": ["type", "command"],
+                                "additionalProperties": false
+                              }
+                            }
+                          },
+                          "required": ["type"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "include": {
+                              "type": "string",
+                              "minLength": 1
+                            }
+                          },
+                          "required": ["include"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "composite"
+                            },
+                            "assertions": {
+                              "type": "array",
+                              "items": {}
+                            },
+                            "evaluators": {
+                              "type": "array",
+                              "items": {}
+                            },
+                            "aggregator": {
+                              "anyOf": [
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "weighted_average"
+                                    },
+                                    "weights": {
+                                      "type": "object",
+                                      "additionalProperties": {
+                                        "type": "number"
+                                      }
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "threshold"
+                                    },
+                                    "threshold": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  },
+                                  "required": ["type", "threshold"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "code-grader"
+                                    },
+                                    "path": {
+                                      "type": "string"
+                                    },
+                                    "cwd": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "path"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "llm-grader"
+                                    },
+                                    "prompt": {
+                                      "type": "string"
+                                    },
+                                    "model": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "aggregator"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["tool-trajectory", "tool_trajectory"]
+                            },
+                            "mode": {
+                              "type": "string",
+                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                            },
+                            "minimums": {
+                              "type": "object",
+                              "additionalProperties": {
+                                "type": "integer",
+                                "minimum": 0
+                              }
+                            },
+                            "expected": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "tool": {
+                                    "type": "string"
+                                  },
+                                  "args": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string",
+                                        "const": "any"
+                                      },
+                                      {
+                                        "type": "object",
+                                        "additionalProperties": {}
+                                      }
+                                    ]
+                                  },
+                                  "max_duration_ms": {
+                                    "type": "number",
+                                    "minimum": 0
+                                  },
+                                  "maxDurationMs": {
+                                    "type": "number",
+                                    "minimum": 0
+                                  },
+                                  "args_match": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string",
+                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "argsMatch": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string",
+                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  }
+                                },
+                                "required": ["tool"],
+                                "additionalProperties": false
+                              }
+                            },
+                            "args_match": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "argsMatch": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "mode"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["field-accuracy", "field_accuracy"]
+                            },
+                            "fields": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "path": {
+                                    "type": "string"
+                                  },
+                                  "match": {
+                                    "type": "string",
+                                    "enum": ["exact", "numeric_tolerance", "date"]
+                                  },
+                                  "required": {
+                                    "type": "boolean"
+                                  },
+                                  "weight": {
+                                    "type": "number"
+                                  },
+                                  "tolerance": {
+                                    "type": "number",
+                                    "minimum": 0
+                                  },
+                                  "relative": {
+                                    "type": "boolean"
+                                  },
+                                  "formats": {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "string"
+                                    }
+                                  }
+                                },
+                                "required": ["path", "match"],
+                                "additionalProperties": false
+                              },
+                              "minItems": 1
+                            },
+                            "aggregation": {
+                              "type": "string",
+                              "enum": ["weighted_average", "all_or_nothing"]
+                            }
+                          },
+                          "required": ["type", "fields"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "latency"
+                            },
+                            "threshold": {
+                              "type": "number",
+                              "minimum": 0
+                            }
+                          },
+                          "required": ["type", "threshold"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "cost"
+                            },
+                            "budget": {
+                              "type": "number",
+                              "minimum": 0
+                            }
+                          },
+                          "required": ["type", "budget"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["token-usage", "token_usage"]
+                            },
+                            "max_total": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "max_input": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "max_output": {
+                              "type": "number",
+                              "minimum": 0
+                            }
+                          },
+                          "required": ["type"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["execution-metrics", "execution_metrics"]
+                            },
+                            "max_tool_calls": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "max_llm_calls": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "max_tokens": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "max_cost_usd": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "max_duration_ms": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "target_exploration_ratio": {
+                              "type": "number",
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "exploration_tolerance": {
+                              "type": "number",
+                              "minimum": 0
+                            }
+                          },
+                          "required": ["type"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "contains"
+                            },
+                            "value": {
+                              "type": "string"
+                            }
+                          },
+                          "required": ["type", "value"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "regex"
+                            },
+                            "value": {
+                              "type": "string"
+                            }
+                          },
+                          "required": ["type", "value"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["is-json", "is_json"]
+                            }
+                          },
+                          "required": ["type"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "equals"
+                            },
+                            "value": {
+                              "type": "string"
+                            }
+                          },
+                          "required": ["type", "value"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "rubrics"
+                            },
+                            "criteria": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "id": {
+                                    "type": "string"
+                                  },
+                                  "outcome": {
+                                    "type": "string"
+                                  },
+                                  "weight": {
+                                    "type": "number"
+                                  },
+                                  "required": {
+                                    "type": "boolean"
+                                  },
+                                  "min_score": {
+                                    "type": "number",
+                                    "exclusiveMinimum": true,
+                                    "minimum": 0,
+                                    "maximum": 1
+                                  },
+                                  "score_ranges": {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "object",
+                                      "properties": {
+                                        "score_range": {
+                                          "type": "array",
+                                          "minItems": 2,
+                                          "maxItems": 2,
+                                          "items": [
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            },
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            }
+                                          ]
+                                        },
+                                        "outcome": {
+                                          "type": "string",
+                                          "minLength": 1
+                                        }
+                                      },
+                                      "required": ["score_range", "outcome"],
+                                      "additionalProperties": false
+                                    }
+                                  }
+                                },
+                                "additionalProperties": false
+                              },
+                              "minItems": 1
+                            }
+                          },
+                          "required": ["type", "criteria"],
+                          "additionalProperties": false
+                        }
+                      ]
+                    }
+                  },
+                  "evaluators": {
+                    "type": "array",
+                    "items": {
+                      "anyOf": [
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["code-grader", "code_grader"]
+                            },
+                            "command": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "script": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "cwd": {
+                              "type": "string"
+                            },
+                            "target": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "max_calls": {
+                                      "type": "number"
+                                    }
+                                  },
+                                  "additionalProperties": false
+                                }
+                              ]
+                            },
+                            "config": {
+                              "type": "object",
+                              "additionalProperties": {}
+                            },
+                            "preprocessors": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "type": {
+                                    "type": "string",
+                                    "minLength": 1
+                                  },
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  }
+                                },
+                                "required": ["type", "command"],
+                                "additionalProperties": false
+                              }
+                            }
+                          },
+                          "required": ["type", "command"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["llm-grader", "llm_grader"]
+                            },
+                            "prompt": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "command": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "script": {
+                                      "anyOf": [
+                                        {
+                                          "type": "string"
+                                        },
+                                        {
+                                          "type": "array",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        }
+                                      ]
+                                    },
+                                    "config": {
+                                      "type": "object",
+                                      "additionalProperties": {}
+                                    }
+                                  },
+                                  "additionalProperties": false
+                                }
+                              ]
+                            },
+                            "rubrics": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "id": {
+                                    "type": "string"
+                                  },
+                                  "outcome": {
+                                    "type": "string"
+                                  },
+                                  "weight": {
+                                    "type": "number"
+                                  },
+                                  "required": {
+                                    "type": "boolean"
+                                  },
+                                  "min_score": {
+                                    "type": "number",
+                                    "exclusiveMinimum": true,
+                                    "minimum": 0,
+                                    "maximum": 1
+                                  },
+                                  "score_ranges": {
+                                    "type": "array",
+                                    "items": {
+                                      "type": "object",
+                                      "properties": {
+                                        "score_range": {
+                                          "type": "array",
+                                          "minItems": 2,
+                                          "maxItems": 2,
+                                          "items": [
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            },
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            }
+                                          ]
+                                        },
+                                        "outcome": {
+                                          "type": "string",
+                                          "minLength": 1
+                                        }
+                                      },
+                                      "required": ["score_range", "outcome"],
+                                      "additionalProperties": false
+                                    }
+                                  }
+                                },
+                                "additionalProperties": false
+                              }
+                            },
+                            "model": {
+                              "type": "string"
+                            },
+                            "target": {
+                              "type": "string"
+                            },
+                            "config": {
+                              "type": "object",
+                              "additionalProperties": {}
+                            },
+                            "max_steps": {
+                              "type": "integer",
+                              "minimum": 1,
+                              "maximum": 50
+                            },
+                            "temperature": {
+                              "type": "number",
+                              "minimum": 0,
+                              "maximum": 2
+                            },
+                            "preprocessors": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "type": {
+                                    "type": "string",
+                                    "minLength": 1
+                                  },
+                                  "command": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string"
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  }
+                                },
+                                "required": ["type", "command"],
+                                "additionalProperties": false
+                              }
+                            }
+                          },
+                          "required": ["type"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "include": {
+                              "type": "string",
+                              "minLength": 1
+                            }
+                          },
+                          "required": ["include"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "const": "composite"
+                            },
+                            "assertions": {
+                              "type": "array",
+                              "items": {}
+                            },
+                            "evaluators": {
+                              "type": "array",
+                              "items": {}
+                            },
+                            "aggregator": {
+                              "anyOf": [
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "weighted_average"
+                                    },
+                                    "weights": {
+                                      "type": "object",
+                                      "additionalProperties": {
+                                        "type": "number"
+                                      }
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "threshold"
+                                    },
+                                    "threshold": {
+                                      "type": "number",
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  },
+                                  "required": ["type", "threshold"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "code-grader"
+                                    },
+                                    "path": {
+                                      "type": "string"
+                                    },
+                                    "cwd": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "path"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "llm-grader"
+                                    },
+                                    "prompt": {
+                                      "type": "string"
+                                    },
+                                    "model": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type"],
+                                  "additionalProperties": false
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "aggregator"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["tool-trajectory", "tool_trajectory"]
+                            },
+                            "mode": {
+                              "type": "string",
+                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                            },
+                            "minimums": {
+                              "type": "object",
+                              "additionalProperties": {
+                                "type": "integer",
+                                "minimum": 0
+                              }
+                            },
+                            "expected": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "tool": {
+                                    "type": "string"
+                                  },
+                                  "args": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string",
+                                        "const": "any"
+                                      },
+                                      {
+                                        "type": "object",
+                                        "additionalProperties": {}
+                                      }
+                                    ]
+                                  },
+                                  "max_duration_ms": {
+                                    "type": "number",
+                                    "minimum": 0
+                                  },
+                                  "maxDurationMs": {
+                                    "type": "number",
+                                    "minimum": 0
+                                  },
+                                  "args_match": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string",
+                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  },
+                                  "argsMatch": {
+                                    "anyOf": [
+                                      {
+                                        "type": "string",
+                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                      },
+                                      {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "string"
+                                        }
+                                      }
+                                    ]
+                                  }
+                                },
+                                "required": ["tool"],
+                                "additionalProperties": false
+                              }
+                            },
+                            "args_match": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            },
+                            "argsMatch": {
+                              "anyOf": [
+                                {
+                                  "type": "string",
+                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                },
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              ]
+                            }
+                          },
+                          "required": ["type", "mode"],
+                          "additionalProperties": false
+                        },
+                        {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "weight": {
+                              "type": "number",
+                              "minimum": 0
+                            },
+                            "required": {
+                              "anyOf": [
+                                {
+                                  "type": "boolean"
+                                },
+                                {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                }
+                              ]
+                            },
+                            "min_score": {
+                              "type": "number",
+                              "exclusiveMinimum": true,
+                              "minimum": 0,
+                              "maximum": 1
+                            },
+                            "negate": {
+                              "type": "boolean"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": ["field-accuracy", "field_accuracy"]
+                            },
+                            "fields": {
+                              "type": "array",
+                              "items": {
+                                "type": "object",
+                                "properties": {
+                                  "path": {
+                                    "type": "string"
+                                  },
+                                  "match": {
+                                    "type": "string",
+                                    "enum": ["exact", "numeric_tolerance", "date"]
+                                  },
+                                  "required": {
+                                    "type": "boolean"
+                                  },
+                                  "weight": {
+                                    "type": "number"
+                                  },
+                                  "tolerance": {
+                                    "type": "number",
+                                    "minimum": 0
                                   },
                                   "relative": {
                                     "type": "boolean"
@@ -6340,1201 +8720,1201 @@
                                     "items": {
                                       "type": "object",
                                       "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
+                                        "score_range": {
+                                          "type": "array",
+                                          "minItems": 2,
+                                          "maxItems": 2,
+                                          "items": [
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            },
+                                            {
+                                              "type": "integer",
+                                              "minimum": 0,
+                                              "maximum": 10
+                                            }
+                                          ]
+                                        },
+                                        "outcome": {
+                                          "type": "string",
+                                          "minLength": 1
+                                        }
+                                      },
+                                      "required": ["score_range", "outcome"],
+                                      "additionalProperties": false
+                                    }
+                                  }
+                                },
+                                "additionalProperties": false
+                              },
+                              "minItems": 1
+                            }
+                          },
+                          "required": ["type", "criteria"],
+                          "additionalProperties": false
+                        }
+                      ]
+                    }
+                  },
+                  "execution": {
+                    "type": "object",
+                    "properties": {
+                      "target": {
+                        "type": "string"
+                      },
+                      "targets": {
+                        "type": "array",
+                        "items": {
+                          "type": "string"
+                        }
+                      },
+                      "workers": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 50
+                      },
+                      "assertions": {
+                        "type": "array",
+                        "items": {
+                          "anyOf": [
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["code-grader", "code_grader"]
+                                },
+                                "command": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "script": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    }
+                                  ]
+                                },
+                                "cwd": {
+                                  "type": "string"
+                                },
+                                "target": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "max_calls": {
+                                          "type": "number"
+                                        }
+                                      },
+                                      "additionalProperties": false
+                                    }
+                                  ]
+                                },
+                                "config": {
+                                  "type": "object",
+                                  "additionalProperties": {}
+                                },
+                                "preprocessors": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "minLength": 1
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
+                                    },
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
+                                  }
+                                }
+                              },
+                              "required": ["type", "command"],
+                              "additionalProperties": false
+                            },
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["llm-grader", "llm_grader"]
+                                },
+                                "prompt": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string"
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "command": {
+                                          "anyOf": [
                                             {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                              "type": "string"
                                             },
                                             {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
                                             }
                                           ]
                                         },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
+                                        "script": {
+                                          "anyOf": [
+                                            {
+                                              "type": "string"
+                                            },
+                                            {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
+                                            }
+                                          ]
+                                        },
+                                        "config": {
+                                          "type": "object",
+                                          "additionalProperties": {}
                                         }
                                       },
-                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
-                                  }
-                                },
-                                "additionalProperties": false
-                              },
-                              "minItems": 1
-                            }
-                          },
-                          "required": ["type", "criteria"],
-                          "additionalProperties": false
-                        }
-                      ]
-                    }
-                  },
-                  "evaluators": {
-                    "type": "array",
-                    "items": {
-                      "anyOf": [
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["code-grader", "code_grader"]
-                            },
-                            "command": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
+                                  ]
                                 },
-                                {
+                                "rubrics": {
                                   "type": "array",
                                   "items": {
-                                    "type": "string"
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
+                                        "type": "string"
+                                      },
+                                      "outcome": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
+                                              "type": "array",
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
+                                            },
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            }
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
+                                        }
+                                      }
+                                    },
+                                    "additionalProperties": false
                                   }
-                                }
-                              ]
-                            },
-                            "script": {
-                              "anyOf": [
-                                {
+                                },
+                                "model": {
                                   "type": "string"
                                 },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "cwd": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                                "target": {
+                                  "type": "string"
                                 },
-                                {
+                                "config": {
                                   "type": "object",
-                                  "properties": {
-                                    "max_calls": {
-                                      "type": "number"
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string"
+                                  "additionalProperties": {}
+                                },
+                                "max_steps": {
+                                  "type": "integer",
+                                  "minimum": 1,
+                                  "maximum": 50
+                                },
+                                "temperature": {
+                                  "type": "number",
+                                  "minimum": 0,
+                                  "maximum": 2
+                                },
+                                "preprocessors": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "minLength": 1
                                       },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
                                       }
-                                    ]
+                                    },
+                                    "required": ["type", "command"],
+                                    "additionalProperties": false
                                   }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type", "command"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
+                                }
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                            {
+                              "type": "object",
+                              "properties": {
+                                "include": {
+                                  "type": "string",
+                                  "minLength": 1
+                                }
+                              },
+                              "required": ["include"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["llm-grader", "llm_grader"]
-                            },
-                            "prompt": {
-                              "anyOf": [
-                                {
-                                  "type": "string"
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "command": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "composite"
+                                },
+                                "assertions": {
+                                  "type": "array",
+                                  "items": {}
+                                },
+                                "evaluators": {
+                                  "type": "array",
+                                  "items": {}
+                                },
+                                "aggregator": {
+                                  "anyOf": [
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "weighted_average"
                                         },
-                                        {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
+                                        "weights": {
+                                          "type": "object",
+                                          "additionalProperties": {
+                                            "type": "number"
                                           }
                                         }
-                                      ]
+                                      },
+                                      "required": ["type"],
+                                      "additionalProperties": false
                                     },
-                                    "script": {
-                                      "anyOf": [
-                                        {
-                                          "type": "string"
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "threshold"
                                         },
-                                        {
-                                          "type": "array",
-                                          "items": {
-                                            "type": "string"
-                                          }
+                                        "threshold": {
+                                          "type": "number",
+                                          "minimum": 0,
+                                          "maximum": 1
                                         }
-                                      ]
+                                      },
+                                      "required": ["type", "threshold"],
+                                      "additionalProperties": false
                                     },
-                                    "config": {
-                                      "type": "object",
-                                      "additionalProperties": {}
-                                    }
-                                  },
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "rubrics": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
+                                    {
                                       "type": "object",
                                       "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
-                                            }
-                                          ]
-                                        },
-                                        "outcome": {
+                                        "type": {
                                           "type": "string",
-                                          "minLength": 1
+                                          "const": "code-grader"
+                                        },
+                                        "path": {
+                                          "type": "string"
+                                        },
+                                        "cwd": {
+                                          "type": "string"
                                         }
                                       },
-                                      "required": ["score_range", "outcome"],
+                                      "required": ["type", "path"],
                                       "additionalProperties": false
-                                    }
-                                  }
-                                },
-                                "additionalProperties": false
-                              }
-                            },
-                            "model": {
-                              "type": "string"
-                            },
-                            "target": {
-                              "type": "string"
-                            },
-                            "config": {
-                              "type": "object",
-                              "additionalProperties": {}
-                            },
-                            "max_steps": {
-                              "type": "integer",
-                              "minimum": 1,
-                              "maximum": 50
-                            },
-                            "temperature": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 2
-                            },
-                            "preprocessors": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "type": {
-                                    "type": "string",
-                                    "minLength": 1
-                                  },
-                                  "command": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string"
-                                      },
-                                      {
-                                        "type": "array",
-                                        "items": {
+                                    },
+                                    {
+                                      "type": "object",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "const": "llm-grader"
+                                        },
+                                        "prompt": {
+                                          "type": "string"
+                                        },
+                                        "model": {
                                           "type": "string"
                                         }
-                                      }
-                                    ]
-                                  }
-                                },
-                                "required": ["type", "command"],
-                                "additionalProperties": false
-                              }
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "include": {
-                              "type": "string",
-                              "minLength": 1
-                            }
-                          },
-                          "required": ["include"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                                      },
+                                      "required": ["type"],
+                                      "additionalProperties": false
+                                    }
+                                  ]
+                                }
+                              },
+                              "required": ["type", "aggregator"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "composite"
-                            },
-                            "assertions": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "evaluators": {
-                              "type": "array",
-                              "items": {}
-                            },
-                            "aggregator": {
-                              "anyOf": [
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "weighted_average"
-                                    },
-                                    "weights": {
-                                      "type": "object",
-                                      "additionalProperties": {
-                                        "type": "number"
-                                      }
-                                    }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
+                                  "minimum": 0
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "threshold"
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
                                     },
-                                    "threshold": {
+                                    {
                                       "type": "number",
+                                      "exclusiveMinimum": true,
                                       "minimum": 0,
                                       "maximum": 1
                                     }
-                                  },
-                                  "required": ["type", "threshold"],
-                                  "additionalProperties": false
+                                  ]
                                 },
-                                {
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["tool-trajectory", "tool_trajectory"]
+                                },
+                                "mode": {
+                                  "type": "string",
+                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
+                                },
+                                "minimums": {
                                   "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "code-grader"
+                                  "additionalProperties": {
+                                    "type": "integer",
+                                    "minimum": 0
+                                  }
+                                },
+                                "expected": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "tool": {
+                                        "type": "string"
+                                      },
+                                      "args": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "const": "any"
+                                          },
+                                          {
+                                            "type": "object",
+                                            "additionalProperties": {}
+                                          }
+                                        ]
+                                      },
+                                      "max_duration_ms": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "maxDurationMs": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "args_match": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      },
+                                      "argsMatch": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string",
+                                            "enum": ["exact", "ignore", "subset", "superset"]
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
+                                      }
                                     },
-                                    "path": {
-                                      "type": "string"
+                                    "required": ["tool"],
+                                    "additionalProperties": false
+                                  }
+                                },
+                                "args_match": {
+                                  "anyOf": [
+                                    {
+                                      "type": "string",
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
-                                    "cwd": {
-                                      "type": "string"
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
                                     }
-                                  },
-                                  "required": ["type", "path"],
-                                  "additionalProperties": false
+                                  ]
                                 },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
+                                "argsMatch": {
+                                  "anyOf": [
+                                    {
                                       "type": "string",
-                                      "const": "llm-grader"
-                                    },
-                                    "prompt": {
-                                      "type": "string"
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
-                                    "model": {
-                                      "type": "string"
+                                    {
+                                      "type": "array",
+                                      "items": {
+                                        "type": "string"
+                                      }
                                     }
-                                  },
-                                  "required": ["type"],
-                                  "additionalProperties": false
+                                  ]
                                 }
-                              ]
-                            }
-                          },
-                          "required": ["type", "aggregator"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "mode"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["tool-trajectory", "tool_trajectory"]
-                            },
-                            "mode": {
-                              "type": "string",
-                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
-                            },
-                            "minimums": {
-                              "type": "object",
-                              "additionalProperties": {
-                                "type": "integer",
-                                "minimum": 0
-                              }
-                            },
-                            "expected": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "tool": {
-                                    "type": "string"
-                                  },
-                                  "args": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "const": "any"
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["field-accuracy", "field_accuracy"]
+                                },
+                                "fields": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "path": {
+                                        "type": "string"
                                       },
-                                      {
-                                        "type": "object",
-                                        "additionalProperties": {}
-                                      }
-                                    ]
-                                  },
-                                  "max_duration_ms": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "maxDurationMs": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "args_match": {
-                                    "anyOf": [
-                                      {
+                                      "match": {
                                         "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                        "enum": ["exact", "numeric_tolerance", "date"]
                                       },
-                                      {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    ]
-                                  },
-                                  "argsMatch": {
-                                    "anyOf": [
-                                      {
-                                        "type": "string",
-                                        "enum": ["exact", "ignore", "subset", "superset"]
+                                      "required": {
+                                        "type": "boolean"
                                       },
-                                      {
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "tolerance": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "relative": {
+                                        "type": "boolean"
+                                      },
+                                      "formats": {
                                         "type": "array",
                                         "items": {
                                           "type": "string"
                                         }
                                       }
-                                    ]
-                                  }
-                                },
-                                "required": ["tool"],
-                                "additionalProperties": false
-                              }
-                            },
-                            "args_match": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            },
-                            "argsMatch": {
-                              "anyOf": [
-                                {
-                                  "type": "string",
-                                  "enum": ["exact", "ignore", "subset", "superset"]
-                                },
-                                {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
-                                  }
-                                }
-                              ]
-                            }
-                          },
-                          "required": ["type", "mode"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
-                                },
-                                {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["field-accuracy", "field_accuracy"]
-                            },
-                            "fields": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "path": {
-                                    "type": "string"
-                                  },
-                                  "match": {
-                                    "type": "string",
-                                    "enum": ["exact", "numeric_tolerance", "date"]
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "tolerance": {
-                                    "type": "number",
-                                    "minimum": 0
-                                  },
-                                  "relative": {
-                                    "type": "boolean"
+                                    },
+                                    "required": ["path", "match"],
+                                    "additionalProperties": false
                                   },
-                                  "formats": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "string"
-                                    }
-                                  }
+                                  "minItems": 1
                                 },
-                                "required": ["path", "match"],
-                                "additionalProperties": false
+                                "aggregation": {
+                                  "type": "string",
+                                  "enum": ["weighted_average", "all_or_nothing"]
+                                }
                               },
-                              "minItems": 1
-                            },
-                            "aggregation": {
-                              "type": "string",
-                              "enum": ["weighted_average", "all_or_nothing"]
-                            }
-                          },
-                          "required": ["type", "fields"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              "required": ["type", "fields"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "latency"
+                                },
+                                "threshold": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "latency"
-                            },
-                            "threshold": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "threshold"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "threshold"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "cost"
+                                },
+                                "budget": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "cost"
-                            },
-                            "budget": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type", "budget"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "budget"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["token-usage", "token_usage"]
-                            },
-                            "max_total": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_input": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_output": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["token-usage", "token_usage"]
+                                },
+                                "max_total": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_input": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_output": {
+                                  "type": "number",
+                                  "minimum": 0
+                                }
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
-                                }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["execution-metrics", "execution_metrics"]
-                            },
-                            "max_tool_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_llm_calls": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_tokens": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_cost_usd": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "max_duration_ms": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "target_exploration_ratio": {
-                              "type": "number",
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "exploration_tolerance": {
-                              "type": "number",
-                              "minimum": 0
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
-                            },
-                            "required": {
-                              "anyOf": [
-                                {
+                                },
+                                "negate": {
                                   "type": "boolean"
                                 },
-                                {
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["execution-metrics", "execution_metrics"]
+                                },
+                                "max_tool_calls": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_llm_calls": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_tokens": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_cost_usd": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "max_duration_ms": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "target_exploration_ratio": {
                                   "type": "number",
-                                  "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "exploration_tolerance": {
+                                  "type": "number",
+                                  "minimum": 0
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "contains"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "contains"
+                                },
+                                "value": {
+                                  "type": "string"
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "regex"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
                                 },
-                                {
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "regex"
+                                },
+                                "value": {
+                                  "type": "string"
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "enum": ["is-json", "is_json"]
-                            }
-                          },
-                          "required": ["type"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["is-json", "is_json"]
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "equals"
-                            },
-                            "value": {
-                              "type": "string"
-                            }
-                          },
-                          "required": ["type", "value"],
-                          "additionalProperties": false
-                        },
-                        {
-                          "type": "object",
-                          "properties": {
-                            "name": {
-                              "type": "string"
-                            },
-                            "weight": {
-                              "type": "number",
-                              "minimum": 0
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
                             },
-                            "required": {
-                              "anyOf": [
-                                {
-                                  "type": "boolean"
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
                                 },
-                                {
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
                                   "type": "number",
                                   "exclusiveMinimum": true,
                                   "minimum": 0,
                                   "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "equals"
+                                },
+                                "value": {
+                                  "type": "string"
                                 }
-                              ]
-                            },
-                            "min_score": {
-                              "type": "number",
-                              "exclusiveMinimum": true,
-                              "minimum": 0,
-                              "maximum": 1
-                            },
-                            "negate": {
-                              "type": "boolean"
-                            },
-                            "type": {
-                              "type": "string",
-                              "const": "rubrics"
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
                             },
-                            "criteria": {
-                              "type": "array",
-                              "items": {
-                                "type": "object",
-                                "properties": {
-                                  "id": {
-                                    "type": "string"
-                                  },
-                                  "outcome": {
-                                    "type": "string"
-                                  },
-                                  "weight": {
-                                    "type": "number"
-                                  },
-                                  "required": {
-                                    "type": "boolean"
-                                  },
-                                  "min_score": {
-                                    "type": "number",
-                                    "exclusiveMinimum": true,
-                                    "minimum": 0,
-                                    "maximum": 1
-                                  },
-                                  "score_ranges": {
-                                    "type": "array",
-                                    "items": {
-                                      "type": "object",
-                                      "properties": {
-                                        "score_range": {
-                                          "type": "array",
-                                          "minItems": 2,
-                                          "maxItems": 2,
-                                          "items": [
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "rubrics"
+                                },
+                                "criteria": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
+                                        "type": "string"
+                                      },
+                                      "outcome": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
+                                              "type": "array",
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
                                             },
-                                            {
-                                              "type": "integer",
-                                              "minimum": 0,
-                                              "maximum": 10
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
                                             }
-                                          ]
-                                        },
-                                        "outcome": {
-                                          "type": "string",
-                                          "minLength": 1
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
                                         }
-                                      },
-                                      "required": ["score_range", "outcome"],
-                                      "additionalProperties": false
-                                    }
-                                  }
-                                },
-                                "additionalProperties": false
+                                      }
+                                    },
+                                    "additionalProperties": false
+                                  },
+                                  "minItems": 1
+                                }
                               },
-                              "minItems": 1
+                              "required": ["type", "criteria"],
+                              "additionalProperties": false
                             }
-                          },
-                          "required": ["type", "criteria"],
-                          "additionalProperties": false
-                        }
-                      ]
-                    }
-                  },
-                  "execution": {
-                    "type": "object",
-                    "properties": {
-                      "target": {
-                        "type": "string"
-                      },
-                      "targets": {
-                        "type": "array",
-                        "items": {
-                          "type": "string"
+                          ]
                         }
                       },
-                      "workers": {
-                        "type": "integer",
-                        "minimum": 1,
-                        "maximum": 50
-                      },
-                      "assertions": {
+                      "evaluators": {
                         "type": "array",
                         "items": {
                           "anyOf": [
@@ -8394,12 +10774,181 @@
                                   "minimum": 0,
                                   "maximum": 1
                                 },
-                                "exploration_tolerance": {
-                                  "type": "number",
-                                  "minimum": 0
+                                "exploration_tolerance": {
+                                  "type": "number",
+                                  "minimum": 0
+                                }
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
+                            },
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "contains"
+                                },
+                                "value": {
+                                  "type": "string"
+                                }
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
+                            },
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "regex"
+                                },
+                                "value": {
+                                  "type": "string"
+                                }
+                              },
+                              "required": ["type", "value"],
+                              "additionalProperties": false
+                            },
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "enum": ["is-json", "is_json"]
+                                }
+                              },
+                              "required": ["type"],
+                              "additionalProperties": false
+                            },
+                            {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string"
+                                },
+                                "weight": {
+                                  "type": "number",
+                                  "minimum": 0
+                                },
+                                "required": {
+                                  "anyOf": [
+                                    {
+                                      "type": "boolean"
+                                    },
+                                    {
+                                      "type": "number",
+                                      "exclusiveMinimum": true,
+                                      "minimum": 0,
+                                      "maximum": 1
+                                    }
+                                  ]
+                                },
+                                "min_score": {
+                                  "type": "number",
+                                  "exclusiveMinimum": true,
+                                  "minimum": 0,
+                                  "maximum": 1
+                                },
+                                "negate": {
+                                  "type": "boolean"
+                                },
+                                "type": {
+                                  "type": "string",
+                                  "const": "equals"
+                                },
+                                "value": {
+                                  "type": "string"
                                 }
                               },
-                              "required": ["type"],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -8436,342 +10985,537 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "const": "contains"
+                                  "const": "rubrics"
                                 },
-                                "value": {
+                                "criteria": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "id": {
+                                        "type": "string"
+                                      },
+                                      "outcome": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number"
+                                      },
+                                      "required": {
+                                        "type": "boolean"
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "score_ranges": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "score_range": {
+                                              "type": "array",
+                                              "minItems": 2,
+                                              "maxItems": 2,
+                                              "items": [
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                },
+                                                {
+                                                  "type": "integer",
+                                                  "minimum": 0,
+                                                  "maximum": 10
+                                                }
+                                              ]
+                                            },
+                                            "outcome": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            }
+                                          },
+                                          "required": ["score_range", "outcome"],
+                                          "additionalProperties": false
+                                        }
+                                      }
+                                    },
+                                    "additionalProperties": false
+                                  },
+                                  "minItems": 1
+                                }
+                              },
+                              "required": ["type", "criteria"],
+                              "additionalProperties": false
+                            }
+                          ]
+                        }
+                      },
+                      "skip_defaults": {
+                        "type": "boolean"
+                      },
+                      "cache": {
+                        "type": "boolean"
+                      },
+                      "trials": {
+                        "type": "object",
+                        "properties": {
+                          "count": {
+                            "type": "integer",
+                            "minimum": 1
+                          },
+                          "strategy": {
+                            "type": "string",
+                            "enum": ["pass_at_k", "mean", "confidence_interval"]
+                          },
+                          "cost_limit_usd": {
+                            "type": "number",
+                            "minimum": 0
+                          },
+                          "costLimitUsd": {
+                            "type": "number",
+                            "minimum": 0
+                          }
+                        },
+                        "required": ["count"],
+                        "additionalProperties": false
+                      },
+                      "total_budget_usd": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "totalBudgetUsd": {
+                        "type": "number",
+                        "minimum": 0
+                      },
+                      "fail_on_error": {
+                        "type": "boolean"
+                      },
+                      "failOnError": {
+                        "type": "boolean"
+                      },
+                      "threshold": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 1
+                      }
+                    },
+                    "additionalProperties": false
+                  },
+                  "workspace": {
+                    "type": "object",
+                    "properties": {
+                      "template": {
+                        "type": "string"
+                      },
+                      "isolation": {
+                        "type": "string",
+                        "enum": ["shared", "per_test"]
+                      },
+                      "repos": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "path": {
+                              "type": "string"
+                            },
+                            "source": {
+                              "anyOf": [
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "git"
+                                    },
+                                    "url": {
+                                      "type": "string",
+                                      "format": "uri"
+                                    }
+                                  },
+                                  "required": ["type", "url"],
+                                  "additionalProperties": false
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "const": "local"
+                                    },
+                                    "path": {
+                                      "type": "string"
+                                    }
+                                  },
+                                  "required": ["type", "path"],
+                                  "additionalProperties": false
+                                }
+                              ]
+                            },
+                            "checkout": {
+                              "type": "object",
+                              "properties": {
+                                "ref": {
+                                  "type": "string"
+                                },
+                                "base_commit": {
+                                  "type": "string",
+                                  "minLength": 1
+                                },
+                                "resolve": {
+                                  "type": "string",
+                                  "enum": ["remote", "local"]
+                                },
+                                "ancestor": {
+                                  "type": "integer",
+                                  "minimum": 0
+                                }
+                              },
+                              "additionalProperties": false
+                            },
+                            "clone": {
+                              "type": "object",
+                              "properties": {
+                                "depth": {
+                                  "type": "integer",
+                                  "minimum": 1
+                                },
+                                "filter": {
+                                  "type": "string"
+                                },
+                                "sparse": {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "string"
+                                  }
+                                }
+                              },
+                              "additionalProperties": false
+                            }
+                          },
+                          "additionalProperties": false
+                        }
+                      },
+                      "hooks": {
+                        "type": "object",
+                        "properties": {
+                          "enabled": {
+                            "type": "boolean"
+                          },
+                          "before_all": {
+                            "type": "object",
+                            "properties": {
+                              "command": {
+                                "type": "array",
+                                "items": {
+                                  "type": "string"
+                                }
+                              },
+                              "script": {
+                                "type": "array",
+                                "items": {
                                   "type": "string"
                                 }
                               },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
+                              "timeout_ms": {
+                                "type": "number"
+                              },
+                              "timeoutMs": {
+                                "type": "number"
+                              },
+                              "cwd": {
+                                "type": "string"
+                              },
+                              "reset": {
+                                "type": "string",
+                                "enum": ["none", "fast", "strict"]
+                              }
                             },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
+                            "additionalProperties": false
+                          },
+                          "before_each": {
+                            "type": "object",
+                            "properties": {
+                              "command": {
+                                "type": "array",
+                                "items": {
                                   "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "regex"
-                                },
-                                "value": {
+                                }
+                              },
+                              "script": {
+                                "type": "array",
+                                "items": {
                                   "type": "string"
                                 }
                               },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
+                              "timeout_ms": {
+                                "type": "number"
+                              },
+                              "timeoutMs": {
+                                "type": "number"
+                              },
+                              "cwd": {
+                                "type": "string"
+                              },
+                              "reset": {
+                                "type": "string",
+                                "enum": ["none", "fast", "strict"]
+                              }
                             },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
+                            "additionalProperties": false
+                          },
+                          "after_each": {
+                            "type": "object",
+                            "properties": {
+                              "command": {
+                                "type": "array",
+                                "items": {
                                   "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["is-json", "is_json"]
                                 }
                               },
-                              "required": ["type"],
-                              "additionalProperties": false
+                              "script": {
+                                "type": "array",
+                                "items": {
+                                  "type": "string"
+                                }
+                              },
+                              "timeout_ms": {
+                                "type": "number"
+                              },
+                              "timeoutMs": {
+                                "type": "number"
+                              },
+                              "cwd": {
+                                "type": "string"
+                              },
+                              "reset": {
+                                "type": "string",
+                                "enum": ["none", "fast", "strict"]
+                              }
                             },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
+                            "additionalProperties": false
+                          },
+                          "after_all": {
+                            "type": "object",
+                            "properties": {
+                              "command": {
+                                "type": "array",
+                                "items": {
                                   "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "equals"
-                                },
-                                "value": {
+                                }
+                              },
+                              "script": {
+                                "type": "array",
+                                "items": {
                                   "type": "string"
                                 }
                               },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
+                              "timeout_ms": {
+                                "type": "number"
+                              },
+                              "timeoutMs": {
+                                "type": "number"
+                              },
+                              "cwd": {
+                                "type": "string"
+                              },
+                              "reset": {
+                                "type": "string",
+                                "enum": ["none", "fast", "strict"]
+                              }
+                            },
+                            "additionalProperties": false
+                          }
+                        },
+                        "additionalProperties": false
+                      },
+                      "mode": {
+                        "type": "string",
+                        "enum": ["pooled", "temp", "static"]
+                      },
+                      "path": {
+                        "type": "string"
+                      },
+                      "docker": {
+                        "type": "object",
+                        "properties": {
+                          "image": {
+                            "type": "string"
+                          },
+                          "timeout": {
+                            "type": "integer",
+                            "minimum": 1
+                          },
+                          "memory": {
+                            "type": "string"
+                          },
+                          "cpus": {
+                            "type": "number",
+                            "minimum": 0.1
+                          }
+                        },
+                        "required": ["image"],
+                        "additionalProperties": false
+                      }
+                    },
+                    "additionalProperties": false
+                  },
+                  "metadata": {
+                    "type": "object",
+                    "additionalProperties": {}
+                  },
+                  "conversation_id": {
+                    "type": "string"
+                  },
+                  "suite": {
+                    "type": "string"
+                  },
+                  "note": {
+                    "type": "string"
+                  },
+                  "depends_on": {
+                    "type": "array",
+                    "items": {
+                      "type": "string"
+                    }
+                  },
+                  "on_dependency_failure": {
+                    "type": "string",
+                    "enum": ["skip", "fail", "run"]
+                  },
+                  "mode": {
+                    "type": "string",
+                    "enum": ["conversation"]
+                  },
+                  "turns": {
+                    "type": "array",
+                    "items": {
+                      "type": "object",
+                      "properties": {
+                        "input": {
+                          "anyOf": [
+                            {
+                              "type": "string"
                             },
                             {
-                              "type": "object",
-                              "properties": {
-                                "name": {
+                              "anyOf": [
+                                {
                                   "type": "string"
                                 },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "rubrics"
-                                },
-                                "criteria": {
+                                {
                                   "type": "array",
                                   "items": {
                                     "type": "object",
                                     "properties": {
-                                      "id": {
-                                        "type": "string"
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["text", "file", "image"]
                                       },
-                                      "outcome": {
+                                      "value": {
                                         "type": "string"
-                                      },
-                                      "weight": {
-                                        "type": "number"
-                                      },
-                                      "required": {
-                                        "type": "boolean"
-                                      },
-                                      "min_score": {
-                                        "type": "number",
-                                        "exclusiveMinimum": true,
-                                        "minimum": 0,
-                                        "maximum": 1
-                                      },
-                                      "score_ranges": {
-                                        "type": "array",
-                                        "items": {
-                                          "type": "object",
-                                          "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                },
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                }
-                                              ]
-                                            },
-                                            "outcome": {
-                                              "type": "string",
-                                              "minLength": 1
-                                            }
-                                          },
-                                          "required": ["score_range", "outcome"],
-                                          "additionalProperties": false
-                                        }
                                       }
                                     },
+                                    "required": ["type", "value"],
                                     "additionalProperties": false
-                                  },
-                                  "minItems": 1
+                                  }
                                 }
-                              },
-                              "required": ["type", "criteria"],
-                              "additionalProperties": false
+                              ]
                             }
                           ]
-                        }
-                      },
-                      "evaluators": {
-                        "type": "array",
-                        "items": {
+                        },
+                        "expected_output": {
                           "anyOf": [
                             {
-                              "type": "object",
-                              "properties": {
-                                "name": {
+                              "type": "string"
+                            },
+                            {
+                              "anyOf": [
+                                {
                                   "type": "string"
                                 },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["code-grader", "code_grader"]
-                                },
-                                "command": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                },
-                                "script": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
+                                {
+                                  "type": "array",
+                                  "items": {
+                                    "type": "object",
+                                    "properties": {
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["text", "file", "image"]
+                                      },
+                                      "value": {
                                         "type": "string"
                                       }
-                                    }
-                                  ]
-                                },
-                                "cwd": {
-                                  "type": "string"
-                                },
-                                "target": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
                                     },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "max_calls": {
-                                          "type": "number"
-                                        }
-                                      },
-                                      "additionalProperties": false
-                                    }
-                                  ]
-                                },
-                                "config": {
-                                  "type": "object",
-                                  "additionalProperties": {}
-                                },
-                                "preprocessors": {
-                                  "type": "array",
-                                  "items": {
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  }
+                                }
+                              ]
+                            }
+                          ]
+                        },
+                        "assertions": {
+                          "type": "array",
+                          "items": {
+                            "anyOf": [
+                              {
+                                "type": "string"
+                              },
+                              {
+                                "anyOf": [
+                                  {
                                     "type": "object",
                                     "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
                                       "type": {
                                         "type": "string",
-                                        "minLength": 1
+                                        "enum": ["code-grader", "code_grader"]
+                                      },
+                                      "command": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "array",
+                                            "items": {
+                                              "type": "string"
+                                            }
+                                          }
+                                        ]
                                       },
-                                      "command": {
+                                      "script": {
                                         "anyOf": [
                                           {
                                             "type": "string"
@@ -8783,408 +11527,492 @@
                                             }
                                           }
                                         ]
+                                      },
+                                      "cwd": {
+                                        "type": "string"
+                                      },
+                                      "target": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "max_calls": {
+                                                "type": "number"
+                                              }
+                                            },
+                                            "additionalProperties": false
+                                          }
+                                        ]
+                                      },
+                                      "config": {
+                                        "type": "object",
+                                        "additionalProperties": {}
+                                      },
+                                      "preprocessors": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "type": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            },
+                                            "command": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string"
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            }
+                                          },
+                                          "required": ["type", "command"],
+                                          "additionalProperties": false
+                                        }
                                       }
                                     },
                                     "required": ["type", "command"],
                                     "additionalProperties": false
-                                  }
-                                }
-                              },
-                              "required": ["type", "command"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["llm-grader", "llm_grader"]
-                                },
-                                "prompt": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string"
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "command": {
-                                          "anyOf": [
-                                            {
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["llm-grader", "llm_grader"]
+                                      },
+                                      "prompt": {
+                                        "anyOf": [
+                                          {
+                                            "type": "string"
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "command": {
+                                                "anyOf": [
+                                                  {
+                                                    "type": "string"
+                                                  },
+                                                  {
+                                                    "type": "array",
+                                                    "items": {
+                                                      "type": "string"
+                                                    }
+                                                  }
+                                                ]
+                                              },
+                                              "script": {
+                                                "anyOf": [
+                                                  {
+                                                    "type": "string"
+                                                  },
+                                                  {
+                                                    "type": "array",
+                                                    "items": {
+                                                      "type": "string"
+                                                    }
+                                                  }
+                                                ]
+                                              },
+                                              "config": {
+                                                "type": "object",
+                                                "additionalProperties": {}
+                                              }
+                                            },
+                                            "additionalProperties": false
+                                          }
+                                        ]
+                                      },
+                                      "rubrics": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "id": {
                                               "type": "string"
                                             },
-                                            {
-                                              "type": "array",
-                                              "items": {
-                                                "type": "string"
-                                              }
-                                            }
-                                          ]
-                                        },
-                                        "script": {
-                                          "anyOf": [
-                                            {
+                                            "outcome": {
                                               "type": "string"
                                             },
-                                            {
+                                            "weight": {
+                                              "type": "number"
+                                            },
+                                            "required": {
+                                              "type": "boolean"
+                                            },
+                                            "min_score": {
+                                              "type": "number",
+                                              "exclusiveMinimum": true,
+                                              "minimum": 0,
+                                              "maximum": 1
+                                            },
+                                            "score_ranges": {
                                               "type": "array",
                                               "items": {
-                                                "type": "string"
+                                                "type": "object",
+                                                "properties": {
+                                                  "score_range": {
+                                                    "type": "array",
+                                                    "minItems": 2,
+                                                    "maxItems": 2,
+                                                    "items": [
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      },
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      }
+                                                    ]
+                                                  },
+                                                  "outcome": {
+                                                    "type": "string",
+                                                    "minLength": 1
+                                                  }
+                                                },
+                                                "required": ["score_range", "outcome"],
+                                                "additionalProperties": false
                                               }
                                             }
-                                          ]
-                                        },
-                                        "config": {
-                                          "type": "object",
-                                          "additionalProperties": {}
+                                          },
+                                          "additionalProperties": false
                                         }
                                       },
-                                      "additionalProperties": false
-                                    }
-                                  ]
-                                },
-                                "rubrics": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "object",
-                                    "properties": {
-                                      "id": {
+                                      "model": {
                                         "type": "string"
                                       },
-                                      "outcome": {
+                                      "target": {
                                         "type": "string"
                                       },
-                                      "weight": {
-                                        "type": "number"
+                                      "config": {
+                                        "type": "object",
+                                        "additionalProperties": {}
                                       },
-                                      "required": {
-                                        "type": "boolean"
+                                      "max_steps": {
+                                        "type": "integer",
+                                        "minimum": 1,
+                                        "maximum": 50
                                       },
-                                      "min_score": {
+                                      "temperature": {
                                         "type": "number",
-                                        "exclusiveMinimum": true,
                                         "minimum": 0,
-                                        "maximum": 1
+                                        "maximum": 2
                                       },
-                                      "score_ranges": {
+                                      "preprocessors": {
                                         "type": "array",
                                         "items": {
                                           "type": "object",
                                           "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
+                                            "type": {
+                                              "type": "string",
+                                              "minLength": 1
+                                            },
+                                            "command": {
+                                              "anyOf": [
                                                 {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
+                                                  "type": "string"
                                                 },
                                                 {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
                                                 }
                                               ]
-                                            },
-                                            "outcome": {
-                                              "type": "string",
-                                              "minLength": 1
                                             }
                                           },
-                                          "required": ["score_range", "outcome"],
+                                          "required": ["type", "command"],
                                           "additionalProperties": false
                                         }
                                       }
                                     },
+                                    "required": ["type"],
                                     "additionalProperties": false
-                                  }
-                                },
-                                "model": {
-                                  "type": "string"
-                                },
-                                "target": {
-                                  "type": "string"
-                                },
-                                "config": {
-                                  "type": "object",
-                                  "additionalProperties": {}
-                                },
-                                "max_steps": {
-                                  "type": "integer",
-                                  "minimum": 1,
-                                  "maximum": 50
-                                },
-                                "temperature": {
-                                  "type": "number",
-                                  "minimum": 0,
-                                  "maximum": 2
-                                },
-                                "preprocessors": {
-                                  "type": "array",
-                                  "items": {
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "include": {
+                                        "type": "string",
+                                        "minLength": 1
+                                      }
+                                    },
+                                    "required": ["include"],
+                                    "additionalProperties": false
+                                  },
+                                  {
                                     "type": "object",
                                     "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
                                       "type": {
                                         "type": "string",
-                                        "minLength": 1
+                                        "const": "composite"
                                       },
-                                      "command": {
+                                      "assertions": {
+                                        "type": "array",
+                                        "items": {}
+                                      },
+                                      "evaluators": {
+                                        "type": "array",
+                                        "items": {}
+                                      },
+                                      "aggregator": {
                                         "anyOf": [
                                           {
-                                            "type": "string"
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "weighted_average"
+                                              },
+                                              "weights": {
+                                                "type": "object",
+                                                "additionalProperties": {
+                                                  "type": "number"
+                                                }
+                                              }
+                                            },
+                                            "required": ["type"],
+                                            "additionalProperties": false
                                           },
                                           {
-                                            "type": "array",
-                                            "items": {
-                                              "type": "string"
-                                            }
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "threshold"
+                                              },
+                                              "threshold": {
+                                                "type": "number",
+                                                "minimum": 0,
+                                                "maximum": 1
+                                              }
+                                            },
+                                            "required": ["type", "threshold"],
+                                            "additionalProperties": false
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "code-grader"
+                                              },
+                                              "path": {
+                                                "type": "string"
+                                              },
+                                              "cwd": {
+                                                "type": "string"
+                                              }
+                                            },
+                                            "required": ["type", "path"],
+                                            "additionalProperties": false
+                                          },
+                                          {
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "const": "llm-grader"
+                                              },
+                                              "prompt": {
+                                                "type": "string"
+                                              },
+                                              "model": {
+                                                "type": "string"
+                                              }
+                                            },
+                                            "required": ["type"],
+                                            "additionalProperties": false
                                           }
                                         ]
                                       }
                                     },
-                                    "required": ["type", "command"],
+                                    "required": ["type", "aggregator"],
                                     "additionalProperties": false
-                                  }
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "include": {
-                                  "type": "string",
-                                  "minLength": 1
-                                }
-                              },
-                              "required": ["include"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "composite"
-                                },
-                                "assertions": {
-                                  "type": "array",
-                                  "items": {}
-                                },
-                                "evaluators": {
-                                  "type": "array",
-                                  "items": {}
-                                },
-                                "aggregator": {
-                                  "anyOf": [
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "weighted_average"
-                                        },
-                                        "weights": {
-                                          "type": "object",
-                                          "additionalProperties": {
-                                            "type": "number"
-                                          }
-                                        }
-                                      },
-                                      "required": ["type"],
-                                      "additionalProperties": false
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "threshold"
-                                        },
-                                        "threshold": {
-                                          "type": "number",
-                                          "minimum": 0,
-                                          "maximum": 1
-                                        }
-                                      },
-                                      "required": ["type", "threshold"],
-                                      "additionalProperties": false
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "code-grader"
-                                        },
-                                        "path": {
-                                          "type": "string"
-                                        },
-                                        "cwd": {
-                                          "type": "string"
-                                        }
-                                      },
-                                      "required": ["type", "path"],
-                                      "additionalProperties": false
-                                    },
-                                    {
-                                      "type": "object",
-                                      "properties": {
-                                        "type": {
-                                          "type": "string",
-                                          "const": "llm-grader"
-                                        },
-                                        "prompt": {
-                                          "type": "string"
-                                        },
-                                        "model": {
-                                          "type": "string"
-                                        }
-                                      },
-                                      "required": ["type"],
-                                      "additionalProperties": false
-                                    }
-                                  ]
-                                }
-                              },
-                              "required": ["type", "aggregator"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["tool-trajectory", "tool_trajectory"]
-                                },
-                                "mode": {
-                                  "type": "string",
-                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
-                                },
-                                "minimums": {
-                                  "type": "object",
-                                  "additionalProperties": {
-                                    "type": "integer",
-                                    "minimum": 0
-                                  }
-                                },
-                                "expected": {
-                                  "type": "array",
-                                  "items": {
+                                  },
+                                  {
                                     "type": "object",
                                     "properties": {
-                                      "tool": {
+                                      "name": {
                                         "type": "string"
                                       },
-                                      "args": {
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
                                         "anyOf": [
                                           {
-                                            "type": "string",
-                                            "const": "any"
+                                            "type": "boolean"
                                           },
                                           {
-                                            "type": "object",
-                                            "additionalProperties": {}
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
                                           }
                                         ]
                                       },
-                                      "max_duration_ms": {
+                                      "min_score": {
                                         "type": "number",
-                                        "minimum": 0
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["tool-trajectory", "tool_trajectory"]
+                                      },
+                                      "mode": {
+                                        "type": "string",
+                                        "enum": [
+                                          "any_order",
+                                          "in_order",
+                                          "exact",
+                                          "subset",
+                                          "superset"
+                                        ]
+                                      },
+                                      "minimums": {
+                                        "type": "object",
+                                        "additionalProperties": {
+                                          "type": "integer",
+                                          "minimum": 0
+                                        }
                                       },
-                                      "maxDurationMs": {
-                                        "type": "number",
-                                        "minimum": 0
+                                      "expected": {
+                                        "type": "array",
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "tool": {
+                                              "type": "string"
+                                            },
+                                            "args": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string",
+                                                  "const": "any"
+                                                },
+                                                {
+                                                  "type": "object",
+                                                  "additionalProperties": {}
+                                                }
+                                              ]
+                                            },
+                                            "max_duration_ms": {
+                                              "type": "number",
+                                              "minimum": 0
+                                            },
+                                            "maxDurationMs": {
+                                              "type": "number",
+                                              "minimum": 0
+                                            },
+                                            "args_match": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string",
+                                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            },
+                                            "argsMatch": {
+                                              "anyOf": [
+                                                {
+                                                  "type": "string",
+                                                  "enum": ["exact", "ignore", "subset", "superset"]
+                                                },
+                                                {
+                                                  "type": "array",
+                                                  "items": {
+                                                    "type": "string"
+                                                  }
+                                                }
+                                              ]
+                                            }
+                                          },
+                                          "required": ["tool"],
+                                          "additionalProperties": false
+                                        }
                                       },
                                       "args_match": {
                                         "anyOf": [
@@ -9215,935 +12043,589 @@
                                         ]
                                       }
                                     },
-                                    "required": ["tool"],
+                                    "required": ["type", "mode"],
                                     "additionalProperties": false
-                                  }
-                                },
-                                "args_match": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string",
-                                      "enum": ["exact", "ignore", "subset", "superset"]
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                },
-                                "argsMatch": {
-                                  "anyOf": [
-                                    {
-                                      "type": "string",
-                                      "enum": ["exact", "ignore", "subset", "superset"]
-                                    },
-                                    {
-                                      "type": "array",
-                                      "items": {
-                                        "type": "string"
-                                      }
-                                    }
-                                  ]
-                                }
-                              },
-                              "required": ["type", "mode"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["field-accuracy", "field_accuracy"]
-                                },
-                                "fields": {
-                                  "type": "array",
-                                  "items": {
+                                  },
+                                  {
                                     "type": "object",
                                     "properties": {
-                                      "path": {
+                                      "name": {
                                         "type": "string"
                                       },
-                                      "match": {
-                                        "type": "string",
-                                        "enum": ["exact", "numeric_tolerance", "date"]
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
                                       },
                                       "required": {
-                                        "type": "boolean"
-                                      },
-                                      "weight": {
-                                        "type": "number"
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
                                       },
-                                      "tolerance": {
+                                      "min_score": {
                                         "type": "number",
-                                        "minimum": 0
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
                                       },
-                                      "relative": {
+                                      "negate": {
                                         "type": "boolean"
                                       },
-                                      "formats": {
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["field-accuracy", "field_accuracy"]
+                                      },
+                                      "fields": {
                                         "type": "array",
-                                        "items": {
-                                          "type": "string"
-                                        }
-                                      }
-                                    },
-                                    "required": ["path", "match"],
-                                    "additionalProperties": false
-                                  },
-                                  "minItems": 1
-                                },
-                                "aggregation": {
-                                  "type": "string",
-                                  "enum": ["weighted_average", "all_or_nothing"]
-                                }
-                              },
-                              "required": ["type", "fields"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "latency"
-                                },
-                                "threshold": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type", "threshold"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
-                                    },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "cost"
-                                },
-                                "budget": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type", "budget"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                        "items": {
+                                          "type": "object",
+                                          "properties": {
+                                            "path": {
+                                              "type": "string"
+                                            },
+                                            "match": {
+                                              "type": "string",
+                                              "enum": ["exact", "numeric_tolerance", "date"]
+                                            },
+                                            "required": {
+                                              "type": "boolean"
+                                            },
+                                            "weight": {
+                                              "type": "number"
+                                            },
+                                            "tolerance": {
+                                              "type": "number",
+                                              "minimum": 0
+                                            },
+                                            "relative": {
+                                              "type": "boolean"
+                                            },
+                                            "formats": {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "string"
+                                              }
+                                            }
+                                          },
+                                          "required": ["path", "match"],
+                                          "additionalProperties": false
+                                        },
+                                        "minItems": 1
+                                      },
+                                      "aggregation": {
+                                        "type": "string",
+                                        "enum": ["weighted_average", "all_or_nothing"]
+                                      }
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["token-usage", "token_usage"]
-                                },
-                                "max_total": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_input": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_output": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "required": ["type", "fields"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "latency"
+                                      },
+                                      "threshold": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["execution-metrics", "execution_metrics"]
-                                },
-                                "max_tool_calls": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_llm_calls": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_tokens": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_cost_usd": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "max_duration_ms": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "target_exploration_ratio": {
-                                  "type": "number",
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "exploration_tolerance": {
-                                  "type": "number",
-                                  "minimum": 0
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "required": ["type", "threshold"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "cost"
+                                      },
+                                      "budget": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "contains"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "required": ["type", "budget"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["token-usage", "token_usage"]
+                                      },
+                                      "max_total": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_input": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_output": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "regex"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "required": ["type"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["execution-metrics", "execution_metrics"]
+                                      },
+                                      "max_tool_calls": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_llm_calls": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_tokens": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_cost_usd": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "max_duration_ms": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "target_exploration_ratio": {
+                                        "type": "number",
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "exploration_tolerance": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      }
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "enum": ["is-json", "is_json"]
-                                }
-                              },
-                              "required": ["type"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "required": ["type"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "contains"
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "equals"
-                                },
-                                "value": {
-                                  "type": "string"
-                                }
-                              },
-                              "required": ["type", "value"],
-                              "additionalProperties": false
-                            },
-                            {
-                              "type": "object",
-                              "properties": {
-                                "name": {
-                                  "type": "string"
-                                },
-                                "weight": {
-                                  "type": "number",
-                                  "minimum": 0
-                                },
-                                "required": {
-                                  "anyOf": [
-                                    {
-                                      "type": "boolean"
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "regex"
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
                                     },
-                                    {
-                                      "type": "number",
-                                      "exclusiveMinimum": true,
-                                      "minimum": 0,
-                                      "maximum": 1
-                                    }
-                                  ]
-                                },
-                                "min_score": {
-                                  "type": "number",
-                                  "exclusiveMinimum": true,
-                                  "minimum": 0,
-                                  "maximum": 1
-                                },
-                                "negate": {
-                                  "type": "boolean"
-                                },
-                                "type": {
-                                  "type": "string",
-                                  "const": "rubrics"
-                                },
-                                "criteria": {
-                                  "type": "array",
-                                  "items": {
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  },
+                                  {
                                     "type": "object",
                                     "properties": {
-                                      "id": {
+                                      "name": {
                                         "type": "string"
                                       },
-                                      "outcome": {
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "enum": ["is-json", "is_json"]
+                                      }
+                                    },
+                                    "required": ["type"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
                                         "type": "string"
                                       },
                                       "weight": {
-                                        "type": "number"
+                                        "type": "number",
+                                        "minimum": 0
                                       },
                                       "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
+                                      "min_score": {
+                                        "type": "number",
+                                        "exclusiveMinimum": true,
+                                        "minimum": 0,
+                                        "maximum": 1
+                                      },
+                                      "negate": {
                                         "type": "boolean"
                                       },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "equals"
+                                      },
+                                      "value": {
+                                        "type": "string"
+                                      }
+                                    },
+                                    "required": ["type", "value"],
+                                    "additionalProperties": false
+                                  },
+                                  {
+                                    "type": "object",
+                                    "properties": {
+                                      "name": {
+                                        "type": "string"
+                                      },
+                                      "weight": {
+                                        "type": "number",
+                                        "minimum": 0
+                                      },
+                                      "required": {
+                                        "anyOf": [
+                                          {
+                                            "type": "boolean"
+                                          },
+                                          {
+                                            "type": "number",
+                                            "exclusiveMinimum": true,
+                                            "minimum": 0,
+                                            "maximum": 1
+                                          }
+                                        ]
+                                      },
                                       "min_score": {
                                         "type": "number",
                                         "exclusiveMinimum": true,
                                         "minimum": 0,
                                         "maximum": 1
                                       },
-                                      "score_ranges": {
+                                      "negate": {
+                                        "type": "boolean"
+                                      },
+                                      "type": {
+                                        "type": "string",
+                                        "const": "rubrics"
+                                      },
+                                      "criteria": {
                                         "type": "array",
                                         "items": {
                                           "type": "object",
                                           "properties": {
-                                            "score_range": {
-                                              "type": "array",
-                                              "minItems": 2,
-                                              "maxItems": 2,
-                                              "items": [
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                },
-                                                {
-                                                  "type": "integer",
-                                                  "minimum": 0,
-                                                  "maximum": 10
-                                                }
-                                              ]
+                                            "id": {
+                                              "type": "string"
                                             },
                                             "outcome": {
-                                              "type": "string",
-                                              "minLength": 1
-                                            }
-                                          },
-                                          "required": ["score_range", "outcome"],
-                                          "additionalProperties": false
-                                        }
-                                      }
-                                    },
-                                    "additionalProperties": false
-                                  },
-                                  "minItems": 1
-                                }
-                              },
-                              "required": ["type", "criteria"],
-                              "additionalProperties": false
-                            }
-                          ]
-                        }
-                      },
-                      "skip_defaults": {
-                        "type": "boolean"
-                      },
-                      "cache": {
-                        "type": "boolean"
-                      },
-                      "trials": {
-                        "type": "object",
-                        "properties": {
-                          "count": {
-                            "type": "integer",
-                            "minimum": 1
-                          },
-                          "strategy": {
-                            "type": "string",
-                            "enum": ["pass_at_k", "mean", "confidence_interval"]
-                          },
-                          "cost_limit_usd": {
-                            "type": "number",
-                            "minimum": 0
-                          },
-                          "costLimitUsd": {
-                            "type": "number",
-                            "minimum": 0
-                          }
-                        },
-                        "required": ["count"],
-                        "additionalProperties": false
-                      },
-                      "total_budget_usd": {
-                        "type": "number",
-                        "minimum": 0
-                      },
-                      "totalBudgetUsd": {
-                        "type": "number",
-                        "minimum": 0
-                      },
-                      "fail_on_error": {
-                        "type": "boolean"
-                      },
-                      "failOnError": {
-                        "type": "boolean"
-                      },
-                      "threshold": {
-                        "type": "number",
-                        "minimum": 0,
-                        "maximum": 1
-                      }
-                    },
-                    "additionalProperties": false
-                  },
-                  "workspace": {
-                    "type": "object",
-                    "properties": {
-                      "template": {
-                        "type": "string"
-                      },
-                      "isolation": {
-                        "type": "string",
-                        "enum": ["shared", "per_test"]
-                      },
-                      "repos": {
-                        "type": "array",
-                        "items": {
-                          "type": "object",
-                          "properties": {
-                            "path": {
-                              "type": "string"
-                            },
-                            "source": {
-                              "anyOf": [
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "git"
-                                    },
-                                    "url": {
-                                      "type": "string",
-                                      "format": "uri"
-                                    }
-                                  },
-                                  "required": ["type", "url"],
-                                  "additionalProperties": false
-                                },
-                                {
-                                  "type": "object",
-                                  "properties": {
-                                    "type": {
-                                      "type": "string",
-                                      "const": "local"
+                                              "type": "string"
+                                            },
+                                            "weight": {
+                                              "type": "number"
+                                            },
+                                            "required": {
+                                              "type": "boolean"
+                                            },
+                                            "min_score": {
+                                              "type": "number",
+                                              "exclusiveMinimum": true,
+                                              "minimum": 0,
+                                              "maximum": 1
+                                            },
+                                            "score_ranges": {
+                                              "type": "array",
+                                              "items": {
+                                                "type": "object",
+                                                "properties": {
+                                                  "score_range": {
+                                                    "type": "array",
+                                                    "minItems": 2,
+                                                    "maxItems": 2,
+                                                    "items": [
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      },
+                                                      {
+                                                        "type": "integer",
+                                                        "minimum": 0,
+                                                        "maximum": 10
+                                                      }
+                                                    ]
+                                                  },
+                                                  "outcome": {
+                                                    "type": "string",
+                                                    "minLength": 1
+                                                  }
+                                                },
+                                                "required": ["score_range", "outcome"],
+                                                "additionalProperties": false
+                                              }
+                                            }
+                                          },
+                                          "additionalProperties": false
+                                        },
+                                        "minItems": 1
+                                      }
                                     },
-                                    "path": {
-                                      "type": "string"
-                                    }
-                                  },
-                                  "required": ["type", "path"],
-                                  "additionalProperties": false
-                                }
-                              ]
-                            },
-                            "checkout": {
-                              "type": "object",
-                              "properties": {
-                                "ref": {
-                                  "type": "string"
-                                },
-                                "base_commit": {
-                                  "type": "string",
-                                  "minLength": 1
-                                },
-                                "resolve": {
-                                  "type": "string",
-                                  "enum": ["remote", "local"]
-                                },
-                                "ancestor": {
-                                  "type": "integer",
-                                  "minimum": 0
-                                }
-                              },
-                              "additionalProperties": false
-                            },
-                            "clone": {
-                              "type": "object",
-                              "properties": {
-                                "depth": {
-                                  "type": "integer",
-                                  "minimum": 1
-                                },
-                                "filter": {
-                                  "type": "string"
-                                },
-                                "sparse": {
-                                  "type": "array",
-                                  "items": {
-                                    "type": "string"
+                                    "required": ["type", "criteria"],
+                                    "additionalProperties": false
                                   }
-                                }
-                              },
-                              "additionalProperties": false
-                            }
-                          },
-                          "additionalProperties": false
-                        }
-                      },
-                      "hooks": {
-                        "type": "object",
-                        "properties": {
-                          "enabled": {
-                            "type": "boolean"
-                          },
-                          "before_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "script": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "before_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "script": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_each": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "script": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
-                              }
-                            },
-                            "additionalProperties": false
-                          },
-                          "after_all": {
-                            "type": "object",
-                            "properties": {
-                              "command": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "script": {
-                                "type": "array",
-                                "items": {
-                                  "type": "string"
-                                }
-                              },
-                              "timeout_ms": {
-                                "type": "number"
-                              },
-                              "timeoutMs": {
-                                "type": "number"
-                              },
-                              "cwd": {
-                                "type": "string"
-                              },
-                              "reset": {
-                                "type": "string",
-                                "enum": ["none", "fast", "strict"]
+                                ]
                               }
-                            },
-                            "additionalProperties": false
+                            ]
                           }
-                        },
-                        "additionalProperties": false
-                      },
-                      "mode": {
-                        "type": "string",
-                        "enum": ["pooled", "temp", "static"]
-                      },
-                      "path": {
-                        "type": "string"
+                        }
                       },
-                      "docker": {
-                        "type": "object",
-                        "properties": {
-                          "image": {
-                            "type": "string"
-                          },
-                          "timeout": {
-                            "type": "integer",
-                            "minimum": 1
-                          },
-                          "memory": {
-                            "type": "string"
-                          },
-                          "cpus": {
-                            "type": "number",
-                            "minimum": 0.1
-                          }
-                        },
-                        "required": ["image"],
-                        "additionalProperties": false
-                      }
+                      "required": ["input"],
+                      "additionalProperties": false
                     },
-                    "additionalProperties": false
-                  },
-                  "metadata": {
-                    "type": "object",
-                    "additionalProperties": {}
-                  },
-                  "conversation_id": {
-                    "type": "string"
-                  },
-                  "suite": {
-                    "type": "string"
-                  },
-                  "note": {
-                    "type": "string"
+                    "minItems": 1
                   },
-                  "depends_on": {
-                    "type": "array",
-                    "items": {
-                      "type": "string"
-                    }
+                  "aggregation": {
+                    "type": "string",
+                    "enum": ["mean", "min", "max"]
                   },
-                  "on_dependency_failure": {
+                  "on_turn_failure": {
                     "type": "string",
-                    "enum": ["skip", "fail", "run"]
+                    "enum": ["continue", "stop"]
+                  },
+                  "window_size": {
+                    "type": "integer",
+                    "minimum": 1
                   }
                 },
                 "required": ["id"],