From e784424954028ed50605ded7e2bb3d8d10b8242d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 05:38:07 +0000 Subject: [PATCH 1/2] feat(core): Docker workspace execution environments Implements Docker-based workspace type for coding benchmarks (SWE-bench). Agent runs on host, grader runs inside container. Closes #965 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 22888f22d55fe54692f742ca1e40e48866acc780 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 05:59:30 +0000 Subject: [PATCH 2/2] feat(core): add Docker workspace execution environments Add support for running code-grader evaluations inside Docker containers, enabling benchmarks like SWE-bench to run in isolated container environments. New YAML schema: workspace: docker: image: <image> timeout: 1800 # seconds memory: 4g # optional cpus: 2 # optional Changes: - Add DockerWorkspaceConfig type and Zod schema - Create DockerWorkspaceProvider with full container lifecycle management (pull, create, start, cp, exec, rm) using execFile for security - Update CodeEvaluator to run graders inside Docker when configured - Thread dockerConfig through orchestrator evaluation pipeline - Pull Docker image once during eval setup phase - Add comprehensive unit tests with mock executor (28 test cases) - Add docker-workspace example with EVAL.yaml - Regenerate eval-schema.json The provider uses a CommandExecutor interface for testability and always cleans up containers in finally blocks, even on errors. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- examples/features/docker-workspace/README.md | 45 + .../evals/docker-example.EVAL.yaml | 31 + .../evaluation/evaluators/code-evaluator.ts | 34 +- .../core/src/evaluation/evaluators/types.ts | 10 +- packages/core/src/evaluation/orchestrator.ts | 26 + packages/core/src/evaluation/types.ts | 18 + .../evaluation/validation/eval-file.schema.ts | 8 + .../evaluation/workspace/docker-workspace.ts | 218 ++ .../core/src/evaluation/workspace/index.ts | 7 + packages/core/src/evaluation/yaml-parser.ts | 24 +- .../workspace/docker-workspace.test.ts | 368 +++ .../references/eval-schema.json | 2541 +++++++++++++---- 12 files changed, 2843 insertions(+), 487 deletions(-) create mode 100644 examples/features/docker-workspace/README.md create mode 100644 examples/features/docker-workspace/evals/docker-example.EVAL.yaml create mode 100644 packages/core/src/evaluation/workspace/docker-workspace.ts create mode 100644 packages/core/test/evaluation/workspace/docker-workspace.test.ts diff --git a/examples/features/docker-workspace/README.md b/examples/features/docker-workspace/README.md new file mode 100644 index 000000000..2e15e675b --- /dev/null +++ b/examples/features/docker-workspace/README.md @@ -0,0 +1,45 @@ +# Docker Workspace Example + +This example demonstrates how to run code-grader evaluations inside Docker containers. + +## Use Case + +When evaluating coding agents (e.g., SWE-bench), the grader script needs to: +1. Apply the agent's patch to a repository +2. Run tests inside the repository's environment +3. Report pass/fail results + +Docker workspaces let you run this grading process inside a pre-built container +image that has the repository, dependencies, and test infrastructure ready. + +## How It Works + +``` +1. AgentV sends prompt to agent target → receives patch/diff output +2. docker pull <image> (once per eval run, cached) +3. For each test case: + a. 
docker create --memory=4g --cpus=2 <image> + b. 
docker start <container> + c. docker exec -i <container> <grader-command> < payload.json + d. Parse grader JSON output (score, assertions) + e. docker rm -f <container> +4. Aggregate results +``` + +## YAML Schema + +```yaml +workspace: + docker: + image: swebench/sweb.eval.x86_64.django__django-15180 + timeout: 1800 # seconds (default: 1800) + memory: 4g # optional Docker memory limit + cpus: 2 # optional Docker CPU limit +``` + +## Running + +```bash +# Requires Docker to be installed and running +bun apps/cli/src/cli.ts eval examples/features/docker-workspace/evals/docker-example.EVAL.yaml +``` diff --git a/examples/features/docker-workspace/evals/docker-example.EVAL.yaml b/examples/features/docker-workspace/evals/docker-example.EVAL.yaml new file mode 100644 index 000000000..4b840afbb --- /dev/null +++ b/examples/features/docker-workspace/evals/docker-example.EVAL.yaml @@ -0,0 +1,31 @@ +# Docker Workspace Example +# Demonstrates running a code-grader inside a Docker container. +# +# This eval sends a coding prompt to an agent, then grades the agent's output +# inside a Docker container that has the target repository and test environment. 
+# +# Prerequisites: +# - Docker installed and running +# - The specified image available (will be pulled automatically) + +name: docker-workspace-example +description: Example eval using Docker workspace for grading + +workspace: + docker: + image: python:3.11-slim + timeout: 300 + memory: 2g + cpus: 1 + +execution: + target: dry-run + workers: 1 + +tests: + - id: hello-world + input: "Write a Python function that returns 'hello world'" + criteria: "The output should contain a working Python function" + assertions: + - type: code-grader + command: ["python", "-c", "import sys, json; data = json.load(sys.stdin); print(json.dumps({'score': 1.0, 'assertions': [{'text': 'grader ran in container', 'passed': True}]}))"] diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts index 980816731..fa47981ab 100644 --- a/packages/core/src/evaluation/evaluators/code-evaluator.ts +++ b/packages/core/src/evaluation/evaluators/code-evaluator.ts @@ -210,13 +210,33 @@ export class CodeEvaluator implements Evaluator { const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : undefined; try { - const stdout = await executeScript( - this.command, - inputPayload, - this.agentTimeoutMs, - this.cwd, - env, - ); + let stdout: string; + if (context.dockerConfig) { + // Docker execution mode: run grader inside a container + const { DockerWorkspaceProvider } = await import('../workspace/docker-workspace.js'); + const dockerProvider = new DockerWorkspaceProvider(context.dockerConfig); + const result = await dockerProvider.runGraderInContainer({ + command: [...this.command], + stdin: inputPayload, + }); + if (result.exitCode !== 0) { + const trimmedErr = result.stderr.trim(); + throw new Error( + trimmedErr.length > 0 + ? 
`Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` + : `Code evaluator exited with code ${result.exitCode}`, + ); + } + stdout = result.stdout.trim(); + } else { + stdout = await executeScript( + this.command, + inputPayload, + this.agentTimeoutMs, + this.cwd, + env, + ); + } const parsed = parseJsonSafe(stdout); const score = clampScore(typeof parsed?.score === 'number' ? parsed.score : 0); const assertions: AssertionEntry[] = Array.isArray(parsed?.assertions) diff --git a/packages/core/src/evaluation/evaluators/types.ts b/packages/core/src/evaluation/evaluators/types.ts index 9a554bab9..6d299d4d4 100644 --- a/packages/core/src/evaluation/evaluators/types.ts +++ b/packages/core/src/evaluation/evaluators/types.ts @@ -1,7 +1,13 @@ import type { ResolvedTarget } from '../providers/targets.js'; import type { ChatPrompt, Message, Provider } from '../providers/types.js'; import type { TokenUsage, TraceSummary } from '../trace.js'; -import type { EvalTest, EvaluationVerdict, EvaluatorConfig, JsonObject } from '../types.js'; +import type { + DockerWorkspaceConfig, + EvalTest, + EvaluationVerdict, + EvaluatorConfig, + JsonObject, +} from '../types.js'; export type { EvaluationVerdict }; @@ -50,6 +56,8 @@ export interface EvaluationContext { readonly fileChanges?: string; /** Absolute path to the workspace directory (when workspace_template is configured) */ readonly workspacePath?: string; + /** Docker workspace config: when present, code-grader commands run inside a container */ + readonly dockerConfig?: DockerWorkspaceConfig; } export interface EvaluationScore { diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 416fa1ba2..4e6306fc7 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -759,6 +759,21 @@ export async function runEvaluation( } } + // --- Docker workspace: pull image once at setup --- + const suiteDockerConfig = 
suiteWorkspace?.docker; + if (suiteDockerConfig) { + setupLog(`pulling Docker image: ${suiteDockerConfig.image}`); + const { DockerWorkspaceProvider } = await import('./workspace/docker-workspace.js'); + const dockerSetup = new DockerWorkspaceProvider(suiteDockerConfig); + if (!(await dockerSetup.isDockerAvailable())) { + throw new Error( + 'Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running.', + ); + } + await dockerSetup.pullImage(); + setupLog('Docker image pull complete'); + } + // Execute before_all (runs ONCE before first test per workspace) const suiteHooksEnabled = hooksEnabled(suiteWorkspace); const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all; @@ -1830,6 +1845,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { @@ -2114,6 +2131,7 @@ async function evaluateCandidate(options: { availableTargets, fileChanges, workspacePath, + dockerConfig, threshold: evalThreshold, } = options; @@ -2141,6 +2159,7 @@ async function evaluateCandidate(options: { availableTargets, fileChanges, workspacePath, + dockerConfig, threshold: evalThreshold, }); @@ -2226,6 +2245,7 @@ async function runEvaluatorsForCase(options: { readonly availableTargets?: readonly string[]; readonly fileChanges?: string; readonly workspacePath?: string; + readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; }): Promise<{ score: EvaluationScore; scores?: EvaluatorResult[] }> { const { @@ -2251,6 +2271,7 @@ async function runEvaluatorsForCase(options: { availableTargets, fileChanges, workspacePath, + dockerConfig, threshold, } = options; @@ -2279,6 +2300,7 @@ async function runEvaluatorsForCase(options: { availableTargets, fileChanges, workspacePath, + dockerConfig, threshold, }); } @@ -2313,6 +2335,7 @@ async function runEvaluatorsForCase(options: { availableTargets, fileChanges, workspacePath, + dockerConfig, ...(implicitEvaluator ? 
{ evaluator: implicitEvaluator } : {}), }); @@ -2357,6 +2380,7 @@ async function runEvaluatorList(options: { readonly availableTargets?: readonly string[]; readonly fileChanges?: string; readonly workspacePath?: string; + readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; }): Promise<{ score: EvaluationScore; scores: EvaluatorResult[] }> { const { @@ -2383,6 +2407,7 @@ async function runEvaluatorList(options: { availableTargets, fileChanges, workspacePath, + dockerConfig, } = options; const scored: Array<{ @@ -2416,6 +2441,7 @@ async function runEvaluatorList(options: { availableTargets, fileChanges, workspacePath, + dockerConfig, }; // Build the dispatch context for evaluator factories diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index ae21d986b..8614d54ee 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -276,6 +276,22 @@ export type WorkspaceHooksConfig = { readonly after_all?: WorkspaceHookConfig; }; +/** + * Docker-based workspace configuration. + * When present, code-grader commands run inside a Docker container + * instead of on the host. + */ +export type DockerWorkspaceConfig = { + /** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */ + readonly image: string; + /** Container execution timeout in seconds (default: 1800) */ + readonly timeout?: number; + /** Memory limit (e.g. '4g', '512m') */ + readonly memory?: string; + /** CPU limit (e.g. 2, 0.5) */ + readonly cpus?: number; +}; + export type WorkspaceConfig = { /** Template directory or .code-workspace file. Directories are copied to temp workspace. * .code-workspace files are used by VS Code providers; CLI providers use the parent directory. 
*/ @@ -290,6 +306,8 @@ export type WorkspaceConfig = { readonly mode?: 'pooled' | 'temp' | 'static'; /** Required when mode=static: use this existing directory directly */ readonly path?: string; + /** Docker-based workspace: run grader commands inside a container */ + readonly docker?: DockerWorkspaceConfig; }; export type CodeEvaluatorConfig = { diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index e0af2feee..46b1ecd1f 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -298,6 +298,13 @@ const WorkspaceHooksSchema = z.object({ after_all: WorkspaceHookSchema.optional(), }); +const DockerWorkspaceSchema = z.object({ + image: z.string(), + timeout: z.number().int().min(1).optional(), + memory: z.string().optional(), + cpus: z.number().min(0.1).optional(), +}); + const WorkspaceSchema = z .object({ template: z.string().optional(), @@ -306,6 +313,7 @@ const WorkspaceSchema = z hooks: WorkspaceHooksSchema.optional(), mode: z.enum(['pooled', 'temp', 'static']).optional(), path: z.string().optional(), + docker: DockerWorkspaceSchema.optional(), }) .strict(); diff --git a/packages/core/src/evaluation/workspace/docker-workspace.ts b/packages/core/src/evaluation/workspace/docker-workspace.ts new file mode 100644 index 000000000..1ce4f25e3 --- /dev/null +++ b/packages/core/src/evaluation/workspace/docker-workspace.ts @@ -0,0 +1,218 @@ +/** + * Docker workspace provider — manages Docker container lifecycle for eval grading. + * + * Flow: pull image → create container → copy files in → exec grader → parse output → destroy container. + * All Docker commands use `execFile` (no shell) for security. + * + * To add a new Docker command: add a method that calls `this.exec(...)` with the appropriate argv. 
+ * + * Design decisions: + * - CommandExecutor interface for testability (mock `execFile` in tests) + * - Always `docker rm -f` in cleanup, even on errors (try/finally) + * - Lazy-loaded: non-Docker evals never import this module + */ + +import type { DockerWorkspaceConfig } from '../types.js'; + +/** Result of a command execution */ +export interface ExecResult { + readonly stdout: string; + readonly stderr: string; + readonly exitCode: number; +} + +/** Abstraction over process execution for testability */ +export interface CommandExecutor { + exec( + argv: readonly string[], + options?: { timeoutMs?: number; stdin?: string }, + ): Promise; +} + +/** + * Default command executor using Bun.spawn / Node child_process. + * Mirrors the pattern in runtime/exec.ts. + */ +export class DefaultCommandExecutor implements CommandExecutor { + async exec( + argv: readonly string[], + options: { timeoutMs?: number; stdin?: string } = {}, + ): Promise { + const { execFileWithStdin } = await import('../../runtime/exec.js'); + return execFileWithStdin(argv, options.stdin ?? '', { + timeoutMs: options.timeoutMs, + }); + } +} + +/** Options for creating a Docker container */ +export interface CreateContainerOptions { + readonly image: string; + readonly memory?: string; + readonly cpus?: number; +} + +/** Options for executing a command inside a container */ +export interface ExecInContainerOptions { + readonly containerId: string; + readonly command: readonly string[]; + readonly timeoutMs?: number; + readonly stdin?: string; +} + +const DEFAULT_TIMEOUT_S = 1800; + +/** + * Manages Docker container lifecycle for workspace-based evaluations. + * + * Usage: + * const docker = new DockerWorkspaceProvider(config); + * await docker.pullImage(); + * const containerId = await docker.createContainer(); + * try { + * await docker.copyToContainer(containerId, localPath, containerPath); + * const output = await docker.execInContainer({ containerId, command: [...] }); + * // parse output... 
+ * } finally { + * await docker.removeContainer(containerId); + * } + */ +export class DockerWorkspaceProvider { + private readonly config: DockerWorkspaceConfig; + private readonly executor: CommandExecutor; + private readonly timeoutMs: number; + + constructor(config: DockerWorkspaceConfig, executor?: CommandExecutor) { + this.config = config; + this.executor = executor ?? new DefaultCommandExecutor(); + this.timeoutMs = (config.timeout ?? DEFAULT_TIMEOUT_S) * 1000; + } + + /** Check whether the Docker CLI is available on the host. */ + async isDockerAvailable(): Promise { + try { + const result = await this.executor.exec( + ['docker', 'version', '--format', '{{.Server.Version}}'], + { + timeoutMs: 10_000, + }, + ); + return result.exitCode === 0; + } catch { + return false; + } + } + + /** Pull the configured Docker image. No-op if already cached locally. */ + async pullImage(): Promise { + const result = await this.executor.exec(['docker', 'pull', this.config.image], { + timeoutMs: this.timeoutMs, + }); + if (result.exitCode !== 0) { + throw new Error(`docker pull failed (exit ${result.exitCode}): ${result.stderr.trim()}`); + } + } + + /** Create a stopped container from the configured image with resource limits. Returns container ID. */ + async createContainer(): Promise { + const argv: string[] = ['docker', 'create']; + + if (this.config.memory) { + argv.push(`--memory=${this.config.memory}`); + } + if (this.config.cpus !== undefined) { + argv.push(`--cpus=${this.config.cpus}`); + } + + // Keep the container alive with a long sleep so we can exec into it + argv.push(this.config.image, 'sleep', 'infinity'); + + const result = await this.executor.exec(argv, { timeoutMs: 30_000 }); + if (result.exitCode !== 0) { + throw new Error(`docker create failed (exit ${result.exitCode}): ${result.stderr.trim()}`); + } + return result.stdout.trim(); + } + + /** Start a previously created container. 
*/ + async startContainer(containerId: string): Promise { + const result = await this.executor.exec(['docker', 'start', containerId], { + timeoutMs: 30_000, + }); + if (result.exitCode !== 0) { + throw new Error(`docker start failed (exit ${result.exitCode}): ${result.stderr.trim()}`); + } + } + + /** Copy a local file or directory into a running container. */ + async copyToContainer( + containerId: string, + localPath: string, + containerPath: string, + ): Promise { + const result = await this.executor.exec( + ['docker', 'cp', localPath, `${containerId}:${containerPath}`], + { timeoutMs: 60_000 }, + ); + if (result.exitCode !== 0) { + throw new Error(`docker cp failed (exit ${result.exitCode}): ${result.stderr.trim()}`); + } + } + + /** + * Execute a command inside a running container. + * If stdin is provided, it is piped via `docker exec -i`. + */ + async execInContainer(options: ExecInContainerOptions): Promise { + const { containerId, command, timeoutMs, stdin } = options; + const argv: string[] = ['docker', 'exec']; + + if (stdin !== undefined) { + argv.push('-i'); + } + + argv.push(containerId, ...command); + + return this.executor.exec(argv, { + timeoutMs: timeoutMs ?? this.timeoutMs, + stdin, + }); + } + + /** Force-remove a container (always succeeds, even if container doesn't exist). */ + async removeContainer(containerId: string): Promise { + try { + await this.executor.exec(['docker', 'rm', '-f', containerId], { + timeoutMs: 30_000, + }); + } catch { + // Best-effort cleanup — don't throw on removal failure + } + } + + /** Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading. 
*/ + async runGraderInContainer(options: { + readonly command: readonly string[]; + readonly stdin?: string; + readonly copyFiles?: ReadonlyArray<{ localPath: string; containerPath: string }>; + }): Promise { + const containerId = await this.createContainer(); + try { + await this.startContainer(containerId); + + if (options.copyFiles) { + for (const file of options.copyFiles) { + await this.copyToContainer(containerId, file.localPath, file.containerPath); + } + } + + return await this.execInContainer({ + containerId, + command: options.command, + stdin: options.stdin, + }); + } finally { + await this.removeContainer(containerId); + } + } +} diff --git a/packages/core/src/evaluation/workspace/index.ts b/packages/core/src/evaluation/workspace/index.ts index cb7fa2799..669062d67 100644 --- a/packages/core/src/evaluation/workspace/index.ts +++ b/packages/core/src/evaluation/workspace/index.ts @@ -22,3 +22,10 @@ export { type PoolSlot, } from './pool-manager.js'; export { scanRepoDeps, type RepoDep, type DepsScanResult } from './deps-scanner.js'; +export { + DockerWorkspaceProvider, + type CommandExecutor, + type ExecResult, + type CreateContainerOptions, + type ExecInContainerOptions, +} from './docker-workspace.js'; diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 72ef09b2e..b0c58305b 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -36,6 +36,7 @@ import { } from './loaders/shorthand-expansion.js'; import { parseMetadata } from './metadata.js'; import type { + DockerWorkspaceConfig, EvalTest, JsonObject, JsonValue, @@ -706,7 +707,10 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi const workspacePath = typeof obj.path === 'string' ? obj.path : undefined; const mode = explicitMode ?? (workspacePath ? 
'static' : undefined); - if (!template && !isolation && !repos && !hooks && !mode && !workspacePath) return undefined; + const docker = parseDockerWorkspaceConfig(obj.docker); + + if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker) + return undefined; return { ...(template !== undefined && { template }), @@ -715,6 +719,23 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi ...(hooks !== undefined && { hooks }), ...(mode !== undefined && { mode }), ...(workspacePath !== undefined && { path: workspacePath }), + ...(docker !== undefined && { docker }), + }; +} + +/** + * Parse a DockerWorkspaceConfig from raw YAML value. + */ +function parseDockerWorkspaceConfig(raw: unknown): DockerWorkspaceConfig | undefined { + if (!isJsonObject(raw)) return undefined; + const obj = raw as Record; + if (typeof obj.image !== 'string') return undefined; + + return { + image: obj.image, + ...(typeof obj.timeout === 'number' && { timeout: obj.timeout }), + ...(typeof obj.memory === 'string' && { memory: obj.memory }), + ...(typeof obj.cpus === 'number' && { cpus: obj.cpus }), }; } @@ -759,6 +780,7 @@ function mergeWorkspaceConfigs( ...(hasHooks && { hooks: mergedHooks as WorkspaceHooksConfig }), mode: caseLevel.mode ?? suiteLevel.mode, path: caseLevel.path ?? suiteLevel.path, + docker: caseLevel.docker ?? suiteLevel.docker, }; } diff --git a/packages/core/test/evaluation/workspace/docker-workspace.test.ts b/packages/core/test/evaluation/workspace/docker-workspace.test.ts new file mode 100644 index 000000000..9452e0513 --- /dev/null +++ b/packages/core/test/evaluation/workspace/docker-workspace.test.ts @@ -0,0 +1,368 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { + type CommandExecutor, + DockerWorkspaceProvider, + type ExecResult, +} from '../../../src/evaluation/workspace/docker-workspace.js'; + +/** + * Mock command executor for testing Docker workspace provider. 
+ * Records all calls and returns configurable responses. + */ +class MockExecutor implements CommandExecutor { + readonly calls: Array<{ + argv: readonly string[]; + options?: { timeoutMs?: number; stdin?: string }; + }> = []; + private responses: ExecResult[] = []; + private defaultResponse: ExecResult = { stdout: '', stderr: '', exitCode: 0 }; + + /** Queue a response for the next exec call */ + pushResponse(response: Partial): void { + this.responses.push({ ...this.defaultResponse, ...response }); + } + + /** Set the default response for all unqueued calls */ + setDefault(response: Partial): void { + this.defaultResponse = { ...this.defaultResponse, ...response }; + } + + async exec( + argv: readonly string[], + options?: { timeoutMs?: number; stdin?: string }, + ): Promise { + this.calls.push({ argv, options }); + return this.responses.shift() ?? { ...this.defaultResponse }; + } + + /** Get the argv of the Nth call (0-indexed) */ + callArgv(n: number): readonly string[] { + return this.calls[n]?.argv ?? 
[]; + } + + /** Get the options of the Nth call */ + callOptions(n: number): { timeoutMs?: number; stdin?: string } | undefined { + return this.calls[n]?.options; + } +} + +describe('DockerWorkspaceProvider', () => { + let executor: MockExecutor; + + beforeEach(() => { + executor = new MockExecutor(); + }); + + describe('isDockerAvailable', () => { + it('returns true when docker version succeeds', async () => { + executor.pushResponse({ stdout: '24.0.7', exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'test:latest' }, executor); + expect(await provider.isDockerAvailable()).toBe(true); + expect(executor.callArgv(0)).toEqual([ + 'docker', + 'version', + '--format', + '{{.Server.Version}}', + ]); + }); + + it('returns false when docker version fails', async () => { + executor.pushResponse({ exitCode: 1, stderr: 'command not found' }); + const provider = new DockerWorkspaceProvider({ image: 'test:latest' }, executor); + expect(await provider.isDockerAvailable()).toBe(false); + }); + + it('returns false when executor throws', async () => { + const throwingExecutor: CommandExecutor = { + exec: async () => { + throw new Error('not found'); + }, + }; + const provider = new DockerWorkspaceProvider({ image: 'test:latest' }, throwingExecutor); + expect(await provider.isDockerAvailable()).toBe(false); + }); + }); + + describe('pullImage', () => { + it('calls docker pull with the configured image', async () => { + executor.pushResponse({ stdout: 'Pull complete\n', exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor); + await provider.pullImage(); + expect(executor.callArgv(0)).toEqual(['docker', 'pull', 'myimage:v1']); + }); + + it('throws on pull failure', async () => { + executor.pushResponse({ exitCode: 1, stderr: 'manifest not found' }); + const provider = new DockerWorkspaceProvider({ image: 'bad:image' }, executor); + await expect(provider.pullImage()).rejects.toThrow('docker pull failed'); + }); + + 
it('uses configured timeout', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 60 }, executor); + await provider.pullImage(); + expect(executor.callOptions(0)?.timeoutMs).toBe(60_000); + }); + }); + + describe('createContainer', () => { + it('creates container with image and sleep command', async () => { + executor.pushResponse({ stdout: 'abc123\n', exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor); + const id = await provider.createContainer(); + expect(id).toBe('abc123'); + expect(executor.callArgv(0)).toEqual(['docker', 'create', 'myimage:v1', 'sleep', 'infinity']); + }); + + it('includes memory limit when configured', async () => { + executor.pushResponse({ stdout: 'abc123\n', exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1', memory: '4g' }, executor); + await provider.createContainer(); + expect(executor.callArgv(0)).toContain('--memory=4g'); + }); + + it('includes CPU limit when configured', async () => { + executor.pushResponse({ stdout: 'abc123\n', exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1', cpus: 2 }, executor); + await provider.createContainer(); + expect(executor.callArgv(0)).toContain('--cpus=2'); + }); + + it('includes both resource limits', async () => { + executor.pushResponse({ stdout: 'abc123\n', exitCode: 0 }); + const provider = new DockerWorkspaceProvider( + { image: 'img:1', memory: '2g', cpus: 0.5 }, + executor, + ); + await provider.createContainer(); + const argv = executor.callArgv(0); + expect(argv).toContain('--memory=2g'); + expect(argv).toContain('--cpus=0.5'); + }); + + it('throws on create failure', async () => { + executor.pushResponse({ exitCode: 125, stderr: 'no such image' }); + const provider = new DockerWorkspaceProvider({ image: 'bad:img' }, executor); + await expect(provider.createContainer()).rejects.toThrow('docker create failed'); 
+ }); + }); + + describe('startContainer', () => { + it('starts a container by ID', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await provider.startContainer('abc123'); + expect(executor.callArgv(0)).toEqual(['docker', 'start', 'abc123']); + }); + + it('throws on start failure', async () => { + executor.pushResponse({ exitCode: 1, stderr: 'container not found' }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await expect(provider.startContainer('bad')).rejects.toThrow('docker start failed'); + }); + }); + + describe('copyToContainer', () => { + it('copies local path to container path', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await provider.copyToContainer('abc123', '/local/patch.diff', '/tmp/patch.diff'); + expect(executor.callArgv(0)).toEqual([ + 'docker', + 'cp', + '/local/patch.diff', + 'abc123:/tmp/patch.diff', + ]); + }); + + it('throws on cp failure', async () => { + executor.pushResponse({ exitCode: 1, stderr: 'no such container' }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await expect(provider.copyToContainer('bad', '/a', '/b')).rejects.toThrow('docker cp failed'); + }); + }); + + describe('execInContainer', () => { + it('executes command in container', async () => { + executor.pushResponse({ + stdout: '{"score": 1.0, "assertions": []}', + exitCode: 0, + }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + const result = await provider.execInContainer({ + containerId: 'abc123', + command: ['/bin/bash', '-c', 'python grade.py'], + }); + expect(result.stdout).toContain('"score": 1.0'); + expect(executor.callArgv(0)).toEqual([ + 'docker', + 'exec', + 'abc123', + '/bin/bash', + '-c', + 'python grade.py', + ]); + }); + + it('adds -i flag when stdin is provided', async () => 
{ + executor.pushResponse({ stdout: '{}', exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await provider.execInContainer({ + containerId: 'abc123', + command: ['cat'], + stdin: 'hello', + }); + expect(executor.callArgv(0)).toEqual(['docker', 'exec', '-i', 'abc123', 'cat']); + expect(executor.callOptions(0)?.stdin).toBe('hello'); + }); + + it('uses custom timeout', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await provider.execInContainer({ + containerId: 'abc123', + command: ['true'], + timeoutMs: 5000, + }); + expect(executor.callOptions(0)?.timeoutMs).toBe(5000); + }); + + it('uses default timeout from config', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 120 }, executor); + await provider.execInContainer({ + containerId: 'abc123', + command: ['true'], + }); + expect(executor.callOptions(0)?.timeoutMs).toBe(120_000); + }); + }); + + describe('removeContainer', () => { + it('force-removes a container', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await provider.removeContainer('abc123'); + expect(executor.callArgv(0)).toEqual(['docker', 'rm', '-f', 'abc123']); + }); + + it('does not throw when removal fails', async () => { + executor.pushResponse({ exitCode: 1, stderr: 'no such container' }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + // Should not throw + await provider.removeContainer('nonexistent'); + }); + + it('does not throw when executor throws', async () => { + const throwingExecutor: CommandExecutor = { + exec: async () => { + throw new Error('connection refused'); + }, + }; + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, throwingExecutor); + await provider.removeContainer('abc123'); + }); 
+ }); + + describe('runGraderInContainer', () => { + it('runs full lifecycle: create → start → exec → cleanup', async () => { + // create returns container ID + executor.pushResponse({ stdout: 'container-id-123\n', exitCode: 0 }); + // start succeeds + executor.pushResponse({ exitCode: 0 }); + // exec returns grader output + executor.pushResponse({ + stdout: '{"score": 0.75, "assertions": [{"text": "test passed", "passed": true}]}', + exitCode: 0, + }); + // rm succeeds + executor.pushResponse({ exitCode: 0 }); + + const provider = new DockerWorkspaceProvider({ image: 'test:latest' }, executor); + const result = await provider.runGraderInContainer({ + command: ['python', 'grade.py'], + stdin: '{"input": "test"}', + }); + + expect(result.stdout).toContain('"score": 0.75'); + expect(executor.calls).toHaveLength(4); + // Verify lifecycle order: create, start, exec, rm + expect(executor.callArgv(0)[1]).toBe('create'); + expect(executor.callArgv(1)[1]).toBe('start'); + expect(executor.callArgv(2)[1]).toBe('exec'); + expect(executor.callArgv(3)[1]).toBe('rm'); + }); + + it('copies files before exec when copyFiles is specified', async () => { + executor.pushResponse({ stdout: 'cid\n', exitCode: 0 }); // create + executor.pushResponse({ exitCode: 0 }); // start + executor.pushResponse({ exitCode: 0 }); // cp file 1 + executor.pushResponse({ exitCode: 0 }); // cp file 2 + executor.pushResponse({ stdout: '{"score": 1}', exitCode: 0 }); // exec + executor.pushResponse({ exitCode: 0 }); // rm + + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await provider.runGraderInContainer({ + command: ['grade'], + copyFiles: [ + { localPath: '/host/a.diff', containerPath: '/tmp/a.diff' }, + { localPath: '/host/b.txt', containerPath: '/tmp/b.txt' }, + ], + }); + + expect(executor.calls).toHaveLength(6); + expect(executor.callArgv(2)).toEqual(['docker', 'cp', '/host/a.diff', 'cid:/tmp/a.diff']); + expect(executor.callArgv(3)).toEqual(['docker', 'cp', 
'/host/b.txt', 'cid:/tmp/b.txt']); + }); + + it('cleans up container even when exec fails', async () => { + executor.pushResponse({ stdout: 'cid\n', exitCode: 0 }); // create + executor.pushResponse({ exitCode: 0 }); // start + executor.pushResponse({ exitCode: 1, stderr: 'grader crashed' }); // exec fails + executor.pushResponse({ exitCode: 0 }); // rm (cleanup) + + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + const result = await provider.runGraderInContainer({ + command: ['grade'], + }); + + // Should return the error result, not throw + expect(result.exitCode).toBe(1); + // Container should still be cleaned up + expect(executor.calls).toHaveLength(4); + expect(executor.callArgv(3)[1]).toBe('rm'); + }); + + it('cleans up container even when start fails', async () => { + executor.pushResponse({ stdout: 'cid\n', exitCode: 0 }); // create + executor.pushResponse({ exitCode: 1, stderr: 'start failed' }); // start fails + executor.pushResponse({ exitCode: 0 }); // rm (cleanup) + + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await expect(provider.runGraderInContainer({ command: ['grade'] })).rejects.toThrow( + 'docker start failed', + ); + + // Container should still be cleaned up + const rmCall = executor.calls.find((c) => c.argv[1] === 'rm'); + expect(rmCall).toBeDefined(); + expect(rmCall?.argv).toEqual(['docker', 'rm', '-f', 'cid']); + }); + }); + + describe('timeout configuration', () => { + it('defaults to 1800s (30 min) timeout', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); + await provider.pullImage(); + expect(executor.callOptions(0)?.timeoutMs).toBe(1_800_000); + }); + + it('uses custom timeout from config', async () => { + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 300 }, executor); + await provider.pullImage(); + 
expect(executor.callOptions(0)?.timeoutMs).toBe(300_000); + }); + }); +}); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index bb340e350..2792f120f 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -53,7 +53,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -67,20 +72,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -118,7 +133,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -132,20 +152,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -173,7 +203,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -187,20 +222,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + 
"image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -244,7 +289,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -318,12 +366,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -360,7 +414,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -455,7 +512,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -506,12 +566,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -574,7 +639,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -590,7 +657,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -607,7 +677,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -624,13 +697,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -667,11 +745,20 @@ }, "type": { "type": "string", - 
"enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -712,7 +799,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -726,7 +818,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -737,7 +834,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -745,7 +844,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -759,7 +863,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -770,7 +879,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -807,7 +919,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -819,7 +934,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -841,17 +960,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + 
"all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -895,7 +1023,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -939,7 +1070,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -976,7 +1110,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -991,7 +1128,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1028,7 +1167,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -1060,7 +1202,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1103,7 +1247,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1146,7 +1293,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1183,10 +1333,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1229,7 +1384,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1318,7 +1476,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1328,7 +1489,10 @@ "minItems": 1 } }, - "required": 
["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -1372,7 +1536,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -1446,12 +1613,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -1488,7 +1661,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -1583,7 +1759,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1634,12 +1813,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1702,7 +1886,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1718,7 +1904,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -1735,7 +1924,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -1752,13 +1944,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -1795,11 +1992,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": 
["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -1840,7 +2046,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1854,7 +2065,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1865,7 +2081,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -1873,7 +2091,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1887,7 +2110,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1898,7 +2126,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -1935,7 +2166,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -1947,7 +2181,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -1969,17 +2207,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ 
-2023,7 +2270,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2067,7 +2317,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -2104,7 +2357,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -2119,7 +2375,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2156,7 +2414,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -2188,7 +2449,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2231,7 +2494,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2274,7 +2540,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2311,10 +2580,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2357,7 +2631,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2446,7 +2723,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2456,7 +2736,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -2517,7 +2800,10 @@ }, "type": { 
"type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -2591,12 +2877,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -2633,7 +2925,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -2728,7 +3023,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2779,12 +3077,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2847,7 +3150,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2863,7 +3168,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2880,7 +3188,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -2897,13 +3208,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -2940,11 +3256,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, 
"minimums": { "type": "object", @@ -2985,7 +3310,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2999,7 +3329,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3010,7 +3345,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -3018,7 +3355,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3032,7 +3374,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3043,7 +3390,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -3080,7 +3430,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -3092,7 +3445,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -3114,17 +3471,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -3168,7 +3534,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": 
false }, { @@ -3212,7 +3581,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -3249,7 +3621,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -3264,7 +3639,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3301,7 +3678,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -3333,7 +3713,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3376,7 +3758,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3419,7 +3804,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3456,10 +3844,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3502,7 +3895,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3591,7 +3987,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3601,7 +4000,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -3645,7 +4047,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ 
-3719,12 +4124,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -3761,7 +4172,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -3856,7 +4270,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3907,12 +4324,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3975,7 +4397,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3991,7 +4415,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4008,7 +4435,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -4025,13 +4455,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -4068,11 +4503,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -4113,7 +4557,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + 
"enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4127,7 +4576,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4138,7 +4592,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -4146,7 +4602,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4160,7 +4621,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4171,7 +4637,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -4208,7 +4677,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -4220,7 +4692,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -4242,17 +4718,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -4296,7 +4781,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4340,7 +4828,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], 
"additionalProperties": false }, { @@ -4377,7 +4868,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -4392,7 +4886,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4429,7 +4925,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -4461,7 +4960,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4504,7 +5005,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4547,7 +5051,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4584,10 +5091,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4630,7 +5142,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4719,7 +5234,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4729,7 +5247,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -4750,7 +5271,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -4761,7 +5286,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + 
"count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -4794,7 +5321,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -4818,7 +5348,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -4832,7 +5365,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -4845,7 +5381,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -4874,7 +5413,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -4910,7 +5452,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -4941,7 +5487,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -4972,7 +5522,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -5003,7 +5557,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -5013,10 +5571,37 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": [ + 
"image" + ], + "additionalProperties": false } }, "additionalProperties": false @@ -5035,7 +5620,9 @@ "type": "string" } }, - "required": ["id"], + "required": [ + "id" + ], "additionalProperties": false } }, @@ -5070,7 +5657,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -5084,20 +5676,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -5125,7 +5727,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -5139,20 +5746,30 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file", "image"] + "enum": [ + "text", + "file", + "image" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -5196,7 +5813,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -5270,12 +5890,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -5312,7 +5938,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + 
"llm_grader" + ] }, "prompt": { "anyOf": [ @@ -5407,7 +6036,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -5458,12 +6090,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5526,7 +6163,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5542,7 +6181,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -5559,7 +6201,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -5576,13 +6221,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -5619,11 +6269,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -5664,7 +6323,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -5678,7 +6342,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -5689,7 +6358,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": 
false } }, @@ -5697,7 +6368,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -5711,7 +6387,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -5722,7 +6403,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -5759,7 +6443,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -5771,7 +6458,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -5793,17 +6484,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -5847,7 +6547,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -5891,7 +6594,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -5928,7 +6634,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -5943,7 +6652,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5980,7 +6691,10 @@ }, "type": { "type": 
"string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -6012,7 +6726,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6055,7 +6771,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6098,7 +6817,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6135,10 +6857,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6181,7 +6908,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6270,7 +7000,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -6280,7 +7013,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -6324,7 +7060,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -6398,12 +7137,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -6440,7 +7185,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -6535,7 +7283,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": 
[ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -6586,12 +7337,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6654,7 +7410,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6670,7 +7428,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -6687,7 +7448,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -6704,13 +7468,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -6747,11 +7516,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -6792,7 +7570,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6806,7 +7589,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6817,7 +7605,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -6825,7 +7615,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + 
"ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6839,7 +7634,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6850,7 +7650,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -6887,7 +7690,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -6899,7 +7705,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -6921,17 +7731,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -6975,7 +7794,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -7019,7 +7841,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -7056,7 +7881,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -7071,7 +7899,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7108,7 +7938,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { 
"type": "number", @@ -7140,7 +7973,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7183,7 +8018,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -7226,7 +8064,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -7263,10 +8104,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7309,7 +8155,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -7398,7 +8247,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -7408,7 +8260,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -7469,7 +8324,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -7543,12 +8401,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -7585,7 +8449,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -7680,7 +8547,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -7731,12 +8601,17 @@ ] } }, - "required": ["type", "command"], + 
"required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7799,7 +8674,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7815,7 +8692,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -7832,7 +8712,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -7849,13 +8732,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -7892,11 +8780,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -7937,7 +8834,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7951,7 +8853,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7962,7 +8869,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -7970,7 +8879,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7984,7 +8898,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", 
"ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7995,7 +8914,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -8032,7 +8954,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -8044,7 +8969,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -8066,17 +8995,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -8120,7 +9058,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -8164,7 +9105,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -8201,7 +9145,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -8216,7 +9163,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8253,7 +9202,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -8285,7 +9237,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": 
false }, { @@ -8328,7 +9282,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8371,7 +9328,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8408,10 +9368,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8454,7 +9419,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8543,7 +9511,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8553,7 +9524,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -8597,7 +9571,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -8671,12 +9648,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -8713,7 +9696,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -8808,7 +9794,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8859,12 +9848,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], 
"additionalProperties": false }, { @@ -8927,7 +9921,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8943,7 +9939,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -8960,7 +9959,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -8977,13 +9979,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -9020,11 +10027,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -9065,7 +10081,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9079,7 +10100,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9090,7 +10116,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -9098,7 +10126,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9112,7 +10145,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9123,7 
+10161,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -9160,7 +10201,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -9172,7 +10216,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -9194,17 +10242,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -9248,7 +10305,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -9292,7 +10352,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -9329,7 +10392,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -9344,7 +10410,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9381,7 +10449,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -9413,7 +10484,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9456,7 +10529,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + 
"value" + ], "additionalProperties": false }, { @@ -9499,7 +10575,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9536,10 +10615,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9582,7 +10666,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9671,7 +10758,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -9681,7 +10771,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -9702,7 +10795,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -9713,7 +10810,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -9746,7 +10845,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -9770,7 +10872,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -9784,7 +10889,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -9797,7 +10905,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -9826,7 +10937,10 @@ "additionalProperties": false } }, - "required": ["path", 
"source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -9862,7 +10976,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -9893,7 +11011,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -9924,7 +11046,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -9955,7 +11081,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -9965,10 +11095,37 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": [ + "image" + ], + "additionalProperties": false } }, "additionalProperties": false @@ -9987,7 +11144,9 @@ "type": "string" } }, - "required": ["id"], + "required": [ + "id" + ], "additionalProperties": false } }, @@ -10054,7 +11213,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -10128,12 +11290,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -10170,7 +11338,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + 
"llm_grader" + ] }, "prompt": { "anyOf": [ @@ -10265,7 +11436,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10316,12 +11490,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10384,7 +11563,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10400,7 +11581,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -10417,7 +11601,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -10434,13 +11621,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -10477,11 +11669,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -10522,7 +11723,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10536,7 +11742,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10547,7 +11758,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], 
"additionalProperties": false } }, @@ -10555,7 +11768,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10569,7 +11787,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10580,7 +11803,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -10617,7 +11843,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -10629,7 +11858,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -10651,17 +11884,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -10705,7 +11947,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -10749,7 +11994,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -10786,7 +12034,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -10801,7 +12052,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ 
-10838,7 +12091,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -10870,7 +12126,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10913,7 +12171,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10956,7 +12217,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10993,10 +12257,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11039,7 +12308,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11128,7 +12400,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11138,7 +12413,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -11182,7 +12460,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -11256,12 +12537,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -11298,7 +12585,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -11393,7 +12683,10 @@ 
"minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11444,12 +12737,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11512,7 +12810,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11528,7 +12828,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -11545,7 +12848,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -11562,13 +12868,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -11605,11 +12916,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -11650,7 +12970,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11664,7 +12989,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11675,7 +13005,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -11683,7 +13015,12 @@ "anyOf": [ { 
"type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11697,7 +13034,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11708,7 +13050,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -11745,7 +13090,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -11757,7 +13105,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -11779,17 +13131,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -11833,7 +13194,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -11877,7 +13241,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -11914,7 +13281,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -11929,7 +13299,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11966,7 +13338,10 @@ }, "type": { "type": "string", - "enum": 
["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -11998,7 +13373,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12041,7 +13418,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12084,7 +13464,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12121,10 +13504,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12167,7 +13555,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12256,7 +13647,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12266,7 +13660,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -12287,7 +13684,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -12298,7 +13699,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -12361,7 +13764,10 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "enum": [ + "code-grader", + "code_grader" + ] }, "command": { "anyOf": [ @@ -12435,12 +13841,18 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - 
"required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -12477,7 +13889,10 @@ }, "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "enum": [ + "llm-grader", + "llm_grader" + ] }, "prompt": { "anyOf": [ @@ -12572,7 +13987,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12623,12 +14041,17 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12691,7 +14114,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12707,7 +14132,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -12724,7 +14152,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -12741,13 +14172,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -12784,11 +14220,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -12829,7 +14274,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12843,7 +14293,12 @@ "anyOf": [ { "type": 
"string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12854,7 +14309,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -12862,7 +14319,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12876,7 +14338,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12887,7 +14354,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -12924,7 +14394,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -12936,7 +14409,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -12958,17 +14435,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -13012,7 +14498,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -13056,7 +14545,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -13093,7 +14585,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", 
"token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -13108,7 +14603,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13145,7 +14642,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -13177,7 +14677,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13220,7 +14722,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13263,7 +14768,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13300,10 +14808,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13346,7 +14859,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13435,7 +14951,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -13445,7 +14964,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -13474,7 +14996,10 @@ ] } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false } }, @@ -13488,7 +15013,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -13512,7 +15040,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ 
+ "type", + "url" + ], "additionalProperties": false }, { @@ -13526,7 +15057,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -13539,7 +15073,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -13568,7 +15105,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -13604,7 +15144,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13635,7 +15179,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13666,7 +15214,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13697,7 +15249,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13707,10 +15263,37 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": [ + "image" + ], + "additionalProperties": false } }, "additionalProperties": false @@ -13721,7 +15304,9 @@ ] } }, - "required": ["tests"], + "required": [ + "tests" + ], "additionalProperties": false } }