Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions examples/features/docker-workspace/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Docker Workspace Example

This example demonstrates how to run code-grader evaluations inside Docker containers.

## Use Case

When evaluating coding agents (e.g., SWE-bench), the grader script needs to:
1. Apply the agent's patch to a repository
2. Run tests inside the repository's environment
3. Report pass/fail results

Docker workspaces let you run this grading process inside a pre-built container
image that has the repository, dependencies, and test infrastructure ready.

## How It Works

```
1. AgentV sends prompt to agent target → receives patch/diff output
2. docker pull <image> (once per eval run, cached)
3. For each test case:
   a. docker create --memory=4g --cpus=2 <image>
   b. docker start <container>
   c. docker exec -i <container> <grader-command> < payload.json
   d. Parse grader JSON output (score, assertions)
   e. docker rm -f <container>
4. Aggregate results
```

## YAML Schema

```yaml
workspace:
  docker:
    image: swebench/sweb.eval.x86_64.django__django-15180
    timeout: 1800 # seconds (default: 1800)
    memory: 4g # optional Docker memory limit
    cpus: 2 # optional Docker CPU limit
```

## Running

```bash
# Requires Docker to be installed and running
bun apps/cli/src/cli.ts eval examples/features/docker-workspace/evals/docker-example.EVAL.yaml
```
31 changes: 31 additions & 0 deletions examples/features/docker-workspace/evals/docker-example.EVAL.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Docker Workspace Example
# Demonstrates running a code-grader inside a Docker container.
#
# This eval sends a coding prompt to an agent, then grades the agent's output
# inside a Docker container that has the target repository and test environment.
#
# Prerequisites:
# - Docker installed and running
# - The specified image available (will be pulled automatically)

name: docker-workspace-example
description: Example eval using Docker workspace for grading

# Grader commands for this suite run inside a container created from this image.
workspace:
  docker:
    image: python:3.11-slim
    timeout: 300 # seconds
    memory: 2g # Docker memory limit
    cpus: 1 # Docker CPU limit

execution:
  target: dry-run
  workers: 1

tests:
  - id: hello-world
    input: "Write a Python function that returns 'hello world'"
    criteria: "The output should contain a working Python function"
    assertions:
      # The grader reads the JSON payload from stdin and prints a JSON result
      # with a score and a list of assertions.
      - type: code-grader
        command: ["python", "-c", "import sys, json; data = json.load(sys.stdin); print(json.dumps({'score': 1.0, 'assertions': [{'text': 'grader ran in container', 'passed': True}]}))"]
34 changes: 27 additions & 7 deletions packages/core/src/evaluation/evaluators/code-evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,33 @@ export class CodeEvaluator implements Evaluator {
const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : undefined;

try {
const stdout = await executeScript(
this.command,
inputPayload,
this.agentTimeoutMs,
this.cwd,
env,
);
let stdout: string;
if (context.dockerConfig) {
// Docker execution mode: run grader inside a container
const { DockerWorkspaceProvider } = await import('../workspace/docker-workspace.js');
const dockerProvider = new DockerWorkspaceProvider(context.dockerConfig);
const result = await dockerProvider.runGraderInContainer({
command: [...this.command],
stdin: inputPayload,
});
if (result.exitCode !== 0) {
const trimmedErr = result.stderr.trim();
throw new Error(
trimmedErr.length > 0
? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}`
: `Code evaluator exited with code ${result.exitCode}`,
);
}
stdout = result.stdout.trim();
} else {
stdout = await executeScript(
this.command,
inputPayload,
this.agentTimeoutMs,
this.cwd,
env,
);
}
const parsed = parseJsonSafe(stdout);
const score = clampScore(typeof parsed?.score === 'number' ? parsed.score : 0);
const assertions: AssertionEntry[] = Array.isArray(parsed?.assertions)
Expand Down
10 changes: 9 additions & 1 deletion packages/core/src/evaluation/evaluators/types.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import type { ResolvedTarget } from '../providers/targets.js';
import type { ChatPrompt, Message, Provider } from '../providers/types.js';
import type { TokenUsage, TraceSummary } from '../trace.js';
import type { EvalTest, EvaluationVerdict, EvaluatorConfig, JsonObject } from '../types.js';
import type {
DockerWorkspaceConfig,
EvalTest,
EvaluationVerdict,
EvaluatorConfig,
JsonObject,
} from '../types.js';

export type { EvaluationVerdict };

Expand Down Expand Up @@ -50,6 +56,8 @@ export interface EvaluationContext {
readonly fileChanges?: string;
/** Absolute path to the workspace directory (when workspace_template is configured) */
readonly workspacePath?: string;
/** Docker workspace config: when present, code-grader commands run inside a container */
readonly dockerConfig?: DockerWorkspaceConfig;
}

export interface EvaluationScore {
Expand Down
26 changes: 26 additions & 0 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,21 @@ export async function runEvaluation(
}
}

// --- Docker workspace: pull image once at setup ---
const suiteDockerConfig = suiteWorkspace?.docker;
if (suiteDockerConfig) {
setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
const { DockerWorkspaceProvider } = await import('./workspace/docker-workspace.js');
const dockerSetup = new DockerWorkspaceProvider(suiteDockerConfig);
if (!(await dockerSetup.isDockerAvailable())) {
throw new Error(
'Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running.',
);
}
await dockerSetup.pullImage();
setupLog('Docker image pull complete');
}

// Execute before_all (runs ONCE before first test per workspace)
const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
Expand Down Expand Up @@ -1830,6 +1845,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
availableTargets,
fileChanges,
workspacePath,
dockerConfig: evalCase.workspace?.docker,
verbose,
threshold: evalCase.threshold ?? caseThreshold,
});
Expand Down Expand Up @@ -2088,6 +2104,7 @@ async function evaluateCandidate(options: {
readonly availableTargets?: readonly string[];
readonly fileChanges?: string;
readonly workspacePath?: string;
readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig;
readonly verbose?: boolean;
readonly threshold?: number;
}): Promise<EvaluationResult> {
Expand All @@ -2114,6 +2131,7 @@ async function evaluateCandidate(options: {
availableTargets,
fileChanges,
workspacePath,
dockerConfig,
threshold: evalThreshold,
} = options;

Expand Down Expand Up @@ -2141,6 +2159,7 @@ async function evaluateCandidate(options: {
availableTargets,
fileChanges,
workspacePath,
dockerConfig,
threshold: evalThreshold,
});

Expand Down Expand Up @@ -2226,6 +2245,7 @@ async function runEvaluatorsForCase(options: {
readonly availableTargets?: readonly string[];
readonly fileChanges?: string;
readonly workspacePath?: string;
readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig;
readonly threshold?: number;
}): Promise<{ score: EvaluationScore; scores?: EvaluatorResult[] }> {
const {
Expand All @@ -2251,6 +2271,7 @@ async function runEvaluatorsForCase(options: {
availableTargets,
fileChanges,
workspacePath,
dockerConfig,
threshold,
} = options;

Expand Down Expand Up @@ -2279,6 +2300,7 @@ async function runEvaluatorsForCase(options: {
availableTargets,
fileChanges,
workspacePath,
dockerConfig,
threshold,
});
}
Expand Down Expand Up @@ -2313,6 +2335,7 @@ async function runEvaluatorsForCase(options: {
availableTargets,
fileChanges,
workspacePath,
dockerConfig,
...(implicitEvaluator ? { evaluator: implicitEvaluator } : {}),
});

Expand Down Expand Up @@ -2357,6 +2380,7 @@ async function runEvaluatorList(options: {
readonly availableTargets?: readonly string[];
readonly fileChanges?: string;
readonly workspacePath?: string;
readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig;
readonly threshold?: number;
}): Promise<{ score: EvaluationScore; scores: EvaluatorResult[] }> {
const {
Expand All @@ -2383,6 +2407,7 @@ async function runEvaluatorList(options: {
availableTargets,
fileChanges,
workspacePath,
dockerConfig,
} = options;

const scored: Array<{
Expand Down Expand Up @@ -2416,6 +2441,7 @@ async function runEvaluatorList(options: {
availableTargets,
fileChanges,
workspacePath,
dockerConfig,
};

// Build the dispatch context for evaluator factories
Expand Down
18 changes: 18 additions & 0 deletions packages/core/src/evaluation/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,22 @@ export type WorkspaceHooksConfig = {
readonly after_all?: WorkspaceHookConfig;
};

/**
 * Docker-based workspace configuration (the `docker` entry of a workspace).
 *
 * When present, code-grader commands run inside a Docker container
 * instead of on the host. All fields except `image` are optional.
 */
export type DockerWorkspaceConfig = {
  /** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */
  readonly image: string;
  /**
   * Container execution timeout in seconds (default: 1800).
   * The eval-file schema only accepts whole numbers >= 1.
   */
  readonly timeout?: number;
  /** Memory limit (e.g. '4g', '512m') */
  readonly memory?: string;
  /**
   * CPU limit (e.g. 2, 0.5); fractional values are allowed.
   * The eval-file schema only accepts values >= 0.1.
   */
  readonly cpus?: number;
};

export type WorkspaceConfig = {
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
Expand All @@ -290,6 +306,8 @@ export type WorkspaceConfig = {
readonly mode?: 'pooled' | 'temp' | 'static';
/** Required when mode=static: use this existing directory directly */
readonly path?: string;
/** Docker-based workspace: run grader commands inside a container */
readonly docker?: DockerWorkspaceConfig;
};

export type CodeEvaluatorConfig = {
Expand Down
8 changes: 8 additions & 0 deletions packages/core/src/evaluation/validation/eval-file.schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,13 @@ const WorkspaceHooksSchema = z.object({
after_all: WorkspaceHookSchema.optional(),
});

/**
 * Validates the `docker` section of a workspace in an eval file.
 *
 * - `image`   required, non-empty image reference
 * - `timeout` optional whole number of seconds, at least 1
 * - `memory`  optional, non-empty memory-limit string (e.g. '4g', '512m')
 * - `cpus`    optional CPU limit, at least 0.1
 */
const DockerWorkspaceSchema = z.object({
  // Reject empty strings here so a blank image or memory limit is reported as
  // a validation error instead of surfacing later as a Docker CLI failure.
  image: z.string().min(1),
  timeout: z.number().int().min(1).optional(),
  memory: z.string().min(1).optional(),
  cpus: z.number().min(0.1).optional(),
});

const WorkspaceSchema = z
.object({
template: z.string().optional(),
Expand All @@ -306,6 +313,7 @@ const WorkspaceSchema = z
hooks: WorkspaceHooksSchema.optional(),
mode: z.enum(['pooled', 'temp', 'static']).optional(),
path: z.string().optional(),
docker: DockerWorkspaceSchema.optional(),
})
.strict();

Expand Down
Loading
Loading