Skip to content
Merged
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ export const evalRunCommand = command({
type: optional(number),
long: 'workers',
description:
'Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml',
'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files always run sequentially. Can also be set per-target in targets.yaml',
}),
out: option({
type: optional(string),
Expand Down
41 changes: 10 additions & 31 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -615,25 +615,6 @@ async function prepareFileMetadata(params: {
};
}

/**
 * Runs `task` once for every entry of `items`, keeping at most `limit` tasks in flight.
 *
 * Items are handed out in array order to a fixed set of worker loops; each worker picks up
 * the next pending item as soon as its current task settles.
 *
 * @param items - Work items to process.
 * @param limit - Desired maximum concurrency. `NaN` and values below 1 fall back to 1,
 *   fractional values are rounded down, and the pool is never larger than `items.length`.
 * @param task - Async callback invoked once per item.
 * @returns A promise that resolves once every item has been processed. It rejects with the
 *   first task error; workers other than the failing one keep draining the remaining items.
 */
async function runWithLimit<T>(
  items: readonly T[],
  limit: number,
  task: (item: T) => Promise<void>,
): Promise<void> {
  // Math.max(1, NaN) is NaN, which would yield an empty pool and silently skip every item,
  // so NaN is normalised to a single worker before clamping.
  const requested = Number.isNaN(limit) ? 1 : Math.floor(limit);
  // Capping at items.length avoids idle workers and keeps an Infinity limit from being
  // passed to Array.from as a length.
  const workerCount = Math.min(items.length, Math.max(1, requested));

  // One iterator shared by all workers: each next() call hands out the following item
  // exactly once, so no manual index bookkeeping is needed.
  const pending = items.values();

  const workers = Array.from({ length: workerCount }, async () => {
    for (const item of pending) {
      await task(item);
    }
  });

  await Promise.all(workers);
}

async function runSingleEvalFile(params: {
readonly testFilePath: string;
readonly cwd: string;
Expand Down Expand Up @@ -1088,15 +1069,8 @@ export async function runEvalCommand(
const seenTestCases = new Set<string>();
const displayIdTracker = createDisplayIdTracker();

// Derive file-level concurrency from worker count (global) when provided
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
const fileConcurrency = Math.min(
Math.max(1, totalWorkers),
Math.max(1, resolvedTestFiles.length),
);
const perFileWorkers = options.workers
? Math.max(1, Math.floor(totalWorkers / fileConcurrency))
: undefined;
// Each file gets the full worker budget — no splitting across files
const perFileWorkers = options.workers;
const fileMetadata = new Map<
string,
{
Expand Down Expand Up @@ -1228,7 +1202,9 @@ export async function runEvalCommand(
}
throw new Error('No tests matched the provided filters.');
}
const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, {
verbose: options.verbose,
});
progressReporter.start();
progressReporter.setTotal(totalEvalCount);
const seenCodexLogPaths = new Set<string>();
Expand Down Expand Up @@ -1309,8 +1285,11 @@ export async function runEvalCommand(
);
}

// Eval files run sequentially; within each file, --workers N test cases run in parallel.
// This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
// workspace races without any grouping complexity.
try {
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
for (const testFilePath of activeTestFiles) {
const targetPrep = fileMetadata.get(testFilePath);
if (!targetPrep) {
throw new Error(`Missing metadata for ${testFilePath}`);
Expand Down Expand Up @@ -1404,7 +1383,7 @@ export async function runEvalCommand(
for (const results of targetResults) {
allResults.push(...results);
}
});
}

progressReporter.finish();

Expand Down
14 changes: 14 additions & 0 deletions apps/web/src/content/docs/docs/evaluation/running-evals.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,20 @@ export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token"
agentv eval evals/my-eval.yaml --export-otel
```

### Parallelism

The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3, max: 50). Eval files always run sequentially — one file completes before the next starts.

```bash
agentv eval evals/my-eval.yaml --workers 4
# Up to 4 test cases from the file run concurrently

agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3
# Files run one at a time; within each file, up to 3 test cases run in parallel
```

This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration.

### Workspace Modes and Finish Policy

Use workspace mode and finish policies instead of multiple conflicting booleans:
Expand Down
2 changes: 2 additions & 0 deletions apps/web/src/content/docs/docs/guides/workspace-pool.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre

The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10.

**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Within each file, pool slots support concurrent workers as described above.

## Drift detection

If you change the workspace config (e.g., update a repo URL or checkout ref), the computed fingerprint changes. AgentV detects this drift by comparing the stored `metadata.json` fingerprint against the newly computed one:
Expand Down
13 changes: 10 additions & 3 deletions packages/core/src/evaluation/yaml-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ export type EvalSuiteResult = {
readonly failOnError?: import('./types.js').FailOnError;
/** Suite-level quality threshold (0-1) — suite fails if mean score is below */
readonly threshold?: number;
/** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
readonly workspacePath?: string;
};

/**
Expand All @@ -212,7 +214,11 @@ export async function loadTestSuite(
if (format === 'agent-skills-json') {
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
}
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
evalFilePath,
repoRoot,
options,
);
const metadata = parseMetadata(parsed);
const failOnError = extractFailOnError(parsed);
const threshold = extractThreshold(parsed);
Expand All @@ -226,6 +232,7 @@ export async function loadTestSuite(
...(metadata !== undefined && { metadata }),
...(failOnError !== undefined && { failOnError }),
...(threshold !== undefined && { threshold }),
...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }),
};
}

Expand Down Expand Up @@ -256,7 +263,7 @@ async function loadTestsFromYaml(
evalFilePath: string,
repoRoot: URL | string,
options?: LoadOptions,
): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject }> {
): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> {
// YAML parsing (existing implementation)
const verbose = options?.verbose ?? false;
const filterPattern = options?.filter;
Expand Down Expand Up @@ -524,7 +531,7 @@ async function loadTestsFromYaml(
results.push(testCase);
}

return { tests: results, parsed: suite };
return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
}

/**
Expand Down
Loading