diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 282d8d655..8d3f5f18b 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -41,7 +41,7 @@ export const evalRunCommand = command({ type: optional(number), long: 'workers', description: - 'Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml', + 'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files always run sequentially. Can also be set per-target in targets.yaml', }), out: option({ type: optional(string), diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index b7ad4ffa1..c5aa81a2b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -615,25 +615,6 @@ async function prepareFileMetadata(params: { }; } -async function runWithLimit( - items: readonly T[], - limit: number, - task: (item: T) => Promise, -): Promise { - const safeLimit = Math.max(1, limit); - let index = 0; - - const workers = Array.from({ length: safeLimit }, async () => { - while (index < items.length) { - const current = items[index]; - index += 1; - await task(current); - } - }); - - await Promise.all(workers); -} - async function runSingleEvalFile(params: { readonly testFilePath: string; readonly cwd: string; @@ -1088,15 +1069,8 @@ export async function runEvalCommand( const seenTestCases = new Set(); const displayIdTracker = createDisplayIdTracker(); - // Derive file-level concurrency from worker count (global) when provided - const totalWorkers = options.workers ?? DEFAULT_WORKERS; - const fileConcurrency = Math.min( - Math.max(1, totalWorkers), - Math.max(1, resolvedTestFiles.length), - ); - const perFileWorkers = options.workers - ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) - : undefined; + // Each file gets the full worker budget — no splitting across files + const perFileWorkers = options.workers; const fileMetadata = new Map< string, { @@ -1228,7 +1202,9 @@ export async function runEvalCommand( } throw new Error('No tests matched the provided filters.'); } - const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose }); + const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, { + verbose: options.verbose, + }); progressReporter.start(); progressReporter.setTotal(totalEvalCount); const seenCodexLogPaths = new Set(); @@ -1309,8 +1285,11 @@ export async function runEvalCommand( ); } + // Eval files run sequentially; within each file, --workers N test cases run in parallel. + // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file + // workspace races without any grouping complexity. try { - await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => { + for (const testFilePath of activeTestFiles) { const targetPrep = fileMetadata.get(testFilePath); if (!targetPrep) { throw new Error(`Missing metadata for ${testFilePath}`); @@ -1404,7 +1383,7 @@ export async function runEvalCommand( for (const results of targetResults) { allResults.push(...results); } - }); + } progressReporter.finish(); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 341b71d30..efa3ddc8f 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -190,6 +190,20 @@ export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token" agentv eval evals/my-eval.yaml --export-otel ``` +### Parallelism + +The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). Eval files always run sequentially — one file completes before the next starts. + +```bash +agentv eval evals/my-eval.yaml --workers 4 +# Up to 4 test cases from the file run concurrently + +agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3 +# Files run one at a time; within each file, up to 3 test cases run in parallel +``` + +This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration. + ### Workspace Modes and Finish Policy Use workspace mode and finish policies instead of multiple conflicting booleans: diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index 0969f710b..58e024e2b 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -135,6 +135,8 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10. +**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Within each file, pool slots support concurrent workers as described above. + ## Drift detection If you change the workspace config (e.g., update a repo URL or checkout ref), the computed fingerprint changes. AgentV detects this drift by comparing the stored `metadata.json` fingerprint against the newly computed one: diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 9e6f7de1e..66111a9b6 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -194,6 +194,8 @@ export type EvalSuiteResult = { readonly failOnError?: import('./types.js').FailOnError; /** Suite-level quality threshold (0-1) — suite fails if mean score is below */ readonly threshold?: number; + /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */ + readonly workspacePath?: string; }; /** @@ -212,7 +214,11 @@ export async function loadTestSuite( if (format === 'agent-skills-json') { return { tests: await loadTestsFromAgentSkills(evalFilePath) }; } - const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options); + const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml( + evalFilePath, + repoRoot, + options, + ); const metadata = parseMetadata(parsed); const failOnError = extractFailOnError(parsed); const threshold = extractThreshold(parsed); @@ -226,6 +232,7 @@ export async function loadTestSuite( ...(metadata !== undefined && { metadata }), ...(failOnError !== undefined && { failOnError }), ...(threshold !== undefined && { threshold }), + ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }), }; } @@ -256,7 +263,7 @@ async function loadTestsFromYaml( evalFilePath: string, repoRoot: URL | string, options?: LoadOptions, -): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject }> { +): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> { // YAML parsing (existing implementation) const verbose = options?.verbose ?? false; const filterPattern = options?.filter; @@ -524,7 +531,7 @@ async function loadTestsFromYaml( results.push(testCase); } - return { tests: results, parsed: suite }; + return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path }; } /**