From 3f0aff322cb2362de569af4fe8b3c2f65b156892 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 06:51:18 +0000 Subject: [PATCH 1/8] docs: clarify --workers parallelism scope and workspace-mode semantics (#1039) Answers the questions from #1039: - --workers N is a global concurrent test-case budget: with a single eval file, N test cases run in parallel; with M eval files, min(N, M) files run concurrently each with floor(N/min(N,M)) workers. - Documents the static workspace race condition when multiple eval files share the same path and run concurrently, with --workers 1 as the serialization escape hatch. - Updates workspace-pool.mdx concurrency section to explain multi-file slot allocation and the static-workspace cross-file hazard. Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/commands/run.ts | 2 +- .../docs/docs/evaluation/running-evals.mdx | 31 +++++++++++++++++++ .../docs/docs/guides/workspace-pool.mdx | 4 +++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 282d8d655..33ce025d4 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -41,7 +41,7 @@ export const evalRunCommand = command({ type: optional(number), long: 'workers', description: - 'Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml', + 'Maximum concurrent test cases across the run (default: 3, max: 50). With a single eval file, N test cases run in parallel. With multiple eval files, the budget is split: up to min(N, files) eval files run concurrently, each receiving floor(N / concurrent_files) workers. Use --workers 1 to serialize everything. 
Can also be set per-target in targets.yaml', }), out: option({ type: optional(string), diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 341b71d30..df0a6b6db 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -190,6 +190,37 @@ export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token" agentv eval evals/my-eval.yaml --export-otel ``` +### Parallelism + +The `--workers N` flag is a **global concurrent test-case budget** for the entire run. + +**Single eval file** — up to N test cases run in parallel within that file: + +```bash +agentv eval evals/my-eval.yaml --workers 4 +# Up to 4 test cases run concurrently +``` + +**Multiple eval files** — workers are split across files: `min(N, files)` eval files run concurrently, each getting `floor(N / concurrent_files)` workers for intra-file parallelism: + +```bash +agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 6 +# 3 eval files run concurrently, each with 2 workers → up to 6 concurrent test cases total +``` + +:::caution[Static workspace race conditions] +When multiple eval files reference the **same static workspace path** (e.g., via an env var like `EVAL_WORKSPACE_PATH`), concurrent execution can corrupt the workspace — for example, one file resetting a repo while another file's tests are running against it. + +To serialize all eval files (one file at a time), use `--workers 1`: + +```bash +agentv eval evals/**/*.yaml --workers 1 +# One file at a time; one test case at a time within each file +``` + +Alternatively, use `--workspace-mode pooled` (the default for evals with `repos`). Pooled mode allocates separate slots per worker, so concurrent test cases and concurrent eval files never share a workspace directory. See [Workspace Pool](/docs/guides/workspace-pool/) for details. 
+::: + ### Workspace Modes and Finish Policy Use workspace mode and finish policies instead of multiple conflicting booleans: diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index 0969f710b..6a5a99c89 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -135,6 +135,10 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10. +**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they may run concurrently (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces handle this safely — each active worker (across all eval files) acquires its own slot. Eval files that share the same workspace config (same fingerprint) draw from the same pool; eval files with different configs maintain separate pools. No cross-file workspace contention occurs in pooled mode. + +**Static workspaces and multiple eval files:** If you use `--workspace-mode static --workspace-path /path` across multiple eval files, all files will share that single directory concurrently. This can cause race conditions — for example, one file running `git checkout ` while another file's tests read from the same repository. To prevent this, either use `--workers 1` to serialize execution or use pooled mode instead. + ## Drift detection If you change the workspace config (e.g., update a repo URL or checkout ref), the computed fingerprint changes. 
AgentV detects this drift by comparing the stored `metadata.json` fingerprint against the newly computed one: From ece25fce66cb398fe483a9eab59a11701f054f1f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:10:43 +0000 Subject: [PATCH 2/8] feat: warn when multiple eval files share a static workspace path and run concurrently Surfaces workspace.path from EvalSuiteResult so the CLI can detect cross-file workspace collisions before starting concurrent execution. Emits a console.warn pointing users to --workers 1 as the fix. Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/run-eval.ts | 32 +++++++++++++++++++++ packages/core/src/evaluation/yaml-parser.ts | 9 ++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index b7ad4ffa1..e9f4463ac 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -498,6 +498,7 @@ async function prepareFileMetadata(params: { readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; + readonly workspacePath?: string; }> { const { testFilePath, repoRoot, cwd, options } = params; @@ -612,6 +613,7 @@ async function prepareFileMetadata(params: { failOnError: suite.failOnError, threshold: suite.threshold, tags: suite.metadata?.tags, + workspacePath: suite.workspacePath, }; } @@ -1115,6 +1117,7 @@ export async function runEvalCommand( readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; + readonly workspacePath?: string; } >(); // Separate TypeScript/JS eval files from YAML files. 
@@ -1284,6 +1287,35 @@ export async function runEvalCommand( // Use only files that survived tag filtering (fileMetadata keys) const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); + // Warn when multiple eval files share a static workspace path and will run concurrently, + // since concurrent writes to the same directory can corrupt each other's runs. + if (fileConcurrency > 1 && activeTestFiles.length > 1) { + const cliPath = options.workspacePath; + if (cliPath) { + console.warn( + `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. ` + + `Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`, + ); + } else { + const pathToFiles = new Map(); + for (const [filePath, meta] of fileMetadata.entries()) { + if (meta.workspacePath) { + const group = pathToFiles.get(meta.workspacePath) ?? []; + group.push(path.relative(cwd, filePath)); + pathToFiles.set(meta.workspacePath, group); + } + } + for (const [wsPath, files] of pathToFiles.entries()) { + if (files.length > 1) { + console.warn( + `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). ` + + `Concurrent writes to the same workspace directory may corrupt results. 
Use --workers 1 to serialize.`, + ); + } + } + } + } + // --transcript: create a shared TranscriptProvider and validate line count let transcriptProviderFactory: | ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider) diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 9e6f7de1e..6c6b95ea8 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -194,6 +194,8 @@ export type EvalSuiteResult = { readonly failOnError?: import('./types.js').FailOnError; /** Suite-level quality threshold (0-1) — suite fails if mean score is below */ readonly threshold?: number; + /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */ + readonly workspacePath?: string; }; /** @@ -212,7 +214,7 @@ export async function loadTestSuite( if (format === 'agent-skills-json') { return { tests: await loadTestsFromAgentSkills(evalFilePath) }; } - const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options); + const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(evalFilePath, repoRoot, options); const metadata = parseMetadata(parsed); const failOnError = extractFailOnError(parsed); const threshold = extractThreshold(parsed); @@ -226,6 +228,7 @@ export async function loadTestSuite( ...(metadata !== undefined && { metadata }), ...(failOnError !== undefined && { failOnError }), ...(threshold !== undefined && { threshold }), + ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }), }; } @@ -256,7 +259,7 @@ async function loadTestsFromYaml( evalFilePath: string, repoRoot: URL | string, options?: LoadOptions, -): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject }> { +): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> { // YAML parsing (existing implementation) const verbose = options?.verbose ?? 
false; const filterPattern = options?.filter; @@ -524,7 +527,7 @@ async function loadTestsFromYaml( results.push(testCase); } - return { tests: results, parsed: suite }; + return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path }; } /** From 477d4eed9240f5a07b7bc6fc029179d3c4cbe37a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:16:50 +0000 Subject: [PATCH 3/8] =?UTF-8?q?fix:=20lint=20=E2=80=94=20format=20long=20l?= =?UTF-8?q?ine=20and=20consolidate=20warn=20strings=20into=20template=20li?= =?UTF-8?q?terals?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/run-eval.ts | 6 ++---- packages/core/src/evaluation/yaml-parser.ts | 6 +++++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index e9f4463ac..d7f518be0 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1293,8 +1293,7 @@ export async function runEvalCommand( const cliPath = options.workspacePath; if (cliPath) { console.warn( - `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. ` + - `Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`, + `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`, ); } else { const pathToFiles = new Map(); @@ -1308,8 +1307,7 @@ export async function runEvalCommand( for (const [wsPath, files] of pathToFiles.entries()) { if (files.length > 1) { console.warn( - `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). 
` + - `Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`, + `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`, ); } } diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 6c6b95ea8..66111a9b6 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -214,7 +214,11 @@ export async function loadTestSuite( if (format === 'agent-skills-json') { return { tests: await loadTestsFromAgentSkills(evalFilePath) }; } - const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(evalFilePath, repoRoot, options); + const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml( + evalFilePath, + repoRoot, + options, + ); const metadata = parseMetadata(parsed); const failOnError = extractFailOnError(parsed); const threshold = extractThreshold(parsed); From 80f7d8e653e82d86f9c01f044f0ebb7f50e5fe99 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:38:03 +0000 Subject: [PATCH 4/8] feat: auto-serialize eval files that share a static workspace path Replace the global worker-budget-split scheduler with workspace-aware grouping: eval files sharing the same static workspace.path run sequentially within their group; groups with distinct paths (or no static workspace) run in parallel. Each file gets the full --workers N budget with no splitting. Also removes the now-unnecessary concurrent-workspace warning added in the previous commit and updates docs + CLI help to reflect the new semantics. 
Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/commands/run.ts | 2 +- apps/cli/src/commands/eval/run-eval.ts | 245 +++++++++--------- .../docs/docs/evaluation/running-evals.mdx | 27 +- .../docs/docs/guides/workspace-pool.mdx | 4 +- 4 files changed, 127 insertions(+), 151 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 33ce025d4..442edb65a 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -41,7 +41,7 @@ export const evalRunCommand = command({ type: optional(number), long: 'workers', description: - 'Maximum concurrent test cases across the run (default: 3, max: 50). With a single eval file, N test cases run in parallel. With multiple eval files, the budget is split: up to min(N, files) eval files run concurrently, each receiving floor(N / concurrent_files) workers. Use --workers 1 to serialize everything. Can also be set per-target in targets.yaml', + 'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files run sequentially unless they use distinct workspace paths, in which case they run in parallel. Can also be set per-target in targets.yaml', }), out: option({ type: optional(string), diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index d7f518be0..ace651c4c 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1090,15 +1090,8 @@ export async function runEvalCommand( const seenTestCases = new Set(); const displayIdTracker = createDisplayIdTracker(); - // Derive file-level concurrency from worker count (global) when provided - const totalWorkers = options.workers ?? DEFAULT_WORKERS; - const fileConcurrency = Math.min( - Math.max(1, totalWorkers), - Math.max(1, resolvedTestFiles.length), - ); - const perFileWorkers = options.workers - ? 
Math.max(1, Math.floor(totalWorkers / fileConcurrency)) - : undefined; + // Each file gets the full worker budget — no splitting across files + const perFileWorkers = options.workers; const fileMetadata = new Map< string, { @@ -1231,7 +1224,9 @@ export async function runEvalCommand( } throw new Error('No tests matched the provided filters.'); } - const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose }); + const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, { + verbose: options.verbose, + }); progressReporter.start(); progressReporter.setTotal(totalEvalCount); const seenCodexLogPaths = new Set(); @@ -1287,33 +1282,6 @@ export async function runEvalCommand( // Use only files that survived tag filtering (fileMetadata keys) const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); - // Warn when multiple eval files share a static workspace path and will run concurrently, - // since concurrent writes to the same directory can corrupt each other's runs. - if (fileConcurrency > 1 && activeTestFiles.length > 1) { - const cliPath = options.workspacePath; - if (cliPath) { - console.warn( - `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`, - ); - } else { - const pathToFiles = new Map(); - for (const [filePath, meta] of fileMetadata.entries()) { - if (meta.workspacePath) { - const group = pathToFiles.get(meta.workspacePath) ?? []; - group.push(path.relative(cwd, filePath)); - pathToFiles.set(meta.workspacePath, group); - } - } - for (const [wsPath, files] of pathToFiles.entries()) { - if (files.length > 1) { - console.warn( - `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). Concurrent writes to the same workspace directory may corrupt results. 
Use --workers 1 to serialize.`, - ); - } - } - } - } - // --transcript: create a shared TranscriptProvider and validate line count let transcriptProviderFactory: | ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider) @@ -1339,102 +1307,121 @@ export async function runEvalCommand( ); } - try { - await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => { - const targetPrep = fileMetadata.get(testFilePath); - if (!targetPrep) { - throw new Error(`Missing metadata for ${testFilePath}`); - } + // Group files by static workspace path. Files sharing the same static workspace run + // sequentially to prevent concurrent writes from racing. Files using pooled mode or + // no workspace each get their own group and run in parallel with other groups. + const workspaceGroups = new Map(); + for (const filePath of activeTestFiles) { + const staticPath = options.workspacePath ?? fileMetadata.get(filePath)?.workspacePath; + // Files with no static workspace get a unique key so they run in parallel + const groupKey = staticPath ?? filePath; + const group = workspaceGroups.get(groupKey) ?? []; + group.push(filePath); + workspaceGroups.set(groupKey, group); + } - // Run all targets concurrently (each target has its own worker limit) - const targetResults = await Promise.all( - targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { - // Filter test cases to those applicable to this target. - const targetName = selection.targetName; - const applicableTestCases = - targetPrep.selections.length > 1 - ? 
targetPrep.testCases.filter((test) => { - if (test.targets && test.targets.length > 0) { - return test.targets.includes(targetName); - } - return true; - }) - : targetPrep.testCases; - - if (applicableTestCases.length === 0) { - return []; + try { + await Promise.all( + [...workspaceGroups.values()].map(async (group) => { + for (const testFilePath of group) { + const targetPrep = fileMetadata.get(testFilePath); + if (!targetPrep) { + throw new Error(`Missing metadata for ${testFilePath}`); } - try { - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, - options, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: perFileWorkers, - yamlWorkers: targetPrep.yamlWorkers, - progressReporter, - seenTestCases, - displayIdTracker, - selection, - inlineTargetLabel, - testCases: applicableTestCases, - trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, - matrixMode: targetPrep.selections.length > 1, - totalBudgetUsd: targetPrep.totalBudgetUsd, - failOnError: targetPrep.failOnError, - threshold: resolvedThreshold, - providerFactory: transcriptProviderFactory, - }); - const evalFile = path.relative(cwd, testFilePath); - const existingSummary = remoteEvalSummaries.find( - (summary) => summary.evalFile === evalFile, - ); - if (existingSummary) { - existingSummary.results.push(...result.results); - } else { - remoteEvalSummaries.push({ - evalFile, - results: [...result.results], - }); - } - - return result.results; - } catch (fileError) { - // before_all or other setup failures should not abort the entire run. - // Mark all tests in this file as errors and continue with other files. - const message = fileError instanceof Error ? 
fileError.message : String(fileError); - console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); - const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ - timestamp: new Date().toISOString(), - testId: testCase.id, - score: 0, - assertions: [], - output: [], - scores: [], - error: message, - executionStatus: 'execution_error' as const, - failureStage: 'setup' as const, - failureReasonCode: 'setup_error' as const, - durationMs: 0, - tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, - target: selection.targetName, - })); - for (const errResult of errorResults) { - await outputWriter.append(errResult); - } - return errorResults; + // Run all targets concurrently (each target has its own worker limit) + const targetResults = await Promise.all( + targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { + // Filter test cases to those applicable to this target. + const targetName = selection.targetName; + const applicableTestCases = + targetPrep.selections.length > 1 + ? targetPrep.testCases.filter((test) => { + if (test.targets && test.targets.length > 0) { + return test.targets.includes(targetName); + } + return true; + }) + : targetPrep.testCases; + + if (applicableTestCases.length === 0) { + return []; + } + + try { + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: perFileWorkers, + yamlWorkers: targetPrep.yamlWorkers, + progressReporter, + seenTestCases, + displayIdTracker, + selection, + inlineTargetLabel, + testCases: applicableTestCases, + trialsConfig: options.transcript ? 
undefined : targetPrep.trialsConfig, + matrixMode: targetPrep.selections.length > 1, + totalBudgetUsd: targetPrep.totalBudgetUsd, + failOnError: targetPrep.failOnError, + threshold: resolvedThreshold, + providerFactory: transcriptProviderFactory, + }); + const evalFile = path.relative(cwd, testFilePath); + const existingSummary = remoteEvalSummaries.find( + (summary) => summary.evalFile === evalFile, + ); + if (existingSummary) { + existingSummary.results.push(...result.results); + } else { + remoteEvalSummaries.push({ + evalFile, + results: [...result.results], + }); + } + + return result.results; + } catch (fileError) { + // before_all or other setup failures should not abort the entire run. + // Mark all tests in this file as errors and continue with other files. + const message = fileError instanceof Error ? fileError.message : String(fileError); + console.error( + `\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, + ); + const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ + timestamp: new Date().toISOString(), + testId: testCase.id, + score: 0, + assertions: [], + output: [], + scores: [], + error: message, + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, + target: selection.targetName, + })); + for (const errResult of errorResults) { + await outputWriter.append(errResult); + } + return errorResults; + } + }), + ); + for (const results of targetResults) { + allResults.push(...results); } - }), - ); - for (const results of targetResults) { - allResults.push(...results); - } - }); + } + }), + ); progressReporter.finish(); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index df0a6b6db..b6ca5c233 100644 --- 
a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -192,34 +192,25 @@ agentv eval evals/my-eval.yaml --export-otel ### Parallelism -The `--workers N` flag is a **global concurrent test-case budget** for the entire run. - -**Single eval file** — up to N test cases run in parallel within that file: +The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). ```bash agentv eval evals/my-eval.yaml --workers 4 -# Up to 4 test cases run concurrently -``` - -**Multiple eval files** — workers are split across files: `min(N, files)` eval files run concurrently, each getting `floor(N / concurrent_files)` workers for intra-file parallelism: - -```bash -agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 6 -# 3 eval files run concurrently, each with 2 workers → up to 6 concurrent test cases total +# Up to 4 test cases from the file run concurrently ``` -:::caution[Static workspace race conditions] -When multiple eval files reference the **same static workspace path** (e.g., via an env var like `EVAL_WORKSPACE_PATH`), concurrent execution can corrupt the workspace — for example, one file resetting a repo while another file's tests are running against it. +**Multiple eval files** are scheduled by workspace path: -To serialize all eval files (one file at a time), use `--workers 1`: +- Files that share the **same static workspace path** run **sequentially** — one file completes before the next starts, so they never write to the same directory concurrently. +- Files with **distinct workspace paths** (or no workspace) run **in parallel**, each with the full `--workers N` budget. 
```bash -agentv eval evals/**/*.yaml --workers 1 -# One file at a time; one test case at a time within each file +# file1 and file2 share EVAL_WORKSPACE_PATH → run sequentially, 3 workers each +# file3 has its own path → runs in parallel with the file1/file2 group +agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3 ``` -Alternatively, use `--workspace-mode pooled` (the default for evals with `repos`). Pooled mode allocates separate slots per worker, so concurrent test cases and concurrent eval files never share a workspace directory. See [Workspace Pool](/docs/guides/workspace-pool/) for details. -::: +Pooled workspaces (the default for evals with `repos`) are always safe to run in parallel — each worker gets its own pool slot. See [Workspace Pool](/docs/guides/workspace-pool/) for details. ### Workspace Modes and Finish Policy diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index 6a5a99c89..72e2b8d94 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -135,9 +135,7 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10. -**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they may run concurrently (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces handle this safely — each active worker (across all eval files) acquires its own slot. Eval files that share the same workspace config (same fingerprint) draw from the same pool; eval files with different configs maintain separate pools. No cross-file workspace contention occurs in pooled mode. 
- -**Static workspaces and multiple eval files:** If you use `--workspace-mode static --workspace-path /path` across multiple eval files, all files will share that single directory concurrently. This can cause race conditions — for example, one file running `git checkout ` while another file's tests read from the same repository. To prevent this, either use `--workers 1` to serialize execution or use pooled mode instead. +**Multiple eval files:** When you pass multiple eval files to `agentv eval`, files with distinct workspace paths run in parallel while files that share a static workspace path are automatically serialized (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces are always safe to run in parallel — each active worker acquires its own slot regardless of which eval file it belongs to. No cross-file workspace contention occurs in pooled mode. ## Drift detection From 87f381ee8bc0398c1aed6542f1be3615a65bf60d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:44:25 +0000 Subject: [PATCH 5/8] perf: cap parallel workspace groups at --workers to bound total concurrency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runWithLimit limits how many groups run in parallel to --workers N. Each file within a group still gets the full --workers budget for within-file test-case parallelism. Max concurrent test cases is bounded by workers² rather than unbounded across all groups. Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/run-eval.ts | 192 ++++++++++++------------- 1 file changed, 95 insertions(+), 97 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index ace651c4c..166f8dc65 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1309,7 +1309,9 @@ export async function runEvalCommand( // Group files by static workspace path. 
Files sharing the same static workspace run // sequentially to prevent concurrent writes from racing. Files using pooled mode or - // no workspace each get their own group and run in parallel with other groups. + // no workspace each get their own group. Groups run in parallel up to --workers limit, + // each file within a group gets the full --workers budget for within-file concurrency. + const workers = options.workers ?? DEFAULT_WORKERS; const workspaceGroups = new Map(); for (const filePath of activeTestFiles) { const staticPath = options.workspacePath ?? fileMetadata.get(filePath)?.workspacePath; @@ -1321,107 +1323,103 @@ export async function runEvalCommand( } try { - await Promise.all( - [...workspaceGroups.values()].map(async (group) => { - for (const testFilePath of group) { - const targetPrep = fileMetadata.get(testFilePath); - if (!targetPrep) { - throw new Error(`Missing metadata for ${testFilePath}`); - } + await runWithLimit([...workspaceGroups.values()], workers, async (group) => { + for (const testFilePath of group) { + const targetPrep = fileMetadata.get(testFilePath); + if (!targetPrep) { + throw new Error(`Missing metadata for ${testFilePath}`); + } - // Run all targets concurrently (each target has its own worker limit) - const targetResults = await Promise.all( - targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { - // Filter test cases to those applicable to this target. - const targetName = selection.targetName; - const applicableTestCases = - targetPrep.selections.length > 1 - ? 
targetPrep.testCases.filter((test) => { - if (test.targets && test.targets.length > 0) { - return test.targets.includes(targetName); - } - return true; - }) - : targetPrep.testCases; - - if (applicableTestCases.length === 0) { - return []; + // Run all targets concurrently (each target has its own worker limit) + const targetResults = await Promise.all( + targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { + // Filter test cases to those applicable to this target. + const targetName = selection.targetName; + const applicableTestCases = + targetPrep.selections.length > 1 + ? targetPrep.testCases.filter((test) => { + if (test.targets && test.targets.length > 0) { + return test.targets.includes(targetName); + } + return true; + }) + : targetPrep.testCases; + + if (applicableTestCases.length === 0) { + return []; + } + + try { + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: perFileWorkers, + yamlWorkers: targetPrep.yamlWorkers, + progressReporter, + seenTestCases, + displayIdTracker, + selection, + inlineTargetLabel, + testCases: applicableTestCases, + trialsConfig: options.transcript ? 
undefined : targetPrep.trialsConfig, + matrixMode: targetPrep.selections.length > 1, + totalBudgetUsd: targetPrep.totalBudgetUsd, + failOnError: targetPrep.failOnError, + threshold: resolvedThreshold, + providerFactory: transcriptProviderFactory, + }); + const evalFile = path.relative(cwd, testFilePath); + const existingSummary = remoteEvalSummaries.find( + (summary) => summary.evalFile === evalFile, + ); + if (existingSummary) { + existingSummary.results.push(...result.results); + } else { + remoteEvalSummaries.push({ + evalFile, + results: [...result.results], + }); } - try { - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, - options, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: perFileWorkers, - yamlWorkers: targetPrep.yamlWorkers, - progressReporter, - seenTestCases, - displayIdTracker, - selection, - inlineTargetLabel, - testCases: applicableTestCases, - trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, - matrixMode: targetPrep.selections.length > 1, - totalBudgetUsd: targetPrep.totalBudgetUsd, - failOnError: targetPrep.failOnError, - threshold: resolvedThreshold, - providerFactory: transcriptProviderFactory, - }); - const evalFile = path.relative(cwd, testFilePath); - const existingSummary = remoteEvalSummaries.find( - (summary) => summary.evalFile === evalFile, - ); - if (existingSummary) { - existingSummary.results.push(...result.results); - } else { - remoteEvalSummaries.push({ - evalFile, - results: [...result.results], - }); - } - - return result.results; - } catch (fileError) { - // before_all or other setup failures should not abort the entire run. - // Mark all tests in this file as errors and continue with other files. - const message = fileError instanceof Error ? 
fileError.message : String(fileError); - console.error( - `\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, - ); - const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ - timestamp: new Date().toISOString(), - testId: testCase.id, - score: 0, - assertions: [], - output: [], - scores: [], - error: message, - executionStatus: 'execution_error' as const, - failureStage: 'setup' as const, - failureReasonCode: 'setup_error' as const, - durationMs: 0, - tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, - target: selection.targetName, - })); - for (const errResult of errorResults) { - await outputWriter.append(errResult); - } - return errorResults; + return result.results; + } catch (fileError) { + // before_all or other setup failures should not abort the entire run. + // Mark all tests in this file as errors and continue with other files. + const message = fileError instanceof Error ? fileError.message : String(fileError); + console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); + const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ + timestamp: new Date().toISOString(), + testId: testCase.id, + score: 0, + assertions: [], + output: [], + scores: [], + error: message, + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, + target: selection.targetName, + })); + for (const errResult of errorResults) { + await outputWriter.append(errResult); } - }), - ); - for (const results of targetResults) { - allResults.push(...results); - } + return errorResults; + } + }), + ); + for (const results of targetResults) { + allResults.push(...results); } - }), - ); + } + }); progressReporter.finish(); From 1d168e12a0df0c68e4c9e4aba8946d66c78b3f44 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 
12 Apr 2026 08:04:41 +0000 Subject: [PATCH 6/8] refactor: simplify to sequential eval files with parallel test cases Replace workspace-group scheduler with a plain for-of loop. Eval files always run sequentially; --workers N controls within-file test-case parallelism. This matches the standard model used by promptfoo and convex-evals and eliminates all cross-file workspace race conditions without any grouping complexity. Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/commands/run.ts | 2 +- apps/cli/src/commands/eval/run-eval.ts | 200 ++++++++---------- .../docs/docs/evaluation/running-evals.mdx | 14 +- .../docs/docs/guides/workspace-pool.mdx | 2 +- 4 files changed, 98 insertions(+), 120 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 442edb65a..8d3f5f18b 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -41,7 +41,7 @@ export const evalRunCommand = command({ type: optional(number), long: 'workers', description: - 'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files run sequentially unless they use distinct workspace paths, in which case they run in parallel. Can also be set per-target in targets.yaml', + 'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files always run sequentially. Can also be set per-target in targets.yaml', }), out: option({ type: optional(string), diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 166f8dc65..a09356920 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1307,119 +1307,105 @@ export async function runEvalCommand( ); } - // Group files by static workspace path. Files sharing the same static workspace run - // sequentially to prevent concurrent writes from racing. 
Files using pooled mode or - // no workspace each get their own group. Groups run in parallel up to --workers limit, - // each file within a group gets the full --workers budget for within-file concurrency. - const workers = options.workers ?? DEFAULT_WORKERS; - const workspaceGroups = new Map(); - for (const filePath of activeTestFiles) { - const staticPath = options.workspacePath ?? fileMetadata.get(filePath)?.workspacePath; - // Files with no static workspace get a unique key so they run in parallel - const groupKey = staticPath ?? filePath; - const group = workspaceGroups.get(groupKey) ?? []; - group.push(filePath); - workspaceGroups.set(groupKey, group); - } - + // Eval files run sequentially; within each file, --workers N test cases run in parallel. + // This matches industry practice (promptfoo, convex-evals) and avoids cross-file workspace + // races without any grouping complexity. try { - await runWithLimit([...workspaceGroups.values()], workers, async (group) => { - for (const testFilePath of group) { - const targetPrep = fileMetadata.get(testFilePath); - if (!targetPrep) { - throw new Error(`Missing metadata for ${testFilePath}`); - } + for (const testFilePath of activeTestFiles) { + const targetPrep = fileMetadata.get(testFilePath); + if (!targetPrep) { + throw new Error(`Missing metadata for ${testFilePath}`); + } - // Run all targets concurrently (each target has its own worker limit) - const targetResults = await Promise.all( - targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { - // Filter test cases to those applicable to this target. - const targetName = selection.targetName; - const applicableTestCases = - targetPrep.selections.length > 1 - ? 
targetPrep.testCases.filter((test) => { - if (test.targets && test.targets.length > 0) { - return test.targets.includes(targetName); - } - return true; - }) - : targetPrep.testCases; - - if (applicableTestCases.length === 0) { - return []; - } + // Run all targets concurrently (each target has its own worker limit) + const targetResults = await Promise.all( + targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { + // Filter test cases to those applicable to this target. + const targetName = selection.targetName; + const applicableTestCases = + targetPrep.selections.length > 1 + ? targetPrep.testCases.filter((test) => { + if (test.targets && test.targets.length > 0) { + return test.targets.includes(targetName); + } + return true; + }) + : targetPrep.testCases; + + if (applicableTestCases.length === 0) { + return []; + } - try { - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, - options, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: perFileWorkers, - yamlWorkers: targetPrep.yamlWorkers, - progressReporter, - seenTestCases, - displayIdTracker, - selection, - inlineTargetLabel, - testCases: applicableTestCases, - trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, - matrixMode: targetPrep.selections.length > 1, - totalBudgetUsd: targetPrep.totalBudgetUsd, - failOnError: targetPrep.failOnError, - threshold: resolvedThreshold, - providerFactory: transcriptProviderFactory, + try { + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: perFileWorkers, + yamlWorkers: targetPrep.yamlWorkers, + progressReporter, + seenTestCases, + displayIdTracker, + selection, + inlineTargetLabel, + testCases: applicableTestCases, + trialsConfig: options.transcript ? 
undefined : targetPrep.trialsConfig, + matrixMode: targetPrep.selections.length > 1, + totalBudgetUsd: targetPrep.totalBudgetUsd, + failOnError: targetPrep.failOnError, + threshold: resolvedThreshold, + providerFactory: transcriptProviderFactory, + }); + const evalFile = path.relative(cwd, testFilePath); + const existingSummary = remoteEvalSummaries.find( + (summary) => summary.evalFile === evalFile, + ); + if (existingSummary) { + existingSummary.results.push(...result.results); + } else { + remoteEvalSummaries.push({ + evalFile, + results: [...result.results], }); - const evalFile = path.relative(cwd, testFilePath); - const existingSummary = remoteEvalSummaries.find( - (summary) => summary.evalFile === evalFile, - ); - if (existingSummary) { - existingSummary.results.push(...result.results); - } else { - remoteEvalSummaries.push({ - evalFile, - results: [...result.results], - }); - } - - return result.results; - } catch (fileError) { - // before_all or other setup failures should not abort the entire run. - // Mark all tests in this file as errors and continue with other files. - const message = fileError instanceof Error ? 
fileError.message : String(fileError); - console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); - const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ - timestamp: new Date().toISOString(), - testId: testCase.id, - score: 0, - assertions: [], - output: [], - scores: [], - error: message, - executionStatus: 'execution_error' as const, - failureStage: 'setup' as const, - failureReasonCode: 'setup_error' as const, - durationMs: 0, - tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, - target: selection.targetName, - })); - for (const errResult of errorResults) { - await outputWriter.append(errResult); - } - return errorResults; } - }), - ); - for (const results of targetResults) { - allResults.push(...results); - } + + return result.results; + } catch (fileError) { + // before_all or other setup failures should not abort the entire run. + // Mark all tests in this file as errors and continue with other files. + const message = fileError instanceof Error ? 
fileError.message : String(fileError); + console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); + const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ + timestamp: new Date().toISOString(), + testId: testCase.id, + score: 0, + assertions: [], + output: [], + scores: [], + error: message, + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, + target: selection.targetName, + })); + for (const errResult of errorResults) { + await outputWriter.append(errResult); + } + return errorResults; + } + }), + ); + for (const results of targetResults) { + allResults.push(...results); } - }); + } progressReporter.finish(); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index b6ca5c233..ea5eb35f2 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -192,25 +192,17 @@ agentv eval evals/my-eval.yaml --export-otel ### Parallelism -The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). +The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). Eval files always run sequentially — one file completes before the next starts. ```bash agentv eval evals/my-eval.yaml --workers 4 # Up to 4 test cases from the file run concurrently -``` - -**Multiple eval files** are scheduled by workspace path: -- Files that share the **same static workspace path** run **sequentially** — one file completes before the next starts, so they never write to the same directory concurrently. -- Files with **distinct workspace paths** (or no workspace) run **in parallel**, each with the full `--workers N` budget. 
- -```bash -# file1 and file2 share EVAL_WORKSPACE_PATH → run sequentially, 3 workers each -# file3 has its own path → runs in parallel with the file1/file2 group agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3 +# Files run one at a time; within each file, up to 3 test cases run in parallel ``` -Pooled workspaces (the default for evals with `repos`) are always safe to run in parallel — each worker gets its own pool slot. See [Workspace Pool](/docs/guides/workspace-pool/) for details. +This matches the standard model used by eval frameworks (promptfoo, convex-evals) and avoids cross-file workspace races without any special configuration. ### Workspace Modes and Finish Policy diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index 72e2b8d94..58e024e2b 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -135,7 +135,7 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10. -**Multiple eval files:** When you pass multiple eval files to `agentv eval`, files with distinct workspace paths run in parallel while files that share a static workspace path are automatically serialized (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces are always safe to run in parallel — each active worker acquires its own slot regardless of which eval file it belongs to. No cross-file workspace contention occurs in pooled mode. +**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). 
Within each file, pool slots support concurrent workers as described above. ## Drift detection From efebcc9f73247b734094d514212a2d471cf4deb2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 08:10:18 +0000 Subject: [PATCH 7/8] docs: reference deepeval and OpenAI Evals instead of convex-evals Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/run-eval.ts | 4 ++-- apps/web/src/content/docs/docs/evaluation/running-evals.mdx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index a09356920..5f0176d9f 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1308,8 +1308,8 @@ export async function runEvalCommand( } // Eval files run sequentially; within each file, --workers N test cases run in parallel. - // This matches industry practice (promptfoo, convex-evals) and avoids cross-file workspace - // races without any grouping complexity. + // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file + // workspace races without any grouping complexity. try { for (const testFilePath of activeTestFiles) { const targetPrep = fileMetadata.get(testFilePath); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index ea5eb35f2..efa3ddc8f 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -202,7 +202,7 @@ agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3 # Files run one at a time; within each file, up to 3 test cases run in parallel ``` -This matches the standard model used by eval frameworks (promptfoo, convex-evals) and avoids cross-file workspace races without any special configuration. 
+This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration. ### Workspace Modes and Finish Policy From f09ffa753622cbba576e8da1e6b54c51b9369c8a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 08:15:21 +0000 Subject: [PATCH 8/8] chore: remove dead code from earlier iterations Remove runWithLimit (unused after switching to plain for-of loop) and workspacePath from fileMetadata Map type (set but never read). Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/eval/run-eval.ts | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 5f0176d9f..c5aa81a2b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -498,7 +498,6 @@ async function prepareFileMetadata(params: { readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; - readonly workspacePath?: string; }> { const { testFilePath, repoRoot, cwd, options } = params; @@ -613,29 +612,9 @@ async function prepareFileMetadata(params: { failOnError: suite.failOnError, threshold: suite.threshold, tags: suite.metadata?.tags, - workspacePath: suite.workspacePath, }; } -async function runWithLimit<T>( - items: readonly T[], - limit: number, - task: (item: T) => Promise<void>, -): Promise<void> { - const safeLimit = Math.max(1, limit); - let index = 0; - - const workers = Array.from({ length: safeLimit }, async () => { - while (index < items.length) { - const current = items[index]; - index += 1; - await task(current); - } - }); - - await Promise.all(workers); -} - async function runSingleEvalFile(params: { readonly testFilePath: string; readonly cwd: string; @@ -1110,7 +1089,6 @@ export async function runEvalCommand( readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; -
readonly workspacePath?: string; } >(); // Separate TypeScript/JS eval files from YAML files.