diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 18668aa53..d27985abb 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -209,6 +209,12 @@ export const evalRunCommand = command({ description: 'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value', }), + budgetUsd: option({ + type: optional(number), + long: 'budget-usd', + description: + 'Maximum total cost in USD across all eval files in this run. Stops dispatching new cases when exceeded.', + }), tag: multioption({ type: array(string), long: 'tag', @@ -235,6 +241,10 @@ export const evalRunCommand = command({ } const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd()); + if (args.budgetUsd !== undefined && args.budgetUsd <= 0) { + console.error('Error: --budget-usd must be a positive number.'); + process.exit(2); + } const rawOptions: Record = { target: args.target, targets: args.targets, @@ -273,6 +283,7 @@ export const evalRunCommand = command({ model: args.model, outputMessages: args.outputMessages, threshold: args.threshold, + budgetUsd: args.budgetUsd, tag: args.tag, excludeTag: args.excludeTag, transcript: args.transcript, @@ -281,6 +292,9 @@ export const evalRunCommand = command({ if (result?.allExecutionErrors) { process.exit(2); } + if (result?.budgetExceeded) { + process.exit(1); + } if (result?.thresholdFailed) { process.exit(1); } diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index a69078be0..50451658f 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -13,6 +13,7 @@ import { type OtelTraceExporter as OtelTraceExporterType, type ResolvedTarget, ResponseCache, + RunBudgetTracker, type TrialsConfig, runEvaluation as defaultRunEvaluation, deriveCategory, @@ -118,6 +119,7 @@ interface NormalizedOptions { readonly excludeTags: readonly string[]; 
readonly transcript?: string; readonly experiment?: string; + readonly budgetUsd?: number; } function normalizeBoolean(value: unknown): boolean { @@ -393,6 +395,7 @@ function normalizeOptions( excludeTags: normalizeStringArray(rawOptions.excludeTag), transcript: normalizeString(rawOptions.transcript), experiment: normalizeString(rawOptions.experiment), + budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd), } satisfies NormalizedOptions; } @@ -681,6 +684,7 @@ async function runSingleEvalFile(params: { readonly trialsConfig?: TrialsConfig; readonly matrixMode?: boolean; readonly budgetUsd?: number; + readonly runBudgetTracker?: RunBudgetTracker; readonly failOnError?: FailOnError; readonly threshold?: number; readonly providerFactory?: ( @@ -707,6 +711,7 @@ async function runSingleEvalFile(params: { trialsConfig, matrixMode, budgetUsd, + runBudgetTracker, failOnError, providerFactory, } = params; @@ -803,6 +808,7 @@ async function runSingleEvalFile(params: { keepWorkspaces: options.keepWorkspaces, trials: trialsConfig, budgetUsd, + runBudgetTracker, failOnError, graderTarget: options.graderTarget, model: options.model, @@ -887,6 +893,8 @@ export interface RunEvalResult { readonly thresholdFailed?: boolean; /** True when all tests had execution errors and no evaluation was performed */ readonly allExecutionErrors?: boolean; + /** True when --budget-usd was set and the run-level budget was exceeded */ + readonly budgetExceeded?: boolean; } interface RemoteEvalSummaryInput { @@ -1150,6 +1158,12 @@ export async function runEvalCommand( const seenTestCases = new Set(); const displayIdTracker = createDisplayIdTracker(); + // Run-level budget tracker: caps total cost across all eval files in this run. + const runBudgetTracker = options.budgetUsd ? 
new RunBudgetTracker(options.budgetUsd) : undefined; + if (runBudgetTracker) { + console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`); + } + // Each file gets the full worker budget — no splitting across files const perFileWorkers = options.workers; const fileMetadata = new Map< @@ -1388,6 +1402,35 @@ export async function runEvalCommand( // workspace races without any grouping complexity. try { for (const testFilePath of activeTestFiles) { + // Run-level budget check: skip remaining files if budget exceeded + if (runBudgetTracker?.isExceeded()) { + const targetPrep = fileMetadata.get(testFilePath); + if (!targetPrep) continue; + const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`; + console.log(`\n⚠ ${budgetMsg} — skipping ${path.basename(testFilePath)}`); + for (const { selection } of targetPrep.selections) { + const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({ + timestamp: new Date().toISOString(), + testId: testCase.id, + score: 0, + assertions: [], + output: [], + error: budgetMsg, + budgetExceeded: true, + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'budget_exceeded' as const, + executionError: { message: budgetMsg, stage: 'setup' as const }, + target: selection.targetName, + })); + for (const r of skippedResults) { + await outputWriter.append(r); + } + allResults.push(...skippedResults); + } + continue; + } + const targetPrep = fileMetadata.get(testFilePath); if (!targetPrep) { throw new Error(`Missing metadata for ${testFilePath}`); @@ -1440,6 +1483,7 @@ export async function runEvalCommand( trialsConfig: options.transcript ? 
undefined : targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, budgetUsd: targetPrep.budgetUsd, + runBudgetTracker, failOnError: targetPrep.failOnError, threshold: resolvedThreshold, providerFactory: transcriptProviderFactory, @@ -1658,6 +1702,14 @@ export async function runEvalCommand( ); } + // Print run-level budget summary when exceeded + const runBudgetExceeded = runBudgetTracker?.isExceeded() ?? false; + if (runBudgetExceeded) { + console.log( + `\n⚠ Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`, + ); + } + return { executionErrorCount: summary.executionErrorCount, outputPath, @@ -1665,6 +1717,7 @@ export async function runEvalCommand( target: options.target, thresholdFailed, allExecutionErrors, + budgetExceeded: runBudgetExceeded || undefined, }; } finally { unsubscribeCodexLogs(); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index c97599fed..f045fab58 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -270,4 +270,20 @@ describe('agentv eval CLI', () => { await rm(fixture.baseDir, { recursive: true, force: true }); } }); + + it('passes run-level budget tracking through to the evaluator', async () => { + const fixture = await createFixture(); + try { + await runCli(fixture, ['eval', fixture.testFilePath, '--budget-usd', '0.5']); + + const diagnostics = await readDiagnostics(fixture); + expect(diagnostics).toMatchObject({ + budgetUsd: null, + hasRunBudgetTracker: true, + runBudgetCapUsd: 0.5, + }); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }); }); diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 524f4b016..91bc84080 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -18,6 +18,10 @@ interface 
RunEvaluationOptionsLike { readonly filter?: string | readonly string[]; readonly evalCases?: ReadonlyArray; readonly verbose?: boolean; + readonly budgetUsd?: number; + readonly runBudgetTracker?: { + readonly budgetCapUsd?: number; + }; readonly onResult?: (result: EvaluationResultLike) => Promise | void; } @@ -82,6 +86,9 @@ async function maybeWriteDiagnostics( envSample: process.env.CLI_ENV_SAMPLE ?? null, envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null, envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null, + budgetUsd: options.budgetUsd ?? null, + hasRunBudgetTracker: options.runBudgetTracker !== undefined, + runBudgetCapUsd: options.runBudgetTracker?.budgetCapUsd ?? null, evalCaseIds: Array.isArray(options.evalCases) ? options.evalCases .map((evalCase) => diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 6a1a3a0c4..1744b8b7c 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -42,6 +42,7 @@ import { isAgentProvider, } from './providers/types.js'; import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './registry/index.js'; +import type { RunBudgetTracker } from './run-budget-tracker.js'; import { type TokenUsage, type TraceSummary, @@ -414,6 +415,8 @@ export interface RunEvaluationOptions { readonly streamCallbacks?: ProviderStreamCallbacks; /** Suite-level total cost budget in USD (stops dispatching when exceeded) */ readonly budgetUsd?: number; + /** Run-level total cost tracker shared across multiple eval files/targets in one CLI invocation */ + readonly runBudgetTracker?: RunBudgetTracker; /** Execution error tolerance: true halts on first error */ readonly failOnError?: FailOnError; /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */ @@ -467,6 +470,7 @@ export async function runEvaluation( trials, streamCallbacks, budgetUsd, + runBudgetTracker, failOnError, 
/**
 * Resolves the USD cost to charge against budgets for one finished eval case.
 *
 * - When the result carries one or more trials, the cost is the sum of the
 *   per-trial `costUsd` values (a trial without a cost counts as 0). A sum that
 *   is not strictly positive is reported as `undefined`; the top-level
 *   `costUsd` is not consulted in this branch.
 * - Otherwise the top-level `costUsd` is returned unchanged (possibly `undefined`).
 *
 * @param result Completed evaluation result for a single case.
 * @returns The cost in USD, or `undefined` when no cost is known.
 */
function extractEvaluationCostUsd(result: EvaluationResult): number | undefined {
  const trialRuns = result.trials;

  // No trials recorded: fall back to the case-level cost.
  if (!trialRuns || trialRuns.length === 0) {
    return result.costUsd;
  }

  let totalUsd = 0;
  for (const trial of trialRuns) {
    totalUsd += trial.costUsd ?? 0;
  }

  // A zero total means no trial reported a cost, so treat it as unknown.
  if (totalUsd > 0) {
    return totalUsd;
  }
  return undefined;
}
(() => new Date()))().toISOString(), + testId: evalCase.id, + suite: evalCase.suite, + category: evalCase.category, + score: 0, + assertions: [], + output: [], + target: target.name, + error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`, + budgetExceeded: true, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'budget_exceeded', + executionError: { + message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`, + stage: 'setup', + }, + }; + + if (onProgress) { + await onProgress({ + workerId, + testId: evalCase.id, + status: 'failed', + completedAt: Date.now(), + error: budgetResult.error, + score: budgetResult.score, + executionStatus: budgetResult.executionStatus, + }); + } + if (onResult) { + await onResult(budgetResult); + } + return budgetResult; + } + // Check suite-level budget before dispatching if (budgetUsd !== undefined && budgetExhausted) { const budgetResult: EvaluationResult = { @@ -1291,24 +1344,17 @@ export async function runEvaluation( ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions); - // Track suite-level budget - if (budgetUsd !== undefined) { - // Sum all trial costs when trials are used, otherwise use trace cost - let caseCost: number | undefined; - if (result.trials && result.trials.length > 0) { - const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 
/**
 * Tracks cumulative cost across all eval files in a single CLI run.
 *
 * The per-suite budget (`execution.budget_usd` in YAML) is enforced by the orchestrator
 * and caps spend within one eval file. This tracker provides a **run-level** cap that
 * spans all files in a single `agentv eval` invocation.
 *
 * Usage:
 *   1. Instantiate with the cap from `--budget-usd`.
 *   2. Share the tracker with each orchestrator running in the invocation.
 *   3. After each completed case, call `add()` with that case's total cost.
 *   4. Before dispatching the next case or file, check `isExceeded()`.
 *
 * Thread-safety note: AgentV mutates this tracker from async orchestration code, but all
 * updates occur on the JavaScript event loop. There is no shared-memory mutation across
 * threads, so simple cumulative accounting is sufficient here.
 */
export class RunBudgetTracker {
  /** Running total of all costs recorded so far, in USD. */
  private cumulative = 0;

  /**
   * @param capUsd Run-level spending cap in USD. The budget counts as exceeded
   *   as soon as the recorded total reaches this value.
   */
  constructor(private readonly capUsd: number) {}

  /**
   * Accumulate cost from a completed test or file.
   *
   * `NaN`, zero and negative amounts are ignored. A single `NaN` would turn the
   * running total into `NaN`, after which `isExceeded()` could never return
   * `true` and the cap would be silently disabled; a negative amount would
   * reduce the recorded spend.
   *
   * @param costUsd Cost of the completed work in USD.
   */
  add(costUsd: number): void {
    if (Number.isNaN(costUsd) || costUsd <= 0) {
      return;
    }
    this.cumulative += costUsd;
  }

  /**
   * True when cumulative cost meets or exceeds the cap.
   *
   * @returns `true` once the recorded total is greater than or equal to the cap.
   */
  isExceeded(): boolean {
    return this.cumulative >= this.capUsd;
  }

  /** Current accumulated cost in USD. */
  get currentCostUsd(): number {
    return this.cumulative;
  }

  /** The configured cap in USD. */
  get budgetCapUsd(): number {
    return this.capUsd;
  }
}
// Unit tests for RunBudgetTracker: initial state, accumulation, and cap detection.
describe('RunBudgetTracker', () => {
  it('starts with zero cumulative cost', () => {
    const budget = new RunBudgetTracker(10);

    // A fresh tracker has spent nothing and reports its cap unchanged.
    expect(budget.currentCostUsd).toBe(0);
    expect(budget.budgetCapUsd).toBe(10);
    expect(budget.isExceeded()).toBe(false);
  });

  it('accumulates cost and detects when budget is exceeded', () => {
    const budget = new RunBudgetTracker(1.0);

    // 0.4 spent: still under the cap.
    budget.add(0.4);
    expect(budget.currentCostUsd).toBe(0.4);
    expect(budget.isExceeded()).toBe(false);

    // 0.9 spent: still under the cap.
    budget.add(0.5);
    expect(budget.currentCostUsd).toBeCloseTo(0.9);
    expect(budget.isExceeded()).toBe(false);

    // 1.1 spent: over the cap.
    budget.add(0.2);
    expect(budget.currentCostUsd).toBeCloseTo(1.1);
    expect(budget.isExceeded()).toBe(true);
  });

  it('treats exact cap as exceeded', () => {
    const budget = new RunBudgetTracker(1.0);
    budget.add(1.0);

    expect(budget.isExceeded()).toBe(true);
  });

  it('handles many small additions', () => {
    const budget = new RunBudgetTracker(0.5);

    // One hundred additions of a tenth of a cent each.
    let remaining = 100;
    while (remaining > 0) {
      budget.add(0.001);
      remaining -= 1;
    }
    expect(budget.currentCostUsd).toBeCloseTo(0.1);
    expect(budget.isExceeded()).toBe(false);

    // A single large addition pushes the total past the cap.
    budget.add(0.5);
    expect(budget.isExceeded()).toBe(true);
  });

  it('never exceeds with zero-cost additions', () => {
    const budget = new RunBudgetTracker(0.01);

    budget.add(0);
    budget.add(0);

    expect(budget.isExceeded()).toBe(false);
  });
});