Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,12 @@ export const evalRunCommand = command({
description:
'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value',
}),
budgetUsd: option({
type: optional(number),
long: 'budget-usd',
description:
'Maximum total cost in USD across all eval files in this run. Stops dispatching new cases when exceeded.',
}),
tag: multioption({
type: array(string),
long: 'tag',
Expand All @@ -235,6 +241,10 @@ export const evalRunCommand = command({
}

const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd());
if (args.budgetUsd !== undefined && args.budgetUsd <= 0) {
console.error('Error: --budget-usd must be a positive number.');
process.exit(2);
}
const rawOptions: Record<string, unknown> = {
target: args.target,
targets: args.targets,
Expand Down Expand Up @@ -273,6 +283,7 @@ export const evalRunCommand = command({
model: args.model,
outputMessages: args.outputMessages,
threshold: args.threshold,
budgetUsd: args.budgetUsd,
tag: args.tag,
excludeTag: args.excludeTag,
transcript: args.transcript,
Expand All @@ -281,6 +292,9 @@ export const evalRunCommand = command({
if (result?.allExecutionErrors) {
process.exit(2);
}
if (result?.budgetExceeded) {
process.exit(1);
}
if (result?.thresholdFailed) {
process.exit(1);
}
Expand Down
53 changes: 53 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
type OtelTraceExporter as OtelTraceExporterType,
type ResolvedTarget,
ResponseCache,
RunBudgetTracker,
type TrialsConfig,
runEvaluation as defaultRunEvaluation,
deriveCategory,
Expand Down Expand Up @@ -118,6 +119,7 @@ interface NormalizedOptions {
readonly excludeTags: readonly string[];
readonly transcript?: string;
readonly experiment?: string;
readonly budgetUsd?: number;
}

function normalizeBoolean(value: unknown): boolean {
Expand Down Expand Up @@ -393,6 +395,7 @@ function normalizeOptions(
excludeTags: normalizeStringArray(rawOptions.excludeTag),
transcript: normalizeString(rawOptions.transcript),
experiment: normalizeString(rawOptions.experiment),
budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
} satisfies NormalizedOptions;
}

Expand Down Expand Up @@ -681,6 +684,7 @@ async function runSingleEvalFile(params: {
readonly trialsConfig?: TrialsConfig;
readonly matrixMode?: boolean;
readonly budgetUsd?: number;
readonly runBudgetTracker?: RunBudgetTracker;
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly providerFactory?: (
Expand All @@ -707,6 +711,7 @@ async function runSingleEvalFile(params: {
trialsConfig,
matrixMode,
budgetUsd,
runBudgetTracker,
failOnError,
providerFactory,
} = params;
Expand Down Expand Up @@ -803,6 +808,7 @@ async function runSingleEvalFile(params: {
keepWorkspaces: options.keepWorkspaces,
trials: trialsConfig,
budgetUsd,
runBudgetTracker,
failOnError,
graderTarget: options.graderTarget,
model: options.model,
Expand Down Expand Up @@ -887,6 +893,8 @@ export interface RunEvalResult {
readonly thresholdFailed?: boolean;
/** True when all tests had execution errors and no evaluation was performed */
readonly allExecutionErrors?: boolean;
/** True when --budget-usd was set and the run-level budget was exceeded */
readonly budgetExceeded?: boolean;
}

interface RemoteEvalSummaryInput {
Expand Down Expand Up @@ -1150,6 +1158,12 @@ export async function runEvalCommand(
const seenTestCases = new Set<string>();
const displayIdTracker = createDisplayIdTracker();

// Run-level budget tracker: caps total cost across all eval files in this run.
const runBudgetTracker = options.budgetUsd ? new RunBudgetTracker(options.budgetUsd) : undefined;
if (runBudgetTracker) {
console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`);
}

// Each file gets the full worker budget — no splitting across files
const perFileWorkers = options.workers;
const fileMetadata = new Map<
Expand Down Expand Up @@ -1388,6 +1402,35 @@ export async function runEvalCommand(
// workspace races without any grouping complexity.
try {
for (const testFilePath of activeTestFiles) {
// Run-level budget check: skip remaining files if budget exceeded
if (runBudgetTracker?.isExceeded()) {
const targetPrep = fileMetadata.get(testFilePath);
if (!targetPrep) continue;
const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`;
console.log(`\n⚠ ${budgetMsg} — skipping ${path.basename(testFilePath)}`);
for (const { selection } of targetPrep.selections) {
const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({
timestamp: new Date().toISOString(),
testId: testCase.id,
score: 0,
assertions: [],
output: [],
error: budgetMsg,
budgetExceeded: true,
executionStatus: 'execution_error' as const,
failureStage: 'setup' as const,
failureReasonCode: 'budget_exceeded' as const,
executionError: { message: budgetMsg, stage: 'setup' as const },
target: selection.targetName,
}));
for (const r of skippedResults) {
await outputWriter.append(r);
}
allResults.push(...skippedResults);
}
continue;
}

const targetPrep = fileMetadata.get(testFilePath);
if (!targetPrep) {
throw new Error(`Missing metadata for ${testFilePath}`);
Expand Down Expand Up @@ -1440,6 +1483,7 @@ export async function runEvalCommand(
trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
matrixMode: targetPrep.selections.length > 1,
budgetUsd: targetPrep.budgetUsd,
runBudgetTracker,
failOnError: targetPrep.failOnError,
threshold: resolvedThreshold,
providerFactory: transcriptProviderFactory,
Expand Down Expand Up @@ -1658,13 +1702,22 @@ export async function runEvalCommand(
);
}

// Print run-level budget summary when exceeded
const runBudgetExceeded = runBudgetTracker?.isExceeded() ?? false;
if (runBudgetExceeded) {
console.log(
`\n⚠ Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`,
);
}

return {
executionErrorCount: summary.executionErrorCount,
outputPath,
testFiles: activeTestFiles,
target: options.target,
thresholdFailed,
allExecutionErrors,
budgetExceeded: runBudgetExceeded || undefined,
};
} finally {
unsubscribeCodexLogs();
Expand Down
16 changes: 16 additions & 0 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -270,4 +270,20 @@ describe('agentv eval CLI', () => {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});

// Verifies that `--budget-usd` reaches the evaluator as a shared run-level tracker.
// The mock evaluator records `budgetUsd` (expected to stay null here, i.e. no
// per-suite budget was passed) separately from the tracker's cap.
it('passes run-level budget tracking through to the evaluator', async () => {
  const fixture = await createFixture();
  const cliArgs = ['eval', fixture.testFilePath, '--budget-usd', '0.5'];
  const expectedDiagnostics = {
    budgetUsd: null,
    hasRunBudgetTracker: true,
    runBudgetCapUsd: 0.5,
  };

  try {
    await runCli(fixture, cliArgs);
    expect(await readDiagnostics(fixture)).toMatchObject(expectedDiagnostics);
  } finally {
    // Always remove the temporary fixture directory, even when the assertion fails.
    await rm(fixture.baseDir, { recursive: true, force: true });
  }
});
});
7 changes: 7 additions & 0 deletions apps/cli/test/fixtures/mock-run-evaluation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ interface RunEvaluationOptionsLike {
readonly filter?: string | readonly string[];
readonly evalCases?: ReadonlyArray<unknown>;
readonly verbose?: boolean;
readonly budgetUsd?: number;
readonly runBudgetTracker?: {
readonly budgetCapUsd?: number;
};
readonly onResult?: (result: EvaluationResultLike) => Promise<void> | void;
}

Expand Down Expand Up @@ -82,6 +86,9 @@ async function maybeWriteDiagnostics(
envSample: process.env.CLI_ENV_SAMPLE ?? null,
envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null,
envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null,
budgetUsd: options.budgetUsd ?? null,
hasRunBudgetTracker: options.runBudgetTracker !== undefined,
runBudgetCapUsd: options.runBudgetTracker?.budgetCapUsd ?? null,
evalCaseIds: Array.isArray(options.evalCases)
? options.evalCases
.map((evalCase) =>
Expand Down
72 changes: 59 additions & 13 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import {
isAgentProvider,
} from './providers/types.js';
import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './registry/index.js';
import type { RunBudgetTracker } from './run-budget-tracker.js';
import {
type TokenUsage,
type TraceSummary,
Expand Down Expand Up @@ -414,6 +415,8 @@ export interface RunEvaluationOptions {
readonly streamCallbacks?: ProviderStreamCallbacks;
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
readonly budgetUsd?: number;
/** Run-level total cost tracker shared across multiple eval files/targets in one CLI invocation */
readonly runBudgetTracker?: RunBudgetTracker;
/** Execution error tolerance: true halts on first error */
readonly failOnError?: FailOnError;
/** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
Expand Down Expand Up @@ -467,6 +470,7 @@ export async function runEvaluation(
trials,
streamCallbacks,
budgetUsd,
runBudgetTracker,
failOnError,
poolWorkspaces,
poolMaxSlots: configPoolMaxSlots,
Expand Down Expand Up @@ -1153,6 +1157,14 @@ export async function runEvaluation(
return { ok: allPassed, depResults };
}

/**
 * Derives the total cost (USD) attributable to one evaluated case.
 *
 * When the result carries trials, the cost is the sum of every trial's `costUsd`
 * (missing values count as 0); a sum that is not greater than zero is reported as
 * `undefined`. Without trials, the result's own `costUsd` is returned as-is.
 *
 * @param result - The completed evaluation result to inspect.
 * @returns The case cost in USD, or `undefined` when no cost is available.
 */
function extractEvaluationCostUsd(result: EvaluationResult): number | undefined {
  const trials = result.trials;
  if (!trials || trials.length === 0) {
    return result.costUsd;
  }

  let total = 0;
  for (const trial of trials) {
    total += trial.costUsd ?? 0;
  }
  return total > 0 ? total : undefined;
}

// Worker function: dispatches a single eval case with dependency context
async function dispatchTest(
evalCase: EvalTest,
Expand All @@ -1161,6 +1173,47 @@ export async function runEvaluation(
const workerId = nextWorkerId++;
workerIdByEvalId.set(evalCase.id, workerId);

// Check run-level budget before dispatching. This shared tracker spans all
// eval files/targets in the current CLI invocation, so queued cases stop once
// cumulative spend reaches the cap while already-running cases are allowed to finish.
if (runBudgetTracker?.isExceeded()) {
const budgetResult: EvaluationResult = {
timestamp: (now ?? (() => new Date()))().toISOString(),
testId: evalCase.id,
suite: evalCase.suite,
category: evalCase.category,
score: 0,
assertions: [],
output: [],
target: target.name,
error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
budgetExceeded: true,
executionStatus: 'execution_error',
failureStage: 'setup',
failureReasonCode: 'budget_exceeded',
executionError: {
message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
stage: 'setup',
},
};

if (onProgress) {
await onProgress({
workerId,
testId: evalCase.id,
status: 'failed',
completedAt: Date.now(),
error: budgetResult.error,
score: budgetResult.score,
executionStatus: budgetResult.executionStatus,
});
}
if (onResult) {
await onResult(budgetResult);
}
return budgetResult;
}

// Check suite-level budget before dispatching
if (budgetUsd !== undefined && budgetExhausted) {
const budgetResult: EvaluationResult = {
Expand Down Expand Up @@ -1291,24 +1344,17 @@ export async function runEvaluation(
? await runEvalCaseWithTrials(runCaseOptions, trials)
: await runEvalCase(runCaseOptions);

// Track suite-level budget
if (budgetUsd !== undefined) {
// Sum all trial costs when trials are used, otherwise use trace cost
let caseCost: number | undefined;
if (result.trials && result.trials.length > 0) {
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
if (trialCostSum > 0) {
caseCost = trialCostSum;
}
} else {
caseCost = result.costUsd;
}
if (caseCost !== undefined) {
const caseCost = extractEvaluationCostUsd(result);
if (caseCost !== undefined) {
if (budgetUsd !== undefined) {
cumulativeBudgetCost += caseCost;
if (cumulativeBudgetCost >= budgetUsd) {
budgetExhausted = true;
}
}
if (runBudgetTracker) {
runBudgetTracker.add(caseCost);
}
}

// Track fail_on_error
Expand Down
42 changes: 42 additions & 0 deletions packages/core/src/evaluation/run-budget-tracker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/**
 * Tracks cumulative cost across all eval files in a single CLI run.
 *
 * The per-suite budget (`execution.budget_usd` in YAML) is enforced by the orchestrator
 * and caps spend within one eval file. This tracker provides a **run-level** cap that
 * spans all files in a single CLI invocation (the `--budget-usd` flag).
 *
 * Usage:
 *   1. Instantiate with the cap from `--budget-usd`.
 *   2. Share the tracker with each orchestrator running in the invocation.
 *   3. After each completed case, call `add()` with that case's total cost.
 *   4. Before dispatching the next case or file, check `isExceeded()`.
 *
 * Thread-safety note: AgentV mutates this tracker from async orchestration code, but all
 * updates occur on the JavaScript event loop. There is no shared-memory mutation across
 * threads, so simple cumulative accounting is sufficient here.
 */
export class RunBudgetTracker {
  /** Running total of all valid costs recorded so far, in USD. */
  private cumulative = 0;

  /**
   * @param capUsd - Run-level spending cap in USD. The cap is treated as reached once
   *   the accumulated cost is greater than or equal to this value.
   */
  constructor(private readonly capUsd: number) {}

  /**
   * Accumulate cost from a completed test or file.
   *
   * Non-finite values (`NaN`, `±Infinity`) and non-positive values are ignored.
   * A single `NaN` would otherwise poison the running total and make `isExceeded()`
   * return `false` forever, silently disabling the cap; a negative value would
   * reduce the recorded spend.
   *
   * @param costUsd - Cost of the completed unit of work, in USD.
   */
  add(costUsd: number): void {
    if (!Number.isFinite(costUsd) || costUsd <= 0) {
      return;
    }
    this.cumulative += costUsd;
  }

  /** True when cumulative cost meets or exceeds the cap. */
  isExceeded(): boolean {
    return this.cumulative >= this.capUsd;
  }

  /** Current accumulated cost. */
  get currentCostUsd(): number {
    return this.cumulative;
  }

  /** The configured cap. */
  get budgetCapUsd(): number {
    return this.capUsd;
  }
}
1 change: 1 addition & 0 deletions packages/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ export {
type AssertionResult,
} from './evaluation/graders/assertions.js';
export { discoverGraders } from './evaluation/registry/grader-discovery.js';
export { RunBudgetTracker } from './evaluation/run-budget-tracker.js';

// Import pipeline
export * from './import/index.js';
Expand Down
Loading
Loading