diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 003d8886..755d36a1 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -15,7 +15,9 @@ export function buildTestTargetKey(testId?: string, target?: string): string { } // Deduplication helper — keeps the last entry per (test_id, target) pair. -export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] { +export function deduplicateByTestIdTarget( + results: readonly EvaluationResult[], +): EvaluationResult[] { const seen = new Map(); for (let i = 0; i < results.length; i++) { seen.set(buildTestTargetKey(results[i].testId, results[i].target), i); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index b1737f19..a69078be 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -527,7 +527,7 @@ async function prepareFileMetadata(params: { readonly yamlWorkers?: number; readonly yamlCache?: boolean; readonly yamlCachePath?: string; - readonly totalBudgetUsd?: number; + readonly budgetUsd?: number; readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; @@ -654,7 +654,7 @@ async function prepareFileMetadata(params: { yamlWorkers: suite.workers, yamlCache: suite.cacheConfig?.enabled, yamlCachePath: suite.cacheConfig?.cachePath, - totalBudgetUsd: suite.totalBudgetUsd, + budgetUsd: suite.budgetUsd, failOnError: suite.failOnError, threshold: suite.threshold, tags: suite.metadata?.tags, @@ -680,7 +680,7 @@ async function runSingleEvalFile(params: { readonly testCases: readonly EvalTest[]; readonly trialsConfig?: TrialsConfig; readonly matrixMode?: boolean; - readonly totalBudgetUsd?: number; + readonly budgetUsd?: number; readonly failOnError?: FailOnError; readonly threshold?: number; readonly providerFactory?: ( @@ -706,7 +706,7 @@ async function runSingleEvalFile(params: { testCases, trialsConfig, matrixMode, - totalBudgetUsd, + budgetUsd, failOnError, providerFactory, } = params; @@ -802,7 +802,7 @@ async function runSingleEvalFile(params: { workspacePath: options.workspacePath, keepWorkspaces: options.keepWorkspaces, trials: trialsConfig, - totalBudgetUsd, + budgetUsd, failOnError, graderTarget: options.graderTarget, model: options.model, @@ -1166,7 +1166,7 @@ export async function runEvalCommand( readonly yamlWorkers?: number; readonly yamlCache?: boolean; readonly yamlCachePath?: string; - readonly totalBudgetUsd?: number; + readonly budgetUsd?: number; readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; @@ -1439,7 +1439,7 @@ export async function runEvalCommand( testCases: filteredTestCases, trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, - totalBudgetUsd: targetPrep.totalBudgetUsd, + budgetUsd: targetPrep.budgetUsd, failOnError: targetPrep.failOnError, threshold: resolvedThreshold, providerFactory: transcriptProviderFactory, diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index eeb68ac4..7aede85f 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -394,14 +394,22 @@ export function extractCacheConfig(suite: JsonObject): CacheConfig | undefined { * Extract suite-level total budget from parsed eval suite's execution block. * Returns undefined when not specified. */ -export function extractTotalBudgetUsd(suite: JsonObject): number | undefined { +export function extractBudgetUsd(suite: JsonObject): number | undefined { const execution = suite.execution; if (!execution || typeof execution !== 'object' || Array.isArray(execution)) { return undefined; } const executionObj = execution as Record; - const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd; + + // Reject the old key with a clear error + if ('total_budget_usd' in executionObj || 'totalBudgetUsd' in executionObj) { + throw new Error( + 'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.', + ); + } + + const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd; if (rawBudget === undefined || rawBudget === null) { return undefined; @@ -411,9 +419,7 @@ export function extractTotalBudgetUsd(suite: JsonObject): number | undefined { return rawBudget; } - logWarning( - `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`, - ); + logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`); return undefined; } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 2b77d604..6a1a3a0c 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -413,7 +413,7 @@ export interface RunEvaluationOptions { /** Real-time observability callbacks passed to the provider */ readonly streamCallbacks?: ProviderStreamCallbacks; /** Suite-level total cost budget in USD (stops dispatching when exceeded) */ - readonly totalBudgetUsd?: number; + readonly budgetUsd?: number; /** Execution error tolerance: true halts on first error */ readonly failOnError?: FailOnError; /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */ @@ -466,7 +466,7 @@ export async function runEvaluation( cleanupWorkspaces, trials, streamCallbacks, - totalBudgetUsd, + budgetUsd, failOnError, poolWorkspaces, poolMaxSlots: configPoolMaxSlots, @@ -1162,7 +1162,7 @@ export async function runEvaluation( workerIdByEvalId.set(evalCase.id, workerId); // Check suite-level budget before dispatching - if (totalBudgetUsd !== undefined && budgetExhausted) { + if (budgetUsd !== undefined && budgetExhausted) { const budgetResult: EvaluationResult = { timestamp: (now ?? (() => new Date()))().toISOString(), testId: evalCase.id, @@ -1172,13 +1172,13 @@ export async function runEvaluation( assertions: [], output: [], target: target.name, - error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, + error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`, budgetExceeded: true, executionStatus: 'execution_error', failureStage: 'setup', failureReasonCode: 'budget_exceeded', executionError: { - message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, + message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`, stage: 'setup', }, }; @@ -1292,7 +1292,7 @@ export async function runEvaluation( : await runEvalCase(runCaseOptions); // Track suite-level budget - if (totalBudgetUsd !== undefined) { + if (budgetUsd !== undefined) { // Sum all trial costs when trials are used, otherwise use trace cost let caseCost: number | undefined; if (result.trials && result.trials.length > 0) { @@ -1305,7 +1305,7 @@ export async function runEvaluation( } if (caseCost !== undefined) { cumulativeBudgetCost += caseCost; - if (cumulativeBudgetCost >= totalBudgetUsd) { + if (cumulativeBudgetCost >= budgetUsd) { budgetExhausted = true; } } diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 4cb155d2..66946323 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -366,8 +366,8 @@ const ExecutionSchema = z.object({ skip_defaults: z.boolean().optional(), cache: z.boolean().optional(), trials: TrialsSchema.optional(), - total_budget_usd: z.number().min(0).optional(), - totalBudgetUsd: z.number().min(0).optional(), + budget_usd: z.number().min(0).optional(), + budgetUsd: z.number().min(0).optional(), fail_on_error: FailOnErrorSchema.optional(), failOnError: FailOnErrorSchema.optional(), threshold: z.number().min(0).max(1).optional(), diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 6b6c8f00..928e73d5 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -8,6 +8,7 @@ import { interpolateEnv } from './interpolation.js'; import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js'; import { expandFileReferences, loadCasesFromFile } from './loaders/case-file-loader.js'; import { + extractBudgetUsd, extractCacheConfig, extractFailOnError, extractTargetFromSuite, @@ -15,7 +16,6 @@ import { extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, - extractTotalBudgetUsd, extractTrialsConfig, extractWorkersFromSuite, loadConfig, @@ -203,7 +203,7 @@ export type EvalSuiteResult = { /** Suite-level metadata (name, description, version, etc.) */ readonly metadata?: import('./metadata.js').EvalMetadata; /** Suite-level total cost budget in USD */ - readonly totalBudgetUsd?: number; + readonly budgetUsd?: number; /** Execution error tolerance: true or false */ readonly failOnError?: import('./types.js').FailOnError; /** Suite-level quality threshold (0-1) — suite fails if mean score is below */ @@ -243,7 +243,7 @@ export async function loadTestSuite( targetRefs: extractTargetRefsFromSuite(parsed), workers: extractWorkersFromSuite(parsed), cacheConfig: extractCacheConfig(parsed), - totalBudgetUsd: extractTotalBudgetUsd(parsed), + budgetUsd: extractBudgetUsd(parsed), ...(metadata !== undefined && { metadata }), ...(failOnError !== undefined && { failOnError }), ...(threshold !== undefined && { threshold }), diff --git a/packages/core/test/evaluation/loaders/config-loader.test.ts b/packages/core/test/evaluation/loaders/config-loader.test.ts index 2c8ac2ad..303bf3a2 100644 --- a/packages/core/test/evaluation/loaders/config-loader.test.ts +++ b/packages/core/test/evaluation/loaders/config-loader.test.ts @@ -1,13 +1,13 @@ import { describe, expect, it } from 'bun:test'; import { + extractBudgetUsd, extractFailOnError, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, - extractTotalBudgetUsd, extractTrialsConfig, parseExecutionDefaults, parseResultsConfig, @@ -380,40 +380,54 @@ describe('extractTargetsFromTestCase', () => { }); }); -describe('extractTotalBudgetUsd', () => { +describe('extractBudgetUsd', () => { it('returns undefined when no execution block', () => { const suite: JsonObject = { tests: [] }; - expect(extractTotalBudgetUsd(suite)).toBeUndefined(); + expect(extractBudgetUsd(suite)).toBeUndefined(); }); - it('returns undefined when no total_budget_usd in execution', () => { + it('returns undefined when no budget_usd in execution', () => { const suite: JsonObject = { execution: { target: 'default' } }; - expect(extractTotalBudgetUsd(suite)).toBeUndefined(); + expect(extractBudgetUsd(suite)).toBeUndefined(); }); - it('parses valid total_budget_usd (snake_case)', () => { - const suite: JsonObject = { execution: { total_budget_usd: 10.0 } }; - expect(extractTotalBudgetUsd(suite)).toBe(10.0); + it('parses valid budget_usd (snake_case)', () => { + const suite: JsonObject = { execution: { budget_usd: 10.0 } }; + expect(extractBudgetUsd(suite)).toBe(10.0); }); - it('parses valid totalBudgetUsd (camelCase)', () => { - const suite: JsonObject = { execution: { totalBudgetUsd: 5.5 } }; - expect(extractTotalBudgetUsd(suite)).toBe(5.5); + it('parses valid budgetUsd (camelCase)', () => { + const suite: JsonObject = { execution: { budgetUsd: 5.5 } }; + expect(extractBudgetUsd(suite)).toBe(5.5); }); it('returns undefined for zero budget', () => { - const suite: JsonObject = { execution: { total_budget_usd: 0 } }; - expect(extractTotalBudgetUsd(suite)).toBeUndefined(); + const suite: JsonObject = { execution: { budget_usd: 0 } }; + expect(extractBudgetUsd(suite)).toBeUndefined(); }); it('returns undefined for negative budget', () => { - const suite: JsonObject = { execution: { total_budget_usd: -1 } }; - expect(extractTotalBudgetUsd(suite)).toBeUndefined(); + const suite: JsonObject = { execution: { budget_usd: -1 } }; + expect(extractBudgetUsd(suite)).toBeUndefined(); }); it('returns undefined for non-number budget', () => { - const suite: JsonObject = { execution: { total_budget_usd: 'ten' } }; - expect(extractTotalBudgetUsd(suite)).toBeUndefined(); + const suite: JsonObject = { execution: { budget_usd: 'ten' } }; + expect(extractBudgetUsd(suite)).toBeUndefined(); + }); + + it('rejects old key total_budget_usd with a clear error', () => { + const suite: JsonObject = { execution: { total_budget_usd: 10.0 } }; + expect(() => extractBudgetUsd(suite)).toThrow( + 'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.', + ); + }); + + it('rejects old key totalBudgetUsd with a clear error', () => { + const suite: JsonObject = { execution: { totalBudgetUsd: 10.0 } }; + expect(() => extractBudgetUsd(suite)).toThrow( + 'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.', + ); }); }); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 70e75bad..a5a29084 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -2507,7 +2507,7 @@ describe('workspace.template .code-workspace resolution', () => { }); describe('suite-level total budget guardrail', () => { - it('completes normally when totalBudgetUsd is not set', async () => { + it('completes normally when budgetUsd is not set', async () => { const provider: Provider = { id: 'budget:mock', kind: 'mock' as const, @@ -2564,7 +2564,7 @@ describe('suite-level total budget guardrail', () => { providerFactory: () => provider, evaluators: evaluatorRegistry, evalCases, - totalBudgetUsd: 10.0, + budgetUsd: 10.0, }); expect(results).toHaveLength(2); @@ -2598,7 +2598,7 @@ describe('suite-level total budget guardrail', () => { providerFactory: () => provider, evaluators: evaluatorRegistry, evalCases, - totalBudgetUsd: 5.0, + budgetUsd: 5.0, maxConcurrency: 1, }); @@ -2647,7 +2647,7 @@ describe('suite-level total budget guardrail', () => { providerFactory: () => provider, evaluators: evaluatorRegistry, evalCases, - totalBudgetUsd: 5.0, + budgetUsd: 5.0, maxConcurrency: 1, trials: { count: 2, strategy: 'pass_at_k' }, }); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index e515225b..2f6fd88a 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -5015,11 +5015,11 @@ "required": ["count"], "additionalProperties": false }, - "total_budget_usd": { + "budget_usd": { "type": "number", "minimum": 0 }, - "totalBudgetUsd": { + "budgetUsd": { "type": "number", "minimum": 0 }, @@ -11543,11 +11543,11 @@ "required": ["count"], "additionalProperties": false }, - "total_budget_usd": { + "budget_usd": { "type": "number", "minimum": 0 }, - "totalBudgetUsd": { + "budgetUsd": { "type": "number", "minimum": 0 }, @@ -15682,11 +15682,11 @@ "required": ["count"], "additionalProperties": false }, - "total_budget_usd": { + "budget_usd": { "type": "number", "minimum": 0 }, - "totalBudgetUsd": { + "budgetUsd": { "type": "number", "minimum": 0 },