Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ export function buildTestTargetKey(testId?: string, target?: string): string {
}

// Deduplication helper — keeps the last entry per (test_id, target) pair.
export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] {
export function deduplicateByTestIdTarget(
results: readonly EvaluationResult[],
): EvaluationResult[] {
const seen = new Map<string, number>();
for (let i = 0; i < results.length; i++) {
seen.set(buildTestTargetKey(results[i].testId, results[i].target), i);
Expand Down
14 changes: 7 additions & 7 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ async function prepareFileMetadata(params: {
readonly yamlWorkers?: number;
readonly yamlCache?: boolean;
readonly yamlCachePath?: string;
readonly totalBudgetUsd?: number;
readonly budgetUsd?: number;
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly tags?: readonly string[];
Expand Down Expand Up @@ -654,7 +654,7 @@ async function prepareFileMetadata(params: {
yamlWorkers: suite.workers,
yamlCache: suite.cacheConfig?.enabled,
yamlCachePath: suite.cacheConfig?.cachePath,
totalBudgetUsd: suite.totalBudgetUsd,
budgetUsd: suite.budgetUsd,
failOnError: suite.failOnError,
threshold: suite.threshold,
tags: suite.metadata?.tags,
Expand All @@ -680,7 +680,7 @@ async function runSingleEvalFile(params: {
readonly testCases: readonly EvalTest[];
readonly trialsConfig?: TrialsConfig;
readonly matrixMode?: boolean;
readonly totalBudgetUsd?: number;
readonly budgetUsd?: number;
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly providerFactory?: (
Expand All @@ -706,7 +706,7 @@ async function runSingleEvalFile(params: {
testCases,
trialsConfig,
matrixMode,
totalBudgetUsd,
budgetUsd,
failOnError,
providerFactory,
} = params;
Expand Down Expand Up @@ -802,7 +802,7 @@ async function runSingleEvalFile(params: {
workspacePath: options.workspacePath,
keepWorkspaces: options.keepWorkspaces,
trials: trialsConfig,
totalBudgetUsd,
budgetUsd,
failOnError,
graderTarget: options.graderTarget,
model: options.model,
Expand Down Expand Up @@ -1166,7 +1166,7 @@ export async function runEvalCommand(
readonly yamlWorkers?: number;
readonly yamlCache?: boolean;
readonly yamlCachePath?: string;
readonly totalBudgetUsd?: number;
readonly budgetUsd?: number;
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly tags?: readonly string[];
Expand Down Expand Up @@ -1439,7 +1439,7 @@ export async function runEvalCommand(
testCases: filteredTestCases,
trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
matrixMode: targetPrep.selections.length > 1,
totalBudgetUsd: targetPrep.totalBudgetUsd,
budgetUsd: targetPrep.budgetUsd,
failOnError: targetPrep.failOnError,
threshold: resolvedThreshold,
providerFactory: transcriptProviderFactory,
Expand Down
16 changes: 11 additions & 5 deletions packages/core/src/evaluation/loaders/config-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -394,14 +394,22 @@ export function extractCacheConfig(suite: JsonObject): CacheConfig | undefined {
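* Extract the suite-level cost budget (execution.budget_usd / execution.budgetUsd) from a parsed eval suite.
* Returns undefined when the key is absent, null, or not a positive number (a warning is logged for invalid values).
* Throws if the renamed key execution.total_budget_usd (or totalBudgetUsd) is still present.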
*/
export function extractTotalBudgetUsd(suite: JsonObject): number | undefined {
export function extractBudgetUsd(suite: JsonObject): number | undefined {
const execution = suite.execution;
if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
return undefined;
}

const executionObj = execution as Record<string, unknown>;
const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;

// Reject the old key with a clear error
if ('total_budget_usd' in executionObj || 'totalBudgetUsd' in executionObj) {
throw new Error(
'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.',
);
}

const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;

if (rawBudget === undefined || rawBudget === null) {
return undefined;
Expand All @@ -411,9 +419,7 @@ export function extractTotalBudgetUsd(suite: JsonObject): number | undefined {
return rawBudget;
}

logWarning(
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`,
);
logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
return undefined;
}

Expand Down
14 changes: 7 additions & 7 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ export interface RunEvaluationOptions {
/** Real-time observability callbacks passed to the provider */
readonly streamCallbacks?: ProviderStreamCallbacks;
/** Suite-level cost budget in USD (dispatch of further cases stops once cumulative cost reaches or exceeds it) */
readonly totalBudgetUsd?: number;
readonly budgetUsd?: number;
/** Execution error tolerance: true halts on first error */
readonly failOnError?: FailOnError;
/** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
Expand Down Expand Up @@ -466,7 +466,7 @@ export async function runEvaluation(
cleanupWorkspaces,
trials,
streamCallbacks,
totalBudgetUsd,
budgetUsd,
failOnError,
poolWorkspaces,
poolMaxSlots: configPoolMaxSlots,
Expand Down Expand Up @@ -1162,7 +1162,7 @@ export async function runEvaluation(
workerIdByEvalId.set(evalCase.id, workerId);

// Check suite-level budget before dispatching
if (totalBudgetUsd !== undefined && budgetExhausted) {
if (budgetUsd !== undefined && budgetExhausted) {
const budgetResult: EvaluationResult = {
timestamp: (now ?? (() => new Date()))().toISOString(),
testId: evalCase.id,
Expand All @@ -1172,13 +1172,13 @@ export async function runEvaluation(
assertions: [],
output: [],
target: target.name,
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
budgetExceeded: true,
executionStatus: 'execution_error',
failureStage: 'setup',
failureReasonCode: 'budget_exceeded',
executionError: {
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
stage: 'setup',
},
};
Expand Down Expand Up @@ -1292,7 +1292,7 @@ export async function runEvaluation(
: await runEvalCase(runCaseOptions);

// Track suite-level budget
if (totalBudgetUsd !== undefined) {
if (budgetUsd !== undefined) {
// Sum all trial costs when trials are used, otherwise use trace cost
let caseCost: number | undefined;
if (result.trials && result.trials.length > 0) {
Expand All @@ -1305,7 +1305,7 @@ export async function runEvaluation(
}
if (caseCost !== undefined) {
cumulativeBudgetCost += caseCost;
if (cumulativeBudgetCost >= totalBudgetUsd) {
if (cumulativeBudgetCost >= budgetUsd) {
budgetExhausted = true;
}
}
Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/evaluation/validation/eval-file.schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,8 @@ const ExecutionSchema = z.object({
skip_defaults: z.boolean().optional(),
cache: z.boolean().optional(),
trials: TrialsSchema.optional(),
total_budget_usd: z.number().min(0).optional(),
totalBudgetUsd: z.number().min(0).optional(),
budget_usd: z.number().min(0).optional(),
budgetUsd: z.number().min(0).optional(),
fail_on_error: FailOnErrorSchema.optional(),
failOnError: FailOnErrorSchema.optional(),
threshold: z.number().min(0).max(1).optional(),
Expand Down
6 changes: 3 additions & 3 deletions packages/core/src/evaluation/yaml-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ import { interpolateEnv } from './interpolation.js';
import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js';
import { expandFileReferences, loadCasesFromFile } from './loaders/case-file-loader.js';
import {
extractBudgetUsd,
extractCacheConfig,
extractFailOnError,
extractTargetFromSuite,
extractTargetRefsFromSuite,
extractTargetsFromSuite,
extractTargetsFromTestCase,
extractThreshold,
extractTotalBudgetUsd,
extractTrialsConfig,
extractWorkersFromSuite,
loadConfig,
Expand Down Expand Up @@ -203,7 +203,7 @@ export type EvalSuiteResult = {
/** Suite-level metadata (name, description, version, etc.) */
readonly metadata?: import('./metadata.js').EvalMetadata;
/** Suite-level total cost budget in USD */
readonly totalBudgetUsd?: number;
readonly budgetUsd?: number;
/** Execution error tolerance: true or false */
readonly failOnError?: import('./types.js').FailOnError;
/** Suite-level quality threshold (0-1) — suite fails if mean score is below */
Expand Down Expand Up @@ -243,7 +243,7 @@ export async function loadTestSuite(
targetRefs: extractTargetRefsFromSuite(parsed),
workers: extractWorkersFromSuite(parsed),
cacheConfig: extractCacheConfig(parsed),
totalBudgetUsd: extractTotalBudgetUsd(parsed),
budgetUsd: extractBudgetUsd(parsed),
...(metadata !== undefined && { metadata }),
...(failOnError !== undefined && { failOnError }),
...(threshold !== undefined && { threshold }),
Expand Down
48 changes: 31 additions & 17 deletions packages/core/test/evaluation/loaders/config-loader.test.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import { describe, expect, it } from 'bun:test';

import {
extractBudgetUsd,
extractFailOnError,
extractTargetFromSuite,
extractTargetRefsFromSuite,
extractTargetsFromSuite,
extractTargetsFromTestCase,
extractThreshold,
extractTotalBudgetUsd,
extractTrialsConfig,
parseExecutionDefaults,
parseResultsConfig,
Expand Down Expand Up @@ -380,40 +380,54 @@ describe('extractTargetsFromTestCase', () => {
});
});

describe('extractTotalBudgetUsd', () => {
describe('extractBudgetUsd', () => {
it('returns undefined when no execution block', () => {
const suite: JsonObject = { tests: [] };
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
expect(extractBudgetUsd(suite)).toBeUndefined();
});

it('returns undefined when no total_budget_usd in execution', () => {
it('returns undefined when no budget_usd in execution', () => {
const suite: JsonObject = { execution: { target: 'default' } };
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
expect(extractBudgetUsd(suite)).toBeUndefined();
});

it('parses valid total_budget_usd (snake_case)', () => {
const suite: JsonObject = { execution: { total_budget_usd: 10.0 } };
expect(extractTotalBudgetUsd(suite)).toBe(10.0);
it('parses valid budget_usd (snake_case)', () => {
const suite: JsonObject = { execution: { budget_usd: 10.0 } };
expect(extractBudgetUsd(suite)).toBe(10.0);
});

it('parses valid totalBudgetUsd (camelCase)', () => {
const suite: JsonObject = { execution: { totalBudgetUsd: 5.5 } };
expect(extractTotalBudgetUsd(suite)).toBe(5.5);
it('parses valid budgetUsd (camelCase)', () => {
const suite: JsonObject = { execution: { budgetUsd: 5.5 } };
expect(extractBudgetUsd(suite)).toBe(5.5);
});

it('returns undefined for zero budget', () => {
const suite: JsonObject = { execution: { total_budget_usd: 0 } };
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
const suite: JsonObject = { execution: { budget_usd: 0 } };
expect(extractBudgetUsd(suite)).toBeUndefined();
});

it('returns undefined for negative budget', () => {
const suite: JsonObject = { execution: { total_budget_usd: -1 } };
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
const suite: JsonObject = { execution: { budget_usd: -1 } };
expect(extractBudgetUsd(suite)).toBeUndefined();
});

it('returns undefined for non-number budget', () => {
const suite: JsonObject = { execution: { total_budget_usd: 'ten' } };
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
const suite: JsonObject = { execution: { budget_usd: 'ten' } };
expect(extractBudgetUsd(suite)).toBeUndefined();
});

it('rejects old key total_budget_usd with a clear error', () => {
const suite: JsonObject = { execution: { total_budget_usd: 10.0 } };
expect(() => extractBudgetUsd(suite)).toThrow(
'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.',
);
});

it('rejects old key totalBudgetUsd with a clear error', () => {
const suite: JsonObject = { execution: { totalBudgetUsd: 10.0 } };
expect(() => extractBudgetUsd(suite)).toThrow(
'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.',
);
});
});

Expand Down
8 changes: 4 additions & 4 deletions packages/core/test/evaluation/orchestrator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2507,7 +2507,7 @@ describe('workspace.template .code-workspace resolution', () => {
});

describe('suite-level total budget guardrail', () => {
it('completes normally when totalBudgetUsd is not set', async () => {
it('completes normally when budgetUsd is not set', async () => {
const provider: Provider = {
id: 'budget:mock',
kind: 'mock' as const,
Expand Down Expand Up @@ -2564,7 +2564,7 @@ describe('suite-level total budget guardrail', () => {
providerFactory: () => provider,
evaluators: evaluatorRegistry,
evalCases,
totalBudgetUsd: 10.0,
budgetUsd: 10.0,
});

expect(results).toHaveLength(2);
Expand Down Expand Up @@ -2598,7 +2598,7 @@ describe('suite-level total budget guardrail', () => {
providerFactory: () => provider,
evaluators: evaluatorRegistry,
evalCases,
totalBudgetUsd: 5.0,
budgetUsd: 5.0,
maxConcurrency: 1,
});

Expand Down Expand Up @@ -2647,7 +2647,7 @@ describe('suite-level total budget guardrail', () => {
providerFactory: () => provider,
evaluators: evaluatorRegistry,
evalCases,
totalBudgetUsd: 5.0,
budgetUsd: 5.0,
maxConcurrency: 1,
trials: { count: 2, strategy: 'pass_at_k' },
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5015,11 +5015,11 @@
"required": ["count"],
"additionalProperties": false
},
"total_budget_usd": {
"budget_usd": {
"type": "number",
"minimum": 0
},
"totalBudgetUsd": {
"budgetUsd": {
"type": "number",
"minimum": 0
},
Expand Down Expand Up @@ -11543,11 +11543,11 @@
"required": ["count"],
"additionalProperties": false
},
"total_budget_usd": {
"budget_usd": {
"type": "number",
"minimum": 0
},
"totalBudgetUsd": {
"budgetUsd": {
"type": "number",
"minimum": 0
},
Expand Down Expand Up @@ -15682,11 +15682,11 @@
"required": ["count"],
"additionalProperties": false
},
"total_budget_usd": {
"budget_usd": {
"type": "number",
"minimum": 0
},
"totalBudgetUsd": {
"budgetUsd": {
"type": "number",
"minimum": 0
},
Expand Down
Loading