13 changes: 7 additions & 6 deletions AGENTS.md
@@ -266,17 +266,18 @@ Before marking any branch as ready for review, complete this checklist:
 
 2. **Run unit tests**: `bun run test` — all must pass.
 
-3. **Manual red/green UAT (REQUIRED for all changes):**
-   Automated tests are not sufficient. Every change must be manually verified from the end user's perspective using a red/green approach:
-   - **Red (before fix):** Reproduce the bug or demonstrate the missing feature on `main` (or before your change). Confirm the undesired behavior is observable from the CLI / user-facing output.
-   - **Green (after fix):** Run the same scenario with your changes applied. Confirm the fix or feature works correctly from the end user's perspective.
-   - Document both the red and green results in the PR or conversation so the user can see the before/after.
+3. **⚠️ BLOCKING: Manual red/green UAT — must complete before steps 4-5:**
+   Unit tests passing is NOT sufficient. Every change must be manually verified from the end user's perspective. Do NOT skip this step or proceed to step 4 until red/green evidence is documented.
+
+   - **Red (before your changes):** Run the scenario on `main` (or the code state before your changes). Confirm the bug or missing feature is observable from the CLI / user-facing output. Capture the output.
+   - **Green (with your changes):** Run the identical scenario with your branch. Confirm the fix or feature works correctly from the end user's perspective. Capture the output.
+   - **Document both** red and green results in the PR description or comments so reviewers can see the before/after evidence.
 
    For evaluator changes, this means running a real eval (not `--dry-run`) and inspecting the output JSONL. For CLI/UX changes, this means running the CLI command and verifying the console output.
 
 4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).
 
-5. **Mark PR as ready** only after all above steps pass.
+5. **Mark PR as ready** only after steps 1-4 have been completed AND red/green UAT evidence is included in the PR.
 
 ## Documentation Updates
 
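This PR is itself a red/green example of the policy above. A hypothetical evidence transcript (the `agentv` binary name, eval file path, and exact score formatting are illustrative assumptions, not taken from this diff; the before/after behavior follows from the code changes below):

```
# Red (on main): a run where every test hits an execution error still reports PASS and exits 0.
$ agentv eval run evals/smoke.yaml
RESULT: PASS (0/0 scored >= 0.8, mean: 0.000)
$ echo $?
0

# Green (this branch): the same run is reported as inconclusive and exits 2.
$ agentv eval run evals/smoke.yaml
RESULT: INCONCLUSIVE (all 3 test(s) had execution errors — no evaluation was performed)
$ echo $?
2
```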
3 changes: 3 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
@@ -238,6 +238,9 @@ export const evalRunCommand = command({
       excludeTag: args.excludeTag,
     };
     const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
+    if (result?.allExecutionErrors) {
+      process.exit(2);
+    }
     if (result?.thresholdFailed) {
       process.exit(1);
     }
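With this change the command has three exit codes: 0 when the run passes, 1 when `--threshold` is set and any test scores below it, and 2 when every test hit an execution error. A minimal sketch of a CI wrapper consuming that contract (the `agentv` entry point and eval path are assumptions for illustration):

```ts
// Hypothetical CI gate around the eval CLI; `agentv` is an assumed binary name.
import { spawnSync } from 'node:child_process';

const { status } = spawnSync('agentv', ['eval', 'run', 'evals/'], { stdio: 'inherit' });

if (status === 2) {
  // Nothing was graded: treat as an infrastructure failure, not a quality verdict.
  console.error('Eval inconclusive: all tests hit execution errors.');
} else if (status === 1) {
  console.error('Quality gate failed: score(s) below --threshold.');
}
process.exit(status ?? 1);
```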
17 changes: 11 additions & 6 deletions apps/cli/src/commands/eval/junit-writer.ts
@@ -58,15 +58,18 @@ export class JunitWriter {
 
     const suiteXmls: string[] = [];
     for (const [suiteName, results] of grouped) {
-      const failures = results.filter((r) => r.score < this.threshold).length;
-      const errors = results.filter((r) => r.error !== undefined).length;
+      const errors = results.filter((r) => r.executionStatus === 'execution_error').length;
+      const failures = results.filter(
+        (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+      ).length;
 
       const testCases = results.map((r) => {
         const time = r.durationMs ? (r.durationMs / 1000).toFixed(3) : '0.000';
 
         let inner = '';
-        if (r.error) {
-          inner = `\n      <error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>\n    `;
+        if (r.executionStatus === 'execution_error') {
+          const errorMsg = r.error ?? 'Execution error';
+          inner = `\n      <error message="${escapeXml(errorMsg)}">${escapeXml(errorMsg)}</error>\n    `;
         } else if (r.score < this.threshold) {
           const message = `score=${r.score.toFixed(3)}`;
           const failedAssertions = r.assertions.filter((a) => !a.passed);
@@ -90,8 +93,10 @@
     }
 
     const totalTests = this.results.length;
-    const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
-    const totalErrors = this.results.filter((r) => r.error !== undefined).length;
+    const totalErrors = this.results.filter((r) => r.executionStatus === 'execution_error').length;
+    const totalFailures = this.results.filter(
+      (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+    ).length;
 
     const xml = `<?xml version="1.0" encoding="UTF-8"?>\n<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">\n${suiteXmls.join('\n')}\n</testsuites>\n`;
 
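This matches standard JUnit XML semantics: `<error>` marks a test that could not run, `<failure>` marks a test that ran but missed its assertion, and excluding execution errors from `failures` keeps each test counted exactly once. For one execution error, one quality failure, and one pass, the emitted report would look roughly like this (testsuite/testcase attributes are elided, since they are outside this diff):

```xml
<?xml version="1.0" encoding="UTF-8"?>
<testsuites tests="3" failures="1" errors="1">
  <testsuite ...>
    <!-- execution error: counted under errors, not failures -->
    <testcase ...>
      <error message="Not Found">Not Found</error>
    </testcase>
    <!-- quality failure: ran, but scored below the threshold -->
    <testcase ...>
      <failure message="score=0.300">...</failure>
    </testcase>
    <!-- passing test -->
    <testcase ... />
  </testsuite>
</testsuites>
```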
7 changes: 6 additions & 1 deletion apps/cli/src/commands/eval/run-eval.ts
@@ -818,6 +818,8 @@ export interface RunEvalResult {
   readonly target?: string;
   /** True when --threshold is set and mean score is below the threshold */
   readonly thresholdFailed?: boolean;
+  /** True when all tests had execution errors and no evaluation was performed */
+  readonly allExecutionErrors?: boolean;
 }
 
 export async function runEvalCommand(
@@ -1299,7 +1301,9 @@ export async function runEvalCommand(
   const summary = calculateEvaluationSummary(allResults, thresholdOpts);
   console.log(formatEvaluationSummary(summary, thresholdOpts));
 
-  // Exit code matches RESULT verdict: fail if any test scored below threshold.
+  // Exit code: 2 when all tests are execution errors (no evaluation performed),
+  // 1 when any test scored below threshold.
+  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
   const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0;
 
   // Print matrix summary when multiple targets were evaluated
@@ -1397,6 +1401,7 @@
       testFiles: activeTestFiles,
       target: options.target,
       thresholdFailed,
+      allExecutionErrors,
     };
   } finally {
     unsubscribeCodexLogs();
22 changes: 17 additions & 5 deletions apps/cli/src/commands/eval/statistics.ts
@@ -209,13 +209,25 @@ export function formatEvaluationSummary(
   // Overall verdict: all non-error cases must score >= per-test threshold.
   const gradedCount = summary.total - summary.executionErrorCount;
   const threshold = options?.threshold ?? 0.8;
+  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
   const overallPassed =
-    summary.passedCount === gradedCount ||
-    (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
-  const overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+    !allExecutionErrors &&
+    (summary.passedCount === gradedCount ||
+      (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0));
   const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false);
-  const verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
-  const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+
+  let overallVerdict: string;
+  let verdictColor: string;
+  let verdictText: string;
+  if (allExecutionErrors) {
+    overallVerdict = 'INCONCLUSIVE';
+    verdictColor = '\x1b[33m'; // yellow
+    verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors — no evaluation was performed)`;
+  } else {
+    overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+    verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
+    verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+  }
 
   lines.push('\n==================================================');
   if (useColor) {
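With this branch, an all-errors run renders the summary footer roughly as follows (in yellow on a color terminal), where the old code printed a green `RESULT: PASS (0/0 scored >= 0.8, ...)`:

```
==================================================
RESULT: INCONCLUSIVE (all 3 test(s) had execution errors — no evaluation was performed)
```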
78 changes: 77 additions & 1 deletion apps/cli/test/commands/eval/output-writers.test.ts
@@ -19,6 +19,7 @@ function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
     assertions: [{ text: 'criterion-1', passed: true }],
     output: [{ role: 'assistant' as const, content: 'answer' }],
     target: 'default',
+    executionStatus: 'ok',
     ...overrides,
   };
 }
@@ -146,7 +147,14 @@
 
   it('should handle errors as <error> elements', async () => {
     const writer = await JunitWriter.open(testFilePath);
-    await writer.append(makeResult({ testId: 'err-1', score: 0, error: 'Timeout exceeded' }));
+    await writer.append(
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        error: 'Timeout exceeded',
+        executionStatus: 'execution_error',
+      }),
+    );
     await writer.close();
 
     const xml = await readFile(testFilePath, 'utf8');
@@ -188,6 +196,74 @@
     expect(xml).not.toContain('<failure message="score=0.600"');
     expect(xml).toContain('<failure message="score=0.300"');
   });
+
+  it('should use executionStatus to classify errors vs failures', async () => {
+    const writer = await JunitWriter.open(testFilePath);
+
+    await writer.append(
+      makeResult({
+        testId: 'exec-err',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    );
+    await writer.append(
+      makeResult({ testId: 'quality-fail', score: 0.3, executionStatus: 'quality_failure' }),
+    );
+    await writer.append(makeResult({ testId: 'pass', score: 0.9, executionStatus: 'ok' }));
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    // Execution error produces <error>, not <failure>
+    expect(xml).toContain('<error message="Not Found"');
+    // Quality failure produces <failure>
+    expect(xml).toContain('<failure message="score=0.300"');
+    // Counts: 1 error, 1 failure (execution error excluded from failure count)
+    expect(xml).toContain('errors="1"');
+    expect(xml).toContain('failures="1"');
+  });
+
+  it('should not double-count execution errors as failures', async () => {
+    const writer = await JunitWriter.open(testFilePath);
+
+    // All execution errors — should have 0 failures, 2 errors
+    await writer.append(
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Provider error',
+      }),
+    );
+    await writer.append(
+      makeResult({
+        testId: 'err-2',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Timeout',
+      }),
+    );
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    expect(xml).toContain('failures="0"');
+    expect(xml).toContain('errors="2"');
+  });
+
+  it('should emit <error> for execution_error even without error message', async () => {
+    const writer = await JunitWriter.open(testFilePath);
+
+    await writer.append(
+      makeResult({ testId: 'no-msg', score: 0, executionStatus: 'execution_error' }),
+    );
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    expect(xml).toContain('<error message="Execution error"');
+    expect(xml).toContain('errors="1"');
+    expect(xml).toContain('failures="0"');
+  });
 });
 
 describe('escapeXml', () => {
103 changes: 103 additions & 0 deletions apps/cli/test/commands/eval/statistics-inconclusive.test.ts
@@ -0,0 +1,103 @@
+import { describe, expect, it } from 'bun:test';
+
+import type { EvaluationResult } from '@agentv/core';
+
+import {
+  calculateEvaluationSummary,
+  formatEvaluationSummary,
+} from '../../../src/commands/eval/statistics.js';
+
+function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
+  return {
+    timestamp: '2024-01-01T00:00:00Z',
+    testId: 'test-1',
+    score: 1.0,
+    assertions: [{ text: 'criterion-1', passed: true }],
+    output: [{ role: 'assistant' as const, content: 'answer' }],
+    target: 'default',
+    executionStatus: 'ok',
+    ...overrides,
+  };
+}
+
+describe('formatEvaluationSummary — inconclusive verdict', () => {
+  it('shows INCONCLUSIVE when all tests are execution errors', () => {
+    const results = [
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+      makeResult({
+        testId: 'err-2',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+      makeResult({
+        testId: 'err-3',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    expect(output).toContain('RESULT: INCONCLUSIVE');
+    expect(output).toContain('all 3 test(s) had execution errors');
+    expect(output).toContain('no evaluation was performed');
+  });
+
+  it('shows PASS/FAIL when only some tests are execution errors', () => {
+    const results = [
+      makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    // Should show PASS (the one graded test passed) not INCONCLUSIVE
+    expect(output).toContain('RESULT: PASS');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+
+  it('shows FAIL when there are quality failures mixed with execution errors', () => {
+    const results = [
+      makeResult({ testId: 'fail-1', score: 0.3, executionStatus: 'quality_failure' }),
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results, { threshold: 0.8 });
+    const output = formatEvaluationSummary(summary, { threshold: 0.8 });
+
+    expect(output).toContain('RESULT: FAIL');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+
+  it('shows PASS when all tests pass and none are errors', () => {
+    const results = [
+      makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+      makeResult({ testId: 'pass-2', score: 0.85, executionStatus: 'ok' }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    expect(output).toContain('RESULT: PASS');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+});