diff --git a/AGENTS.md b/AGENTS.md
index 9bec0b95d..3bbae5d35 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -266,17 +266,18 @@ Before marking any branch as ready for review, complete this checklist:
2. **Run unit tests**: `bun run test` — all must pass.
-3. **Manual red/green UAT (REQUIRED for all changes):**
- Automated tests are not sufficient. Every change must be manually verified from the end user's perspective using a red/green approach:
- - **Red (before fix):** Reproduce the bug or demonstrate the missing feature on `main` (or before your change). Confirm the undesired behavior is observable from the CLI / user-facing output.
- - **Green (after fix):** Run the same scenario with your changes applied. Confirm the fix or feature works correctly from the end user's perspective.
- - Document both the red and green results in the PR or conversation so the user can see the before/after.
+3. **⚠️ BLOCKING: Manual red/green UAT — must complete before steps 4-5:**
+   Passing unit tests is NOT sufficient. Every change must be manually verified from the end user's perspective. Do NOT skip this step or proceed to step 4 until red/green evidence is documented.
+
+ - **Red (before your changes):** Run the scenario on `main` (or the code state before your changes). Confirm the bug or missing feature is observable from the CLI / user-facing output. Capture the output.
+ - **Green (with your changes):** Run the identical scenario with your branch. Confirm the fix or feature works correctly from the end user's perspective. Capture the output.
+ - **Document both** red and green results in the PR description or comments so reviewers can see the before/after evidence.
For evaluator changes, this means running a real eval (not `--dry-run`) and inspecting the output JSONL. For CLI/UX changes, this means running the CLI command and verifying the console output.
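+
+   A minimal capture workflow might look like the sketch below (the eval command, script name, and file path are illustrative; substitute the real user-facing scenario for your change):
+
+   ```bash
+   # Red: reproduce the undesired behavior on main and capture the user-facing output
+   git checkout main
+   bun run eval examples/sample.eval.yaml 2>&1 | tee /tmp/uat-red.txt   # illustrative command
+
+   # Green: run the identical scenario on your branch and capture the output
+   git checkout my-feature-branch
+   bun run eval examples/sample.eval.yaml 2>&1 | tee /tmp/uat-green.txt
+   ```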
4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).
-5. **Mark PR as ready** only after all above steps pass.
+5. **Mark PR as ready** only after steps 1-4 have been completed AND red/green UAT evidence is included in the PR.
## Documentation Updates
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 8e6903c52..2245f18ee 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -238,6 +238,9 @@ export const evalRunCommand = command({
excludeTag: args.excludeTag,
};
const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
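+    // Exit code 2: every test hit an execution error, so no evaluation/scoring was performed.
+    // Exit code 1 (below) remains reserved for genuine threshold (quality) failures.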
+ if (result?.allExecutionErrors) {
+ process.exit(2);
+ }
if (result?.thresholdFailed) {
process.exit(1);
}
diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
index 84198bb99..c53beca45 100644
--- a/apps/cli/src/commands/eval/junit-writer.ts
+++ b/apps/cli/src/commands/eval/junit-writer.ts
@@ -58,15 +58,18 @@ export class JunitWriter {
const suiteXmls: string[] = [];
for (const [suiteName, results] of grouped) {
- const failures = results.filter((r) => r.score < this.threshold).length;
- const errors = results.filter((r) => r.error !== undefined).length;
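+      // JUnit convention: <error> marks tests that could not execute at all,
+      // <failure> marks tests that ran but scored below the threshold.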
+ const errors = results.filter((r) => r.executionStatus === 'execution_error').length;
+ const failures = results.filter(
+ (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+ ).length;
const testCases = results.map((r) => {
const time = r.durationMs ? (r.durationMs / 1000).toFixed(3) : '0.000';
let inner = '';
-      if (r.error) {
-        inner = `\n      <error>${escapeXml(r.error)}</error>\n    `;
+      if (r.executionStatus === 'execution_error') {
+        const errorMsg = r.error ?? 'Execution error';
+        inner = `\n      <error>${escapeXml(errorMsg)}</error>\n    `;
} else if (r.score < this.threshold) {
const message = `score=${r.score.toFixed(3)}`;
const failedAssertions = r.assertions.filter((a) => !a.passed);
@@ -90,8 +93,10 @@ export class JunitWriter {
}
const totalTests = this.results.length;
- const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
- const totalErrors = this.results.filter((r) => r.error !== undefined).length;
+ const totalErrors = this.results.filter((r) => r.executionStatus === 'execution_error').length;
+ const totalFailures = this.results.filter(
+ (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+ ).length;
    const xml = `<?xml version="1.0" encoding="UTF-8"?>\n<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">\n${suiteXmls.join('\n')}\n</testsuites>\n`;
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 1a26fff4b..097831f73 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -818,6 +818,8 @@ export interface RunEvalResult {
readonly target?: string;
/** True when --threshold is set and mean score is below the threshold */
readonly thresholdFailed?: boolean;
+ /** True when all tests had execution errors and no evaluation was performed */
+ readonly allExecutionErrors?: boolean;
}
export async function runEvalCommand(
@@ -1299,7 +1301,9 @@ export async function runEvalCommand(
const summary = calculateEvaluationSummary(allResults, thresholdOpts);
console.log(formatEvaluationSummary(summary, thresholdOpts));
- // Exit code matches RESULT verdict: fail if any test scored below threshold.
+ // Exit code: 2 when all tests are execution errors (no evaluation performed),
+ // 1 when any test scored below threshold.
+ const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0;
// Print matrix summary when multiple targets were evaluated
@@ -1397,6 +1401,7 @@ export async function runEvalCommand(
testFiles: activeTestFiles,
target: options.target,
thresholdFailed,
+ allExecutionErrors,
};
} finally {
unsubscribeCodexLogs();
diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 38aa4c502..1e204b5c0 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -209,13 +209,25 @@ export function formatEvaluationSummary(
// Overall verdict: all non-error cases must score >= per-test threshold.
const gradedCount = summary.total - summary.executionErrorCount;
const threshold = options?.threshold ?? 0.8;
+ const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
const overallPassed =
- summary.passedCount === gradedCount ||
- (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
- const overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+ !allExecutionErrors &&
+ (summary.passedCount === gradedCount ||
+ (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0));
const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false);
- const verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
- const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+
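+  // With zero graded tests, a PASS/FAIL ratio would be misleading, so surface INCONCLUSIVE instead.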
+ let overallVerdict: string;
+ let verdictColor: string;
+ let verdictText: string;
+ if (allExecutionErrors) {
+ overallVerdict = 'INCONCLUSIVE';
+ verdictColor = '\x1b[33m'; // yellow
+ verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors — no evaluation was performed)`;
+ } else {
+ overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+ verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
+ verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+ }
lines.push('\n==================================================');
if (useColor) {
diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts
index d13abcc35..feffdef4b 100644
--- a/apps/cli/test/commands/eval/output-writers.test.ts
+++ b/apps/cli/test/commands/eval/output-writers.test.ts
@@ -19,6 +19,7 @@ function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult
assertions: [{ text: 'criterion-1', passed: true }],
output: [{ role: 'assistant' as const, content: 'answer' }],
target: 'default',
+ executionStatus: 'ok',
...overrides,
};
}
@@ -146,7 +147,14 @@ describe('JunitWriter', () => {
 it('should handle errors as <error> elements', async () => {
const writer = await JunitWriter.open(testFilePath);
- await writer.append(makeResult({ testId: 'err-1', score: 0, error: 'Timeout exceeded' }));
+ await writer.append(
+ makeResult({
+ testId: 'err-1',
+ score: 0,
+ error: 'Timeout exceeded',
+ executionStatus: 'execution_error',
+ }),
+ );
await writer.close();
const xml = await readFile(testFilePath, 'utf8');
@@ -188,6 +196,74 @@ describe('JunitWriter', () => {
    expect(xml).not.toContain('<failure');
  });

+  it('should distinguish execution errors from quality failures', async () => {
+ const writer = await JunitWriter.open(testFilePath);
+
+ await writer.append(
+ makeResult({
+ testId: 'exec-err',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Not Found',
+ }),
+ );
+ await writer.append(
+ makeResult({ testId: 'quality-fail', score: 0.3, executionStatus: 'quality_failure' }),
+ );
+ await writer.append(makeResult({ testId: 'pass', score: 0.9, executionStatus: 'ok' }));
+ await writer.close();
+
+ const xml = await readFile(testFilePath, 'utf8');
+    // Execution error produces <error>, not <failure>
+    expect(xml).toContain('<error');
+    expect(xml).toContain('<failure');
+  });
+
+  it('should count all execution errors as errors, not failures', async () => {
+ const writer = await JunitWriter.open(testFilePath);
+
+ // All execution errors — should have 0 failures, 2 errors
+ await writer.append(
+ makeResult({
+ testId: 'err-1',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Provider error',
+ }),
+ );
+ await writer.append(
+ makeResult({
+ testId: 'err-2',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Timeout',
+ }),
+ );
+ await writer.close();
+
+ const xml = await readFile(testFilePath, 'utf8');
+ expect(xml).toContain('failures="0"');
+ expect(xml).toContain('errors="2"');
+ });
+
+  it('should emit <error> for execution_error even without error message', async () => {
+ const writer = await JunitWriter.open(testFilePath);
+
+ await writer.append(
+ makeResult({ testId: 'no-msg', score: 0, executionStatus: 'execution_error' }),
+ );
+ await writer.close();
+
+ const xml = await readFile(testFilePath, 'utf8');
+    expect(xml).toContain('<error');
+  });
diff --git a/apps/cli/test/commands/eval/statistics-inconclusive.test.ts b/apps/cli/test/commands/eval/statistics-inconclusive.test.ts
new file mode 100644
index 000000000..76e6196dd
--- /dev/null
+++ b/apps/cli/test/commands/eval/statistics-inconclusive.test.ts
@@ -0,0 +1,103 @@
+import { describe, expect, it } from 'bun:test';
+
+import type { EvaluationResult } from '@agentv/core';
+
+import {
+ calculateEvaluationSummary,
+ formatEvaluationSummary,
+} from '../../../src/commands/eval/statistics.js';
+
+function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
+ return {
+ timestamp: '2024-01-01T00:00:00Z',
+ testId: 'test-1',
+ score: 1.0,
+ assertions: [{ text: 'criterion-1', passed: true }],
+ output: [{ role: 'assistant' as const, content: 'answer' }],
+ target: 'default',
+ executionStatus: 'ok',
+ ...overrides,
+ };
+}
+
+describe('formatEvaluationSummary — inconclusive verdict', () => {
+ it('shows INCONCLUSIVE when all tests are execution errors', () => {
+ const results = [
+ makeResult({
+ testId: 'err-1',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Not Found',
+ }),
+ makeResult({
+ testId: 'err-2',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Not Found',
+ }),
+ makeResult({
+ testId: 'err-3',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Not Found',
+ }),
+ ];
+
+ const summary = calculateEvaluationSummary(results);
+ const output = formatEvaluationSummary(summary);
+
+ expect(output).toContain('RESULT: INCONCLUSIVE');
+ expect(output).toContain('all 3 test(s) had execution errors');
+ expect(output).toContain('no evaluation was performed');
+ });
+
+ it('shows PASS/FAIL when only some tests are execution errors', () => {
+ const results = [
+ makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+ makeResult({
+ testId: 'err-1',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Not Found',
+ }),
+ ];
+
+ const summary = calculateEvaluationSummary(results);
+ const output = formatEvaluationSummary(summary);
+
+ // Should show PASS (the one graded test passed) not INCONCLUSIVE
+ expect(output).toContain('RESULT: PASS');
+ expect(output).not.toContain('INCONCLUSIVE');
+ });
+
+ it('shows FAIL when there are quality failures mixed with execution errors', () => {
+ const results = [
+ makeResult({ testId: 'fail-1', score: 0.3, executionStatus: 'quality_failure' }),
+ makeResult({
+ testId: 'err-1',
+ score: 0,
+ executionStatus: 'execution_error',
+ error: 'Not Found',
+ }),
+ ];
+
+ const summary = calculateEvaluationSummary(results, { threshold: 0.8 });
+ const output = formatEvaluationSummary(summary, { threshold: 0.8 });
+
+ expect(output).toContain('RESULT: FAIL');
+ expect(output).not.toContain('INCONCLUSIVE');
+ });
+
+ it('shows PASS when all tests pass and none are errors', () => {
+ const results = [
+ makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+ makeResult({ testId: 'pass-2', score: 0.85, executionStatus: 'ok' }),
+ ];
+
+ const summary = calculateEvaluationSummary(results);
+ const output = formatEvaluationSummary(summary);
+
+ expect(output).toContain('RESULT: PASS');
+ expect(output).not.toContain('INCONCLUSIVE');
+ });
+});