From 02ed3e1edb882dfa1e357333e0731d5e90620b50 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Sun, 5 Apr 2026 07:19:10 +0000
Subject: [PATCH 1/2] feat(core): report inconclusive status when all tests have execution errors

When all eval tests fail due to execution errors (e.g., misconfigured model, network failures), the run now reports INCONCLUSIVE instead of a misleading PASS/FAIL verdict.

- Exit code 2 for all-execution-error runs (distinct from exit 1 for threshold failures)
- CLI shows yellow INCONCLUSIVE verdict with clear messaging
- JUnit XML uses executionStatus to classify <error> vs <failure>, preventing double-counting of execution errors as failures

Closes #894

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/commands/run.ts    |   3 +
 apps/cli/src/commands/eval/junit-writer.ts    |  17 ++-
 apps/cli/src/commands/eval/run-eval.ts        |   7 +-
 apps/cli/src/commands/eval/statistics.ts      |  22 +++-
 .../test/commands/eval/output-writers.test.ts |  78 ++++++++++++-
 .../eval/statistics-inconclusive.test.ts      | 103 ++++++++++++++++++
 6 files changed, 217 insertions(+), 13 deletions(-)
 create mode 100644 apps/cli/test/commands/eval/statistics-inconclusive.test.ts

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 8e6903c52..2245f18ee 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -238,6 +238,9 @@ export const evalRunCommand = command({
       excludeTag: args.excludeTag,
     };
     const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
+    if (result?.allExecutionErrors) {
+      process.exit(2);
+    }
     if (result?.thresholdFailed) {
       process.exit(1);
     }

diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
index 84198bb99..c53beca45 100644
--- a/apps/cli/src/commands/eval/junit-writer.ts
+++ b/apps/cli/src/commands/eval/junit-writer.ts
@@ -58,15 +58,18 @@ export class JunitWriter {
     const suiteXmls: string[] = [];

     for (const [suiteName, results] of grouped) {
-      const failures = results.filter((r) => r.score < this.threshold).length;
-      const errors = results.filter((r) => r.error !== undefined).length;
+      const errors = results.filter((r) => r.executionStatus === 'execution_error').length;
+      const failures = results.filter(
+        (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+      ).length;

       const testCases = results.map((r) => {
         const time = r.durationMs ? (r.durationMs / 1000).toFixed(3) : '0.000';

         let inner = '';
-        if (r.error) {
-          inner = `\n ${escapeXml(r.error)}\n `;
+        if (r.executionStatus === 'execution_error') {
+          const errorMsg = r.error ?? 'Execution error';
+          inner = `\n ${escapeXml(errorMsg)}\n `;
         } else if (r.score < this.threshold) {
           const message = `score=${r.score.toFixed(3)}`;
           const failedAssertions = r.assertions.filter((a) => !a.passed);
@@ -90,8 +93,10 @@ export class JunitWriter {
     }

     const totalTests = this.results.length;
-    const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
-    const totalErrors = this.results.filter((r) => r.error !== undefined).length;
+    const totalErrors = this.results.filter((r) => r.executionStatus === 'execution_error').length;
+    const totalFailures = this.results.filter(
+      (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+    ).length;

     const xml = `\n\n${suiteXmls.join('\n')}\n\n`;

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 1a26fff4b..097831f73 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -818,6 +818,8 @@ export interface RunEvalResult {
   readonly target?: string;
   /** True when --threshold is set and mean score is below the threshold */
   readonly thresholdFailed?: boolean;
+  /** True when all tests had execution errors and no evaluation was performed */
+  readonly allExecutionErrors?: boolean;
 }

 export async function runEvalCommand(
@@ -1299,7 +1301,9 @@ export async function runEvalCommand(
   const summary = calculateEvaluationSummary(allResults, thresholdOpts);
   console.log(formatEvaluationSummary(summary, thresholdOpts));

-  // Exit code matches RESULT verdict: fail if any test scored below threshold.
+  // Exit code: 2 when all tests are execution errors (no evaluation performed),
+  // 1 when any test scored below threshold.
+  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
   const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0;

   // Print matrix summary when multiple targets were evaluated
@@ -1397,6 +1401,7 @@ export async function runEvalCommand(
       testFiles: activeTestFiles,
       target: options.target,
       thresholdFailed,
+      allExecutionErrors,
     };
   } finally {
     unsubscribeCodexLogs();

diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 38aa4c502..1e204b5c0 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -209,13 +209,25 @@ export function formatEvaluationSummary(
   // Overall verdict: all non-error cases must score >= per-test threshold.
   const gradedCount = summary.total - summary.executionErrorCount;
   const threshold = options?.threshold ?? 0.8;
+  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
   const overallPassed =
-    summary.passedCount === gradedCount ||
-    (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
-  const overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+    !allExecutionErrors &&
+    (summary.passedCount === gradedCount ||
+      (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0));
   const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false);
-  const verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
-  const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+
+  let overallVerdict: string;
+  let verdictColor: string;
+  let verdictText: string;
+  if (allExecutionErrors) {
+    overallVerdict = 'INCONCLUSIVE';
+    verdictColor = '\x1b[33m'; // yellow
+    verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors — no evaluation was performed)`;
+  } else {
+    overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+    verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
+    verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+  }

   lines.push('\n==================================================');
   if (useColor) {

diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts
index d13abcc35..feffdef4b 100644
--- a/apps/cli/test/commands/eval/output-writers.test.ts
+++ b/apps/cli/test/commands/eval/output-writers.test.ts
@@ -19,6 +19,7 @@ function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult
     assertions: [{ text: 'criterion-1', passed: true }],
     output: [{ role: 'assistant' as const, content: 'answer' }],
     target: 'default',
+    executionStatus: 'ok',
     ...overrides,
   };
 }
@@ -146,7 +147,14 @@ describe('JunitWriter', () => {
   it('should handle errors as <error> elements', async () => {
     const writer = await JunitWriter.open(testFilePath);
-    await writer.append(makeResult({ testId: 'err-1', score: 0, error: 'Timeout exceeded' }));
+    await writer.append(
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        error: 'Timeout exceeded',
+        executionStatus: 'execution_error',
+      }),
+    );
     await writer.close();

     const xml = await readFile(testFilePath, 'utf8');
@@ -188,6 +196,74 @@ describe('JunitWriter', () => {
     expect(xml).not.toContain(' {
+    const writer = await JunitWriter.open(testFilePath);
+
+    await writer.append(
+      makeResult({
+        testId: 'exec-err',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    );
+    await writer.append(
+      makeResult({ testId: 'quality-fail', score: 0.3, executionStatus: 'quality_failure' }),
+    );
+    await writer.append(makeResult({ testId: 'pass', score: 0.9, executionStatus: 'ok' }));
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    // Execution error produces <error>, not <failure>
+    expect(xml).toContain('
+    expect(xml).toContain(' {
+    const writer = await JunitWriter.open(testFilePath);
+
+    // All execution errors — should have 0 failures, 2 errors
+    await writer.append(
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Provider error',
+      }),
+    );
+    await writer.append(
+      makeResult({
+        testId: 'err-2',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Timeout',
+      }),
+    );
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    expect(xml).toContain('failures="0"');
+    expect(xml).toContain('errors="2"');
+  });
+
+  it('should emit <error> for execution_error even without error message', async () => {
+    const writer = await JunitWriter.open(testFilePath);
+
+    await writer.append(
+      makeResult({ testId: 'no-msg', score: 0, executionStatus: 'execution_error' }),
+    );
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    expect(xml).toContain(' {

diff --git a/apps/cli/test/commands/eval/statistics-inconclusive.test.ts b/apps/cli/test/commands/eval/statistics-inconclusive.test.ts
new file mode 100644
index 000000000..76e6196dd
--- /dev/null
+++ b/apps/cli/test/commands/eval/statistics-inconclusive.test.ts
@@ -0,0 +1,103 @@
+import { describe, expect, it } from 'bun:test';
+
+import type { EvaluationResult } from '@agentv/core';
+
+import {
+  calculateEvaluationSummary,
+  formatEvaluationSummary,
+} from '../../../src/commands/eval/statistics.js';
+
+function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
+  return {
+    timestamp: '2024-01-01T00:00:00Z',
+    testId: 'test-1',
+    score: 1.0,
+    assertions: [{ text: 'criterion-1', passed: true }],
+    output: [{ role: 'assistant' as const, content: 'answer' }],
+    target: 'default',
+    executionStatus: 'ok',
+    ...overrides,
+  };
+}
+
+describe('formatEvaluationSummary — inconclusive verdict', () => {
+  it('shows INCONCLUSIVE when all tests are execution errors', () => {
+    const results = [
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+      makeResult({
+        testId: 'err-2',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+      makeResult({
+        testId: 'err-3',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    expect(output).toContain('RESULT: INCONCLUSIVE');
+    expect(output).toContain('all 3 test(s) had execution errors');
+    expect(output).toContain('no evaluation was performed');
+  });
+
+  it('shows PASS/FAIL when only some tests are execution errors', () => {
+    const results = [
+      makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    // Should show PASS (the one graded test passed) not INCONCLUSIVE
+    expect(output).toContain('RESULT: PASS');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+
+  it('shows FAIL when there are quality failures mixed with execution errors', () => {
+    const results = [
+      makeResult({ testId: 'fail-1', score: 0.3, executionStatus: 'quality_failure' }),
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results, { threshold: 0.8 });
+    const output = formatEvaluationSummary(summary, { threshold: 0.8 });
+
+    expect(output).toContain('RESULT: FAIL');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+
+  it('shows PASS when all tests pass and none are errors', () => {
+    const results = [
+      makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+      makeResult({ testId: 'pass-2', score: 0.85, executionStatus: 'ok' }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    expect(output).toContain('RESULT: PASS');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+});

From c73afe640c00540ca578493442c131fc4737bc26 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Sun, 5 Apr 2026 07:41:09 +0000
Subject: [PATCH 2/2] docs: clarify manual red/green UAT as blocking step in E2E checklist

Make the UAT requirement more prominent with a blocking warning, clearer red/green definitions, and explicit dependency on completing UAT before proceeding to later checklist steps.
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 AGENTS.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 9bec0b95d..3bbae5d35 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -266,17 +266,18 @@ Before marking any branch as ready for review, complete this checklist:

 2. **Run unit tests**: `bun run test` — all must pass.

-3. **Manual red/green UAT (REQUIRED for all changes):**
-   Automated tests are not sufficient. Every change must be manually verified from the end user's perspective using a red/green approach:
-   - **Red (before fix):** Reproduce the bug or demonstrate the missing feature on `main` (or before your change). Confirm the undesired behavior is observable from the CLI / user-facing output.
-   - **Green (after fix):** Run the same scenario with your changes applied. Confirm the fix or feature works correctly from the end user's perspective.
-   - Document both the red and green results in the PR or conversation so the user can see the before/after.
+3. **⚠️ BLOCKING: Manual red/green UAT — must complete before steps 4-5:**
+   Unit tests passing is NOT sufficient. Every change must be manually verified from the end user's perspective. Do NOT skip this step or proceed to step 4 until red/green evidence is documented.
+
+   - **Red (before your changes):** Run the scenario on `main` (or the code state before your changes). Confirm the bug or missing feature is observable from the CLI / user-facing output. Capture the output.
+   - **Green (with your changes):** Run the identical scenario with your branch. Confirm the fix or feature works correctly from the end user's perspective. Capture the output.
+   - **Document both** red and green results in the PR description or comments so reviewers can see the before/after evidence.

    For evaluator changes, this means running a real eval (not `--dry-run`) and inspecting the output JSONL. For CLI/UX changes, this means running the CLI command and verifying the console output.

 4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).

-5. **Mark PR as ready** only after all above steps pass.
+5. **Mark PR as ready** only after steps 1-4 have been completed AND red/green UAT evidence is included in the PR.

 ## Documentation Updates
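
Reviewer note (editor's addition, not part of either patch): the sketch below shows how a CI wrapper might consume the three exit codes that the first patch establishes. The `agentv` binary name, the `eval run` subcommand spelling, and the `evals/` path are assumptions made for illustration; only the exit-code contract (0 = pass, 1 = threshold failure, 2 = all tests hit execution errors) comes from the patch itself.

```typescript
// Hypothetical CI wrapper (illustrative only). The command name and paths are
// assumptions; the 0/1/2 exit-code semantics are what the patch introduces.
import { spawnSync } from 'node:child_process';

const run = spawnSync('agentv', ['eval', 'run', 'evals/'], { stdio: 'inherit' });
const status = run.status ?? 1;

if (status === 2) {
  // INCONCLUSIVE: every test hit an execution error, so no scores were produced.
  // Treat this as an infrastructure/configuration problem, not a quality regression.
  console.error('Eval inconclusive: fix the environment and re-run before trusting results.');
} else if (status === 1) {
  // FAIL: the eval ran, but at least one graded test scored below the threshold.
  console.error('Eval failed the quality threshold.');
} else if (status !== 0) {
  console.error(`Eval runner exited with unexpected status ${status}.`);
}
process.exit(status);
```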