From 02ed3e1edb882dfa1e357333e0731d5e90620b50 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Sun, 5 Apr 2026 07:19:10 +0000
Subject: [PATCH 1/2] feat(core): report inconclusive status when all tests have execution errors

When all eval tests fail due to execution errors (e.g., misconfigured model, network failures), the run now reports INCONCLUSIVE instead of a misleading PASS/FAIL verdict.

- Exit code 2 for all-execution-error runs (distinct from exit 1 for threshold failures)
- CLI shows yellow INCONCLUSIVE verdict with clear messaging
- JUnit XML uses executionStatus to classify <error> vs <failure>, preventing double-counting of execution errors as failures

Closes #894

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/commands/run.ts    |   3 +
 apps/cli/src/commands/eval/junit-writer.ts    |  17 ++-
 apps/cli/src/commands/eval/run-eval.ts        |   7 +-
 apps/cli/src/commands/eval/statistics.ts      |  22 +++-
 .../test/commands/eval/output-writers.test.ts |  78 ++++++++++++-
 .../eval/statistics-inconclusive.test.ts      | 103 ++++++++++++++++++
 6 files changed, 217 insertions(+), 13 deletions(-)
 create mode 100644 apps/cli/test/commands/eval/statistics-inconclusive.test.ts

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 8e6903c52..2245f18ee 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -238,6 +238,9 @@ export const evalRunCommand = command({
       excludeTag: args.excludeTag,
     };
     const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
+    if (result?.allExecutionErrors) {
+      process.exit(2);
+    }
     if (result?.thresholdFailed) {
       process.exit(1);
     }

diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
index 84198bb99..c53beca45 100644
--- a/apps/cli/src/commands/eval/junit-writer.ts
+++ b/apps/cli/src/commands/eval/junit-writer.ts
@@ -58,15 +58,18 @@ export class JunitWriter {
     const suiteXmls: string[] = [];

     for (const [suiteName, results] of grouped) {
-      const failures = results.filter((r) => r.score < this.threshold).length;
-      const errors = results.filter((r) => r.error !== undefined).length;
+      const errors = results.filter((r) => r.executionStatus === 'execution_error').length;
+      const failures = results.filter(
+        (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+      ).length;

       const testCases = results.map((r) => {
         const time = r.durationMs ? (r.durationMs / 1000).toFixed(3) : '0.000';

         let inner = '';
-        if (r.error) {
-          inner = `\n ${escapeXml(r.error)}\n `;
+        if (r.executionStatus === 'execution_error') {
+          const errorMsg = r.error ?? 'Execution error';
+          inner = `\n ${escapeXml(errorMsg)}\n `;
         } else if (r.score < this.threshold) {
           const message = `score=${r.score.toFixed(3)}`;
           const failedAssertions = r.assertions.filter((a) => !a.passed);
@@ -90,8 +93,10 @@ export class JunitWriter {
     }

     const totalTests = this.results.length;
-    const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
-    const totalErrors = this.results.filter((r) => r.error !== undefined).length;
+    const totalErrors = this.results.filter((r) => r.executionStatus === 'execution_error').length;
+    const totalFailures = this.results.filter(
+      (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold,
+    ).length;

     const xml = `\n\n${suiteXmls.join('\n')}\n\n`;

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 1a26fff4b..097831f73 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -818,6 +818,8 @@ export interface RunEvalResult {
   readonly target?: string;
   /** True when --threshold is set and mean score is below the threshold */
   readonly thresholdFailed?: boolean;
+  /** True when all tests had execution errors and no evaluation was performed */
+  readonly allExecutionErrors?: boolean;
 }

 export async function runEvalCommand(
@@ -1299,7 +1301,9 @@ export async function runEvalCommand(
   const summary = calculateEvaluationSummary(allResults, thresholdOpts);
   console.log(formatEvaluationSummary(summary, thresholdOpts));

-  // Exit code matches RESULT verdict: fail if any test scored below threshold.
+  // Exit code: 2 when all tests are execution errors (no evaluation performed),
+  // 1 when any test scored below threshold.
+  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
   const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0;

   // Print matrix summary when multiple targets were evaluated
@@ -1397,6 +1401,7 @@ export async function runEvalCommand(
       testFiles: activeTestFiles,
       target: options.target,
       thresholdFailed,
+      allExecutionErrors,
     };
   } finally {
     unsubscribeCodexLogs();

diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 38aa4c502..1e204b5c0 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -209,13 +209,25 @@ export function formatEvaluationSummary(
   // Overall verdict: all non-error cases must score >= per-test threshold.
   const gradedCount = summary.total - summary.executionErrorCount;
   const threshold = options?.threshold ?? 0.8;
+  const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
   const overallPassed =
-    summary.passedCount === gradedCount ||
-    (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
-  const overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+    !allExecutionErrors &&
+    (summary.passedCount === gradedCount ||
+      (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0));
   const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false);
-  const verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
-  const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+
+  let overallVerdict: string;
+  let verdictColor: string;
+  let verdictText: string;
+  if (allExecutionErrors) {
+    overallVerdict = 'INCONCLUSIVE';
+    verdictColor = '\x1b[33m'; // yellow
+    verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors — no evaluation was performed)`;
+  } else {
+    overallVerdict = overallPassed ? 'PASS' : 'FAIL';
+    verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
+    verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+  }

   lines.push('\n==================================================');
   if (useColor) {

diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts
index d13abcc35..feffdef4b 100644
--- a/apps/cli/test/commands/eval/output-writers.test.ts
+++ b/apps/cli/test/commands/eval/output-writers.test.ts
@@ -19,6 +19,7 @@ function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult
     assertions: [{ text: 'criterion-1', passed: true }],
     output: [{ role: 'assistant' as const, content: 'answer' }],
     target: 'default',
+    executionStatus: 'ok',
     ...overrides,
   };
 }
@@ -146,7 +147,14 @@ describe('JunitWriter', () => {
   it('should handle errors as <error> elements', async () => {
     const writer = await JunitWriter.open(testFilePath);
-    await writer.append(makeResult({ testId: 'err-1', score: 0, error: 'Timeout exceeded' }));
+    await writer.append(
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        error: 'Timeout exceeded',
+        executionStatus: 'execution_error',
+      }),
+    );
     await writer.close();

     const xml = await readFile(testFilePath, 'utf8');
@@ -188,6 +196,74 @@ describe('JunitWriter', () => {
     expect(xml).not.toContain(' {
+    const writer = await JunitWriter.open(testFilePath);
+
+    await writer.append(
+      makeResult({
+        testId: 'exec-err',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    );
+    await writer.append(
+      makeResult({ testId: 'quality-fail', score: 0.3, executionStatus: 'quality_failure' }),
+    );
+    await writer.append(makeResult({ testId: 'pass', score: 0.9, executionStatus: 'ok' }));
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    // Execution error produces <error>, not <failure>
+    expect(xml).toContain('
+    expect(xml).toContain(' {
+    const writer = await JunitWriter.open(testFilePath);
+
+    // All execution errors — should have 0 failures, 2 errors
+    await writer.append(
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Provider error',
+      }),
+    );
+    await writer.append(
+      makeResult({
+        testId: 'err-2',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Timeout',
+      }),
+    );
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    expect(xml).toContain('failures="0"');
+    expect(xml).toContain('errors="2"');
+  });
+
+  it('should emit <error> for execution_error even without error message', async () => {
+    const writer = await JunitWriter.open(testFilePath);
+
+    await writer.append(
+      makeResult({ testId: 'no-msg', score: 0, executionStatus: 'execution_error' }),
+    );
+    await writer.close();
+
+    const xml = await readFile(testFilePath, 'utf8');
+    expect(xml).toContain(' {

diff --git a/apps/cli/test/commands/eval/statistics-inconclusive.test.ts b/apps/cli/test/commands/eval/statistics-inconclusive.test.ts
new file mode 100644
index 000000000..76e6196dd
--- /dev/null
+++ b/apps/cli/test/commands/eval/statistics-inconclusive.test.ts
@@ -0,0 +1,103 @@
+import { describe, expect, it } from 'bun:test';
+
+import type { EvaluationResult } from '@agentv/core';
+
+import {
+  calculateEvaluationSummary,
+  formatEvaluationSummary,
+} from '../../../src/commands/eval/statistics.js';
+
+function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
+  return {
+    timestamp: '2024-01-01T00:00:00Z',
+    testId: 'test-1',
+    score: 1.0,
+    assertions: [{ text: 'criterion-1', passed: true }],
+    output: [{ role: 'assistant' as const, content: 'answer' }],
+    target: 'default',
+    executionStatus: 'ok',
+    ...overrides,
+  };
+}
+
+describe('formatEvaluationSummary — inconclusive verdict', () => {
+  it('shows INCONCLUSIVE when all tests are execution errors', () => {
+    const results = [
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+      makeResult({
+        testId: 'err-2',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+      makeResult({
+        testId: 'err-3',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    expect(output).toContain('RESULT: INCONCLUSIVE');
+    expect(output).toContain('all 3 test(s) had execution errors');
+    expect(output).toContain('no evaluation was performed');
+  });
+
+  it('shows PASS/FAIL when only some tests are execution errors', () => {
+    const results = [
+      makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    // Should show PASS (the one graded test passed) not INCONCLUSIVE
+    expect(output).toContain('RESULT: PASS');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+
+  it('shows FAIL when there are quality failures mixed with execution errors', () => {
+    const results = [
+      makeResult({ testId: 'fail-1', score: 0.3, executionStatus: 'quality_failure' }),
+      makeResult({
+        testId: 'err-1',
+        score: 0,
+        executionStatus: 'execution_error',
+        error: 'Not Found',
+      }),
+    ];
+
+    const summary = calculateEvaluationSummary(results, { threshold: 0.8 });
+    const output = formatEvaluationSummary(summary, { threshold: 0.8 });
+
+    expect(output).toContain('RESULT: FAIL');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+
+  it('shows PASS when all tests pass and none are errors', () => {
+    const results = [
+      makeResult({ testId: 'pass-1', score: 0.9, executionStatus: 'ok' }),
+      makeResult({ testId: 'pass-2', score: 0.85, executionStatus: 'ok' }),
+    ];
+
+    const summary = calculateEvaluationSummary(results);
+    const output = formatEvaluationSummary(summary);
+
+    expect(output).toContain('RESULT: PASS');
+    expect(output).not.toContain('INCONCLUSIVE');
+  });
+});

From c73afe640c00540ca578493442c131fc4737bc26 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Sun, 5 Apr 2026 07:41:09 +0000
Subject: [PATCH 2/2] docs: clarify manual red/green UAT as blocking step in E2E checklist

Make the UAT requirement more prominent with a blocking warning, clearer red/green definitions, and explicit dependency on completing UAT before proceeding to later checklist steps.
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 AGENTS.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 9bec0b95d..3bbae5d35 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -266,17 +266,18 @@ Before marking any branch as ready for review, complete this checklist:

 2. **Run unit tests**: `bun run test` — all must pass.

-3. **Manual red/green UAT (REQUIRED for all changes):**
-   Automated tests are not sufficient. Every change must be manually verified from the end user's perspective using a red/green approach:
-   - **Red (before fix):** Reproduce the bug or demonstrate the missing feature on `main` (or before your change). Confirm the undesired behavior is observable from the CLI / user-facing output.
-   - **Green (after fix):** Run the same scenario with your changes applied. Confirm the fix or feature works correctly from the end user's perspective.
-   - Document both the red and green results in the PR or conversation so the user can see the before/after.
+3. **⚠️ BLOCKING: Manual red/green UAT — must complete before steps 4-5:**
+   Unit tests passing is NOT sufficient. Every change must be manually verified from the end user's perspective. Do NOT skip this step or proceed to step 4 until red/green evidence is documented.
+
+   - **Red (before your changes):** Run the scenario on `main` (or the code state before your changes). Confirm the bug or missing feature is observable from the CLI / user-facing output. Capture the output.
+   - **Green (with your changes):** Run the identical scenario with your branch. Confirm the fix or feature works correctly from the end user's perspective. Capture the output.
+   - **Document both** red and green results in the PR description or comments so reviewers can see the before/after evidence.

    For evaluator changes, this means running a real eval (not `--dry-run`) and inspecting the output JSONL. For CLI/UX changes, this means running the CLI command and verifying the console output.

 4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).

-5. **Mark PR as ready** only after all above steps pass.
+5. **Mark PR as ready** only after steps 1-4 have been completed AND red/green UAT evidence is included in the PR.

 ## Documentation Updates
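
Reviewer note (editor's addition, not part of either patch): the sketch below shows how a CI wrapper might consume the three exit codes that the first patch establishes. The `agentv` binary name, the `eval run` subcommand spelling, and the `evals/` path are assumptions made for illustration; only the exit-code contract (0 = pass, 1 = threshold failure, 2 = all tests hit execution errors) comes from the patch itself.

```typescript
// Hypothetical CI wrapper (illustrative only). The command name and paths are
// assumptions; the 0/1/2 exit-code semantics are what the patch introduces.
import { spawnSync } from 'node:child_process';

const run = spawnSync('agentv', ['eval', 'run', 'evals/'], { stdio: 'inherit' });
const status = run.status ?? 1;

if (status === 2) {
  // INCONCLUSIVE: every test hit an execution error, so no scores were produced.
  // Treat this as an infrastructure/configuration problem, not a quality regression.
  console.error('Eval inconclusive: fix the environment and re-run before trusting results.');
} else if (status === 1) {
  // FAIL: the eval ran, but at least one graded test scored below the threshold.
  console.error('Eval failed the quality threshold.');
} else if (status !== 0) {
  console.error(`Eval runner exited with unexpected status ${status}.`);
}
process.exit(status);
```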