From 120088c367865c278f4c69ea9e9d5787a23e7028 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Tue, 14 Apr 2026 23:51:22 +0000
Subject: [PATCH 1/3] feat(compare): add normalized gain metric to agentv
 compare
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add Hake's normalized gain (g) to compare output, measuring improvement
relative to remaining headroom rather than raw absolute delta.

Formula: g = (score_candidate − score_baseline) / (1 − score_baseline)

This separates genuine scaffolding from ceiling effects — a +5pp gain
from a 90% baseline (g=0.5) is proportionally much larger than +5pp
from a 10% baseline (g=0.056).

Shown as "Norm. gain" in table output and "g" in matrix pairwise summary.
Available as mean_normalized_gain in JSON output. Returns null when
baseline is 1.0 (perfect score, no headroom).

Closes #1100

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/compare/index.ts        |  51 +++++-
 .../cli/test/commands/compare/compare.test.ts | 173 +++++++++++++++++-
 2 files changed, 207 insertions(+), 17 deletions(-)

diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
index 53b0c7651..3f8f984ed 100644
--- a/apps/cli/src/commands/compare/index.ts
+++ b/apps/cli/src/commands/compare/index.ts
@@ -40,6 +40,7 @@ interface MatchedResult {
   score1: number;
   score2: number;
   delta: number;
+  normalizedGain: number | null;
   outcome: 'win' | 'loss' | 'tie';
 }
 
@@ -53,6 +54,7 @@ export interface ComparisonOutput {
     losses: number;
     ties: number;
     meanDelta: number;
+    meanNormalizedGain: number | null;
   };
   baseline?: string;
   candidate?: string;
@@ -111,6 +113,20 @@ export function loadCombinedResults(filePath: string): Map<string, EvalResult[]>
   return groups;
 }
 
+/**
+ * Hake's normalized gain: g = (score_candidate − score_baseline) / (1 − score_baseline).
+ * Measures improvement relative to remaining headroom; negative when the candidate regresses.
+ * Returns null when the baseline is >= 1.0 (no headroom left, so the denominator would be <= 0).
+ * Reference: Hake (1998), used by SkillsBench (arXiv:2602.12670).
+ */
+export function computeNormalizedGain(
+  baselineScore: number,
+  candidateScore: number,
+): number | null {
+  if (baselineScore >= 1.0) return null;
+  return (candidateScore - baselineScore) / (1 - baselineScore);
+}
+
 export function classifyOutcome(delta: number, threshold: number): 'win' | 'loss' | 'tie' {
   if (delta >= threshold) return 'win';
   if (delta <= -threshold) return 'loss';
@@ -137,6 +153,7 @@ export function compareResults(
         score1,
         score2,
         delta,
+        normalizedGain: computeNormalizedGain(score1, score2),
         outcome: classifyOutcome(delta, threshold),
       });
       matchedIds.add(testId);
@@ -153,6 +170,12 @@ export function compareResults(
   const meanDelta =
     matched.length > 0 ? matched.reduce((sum, m) => sum + m.delta, 0) / matched.length : 0;
 
+  const gainValues = matched.map((m) => m.normalizedGain).filter((g): g is number => g !== null);
+  const meanNormalizedGain =
+    gainValues.length > 0
+      ? Math.round((gainValues.reduce((sum, g) => sum + g, 0) / gainValues.length) * 1000) / 1000
+      : null;
+
   return {
     matched,
     unmatched: { file1: unmatchedFile1, file2: unmatchedFile2 },
@@ -163,6 +186,7 @@ export function compareResults(
       losses,
       ties,
       meanDelta: Math.round(meanDelta * 1000) / 1000,
+      meanNormalizedGain,
     },
   };
 }
@@ -323,7 +347,7 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
 
   // Summary
   lines.push('');
-  const { wins, losses, ties, meanDelta } = comparison.summary;
+  const { wins, losses, ties, meanDelta, meanNormalizedGain } = comparison.summary;
 
   const winStr =
     wins > 0 ? `${c.green}${wins} win${wins !== 1 ? 's' : ''}${c.reset}` : `${wins} wins`;
@@ -340,9 +364,15 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
         ? `${c.red}regressed${c.reset}`
         : `${c.gray}neutral${c.reset}`;
 
-  lines.push(
-    `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset} | Status: ${status}`,
-  );
+  let summaryLine = `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset}`;
+  if (meanNormalizedGain != null) {
+    const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
+    const gSign = meanNormalizedGain >= 0 ? '+' : '';
+    summaryLine += ` | Norm. gain: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
+  }
+  summaryLine += ` | Status: ${status}`;
+
+  lines.push(summaryLine);
   lines.push('');
 
   return lines.join('\n');
@@ -414,13 +444,18 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
       ...pairwise.map((pw) => `  ${pw.baseline} → ${pw.candidate}:`.length),
     );
     for (const p of pairwise) {
-      const { wins, losses, ties, meanDelta } = p.summary;
+      const { wins, losses, ties, meanDelta, meanNormalizedGain } = p.summary;
       const sign = meanDelta >= 0 ? '+' : '';
       const deltaColor = meanDelta > 0 ? c.green : meanDelta < 0 ? c.red : c.gray;
       const label = `  ${p.baseline} → ${p.candidate}:`;
-      lines.push(
-        `${padRight(label, maxLabelLen)}  ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''}  (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset})`,
-      );
+      let pairLine = `${padRight(label, maxLabelLen)}  ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''}  (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset}`;
+      if (meanNormalizedGain != null) {
+        const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
+        const gSign = meanNormalizedGain >= 0 ? '+' : '';
+        pairLine += `, ${c.bold}g${c.reset} ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
+      }
+      pairLine += ')';
+      lines.push(pairLine);
     }
   }
 
diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts
index 3b77b1bb1..8b082071b 100644
--- a/apps/cli/test/commands/compare/compare.test.ts
+++ b/apps/cli/test/commands/compare/compare.test.ts
@@ -7,6 +7,7 @@ import {
   classifyOutcome,
   compareMatrix,
   compareResults,
+  computeNormalizedGain,
   determineExitCode,
   determineMatrixExitCode,
   formatMatrix,
@@ -459,7 +460,15 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand',
           },
@@ -476,14 +485,30 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 0,
+              losses: 1,
+              ties: 0,
+              meanDelta: -0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand1',
           },
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand2',
           },
@@ -500,14 +525,30 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.05 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.05,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand1',
           },
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.2 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 0,
+              losses: 1,
+              ties: 0,
+              meanDelta: -0.2,
+              meanNormalizedGain: null,
+            },
             baseline: 'cand1',
             candidate: 'cand2',
           },
@@ -530,7 +571,15 @@ describe('compare command', () => {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
             // delta > 0 means candidate (zeta/baseline) scored higher → alpha regressed
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.2 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.2,
+              meanNormalizedGain: null,
+            },
             baseline: 'alpha',
             candidate: 'zeta',
           },
@@ -550,7 +599,15 @@ describe('compare command', () => {
             unmatched: { file1: 0, file2: 0 },
             // delta < 0 means candidate (zeta/baseline) scored lower → alpha is better
             // That means alpha did NOT regress vs baseline zeta
-            summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 0,
+              losses: 1,
+              ties: 0,
+              meanDelta: -0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'alpha',
             candidate: 'zeta',
           },
@@ -584,7 +641,15 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 4, matched: 2, wins: 1, losses: 1, ties: 0, meanDelta: 0.025 },
+            summary: {
+              total: 4,
+              matched: 2,
+              wins: 1,
+              losses: 1,
+              ties: 0,
+              meanDelta: 0.025,
+              meanNormalizedGain: null,
+            },
             baseline: 'model-a',
             candidate: 'model-b',
           },
@@ -622,7 +687,15 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'a',
             candidate: 'b',
           },
@@ -648,4 +721,86 @@ describe('compare command', () => {
       expect(output).toContain('--');
     });
   });
+
+  describe('computeNormalizedGain', () => {
+    it('should compute gain relative to remaining headroom', () => {
+      // baseline 0.5, candidate 0.75 → gained 0.25 out of 0.5 headroom = 0.5
+      expect(computeNormalizedGain(0.5, 0.75)).toBeCloseTo(0.5, 10);
+    });
+
+    it('should return 1.0 when candidate reaches perfect score', () => {
+      expect(computeNormalizedGain(0.5, 1.0)).toBeCloseTo(1.0, 10);
+    });
+
+    it('should return negative values when candidate regresses', () => {
+      // baseline 0.5, candidate 0.25 → lost 0.25 out of 0.5 headroom = -0.5
+      expect(computeNormalizedGain(0.5, 0.25)).toBeCloseTo(-0.5, 10);
+    });
+
+    it('should return null when baseline is perfect (no headroom)', () => {
+      expect(computeNormalizedGain(1.0, 1.0)).toBeNull();
+      expect(computeNormalizedGain(1.0, 0.5)).toBeNull();
+    });
+
+    it('should return 0 when scores are equal', () => {
+      expect(computeNormalizedGain(0.5, 0.5)).toBeCloseTo(0, 10);
+    });
+
+    it('should handle low baseline correctly', () => {
+      // baseline 0.1, candidate 0.55 → gained 0.45 out of 0.9 headroom = 0.5
+      expect(computeNormalizedGain(0.1, 0.55)).toBeCloseTo(0.5, 10);
+    });
+  });
+
+  describe('compareResults normalized gain', () => {
+    it('should include normalizedGain in matched results', () => {
+      const results1 = [{ testId: 'case-1', score: 0.5 }];
+      const results2 = [{ testId: 'case-1', score: 0.75 }];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      expect(comparison.matched[0].normalizedGain).toBeCloseTo(0.5, 10);
+    });
+
+    it('should compute meanNormalizedGain in summary', () => {
+      const results1 = [
+        { testId: 'case-1', score: 0.5 },
+        { testId: 'case-2', score: 0.8 },
+      ];
+      const results2 = [
+        { testId: 'case-1', score: 0.75 }, // g = 0.25/0.5 = 0.5
+        { testId: 'case-2', score: 0.9 }, // g = 0.1/0.2 = 0.5
+      ];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      expect(comparison.summary.meanNormalizedGain).toBeCloseTo(0.5, 10);
+    });
+
+    it('should set normalizedGain to null when baseline is 1.0', () => {
+      const results1 = [{ testId: 'case-1', score: 1.0 }];
+      const results2 = [{ testId: 'case-1', score: 1.0 }];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      expect(comparison.matched[0].normalizedGain).toBeNull();
+      expect(comparison.summary.meanNormalizedGain).toBeNull();
+    });
+
+    it('should exclude null gains from mean computation', () => {
+      const results1 = [
+        { testId: 'case-1', score: 0.5 },
+        { testId: 'case-2', score: 1.0 }, // perfect baseline, gain is null
+      ];
+      const results2 = [
+        { testId: 'case-1', score: 0.75 }, // g = 0.5
+        { testId: 'case-2', score: 1.0 },
+      ];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      // Only case-1 contributes to mean (g=0.5); case-2 is excluded
+      expect(comparison.summary.meanNormalizedGain).toBeCloseTo(0.5, 10);
+    });
+  });
 });

From 6da47fb3a995563447a7a81d4b7182efe5c4be9c Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Tue, 14 Apr 2026 23:54:44 +0000
Subject: [PATCH 2/3] refactor(compare): use standard symbol 'g' for normalized
 gain

Use 'g' consistently in both table summary and matrix pairwise output,
matching the standard notation from Hake (1998) and SkillsBench paper.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/compare/index.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
index 3f8f984ed..bf4852606 100644
--- a/apps/cli/src/commands/compare/index.ts
+++ b/apps/cli/src/commands/compare/index.ts
@@ -368,7 +368,7 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
   if (meanNormalizedGain != null) {
     const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
     const gSign = meanNormalizedGain >= 0 ? '+' : '';
-    summaryLine += ` | Norm. gain: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
+    summaryLine += ` | g: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
   }
   summaryLine += ` | Status: ${status}`;
 

From ea1e60a2cfa4f3aeef665869827b22b60bbe42d4 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Tue, 14 Apr 2026 23:58:49 +0000
Subject: [PATCH 3/3] docs(compare): document normalized gain metric

Add normalized gain (g) to compare docs: formula, interpretation table,
updated table/JSON output examples, and tips section.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../src/content/docs/docs/tools/compare.mdx   | 58 +++++++++++++------
 1 file changed, 40 insertions(+), 18 deletions(-)

diff --git a/apps/web/src/content/docs/docs/tools/compare.mdx b/apps/web/src/content/docs/docs/tools/compare.mdx
index 392e9d737..85109e531 100644
--- a/apps/web/src/content/docs/docs/tools/compare.mdx
+++ b/apps/web/src/content/docs/docs/tools/compare.mdx
@@ -31,26 +31,45 @@ agentv compare before.jsonl after.jsonl
 1. **Load Results** -- reads both JSONL files containing evaluation results
 2. **Match by test_id** -- pairs results with matching `test_id` fields
 3. **Compute Deltas** -- calculates `delta = score2 - score1` for each pair
-4. **Classify Outcomes**:
+4. **Compute Normalized Gain** -- calculates `g = delta / (1 - score1)` for each pair (see below)
+5. **Classify Outcomes**:
    - **win**: delta &gt;= threshold (candidate better)
    - **loss**: delta &lt;= -threshold (baseline better)
    - **tie**: |delta| &lt; threshold (no significant difference)
-5. **Output Summary** -- human-readable table or JSON
+6. **Output Summary** -- human-readable table or JSON
+
+## Normalized Gain (g)
+
+In addition to raw delta, `compare` reports **normalized gain** (`g`):
+
+```
+g = (score_candidate − score_baseline) / (1 − score_baseline)
+```
+
+`g` measures improvement relative to remaining headroom rather than as an absolute number. This matters when baselines differ across tasks:
+
+| Baseline | Candidate | Δ | g | Interpretation |
+|----------|-----------|------|------|----------------|
+| 0.10 | 0.55 | +0.45 | +0.50 | Captured 50% of remaining headroom |
+| 0.90 | 0.95 | +0.05 | +0.50 | Same proportional gain despite smaller Δ |
+| 0.50 | 0.25 | −0.25 | −0.50 | Regression: dropped by 50% of the baseline's headroom |
+
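+`g` is `null` when the baseline is already 1.0 (no headroom to improve). Null values are excluded from the mean; if no matched pair has headroom, the mean itself is `null` and `g` is omitted from the table summary.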
 
 ## Output Formats
 
 ### Table Format (default)
 
 ```
-Comparing: baseline.jsonl -> candidate.jsonl
+Comparing: baseline/ → candidate/
 
-  Test ID        Baseline  Candidate     Delta  Result
-  -------------  --------  ---------  --------  --------
-  safety-check       0.70       0.90     +0.20  win
-  accuracy-test      0.85       0.80     -0.05  = tie
-  latency-eval       0.90       0.75     -0.15  loss
+  Test ID              Baseline  Candidate     Delta  Result
+  ───────────────────  ────────  ─────────  ────────  ────────
+  fix-cwd-bug              0.00       0.60     +0.60  ✓ win
+  spec-driven-impl         0.40       0.80     +0.40  ✓ win
+  multi-file-refactor      0.60       0.40     -0.20  ✗ loss
 
-Summary: 1 win, 1 loss, 1 tie | Mean delta: +0.000 | Status: neutral
+Summary: 2 wins, 1 loss, 0 ties | Mean Δ: +0.267 | g: +0.256 | Status: improved
 ```
 
 Wins are highlighted green, losses red, and ties gray. Colors are automatically disabled when output is piped or `NO_COLOR` is set.
@@ -63,10 +82,11 @@ Use `--json` or `--format=json` for machine-readable output. Fields use snake_ca
 {
   "matched": [
     {
-      "test_id": "case-1",
-      "score1": 0.7,
-      "score2": 0.9,
-      "delta": 0.2,
+      "test_id": "fix-cwd-bug",
+      "score1": 0.0,
+      "score2": 0.6,
+      "delta": 0.6,
+      "normalized_gain": 0.6,
       "outcome": "win"
     }
   ],
@@ -75,12 +95,13 @@ Use `--json` or `--format=json` for machine-readable output. Fields use snake_ca
     "file2": 0
   },
   "summary": {
-    "total": 2,
-    "matched": 1,
-    "wins": 1,
-    "losses": 0,
+    "total": 6,
+    "matched": 3,
+    "wins": 2,
+    "losses": 1,
     "ties": 0,
-    "mean_delta": 0.2
+    "mean_delta": 0.267,
+    "mean_normalized_gain": 0.256
   }
 }
 ```
@@ -143,5 +164,6 @@ echo "Candidate is equal or better than baseline."
 ## Tips
 
 - **Threshold selection** -- the default 0.1 means a 10% difference is required for a win or loss. Use stricter thresholds (0.05) for critical evaluations.
+- **Normalized gain vs delta** -- use `g` to compare across tasks with different baseline difficulty; use `Δ` for absolute improvement tracking.
 - **Unmatched results** -- check `unmatched` counts in JSON output to identify tests that only exist in one file.
 - **Multiple comparisons** -- compare against multiple baselines by running the command multiple times.