From 120088c367865c278f4c69ea9e9d5787a23e7028 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 14 Apr 2026 23:51:22 +0000 Subject: [PATCH 1/3] feat(compare): add normalized gain metric to agentv compare MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Hake's normalized gain (g) to compare output, measuring improvement relative to remaining headroom rather than raw absolute delta. Formula: g = (score_candidate − score_baseline) / (1 − score_baseline) This separates genuine scaffolding from ceiling effects — a +5pp gain from a 90% baseline (g=0.5) is proportionally much larger than +5pp from a 10% baseline (g=0.056). Shown as "Norm. gain" in table output and "g" in matrix pairwise summary. Available as mean_normalized_gain in JSON output. Returns null when baseline is 1.0 (perfect score, no headroom). Closes #1100 Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/compare/index.ts | 51 +++++- .../cli/test/commands/compare/compare.test.ts | 173 +++++++++++++++++- 2 files changed, 207 insertions(+), 17 deletions(-) diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index 53b0c7651..3f8f984ed 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -40,6 +40,7 @@ interface MatchedResult { score1: number; score2: number; delta: number; + normalizedGain: number | null; outcome: 'win' | 'loss' | 'tie'; } @@ -53,6 +54,7 @@ export interface ComparisonOutput { losses: number; ties: number; meanDelta: number; + meanNormalizedGain: number | null; }; baseline?: string; candidate?: string; @@ -111,6 +113,20 @@ export function loadCombinedResults(filePath: string): Map return groups; } +/** + * Hake's normalized gain: g = (score_candidate − score_baseline) / (1 − score_baseline) + * Measures improvement relative to remaining headroom. Returns null when baseline is 1.0 + * (perfect score leaves no room for improvement). 
+ * Reference: Hake (1998), used by SkillsBench (arXiv:2602.12670). + */ +export function computeNormalizedGain( + baselineScore: number, + candidateScore: number, +): number | null { + if (baselineScore >= 1.0) return null; + return (candidateScore - baselineScore) / (1 - baselineScore); +} + export function classifyOutcome(delta: number, threshold: number): 'win' | 'loss' | 'tie' { if (delta >= threshold) return 'win'; if (delta <= -threshold) return 'loss'; @@ -137,6 +153,7 @@ export function compareResults( score1, score2, delta, + normalizedGain: computeNormalizedGain(score1, score2), outcome: classifyOutcome(delta, threshold), }); matchedIds.add(testId); @@ -153,6 +170,12 @@ export function compareResults( const meanDelta = matched.length > 0 ? matched.reduce((sum, m) => sum + m.delta, 0) / matched.length : 0; + const gainValues = matched.map((m) => m.normalizedGain).filter((g): g is number => g !== null); + const meanNormalizedGain = + gainValues.length > 0 + ? Math.round((gainValues.reduce((sum, g) => sum + g, 0) / gainValues.length) * 1000) / 1000 + : null; + return { matched, unmatched: { file1: unmatchedFile1, file2: unmatchedFile2 }, @@ -163,6 +186,7 @@ export function compareResults( losses, ties, meanDelta: Math.round(meanDelta * 1000) / 1000, + meanNormalizedGain, }, }; } @@ -323,7 +347,7 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2: // Summary lines.push(''); - const { wins, losses, ties, meanDelta } = comparison.summary; + const { wins, losses, ties, meanDelta, meanNormalizedGain } = comparison.summary; const winStr = wins > 0 ? `${c.green}${wins} win${wins !== 1 ? 's' : ''}${c.reset}` : `${wins} wins`; @@ -340,9 +364,15 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2: ? 
`${c.red}regressed${c.reset}` : `${c.gray}neutral${c.reset}`; - lines.push( - `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset} | Status: ${status}`, - ); + let summaryLine = `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset}`; + if (meanNormalizedGain != null) { + const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray; + const gSign = meanNormalizedGain >= 0 ? '+' : ''; + summaryLine += ` | Norm. gain: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`; + } + summaryLine += ` | Status: ${status}`; + + lines.push(summaryLine); lines.push(''); return lines.join('\n'); @@ -414,13 +444,18 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string ...pairwise.map((pw) => ` ${pw.baseline} → ${pw.candidate}:`.length), ); for (const p of pairwise) { - const { wins, losses, ties, meanDelta } = p.summary; + const { wins, losses, ties, meanDelta, meanNormalizedGain } = p.summary; const sign = meanDelta >= 0 ? '+' : ''; const deltaColor = meanDelta > 0 ? c.green : meanDelta < 0 ? c.red : c.gray; const label = ` ${p.baseline} → ${p.candidate}:`; - lines.push( - `${padRight(label, maxLabelLen)} ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''} (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset})`, - ); + let pairLine = `${padRight(label, maxLabelLen)} ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''} (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset}`; + if (meanNormalizedGain != null) { + const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray; + const gSign = meanNormalizedGain >= 0 ? 
'+' : ''; + pairLine += `, ${c.bold}g${c.reset} ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`; + } + pairLine += ')'; + lines.push(pairLine); } } diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts index 3b77b1bb1..8b082071b 100644 --- a/apps/cli/test/commands/compare/compare.test.ts +++ b/apps/cli/test/commands/compare/compare.test.ts @@ -7,6 +7,7 @@ import { classifyOutcome, compareMatrix, compareResults, + computeNormalizedGain, determineExitCode, determineMatrixExitCode, formatMatrix, @@ -459,7 +460,15 @@ describe('compare command', () => { { matched: [], unmatched: { file1: 0, file2: 0 }, - summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 }, + summary: { + total: 2, + matched: 1, + wins: 1, + losses: 0, + ties: 0, + meanDelta: 0.1, + meanNormalizedGain: null, + }, baseline: 'base', candidate: 'cand', }, @@ -476,14 +485,30 @@ describe('compare command', () => { { matched: [], unmatched: { file1: 0, file2: 0 }, - summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 }, + summary: { + total: 2, + matched: 1, + wins: 0, + losses: 1, + ties: 0, + meanDelta: -0.1, + meanNormalizedGain: null, + }, baseline: 'base', candidate: 'cand1', }, { matched: [], unmatched: { file1: 0, file2: 0 }, - summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 }, + summary: { + total: 2, + matched: 1, + wins: 1, + losses: 0, + ties: 0, + meanDelta: 0.1, + meanNormalizedGain: null, + }, baseline: 'base', candidate: 'cand2', }, @@ -500,14 +525,30 @@ describe('compare command', () => { { matched: [], unmatched: { file1: 0, file2: 0 }, - summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.05 }, + summary: { + total: 2, + matched: 1, + wins: 1, + losses: 0, + ties: 0, + meanDelta: 0.05, + meanNormalizedGain: null, + }, baseline: 'base', candidate: 'cand1', }, { matched: [], unmatched: { file1: 0, file2: 0 }, - summary: { 
total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.2 }, + summary: { + total: 2, + matched: 1, + wins: 0, + losses: 1, + ties: 0, + meanDelta: -0.2, + meanNormalizedGain: null, + }, baseline: 'cand1', candidate: 'cand2', }, @@ -530,7 +571,15 @@ describe('compare command', () => { matched: [], unmatched: { file1: 0, file2: 0 }, // delta > 0 means candidate (zeta/baseline) scored higher → alpha regressed - summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.2 }, + summary: { + total: 2, + matched: 1, + wins: 1, + losses: 0, + ties: 0, + meanDelta: 0.2, + meanNormalizedGain: null, + }, baseline: 'alpha', candidate: 'zeta', }, @@ -550,7 +599,15 @@ describe('compare command', () => { unmatched: { file1: 0, file2: 0 }, // delta < 0 means candidate (zeta/baseline) scored lower → alpha is better // That means alpha did NOT regress vs baseline zeta - summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 }, + summary: { + total: 2, + matched: 1, + wins: 0, + losses: 1, + ties: 0, + meanDelta: -0.1, + meanNormalizedGain: null, + }, baseline: 'alpha', candidate: 'zeta', }, @@ -584,7 +641,15 @@ describe('compare command', () => { { matched: [], unmatched: { file1: 0, file2: 0 }, - summary: { total: 4, matched: 2, wins: 1, losses: 1, ties: 0, meanDelta: 0.025 }, + summary: { + total: 4, + matched: 2, + wins: 1, + losses: 1, + ties: 0, + meanDelta: 0.025, + meanNormalizedGain: null, + }, baseline: 'model-a', candidate: 'model-b', }, @@ -622,7 +687,15 @@ describe('compare command', () => { { matched: [], unmatched: { file1: 0, file2: 0 }, - summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 }, + summary: { + total: 2, + matched: 1, + wins: 1, + losses: 0, + ties: 0, + meanDelta: 0.1, + meanNormalizedGain: null, + }, baseline: 'a', candidate: 'b', }, @@ -648,4 +721,86 @@ describe('compare command', () => { expect(output).toContain('--'); }); }); + + describe('computeNormalizedGain', () => { + 
it('should compute gain relative to remaining headroom', () => { + // baseline 0.5, candidate 0.75 → gained 0.25 out of 0.5 headroom = 0.5 + expect(computeNormalizedGain(0.5, 0.75)).toBeCloseTo(0.5, 10); + }); + + it('should return 1.0 when candidate reaches perfect score', () => { + expect(computeNormalizedGain(0.5, 1.0)).toBeCloseTo(1.0, 10); + }); + + it('should return negative values when candidate regresses', () => { + // baseline 0.5, candidate 0.25 → lost 0.25 out of 0.5 headroom = -0.5 + expect(computeNormalizedGain(0.5, 0.25)).toBeCloseTo(-0.5, 10); + }); + + it('should return null when baseline is perfect (no headroom)', () => { + expect(computeNormalizedGain(1.0, 1.0)).toBeNull(); + expect(computeNormalizedGain(1.0, 0.5)).toBeNull(); + }); + + it('should return 0 when scores are equal', () => { + expect(computeNormalizedGain(0.5, 0.5)).toBeCloseTo(0, 10); + }); + + it('should handle low baseline correctly', () => { + // baseline 0.1, candidate 0.55 → gained 0.45 out of 0.9 headroom = 0.5 + expect(computeNormalizedGain(0.1, 0.55)).toBeCloseTo(0.5, 10); + }); + }); + + describe('compareResults normalized gain', () => { + it('should include normalizedGain in matched results', () => { + const results1 = [{ testId: 'case-1', score: 0.5 }]; + const results2 = [{ testId: 'case-1', score: 0.75 }]; + + const comparison = compareResults(results1, results2, 0.1); + + expect(comparison.matched[0].normalizedGain).toBeCloseTo(0.5, 10); + }); + + it('should compute meanNormalizedGain in summary', () => { + const results1 = [ + { testId: 'case-1', score: 0.5 }, + { testId: 'case-2', score: 0.8 }, + ]; + const results2 = [ + { testId: 'case-1', score: 0.75 }, // g = 0.25/0.5 = 0.5 + { testId: 'case-2', score: 0.9 }, // g = 0.1/0.2 = 0.5 + ]; + + const comparison = compareResults(results1, results2, 0.1); + + expect(comparison.summary.meanNormalizedGain).toBeCloseTo(0.5, 10); + }); + + it('should set normalizedGain to null when baseline is 1.0', () => { + const results1 = 
[{ testId: 'case-1', score: 1.0 }]; + const results2 = [{ testId: 'case-1', score: 1.0 }]; + + const comparison = compareResults(results1, results2, 0.1); + + expect(comparison.matched[0].normalizedGain).toBeNull(); + expect(comparison.summary.meanNormalizedGain).toBeNull(); + }); + + it('should exclude null gains from mean computation', () => { + const results1 = [ + { testId: 'case-1', score: 0.5 }, + { testId: 'case-2', score: 1.0 }, // perfect baseline, gain is null + ]; + const results2 = [ + { testId: 'case-1', score: 0.75 }, // g = 0.5 + { testId: 'case-2', score: 1.0 }, + ]; + + const comparison = compareResults(results1, results2, 0.1); + + // Only case-1 contributes to mean (g=0.5); case-2 is excluded + expect(comparison.summary.meanNormalizedGain).toBeCloseTo(0.5, 10); + }); + }); }); From 6da47fb3a995563447a7a81d4b7182efe5c4be9c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 14 Apr 2026 23:54:44 +0000 Subject: [PATCH 2/3] refactor(compare): use standard symbol 'g' for normalized gain Use 'g' consistently in both table summary and matrix pairwise output, matching the standard notation from Hake (1998) and SkillsBench paper. Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/compare/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index 3f8f984ed..bf4852606 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -368,7 +368,7 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2: if (meanNormalizedGain != null) { const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray; const gSign = meanNormalizedGain >= 0 ? '+' : ''; - summaryLine += ` | Norm. 
gain: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`; + summaryLine += ` | g: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`; } summaryLine += ` | Status: ${status}`; From ea1e60a2cfa4f3aeef665869827b22b60bbe42d4 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 14 Apr 2026 23:58:49 +0000 Subject: [PATCH 3/3] docs(compare): document normalized gain metric Add normalized gain (g) to compare docs: formula, interpretation table, updated table/JSON output examples, and tips section. Co-Authored-By: Claude Sonnet 4.6 --- .../src/content/docs/docs/tools/compare.mdx | 58 +++++++++++++------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/apps/web/src/content/docs/docs/tools/compare.mdx b/apps/web/src/content/docs/docs/tools/compare.mdx index 392e9d737..85109e531 100644 --- a/apps/web/src/content/docs/docs/tools/compare.mdx +++ b/apps/web/src/content/docs/docs/tools/compare.mdx @@ -31,26 +31,45 @@ agentv compare before.jsonl after.jsonl 1. **Load Results** -- reads both JSONL files containing evaluation results 2. **Match by test_id** -- pairs results with matching `test_id` fields 3. **Compute Deltas** -- calculates `delta = score2 - score1` for each pair -4. **Classify Outcomes**: +4. **Compute Normalized Gain** -- calculates `g = delta / (1 - score1)` for each pair (see below) +5. **Classify Outcomes**: - **win**: delta >= threshold (candidate better) - **loss**: delta <= -threshold (baseline better) - **tie**: |delta| < threshold (no significant difference) -5. **Output Summary** -- human-readable table or JSON +6. **Output Summary** -- human-readable table or JSON + +## Normalized Gain (g) + +In addition to raw delta, `compare` reports **normalized gain** (`g`): + +``` +g = (score_candidate − score_baseline) / (1 − score_baseline) +``` + +`g` measures improvement relative to remaining headroom rather than as an absolute number. 
This matters when baselines differ across tasks: + +| Baseline | Candidate | Δ | g | Interpretation | +|----------|-----------|------|------|----------------| +| 0.10 | 0.55 | +0.45 | +0.50 | Captured 50% of remaining headroom | +| 0.90 | 0.95 | +0.05 | +0.50 | Same proportional gain despite smaller Δ | +| 0.50 | 0.25 | −0.25 | −0.50 | Regression: score fell by an amount equal to 50% of the baseline headroom | + +`g` is `null` when the baseline is already 1.0 (no headroom to improve). Null values are excluded from the mean. ## Output Formats ### Table Format (default) ``` -Comparing: baseline.jsonl -> candidate.jsonl +Comparing: baseline/ → candidate/ - Test ID Baseline Candidate Delta Result - ------------- -------- --------- -------- -------- - safety-check 0.70 0.90 +0.20 win - accuracy-test 0.85 0.80 -0.05 = tie - latency-eval 0.90 0.75 -0.15 loss + Test ID Baseline Candidate Delta Result + ─────────────────── ──────── ───────── ──────── ──────── + fix-cwd-bug 0.00 0.60 +0.60 ✓ win + spec-driven-impl 0.40 0.80 +0.40 ✓ win + multi-file-refactor 0.60 0.40 -0.20 ✗ loss -Summary: 1 win, 1 loss, 1 tie | Mean delta: +0.000 | Status: neutral +Summary: 2 wins, 1 loss, 0 ties | Mean Δ: +0.267 | g: +0.256 | Status: improved ``` Wins are highlighted green, losses red, and ties gray. Colors are automatically disabled when output is piped or `NO_COLOR` is set. @@ -63,10 +82,11 @@ Use `--json` or `--format=json` for machine-readable output. Fields use snake_ca { "matched": [ { - "test_id": "case-1", - "score1": 0.7, - "score2": 0.9, - "delta": 0.2, + "test_id": "fix-cwd-bug", + "score1": 0.0, + "score2": 0.6, + "delta": 0.6, + "normalized_gain": 0.6, "outcome": "win" } ], @@ -75,12 +95,13 @@ Use `--json` or `--format=json` for machine-readable output. 
Fields use snake_ca "file2": 0 }, "summary": { - "total": 2, - "matched": 1, - "wins": 1, - "losses": 0, + "total": 6, + "matched": 3, + "wins": 2, + "losses": 1, "ties": 0, - "mean_delta": 0.2 + "mean_delta": 0.267, + "mean_normalized_gain": 0.256 } } ``` @@ -143,5 +164,6 @@ echo "Candidate is equal or better than baseline." ## Tips - **Threshold selection** -- the default 0.1 means a 10% difference is required for a win or loss. Use stricter thresholds (0.05) for critical evaluations. +- **Normalized gain vs delta** -- use `g` to compare across tasks with different baseline difficulty; use `Δ` for absolute improvement tracking. - **Unmatched results** -- check `unmatched` counts in JSON output to identify tests that only exist in one file. - **Multiple comparisons** -- compare against multiple baselines by running the command multiple times.