Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 43 additions & 8 deletions apps/cli/src/commands/compare/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ interface MatchedResult {
score1: number;
score2: number;
delta: number;
normalizedGain: number | null;
outcome: 'win' | 'loss' | 'tie';
}

Expand All @@ -53,6 +54,7 @@ export interface ComparisonOutput {
losses: number;
ties: number;
meanDelta: number;
meanNormalizedGain: number | null;
};
baseline?: string;
candidate?: string;
Expand Down Expand Up @@ -111,6 +113,20 @@ export function loadCombinedResults(filePath: string): Map<string, EvalResult[]>
return groups;
}

/**
 * Hake's normalized gain: g = (score_candidate − score_baseline) / (1 − score_baseline).
 *
 * Expresses the candidate's change in score as a fraction of the headroom the
 * baseline left below 1.0. The result is negative when the candidate scores
 * lower than the baseline.
 * Reference: Hake (1998), used by SkillsBench (arXiv:2602.12670).
 *
 * @param baselineScore - Score of the baseline run.
 * @param candidateScore - Score of the candidate run.
 * @returns The normalized gain, or `null` when the gain is undefined: the
 *   baseline is at or above 1.0 (no headroom left to improve into), or either
 *   score is not a finite number.
 */
export function computeNormalizedGain(
  baselineScore: number,
  candidateScore: number,
): number | null {
  // NaN fails every comparison, so it would slip past the headroom guard below
  // and come back as NaN (and ±Infinity would come back as ±Infinity/NaN).
  // Callers filter on `null` only, so report "undefined" explicitly instead.
  if (!Number.isFinite(baselineScore) || !Number.isFinite(candidateScore)) {
    return null;
  }

  // A perfect (or better) baseline leaves nothing to divide by.
  if (baselineScore >= 1.0) {
    return null;
  }

  const headroom = 1 - baselineScore;
  return (candidateScore - baselineScore) / headroom;
}

export function classifyOutcome(delta: number, threshold: number): 'win' | 'loss' | 'tie' {
if (delta >= threshold) return 'win';
if (delta <= -threshold) return 'loss';
Expand All @@ -137,6 +153,7 @@ export function compareResults(
score1,
score2,
delta,
normalizedGain: computeNormalizedGain(score1, score2),
outcome: classifyOutcome(delta, threshold),
});
matchedIds.add(testId);
Expand All @@ -153,6 +170,12 @@ export function compareResults(
const meanDelta =
matched.length > 0 ? matched.reduce((sum, m) => sum + m.delta, 0) / matched.length : 0;

const gainValues = matched.map((m) => m.normalizedGain).filter((g): g is number => g !== null);
const meanNormalizedGain =
gainValues.length > 0
? Math.round((gainValues.reduce((sum, g) => sum + g, 0) / gainValues.length) * 1000) / 1000
: null;

return {
matched,
unmatched: { file1: unmatchedFile1, file2: unmatchedFile2 },
Expand All @@ -163,6 +186,7 @@ export function compareResults(
losses,
ties,
meanDelta: Math.round(meanDelta * 1000) / 1000,
meanNormalizedGain,
},
};
}
Expand Down Expand Up @@ -323,7 +347,7 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:

// Summary
lines.push('');
const { wins, losses, ties, meanDelta } = comparison.summary;
const { wins, losses, ties, meanDelta, meanNormalizedGain } = comparison.summary;

const winStr =
wins > 0 ? `${c.green}${wins} win${wins !== 1 ? 's' : ''}${c.reset}` : `${wins} wins`;
Expand All @@ -340,9 +364,15 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
? `${c.red}regressed${c.reset}`
: `${c.gray}neutral${c.reset}`;

lines.push(
`${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset} | Status: ${status}`,
);
let summaryLine = `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset}`;
if (meanNormalizedGain != null) {
const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
const gSign = meanNormalizedGain >= 0 ? '+' : '';
summaryLine += ` | g: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
}
summaryLine += ` | Status: ${status}`;

lines.push(summaryLine);
lines.push('');

return lines.join('\n');
Expand Down Expand Up @@ -414,13 +444,18 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
...pairwise.map((pw) => ` ${pw.baseline} → ${pw.candidate}:`.length),
);
for (const p of pairwise) {
const { wins, losses, ties, meanDelta } = p.summary;
const { wins, losses, ties, meanDelta, meanNormalizedGain } = p.summary;
const sign = meanDelta >= 0 ? '+' : '';
const deltaColor = meanDelta > 0 ? c.green : meanDelta < 0 ? c.red : c.gray;
const label = ` ${p.baseline} → ${p.candidate}:`;
lines.push(
`${padRight(label, maxLabelLen)} ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''} (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset})`,
);
let pairLine = `${padRight(label, maxLabelLen)} ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''} (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset}`;
if (meanNormalizedGain != null) {
const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
const gSign = meanNormalizedGain >= 0 ? '+' : '';
pairLine += `, ${c.bold}g${c.reset} ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
}
pairLine += ')';
lines.push(pairLine);
}
}

Expand Down
173 changes: 164 additions & 9 deletions apps/cli/test/commands/compare/compare.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
classifyOutcome,
compareMatrix,
compareResults,
computeNormalizedGain,
determineExitCode,
determineMatrixExitCode,
formatMatrix,
Expand Down Expand Up @@ -459,7 +460,15 @@ describe('compare command', () => {
{
matched: [],
unmatched: { file1: 0, file2: 0 },
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
summary: {
total: 2,
matched: 1,
wins: 1,
losses: 0,
ties: 0,
meanDelta: 0.1,
meanNormalizedGain: null,
},
baseline: 'base',
candidate: 'cand',
},
Expand All @@ -476,14 +485,30 @@ describe('compare command', () => {
{
matched: [],
unmatched: { file1: 0, file2: 0 },
summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
summary: {
total: 2,
matched: 1,
wins: 0,
losses: 1,
ties: 0,
meanDelta: -0.1,
meanNormalizedGain: null,
},
baseline: 'base',
candidate: 'cand1',
},
{
matched: [],
unmatched: { file1: 0, file2: 0 },
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
summary: {
total: 2,
matched: 1,
wins: 1,
losses: 0,
ties: 0,
meanDelta: 0.1,
meanNormalizedGain: null,
},
baseline: 'base',
candidate: 'cand2',
},
Expand All @@ -500,14 +525,30 @@ describe('compare command', () => {
{
matched: [],
unmatched: { file1: 0, file2: 0 },
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.05 },
summary: {
total: 2,
matched: 1,
wins: 1,
losses: 0,
ties: 0,
meanDelta: 0.05,
meanNormalizedGain: null,
},
baseline: 'base',
candidate: 'cand1',
},
{
matched: [],
unmatched: { file1: 0, file2: 0 },
summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.2 },
summary: {
total: 2,
matched: 1,
wins: 0,
losses: 1,
ties: 0,
meanDelta: -0.2,
meanNormalizedGain: null,
},
baseline: 'cand1',
candidate: 'cand2',
},
Expand All @@ -530,7 +571,15 @@ describe('compare command', () => {
matched: [],
unmatched: { file1: 0, file2: 0 },
// delta > 0 means candidate (zeta/baseline) scored higher → alpha regressed
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.2 },
summary: {
total: 2,
matched: 1,
wins: 1,
losses: 0,
ties: 0,
meanDelta: 0.2,
meanNormalizedGain: null,
},
baseline: 'alpha',
candidate: 'zeta',
},
Expand All @@ -550,7 +599,15 @@ describe('compare command', () => {
unmatched: { file1: 0, file2: 0 },
// delta < 0 means candidate (zeta/baseline) scored lower → alpha is better
// That means alpha did NOT regress vs baseline zeta
summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
summary: {
total: 2,
matched: 1,
wins: 0,
losses: 1,
ties: 0,
meanDelta: -0.1,
meanNormalizedGain: null,
},
baseline: 'alpha',
candidate: 'zeta',
},
Expand Down Expand Up @@ -584,7 +641,15 @@ describe('compare command', () => {
{
matched: [],
unmatched: { file1: 0, file2: 0 },
summary: { total: 4, matched: 2, wins: 1, losses: 1, ties: 0, meanDelta: 0.025 },
summary: {
total: 4,
matched: 2,
wins: 1,
losses: 1,
ties: 0,
meanDelta: 0.025,
meanNormalizedGain: null,
},
baseline: 'model-a',
candidate: 'model-b',
},
Expand Down Expand Up @@ -622,7 +687,15 @@ describe('compare command', () => {
{
matched: [],
unmatched: { file1: 0, file2: 0 },
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
summary: {
total: 2,
matched: 1,
wins: 1,
losses: 0,
ties: 0,
meanDelta: 0.1,
meanNormalizedGain: null,
},
baseline: 'a',
candidate: 'b',
},
Expand All @@ -648,4 +721,86 @@ describe('compare command', () => {
expect(output).toContain('--');
});
});

describe('computeNormalizedGain', () => {
  // Number of decimal digits every approximate comparison in this suite checks.
  const DIGITS = 10;

  it('should compute gain relative to remaining headroom', () => {
    // Headroom above 0.5 is 0.5; moving to 0.75 uses 0.25 of it → g = 0.5.
    const gain = computeNormalizedGain(0.5, 0.75);

    expect(gain).toBeCloseTo(0.5, DIGITS);
  });

  it('should return 1.0 when candidate reaches perfect score', () => {
    // All of the available headroom is consumed.
    const gain = computeNormalizedGain(0.5, 1.0);

    expect(gain).toBeCloseTo(1.0, DIGITS);
  });

  it('should return negative values when candidate regresses', () => {
    // Dropping from 0.5 to 0.25 is −0.25 against 0.5 headroom → g = −0.5.
    const gain = computeNormalizedGain(0.5, 0.25);

    expect(gain).toBeCloseTo(-0.5, DIGITS);
  });

  it('should return null when baseline is perfect (no headroom)', () => {
    // The candidate score is irrelevant once the baseline is already 1.0.
    const unchanged = computeNormalizedGain(1.0, 1.0);
    const regressed = computeNormalizedGain(1.0, 0.5);

    expect(unchanged).toBeNull();
    expect(regressed).toBeNull();
  });

  it('should return 0 when scores are equal', () => {
    const gain = computeNormalizedGain(0.5, 0.5);

    expect(gain).toBeCloseTo(0, DIGITS);
  });

  it('should handle low baseline correctly', () => {
    // Headroom above 0.1 is 0.9; moving to 0.55 uses 0.45 of it → g = 0.5.
    const gain = computeNormalizedGain(0.1, 0.55);

    expect(gain).toBeCloseTo(0.5, DIGITS);
  });
});

describe('compareResults normalized gain', () => {
  it('should include normalizedGain in matched results', () => {
    const baseline = [{ testId: 'case-1', score: 0.5 }];
    const candidate = [{ testId: 'case-1', score: 0.75 }];

    const { matched } = compareResults(baseline, candidate, 0.1);

    // 0.25 gained out of 0.5 headroom.
    expect(matched[0].normalizedGain).toBeCloseTo(0.5, 10);
  });

  it('should compute meanNormalizedGain in summary', () => {
    const baseline = [
      { testId: 'case-1', score: 0.5 },
      { testId: 'case-2', score: 0.8 },
    ];
    const candidate = [
      { testId: 'case-1', score: 0.75 }, // g = 0.25 / 0.5 = 0.5
      { testId: 'case-2', score: 0.9 }, // g = 0.1 / 0.2 = 0.5
    ];

    const { summary } = compareResults(baseline, candidate, 0.1);

    // Both cases have g = 0.5, so their mean is 0.5 as well.
    expect(summary.meanNormalizedGain).toBeCloseTo(0.5, 10);
  });

  it('should set normalizedGain to null when baseline is 1.0', () => {
    const baseline = [{ testId: 'case-1', score: 1.0 }];
    const candidate = [{ testId: 'case-1', score: 1.0 }];

    const { matched, summary } = compareResults(baseline, candidate, 0.1);

    // The only matched case has no headroom, so neither value is defined.
    expect(matched[0].normalizedGain).toBeNull();
    expect(summary.meanNormalizedGain).toBeNull();
  });

  it('should exclude null gains from mean computation', () => {
    const baseline = [
      { testId: 'case-1', score: 0.5 },
      { testId: 'case-2', score: 1.0 }, // perfect baseline → gain is null
    ];
    const candidate = [
      { testId: 'case-1', score: 0.75 }, // g = 0.5
      { testId: 'case-2', score: 1.0 },
    ];

    const { summary } = compareResults(baseline, candidate, 0.1);

    // case-2 is dropped from the average, leaving only case-1 (g = 0.5).
    expect(summary.meanNormalizedGain).toBeCloseTo(0.5, 10);
  });
});
});
Loading
Loading