diff --git a/.bench/baseline.json b/.bench/baseline.json index fc2cc46..a196152 100644 --- a/.bench/baseline.json +++ b/.bench/baseline.json @@ -1,5 +1,5 @@ { - "capturedAt": "2026-05-06T00:58:34.994Z", + "capturedAt": "2026-05-06T01:12:11.470Z", "node": "v22.13.0", "platform": "darwin-arm64", "options": { @@ -13,7 +13,7 @@ "fixture": "tiny", "fileCount": 5, "approxTokens": 790, - "durationMs": 2, + "durationMs": 1, "llmCalls": 0, "llmTotalMs": 0, "llmTotalPromptTokens": 0 @@ -22,54 +22,54 @@ "fixture": "medium", "fileCount": 25, "approxTokens": 36150, - "durationMs": 29267, - "llmCalls": 19, - "llmTotalMs": 109679, - "llmTotalPromptTokens": 36895 + "durationMs": 6906, + "llmCalls": 6, + "llmTotalMs": 25221, + "llmTotalPromptTokens": 8525 }, { "fixture": "large", "fileCount": 50, "approxTokens": 83410, - "durationMs": 59992, - "llmCalls": 30, - "llmTotalMs": 228089, - "llmTotalPromptTokens": 74609 + "durationMs": 9749, + "llmCalls": 6, + "llmTotalMs": 42401, + "llmTotalPromptTokens": 16602 }, { "fixture": "feature-add", "fileCount": 14, "approxTokens": 17600, - "durationMs": 19591, - "llmCalls": 11, - "llmTotalMs": 59354, - "llmTotalPromptTokens": 20707 + "durationMs": 5640, + "llmCalls": 4, + "llmTotalMs": 18854, + "llmTotalPromptTokens": 6117 }, { "fixture": "refactor", "fileCount": 30, "approxTokens": 32650, - "durationMs": 41340, + "durationMs": 41347, "llmCalls": 20, - "llmTotalMs": 143983, + "llmTotalMs": 143990, "llmTotalPromptTokens": 53548 }, { "fixture": "initial-commit", "fileCount": 50, "approxTokens": 83410, - "durationMs": 60034, - "llmCalls": 30, - "llmTotalMs": 229291, - "llmTotalPromptTokens": 74948 + "durationMs": 9818, + "llmCalls": 6, + "llmTotalMs": 42557, + "llmTotalPromptTokens": 16306 }, { "fixture": "docs-update", "fileCount": 9, "approxTokens": 15050, - "durationMs": 18563, + "durationMs": 18564, "llmCalls": 7, - "llmTotalMs": 52225, + "llmTotalMs": 52222, "llmTotalPromptTokens": 13139 }, { diff --git 
a/src/lib/parsers/default/utils/summarizeLargeFiles.ts b/src/lib/parsers/default/utils/summarizeLargeFiles.ts index 8563443..b003409 100644 --- a/src/lib/parsers/default/utils/summarizeLargeFiles.ts +++ b/src/lib/parsers/default/utils/summarizeLargeFiles.ts @@ -2,6 +2,7 @@ import { FileDiff, DiffNode } from '../../../types' import { SummarizeContext, summarize } from '../../../langchain/chains/summarize' import { TokenCounter } from '../../../utils/tokenizer' import { Logger } from '../../../utils/logger' +import { summarizeTrivialDiff } from './trivialDiff' export type SummarizeLargeFilesOptions = { /** @@ -22,6 +23,14 @@ export type SummarizeLargeFilesOptions = { /** * Summarize a single file diff that exceeds the token threshold. + * + * Trivial-shape short-circuit (#845, PR 2): pure additions / deletions + * / renames / binary changes have no information content beyond the + * diff's shape, so we templated-summarize them instead of paying for + * an LLM call. On initial-commit fixtures (lots of pure adds) this + * collapses the per-file summary phase entirely; the resulting tiny + * synthetic summaries usually drop the directory token totals under + * budget so wave consolidation skips too. 
*/ async function summarizeFileDiff( fileDiff: FileDiff, @@ -33,6 +42,19 @@ async function summarizeFileDiff( metadata, }: Pick ): Promise { + const trivialSummary = summarizeTrivialDiff(fileDiff) + if (trivialSummary !== undefined) { + logger.verbose( + ` - ${fileDiff.file}: trivial-shape skip (no LLM call)`, + { color: 'gray' } + ) + return { + ...fileDiff, + diff: trivialSummary, + tokenCount: tokenizer(trivialSummary), + } + } + try { const fileSummary = await summarize( [ diff --git a/src/lib/parsers/default/utils/trivialDiff.test.ts b/src/lib/parsers/default/utils/trivialDiff.test.ts new file mode 100644 index 0000000..902223c --- /dev/null +++ b/src/lib/parsers/default/utils/trivialDiff.test.ts @@ -0,0 +1,145 @@ +import { FileDiff } from '../../../types' +import { + detectTrivialDiffShape, + summarizeTrivialDiff, +} from './trivialDiff' + +const additionDiff = `diff --git a/foo.ts b/foo.ts +new file mode 100644 +index 0000000..1234567 +--- /dev/null ++++ b/foo.ts +@@ -0,0 +1,3 @@ ++export const foo = 1 ++export const bar = 2 ++export const baz = 3 +` + +const deletionDiff = `diff --git a/legacy.ts b/legacy.ts +deleted file mode 100644 +index 1234567..0000000 +--- a/legacy.ts ++++ /dev/null +@@ -1,3 +0,0 @@ +-export const legacy = 1 +-export const old = 2 +-export const stale = 3 +` + +const renameDiff = `diff --git a/old/path.ts b/new/path.ts +similarity index 100% +rename from old/path.ts +rename to new/path.ts +` + +const binaryDiff = `diff --git a/assets/logo.png b/assets/logo.png +Binary files a/assets/logo.png and b/assets/logo.png differ +` + +const modificationDiff = `diff --git a/src/foo.ts b/src/foo.ts +index 1234567..89abcde 100644 +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -1,5 +1,7 @@ + const foo = 1 +-const bar = 2 ++const bar = 22 ++const baz = 3 + const quux = 4 +` + +const renameWithEditDiff = `diff --git a/old/path.ts b/new/path.ts +similarity index 87% +rename from old/path.ts +rename to new/path.ts +@@ -1,3 +1,4 @@ + const foo = 1 +-const bar 
= 2 ++const bar = 22 ++const baz = 3 +` + +describe('detectTrivialDiffShape', () => { + it('detects pure additions', () => { + expect(detectTrivialDiffShape(additionDiff)).toBe('addition') + }) + + it('detects pure deletions', () => { + expect(detectTrivialDiffShape(deletionDiff)).toBe('deletion') + }) + + it('detects pure renames (no body)', () => { + expect(detectTrivialDiffShape(renameDiff)).toBe('rename') + }) + + it('detects binary file changes', () => { + expect(detectTrivialDiffShape(binaryDiff)).toBe('binary') + }) + + it('returns undefined for modifications (mixed +/-)', () => { + expect(detectTrivialDiffShape(modificationDiff)).toBeUndefined() + }) + + it('returns undefined for renames that also include edits (rename + body)', () => { + expect(detectTrivialDiffShape(renameWithEditDiff)).toBeUndefined() + }) + + it('returns undefined for empty input', () => { + expect(detectTrivialDiffShape('')).toBeUndefined() + }) + + it('ignores +++ / --- header markers when classifying', () => { + // The `+++ b/file` and `--- a/file` headers shouldn't fool the + // counter — they're metadata, not content. 
+ expect(detectTrivialDiffShape(additionDiff)).toBe('addition') + expect(detectTrivialDiffShape(deletionDiff)).toBe('deletion') + }) +}) + +describe('summarizeTrivialDiff', () => { + function makeDiff(file: string, diff: string): FileDiff { + return { file, diff, summary: '', tokenCount: 100 } + } + + it('templated summary for pure addition includes line count', () => { + expect(summarizeTrivialDiff(makeDiff('foo.ts', additionDiff))) + .toBe('Added `foo.ts` (3 lines).') + }) + + it('templated summary for pure deletion includes line count', () => { + expect(summarizeTrivialDiff(makeDiff('legacy.ts', deletionDiff))) + .toBe('Removed `legacy.ts` (3 lines).') + }) + + it('singular line wording when count is 1', () => { + const oneLine = `diff --git a/foo b/foo +new file mode 100644 +--- /dev/null ++++ b/foo +@@ -0,0 +1,1 @@ ++only one line +` + expect(summarizeTrivialDiff(makeDiff('foo', oneLine))) + .toBe('Added `foo` (1 line).') + }) + + it('rename summary names both old and new path', () => { + expect(summarizeTrivialDiff(makeDiff('new/path.ts', renameDiff))) + .toBe('Renamed `old/path.ts` → `new/path.ts`.') + }) + + it('binary summary is shape-only (no line count)', () => { + expect(summarizeTrivialDiff(makeDiff('assets/logo.png', binaryDiff))) + .toBe('Updated binary file `assets/logo.png`.') + }) + + it('returns undefined for modifications so the LLM path stays in charge', () => { + expect(summarizeTrivialDiff(makeDiff('src/foo.ts', modificationDiff))) + .toBeUndefined() + }) + + it('returns undefined for renames-with-edit', () => { + expect(summarizeTrivialDiff(makeDiff('new/path.ts', renameWithEditDiff))) + .toBeUndefined() + }) +}) diff --git a/src/lib/parsers/default/utils/trivialDiff.ts b/src/lib/parsers/default/utils/trivialDiff.ts new file mode 100644 index 0000000..c62a42a --- /dev/null +++ b/src/lib/parsers/default/utils/trivialDiff.ts @@ -0,0 +1,143 @@ +import { FileDiff } from '../../../types' + +/** + * Diff-shape detection + deterministic 
/**
 * Diff-shape detection + deterministic summarization for "trivial"
 * diffs (#845). A trivial diff is one whose meaning is fully captured
 * by its shape — pure additions, pure deletions, renames with no
 * content change, and binary file changes — so an LLM summary adds
 * nothing the templated string can't already convey.
 *
 * Used by the pre-process pass in `summarizeLargeFiles` to skip the
 * LLM call for trivial files entirely.
 *
 * Defensive about input shape: anything that does not clearly match
 * one of the trivial shapes yields `undefined`, so the caller falls
 * back to the existing LLM path.
 */

/** The diff shapes that can be summarized from a template alone. */
export type TrivialDiffShape = 'addition' | 'deletion' | 'rename' | 'binary'

// Hoisted so the patterns are compiled once rather than per file. None
// of them carries the `g` flag, so sharing them across calls is safe.
const BINARY_MARKER = /^Binary files .+ and .+ differ$/m
const RENAME_FROM_MARKER = /^rename from /m
const RENAME_TO_MARKER = /^rename to /m
const RENAME_FROM_PATH = /^rename from (.+)$/m
// `@@ -start[,count] +start[,count] @@`; a missing count means 1.
const HUNK_HEADER = /^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@/

// Prefixes of git's per-file metadata lines. Only consulted OUTSIDE a
// hunk: inside one, a line such as `--- old comment` is a removed line
// whose own text starts with `-- `, not a file header.
const FILE_HEADER_PREFIXES = [
  'diff --git',
  'index ',
  '--- ',
  '+++ ',
  'new file mode',
  'deleted file mode',
  'similarity index',
  'rename from ',
  'rename to ',
  'Binary files ',
]

type ContentTally = { added: number; removed: number }

type TrivialClassification = { shape: TrivialDiffShape; lineCount: number }

/** True when `line` looks like per-file diff metadata rather than content. */
function isFileHeaderLine(line: string): boolean {
  return FILE_HEADER_PREFIXES.some((prefix) => line.startsWith(prefix))
}

/**
 * Count added (`+`) and removed (`-`) content lines in a single pass.
 *
 * While inside a hunk whose size was announced by a well-formed `@@`
 * header, every `+`/`-` line counts as content no matter what follows
 * the marker. Outside a hunk (before the first header, after a hunk's
 * announced lines are used up, or when the producer emitted no usable
 * hunk header) metadata-looking lines are skipped by prefix.
 */
function tallyContentLines(diff: string): ContentTally {
  let added = 0
  let removed = 0
  // Lines the current hunk still owes on the old / new side.
  let oldRemaining = 0
  let newRemaining = 0

  for (const line of diff.split('\n')) {
    const insideHunk = oldRemaining > 0 || newRemaining > 0

    if (!insideHunk) {
      const hunk = HUNK_HEADER.exec(line)
      if (hunk) {
        oldRemaining = hunk[1] === undefined ? 1 : Number(hunk[1])
        newRemaining = hunk[2] === undefined ? 1 : Number(hunk[2])
        continue
      }
      // Unparseable `@@` lines and file-level metadata carry no content.
      if (line.startsWith('@@') || isFileHeaderLine(line)) continue
    }

    if (line.startsWith('+')) {
      added++
      if (insideHunk) newRemaining = Math.max(0, newRemaining - 1)
    } else if (line.startsWith('-')) {
      removed++
      if (insideHunk) oldRemaining = Math.max(0, oldRemaining - 1)
    } else if (insideHunk && !line.startsWith('\\')) {
      // Context line: consumes one line on both sides. Lines starting
      // with a backslash ("\ No newline at end of file") consume none.
      oldRemaining = Math.max(0, oldRemaining - 1)
      newRemaining = Math.max(0, newRemaining - 1)
    }
  }

  return { added, removed }
}

/**
 * Shared classifier behind both exported helpers, so the diff is
 * scanned once and the shape and line count can never disagree.
 */
function classifyDiff(diff: string): TrivialClassification | undefined {
  if (!diff) return undefined

  // Binary marker is unambiguous and short-circuits early.
  if (BINARY_MARKER.test(diff)) {
    return { shape: 'binary', lineCount: 0 }
  }

  const { added, removed } = tallyContentLines(diff)

  // Both rename markers are required. A rename that also carries
  // content changes is not trivial and must go to the LLM; it must not
  // fall through to the +/- rules below, or a rename with only added
  // lines would be reported as a plain addition.
  if (RENAME_FROM_MARKER.test(diff) && RENAME_TO_MARKER.test(diff)) {
    return added === 0 && removed === 0
      ? { shape: 'rename', lineCount: 0 }
      : undefined
  }

  // Context lines are deliberately ignored for shape classification.
  if (added > 0 && removed === 0) return { shape: 'addition', lineCount: added }
  if (removed > 0 && added === 0) return { shape: 'deletion', lineCount: removed }
  return undefined
}

/** Render a line count as `1 line` / `N lines`. */
function formatLineCount(count: number): string {
  return `${count} line${count === 1 ? '' : 's'}`
}

/**
 * Inspect a unified-diff string and report its shape, or undefined
 * if the diff isn't trivial (mixed +/- lines, weird headers, etc.).
 *
 * Detection rules:
 *
 *   - `Binary files ... differ` header            → 'binary'
 *   - `rename from` + `rename to` headers and no
 *     `+`/`-` content lines                       → 'rename'
 *   - rename headers WITH `+`/`-` content lines   → undefined
 *   - only `+` content lines (at least one)       → 'addition'
 *   - only `-` content lines (at least one)       → 'deletion'
 *   - otherwise                                   → undefined
 *
 * NOTE(review): a diff that only adds lines to an already existing
 * file (context lines present) is still classified 'addition', and
 * the summary then reads "Added `file`". Confirm that wording is
 * acceptable downstream.
 *
 * @param diff Unified-diff text for a single file.
 * @returns The trivial shape, or undefined to let the LLM handle it.
 */
export function detectTrivialDiffShape(diff: string): TrivialDiffShape | undefined {
  const classified = classifyDiff(diff)
  return classified ? classified.shape : undefined
}

/**
 * Build a deterministic summary string for a trivial diff.
 *
 * @param fileDiff File entry; only `file` and `diff` are read.
 * @returns The templated summary, or undefined when the diff is not
 *   trivial (caller should fall back to the LLM path).
 */
export function summarizeTrivialDiff(fileDiff: FileDiff): string | undefined {
  const classified = classifyDiff(fileDiff.diff)
  if (!classified) return undefined

  const target = `\`${fileDiff.file}\``
  switch (classified.shape) {
    case 'addition':
      return `Added ${target} (${formatLineCount(classified.lineCount)}).`
    case 'deletion':
      return `Removed ${target} (${formatLineCount(classified.lineCount)}).`
    case 'rename': {
      const match = RENAME_FROM_PATH.exec(fileDiff.diff)
      const oldPath = match ? match[1].trim() : undefined
      // Fall back to a one-sided message if the old path is unusable.
      return oldPath
        ? `Renamed \`${oldPath}\` → ${target}.`
        : `Renamed file to ${target}.`
    }
    case 'binary':
      return `Updated binary file ${target}.`
  }
}