From 477c1992fee3886e18a9eff1998b6f9db39b9e55 Mon Sep 17 00:00:00 2001
From: Griffen Fargo <3642037+gfargo@users.noreply.github.com>
Date: Tue, 5 May 2026 21:01:03 -0400
Subject: [PATCH] feat(parser): raise default token budget from 2048 to 4096
 (#845)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Match the canonical service tokenLimit shipped in `langchain/utils.ts`
for openai / anthropic / ollama (all 4096). The 2048 fallback was a
holdover from when 4k context was a stretch for fast models; today
every shipped service already overrides it to 4096, so the fallback
only fires for users whose custom service definition omits
`tokenLimit`. Without this raise, those users hit a needlessly tight
budget that triggers extra pre-summarization on diffs the model could
absorb whole.

Two call sites updated:

- `summarizeDiffs.ts:250` default param
- `parsers/default/index.ts:55` `||` fallback

Bench (the `bin/benchmark.ts` default is also bumped to 4096 so
per-PR diffs reflect the most-common production budget):

| fixture        | calls before | calls after |    Δ calls |
|----------------|-------------:|------------:|-----------:|
| tiny           |            0 |           0 |          0 |
| medium         |           20 |          19 |         -1 |
| large          |           41 |          30 | -11 (-27%) |
| feature-add    |           11 |          11 |          0 |
| refactor       |           28 |          20 |  -8 (-29%) |
| initial-commit |           41 |          30 | -11 (-27%) |
| docs-update    |            8 |           7 |         -1 |
| dep-bump       |            0 |           0 |          0 |

Heavy fixtures (large, initial-commit, refactor) see a real 27-29%
reduction in LLM call count, which is a direct API cost reduction.
Wall clock for `large` / `initial-commit` improved by 12 s (17%); the
`refactor` wall clock went up slightly because fewer-but-larger calls
serialize a bit (each call still pays the latency model's per-call
base cost, and larger calls pack less evenly into concurrent waves),
trading a small wall-clock cost for lower API spend. Net: a clear win
on the cost dimension, and one that scales with diff size.
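The wall-clock tradeoff is easiest to see with a back-of-envelope
using the bench knobs (baseLatencyMs = 1500, perTokenMs = 2,
maxConcurrent = 6). The per-call formula, the even token split, and
the greedy scheduler below are simplifying assumptions for
illustration, not the simulator's actual code:

```ts
// Assumed latency model: fixed base cost plus a per-prompt-token cost.
const perCallMs = (promptTokens: number) => 1500 + 2 * promptTokens

// refactor fixture, idealized as evenly sized calls:
// before: 28 calls over ~52430 prompt tokens; after: 20 calls over ~53548.
const before = Array.from({ length: 28 }, () => perCallMs(52430 / 28))
const after = Array.from({ length: 20 }, () => perCallMs(53548 / 20))

// Total simulated API time: the quantity the provider bills for.
const total = (calls: number[]) => calls.reduce((sum, ms) => sum + ms, 0)

// Greedy schedule onto 6 concurrent slots; wall clock = most-loaded slot.
const wallClock = (calls: number[], slots = 6): number => {
  const load: number[] = new Array(slots).fill(0)
  for (const ms of [...calls].sort((a, b) => b - a)) {
    load[load.indexOf(Math.min(...load))] += ms
  }
  return Math.max(...load)
}

console.log(total(before), wallClock(before)) // 146860 total, 26225 wall
console.log(total(after), wallClock(after))   // ≈137096 total, ≈27419 wall
```

The toy model reproduces the direction of the tradeoff (total API
time down, wall clock slightly up for `refactor`), not the magnitude;
the real bench also pays re-summarization waves and uneven call sizes
that this sketch ignores.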
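As for the fallback path itself, a minimal sketch of the only case it
still covers. The `ServiceConfig` shape and field values below are
illustrative, not the project's real config types; the `||` line
mirrors the one at `parsers/default/index.ts:55`:

```ts
// Illustrative only: a custom service definition that omits
// `tokenLimit`, so `maxTokens` reaches the parser as `undefined`.
type ServiceConfig = { provider: string; model: string; tokenLimit?: number }

const customService: ServiceConfig = {
  provider: 'openai-compatible',
  model: 'my-finetune',
  // tokenLimit omitted: the only case the fallback still fires for
}

// The `||` fallback now resolves to 4096 instead of 2048:
const budget = customService.tokenLimit || 4096
console.log(budget) // 4096, same as the shipped openai/anthropic/ollama configs
```

Note that `||` would also coerce an explicit `tokenLimit: 0` to the
default, where `??` would not; a zero token budget is meaningless
here, so the looser operator is fine.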
---
 .bench/baseline.json                          | 52 +++++++++----------
 bin/benchmark.ts                              |  6 ++-
 src/lib/parsers/default/index.ts              |  9 +++-
 .../parsers/default/utils/summarizeDiffs.ts   | 10 +++-
 4 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/.bench/baseline.json b/.bench/baseline.json
index 74fcbbe..fc2cc46 100644
--- a/.bench/baseline.json
+++ b/.bench/baseline.json
@@ -1,19 +1,19 @@
 {
-  "capturedAt": "2026-05-06T00:45:56.229Z",
+  "capturedAt": "2026-05-06T00:58:34.994Z",
   "node": "v22.13.0",
   "platform": "darwin-arm64",
   "options": {
     "baseLatencyMs": 1500,
     "perTokenMs": 2,
     "maxConcurrent": 6,
-    "maxTokens": 2048
+    "maxTokens": 4096
   },
   "results": [
     {
       "fixture": "tiny",
       "fileCount": 5,
       "approxTokens": 790,
-      "durationMs": 1,
+      "durationMs": 2,
       "llmCalls": 0,
       "llmTotalMs": 0,
       "llmTotalPromptTokens": 0
@@ -22,55 +22,55 @@
       "fixture": "medium",
       "fileCount": 25,
       "approxTokens": 36150,
-      "durationMs": 31137,
-      "llmCalls": 20,
-      "llmTotalMs": 106348,
-      "llmTotalPromptTokens": 34237
+      "durationMs": 29267,
+      "llmCalls": 19,
+      "llmTotalMs": 109679,
+      "llmTotalPromptTokens": 36895
     },
     {
       "fixture": "large",
       "fileCount": 50,
       "approxTokens": 83410,
-      "durationMs": 72093,
-      "llmCalls": 41,
-      "llmTotalMs": 244101,
-      "llmTotalPromptTokens": 74197
+      "durationMs": 59992,
+      "llmCalls": 30,
+      "llmTotalMs": 228089,
+      "llmTotalPromptTokens": 74609
     },
     {
       "fixture": "feature-add",
       "fileCount": 14,
       "approxTokens": 17600,
-      "durationMs": 15967,
+      "durationMs": 19591,
       "llmCalls": 11,
-      "llmTotalMs": 54727,
-      "llmTotalPromptTokens": 18937
+      "llmTotalMs": 59354,
+      "llmTotalPromptTokens": 20707
     },
     {
       "fixture": "refactor",
       "fileCount": 30,
       "approxTokens": 32650,
-      "durationMs": 33999,
-      "llmCalls": 28,
-      "llmTotalMs": 153888,
-      "llmTotalPromptTokens": 52430
+      "durationMs": 41340,
+      "llmCalls": 20,
+      "llmTotalMs": 143983,
+      "llmTotalPromptTokens": 53548
     },
     {
       "fixture": "initial-commit",
       "fileCount": 50,
       "approxTokens": 83410,
-      "durationMs": 72285,
-      "llmCalls": 41,
-      "llmTotalMs": 245148,
-      "llmTotalPromptTokens": 74546
+      "durationMs": 60034,
+      "llmCalls": 30,
+      "llmTotalMs": 229291,
+      "llmTotalPromptTokens": 74948
     },
     {
       "fixture": "docs-update",
       "fileCount": 9,
       "approxTokens": 15050,
-      "durationMs": 18570,
-      "llmCalls": 8,
-      "llmTotalMs": 56293,
-      "llmTotalPromptTokens": 13908
+      "durationMs": 18563,
+      "llmCalls": 7,
+      "llmTotalMs": 52225,
+      "llmTotalPromptTokens": 13139
     },
     {
       "fixture": "dep-bump",
diff --git a/bin/benchmark.ts b/bin/benchmark.ts
index 81a1a80..7bdbb57 100644
--- a/bin/benchmark.ts
+++ b/bin/benchmark.ts
@@ -71,7 +71,11 @@ const DEFAULT_OPTIONS: BenchOptions = {
   baseLatencyMs: 1500,
   perTokenMs: 2,
   maxConcurrent: 6,
-  maxTokens: 2048,
+  // Match the canonical service tokenLimit from `langchain/utils.ts`
+  // (raised from 2048 to 4096 in PR 1 of #845). The bench mirrors
+  // the most-common production budget so per-PR diffs reflect what
+  // real users will see.
+  maxTokens: 4096,
 }

 type BenchResult = {
diff --git a/src/lib/parsers/default/index.ts b/src/lib/parsers/default/index.ts
index 667d8e8..025bfbb 100644
--- a/src/lib/parsers/default/index.ts
+++ b/src/lib/parsers/default/index.ts
@@ -49,10 +49,17 @@ export async function fileChangeParser({
   // 1. Pre-process large files to prevent bias
   // 2. Group by directory and assess token count
   // 3. Wave-based parallel summarization until under budget
+  //
+  // The 4096 fallback (#845) matches the default service configs
+  // for openai / anthropic / ollama (`langchain/utils.ts`). It's a
+  // safety net for users with custom service definitions that omit
+  // `tokenLimit` — without it those users hit a degenerate 2048
+  // budget that triggers needless pre-summarization on diffs the
+  // model could absorb whole.
   logger.startTimer()
   const summary = await summarizeDiffs(diffs, {
     tokenizer,
-    maxTokens: maxTokens || 2048,
+    maxTokens: maxTokens || 4096,
     minTokensForSummary,
     maxFileTokens,
     maxConcurrent,
diff --git a/src/lib/parsers/default/utils/summarizeDiffs.ts b/src/lib/parsers/default/utils/summarizeDiffs.ts
index 3375e73..dddc604 100644
--- a/src/lib/parsers/default/utils/summarizeDiffs.ts
+++ b/src/lib/parsers/default/utils/summarizeDiffs.ts
@@ -247,7 +247,15 @@ export async function summarizeDiffs(
   {
     tokenizer,
     logger,
-    maxTokens = 2048,
+    // Default raised to 4096 (#845) so the budget matches the
+    // canonical service configs in `langchain/utils.ts`. The
+    // previous 2048 default came from an earlier era when 4k
+    // context was a stretch for fast models; today every shipped
+    // service overrides it to 4096 anyway. Keeping this in sync
+    // with the service defaults means a caller that omits
+    // `maxTokens` doesn't accidentally fall into a tighter budget
+    // than the rest of the system assumes.
+    maxTokens = 4096,
     minTokensForSummary = 400,
     maxFileTokens,
     maxConcurrent = 6,