From 477c1992fee3886e18a9eff1998b6f9db39b9e55 Mon Sep 17 00:00:00 2001
From: Griffen Fargo <3642037+gfargo@users.noreply.github.com>
Date: Tue, 5 May 2026 21:01:03 -0400
Subject: [PATCH] feat(parser): raise default token budget from 2048 to 4096
 (#845)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Match the canonical service tokenLimit shipped in `langchain/utils.ts`
for openai / anthropic / ollama (all 4096). The 2048 fallback was a
holdover from when 4k context was a stretch for fast models; today
every shipped service already overrides it to 4096, so the fallback
only fires for users whose custom service definition omits
`tokenLimit`. Without this raise, those users hit a needlessly tight
budget that triggers extra pre-summarization on diffs the model could
absorb whole.

Two call sites updated:

- `summarizeDiffs.ts:250` default param
- `parsers/default/index.ts:55` `||` fallback

Bench (the `bin/benchmark.ts` default is also bumped to 4096 so
per-PR diffs reflect the most-common production budget):

| fixture        | calls before | calls after |    Δ calls |
|----------------|-------------:|------------:|-----------:|
| tiny           |            0 |           0 |          0 |
| medium         |           20 |          19 |         -1 |
| large          |           41 |          30 | -11 (-27%) |
| feature-add    |           11 |          11 |          0 |
| refactor       |           28 |          20 |  -8 (-29%) |
| initial-commit |           41 |          30 | -11 (-27%) |
| docs-update    |            8 |           7 |         -1 |
| dep-bump       |            0 |           0 |          0 |

Heavy fixtures (large, initial-commit, refactor) see a real 27-29%
reduction in LLM call count, which is a direct API cost reduction.
Wall clock for `large` / `initial-commit` improved by 12 s (17%); the
`refactor` wall clock went up slightly because fewer-but-larger calls
serialize a bit (each call still pays the latency model's per-call
base cost, and larger calls pack less evenly into concurrent waves),
trading a small wall-clock cost for lower API spend. Net: a clear win
on the cost dimension, and one that scales with diff size.
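The wall-clock tradeoff is easiest to see with a back-of-envelope
using the bench knobs (baseLatencyMs = 1500, perTokenMs = 2,
maxConcurrent = 6). The per-call formula, the even token split, and
the greedy scheduler below are simplifying assumptions for
illustration, not the simulator's actual code:

```ts
// Assumed latency model: fixed base cost plus a per-prompt-token cost.
const perCallMs = (promptTokens: number) => 1500 + 2 * promptTokens

// refactor fixture, idealized as evenly sized calls:
// before: 28 calls over ~52430 prompt tokens; after: 20 calls over ~53548.
const before = Array.from({ length: 28 }, () => perCallMs(52430 / 28))
const after = Array.from({ length: 20 }, () => perCallMs(53548 / 20))

// Total simulated API time: the quantity the provider bills for.
const total = (calls: number[]) => calls.reduce((sum, ms) => sum + ms, 0)

// Greedy schedule onto 6 concurrent slots; wall clock = most-loaded slot.
const wallClock = (calls: number[], slots = 6): number => {
  const load: number[] = new Array(slots).fill(0)
  for (const ms of [...calls].sort((a, b) => b - a)) {
    load[load.indexOf(Math.min(...load))] += ms
  }
  return Math.max(...load)
}

console.log(total(before), wallClock(before)) // 146860 total, 26225 wall
console.log(total(after), wallClock(after))   // ≈137096 total, ≈27419 wall
```

The toy model reproduces the direction of the tradeoff (total API
time down, wall clock slightly up for `refactor`), not the magnitude;
the real bench also pays re-summarization waves and uneven call sizes
that this sketch ignores.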
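As for the fallback path itself, a minimal sketch of the only case it
still covers. The `ServiceConfig` shape and field values below are
illustrative, not the project's real config types; the `||` line
mirrors the one at `parsers/default/index.ts:55`:

```ts
// Illustrative only: a custom service definition that omits
// `tokenLimit`, so `maxTokens` reaches the parser as `undefined`.
type ServiceConfig = { provider: string; model: string; tokenLimit?: number }

const customService: ServiceConfig = {
  provider: 'openai-compatible',
  model: 'my-finetune',
  // tokenLimit omitted: the only case the fallback still fires for
}

// The `||` fallback now resolves to 4096 instead of 2048:
const budget = customService.tokenLimit || 4096
console.log(budget) // 4096, same as the shipped openai/anthropic/ollama configs
```

Note that `||` would also coerce an explicit `tokenLimit: 0` to the
default, where `??` would not; a zero token budget is meaningless
here, so the looser operator is fine.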
---
 .bench/baseline.json                          | 52 +++++++++----------
 bin/benchmark.ts                              |  6 ++-
 src/lib/parsers/default/index.ts              |  9 +++-
 .../parsers/default/utils/summarizeDiffs.ts   | 10 +++-
 4 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/.bench/baseline.json b/.bench/baseline.json
index 74fcbbe..fc2cc46 100644
--- a/.bench/baseline.json
+++ b/.bench/baseline.json
@@ -1,19 +1,19 @@
 {
-  "capturedAt": "2026-05-06T00:45:56.229Z",
+  "capturedAt": "2026-05-06T00:58:34.994Z",
   "node": "v22.13.0",
   "platform": "darwin-arm64",
   "options": {
     "baseLatencyMs": 1500,
     "perTokenMs": 2,
     "maxConcurrent": 6,
-    "maxTokens": 2048
+    "maxTokens": 4096
   },
   "results": [
     {
       "fixture": "tiny",
       "fileCount": 5,
       "approxTokens": 790,
-      "durationMs": 1,
+      "durationMs": 2,
       "llmCalls": 0,
       "llmTotalMs": 0,
       "llmTotalPromptTokens": 0
@@ -22,55 +22,55 @@
       "fixture": "medium",
       "fileCount": 25,
       "approxTokens": 36150,
-      "durationMs": 31137,
-      "llmCalls": 20,
-      "llmTotalMs": 106348,
-      "llmTotalPromptTokens": 34237
+      "durationMs": 29267,
+      "llmCalls": 19,
+      "llmTotalMs": 109679,
+      "llmTotalPromptTokens": 36895
     },
     {
       "fixture": "large",
       "fileCount": 50,
       "approxTokens": 83410,
-      "durationMs": 72093,
-      "llmCalls": 41,
-      "llmTotalMs": 244101,
-      "llmTotalPromptTokens": 74197
+      "durationMs": 59992,
+      "llmCalls": 30,
+      "llmTotalMs": 228089,
+      "llmTotalPromptTokens": 74609
     },
     {
       "fixture": "feature-add",
       "fileCount": 14,
       "approxTokens": 17600,
-      "durationMs": 15967,
+      "durationMs": 19591,
       "llmCalls": 11,
-      "llmTotalMs": 54727,
-      "llmTotalPromptTokens": 18937
+      "llmTotalMs": 59354,
+      "llmTotalPromptTokens": 20707
     },
     {
       "fixture": "refactor",
       "fileCount": 30,
       "approxTokens": 32650,
-      "durationMs": 33999,
-      "llmCalls": 28,
-      "llmTotalMs": 153888,
-      "llmTotalPromptTokens": 52430
+      "durationMs": 41340,
+      "llmCalls": 20,
+      "llmTotalMs": 143983,
+      "llmTotalPromptTokens": 53548
     },
     {
       "fixture": "initial-commit",
       "fileCount": 50,
       "approxTokens": 83410,
-      "durationMs": 72285,
-      "llmCalls": 41,
-      "llmTotalMs": 245148,
-      "llmTotalPromptTokens": 74546
+      "durationMs": 60034,
+      "llmCalls": 30,
+      "llmTotalMs": 229291,
+      "llmTotalPromptTokens": 74948
     },
     {
       "fixture": "docs-update",
       "fileCount": 9,
       "approxTokens": 15050,
-      "durationMs": 18570,
-      "llmCalls": 8,
-      "llmTotalMs": 56293,
-      "llmTotalPromptTokens": 13908
+      "durationMs": 18563,
+      "llmCalls": 7,
+      "llmTotalMs": 52225,
+      "llmTotalPromptTokens": 13139
     },
     {
       "fixture": "dep-bump",
diff --git a/bin/benchmark.ts b/bin/benchmark.ts
index 81a1a80..7bdbb57 100644
--- a/bin/benchmark.ts
+++ b/bin/benchmark.ts
@@ -71,7 +71,11 @@ const DEFAULT_OPTIONS: BenchOptions = {
   baseLatencyMs: 1500,
   perTokenMs: 2,
   maxConcurrent: 6,
-  maxTokens: 2048,
+  // Match the canonical service tokenLimit from `langchain/utils.ts`
+  // (raised from 2048 to 4096 in PR 1 of #845). The bench mirrors
+  // the most-common production budget so per-PR diffs reflect what
+  // real users will see.
+  maxTokens: 4096,
 }

 type BenchResult = {
diff --git a/src/lib/parsers/default/index.ts b/src/lib/parsers/default/index.ts
index 667d8e8..025bfbb 100644
--- a/src/lib/parsers/default/index.ts
+++ b/src/lib/parsers/default/index.ts
@@ -49,10 +49,17 @@ export async function fileChangeParser({
   // 1. Pre-process large files to prevent bias
   // 2. Group by directory and assess token count
   // 3. Wave-based parallel summarization until under budget
+  //
+  // The 4096 fallback (#845) matches the default service configs
+  // for openai / anthropic / ollama (`langchain/utils.ts`). It's a
+  // safety net for users with custom service definitions that omit
+  // `tokenLimit` — without it those users hit a degenerate 2048
+  // budget that triggers needless pre-summarization on diffs the
+  // model could absorb whole.
   logger.startTimer()
   const summary = await summarizeDiffs(diffs, {
     tokenizer,
-    maxTokens: maxTokens || 2048,
+    maxTokens: maxTokens || 4096,
     minTokensForSummary,
     maxFileTokens,
     maxConcurrent,
diff --git a/src/lib/parsers/default/utils/summarizeDiffs.ts b/src/lib/parsers/default/utils/summarizeDiffs.ts
index 3375e73..dddc604 100644
--- a/src/lib/parsers/default/utils/summarizeDiffs.ts
+++ b/src/lib/parsers/default/utils/summarizeDiffs.ts
@@ -247,7 +247,15 @@ export async function summarizeDiffs(
   {
     tokenizer,
     logger,
-    maxTokens = 2048,
+    // Default raised to 4096 (#845) so the budget matches the
+    // canonical service configs in `langchain/utils.ts`. The
+    // previous 2048 default came from an earlier era when 4k
+    // context was a stretch for fast models; today every shipped
+    // service overrides it to 4096 anyway. Keeping this in sync
+    // with the service defaults means a caller that omits
+    // `maxTokens` doesn't accidentally fall into a tighter budget
+    // than the rest of the system assumes.
+    maxTokens = 4096,
     minTokensForSummary = 400,
     maxFileTokens,
     maxConcurrent = 6,