38 changes: 19 additions & 19 deletions .bench/baseline.json
@@ -1,19 +1,19 @@
 {
-  "capturedAt": "2026-05-06T01:12:11.470Z",
+  "capturedAt": "2026-05-06T03:03:50.907Z",
   "node": "v22.13.0",
   "platform": "darwin-arm64",
   "options": {
     "baseLatencyMs": 1500,
     "perTokenMs": 2,
-    "maxConcurrent": 6,
+    "maxConcurrent": 24,
     "maxTokens": 4096
   },
   "results": [
     {
       "fixture": "tiny",
       "fileCount": 5,
       "approxTokens": 790,
-      "durationMs": 1,
+      "durationMs": 2,
       "llmCalls": 0,
       "llmTotalMs": 0,
       "llmTotalPromptTokens": 0
@@ -22,54 +22,54 @@
       "fixture": "medium",
       "fileCount": 25,
       "approxTokens": 36150,
-      "durationMs": 6906,
+      "durationMs": 6905,
       "llmCalls": 6,
-      "llmTotalMs": 25221,
+      "llmTotalMs": 25222,
       "llmTotalPromptTokens": 8525
     },
     {
       "fixture": "large",
       "fileCount": 50,
       "approxTokens": 83410,
-      "durationMs": 9749,
-      "llmCalls": 6,
-      "llmTotalMs": 42401,
-      "llmTotalPromptTokens": 16602
+      "durationMs": 9754,
+      "llmCalls": 7,
+      "llmTotalMs": 45865,
+      "llmTotalPromptTokens": 17461
     },
     {
       "fixture": "feature-add",
       "fileCount": 14,
       "approxTokens": 17600,
-      "durationMs": 5640,
+      "durationMs": 5641,
       "llmCalls": 4,
-      "llmTotalMs": 18854,
+      "llmTotalMs": 18853,
       "llmTotalPromptTokens": 6117
     },
     {
       "fixture": "refactor",
       "fileCount": 30,
       "approxTokens": 32650,
-      "durationMs": 41347,
+      "durationMs": 26718,
       "llmCalls": 20,
-      "llmTotalMs": 143990,
+      "llmTotalMs": 143989,
       "llmTotalPromptTokens": 53548
     },
     {
       "fixture": "initial-commit",
       "fileCount": 50,
       "approxTokens": 83410,
-      "durationMs": 9818,
-      "llmCalls": 6,
-      "llmTotalMs": 42557,
-      "llmTotalPromptTokens": 16306
+      "durationMs": 9816,
+      "llmCalls": 7,
+      "llmTotalMs": 45897,
+      "llmTotalPromptTokens": 17107
     },
     {
       "fixture": "docs-update",
       "fileCount": 9,
       "approxTokens": 15050,
-      "durationMs": 18564,
+      "durationMs": 10248,
       "llmCalls": 7,
-      "llmTotalMs": 52222,
+      "llmTotalMs": 52224,
       "llmTotalPromptTokens": 13139
     },
     {
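Reviewer note on the numbers above: the options block shows the bench concurrency cap moving from 6 to 24, and that alone explains most of the refactor and docs-update wall-clock drops. A back-of-envelope sketch (illustrative TypeScript, not code from this PR):

```ts
// Total simulated LLM time divided by the concurrency cap gives a
// rough floor on wall-clock time spent inside LLM calls.
const refactor = { llmCalls: 20, llmTotalMs: 143989 }

const floorMs = (totalMs: number, cap: number): number => totalMs / cap

console.log(floorMs(refactor.llmTotalMs, 6).toFixed(0))  // ~24000ms at the old cap of 6
console.log(floorMs(refactor.llmTotalMs, 24).toFixed(0)) // ~6000ms at the new cap of 24
// Observed durationMs moved 41347 -> 26718; directionally consistent
// once per-chunk work outside the LLM calls is added back in.
```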
10 changes: 6 additions & 4 deletions bin/benchmark.ts
@@ -70,11 +70,13 @@ const DEFAULT_OPTIONS: BenchOptions = {
   // multi-thousand-token inputs. Adjust if real-world timings drift.
   baseLatencyMs: 1500,
   perTokenMs: 2,
-  maxConcurrent: 6,
+  // Match the canonical service maxConcurrent from
+  // `langchain/utils.ts` (raised 12 → 24 in PR 3 of #845). The
+  // bench mirrors the most-common production setting so per-PR
+  // diffs reflect what real users see.
+  maxConcurrent: 24,
   // Match the canonical service tokenLimit from `langchain/utils.ts`
-  // (raised from 2048 to 4096 in PR 1 of #845). The bench mirrors
-  // the most-common production budget so per-PR diffs reflect what
-  // real users will see.
+  // (raised from 2048 to 4096 in PR 1 of #845).
   maxTokens: 4096,
 }

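The comment promises the bench mirrors the canonical service settings from `langchain/utils.ts`. One way to keep the two from drifting is a small sync test; this is a sketch only, since the export names and import paths here are assumptions, not part of this diff:

```ts
// Hypothetical sync test: fails loudly if the bench options and the
// canonical service defaults ever diverge again.
import { DEFAULT_OPTIONS } from '../bin/benchmark' // assumed export + path
import { DEFAULT_OPENAI_LLM_SERVICE } from '../src/lib/langchain/utils' // assumed path

describe('benchmark defaults mirror production', () => {
  it('keeps maxConcurrent and maxTokens in sync', () => {
    expect(DEFAULT_OPTIONS.maxConcurrent).toBe(DEFAULT_OPENAI_LLM_SERVICE.maxConcurrent)
    expect(DEFAULT_OPTIONS.maxTokens).toBe(DEFAULT_OPENAI_LLM_SERVICE.tokenLimit)
  })
})
```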
93 changes: 93 additions & 0 deletions src/lib/langchain/chains/summarize/index.test.ts
@@ -0,0 +1,93 @@
+import { Document } from '@langchain/classic/document'
+import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'
+
+import { summarize, SummarizeContext } from './index'
+
+/**
+ * The summarize() helper wraps the chain.invoke() call in a
+ * 429-aware adaptive backoff (#845, PR 3). These tests pin the
+ * retry behavior so a future regression in the chain wrapper
+ * doesn't silently start failing user pipelines on transient rate
+ * limits.
+ */
+
+function makeContext(overrides: { invoke: jest.Mock }): SummarizeContext {
+  const chain = { invoke: overrides.invoke } as unknown as SummarizeContext['chain']
+  return {
+    chain,
+    textSplitter: new RecursiveCharacterTextSplitter({ chunkSize: 100, chunkOverlap: 0 }),
+  }
+}
+
+describe('summarize() retry behavior', () => {
+  beforeAll(() => {
+    // Speed up tests by faking the backoff timer.
+    jest.useFakeTimers({ doNotFake: ['nextTick', 'queueMicrotask'] })
+  })
+  afterAll(() => {
+    jest.useRealTimers()
+  })
+
+  it('returns the chain output on success without retrying', async () => {
+    const invoke = jest.fn().mockResolvedValue({ text: 'summary' })
+    const ctx = makeContext({ invoke })
+    const result = await summarize([{ pageContent: 'hello world' }], ctx)
+    expect(result).toBe('summary')
+    expect(invoke).toHaveBeenCalledTimes(1)
+  })
+
+  it.each([
+    ['429', { status: 429 }],
+    ['rate_limit_exceeded code', { code: 'rate_limit_exceeded' }],
+    ['ECONNRESET', { code: 'ECONNRESET' }],
+    ['429 in message', new Error('OpenAI returned 429: Too Many Requests')],
+    ['rate-limit in message', new Error('Anthropic rate-limit exceeded, retry later')],
+    ['503', { status: 503 }],
+  ])('retries on retryable %s and eventually succeeds', async (_, errorShape) => {
+    const error = errorShape instanceof Error ? errorShape : Object.assign(new Error('boom'), errorShape)
+    const invoke = jest.fn()
+      .mockRejectedValueOnce(error)
+      .mockResolvedValue({ text: 'late summary' })
+    const ctx = makeContext({ invoke })
+    const promise = summarize([{ pageContent: 'hello' }], ctx)
+    // Drain the backoff timer.
+    await jest.runOnlyPendingTimersAsync()
+    await expect(promise).resolves.toBe('late summary')
+    expect(invoke).toHaveBeenCalledTimes(2)
+  })
+
+  it('does NOT retry on non-retryable errors', async () => {
+    const invoke = jest.fn().mockRejectedValue(Object.assign(new Error('Bad Request'), { status: 400 }))
+    const ctx = makeContext({ invoke })
+    await expect(summarize([{ pageContent: 'hello' }], ctx)).rejects.toThrow('Bad Request')
+    expect(invoke).toHaveBeenCalledTimes(1)
+  })
+
+  it('gives up after BACKOFF_RETRIES (3) attempts on persistent rate limits', async () => {
+    const error = Object.assign(new Error('429'), { status: 429 })
+    const invoke = jest.fn().mockImplementation(() => Promise.reject(error))
+    const ctx = makeContext({ invoke })
+    const promise = summarize([{ pageContent: 'hello' }], ctx)
+    // Pre-attach a rejection handler so node's unhandled-rejection
+    // tracking doesn't fire before we drain the backoff timers.
+    promise.catch(() => undefined)
+    await jest.runAllTimersAsync()
+    await expect(promise).rejects.toThrow('429')
+    // Initial attempt + 3 retries = 4 total invocations.
+    expect(invoke).toHaveBeenCalledTimes(4)
+  })
+
+  it('passes the chain Document[] shape unchanged through retries', async () => {
+    const invoke = jest.fn()
+      .mockRejectedValueOnce(Object.assign(new Error('429'), { status: 429 }))
+      .mockResolvedValue({ text: 'ok' })
+    const ctx = makeContext({ invoke })
+    const promise = summarize([{ pageContent: 'first doc' }], ctx)
+    await jest.runOnlyPendingTimersAsync()
+    await promise
+    // Both calls should have received the same input shape.
+    const [firstCall, secondCall] = invoke.mock.calls
+    expect(firstCall[0].input_documents).toEqual(secondCall[0].input_documents)
+    expect(firstCall[0].input_documents[0]).toBeInstanceOf(Document)
+  })
+})
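The pre-attach pattern in the give-up test above generalizes to any timer-driven rejection under fake timers. A minimal sketch (the helper name is illustrative, not from the PR):

```ts
// Sketch: a promise that rejects only after fake timers advance has no
// handler attached at rejection time, so Node's unhandled-rejection
// tracking can fire mid-test. Attach a no-op catch first, then drain.
async function expectEventualRejection(promise: Promise<unknown>, message: string) {
  promise.catch(() => undefined) // pre-attach: keeps the tracker quiet
  await jest.runAllTimersAsync() // let every scheduled backoff fire
  await expect(promise).rejects.toThrow(message)
}
```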
62 changes: 60 additions & 2 deletions src/lib/langchain/chains/summarize/index.ts
@@ -20,6 +20,64 @@ export type SummarizeContext = {
   metadata?: Partial<LlmCallMetadata>
 }
 
+/**
+ * Adaptive backoff (#845, PR 3). Wraps the chain invocation so a
+ * transient 429 (rate limit) or 5xx no longer kills the whole
+ * pipeline — instead we wait briefly and retry up to N times
+ * before surfacing the failure.
+ *
+ * Cap is intentionally short. Diff condensing fans out to many
+ * concurrent calls; if rate limits hit hard, queueing requests
+ * indefinitely just makes the user wait longer for a result the
+ * pipeline ultimately handles via fewer concurrent passes anyway.
+ * 3 retries with 1s/2s/4s waits trade ~7s of worst-case extra
+ * latency for resilience to brief rate-limit blips.
+ */
+const BACKOFF_RETRIES = 3
+const BACKOFF_BASE_MS = 1000
+const BACKOFF_CAP_MS = 5000
+
+function isRetryableError(error: unknown): boolean {
+  if (!error || typeof error !== 'object') return false
+  const err = error as { status?: number; code?: string | number; message?: string }
+  if (err.status === 429 || err.status === 503 || err.status === 502 || err.status === 504) {
+    return true
+  }
+  if (err.code === 429 || err.code === 'rate_limit_exceeded' || err.code === 'ECONNRESET' || err.code === 'ETIMEDOUT') {
+    return true
+  }
+  if (typeof err.message === 'string' && /(rate.?limit|429|too many requests|timeout|temporarily unavailable)/i.test(err.message)) {
+    return true
+  }
+  return false
+}
+
+async function invokeWithBackoff(
+  chain: SummarizeContext['chain'],
+  input: { input_documents: Document[]; returnIntermediateSteps: boolean },
+  logger: Logger | undefined
+): Promise<{ text: string; error?: string }> {
+  let lastError: unknown
+  for (let attempt = 0; attempt <= BACKOFF_RETRIES; attempt++) {
+    try {
+      return await chain.invoke(input) as { text: string; error?: string }
+    } catch (error) {
+      lastError = error
+      if (!isRetryableError(error) || attempt === BACKOFF_RETRIES) {
+        throw error
+      }
+      const wait = Math.min(BACKOFF_CAP_MS, BACKOFF_BASE_MS * Math.pow(2, attempt))
+      logger?.verbose(
+        `[summarize] retryable error (attempt ${attempt + 1}/${BACKOFF_RETRIES}); backing off ${wait}ms`,
+        { color: 'yellow' }
+      )
+      await new Promise((resolve) => setTimeout(resolve, wait))
+    }
+  }
+  // Unreachable — the loop either returns or rethrows above.
+  throw lastError
+}
+
 export async function summarize(
   documents: DocumentInput[],
   { chain, textSplitter, options, logger, tokenizer, metadata }: SummarizeContext
@@ -32,10 +32,10 @@
     : undefined
 
   const startedAt = Date.now()
-  const res = await chain.invoke({
+  const res = await invokeWithBackoff(chain, {
     input_documents: docs,
     returnIntermediateSteps,
-  })
+  }, logger)
   const elapsedMs = Date.now() - startedAt
 
   logLlmCall(logger, {
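For reference, the wait schedule those three constants produce; a standalone sketch, not code from the PR:

```ts
// With BACKOFF_BASE_MS = 1000 and BACKOFF_CAP_MS = 5000, attempts
// 0..2 wait 1s, 2s, 4s. The 5s cap never binds at 3 retries; it only
// matters if BACKOFF_RETRIES is ever raised.
const waits = Array.from({ length: 3 }, (_, attempt) =>
  Math.min(5000, 1000 * 2 ** attempt)
)
console.log(waits) // [1000, 2000, 4000] -> 7s worst-case added latency
```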
14 changes: 12 additions & 2 deletions src/lib/langchain/utils.ts
@@ -111,7 +111,13 @@ export const DEFAULT_OPENAI_LLM_SERVICE: OpenAILLMService = {
   model: 'gpt-4.1-nano',
   tokenLimit: 4096,
   temperature: 0.32,
-  maxConcurrent: 12,
+  // Bumped 12 → 24 (#845, PR 3). The OpenAI fast tier comfortably
+  // handles ~30 concurrent on the per-key default rate limit; 24
+  // leaves headroom for retries while still doubling throughput.
+  // The summarize chain has a 429-aware backoff (`summarize`
+  // helper) so a temporary rate-limit hit no longer kills the
+  // whole pipeline.
+  maxConcurrent: 24,
   minTokensForSummary: 800,
   maxFileTokens: 2000,
   authentication: {
@@ -133,7 +139,11 @@
   model: 'claude-haiku-4-5-20251001',
   temperature: 0.32,
   tokenLimit: 4096,
-  maxConcurrent: 12,
+  // Bumped 12 → 24 (#845, PR 3). Matches the OpenAI default;
+  // Anthropic's per-key concurrency on Haiku is generous enough
+  // that 24 stays under the rate ceiling for typical fast-model
+  // request shapes. Backoff in `summarize` handles spikes.
+  maxConcurrent: 24,
   minTokensForSummary: 800,
   maxFileTokens: 2000,
   authentication: {
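If a deployment does hit provider limits at 24, the service objects make the knob easy to dial back. A hedged sketch, assuming these defaults are consumed by spreading the exported object (the import path is an assumption):

```ts
import { DEFAULT_OPENAI_LLM_SERVICE } from './utils' // path assumed

// Hypothetical override: keep everything else, halve the fan-out and
// lean on the summarize() backoff to absorb any remaining spikes.
const conservativeService = {
  ...DEFAULT_OPENAI_LLM_SERVICE,
  maxConcurrent: 12,
}
```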