diff --git a/docs/wire-protocol-reference.md b/docs/wire-protocol-reference.md index c41e5f8..6c4e9e4 100644 --- a/docs/wire-protocol-reference.md +++ b/docs/wire-protocol-reference.md @@ -27,6 +27,7 @@ | Date | Agent | Version | Change | |------|-------|---------|--------| +| 2026-06-09 | Claude Code | 2.1.x | Confirmed via loopback wire capture: `anthropic-beta` carries `context-1m-2025-08-07` on **every** request when the account's 1M context is enabled — including haiku title-gen turns (it is a client/account-level capability flag, not a per-turn window declaration). ccxray now uses it as the non-lagging 1M-window signal, gated by model capability (`SUPPORTS_1M`), replacing sole reliance on the lagging system-prompt `[1m]` marker (#58). | | 2026-06-05 | ccxray | 1.11.x | Usage normalization: OpenAI `input_tokens` includes `cached_tokens` (subset), unlike Anthropic's disjoint fields. `normalizeUsageForProvider` now subtracts the overlap so canonical `input_tokens + cache_read + cache_creation = total context` holds for both providers. Normalized entries carry `_ccxrayUsageNormalized: true`. Historical entries normalized on restore (in-memory, index unchanged). Cache display: Codex sessions show `cache N% hit` instead of TTL countdown; topbar adapts per provider (`ephemeral-ttl` vs `server-managed`). `UPSTREAM_PROFILES` registry added to `providers.js`. | | 2026-06-04 | ccxray | 1.10.x | Fix: WS `stopReason` now extracts `response.status` from terminal events (`completed`/`incomplete`/`failed`/`cancelled`) instead of WS close reason. WS `title` extracts user input summary via `getOpenAIInputSummary` instead of hardcoded string. Non-terminal statuses (`in_progress`/`queued`) are ignored to prevent masking close/error reasons. 
| | 2026-06-02 | ccxray | 1.10.0 | Doc audit: 13 major + 25 minor corrections applied (F1–F38) | @@ -65,6 +66,8 @@ | ChatGPT base path | N/A | `/backend-api/codex/...` (the proxy strips the `/v1` prefix before prepending the base path, so `POST /v1/responses` → `/backend-api/codex/responses`) | `obs-stable` codex ≥0.131 | | Version header | `anthropic-version: 2023-06-01` | N/A | `contractual` | | Beta features | `anthropic-beta: ...` (comma-separated) | `openai-beta: ...` | `contractual` | +| 1M context window signal | `anthropic-beta` list contains `context-1m-2025-08-07` (present on every request when 1M enabled — a client-level flag, also on haiku turns; does **not** lag a mid-session model switch, unlike the system-prompt `[1m]` marker) | N/A | `obs-stable` Claude Code ≥2.1.x | +| ~~Rate-limit ≠ context window~~ | `anthropic-ratelimit-tokens-limit` (e.g. `80000`) is a per-window quota, **not** the context window — never use it as the denominator when computing context-usage percentage | N/A | `obs-stable` | --- diff --git a/server/config.js b/server/config.js index 0e232c0..4337843 100644 --- a/server/config.js +++ b/server/config.js @@ -224,6 +224,14 @@ const MODEL_CONTEXT_FALLBACK = { }; const DEFAULT_CONTEXT = 200_000; +// Models that can actually be served with a 1M context window. The 1M signal +// (anthropic-beta context-1m header, or the system "[1m]" marker) is a +// client/account-level capability flag — it rides on EVERY Claude Code request, +// including haiku title-gen turns. Gate the 1M jump on the model itself so a +// haiku request carrying the beta header is not shown as a 1M window. New 1M +// families get one line here, not a logic change. +const SUPPORTS_1M = /^claude-(opus|sonnet)-4/; + // Extract effective model ID from system prompt (includes [1m] suffix if present). // API request model field never includes [1m], but system prompt does: // "The exact model ID is claude-opus-4-6[1m]." 
@@ -237,14 +245,27 @@ function extractModelFromSystem(system) { return null; } -function getMaxContext(model, system) { - // Prefer model ID from system prompt (has [1m] suffix when applicable) - const effective = extractModelFromSystem(system) || model; - if (!effective) return DEFAULT_CONTEXT; - // 1) Explicit suffix: "claude-opus-4-6[1m]" → 1M - if (/\[1m\]/i.test(effective)) return 1_000_000; - // 2) Known Claude Code defaults (200K standard plan) - const stripped = effective.replace(/\[.*\]/, ''); +function getMaxContext(model, system, opts = {}) { + // Model IDENTITY comes from the request `model` field — it updates immediately + // on a mid-session model switch. The system marker is only a fallback for + // identity, because Claude Code's "The exact model ID is ..." line lags several + // turns behind the switch and would otherwise corrupt the window denominator + // (issue #58). The system marker is still the place the "[1m]" suffix appears. + const sysModel = extractModelFromSystem(system); + const identity = model || sysModel; + if (!identity) return DEFAULT_CONTEXT; + const stripped = identity.replace(/\[.*\]/, ''); + // 1) 1M plan active? Two non-mutually-exclusive signals: + // - opts.beta1m: anthropic-beta `context-1m-*` request header (non-lagging, + // present on every turn — the authoritative plan flag). + // - "[1m]" suffix in the system marker (legacy; lags after a model switch). + // Either signal counts, but only for a 1M-capable model (SUPPORTS_1M) so a + // client-level flag riding on a haiku request does not over-claim 1M. 
+ const has1mSignal = opts.beta1m === true + || /\[1m\]/i.test(sysModel || '') + || /\[1m\]/i.test(model || ''); + if (has1mSignal && SUPPORTS_1M.test(stripped)) return 1_000_000; + // 2) Known Claude Code / Codex defaults (200K / 400K) const keys = Object.keys(MODEL_CONTEXT_FALLBACK).sort((a, b) => b.length - a.length); for (const key of keys) { if (stripped.startsWith(key)) return MODEL_CONTEXT_FALLBACK[key]; @@ -263,8 +284,8 @@ function getMaxContext(model, system) { // base, bump Claude models up to 1M so the dashboard "X / Y (Z%)" stays // self-consistent. Non-Claude models are not bumped because we have no // reliable next tier to escalate to. -function inferMaxContext(model, system, usage) { - const base = getMaxContext(model, system); +function inferMaxContext(model, system, usage, opts = {}) { + const base = getMaxContext(model, system, opts); if (!usage) return base; const used = (usage.input_tokens || 0) + (usage.cache_creation_input_tokens || 0) diff --git a/server/forward.js b/server/forward.js index 29a0562..6e4b6f8 100644 --- a/server/forward.js +++ b/server/forward.js @@ -589,7 +589,7 @@ function handleSSEResponse(ctx, proxyRes, clientRes) { console.log(`\x1b[90m Context HUD: injecting into session ${reqSessionId.slice(0, 8)}\x1b[0m`); _hudLoggedSessions.add(reqSessionId); } - const maxCtx = config.inferMaxContext(parsedBody?.model, parsedBody?.system, usage); + const maxCtx = config.inferMaxContext(parsedBody?.model, parsedBody?.system, usage, { beta1m: ctx.beta1m }); const pct = (totalCtx / maxCtx * 100).toFixed(1); const newIdx = maxBlockIndex + 1; const costInfo = calculateCost(usage, parsedBody?.model); @@ -666,7 +666,7 @@ function handleSSEResponse(ctx, proxyRes, clientRes) { proxyRes, sessionId, sessionInferred: ctx.sessionInferred, sysHash: ctx.sysHash, toolsHash: ctx.toolsHash, coreHash: ctx.coreHash, cwd: store.sessionMeta[sessionId]?.cwd || null, - stopReason, title, thinkingDuration, thinkingStripped, + stopReason, title, 
thinkingDuration, thinkingStripped, beta1m: ctx.beta1m, isSubagent, toolFail: helpers.hasToolFail(parsedBody), startTime, }), }; @@ -695,7 +695,7 @@ function handleSSEResponse(ctx, proxyRes, clientRes) { const outTok = usage?.output_tokens ? ` out=${usage.output_tokens.toLocaleString()} tok` : ''; const prefix = ctx.attribPrefix || ''; console.log(`${color}📥 [${helpers.taipeiTime()}] ${prefix} ${glyph} ${code} ${elapsed}s${outTok}\x1b[0m`); - if (usage) helpers.printContextBar(usage, parsedBody?.model, parsedBody?.system); + if (usage) helpers.printContextBar(usage, parsedBody?.model, parsedBody?.system, ctx.beta1m); if (entry.cost?.cost != null) { store.sessionCosts.set(sessionId, (store.sessionCosts.get(sessionId) || 0) + entry.cost.cost); console.log(` 💰 $${entry.cost.cost.toFixed(4)} this turn | $${store.sessionCosts.get(sessionId).toFixed(4)} session`); @@ -886,7 +886,7 @@ function handleNonSSEResponse(ctx, proxyRes, clientRes) { sessionId, sessionInferred: ctx.sessionInferred, sysHash: ctx.sysHash, toolsHash: ctx.toolsHash, coreHash: ctx.coreHash, cwd: store.sessionMeta[sessionId]?.cwd || null, - stopReason, title, thinkingDuration: null, thinkingStripped, + stopReason, title, thinkingDuration: null, thinkingStripped, beta1m: ctx.beta1m, isSubagent, toolFail: helpers.hasToolFail(parsedBody), startTime, }), }; diff --git a/server/helpers.js b/server/helpers.js index 7d5fde1..4bc6ebe 100644 --- a/server/helpers.js +++ b/server/helpers.js @@ -369,14 +369,16 @@ function totalContextTokens(usage) { + (usage.cache_read_input_tokens || 0); } -function printContextBar(usage, model, system) { +function printContextBar(usage, model, system, beta1m) { const { inferMaxContext } = require('./config'); if (!usage) return; // Use inferMaxContext (not getMaxContext) so the terminal HUD bumps // claude-opus-* / claude-sonnet-* to 1M when observed usage exceeds 200K // but the [1m] marker isn't present in the system prompt — otherwise the - // bar clamps to "100% (X / 
200,000)" while X overflows the max. - const maxCtx = inferMaxContext(model, system, usage); + // bar clamps to "100% (X / 200,000)" while X overflows the max. beta1m + // (anthropic-beta context-1m header) gives 1M immediately, before usage + // crosses 200K and without waiting for the lagging [1m] marker (#58). + const maxCtx = inferMaxContext(model, system, usage, { beta1m }); const used = totalContextTokens(usage); if (!used) return; const pct = Math.min(100, (used / maxCtx) * 100); diff --git a/server/index.js b/server/index.js index fb95415..a600af8 100755 --- a/server/index.js +++ b/server/index.js @@ -376,9 +376,17 @@ const server = http.createServer((clientReq, clientRes) => { // Build context for forwarding const fwdHeaders = buildForwardHeaders(clientReq.headers, upstream); + // #58: the 1M context window is enabled via the `anthropic-beta: + // context-1m-*` request header. Unlike the system-prompt "[1m]" marker, this + // header rides every turn (it does not lag a mid-session model switch), so it + // is the authoritative, non-lagging signal for the context-window denominator. + // Carried on ctx and fed into inferMaxContext downstream. + const beta1m = /(^|,)\s*context-1m-/.test(clientReq.headers['anthropic-beta'] || ''); + const ctx = { id, ts, startTime, parsedBody, rawBody, clientReq, clientRes, fwdHeaders, reqSessionId, reqWritePromise, sysHash, toolsHash, coreHash, sessionInferred, upstream, + beta1m, isSubagent: provider === 'openai' ? isOpenAISubagent(clientReq.headers, parsedBody) : undefined, }; diff --git a/server/wire-parsers/anthropic.js b/server/wire-parsers/anthropic.js index 70f874c..5a9b7cd 100644 --- a/server/wire-parsers/anthropic.js +++ b/server/wire-parsers/anthropic.js @@ -58,7 +58,7 @@ function buildEntryFields(ctx) { cwd: ctx.cwd ?? 
null, usage, cost: calculateCost(usage, model), - maxContext: config.inferMaxContext(model, parsedBody?.system, usage), + maxContext: config.inferMaxContext(model, parsedBody?.system, usage, { beta1m: ctx.beta1m }), responseMetadata: undefined, stopReason: ctx.stopReason || '', title: ctx.title || null, diff --git a/test/config.test.js b/test/config.test.js index f374d91..96bb91e 100644 --- a/test/config.test.js +++ b/test/config.test.js @@ -649,3 +649,59 @@ describe('inferMaxContext', () => { assert.equal(inferMaxContext('claude-opus-4-7', null, usage), 1_000_000); }); }); + +// ── #58: anthropic-beta context-1m header as a non-lagging 1M signal ───────── +// Empirically confirmed (2026-06-09): Claude Code sends +// anthropic-beta: ...,context-1m-2025-08-07,... +// on EVERY request when the account has the 1M context beta enabled. Unlike the +// system-prompt "[1m]" marker (which lags several turns after a mid-session +// model switch), this header is present on every turn, so it fixes the lag-window +// flicker / ctx% cliff at the source. Because the header is a client-level +// capability flag (also present on haiku title-gen requests), it must be GATED by +// model 1M-capability so it does not over-claim a 1M window for haiku. +describe('getMaxContext / inferMaxContext — context-1m beta header (#58)', () => { + const { getMaxContext, inferMaxContext } = require('../server/config'); + const staleMarker = [{ type: 'text', text: 'The exact model ID is claude-opus-4-6.' }]; + + it('FIX: stale system marker no longer corrupts identity — beta header wins (fail-on-old)', () => { + // The reported bug: request model already switched to opus-4-8 (1M plan), but + // the system marker still lags on opus-4-6 with no [1m]. Old code trusted the + // stale marker for identity AND had no beta signal → 200K → ctx% ~99%. 
+ const usage = { input_tokens: 196_602 }; // turn 35 from the issue, < 200K + assert.equal(inferMaxContext('claude-opus-4-8', staleMarker, usage, { beta1m: true }), 1_000_000); + }); + + it('FIX: beta header gives 1M even with no [1m] marker and small usage (fail-on-old)', () => { + // Lag window, early turn: no [1m] anywhere, usage below 200K. The header is the + // only non-lagging signal — without it the turn shows 200K until usage crosses. + assert.equal(getMaxContext('claude-opus-4-8', null, { beta1m: true }), 1_000_000); + assert.equal(inferMaxContext('claude-opus-4-8', null, { input_tokens: 50_000 }, { beta1m: true }), 1_000_000); + assert.equal(inferMaxContext('claude-sonnet-4-6', null, { input_tokens: 50_000 }, { beta1m: true }), 1_000_000); + }); + + it('GUARD: beta header does NOT over-claim 1M for non-1M-capable models', () => { + // haiku title-gen requests carry the same client-level beta header but haiku + // does not support 1M — must stay at base when usage is small. + assert.equal(getMaxContext('claude-haiku-4-5', null, { beta1m: true }), 200_000); + assert.equal(inferMaxContext('claude-haiku-4-5', null, { input_tokens: 50_000 }, { beta1m: true }), 200_000); + // claude-3 families are not 1M-capable either. + assert.equal(getMaxContext('claude-3-opus', null, { beta1m: true }), 200_000); + }); + + it('GUARD: no signal at all stays at base (no false 1M)', () => { + assert.equal(getMaxContext('claude-opus-4-8', null), 200_000); + assert.equal(getMaxContext('claude-opus-4-8', null, {}), 200_000); + assert.equal(inferMaxContext('claude-opus-4-8', null, { input_tokens: 50_000 }), 200_000); + }); + + it('REGRESSION: existing [1m] marker path and usage hatch still work without opts', () => { + const marker1m = [{ type: 'text', text: 'The exact model ID is claude-opus-4-8[1m].' 
}]; + assert.equal(getMaxContext('claude-opus-4-8', marker1m), 1_000_000); + assert.equal(inferMaxContext('claude-haiku-4-5', null, { input_tokens: 260_000 }), 1_000_000); // usage hatch + assert.equal(getMaxContext('gpt-5.1-codex', null), 400_000); + }); + + it('GUARD: beta header is ignored for OpenAI models (no 1M concept there)', () => { + assert.equal(getMaxContext('gpt-5.1-codex', null, { beta1m: true }), 400_000); + }); +}); diff --git a/test/dashboard-codex-e2e.test.js b/test/dashboard-codex-e2e.test.js index 0d71491..c12ec89 100644 --- a/test/dashboard-codex-e2e.test.js +++ b/test/dashboard-codex-e2e.test.js @@ -14,6 +14,17 @@ const PROJECT_CWD = path.resolve(__dirname, '..'); const PROJECT_NAME = path.basename(PROJECT_CWD); const tmpDirs = []; +// Mirror of the dashboard's project-label truncation (public/miller-columns.js +// truncateMiddle). The project label is the cwd basename, which is long when the +// suite runs from a git worktree (e.g. ".claude/worktrees/<name>"), so compare +// against the same truncation the UI applies rather than the raw name. +function truncateMiddle(s, max) { + if (s.length <= max) return s; + const tail = Math.ceil(max * 0.6); + const head = max - tail - 1; + return s.slice(0, head) + '…' + s.slice(-tail); +} + function makeOpenAISSE() { return [ 'event: response.created', @@ -162,7 +173,7 @@ describe('Codex dashboard status E2E', () => { }; }); - assert.equal(state.projectText, PROJECT_NAME); + assert.equal(state.projectText, truncateMiddle(PROJECT_NAME, 20)); assert.equal(state.sessionText, 'Codex Raw'); assert.match(state.url, /s=codex-raw/); assert.equal(state.hasOkDot, true);