Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/wire-protocol-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

| Date | Agent | Version | Change |
|------|-------|---------|--------|
| 2026-06-09 | Claude Code | 2.1.x | Confirmed via loopback wire capture: `anthropic-beta` carries `context-1m-2025-08-07` on **every** request when the account's 1M context is enabled — including haiku title-gen turns (it is a client/account-level capability flag, not a per-turn window declaration). ccxray now uses it as the non-lagging 1M-window signal, gated by model capability (`SUPPORTS_1M`), replacing sole reliance on the lagging system-prompt `[1m]` marker (#58). |
| 2026-06-05 | ccxray | 1.11.x | Usage normalization: OpenAI `input_tokens` includes `cached_tokens` (subset), unlike Anthropic's disjoint fields. `normalizeUsageForProvider` now subtracts the overlap so canonical `input_tokens + cache_read + cache_creation = total context` holds for both providers. Normalized entries carry `_ccxrayUsageNormalized: true`. Historical entries normalized on restore (in-memory, index unchanged). Cache display: Codex sessions show `cache N% hit` instead of TTL countdown; topbar adapts per provider (`ephemeral-ttl` vs `server-managed`). `UPSTREAM_PROFILES` registry added to `providers.js`. |
| 2026-06-04 | ccxray | 1.10.x | Fix: WS `stopReason` now extracts `response.status` from terminal events (`completed`/`incomplete`/`failed`/`cancelled`) instead of WS close reason. WS `title` extracts user input summary via `getOpenAIInputSummary` instead of hardcoded string. Non-terminal statuses (`in_progress`/`queued`) are ignored to prevent masking close/error reasons. |
| 2026-06-02 | ccxray | 1.10.0 | Doc audit: 13 major + 25 minor corrections applied (F1–F38) |
Expand Down Expand Up @@ -65,6 +66,8 @@
| ChatGPT base path | N/A | `/backend-api/codex/...` (the proxy strips the `/v1` prefix before prepending the base path, so `POST /v1/responses` → `/backend-api/codex/responses`) | `obs-stable` codex ≥0.131 |
| Version header | `anthropic-version: 2023-06-01` | N/A | `contractual` |
| Beta features | `anthropic-beta: ...` (comma-separated) | `openai-beta: ...` | `contractual` |
| 1M context window signal | `anthropic-beta` list contains `context-1m-2025-08-07` (present on every request when 1M enabled — a client-level flag, also on haiku turns; does **not** lag a mid-session model switch, unlike the system-prompt `[1m]` marker) | N/A | `obs-stable` Claude Code ≥2.1.x |
| ~~Rate-limit ≠ context window~~ | `anthropic-ratelimit-tokens-limit` (e.g. `80000`) is a per-window quota, **not** the context window — never use it to size the denominator | N/A | `obs-stable` |

---

Expand Down
41 changes: 31 additions & 10 deletions server/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,14 @@ const MODEL_CONTEXT_FALLBACK = {
};
const DEFAULT_CONTEXT = 200_000;

// Models that can actually be served with a 1M context window. The 1M signal
// (anthropic-beta context-1m header, or the system "[1m]" marker) is a
// client/account-level capability flag — it rides on EVERY Claude Code request,
// including haiku title-gen turns. Gate the 1M jump on the model itself so a
// haiku request carrying the beta header is not shown as a 1M window.
// The pattern is anchored at the start of the model ID and matches the
// claude-opus-4* and claude-sonnet-4* families. To add a new 1M-capable family,
// extend the alternation here rather than changing the gating logic.
const SUPPORTS_1M = /^claude-(opus|sonnet)-4/;

// Extract effective model ID from system prompt (includes [1m] suffix if present).
// API request model field never includes [1m], but system prompt does:
// "The exact model ID is claude-opus-4-6[1m]."
Expand All @@ -237,14 +245,27 @@ function extractModelFromSystem(system) {
return null;
}

function getMaxContext(model, system) {
// Prefer model ID from system prompt (has [1m] suffix when applicable)
const effective = extractModelFromSystem(system) || model;
if (!effective) return DEFAULT_CONTEXT;
// 1) Explicit suffix: "claude-opus-4-6[1m]" → 1M
if (/\[1m\]/i.test(effective)) return 1_000_000;
// 2) Known Claude Code defaults (200K standard plan)
const stripped = effective.replace(/\[.*\]/, '');
function getMaxContext(model, system, opts = {}) {
// Model IDENTITY comes from the request `model` field — it updates immediately
// on a mid-session model switch. The system marker is only a fallback for
// identity, because Claude Code's "The exact model ID is ..." line lags several
// turns behind the switch and would otherwise corrupt the window denominator
// (issue #58). The system marker is still the place the "[1m]" suffix appears.
const sysModel = extractModelFromSystem(system);
const identity = model || sysModel;
if (!identity) return DEFAULT_CONTEXT;
const stripped = identity.replace(/\[.*\]/, '');
// 1) 1M plan active? Two non-mutually-exclusive signals:
// - opts.beta1m: anthropic-beta `context-1m-*` request header (non-lagging,
// present on every turn — the authoritative plan flag).
// - "[1m]" suffix in the system marker (legacy; lags after a model switch).
// Either signal counts, but only for a 1M-capable model (SUPPORTS_1M) so a
// client-level flag riding on a haiku request does not over-claim 1M.
const has1mSignal = opts.beta1m === true
|| /\[1m\]/i.test(sysModel || '')
|| /\[1m\]/i.test(model || '');
if (has1mSignal && SUPPORTS_1M.test(stripped)) return 1_000_000;
// 2) Known Claude Code / Codex defaults (200K / 400K)
const keys = Object.keys(MODEL_CONTEXT_FALLBACK).sort((a, b) => b.length - a.length);
for (const key of keys) {
if (stripped.startsWith(key)) return MODEL_CONTEXT_FALLBACK[key];
Expand All @@ -263,8 +284,8 @@ function getMaxContext(model, system) {
// base, bump Claude models up to 1M so the dashboard "X / Y (Z%)" stays
// self-consistent. Non-Claude models are not bumped because we have no
// reliable next tier to escalate to.
function inferMaxContext(model, system, usage) {
const base = getMaxContext(model, system);
function inferMaxContext(model, system, usage, opts = {}) {
const base = getMaxContext(model, system, opts);
if (!usage) return base;
const used = (usage.input_tokens || 0)
+ (usage.cache_creation_input_tokens || 0)
Expand Down
8 changes: 4 additions & 4 deletions server/forward.js
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ function handleSSEResponse(ctx, proxyRes, clientRes) {
console.log(`\x1b[90m Context HUD: injecting into session ${reqSessionId.slice(0, 8)}\x1b[0m`);
_hudLoggedSessions.add(reqSessionId);
}
const maxCtx = config.inferMaxContext(parsedBody?.model, parsedBody?.system, usage);
const maxCtx = config.inferMaxContext(parsedBody?.model, parsedBody?.system, usage, { beta1m: ctx.beta1m });
const pct = (totalCtx / maxCtx * 100).toFixed(1);
const newIdx = maxBlockIndex + 1;
const costInfo = calculateCost(usage, parsedBody?.model);
Expand Down Expand Up @@ -666,7 +666,7 @@ function handleSSEResponse(ctx, proxyRes, clientRes) {
proxyRes, sessionId, sessionInferred: ctx.sessionInferred,
sysHash: ctx.sysHash, toolsHash: ctx.toolsHash, coreHash: ctx.coreHash,
cwd: store.sessionMeta[sessionId]?.cwd || null,
stopReason, title, thinkingDuration, thinkingStripped,
stopReason, title, thinkingDuration, thinkingStripped, beta1m: ctx.beta1m,
isSubagent, toolFail: helpers.hasToolFail(parsedBody), startTime,
}),
};
Expand Down Expand Up @@ -695,7 +695,7 @@ function handleSSEResponse(ctx, proxyRes, clientRes) {
const outTok = usage?.output_tokens ? ` out=${usage.output_tokens.toLocaleString()} tok` : '';
const prefix = ctx.attribPrefix || '';
console.log(`${color}📥 [${helpers.taipeiTime()}] ${prefix} ${glyph} ${code} ${elapsed}s${outTok}\x1b[0m`);
if (usage) helpers.printContextBar(usage, parsedBody?.model, parsedBody?.system);
if (usage) helpers.printContextBar(usage, parsedBody?.model, parsedBody?.system, ctx.beta1m);
if (entry.cost?.cost != null) {
store.sessionCosts.set(sessionId, (store.sessionCosts.get(sessionId) || 0) + entry.cost.cost);
console.log(` 💰 $${entry.cost.cost.toFixed(4)} this turn | $${store.sessionCosts.get(sessionId).toFixed(4)} session`);
Expand Down Expand Up @@ -886,7 +886,7 @@ function handleNonSSEResponse(ctx, proxyRes, clientRes) {
sessionId, sessionInferred: ctx.sessionInferred,
sysHash: ctx.sysHash, toolsHash: ctx.toolsHash, coreHash: ctx.coreHash,
cwd: store.sessionMeta[sessionId]?.cwd || null,
stopReason, title, thinkingDuration: null, thinkingStripped,
stopReason, title, thinkingDuration: null, thinkingStripped, beta1m: ctx.beta1m,
isSubagent, toolFail: helpers.hasToolFail(parsedBody), startTime,
}),
};
Expand Down
8 changes: 5 additions & 3 deletions server/helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -369,14 +369,16 @@ function totalContextTokens(usage) {
+ (usage.cache_read_input_tokens || 0);
}

function printContextBar(usage, model, system) {
function printContextBar(usage, model, system, beta1m) {
const { inferMaxContext } = require('./config');
if (!usage) return;
// Use inferMaxContext (not getMaxContext) so the terminal HUD bumps
// claude-opus-* / claude-sonnet-* to 1M when observed usage exceeds 200K
// but the [1m] marker isn't present in the system prompt — otherwise the
// bar clamps to "100% (X / 200,000)" while X overflows the max.
const maxCtx = inferMaxContext(model, system, usage);
// bar clamps to "100% (X / 200,000)" while X overflows the max. beta1m
// (anthropic-beta context-1m header) gives 1M immediately, before usage
// crosses 200K and without waiting for the lagging [1m] marker (#58).
const maxCtx = inferMaxContext(model, system, usage, { beta1m });
const used = totalContextTokens(usage);
if (!used) return;
const pct = Math.min(100, (used / maxCtx) * 100);
Expand Down
8 changes: 8 additions & 0 deletions server/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -376,9 +376,17 @@ const server = http.createServer((clientReq, clientRes) => {
// Build context for forwarding
const fwdHeaders = buildForwardHeaders(clientReq.headers, upstream);

// #58: the 1M context window is enabled via the `anthropic-beta:
// context-1m-*` request header. Unlike the system-prompt "[1m]" marker, this
// header rides every turn (it does not lag a mid-session model switch), so it
// is the authoritative, non-lagging signal for the context-window denominator.
// Carried on ctx and fed into inferMaxContext downstream.
const beta1m = /(^|,)\s*context-1m-/.test(clientReq.headers['anthropic-beta'] || '');

const ctx = {
id, ts, startTime, parsedBody, rawBody, clientReq, clientRes, fwdHeaders,
reqSessionId, reqWritePromise, sysHash, toolsHash, coreHash, sessionInferred, upstream,
beta1m,
isSubagent: provider === 'openai' ? isOpenAISubagent(clientReq.headers, parsedBody) : undefined,
};

Expand Down
2 changes: 1 addition & 1 deletion server/wire-parsers/anthropic.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ function buildEntryFields(ctx) {
cwd: ctx.cwd ?? null,
usage,
cost: calculateCost(usage, model),
maxContext: config.inferMaxContext(model, parsedBody?.system, usage),
maxContext: config.inferMaxContext(model, parsedBody?.system, usage, { beta1m: ctx.beta1m }),
responseMetadata: undefined,
stopReason: ctx.stopReason || '',
title: ctx.title || null,
Expand Down
56 changes: 56 additions & 0 deletions test/config.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -649,3 +649,59 @@ describe('inferMaxContext', () => {
assert.equal(inferMaxContext('claude-opus-4-7', null, usage), 1_000_000);
});
});

// ── #58: anthropic-beta context-1m header as a non-lagging 1M signal ─────────
// Empirically confirmed (2026-06-09): Claude Code sends
// anthropic-beta: ...,context-1m-2025-08-07,...
// on EVERY request when the account has the 1M context beta enabled. Unlike the
// system-prompt "[1m]" marker (which lags several turns after a mid-session
// model switch), this header is present on every turn, so it fixes the lag-window
// flicker / ctx% cliff at the source. Because the header is a client-level
// capability flag (also present on haiku title-gen requests), it must be GATED by
// model 1M-capability so it does not over-claim a 1M window for haiku.
describe('getMaxContext / inferMaxContext — context-1m beta header (#58)', () => {
  const { getMaxContext, inferMaxContext } = require('../server/config');

  // Expected window sizes asserted throughout this suite.
  const ONE_MILLION = 1_000_000;
  const CLAUDE_BASE = 200_000;
  const CODEX_BASE = 400_000;

  // Factories return a fresh object per call so no state is shared between assertions.
  const betaOn = () => ({ beta1m: true });
  const usageOf = (inputTokens) => ({ input_tokens: inputTokens });

  // System prompt whose model line still names the pre-switch model and carries
  // no "[1m]" suffix.
  const staleMarker = [{ type: 'text', text: 'The exact model ID is claude-opus-4-6.' }];

  it('FIX: stale system marker no longer corrupts identity — beta header wins (fail-on-old)', () => {
    // Reported scenario: the request model is already opus-4-8 (1M plan) while the
    // system marker still says opus-4-6 without [1m]. The old code took identity
    // from the stale marker and had no beta signal, yielding 200K (ctx% ~99%).
    const turn35Usage = usageOf(196_602); // turn 35 from the issue, below 200K
    const window = inferMaxContext('claude-opus-4-8', staleMarker, turn35Usage, betaOn());
    assert.equal(window, ONE_MILLION);
  });

  it('FIX: beta header gives 1M even with no [1m] marker and small usage (fail-on-old)', () => {
    // Early turn inside the lag window: no [1m] anywhere and usage under 200K, so
    // the header is the only signal available.
    assert.equal(getMaxContext('claude-opus-4-8', null, betaOn()), ONE_MILLION);
    for (const model of ['claude-opus-4-8', 'claude-sonnet-4-6']) {
      assert.equal(inferMaxContext(model, null, usageOf(50_000), betaOn()), ONE_MILLION);
    }
  });

  it('GUARD: beta header does NOT over-claim 1M for non-1M-capable models', () => {
    // haiku title-gen requests carry the same client-level header, but haiku is
    // not 1M-capable, so small usage must stay at the base window.
    assert.equal(getMaxContext('claude-haiku-4-5', null, betaOn()), CLAUDE_BASE);
    assert.equal(inferMaxContext('claude-haiku-4-5', null, usageOf(50_000), betaOn()), CLAUDE_BASE);
    // claude-3 families are not 1M-capable either.
    assert.equal(getMaxContext('claude-3-opus', null, betaOn()), CLAUDE_BASE);
  });

  it('GUARD: no signal at all stays at base (no false 1M)', () => {
    // Cover both the omitted-opts and the empty-opts call shapes.
    assert.equal(getMaxContext('claude-opus-4-8', null), CLAUDE_BASE);
    assert.equal(getMaxContext('claude-opus-4-8', null, {}), CLAUDE_BASE);
    assert.equal(inferMaxContext('claude-opus-4-8', null, usageOf(50_000)), CLAUDE_BASE);
  });

  it('REGRESSION: existing [1m] marker path and usage hatch still work without opts', () => {
    const marker1m = [{ type: 'text', text: 'The exact model ID is claude-opus-4-8[1m].' }];
    assert.equal(getMaxContext('claude-opus-4-8', marker1m), ONE_MILLION);
    // Usage hatch: observed usage above the base window bumps the result to 1M.
    assert.equal(inferMaxContext('claude-haiku-4-5', null, usageOf(260_000)), ONE_MILLION);
    assert.equal(getMaxContext('gpt-5.1-codex', null), CODEX_BASE);
  });

  it('GUARD: beta header is ignored for OpenAI models (no 1M concept there)', () => {
    assert.equal(getMaxContext('gpt-5.1-codex', null, betaOn()), CODEX_BASE);
  });
});
13 changes: 12 additions & 1 deletion test/dashboard-codex-e2e.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ const PROJECT_CWD = path.resolve(__dirname, '..');
const PROJECT_NAME = path.basename(PROJECT_CWD);
const tmpDirs = [];

// Test-side copy of the dashboard's project-label truncation
// (public/miller-columns.js truncateMiddle). The label is the cwd basename, which
// gets long when the suite runs from a git worktree (e.g.
// ".claude/worktrees/<branch>"), so expectations must be built with the same
// truncation the UI applies rather than the raw name.
//
// Strings no longer than `max` are returned unchanged. Longer strings keep
// ceil(max * 0.6) trailing characters and (max - that - 1) leading characters,
// joined by a single "…".
function truncateMiddle(s, max) {
  if (s.length > max) {
    const keepEnd = Math.ceil(max * 0.6);
    const keepStart = max - keepEnd - 1;
    const leading = s.slice(0, keepStart);
    const trailing = s.slice(-keepEnd);
    return `${leading}…${trailing}`;
  }
  return s;
}

function makeOpenAISSE() {
return [
'event: response.created',
Expand Down Expand Up @@ -162,7 +173,7 @@ describe('Codex dashboard status E2E', () => {
};
});

assert.equal(state.projectText, PROJECT_NAME);
assert.equal(state.projectText, truncateMiddle(PROJECT_NAME, 20));
assert.equal(state.sessionText, 'Codex Raw');
assert.match(state.url, /s=codex-raw/);
assert.equal(state.hasOkDot, true);
Expand Down
Loading