From 25468df523e1d428bdbd92238bc98b67bb06cc34 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 10:19:36 +0000 Subject: [PATCH] feat(budget): model-aware startup context budget for profiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a model-aware "startup budget" check so a profile's always-on footprint (skill frontmatter + MCP tool schemas) is warned about when it exceeds 50% of the model's context window — a ~128K ceiling on a 256K model, leaving the other half free for the conversation. - lib/token-budget.ts: pure, tested math — MODEL_CONTEXT_WINDOWS map, resolveContextWindow (profile contextWindow/model → CUE_CONTEXT_WINDOW/CUE_MODEL env → 256K default), estimateMcpTokens, computeContextBudget, formatContextBudgetWarning (silent <80% budget, 🟡 near, 🔴 over). - profiles: optional `model` / `contextWindow` fields (schema.json + _types.ts), resolved leaf-wins through inheritance in profile-loader. core declares contextWindow: 256000 as the fan-out baseline. - cue launch: budget warning in the startup banner (real resolved skill + MCP token data). - cue profile suggest: new "Context budget" section auditing every profile; --model / --context / --load-factor / --no-budget flags. - docs/agent-context-budget.md: documents the budget + flags. - token-budget.test.ts: 20 new tests. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01TwQNpDsSR466euk2d1McHV --- docs/agent-context-budget.md | 34 ++++++ profiles/_types.ts | 14 +++ profiles/core/profile.yaml | 9 ++ profiles/schema.json | 9 ++ src/commands/launch.ts | 18 +++ src/commands/profile-suggest.ts | 128 +++++++++++++++++++++- src/lib/profile-loader.ts | 11 ++ src/lib/token-budget.test.ts | 141 ++++++++++++++++++++++++ src/lib/token-budget.ts | 187 ++++++++++++++++++++++++++++++++ 9 files changed, 550 insertions(+), 1 deletion(-) create mode 100644 src/lib/token-budget.test.ts diff --git a/docs/agent-context-budget.md b/docs/agent-context-budget.md index d881ad98..f02b12cb 100644 --- a/docs/agent-context-budget.md +++ b/docs/agent-context-budget.md @@ -32,6 +32,40 @@ Use `setup/lean-cue.md` for the smallest path. It should install cue and pin `core` by default. Caveman, RTK, skill-writing, memory, gbrain, and Office MCPs are optional add-ons. +## Model-Aware Startup Budget + +A profile's *always-on* footprint — skill frontmatter (loaded into the skill +router every message) plus MCP tool schemas (one set per connected server) — is +paid before the user types anything. The rule cue enforces: that startup load +should stay under **50% of the model's context window**, leaving the other half +as working headroom. For a 256K model that's a **~128K startup ceiling**. + +The math lives in `src/lib/token-budget.ts` (pure, unit-tested in +`token-budget.test.ts`): + +- `resolveContextWindow` picks the window. Precedence: profile `contextWindow` + → profile `model` (via `MODEL_CONTEXT_WINDOWS`) → env `CUE_CONTEXT_WINDOW` + → env `CUE_MODEL` → `DEFAULT_CONTEXT_WINDOW` (256K). cue can't auto-detect the + main-session model (per-launch `/model` choice), hence the declared/env source. +- `estimateMcpTokens` charges `MCP_TOKENS_PER_SERVER` (default 1500, override + with `CUE_MCP_TOKENS_PER_SERVER`) per connected MCP. 
+- `computeContextBudget` returns the budget, the startup load, and whether it + fits. `formatContextBudgetWarning` stays quiet under 80% of budget, prints a + ๐ŸŸก note as it approaches, and a ๐Ÿ”ด over-budget warning past the ceiling. + +Surfaces: + +- `cue launch` prints the warning in the startup banner (real resolved skill + + MCP token data). +- `cue profile suggest` adds a **Context budget** section auditing every + profile; flags `--model `, `--context `, `--load-factor <0..1>`, + and `--no-budget` to skip it. + +Profiles declare their target via the `model` / `contextWindow` fields (both +optional, leaf-wins through inheritance). `core` sets `contextWindow: 256000` as +the fan-out baseline; a long-context profile can bump it (e.g. +`model: claude-opus-4-8[1m]` for a 1M window). + ## Onboarding Source `src/commands/init.ts` controls first-run global onboarding. Keep the first diff --git a/profiles/_types.ts b/profiles/_types.ts index e05c9b08..ca041a82 100644 --- a/profiles/_types.ts +++ b/profiles/_types.ts @@ -43,6 +43,20 @@ export interface Profile { description: string; icon?: string; iconImage?: string; + /** + * Target main-session model id (e.g. "claude-opus-4-8"). Advisory only โ€” cue + * can't pin the main session model (that's a per-launch `/model` choice) โ€” but + * the model-aware startup budget (lib/token-budget.ts) uses it to resolve the + * context window so the over-budget warning knows what window to size against. + * Leaf-wins through inheritance. Env `CUE_MODEL` overrides per launch. + */ + model?: string; + /** + * Explicit context window (tokens) to budget the startup load against, when + * the model id alone isn't enough (custom/long-context deployments). Takes + * precedence over `model`. Leaf-wins; env `CUE_CONTEXT_WINDOW` overrides. + */ + contextWindow?: number; agents?: AgentKind[]; inherits?: string | string[]; // Companion profiles surfaced at `cue use` time as suggestions. 
Activating diff --git a/profiles/core/profile.yaml b/profiles/core/profile.yaml index b93ee9af..ea78d9cd 100644 --- a/profiles/core/profile.yaml +++ b/profiles/core/profile.yaml @@ -2,6 +2,15 @@ name: core icon: "๐Ÿง " iconImage: "logo.png" description: Baseline shared by every cue profile โ€” essentials only +# Startup context budget baseline, fans out to every inheriting profile. +# cue can't pin the main-session model (that's a per-launch /model choice), so +# this just tells the model-aware budget (lib/token-budget.ts) which window to +# size against: the always-on load (skill frontmatter + MCP tool schemas) should +# stay under 50% of it, leaving the other half free for the conversation. At +# 256K that's a ~128K startup ceiling; `cue launch` and `cue profile suggest` +# warn when a profile blows past it. Override per-profile by redeclaring +# `contextWindow`/`model`, or per-launch via CUE_CONTEXT_WINDOW / CUE_MODEL. +contextWindow: 256000 # Cost knob, fans out to every inheriting profile. Surfaced into the runtime # settings.json `env` block by buildClaudeSettings (allowlisted). Pins Task/Agent # subagents (code-reviewer, Explore, file-read/grep/summarize) to Sonnet โ€” diff --git a/profiles/schema.json b/profiles/schema.json index 4b90e4a4..41e7ce11 100644 --- a/profiles/schema.json +++ b/profiles/schema.json @@ -27,6 +27,15 @@ "type": "string", "description": "Path to a PNG/JPG logo, relative to the profile dir. Rendered inline in Kitty terminals via the graphics protocol; falls back to `icon` emoji elsewhere." }, + "model": { + "type": "string", + "description": "Advisory target main-session model id (e.g. 'claude-opus-4-8'). cue can't pin the main session model (per-launch /model choice); the model-aware startup budget uses it to resolve the context window for the over-budget warning. Leaf-wins; env CUE_MODEL overrides." 
+ }, + "contextWindow": { + "type": "integer", + "minimum": 1, + "description": "Explicit context window (tokens) to budget the startup load against, when the model id alone isn't enough. Takes precedence over `model`. Leaf-wins; env CUE_CONTEXT_WINDOW overrides." + }, "bundles": { "type": "array", "items": { "type": "string" }, diff --git a/src/commands/launch.ts b/src/commands/launch.ts index 8060dd1d..e5367a8a 100644 --- a/src/commands/launch.ts +++ b/src/commands/launch.ts @@ -20,6 +20,8 @@ import { configDir } from "../lib/config-paths"; import { debug } from "../lib/debug-log"; import { computeTokenBreakdown, + computeContextBudget, + formatContextBudgetWarning, splitSkillBytes, tokenLevelEmoji, type SkillTokens, @@ -1874,6 +1876,22 @@ export async function run(args: string[]): Promise { const breakdown = computeTokenBreakdown(profile, parts, tokensForSkill); alwaysOnForBadge = breakdown.alwaysOn; const lines = formatTokenWarning(breakdown); + + // Model-aware startup budget: skills frontmatter + MCP tool-schema cost + // should stay under 50% of the model's context window so the other half + // is free for the conversation. Window precedence: profile + // contextWindow/model โ†’ env (CUE_CONTEXT_WINDOW/CUE_MODEL) โ†’ 256K default. 
+ const budget = computeContextBudget({ + skillTokens: breakdown.alwaysOn, + mcpCount: profile.mcps.length, + window: profile.contextWindow, + model: profile.model, + }); + const bc = colorFns(); + lines.push( + ...formatContextBudgetWarning(budget, { yellow: bc.yellow, bold: bc.bold, dim: bc.dim }), + ); + if (lines.length > 0) { process.stderr.write("\n"); for (const l of lines) process.stderr.write(`${l}\n`); diff --git a/src/commands/profile-suggest.ts b/src/commands/profile-suggest.ts index 048a8e8b..7560aa96 100644 --- a/src/commands/profile-suggest.ts +++ b/src/commands/profile-suggest.ts @@ -22,9 +22,16 @@ import { skillFrequency, type ClusterItem, } from "../lib/cluster-skills"; +import { loadProfile } from "../lib/profile-loader"; +import { + computeContextBudget, + resolveContextWindow, + splitSkillBytes, +} from "../lib/token-budget"; const REPO_ROOT = process.env.CUE_REPO_ROOT ?? process.env.SOUL_REPO_ROOT ?? resolve(dirname(fileURLToPath(import.meta.url)), "..", ".."); const PROFILES_DIR = join(REPO_ROOT, "profiles"); +const SKILLS_DIR = join(REPO_ROOT, "resources", "skills", "skills"); const DISCOVER_CACHE = join( process.env.XDG_CONFIG_HOME ?? join(homedir(), ".config"), "cue", "discover", "gems.json", @@ -160,6 +167,105 @@ function reportUnfitGems(minSize: number): void { process.stdout.write(`\n ${dim("โ†’ run `cue discover suggest-profiles` to generate draft profile.yaml files")}\n\n`); } +// --------------------------------------------------------------------------- +// Context-budget audit (model-aware) +// +// Resolve each profile (so inherited core skills/MCPs count) and estimate its +// always-on startup load: skill frontmatter tokens + MCP tool-schema tokens. +// Flag profiles whose load exceeds the 50% startup target for the chosen +// model's context window (default 256K โ†’ ~128K budget). 
+// --------------------------------------------------------------------------- + +interface BudgetOpts { + model?: string; + window?: number; + loadFactor?: number; +} + +const skillTokenCache = new Map(); + +/** Always-on frontmatter tokens for one local skill id, estimated from bytes. */ +function skillFrontmatterTokens(id: string): number { + const cached = skillTokenCache.get(id); + if (cached !== undefined) return cached; + let tokens = 0; + try { + const src = readFileSync(join(SKILLS_DIR, id, "SKILL.md"), "utf8"); + tokens = Math.ceil(splitSkillBytes(src).frontmatter / 4); + } catch { + // Skill not on disk (npx-only or moved) โ€” counts as 0; the resolved + // profile may still list it, but we can't measure what we can't read. + } + skillTokenCache.set(id, tokens); + return tokens; +} + +async function reportContextBudget(profileNames: string[], opts: BudgetOpts): Promise { + const window = resolveContextWindow({ contextWindow: opts.window, model: opts.model }); + const rows: Array<{ name: string; load: number; over: boolean; pct: number; mcps: number }> = []; + + for (const name of profileNames) { + let resolved; + try { + resolved = await loadProfile(name); + } catch { + continue; // malformed / unresolvable โ€” `cue validate` is the right tool. + } + let skillTokens = 0; + for (const s of resolved.skills.local) skillTokens += skillFrontmatterTokens(s.id); + const budget = computeContextBudget({ + skillTokens, + mcpCount: resolved.mcps.length, + window: opts.window ?? resolved.contextWindow, + model: opts.model ?? resolved.model, + loadFactor: opts.loadFactor, + }); + rows.push({ + name, + load: budget.startupLoad, + over: !budget.withinBudget, + pct: budget.pctOfBudget, + mcps: budget.mcpCount, + }); + } + + if (rows.length === 0) { + process.stdout.write(` ${dim("no resolvable profiles to budget")}\n\n`); + return; + } + + const loadFactor = opts.loadFactor && opts.loadFactor > 0 ? 
opts.loadFactor : 0.5; + const budgetTokens = Math.round(window * loadFactor); + const windowK = `${(window / 1000).toFixed(0)}K`; + const budgetK = `${(budgetTokens / 1000).toFixed(0)}K`; + process.stdout.write( + ` ${dim(`window ${windowK} ยท ${Math.round(loadFactor * 100)}% startup target โ†’ budget ~${budgetK} always-on`)}\n\n`, + ); + + const over = rows.filter(r => r.over).sort((a, b) => b.load - a.load); + if (over.length === 0) { + process.stdout.write(` ${dim(`all ${rows.length} profiles fit the ${budgetK} startup budget`)}\n`); + } else { + process.stdout.write(` ${bold(`${over.length} profile(s) over the ${budgetK} startup budget:`)}\n\n`); + for (const r of over) { + const loadK = `${(r.load / 1000).toFixed(1)}K`; + const mcpNote = r.mcps > 0 ? `, ${r.mcps} MCP${r.mcps > 1 ? "s" : ""}` : ""; + process.stdout.write( + ` ๐Ÿ”ด ${r.name} ${dim(`~${loadK} always-on${mcpNote} โ€” ${Math.round(r.pct * 100)}% of budget`)}\n`, + ); + } + } + + // Always show the 3 heaviest so a near-miss is visible before it tips over. 
+ const heaviest = [...rows].sort((a, b) => b.load - a.load).slice(0, 3); + process.stdout.write(`\n ${dim("Heaviest profiles:")}\n`); + for (const r of heaviest) { + const loadK = `${(r.load / 1000).toFixed(1)}K`; + process.stdout.write(` โ€ข ${r.name} ${dim(`(~${loadK}, ${Math.round(r.pct * 100)}% of budget)`)}\n`); + } + process.stdout.write(`\n`); +} + // --------------------------------------------------------------------------- // Tiny ANSI helpers (no dependency) // --------------------------------------------------------------------------- @@ -177,13 +283,17 @@ export async function run(args: string[]): Promise { process.stdout.write(`cue profile suggest โ€” audit profiles/ and propose regroupings Usage: - cue profile suggest Run all three signals (default) + cue profile suggest Run all signals (default) cue profile suggest --no-cluster Skip the discover-cache clustering section + cue profile suggest --no-budget Skip the model-aware context-budget audit Options: --min-profiles Promote-to-core threshold (default: 3) --jaccard <0..1> Merge-candidate threshold (default: 0.5) --min-size Cluster size threshold for unfit gems (default: 3) + --model Model id for the context-budget audit (e.g. claude-opus-4-8) + --context Explicit context window for the budget (overrides --model) + --load-factor <0..1> Fraction of the window allowed at startup (default: 0.5) Output: report only. Nothing is written. `); @@ -197,6 +307,13 @@ Output: report only. Nothing is written. const minSizeIdx = args.indexOf("--min-size"); const minSize = minSizeIdx >= 0 ? parseInt(args[minSizeIdx + 1] ?? "3", 10) : 3; const skipCluster = args.includes("--no-cluster"); + const skipBudget = args.includes("--no-budget"); + const modelIdx = args.indexOf("--model"); + const model = modelIdx >= 0 ? args[modelIdx + 1] : undefined; + const contextIdx = args.indexOf("--context"); + const contextWindow = contextIdx >= 0 ? parseInt(args[contextIdx + 1] ?? 
"", 10) : undefined; + const loadFactorIdx = args.indexOf("--load-factor"); + const loadFactor = loadFactorIdx >= 0 ? parseFloat(args[loadFactorIdx + 1] ?? "") : undefined; const profileSkills = readProfileSkills(); const total = Object.keys(profileSkills).length; @@ -218,6 +335,15 @@ Output: report only. Nothing is written. reportUnfitGems(minSize); } + if (!skipBudget) { + process.stdout.write(`${bold("4. Context budget (model-aware)")}\n\n`); + await reportContextBudget(Object.keys(profileSkills), { + model, + window: Number.isFinite(contextWindow) ? contextWindow : undefined, + loadFactor: Number.isFinite(loadFactor) ? loadFactor : undefined, + }); + } + process.stdout.write(`${dim("(report-only โ€” review and edit profiles/*/profile.yaml by hand)")}\n`); return 0; } diff --git a/src/lib/profile-loader.ts b/src/lib/profile-loader.ts index a2c1a271..c8f83e3d 100644 --- a/src/lib/profile-loader.ts +++ b/src/lib/profile-loader.ts @@ -444,6 +444,10 @@ function foldChain(chain: Profile[]): ResolvedProfile { description: child.description, icon: child.icon ?? acc.icon, iconImage: child.iconImage ?? acc.iconImage, + // Budget hints are leaf-wins: a child that declares its own model / + // context window overrides the parent; otherwise it inherits. + model: child.model ?? acc.model, + contextWindow: child.contextWindow ?? acc.contextWindow, // agents: arrays merge by dedupe; if neither parent nor child declares // agents we fall back to the default at the end. agents: dedupePrimitiveArray( @@ -504,6 +508,8 @@ function normalizeToResolved(p: Profile, chain: string[]): ResolvedProfile { description: p.description, icon: p.icon, iconImage: p.iconImage, + model: p.model, + contextWindow: p.contextWindow, agents: p.agents && p.agents.length > 0 ? 
[...p.agents] : [], inherits: p.inherits, skills: { @@ -599,6 +605,9 @@ function foldComposite(selector: string, parts: ResolvedProfile[]): ResolvedProf description: parts.map((p) => p.description).join(" + "), icon: parts.find((p) => p.icon)?.icon, iconImage: parts.find((p) => p.iconImage)?.iconImage, + // First part that declares a budget hint wins for the composite. + model: parts.find((p) => p.model)?.model, + contextWindow: parts.find((p) => p.contextWindow)?.contextWindow, agents: [...head.agents] as ResolvedProfile["agents"], inherits: undefined, skills: { local: [...head.skills.local], npx: [...head.skills.npx] }, @@ -635,6 +644,8 @@ function foldComposite(selector: string, parts: ResolvedProfile[]): ResolvedProf description: acc.description, icon: acc.icon ?? next.icon, iconImage: acc.iconImage ?? next.iconImage, + model: acc.model ?? next.model, + contextWindow: acc.contextWindow ?? next.contextWindow, agents: dedupePrimitiveArray(acc.agents, next.agents) as ResolvedProfile["agents"], inherits: undefined, skills: { diff --git a/src/lib/token-budget.test.ts b/src/lib/token-budget.test.ts new file mode 100644 index 00000000..90d13f66 --- /dev/null +++ b/src/lib/token-budget.test.ts @@ -0,0 +1,141 @@ +import { describe, expect, test } from "bun:test"; + +import { + DEFAULT_CONTEXT_WINDOW, + DEFAULT_LOAD_FACTOR, + MCP_TOKENS_PER_SERVER, + MODEL_CONTEXT_WINDOWS, + computeContextBudget, + estimateMcpTokens, + formatContextBudgetWarning, + resolveContextWindow, +} from "./token-budget"; + +describe("resolveContextWindow", () => { + test("explicit contextWindow wins over everything", () => { + expect( + resolveContextWindow({ + contextWindow: 64_000, + model: "claude-opus-4-8[1m]", + env: { CUE_CONTEXT_WINDOW: "999000" }, + }), + ).toBe(64_000); + }); + + test("model id maps to its window", () => { + expect(resolveContextWindow({ model: "claude-opus-4-8" })).toBe(256_000); + expect(resolveContextWindow({ model: "claude-opus-4-8[1m]" })).toBe(1_000_000); + }); + + 
test("env CUE_CONTEXT_WINDOW used when no profile signal", () => { + expect(resolveContextWindow({ env: { CUE_CONTEXT_WINDOW: "300000" } })).toBe(300_000); + }); + + test("env CUE_MODEL used after CUE_CONTEXT_WINDOW", () => { + expect(resolveContextWindow({ env: { CUE_MODEL: "claude-sonnet-4-6" } })).toBe(256_000); + }); + + test("falls back to the 256K default", () => { + expect(resolveContextWindow({ env: {} })).toBe(DEFAULT_CONTEXT_WINDOW); + expect(DEFAULT_CONTEXT_WINDOW).toBe(256_000); + }); + + test("ignores non-positive / unparseable overrides", () => { + expect(resolveContextWindow({ contextWindow: 0, env: {} })).toBe(DEFAULT_CONTEXT_WINDOW); + expect(resolveContextWindow({ env: { CUE_CONTEXT_WINDOW: "nonsense" } })).toBe(DEFAULT_CONTEXT_WINDOW); + expect(resolveContextWindow({ env: { CUE_CONTEXT_WINDOW: "-5" } })).toBe(DEFAULT_CONTEXT_WINDOW); + }); + + test("unknown model id falls through to default", () => { + expect(resolveContextWindow({ model: "gpt-imaginary", env: {} })).toBe(DEFAULT_CONTEXT_WINDOW); + }); +}); + +describe("estimateMcpTokens", () => { + test("scales linearly with server count", () => { + expect(estimateMcpTokens(0)).toBe(0); + expect(estimateMcpTokens(4)).toBe(4 * MCP_TOKENS_PER_SERVER); + }); + + test("negative counts clamp to zero", () => { + expect(estimateMcpTokens(-3)).toBe(0); + }); + + test("honours a custom per-server estimate", () => { + expect(estimateMcpTokens(2, 1000)).toBe(2000); + }); +}); + +describe("computeContextBudget", () => { + test("256K model โ†’ 128K budget; small load is within budget", () => { + const b = computeContextBudget({ skillTokens: 12_000, mcpCount: 4, model: "claude-opus-4-8" }); + expect(b.window).toBe(256_000); + expect(b.budget).toBe(128_000); + expect(b.mcpTokens).toBe(4 * MCP_TOKENS_PER_SERVER); + expect(b.startupLoad).toBe(12_000 + 4 * MCP_TOKENS_PER_SERVER); + expect(b.withinBudget).toBe(true); + expect(b.overBy).toBe(0); + }); + + test("flags over-budget when skills + MCPs exceed half the 
window", () => { + const b = computeContextBudget({ skillTokens: 130_000, mcpCount: 2, model: "claude-opus-4-8" }); + expect(b.withinBudget).toBe(false); + expect(b.overBy).toBeGreaterThan(0); + expect(b.overBy).toBe(b.startupLoad - b.budget); + expect(b.pctOfWindow).toBeGreaterThan(0.5); + }); + + test("default load factor is 50%", () => { + expect(DEFAULT_LOAD_FACTOR).toBe(0.5); + const b = computeContextBudget({ skillTokens: 0, mcpCount: 0, window: 200_000 }); + expect(b.budget).toBe(100_000); + }); + + test("custom load factor changes the ceiling", () => { + const b = computeContextBudget({ skillTokens: 0, mcpCount: 0, window: 256_000, loadFactor: 0.25 }); + expect(b.budget).toBe(64_000); + }); + + test("a tighter window can push a profile over budget", () => { + const within = computeContextBudget({ skillTokens: 90_000, mcpCount: 0, model: "claude-opus-4-8" }); + expect(within.withinBudget).toBe(true); + const tight = computeContextBudget({ skillTokens: 90_000, mcpCount: 0, window: 128_000 }); + expect(tight.budget).toBe(64_000); + expect(tight.withinBudget).toBe(false); + }); +}); + +describe("formatContextBudgetWarning", () => { + test("silent well under budget", () => { + const b = computeContextBudget({ skillTokens: 5_000, mcpCount: 1, model: "claude-opus-4-8" }); + expect(formatContextBudgetWarning(b)).toEqual([]); + }); + + test("soft ๐ŸŸก note when approaching the budget", () => { + // ~85% of a 128K budget: 100K skills + ~9K MCP. 
+ const b = computeContextBudget({ skillTokens: 100_000, mcpCount: 6, model: "claude-opus-4-8" }); + const lines = formatContextBudgetWarning(b); + expect(lines.length).toBe(1); + expect(lines[0]).toContain("๐ŸŸก"); + }); + + test("๐Ÿ”ด over-budget warning past the ceiling", () => { + const b = computeContextBudget({ skillTokens: 140_000, mcpCount: 2, model: "claude-opus-4-8" }); + const lines = formatContextBudgetWarning(b); + expect(lines.length).toBe(2); + expect(lines[0]).toContain("๐Ÿ”ด"); + expect(lines.join(" ")).toContain("256K"); + }); + + test("mentions MCP count and works without color helpers", () => { + const b = computeContextBudget({ skillTokens: 140_000, mcpCount: 3, model: "claude-opus-4-8" }); + expect(formatContextBudgetWarning(b)[0]).toContain("3 MCPs"); + }); +}); + +describe("MODEL_CONTEXT_WINDOWS", () => { + test("known Claude models are present", () => { + expect(MODEL_CONTEXT_WINDOWS["claude-opus-4-8"]).toBe(256_000); + expect(MODEL_CONTEXT_WINDOWS["claude-sonnet-4-6"]).toBe(256_000); + }); +}); diff --git a/src/lib/token-budget.ts b/src/lib/token-budget.ts index 6f95018e..30b561a3 100644 --- a/src/lib/token-budget.ts +++ b/src/lib/token-budget.ts @@ -109,3 +109,190 @@ export function tokenLevelEmoji(alwaysOn: number): "๐Ÿ”ด" | "๐ŸŸ " | "๐ŸŸก" | " : alwaysOn > 5000 ? "๐ŸŸก" : "๐ŸŸข"; } + +// --------------------------------------------------------------------------- +// Model-aware startup budget +// +// The always-on footprint (skill frontmatter + MCP tool schemas) is paid on +// every single message, before the user has typed anything. The guidance we +// enforce here: at session start a profile should consume no more than HALF +// the model's context window, leaving the other half as working headroom for +// the actual task. For a 256K model that's a ~128K startup ceiling. 
// ---------------------------------------------------------------------------

/**
 * Fallback context window (tokens) when neither the profile nor the env says
 * which model is in play. cue can't auto-detect the main-session model (it's a
 * per-launch `/model` choice), so 256K is the conservative baseline.
 */
export const DEFAULT_CONTEXT_WINDOW = 256_000;

/**
 * Fraction of the window a profile may occupy at startup. Half the window
 * keeps the other half free for the conversation.
 */
export const DEFAULT_LOAD_FACTOR = 0.5;

/**
 * Rough always-on cost of one MCP server, in tokens. Every connected server
 * injects its tool-schema definitions into the system prompt on every message.
 * Real cost varies widely by server (a 2-tool server vs codegraph's dozens),
 * so this is a deliberately conservative per-server heuristic; override with
 * CUE_MCP_TOKENS_PER_SERVER when you have a measured number (the env var is
 * read by `computeContextBudget`).
 */
export const MCP_TOKENS_PER_SERVER = 1500;

/**
 * Known model → context-window (tokens). Keyed on the exact model ids cue
 * surfaces in its `/model` hints. The `[1m]` long-context Opus variant is the
 * one outlier; everything else defaults to the standard 256K window.
 */
export const MODEL_CONTEXT_WINDOWS: Record<string, number> = {
  "claude-opus-4-8": 256_000,
  "claude-opus-4-8[1m]": 1_000_000,
  "claude-opus-4-7": 256_000,
  "claude-opus-4-6": 256_000,
  "claude-sonnet-4-6": 256_000,
  "claude-haiku-4-5": 256_000,
  "claude-fable-5": 256_000,
};

/** Shape of an environment map (`process.env`-compatible). */
type EnvLike = Record<string, string | undefined>;

/**
 * Coerce a number or numeric string to a finite, strictly positive number.
 * Returns `undefined` for anything else (missing, empty, NaN, Infinity, <= 0)
 * so callers can chain fallbacks with `??`.
 */
function toPositiveFinite(value: number | string | undefined): number | undefined {
  if (value === undefined) return undefined;
  const n = typeof value === "number" ? value : Number(value);
  return Number.isFinite(n) && n > 0 ? n : undefined;
}

/** Clamp a token/server count to a finite, non-negative number (else 0). */
function toCount(value: number): number {
  return Number.isFinite(value) && value > 0 ? value : 0;
}

/**
 * Look up a model id in MODEL_CONTEXT_WINDOWS. Uses an own-property check so
 * ids that collide with Object.prototype members ("constructor", "toString",
 * "__proto__") are treated as unknown instead of leaking a non-number.
 */
function lookupModelWindow(model: string | undefined): number | undefined {
  if (!model) return undefined;
  const id = model.trim();
  if (!Object.prototype.hasOwnProperty.call(MODEL_CONTEXT_WINDOWS, id)) return undefined;
  return toPositiveFinite(MODEL_CONTEXT_WINDOWS[id]);
}

/**
 * Resolve the context window to budget against. Precedence (first hit wins):
 *   1. explicit `contextWindow` (e.g. the profile's declared field)
 *   2. `model` looked up in MODEL_CONTEXT_WINDOWS
 *   3. env `CUE_CONTEXT_WINDOW` (a raw token count)
 *   4. env `CUE_MODEL` looked up in MODEL_CONTEXT_WINDOWS
 *   5. DEFAULT_CONTEXT_WINDOW (256K)
 * Non-positive / non-finite / unparseable values and unknown model ids are
 * ignored so a bad override can't drive the budget to zero.
 *
 * NOTE(review): the profile field docs in this patch describe CUE_MODEL /
 * CUE_CONTEXT_WINDOW as per-launch overrides, but with this precedence a
 * declared profile value always wins over the env — confirm which is intended.
 *
 * @param opts.contextWindow explicit window in tokens
 * @param opts.model model id to look up
 * @param opts.env environment map; defaults to `process.env`
 * @returns the resolved window in tokens (always finite and > 0)
 */
export function resolveContextWindow(opts: {
  contextWindow?: number;
  model?: string;
  env?: Record<string, string | undefined>;
} = {}): number {
  const env: EnvLike = opts.env ?? process.env;
  return (
    toPositiveFinite(opts.contextWindow) ??
    lookupModelWindow(opts.model) ??
    toPositiveFinite(env.CUE_CONTEXT_WINDOW) ??
    lookupModelWindow(env.CUE_MODEL) ??
    DEFAULT_CONTEXT_WINDOW
  );
}

/**
 * Estimate the always-on token cost of `mcpCount` connected MCP servers.
 * Non-positive or non-finite counts / per-server costs yield 0 rather than a
 * negative or NaN load.
 */
export function estimateMcpTokens(
  mcpCount: number,
  perServer: number = MCP_TOKENS_PER_SERVER,
): number {
  return toCount(mcpCount) * toCount(perServer);
}

export interface ContextBudget {
  /** Model context window in tokens. */
  window: number;
  /** Fraction of the window allowed at startup (0..1). */
  loadFactor: number;
  /** Token ceiling = window * loadFactor. */
  budget: number;
  /** Always-on skill frontmatter tokens. */
  skillTokens: number;
  /** Estimated always-on MCP tool-schema tokens. */
  mcpTokens: number;
  /** Number of MCP servers counted. */
  mcpCount: number;
  /** Total always-on startup load = skillTokens + mcpTokens. */
  startupLoad: number;
  /** True when the startup load is at or under the budget. */
  withinBudget: boolean;
  /** Tokens over the budget (0 when within). */
  overBy: number;
  /** startupLoad / window. */
  pctOfWindow: number;
  /** startupLoad / budget. */
  pctOfBudget: number;
}

/**
 * Compute the model-aware startup budget for a profile. Callers pass the
 * measured skill frontmatter total and the MCP count; the window is resolved
 * via `resolveContextWindow`.
 *
 * The per-server MCP estimate is resolved as: explicit `mcpTokensPerServer`
 * → env `CUE_MCP_TOKENS_PER_SERVER` (positive number) → MCP_TOKENS_PER_SERVER.
 * When `env` is omitted, `process.env` is consulted for both the window and
 * the per-server override.
 *
 * @param input.skillTokens always-on skill frontmatter tokens (non-finite or negative → 0)
 * @param input.mcpCount number of connected MCP servers (non-finite or negative → 0)
 * @param input.window explicit context window in tokens
 * @param input.model model id used to look up the window
 * @param input.loadFactor fraction of the window allowed at startup; non-positive or non-finite → DEFAULT_LOAD_FACTOR
 * @param input.mcpTokensPerServer explicit per-server token estimate
 * @param input.env environment map; defaults to `process.env`
 */
export function computeContextBudget(input: {
  skillTokens: number;
  mcpCount: number;
  window?: number;
  model?: string;
  loadFactor?: number;
  mcpTokensPerServer?: number;
  env?: Record<string, string | undefined>;
}): ContextBudget {
  const env: EnvLike = input.env ?? process.env;
  const window = resolveContextWindow({
    contextWindow: input.window,
    model: input.model,
    env,
  });
  const loadFactor = toPositiveFinite(input.loadFactor) ?? DEFAULT_LOAD_FACTOR;
  const budget = Math.round(window * loadFactor);

  const skillTokens = toCount(input.skillTokens);
  const mcpCount = toCount(input.mcpCount);
  // An explicit argument (including 0) beats the env override, which beats the default.
  const perServer =
    input.mcpTokensPerServer ??
    toPositiveFinite(env.CUE_MCP_TOKENS_PER_SERVER) ??
    MCP_TOKENS_PER_SERVER;
  const mcpTokens = estimateMcpTokens(mcpCount, perServer);

  const startupLoad = skillTokens + mcpTokens;
  return {
    window,
    loadFactor,
    budget,
    skillTokens,
    mcpTokens,
    mcpCount,
    startupLoad,
    withinBudget: startupLoad <= budget,
    overBy: Math.max(0, startupLoad - budget),
    pctOfWindow: window > 0 ? startupLoad / window : 0,
    pctOfBudget: budget > 0 ? startupLoad / budget : 0,
  };
}

/** Render a token count as "12.3K" (one decimal below 100K, none at or above). */
const fmtK = (n: number): string => `${(n / 1000).toFixed(n >= 100_000 ? 0 : 1)}K`;

/**
 * Format the model-aware budget block for the CLI. Returns `[]` when the
 * profile sits comfortably under the budget (< 80%), a soft 🟡 note when it's
 * approaching it, and a 🔴 over-budget warning past the ceiling. The `color`
 * helpers are injected so this stays free of any terminal/ANSI dependency.
 *
 * @param b budget as produced by `computeContextBudget`
 * @param color optional styling callbacks; identity functions by default
 * @returns zero, one (🟡) or two (🔴) lines, without trailing newlines
 */
export function formatContextBudgetWarning(
  b: ContextBudget,
  color: { yellow: (s: string) => string; bold: (s: string) => string; dim: (s: string) => string } = {
    yellow: (s) => s,
    bold: (s) => s,
    dim: (s) => s,
  },
): string[] {
  // Quiet until the profile is within striking distance of the ceiling. The
  // withinBudget check keeps an over-budget profile loud even when the budget
  // rounds to 0 (pctOfBudget is reported as 0 in that case).
  if (b.withinBudget && b.pctOfBudget < 0.8) return [];

  const windowK = fmtK(b.window);
  const budgetK = fmtK(b.budget);
  const loadK = fmtK(b.startupLoad);
  const targetPct = Math.round(b.loadFactor * 100);
  const mcpNote = b.mcpCount > 0 ? ` + ${b.mcpCount} MCP${b.mcpCount > 1 ? "s" : ""}` : "";

  if (b.withinBudget) {
    return [
      `🟡 Context budget: ${color.yellow(`~${loadK}`)} always-on (skills${mcpNote}) — ` +
        `${Math.round(b.pctOfBudget * 100)}% of the ${targetPct}% startup target for a ${windowK} model (budget ~${budgetK}).`,
    ];
  }

  const overK = fmtK(b.overBy);
  const pct = Math.round(b.pctOfWindow * 100);
  return [
    `🔴 Context budget: ${color.yellow(`~${loadK}`)} always-on (skills${mcpNote}) — ` +
      `${color.bold(`over the ${targetPct}% startup target`)} for a ${windowK} model ` +
      `(budget ~${budgetK}, over by ~${overK}).`,
    ` ${color.dim(`You're at ${pct}% of the window before the first message. Trim skills/MCPs, launch a narrower stack, or raise the window (CUE_CONTEXT_WINDOW).`)}`,
  ];
}