From 5b67df192dcfe1c1015119195794985fadfadd58 Mon Sep 17 00:00:00 2001
From: chauncygu <gshangd@163.com>
Date: Fri, 5 Jun 2026 09:52:47 -0700
Subject: [PATCH] feat: user-controllable token/cost budgets with tight
 enforcement + auto-save
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a friendly UX layer on top of the existing quota.py engine so users can
cap spend per session or per day, in tokens or USD, and never lose work when a
cap is hit.

- /budget command (commands/core.py): view usage vs each cap as colored bars +
  %, or set one — /budget $5 (cost), /budget 200k (tokens, parses 200k/1.5m),
  /budget daily $20, /budget clear. --budget $5 / --budget 200k startup flag.
- One budget per scope: a new cap REPLACES the other unit for that scope, so
  switching tokens<->USD just works and no stale cap silently keeps blocking.
- Tight enforcement (no surprise overshoot): check_quota projects the next
  request's input and stops BEFORE the call if it would cross the cap;
  quota.output_room clamps that call's max_tokens to the remaining headroom — a
  tool-heavy turn can't blow 40k->49k past the budget anymore.
- Proximity warnings at >=80% (yellow) / >=95% (red) end-of-turn.
- On hit: agent yields QuotaPause (carrying which cap broke: key/scope/unit/
  limit); the REPL auto-saves the session (session_latest.json + daily backup,
  the path /resume reads) and prints next steps. The "raise it" hint matches the
  breached unit (token cap -> /budget 40k, daily cost -> /budget daily $40).

quota.py: parse_budget / fmt_amount / usage_vs_limits / warnings / output_room,
projected check_quota, QuotaExceeded carries the breached cap. agent.py:
QuotaPause + projection + output clamp. cheetahclaws.py: /budget registration,
QuotaPause handling, --budget, near-limit warnings. 42-case tests/test_budget.py
(isolated quota dir). Docs: README, features.md, reference.md, news.md.
Defaults stay unlimited (interactive); daemon serve-mode guardrails unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                                |   3 +-
 agent.py                                 |  41 +++-
 cheetahclaws.py                          |  62 +++++-
 commands/core.py                         |  80 +++++++
 docs/guides/features.md                  |   1 +
 docs/guides/reference.md                 |   7 +
 docs/news.md                             |   3 +-
 quota.py                                 | 192 +++++++++++++++--
 tests/fixtures/golden_default_prompt.txt |   1 +
 tests/test_budget.py                     | 254 +++++++++++++++++++++++
 10 files changed, 617 insertions(+), 27 deletions(-)
 create mode 100644 tests/test_budget.py

diff --git a/README.md b/README.md
index d775769..83684e3 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,8 @@ Other install methods: [pip install](#alternative-install-with-pip) | [uv instal
 
 ## 🔥🔥🔥 News (Pacific Time)
 
-- June 5, 2026 (latest, **v3.05.82**): **Adaptive Markdown streaming — live output stays correct on every device** by auto-selecting a per-device tier (`live` in-place redraw on capable terminals incl. modern SSH emulators, append-only `commit` for SSH/Apple Terminal/pipes/CJK text so frames never duplicate, `plain` fallback); also ships a visual `/context` usage grid and a 1M context window for `deepseek-v4-flash`. Details: [docs/guides/features.md](docs/guides/features.md) · [docs/news.md](docs/news.md).
+- June 5, 2026 (latest, **v3.05.82**): **User-controllable token/cost budgets** — `/budget $5` / `/budget 200k` / `/budget daily $20` cap spend per session or per day, enforced before each model call; on hit the session auto-saves and you're shown how to `/resume` or raise the cap and continue (warns at ≥80%/95%; `--budget` sets it at startup). Details: [docs/guides/features.md](docs/guides/features.md) · [docs/news.md](docs/news.md).
+- June 5, 2026: **Adaptive Markdown streaming — live output stays correct on every device** by auto-selecting a per-device tier (`live` in-place redraw on capable terminals incl. modern SSH emulators, append-only `commit` for SSH/Apple Terminal/pipes/CJK text so frames never duplicate, `plain` fallback); also ships a visual `/context` usage grid and a 1M context window for `deepseek-v4-flash`. Details: [docs/guides/features.md](docs/guides/features.md) · [docs/news.md](docs/news.md).
 - June 4, 2026 (**v3.05.81**): **Claude-Code-style quiet output** hides per-tool execution and shows one summary line per turn (on by default), with a live spinner timer + token estimate and a `✻ Worked for…` footer; `/verbose` overrides, toggle with `/quiet`. Details: [docs/guides/features.md](docs/guides/features.md) · [docs/news.md](docs/news.md).
 - June 4, 2026: **Context-window override** — `/config context_window=<N>` sets the context length that drives the prompt `%`, `/context`, the compaction trigger, and the output cap consistently (distinct from `max_tokens`; read live, no restart). Details: [docs/guides/reference.md](docs/guides/reference.md) · [docs/news.md](docs/news.md).
 - June 4, 2026: **Rich Live streaming** keeps long responses live via a bounded tail window — redrawing only the most recent screenful and committing the full output when done, fixing duplicate/stale frames (builds on PR #133). Details: [docs/guides/features.md](docs/guides/features.md) · [docs/news.md](docs/news.md).
diff --git a/agent.py b/agent.py
index 673685e..5ad4909 100644
--- a/agent.py
+++ b/agent.py
@@ -57,6 +57,19 @@ class PermissionRequest:
     description: str
     granted: bool = False
 
+@dataclass
+class QuotaPause:
+    """Yielded when a configured budget is reached, instead of making a billable
+    call. The REPL auto-saves the session and tells the user how to resume or
+    raise the budget. ``usage`` is the snapshot from quota.get_usage(); the
+    key/scope/unit/limit identify which cap broke so the hint targets it."""
+    reason: str  # QuotaExceeded.reason; the REPL prints it after "Budget reached — "
+    usage: dict = field(default_factory=dict)  # quota.get_usage(session_id) at the moment the cap tripped
+    key: str | None = None  # QuotaExceeded.key of the breached cap
+    scope: str | None = None  # QuotaExceeded.scope; "daily" makes the REPL hint read "/budget daily …"
+    unit: str | None = None  # QuotaExceeded.unit, handed to quota.fmt_amount (REPL falls back to "tok")
+    limit: float | None = None  # QuotaExceeded.limit; the REPL suggests twice this as the new cap
+
 
 # ── Agent loop ─────────────────────────────────────────────────────────────
 
@@ -149,12 +162,34 @@ def run(
                       removed=_before_len - len(state.messages))
 
         # ── Quota check — before spending tokens ──────────────────────────
+        # Project this request's INPUT so a single large (tool-heavy) call can't
+        # blow past the cap, then clamp the OUTPUT cap to the remaining headroom
+        # so the response can't either — keeping the overshoot near zero.
+        _proj_tokens, _proj_cost = 0, 0.0
+        _call_config = config
+        if any(config.get(k) for k in ("session_token_budget", "session_cost_budget",
+                                       "daily_token_budget", "daily_cost_budget")):
+            try:
+                from compaction import estimate_tokens as _est_tok
+                from providers import calc_cost as _calc_cost
+                _proj_tokens = (_est_tok(state.messages)
+                                + _est_tok([{"role": "system", "content": system_prompt}]))
+                _proj_cost = _calc_cost(config["model"], _proj_tokens, 0)
+            except Exception:
+                _proj_tokens, _proj_cost = 0, 0.0
         try:
-            _quota.check_quota(session_id, config)
+            _quota.check_quota(session_id, config,
+                               projected_tokens=_proj_tokens, projected_cost=_proj_cost)
         except _quota.QuotaExceeded as qe:
             _log.warn("quota_exceeded", session_id=session_id, reason=qe.reason)
-            yield TextChunk(f"\n[Quota exceeded — {qe.reason}]\n")
+            yield QuotaPause(qe.reason, _quota.get_usage(session_id),
+                             key=qe.key, scope=qe.scope, unit=qe.unit, limit=qe.limit)
             break
+        _room = _quota.output_room(session_id, config, _proj_tokens, _proj_cost)
+        if _room is not None:
+            _cur_cap = config.get("max_tokens") or 4096
+            if _room < _cur_cap:
+                _call_config = {**config, "max_tokens": max(256, int(_room))}
 
         # NIM-only: when build.nvidia.com rate-limits a model, cycle to
         # the next free-tier model before consuming a regular retry. Capped
@@ -177,7 +212,7 @@ def run(
                     system=system_prompt,
                     messages=state.messages,
                     tool_schemas=get_tool_schemas(),
-                    config=config,
+                    config=_call_config,
                 ):
                     if isinstance(event, (TextChunk, ThinkingChunk)):
                         yield event
diff --git a/cheetahclaws.py b/cheetahclaws.py
index 5fdd5da..4c1587d 100755
--- a/cheetahclaws.py
+++ b/cheetahclaws.py
@@ -23,6 +23,7 @@
   /history    Print conversation history
   /context    Show context window usage
   /cost       Show API cost this session
+  /budget     View or set token/cost budgets (session + daily)
   /status     Show current session status (model, mode, tokens, cost)
   /verbose    Toggle verbose mode
   /quiet      Toggle compact tool display (hide execution, show per-turn summary)
@@ -239,7 +240,7 @@ def __getattr__(self, name):
 
 # ── Core commands ──────────────────────────────────────────────────────────
 from commands.core import (
-    cmd_help, cmd_clear, cmd_context, cmd_cost, cmd_compact,
+    cmd_help, cmd_clear, cmd_context, cmd_cost, cmd_budget, cmd_compact,
     cmd_init, cmd_export, cmd_copy, cmd_status, cmd_doctor,
     cmd_proactive, cmd_image, cmd_circuit, cmd_web, run_setup_wizard,
 )
@@ -452,6 +453,7 @@ def _proactive_watcher_loop(config):
     "search":      cmd_search,
     "context":     cmd_context,
     "cost":        cmd_cost,
+    "budget":      cmd_budget,
     "verbose":     cmd_verbose,
     "quiet":       cmd_quiet,
     "thinking":    cmd_thinking,
@@ -615,6 +617,7 @@ def handle_slash(line: str, state, config) -> Union[bool, tuple]:
     "search":      ("Search past sessions",               []),
     "context":     ("Visualize context-window usage by category", []),
     "cost":        ("Show cost estimate",                 []),
+    "budget":      ("View or set token/cost budgets (session + daily)", ["session", "daily", "clear"]),
     "verbose":     ("Toggle verbose output",              []),
     "quiet":       ("Toggle compact tool display",        []),
     "thinking":    ("Toggle extended thinking",           []),
@@ -895,7 +898,7 @@ def _headless_run_query(prompt: str, is_background: bool = False) -> None:
 def repl(config: dict, initial_prompt: str = None):
     from cc_config import HISTORY_FILE
     from context import build_system_prompt
-    from agent import AgentState, run, TextChunk, ThinkingChunk, ToolStart, ToolEnd, TurnDone, PermissionRequest
+    from agent import AgentState, run, TextChunk, ThinkingChunk, ToolStart, ToolEnd, TurnDone, PermissionRequest, QuotaPause
 
     if HAS_PROMPT_TOOLKIT:
         # Inject live providers so ui.input's completer enumerates the same
@@ -1101,6 +1104,7 @@ def run_query(user_input: str, is_background: bool = False):
             turn_start = time.monotonic()
             turn_in_tokens = 0
             turn_out_tokens = 0
+            quota_paused = False    # set when a budget is reached mid-turn
             streamed_chars = 0
 
             # Rebuild system prompt each turn (picks up cwd changes, etc.)
@@ -1251,6 +1255,38 @@ def run_query(user_input: str, is_background: bool = False):
                                 f"\n  [tokens: +{event.input_tokens} in / "
                                 f"+{event.output_tokens} out]", "dim"
                             ))
+
+                    elif isinstance(event, QuotaPause):
+                        # A configured budget was reached BEFORE making the next
+                        # (billable) call. Auto-save so nothing is lost, then tell
+                        # the user how to resume or raise the budget and continue.
+                        _stop_tool_spinner()
+                        spinner_shown = False
+                        flush_response()
+                        quota_paused = True
+                        print()
+                        print(clr(f"  ⛔ Budget reached — {event.reason}", "yellow", "bold"))
+                        # save_latest() prints the saved paths itself — only report a failure.
+                        try:
+                            from commands.session import save_latest
+                            save_latest("", state, config)
+                        except Exception as _save_err:  # best-effort, but never fail silently
+                            warn(f"  Auto-save failed: {_save_err}")
+                        # Suggest raising the cap that actually broke, in its own
+                        # unit/scope — a token cap can't be lifted with a $ amount.
+                        try:
+                            import quota as _q
+                            _pre = "daily " if event.scope == "daily" else ""
+                            _amt = _q.fmt_amount((event.limit or 0) * 2, event.unit or "tok")
+                            _raise_cmd = f"/budget {_pre}{_amt}" if event.limit else "/budget 40k"
+                        except Exception:
+                            _raise_cmd = "/budget 40k"
+                        print(clr("  To continue:", "bold"))
+                        print("    • raise it:   " + clr(_raise_cmd, "cyan")
+                              + "  (or " + clr("/budget clear", "cyan") + "), then resend your message")
+                        print("    • later:      restart and run " + clr("/resume", "cyan")
+                              + " to pick up where you left off")
+                        print("    • view usage: " + clr("/budget", "cyan"))
             except KeyboardInterrupt:
                 _stop_tool_spinner()
                 flush_response()
@@ -1285,6 +1321,15 @@ def run_query(user_input: str, is_background: bool = False):
             if quiet:
                 print_turn_stats(time.monotonic() - turn_start,
                                  turn_in_tokens, turn_out_tokens)
+            # Budget proximity warnings (≥80% / ≥95%) — heads-up before the hard
+            # stop arrives. Skipped when this turn already hit the cap.
+            if not quota_paused:
+                try:
+                    import quota as _quota
+                    for _level, _msg in _quota.warnings(config.get("_session_id", "default"), config):
+                        (err if _level == "crit" else warn)(f"  ⚠ Budget: {_msg} — /budget to view")
+                except Exception:
+                    pass
             print(clr("╰──────────────────────────────────────────────", "dim"))
             print()
 
@@ -1912,6 +1957,10 @@ def main():
                         help="Show each tool call instead of a per-turn summary")
     parser.add_argument("--thinking", action="store_true",
                         help="Enable extended thinking")
+    parser.add_argument("--budget", metavar="AMOUNT",
+                        help="Session budget cap, e.g. --budget $5 (cost) or "
+                             "--budget 200k (tokens). Auto-saves and prompts to "
+                             "resume / raise when reached.")
     parser.add_argument("--version", action="store_true", help="Print version")
     parser.add_argument("--setup", action="store_true", help="Run interactive setup wizard")
     parser.add_argument("--web", action="store_true",
@@ -1994,6 +2043,15 @@ def main():
         config["quiet"] = False
     if args.thinking:
         config["thinking"] = True
+    if getattr(args, "budget", None):
+        import quota as _quota
+        try:
+            _kind, _val = _quota.parse_budget(args.budget)
+            config[_quota.BUDGET_KEYS[(_kind, "session")]] = _val
+            config[_quota.BUDGET_KEYS[("tokens" if _kind == "cost" else "cost", "session")]] = None  # one cap per scope, as /budget does
+            print(clr(f"  Session {'cost' if _kind == 'cost' else 'token'} budget: {_quota.fmt_amount(_val, 'usd' if _kind == 'cost' else 'tok')}", "dim"))
+        except ValueError as _e:
+            warn(f"--budget: {_e} (e.g. --budget $5 or --budget 200k); ignoring.")
 
     # ── Setup wizard: --setup flag or first-run auto-trigger ─────────────
     from cc_config import CONFIG_FILE
diff --git a/commands/core.py b/commands/core.py
index fd57e52..413bf0c 100644
--- a/commands/core.py
+++ b/commands/core.py
@@ -221,6 +221,86 @@ def cmd_cost(_args: str, state, config) -> bool:
     return True
 
 
+def _budget_bar(pct: float | None, width: int = 16) -> str:
+    """Draw ``pct`` (None counts as 0) as a ``width``-cell bar of █ then ░."""
+    solid = max(0, min(width, int(round((pct or 0) / 100 * width))))
+    return "".join(("█" * solid, "░" * (width - solid)))
+
+
+def cmd_budget(args: str, state, config) -> bool:
+    """View or set token / cost budgets (session + daily); always returns True.
+
+    /budget                 show usage vs every budget (bars + %)
+    /budget $5              session cost cap (the $ means USD)
+    /budget 200k            session token cap (supports 200k / 1.5m / 200000)
+    /budget daily $20       daily cost cap   ·   /budget daily 2m  daily tokens
+    /budget clear           remove all caps  ·   /budget daily clear  that scope only
+    """
+    import quota as _quota
+    from cc_config import save_config
+
+    arg = args.strip()
+    sid = config.get("_session_id", "default")
+
+    # ── view ────────────────────────────────────────────────────────────────
+    if not arg:
+        rows = _quota.usage_vs_limits(sid, config)
+        print(clr("  Token Budget", "bold"))
+        any_set = False
+        for r in rows:
+            used = _quota.fmt_amount(r["used"], r["unit"])
+            if r["limit"] is None:
+                print(f"  {r['label']:<15} {used:>9}  " + clr("unlimited", "dim"))
+                continue
+            any_set = True
+            lim = _quota.fmt_amount(r["limit"], r["unit"])
+            pct = r["pct"] or 0
+            color = "red" if pct >= 95 else ("yellow" if pct >= 80 else "green")
+            print(f"  {r['label']:<15} {used:>9} / {lim:<9} "
+                  f"{clr(_budget_bar(pct), color)} {pct:4.0f}%")
+        print()
+        if any_set:
+            info("  Change: /budget $5 · /budget 200k · /budget daily $20 · /budget clear")
+        else:
+            info("  No budgets set (unlimited). Set one: /budget $5 · /budget 200k · /budget daily $20")
+        return True
+
+    # ── optional scope prefix (parsed first so "clear" can be scoped too) ─────
+    parts = arg.split()
+    scope = parts[0].lower() if parts[0].lower() in ("session", "daily") else None
+    rest = " ".join(parts[1:]) if scope else arg
+    # ── clear: every cap, or only the named scope's two caps ──────────────────
+    if rest.lower() in ("clear", "off", "none", "reset", "unlimited"):
+        keys = ([_quota.BUDGET_KEYS[(unit, scope)] for unit in ("tokens", "cost")]
+                if scope else _quota.BUDGET_KEYS.values())
+        for key in keys:
+            config[key] = None
+        save_config(config)
+        ok(f"{scope.capitalize() if scope else 'All'} budgets cleared (unlimited).")
+        return True
+
+    # ── set ───────────────────────────────────────────────────────────────────
+    scope = scope or "session"  # a bare amount targets the session cap
+    if not rest.strip():
+        err("Usage: /budget [session|daily] <amount>  —  e.g. /budget $5  ·  /budget daily 2m")
+        return True
+    try:
+        kind, value = _quota.parse_budget(rest)
+    except ValueError as e:
+        err(f"{e}. Examples: /budget $5 (cost) · /budget 200k (tokens) · /budget daily $20")
+        return True
+    config[_quota.BUDGET_KEYS[(kind, scope)]] = value
+    # One budget per scope: drop that scope's cap in the other unit so a leftover can't keep blocking.
+    config[_quota.BUDGET_KEYS[("tokens" if kind == "cost" else "cost", scope)]] = None
+    save_config(config)
+    shown = _quota.fmt_amount(value, "usd" if kind == "cost" else "tok")
+    ok(f"{scope.capitalize()} budget set to {shown} "
+       f"({'cost' if kind == 'cost' else 'tokens'}).")
+    info(f"Replaces any previous {scope} cap. Checked before each model call; "
+         "auto-saves and shows how to resume when reached.")
+    return True
+
+
 def cmd_compact(args: str, state, config) -> bool:
     """Manually compact conversation history."""
     from compaction import manual_compact
diff --git a/docs/guides/features.md b/docs/guides/features.md
index 2f228df..49e9260 100644
--- a/docs/guides/features.md
+++ b/docs/guides/features.md
@@ -55,5 +55,6 @@ and indexed in the [README Documentation section](../../README.md#documentation)
 | Cloud sync | `/cloudsave` syncs sessions to private GitHub Gists; auto-sync on exit; load from cloud by Gist ID. No new dependencies (stdlib `urllib`). |
 | Extended Thinking | Toggle on/off for Claude models; native `<think>` block streaming for local Ollama reasoning models (deepseek-r1, qwen3, gemma4) |
 | Cost tracking | Token usage + estimated USD cost |
+| Token / cost budgets | `/budget` sets and views spend caps — per-session or per-day, in tokens or USD (`/budget $5`, `/budget 200k`, `/budget daily $20`, `/budget clear`; or `--budget $5` at startup). **One budget per scope**: a new cap replaces the other unit for that scope (so switching tokens↔USD just works, no stale cap left blocking). Enforced before each model call, and **tight** — it projects the next request's input and clamps its output cap, so a single tool-heavy turn can't overshoot the budget. Warns at ≥80%/95%. When a cap is hit the session is **auto-saved** and you're shown how to `/resume` later or raise the **same** cap (the hint matches the breached unit) and continue — nothing is lost. Backed by `quota.py`; the daemon ships conservative defaults (200k tok / $2 per session) in `serve` mode. |
 | Non-interactive mode | `--print` flag for scripting / CI |
 | **Web UI** | `--web` opens the browser. Multi-user accounts (bcrypt + JWT), SQLite-persisted history, session CRUD + markdown export, light/dark/system theme, `/health` + `/metrics`, auto-picks a free port if 8080 is busy. `pip install 'cheetahclaws[web]'`. See [web-ui.md](web-ui.md). |
diff --git a/docs/guides/reference.md b/docs/guides/reference.md
index 7faa5a0..86da3ca 100644
--- a/docs/guides/reference.md
+++ b/docs/guides/reference.md
@@ -14,6 +14,8 @@ Options:
   --show-tools         Show each tool call instead of a per-turn summary
                        (alias: --no-quiet; default is the compact summary)
   --thinking           Enable Extended Thinking (Claude only)
+  --budget AMOUNT      Session budget cap: --budget $5 (cost) or --budget 200k
+                       (tokens). Auto-saves and prompts to resume / raise on hit.
   --version            Print version and exit
   -h, --help           Show help
 ```
@@ -64,6 +66,7 @@ Type `/` and press **Tab** to see all commands with descriptions. Continue typin
 | `/history` | Print full conversation history |
 | `/context` | Visualize context-window usage as a Claude-Code-style cell grid, broken down by category (system prompt, system tools, memory files, skills, messages, free space) with per-category token counts and percentages. Honors a `context_window` override; falls back to `#`/`.` when the terminal isn't UTF-8. |
 | `/cost` | Show token usage and estimated USD cost |
+| `/budget` | View or set token/cost budgets. No args = show usage vs each budget (bars + %). `/budget $5` = session cost cap (USD); `/budget 200k` = session token cap (supports `200k`/`1.5m`); `/budget daily $20` / `/budget daily 2m` = daily caps; `/budget clear` = remove all. **One budget per scope** — a new cap *replaces* the other unit for that scope (so `/budget $5` after `/budget 200k` switches the session cap to cost, it doesn't stack). Enforced before each model call (projects the next request's input + clamps its output, so overshoot stays ≈ 0); warns at ≥80%/95%; on hit, auto-saves the session and prints how to `/resume` or raise the **same** cap (the hint matches the breached unit) and continue. Backed by the `session_token_budget` / `session_cost_budget` / `daily_token_budget` / `daily_cost_budget` config keys. |
 | `/verbose` | Toggle verbose mode (tokens + thinking) |
 | `/quiet` | Toggle compact tool display — hide per-tool execution lines and show one summary line per turn (on by default; `/verbose` overrides it) |
 | `/thinking` | Toggle Extended Thinking (Claude only) |
@@ -330,6 +333,10 @@ Keys are saved to `~/.cheetahclaws/config.json` and loaded automatically on next
   "quiet": true,
   "thinking": false,
   "stream_mode": null,
+  "session_token_budget": null,
+  "session_cost_budget": null,
+  "daily_token_budget": null,
+  "daily_cost_budget": null,
   "qwen_api_key": "sk-...",
   "kimi_api_key": "sk-...",
   "deepseek_api_key": "sk-...",
diff --git a/docs/news.md b/docs/news.md
index 503e200..28994ef 100644
--- a/docs/news.md
+++ b/docs/news.md
@@ -3,7 +3,8 @@
 ## 🔥🔥🔥 News (Pacific Time)
 
 
-- June 5, 2026 (**v3.05.82**) (latest): **Adaptive Markdown streaming — live output that stays correct on every device.** In-place Rich Live redraw is great on capable terminals but breaks elsewhere: it was disabled wholesale over SSH (so SSH users got raw tokens with no formatting), and where it *did* run it could leave **duplicate or stale frames** — on macOS Terminal (which can't erase above the scroll boundary), over laggy network PTYs, or with **wide CJK / emoji text** whose display width a naive line-count gets wrong. The renderer now selects a **streaming tier per device** in `ui.render.auto_stream_mode(config)`: **`live`** — full in-place redraw, only on terminals known to handle cursor-up (local TTYs, and modern emulators *even over SSH*: iTerm2, WezTerm, Windows Terminal, VSCode, kitty, Alacritty, Ghostty, detected via `TERM_PROGRAM` / `TERM` / `WT_SESSION` / `KITTY_WINDOW_ID` / `ALACRITTY_WINDOW_ID` / `WEZTERM_PANE`); **`commit`** — **append-only progressive Markdown**, the safe default for unknown-SSH / Apple Terminal / pipes / non-TTY, where each completed block (split on blank lines, respecting open code fences so a fenced block renders atomically) is rendered and printed **permanently** and the cursor is **never moved**, making a duplicate frame structurally impossible regardless of terminal, latency, or character width; **`plain`** — raw tokens, only when `rich` is unavailable. The append-only floor is provably duplication-free; `live` is progressive enhancement on top. Override with **`/config stream_mode=live|commit|plain`** (legacy boolean **`/config rich_live=true|false`** still works → `live`/`commit`). 
Implemented in `ui/render.py` (`set_stream_mode` / `auto_stream_mode` / `_safe_commit_point` / `_commit_stream` / `_commit_flush`), wired in at REPL start in `cheetahclaws.py`, with a 26-case test suite in `tests/test_stream_modes.py` (device routing, code-fence-aware block boundaries, append-only commit, and a regression asserting commit mode emits **zero** cursor sequences even on a TTY with CJK text). Two related UX items shipped alongside: **`/context` is now a visual grid** — a Claude-Code-style 20×10 cell grid of context-window usage, colored and broken down by category (system prompt / system tools / memory files / skills / messages / free space) with per-category token counts and percentages, adapting to the model's real context window and falling back to `#`/`.` on non-UTF-8 terminals (`commands/core.py:cmd_context`); and **`deepseek-v4-flash` is registered at its 1M context window** in `providers._MODEL_CONTEXT_LIMITS` (overriding the 128K deepseek provider default, which still applies to `deepseek-chat` / `deepseek-v4-pro`), so the prompt `%`, `/context`, and the compaction trigger all reflect the true 1M window. See [docs/guides/features.md](guides/features.md) · [docs/guides/reference.md](guides/reference.md).
+- June 5, 2026 (**v3.05.82**) (latest): **User-controllable token / cost budgets — set a spend cap; on hit the session auto-saves and you can resume or raise it.** The quota engine (`quota.py`: per-session + per-day token/cost counters, enforced before each model call) already existed but had no friendly surface — you had to know four config keys (`session_token_budget` / `session_cost_budget` / `daily_token_budget` / `daily_cost_budget`) and there was no way to see how close you were, no warning before the wall, and the hard stop printed a bare `[Quota exceeded]`. This adds the UX layer on top of the unchanged engine: a **`/budget`** command — no args shows usage vs every budget as colored bars + percentages; **`/budget $5`** sets a session **cost** cap (the `$` means USD), **`/budget 200k`** a session **token** cap (parses `200k` / `1.5m` / `200000`), **`/budget daily $20`** / **`/budget daily 2m`** the daily caps, and **`/budget clear`** removes all. A **`--budget $5`** / **`--budget 200k`** startup flag sets the session cap at launch. **Proximity warnings** fire at the end of any turn that crosses **≥80%** (yellow) / **≥95%** (red) of a cap, so the wall never arrives by surprise. **On hit** the agent now yields a `QuotaPause` event (instead of a plain text line): the REPL **auto-saves the session** (`session_latest.json` + daily backup, the same path `/resume` reads) and prints a friendly next-steps block — raise the **same** cap or remove it (`/budget clear`) then resend, or restart later and `/resume`. So a long task that runs out of budget is never lost: you analyze, adjust, and continue. 
**Tight enforcement (no surprise overshoot):** the check projects the next request's *input* (`compaction.estimate_tokens`) and stops *before* the call if it would cross the cap, and clamps that call's `max_tokens` to the remaining headroom (`quota.output_room`) — so a single tool-heavy turn can't blow 40k→49k past the budget the way a pure "already-spent ≥ limit" check let it. **One budget per scope:** setting a cap *replaces* the other unit for that scope (`/budget $5` after `/budget 200k` switches the session cap to cost rather than stacking), so a leftover token cap can't silently keep blocking after you switch to a `$` cap. **Unit-matched hint:** `QuotaExceeded` / `QuotaPause` carry which cap broke (`key`/`scope`/`unit`/`limit`), so the "raise it" suggestion is in the *right* unit — a token cap shows `/budget 40k`, a daily cost cap shows `/budget daily $40` — instead of a generic `$` amount that wouldn't lift a token cap. New helpers `quota.parse_budget` / `fmt_amount` / `usage_vs_limits` / `warnings` / `output_room`; command in `commands/core.py:cmd_budget`; `QuotaPause` in `agent.py`; REPL handling + `--budget` in `cheetahclaws.py`; 42-case `tests/test_budget.py` (isolated quota dir, incl. a regression that the hint matches the breached unit and that switching units clears the stale cap). The daemon's conservative `serve`-mode defaults (200k tok / $2 per session, 2M / $20 per day) are unchanged — interactive stays unlimited by default, the server stays guard-railed. See [docs/guides/features.md](guides/features.md) · [docs/guides/reference.md](guides/reference.md).
+- June 5, 2026 (**v3.05.82**): **Adaptive Markdown streaming — live output that stays correct on every device.** In-place Rich Live redraw is great on capable terminals but breaks elsewhere: it was disabled wholesale over SSH (so SSH users got raw tokens with no formatting), and where it *did* run it could leave **duplicate or stale frames** — on macOS Terminal (which can't erase above the scroll boundary), over laggy network PTYs, or with **wide CJK / emoji text** whose display width a naive line-count gets wrong. The renderer now selects a **streaming tier per device** in `ui.render.auto_stream_mode(config)`: **`live`** — full in-place redraw, only on terminals known to handle cursor-up (local TTYs, and modern emulators *even over SSH*: iTerm2, WezTerm, Windows Terminal, VSCode, kitty, Alacritty, Ghostty, detected via `TERM_PROGRAM` / `TERM` / `WT_SESSION` / `KITTY_WINDOW_ID` / `ALACRITTY_WINDOW_ID` / `WEZTERM_PANE`); **`commit`** — **append-only progressive Markdown**, the safe default for unknown-SSH / Apple Terminal / pipes / non-TTY, where each completed block (split on blank lines, respecting open code fences so a fenced block renders atomically) is rendered and printed **permanently** and the cursor is **never moved**, making a duplicate frame structurally impossible regardless of terminal, latency, or character width; **`plain`** — raw tokens, only when `rich` is unavailable. The append-only floor is provably duplication-free; `live` is progressive enhancement on top. Override with **`/config stream_mode=live|commit|plain`** (legacy boolean **`/config rich_live=true|false`** still works → `live`/`commit`). 
Implemented in `ui/render.py` (`set_stream_mode` / `auto_stream_mode` / `_safe_commit_point` / `_commit_stream` / `_commit_flush`), wired in at REPL start in `cheetahclaws.py`, with a 26-case test suite in `tests/test_stream_modes.py` (device routing, code-fence-aware block boundaries, append-only commit, and a regression asserting commit mode emits **zero** cursor sequences even on a TTY with CJK text). Two related UX items shipped alongside: **`/context` is now a visual grid** — a Claude-Code-style 20×10 cell grid of context-window usage, colored and broken down by category (system prompt / system tools / memory files / skills / messages / free space) with per-category token counts and percentages, adapting to the model's real context window and falling back to `#`/`.` on non-UTF-8 terminals (`commands/core.py:cmd_context`); and **`deepseek-v4-flash` is registered at its 1M context window** in `providers._MODEL_CONTEXT_LIMITS` (overriding the 128K deepseek provider default, which still applies to `deepseek-chat` / `deepseek-v4-pro`), so the prompt `%`, `/context`, and the compaction trigger all reflect the true 1M window. See [docs/guides/features.md](guides/features.md) · [docs/guides/reference.md](guides/reference.md).
 - June 4, 2026 (**v3.05.81**): **Claude-Code-style quiet output — hide tool execution, show one summary line per turn.** Long analysis turns used to scroll the terminal with a `⚙ Bash(...)` line and a `✓ → N lines (… chars)` line for *every* tool call, and the permission prompt dumped the entire inline script (e.g. a 60-line `python3 << 'PYEOF'` heredoc). A new **quiet mode (on by default)** suppresses the per-tool lines — the spinner conveys live activity and a single summary line is emitted at the tool→text boundary, sitting just above the reply (`Read 2 files, ran 3 shell commands`), the way Claude Code does. Errors and denials still surface so a mid-turn failure is never silent. In quiet mode the **permission prompt also collapses** a multi-line command to one line (`Run: python3 << 'PYEOF'  … (+59 行)`) instead of printing the whole script. `/verbose` overrides quiet (full per-tool lines + inputs + token counts); toggle with **`/quiet`**, or launch with **`--show-tools`** (alias `--no-quiet`). The startup banner gains an **`Output: quiet` / `Output: full`** line so the active mode is visible at a glance. **Live status line:** the spinner now shows elapsed time plus a running output-token estimate (`Thinking… (7s · ↓ 435 tokens)`) — char-based, since providers only report real usage at the end — and each quiet turn closes with a real-usage footer **`✻ Worked for 7.2s · ↑ 1.2k · ↓ 435`** built from the true `TurnDone` counts. Implemented in `ui/render.py` (turn-level tool accumulator + `turn_summary_line()`, spinner token meter, `print_turn_stats()`), wired through the REPL event loop in `cheetahclaws.py`, with the `/quiet` toggle in `commands/config_cmd.py`. See [docs/guides/features.md](guides/features.md).
 - June 4, 2026: **Context-window override — the prompt % and compaction now follow a settable context length.** The prompt's context-usage `%` (and the compaction trigger) derive from the model's context window, which previously could only be a hardcoded provider default — and `max_tokens` (the OUTPUT cap) doesn't change it, so `/config max_tokens=…` left the `%` unchanged (a common point of confusion). New per-session key **`context_window`** (`/config context_window=<N>`, `0` = model default) overrides it, kept deliberately distinct from `max_tokens`. A single parser (`providers.context_window_override`) feeds the prompt `%`, `/context`, the compaction trigger, **and** the per-call output-token cap, so all four stay consistent; it is bidirectional — a smaller value forces earlier compaction, a larger value corrects a stale default. The value is read live each prompt, so switching model **or** `context_window` updates the `%` with no restart. `/config` warns when the value exceeds the model's real window (which would disable compaction and let the API reject oversized prompts). No-op when unset, so existing behavior is unchanged. See [docs/guides/reference.md](guides/reference.md).
 - June 4, 2026: **Rich Live streaming — long responses stay live via a bounded tail window.** Large streamed responses that would overflow the terminal's redraw area could leave duplicate or stale frames behind on some emulators (macOS Terminal, etc.), because Rich Live redraws the whole accumulated output in place and the cursor can't reach content that has scrolled into the scrollback. Building on the per-response fallback from PR #133, Rich Live now keeps the live region **bounded to the viewport**: a short response is shown in full, but once it would overflow, only the **last screenful of rendered lines (a tail window) is redrawn** — so the Live region can never exceed the terminal and cannot leave stale frames. The complete output is committed once when the response finishes (including on Ctrl-C, since the REPL flushes on interrupt), so the head that scrolled out of the window is never lost. Plain streaming is kept only as a safety net (precise render failed, or the terminal is too small to bound a window). A cheap per-line wrap estimate short-circuits the expensive full `render_lines()` measurement while a response stays well under the limit, so normal responses pay no extra Markdown re-render per chunk. Adds focused tests covering full-frame streaming, the full→tail transition, tail-window commit-on-flush, real `Segments` rendering, and both safety-net fallbacks. See [docs/guides/features.md](guides/features.md).
diff --git a/quota.py b/quota.py
index d10ba5e..428c2b8 100644
--- a/quota.py
+++ b/quota.py
@@ -23,10 +23,18 @@
 
 
 class QuotaExceeded(Exception):
-    """Raised before an API call when a configured budget would be exceeded."""
-    def __init__(self, reason: str):
+    """Raised before an API call when a configured budget would be exceeded.
+
+    Carries which cap broke (``key`` / ``scope`` / ``unit`` / ``limit``) so the
+    REPL can suggest raising *that* cap in the right unit instead of a generic,
+    possibly-wrong hint."""
+    def __init__(self, reason: str, *, key=None, scope=None, unit=None, limit=None):
         super().__init__(reason)
         self.reason = reason
+        self.key = key          # config key, e.g. "session_token_budget"
+        self.scope = scope      # "session" | "daily"
+        self.unit = unit        # "tok" | "usd"
+        self.limit = limit      # the breached limit value
 
 
 # ── In-memory counters (per session, reset on session end) ─────────────────
@@ -73,10 +81,17 @@ def _save_daily(tokens: int, cost: float) -> None:
 
 # ── Public API ─────────────────────────────────────────────────────────────
 
-def check_quota(session_id: str, config: dict) -> None:
+def check_quota(session_id: str, config: dict,
+                projected_tokens: int = 0, projected_cost: float = 0.0) -> None:
     """
-    Raise QuotaExceeded if any configured limit has already been reached.
+    Raise QuotaExceeded if any configured limit is (or would be) reached.
     Call this BEFORE making an API request.
+
+    ``projected_tokens`` / ``projected_cost`` estimate the pending request's
+    INPUT. When given, the cap also fires if the *next* call would cross it —
+    stopping before the (billable) call instead of letting one large tool-heavy
+    turn overshoot the budget. With both at 0 the behaviour is the original
+    "already spent ≥ limit" check, so existing callers are unaffected.
     """
     lim_st = config.get("session_token_budget") or 0
     lim_sc = config.get("session_cost_budget")  or 0.0
@@ -92,22 +107,66 @@ def check_quota(session_id: str, config: dict) -> None:
         sc = _sess_cost.get(session_id, 0.0)
         dt, dc = _load_daily()
 
-    if lim_st and st >= lim_st:
-        raise QuotaExceeded(
-            f"Session token budget reached ({st:,}/{lim_st:,} tokens)"
-        )
-    if lim_sc and sc >= lim_sc:
-        raise QuotaExceeded(
-            f"Session cost budget reached (${sc:.4f}/${lim_sc:.4f})"
-        )
-    if lim_dt and dt >= lim_dt:
-        raise QuotaExceeded(
-            f"Daily token budget reached ({dt:,}/{lim_dt:,} tokens)"
-        )
-    if lim_dc and dc >= lim_dc:
-        raise QuotaExceeded(
-            f"Daily cost budget reached (${dc:.4f}/${lim_dc:.4f})"
-        )
+    pt = max(0, int(projected_tokens or 0))
+    pc = max(0.0, float(projected_cost or 0.0))
+
+    # For each cap: a hard stop when already reached, else a pre-call stop when
+    # the projected next request would cross it (overshoot stays ≈ 0). Each raise
+    # tags which cap broke so the REPL can suggest raising it in the right unit.
+    specs = [
+        ("session_token_budget", "session", "tok", lim_st, st, pt),
+        ("session_cost_budget",  "session", "usd", lim_sc, sc, pc),
+        ("daily_token_budget",   "daily",   "tok", lim_dt, dt, pt),
+        ("daily_cost_budget",    "daily",   "usd", lim_dc, dc, pc),
+    ]
+    for key, scope, unit, lim, used, proj in specs:
+        if not lim:
+            continue
+        label = f"{scope.capitalize()} {'token' if unit == 'tok' else 'cost'} budget"
+        if unit == "tok":
+            caps = f"{lim:,}"
+            reached_amt, proj_amt, tail = f"{used:,}", f"{used + proj:,}", " tokens"
+        else:
+            caps = f"${lim:.4f}"
+            reached_amt, proj_amt, tail = f"${used:.4f}", f"${used + proj:.4f}", ""
+        if used >= lim:
+            raise QuotaExceeded(f"{label} reached ({reached_amt}/{caps}{tail})",
+                                key=key, scope=scope, unit=unit, limit=lim)
+        if used + proj >= lim:
+            raise QuotaExceeded(f"{label} would be exceeded by the next request "
+                                f"(~{proj_amt}/{caps}{tail})",
+                                key=key, scope=scope, unit=unit, limit=lim)
+
+
+def output_room(session_id: str, config: dict,
+                projected_tokens: int = 0, projected_cost: float = 0.0) -> int | None:
+    """Return how many output tokens the pending call may emit before the tightest
+    configured budget is hit, after subtracting current usage and the projected
+    input. ``None`` when no cap constrains the output; otherwise never negative.
+    Cost caps are converted to tokens via the model's output price (``COSTS``)."""
+    usage = get_usage(session_id)
+    in_tokens = max(0, int(projected_tokens or 0))
+    in_cost = max(0.0, float(projected_cost or 0.0))
+    headrooms: list[int] = []
+    for scope in ("session", "daily"):
+        cap = config.get(f"{scope}_token_budget")
+        if cap:
+            headrooms.append(int(cap) - usage[f"{scope}_tokens"] - in_tokens)
+    try:
+        from providers import COSTS, bare_model
+        _in_price, out_price = COSTS.get(bare_model(config.get("model", "")), (0.0, 0.0))
+    except Exception:
+        out_price = 0.0   # best-effort lookup: without a price, cost caps don't clamp
+    # Cost caps: remaining dollars → tokens, scaling the price by 1_000_000.
+    if out_price and out_price > 0:
+        for scope in ("session", "daily"):
+            cap = config.get(f"{scope}_cost_budget")
+            if cap:
+                dollars_left = float(cap) - usage[f"{scope}_cost"] - in_cost
+                headrooms.append(int(dollars_left * 1_000_000 / out_price))
+    if not headrooms:
+        return None
+    return max(0, min(headrooms))
 
 
 def record_usage(session_id: str, model: str, in_tokens: int, out_tokens: int) -> None:
@@ -152,3 +211,96 @@ def reset_session(session_id: str) -> None:
     with _lock:
         _sess_tokens.pop(session_id, None)
         _sess_cost.pop(session_id, None)
+
+
+# ── User-facing helpers (for the /budget command, --budget flag, warnings) ──
+
+# Maps a parsed budget kind+scope to its config key.
+BUDGET_KEYS = {
+    ("tokens", "session"): "session_token_budget",
+    ("cost",   "session"): "session_cost_budget",
+    ("tokens", "daily"):   "daily_token_budget",
+    ("cost",   "daily"):   "daily_cost_budget",
+}
+
+
+def parse_budget(s: str) -> tuple[str, float]:
+    """Parse a human budget string into ``(kind, value)``.
+
+    Cost (``kind="cost"``) when prefixed ``$`` or suffixed ``usd``/``$`` (``$5``,
+    ``5usd`` → ``("cost", 5.0)``); otherwise tokens with optional ``k``/``m`` suffix
+    (``200k`` → ``("tokens", 200000)``). Raises ``ValueError`` on empty, unparseable,
+    non-positive, non-finite (``nan``/``inf``) or rounds-to-zero input.
+    """
+    raw = s.strip().lower().replace(",", "").replace(" ", "")
+    if not raw:
+        raise ValueError("empty budget")
+    is_cost = False
+    if raw.startswith("$"):
+        is_cost, raw = True, raw[1:]
+    elif raw.endswith("usd"):
+        is_cost, raw = True, raw[:-3]
+    elif raw.endswith("$"):
+        is_cost, raw = True, raw[:-1]
+    mult = {"k": 1_000, "m": 1_000_000}.get(raw[-1:], 1)
+    if mult != 1:
+        raw = raw[:-1]
+    try:
+        num = float(raw) * mult
+    except ValueError:
+        raise ValueError(f"can't parse budget: {s!r}") from None
+    finite = 0 < num < float("inf")    # False for ≤ 0, NaN and inf ("nan", "1e999")
+    value = (round(num, 4) if is_cost else int(num)) if finite else 0
+    if not value:    # a stored 0 reads as "unlimited", so reject "0.4" / "$0.00001"
+        raise ValueError("budget must be a positive number")
+    return ("cost" if is_cost else "tokens", value)
+
+
+def fmt_amount(value: float, unit: str) -> str:
+    """Compact amount: ``$1.83`` when ``unit == "usd"``, else tokens as ``540``/``124k``/``2m``."""
+    if unit == "usd":
+        return f"${value:,.2f}"
+    value = int(value)
+    if value >= 999_950:     # 999.95k+ would print as "1000k" at one decimal → use m
+        return f"{value / 1_000_000:.1f}m".replace(".0m", "m")
+    if value >= 1_000:
+        return f"{value / 1_000:.1f}k".replace(".0k", "k")
+    return str(value)
+
+
+def usage_vs_limits(session_id: str, config: dict) -> list[dict]:
+    """Build one row per budget (session/daily × cost/tokens) for display.
+
+    Every row is ``{key, label, scope, unit, used, limit, pct}``. A missing or
+    falsy config value means unlimited: ``limit`` and ``pct`` are then ``None``.
+    """
+    usage = get_usage(session_id)
+    table = (
+        ("session_cost_budget",  "Session cost",   "session", "usd", usage["session_cost"]),
+        ("session_token_budget", "Session tokens", "session", "tok", usage["session_tokens"]),
+        ("daily_cost_budget",    "Daily cost",     "daily",   "usd", usage["daily_cost"]),
+        ("daily_token_budget",   "Daily tokens",   "daily",   "tok", usage["daily_tokens"]),
+    )
+    result = []
+    for cfg_key, title, scope, unit, spent in table:
+        cap = config.get(cfg_key) or None      # 0 / missing → unlimited
+        percent = None if cap is None else spent / cap * 100
+        result.append(dict(key=cfg_key, label=title, scope=scope, unit=unit,
+                           used=spent, limit=cap, pct=percent))
+    return result
+
+
+def warnings(session_id: str, config: dict) -> list[tuple[str, str]]:
+    """List ``(level, message)`` pairs for budgets that are close to their cap:
+    ``warn`` from 80% and ``crit`` from 95%, skipping unlimited and already
+    exhausted (≥100%) budgets. The REPL shows these ahead of the hard stop."""
+    alerts: list[tuple[str, str]] = []
+    for row in usage_vs_limits(session_id, config):
+        pct = row["pct"]
+        if row["limit"] and pct is not None and not (pct >= 100 or pct < 80):
+            severity = "crit" if pct >= 95 else "warn"
+            spent = fmt_amount(row["used"], row["unit"])
+            cap = fmt_amount(row["limit"], row["unit"])
+            alerts.append((severity,
+                           f"{row['label']} at {pct:.0f}% ({spent} / {cap})"))
+    return alerts
diff --git a/tests/fixtures/golden_default_prompt.txt b/tests/fixtures/golden_default_prompt.txt
index 9b8a42a..2ce586b 100644
--- a/tests/fixtures/golden_default_prompt.txt
+++ b/tests/fixtures/golden_default_prompt.txt
@@ -109,6 +109,7 @@ These commands the **user** can invoke at the REPL prompt — they are NOT tools
 - `/agent` `[start | stop | list | status | templates]` — Autonomous agent loop (task templates)
 - `/agents` — Show background agents
 - `/brainstorm` — Multi-persona AI debate + auto tasks
+- `/budget` `[session | daily | clear]` — View or set token/cost budgets (session + daily)
 - `/checkpoint` `[clear]` — List / restore checkpoints
 - `/circuit` `[status | reset]` — Show / reset per-provider circuit breakers
 - `/clear` — Clear conversation history
diff --git a/tests/test_budget.py b/tests/test_budget.py
new file mode 100644
index 0000000..2cc5e09
--- /dev/null
+++ b/tests/test_budget.py
@@ -0,0 +1,254 @@
+"""Tests for the user-facing token/cost budget feature.
+
+Covers the quota helpers (parse_budget / fmt_amount / usage_vs_limits / warnings /
+output_room), the projected check_quota, the /budget command (view / set / clear),
+and the QuotaPause event the agent yields when a budget is reached.
+"""
+import pytest
+
+import quota
+
+
+@pytest.fixture(autouse=True)
+def _clean_session(tmp_path, monkeypatch):
+    # Isolate the on-disk daily counter so tests never read or pollute the real
+    # ~/.cheetahclaws/quota/ file (matches test_quota.py's approach).
+    monkeypatch.setattr(quota, "_quota_dir", lambda: tmp_path)
+    quota.reset_session("t")
+    yield
+    quota.reset_session("t")
+
+
+# ── parse_budget ────────────────────────────────────────────────────────────
+
+@pytest.mark.parametrize("text,expected", [
+    ("$5",      ("cost", 5.0)),
+    ("$5.50",   ("cost", 5.5)),
+    ("5usd",    ("cost", 5.0)),
+    ("10$",     ("cost", 10.0)),
+    ("200k",    ("tokens", 200_000)),
+    ("1.5m",    ("tokens", 1_500_000)),
+    ("200000",  ("tokens", 200_000)),
+    ("2,000",   ("tokens", 2_000)),
+    (" 200K ",  ("tokens", 200_000)),
+])
+def test_parse_budget_ok(text, expected):
+    assert quota.parse_budget(text) == expected
+
+
+@pytest.mark.parametrize("bad", ["", "abc", "-3", "$0", "0", "$-1", "k"])
+def test_parse_budget_rejects(bad):
+    with pytest.raises(ValueError):
+        quota.parse_budget(bad)
+
+
+def test_budget_keys_mapping():
+    assert quota.BUDGET_KEYS[("cost", "session")] == "session_cost_budget"
+    assert quota.BUDGET_KEYS[("tokens", "session")] == "session_token_budget"
+    assert quota.BUDGET_KEYS[("cost", "daily")] == "daily_cost_budget"
+    assert quota.BUDGET_KEYS[("tokens", "daily")] == "daily_token_budget"
+
+
+# ── fmt_amount ──────────────────────────────────────────────────────────────
+
+def test_fmt_amount():
+    assert quota.fmt_amount(5, "usd") == "$5.00"
+    assert quota.fmt_amount(1.834, "usd") == "$1.83"
+    assert quota.fmt_amount(124_000, "tok") == "124k"
+    assert quota.fmt_amount(2_000_000, "tok") == "2m"
+    assert quota.fmt_amount(540, "tok") == "540"
+
+
+# ── usage_vs_limits ─────────────────────────────────────────────────────────
+
+def test_usage_vs_limits_unlimited_by_default():
+    rows = quota.usage_vs_limits("t", {})
+    assert {r["key"] for r in rows} == set(quota.BUDGET_KEYS.values())
+    assert all(r["limit"] is None and r["pct"] is None for r in rows)
+
+
+def test_usage_vs_limits_computes_pct():
+    with quota._lock:
+        quota._sess_tokens["t"] = 50_000
+    rows = quota.usage_vs_limits("t", {"session_token_budget": 200_000})
+    row = next(r for r in rows if r["key"] == "session_token_budget")
+    assert row["used"] == 50_000
+    assert row["limit"] == 200_000
+    assert row["pct"] == pytest.approx(25.0)
+
+
+# ── warnings (80% warn / 95% crit / 100% hard-stop, no warn) ─────────────────
+
+@pytest.mark.parametrize("cost,level", [
+    (3.0, None),    # 60% — no warning
+    (4.3, "warn"),  # 86%
+    (4.8, "crit"),  # 96%
+    (6.0, None),    # 120% — exhausted; hard stop handles it, not a warning
+])
+def test_warnings_thresholds(cost, level):
+    with quota._lock:
+        quota._sess_cost["t"] = cost
+    out = quota.warnings("t", {"session_cost_budget": 5.0})
+    if level is None:
+        assert out == []
+    else:
+        assert len(out) == 1 and out[0][0] == level
+
+
+# ── /budget command ─────────────────────────────────────────────────────────
+
+@pytest.fixture
+def cmd(monkeypatch):
+    import cc_config
+    monkeypatch.setattr(cc_config, "save_config", lambda cfg: None)
+    from commands.core import cmd_budget
+    return cmd_budget
+
+
+def test_cmd_budget_set_cost(cmd):
+    cfg = {"_session_id": "t"}
+    assert cmd("$5", None, cfg) is True
+    assert cfg["session_cost_budget"] == 5.0
+
+
+def test_cmd_budget_set_daily_tokens(cmd):
+    cfg = {"_session_id": "t"}
+    cmd("daily 2m", None, cfg)
+    assert cfg["daily_token_budget"] == 2_000_000
+
+
+def test_cmd_budget_explicit_session_scope(cmd):
+    cfg = {"_session_id": "t"}
+    cmd("session 200k", None, cfg)
+    assert cfg["session_token_budget"] == 200_000
+
+
+def test_cmd_budget_clear(cmd):
+    cfg = {"_session_id": "t", "session_cost_budget": 5.0,
+           "daily_token_budget": 2_000_000}
+    cmd("clear", None, cfg)
+    assert all(cfg[k] is None for k in quota.BUDGET_KEYS.values())
+
+
+def test_cmd_budget_set_replaces_other_unit_in_scope(cmd):
+    # A leftover token cap must not keep blocking after switching to a $ cap.
+    cfg = {"_session_id": "t", "session_token_budget": 20_000}
+    cmd("$2", None, cfg)
+    assert cfg["session_cost_budget"] == 2.0
+    assert cfg["session_token_budget"] is None      # replaced, not coexisting
+    # Daily caps are a different scope — untouched.
+    cfg2 = {"_session_id": "t", "daily_cost_budget": 20.0}
+    cmd("session 200k", None, cfg2)
+    assert cfg2["session_token_budget"] == 200_000
+    assert cfg2["daily_cost_budget"] == 20.0
+
+
+def test_cmd_budget_bad_value_does_not_set(cmd, capsys):
+    cfg = {"_session_id": "t"}
+    cmd("banana", None, cfg)
+    # An unparseable value must leave every budget key unset (absent or None).
+    assert all(cfg.get(k) is None for k in quota.BUDGET_KEYS.values())
+
+
+def test_cmd_budget_view_runs_with_no_budgets(cmd, capsys):
+    assert cmd("", None, {"_session_id": "t"}) is True
+    assert "unlimited" in capsys.readouterr().out
+
+
+# ── QuotaPause event ────────────────────────────────────────────────────────
+
+def test_quota_pause_event_shape():
+    from agent import QuotaPause
+    ev = QuotaPause("Session cost budget reached", {"session_cost": 5.0})
+    assert ev.reason == "Session cost budget reached"
+    assert ev.usage["session_cost"] == 5.0
+
+
+def test_check_quota_raises_when_over_budget():
+    with quota._lock:
+        quota._sess_cost["t"] = 5.0
+    with pytest.raises(quota.QuotaExceeded):
+        quota.check_quota("t", {"session_cost_budget": 5.0})
+
+
+# ── pre-call projection (tight cap) ─────────────────────────────────────────
+
+def test_check_quota_projection_stops_before_overshoot():
+    # 30k spent against a 40k cap: under the cap now, so only the projection can trip it.
+    with quota._lock:
+        quota._sess_tokens["t"] = 30_000
+    cfg = {"session_token_budget": 40_000}
+    # Without projection: 30k < 40k → allowed.
+    quota.check_quota("t", cfg)
+    # With a projected 15k next request: 30k+15k ≥ 40k → stop BEFORE the call.
+    with pytest.raises(quota.QuotaExceeded) as ei:
+        quota.check_quota("t", cfg, projected_tokens=15_000)
+    assert "would be exceeded" in str(ei.value)
+
+
+def test_check_quota_projection_allows_when_fits():
+    with quota._lock:
+        quota._sess_tokens["t"] = 10_000
+    # 10k + 5k = 15k < 40k → fine.
+    quota.check_quota("t", {"session_token_budget": 40_000}, projected_tokens=5_000)
+
+
+def test_quota_exceeded_carries_breached_cap_fields():
+    with quota._lock:
+        quota._sess_tokens["t"] = 25_000
+    with pytest.raises(quota.QuotaExceeded) as ei:
+        quota.check_quota("t", {"session_token_budget": 20_000})
+    e = ei.value
+    assert e.key == "session_token_budget"
+    assert e.scope == "session"
+    assert e.unit == "tok"
+    assert e.limit == 20_000
+
+
+def test_quota_exceeded_cost_cap_fields():
+    with quota._lock:
+        quota._sess_cost["t"] = 3.0
+    with pytest.raises(quota.QuotaExceeded) as ei:
+        quota.check_quota("t", {"session_cost_budget": 2.0})
+    assert ei.value.unit == "usd"
+    assert ei.value.scope == "session"
+    assert ei.value.key == "session_cost_budget"
+
+
+# ── output_room (clamp) ─────────────────────────────────────────────────────
+
+def test_output_room_none_without_budget():
+    assert quota.output_room("t", {}) is None
+
+
+def test_output_room_token_budget_headroom():
+    with quota._lock:
+        quota._sess_tokens["t"] = 30_000
+    # 40k cap, 30k spent, 5k projected input → 5k left for output.
+    room = quota.output_room("t", {"session_token_budget": 40_000}, projected_tokens=5_000)
+    assert room == 5_000
+
+
+def test_output_room_never_negative():
+    with quota._lock:
+        quota._sess_tokens["t"] = 50_000
+    assert quota.output_room("t", {"session_token_budget": 40_000}) == 0
+
+
+def test_output_room_takes_tightest_of_multiple():
+    # record_usage advances BOTH the session and (isolated) daily counters.
+    quota.record_usage("t", "claude-sonnet-4-6", 10_000, 0)
+    cfg = {"session_token_budget": 40_000, "daily_token_budget": 12_000}
+    # session leaves 30k, daily leaves 2k → min is 2k.
+    assert quota.output_room("t", cfg) == 2_000
+
+
+def test_output_room_cost_budget_uses_output_price(monkeypatch):
+    # Model with a known $/Mtok output price → cost cap converts to token room.
+    import providers
+    monkeypatch.setitem(providers.COSTS, "budgetmodel", (1.0, 2.0))  # $2 /Mtok out
+    with quota._lock:
+        quota._sess_cost["t"] = 0.0
+    cfg = {"model": "budgetmodel", "session_cost_budget": 0.10}  # $0.10 → 50k out tokens
+    room = quota.output_room("t", cfg)
+    assert room == 50_000