rohitg00 · harrykamboj1 · Jun 14, 2026
diff --git a/.env.example b/.env.example
@@ -100,6 +100,11 @@
 # MAX_OBS_PER_SESSION=500                        # Per-session observation cap before consolidation kicks in
 # SUMMARIZE_CHUNK_SIZE=400                       # When mem::summarize sees a session larger than this, it chunks observations and map-reduces (chunk-summarize → reduce-merge) to stay within the LLM's context window. Default 400 ≈ 50k tokens per chunk at ~110 tok/obs. Native sessions are capped by MAX_OBS_PER_SESSION; chunking primarily matters for bulk-imported jsonl sessions, which bypass that cap.
 # SUMMARIZE_CHUNK_CONCURRENCY=6                  # Parallel chunk LLM calls during chunked summarize. Default 6 fits ~100-chunk sessions under iii's 180s function-invocation timeout at typical ~8s/call. High-throughput providers (Novita, DeepInfra, DeepSeek) commonly allow 100+ concurrent — bump this for very large imported sessions.
+# AGENTMEMORY_SESSION_TOKEN_CAP=100000           # Per-session hard cap on estimated LLM tokens (compress + summarize). Once exceeded, further LLM calls for that session are blocked and compression falls back to zero-LLM synthetic output. A soft-warn event fires at 80%. Cost safety net for pathological sessions (runaway tool loops). Per-session override via the tokenCap field on POST /session/start.
+# AGENTMEMORY_SYSTEM_TOKEN_CAP=1000000           # Separate cap for cron-fired / cross-session LLM work (consolidation, reflect, graph extraction) that has no active session; tracked under the "__system__" sentinel.
+# AGENTMEMORY_SESSION_BUDGET_RETENTION_DAYS=7     # How long a session's token-budget row is kept after the session ends before the hourly reaper deletes it.
+# AGENTMEMORY_COST_IN_PER_1M=0.14                 # USD per 1M input tokens, used to display a rough per-session costEstimate from the raw token counts.
+# AGENTMEMORY_COST_OUT_PER_1M=0.28                # USD per 1M output tokens for the costEstimate.
 
 # -----------------------------------------------------------------------------
 # 5. Behaviour flags

diff --git a/AGENTS.md b/AGENTS.md
@@ -117,7 +117,7 @@ Hook scripts in `src/hooks/` are standalone Node.js scripts (no iii-sdk import).
 ## Current Stats (v0.9.16)
 
 - 53 MCP tools (8 visible by default, `AGENTMEMORY_TOOLS=all` for all)
-- 128 REST endpoints
+- 129 REST endpoints
 - 6 MCP resources, 3 MCP prompts
 - 12 hooks, 15 skills
 - 50+ iii functions

diff --git a/README.md b/README.md
@@ -1294,6 +1294,18 @@ Quality vs cost tradeoff for memory work: compression is a summarization task wi
 
 Sources: [OpenRouter pricing for Sonnet 4.6](https://openrouter.ai/anthropic/claude-sonnet-4.6/pricing), [DeepSeek V4 Pro](https://openrouter.ai/deepseek/deepseek-v4-pro), [DeepSeek pricing notes](https://api-docs.deepseek.com/quick_start/pricing/).
 
+### Per-session token budget
+
+Pathological sessions (long-running Claude Code instances, runaway tool loops) can rack up unbounded background compress/summarize spend. Each session gets a running token budget with a **hard cap** (default **100k estimated tokens**) and an **80% soft warning**.
+
+| Knob | Default | Purpose |
+|------|---------|---------|
+| `AGENTMEMORY_SESSION_TOKEN_CAP` | `100000` | Hard cap per session. Once exceeded, further LLM calls for that session are blocked; compression falls back to zero-LLM synthetic output. |
+| `AGENTMEMORY_SYSTEM_TOKEN_CAP` | `1000000` | Separate cap for cron-fired / cross-session LLM work (consolidation, reflect, graph extraction) tracked under the `__system__` sentinel. |
+| `AGENTMEMORY_SESSION_BUDGET_RETENTION_DAYS` | `7` | How long budget rows persist after a session ends before the hourly reaper deletes them. |
+
+Per-session override: pass `tokenCap` in the body of `POST /agentmemory/session/start`. Budget state is KV-persisted and survives restarts. `agentmemory status` reports `N active, M near-cap, K exhausted`. OTEL histogram: `session.tokens_used`.
+
 ### Multi-agent memory (`AGENT_ID` + `AGENTMEMORY_AGENT_SCOPE`)
 
 In multi-agent setups where several roles share one agentmemory server (architect / developer / reviewer / researcher / support-agent), `AGENT_ID` tags every write with the role that made it. `AGENTMEMORY_AGENT_SCOPE` controls whether recall filters by that tag.
@@ -1448,6 +1460,13 @@ Create `~/.agentmemory/.env`:
                                    # LLM provider to compress the
                                    # observation — expect significant
                                    # token spend on active sessions.
+# AGENTMEMORY_SESSION_TOKEN_CAP=100000  # Per-session hard cap on estimated
+                                        # LLM tokens (compress + summarize).
+                                        # Soft-warn at 80%; synthetic fallback
+                                        # after exhaustion. Override per
+                                        # session via tokenCap on /session/start.
+# AGENTMEMORY_SYSTEM_TOKEN_CAP=1000000  # Cap for cron / cross-session LLM work.
+# AGENTMEMORY_SESSION_BUDGET_RETENTION_DAYS=7
 # AGENTMEMORY_SLOTS=false          # OFF by default. Editable pinned
                                    # memory slots — persona,
                                    # user_preferences, tool_guidelines,
@@ -1500,7 +1519,7 @@ Create `~/.agentmemory/.env`:
 
 <h2 id="api"><picture><source media="(prefers-color-scheme: dark)" srcset="assets/tags/light/section-api.svg"><img src="assets/tags/section-api.svg" alt="API" height="32" /></picture></h2>
 
-128 endpoints on port `3111`. The REST API binds to `127.0.0.1` by default. Protected endpoints require `Authorization: Bearer <secret>` when `AGENTMEMORY_SECRET` is set, and mesh sync endpoints require `AGENTMEMORY_SECRET` on both peers.
+129 endpoints on port `3111`. The REST API binds to `127.0.0.1` by default. Protected endpoints require `Authorization: Bearer <secret>` when `AGENTMEMORY_SECRET` is set, and mesh sync endpoints require `AGENTMEMORY_SECRET` on both peers.
 
 <details>
 <summary>Key endpoints</summary>

diff --git a/plugin/skills/agentmemory-config/REFERENCE.md b/plugin/skills/agentmemory-config/REFERENCE.md
@@ -3,13 +3,15 @@
 Generated by scanning `src/` for `AGENTMEMORY_*` usage. Do not edit the block below by hand; run `npm run skills:gen` after adding or removing a variable. Internal markers ending in two underscores are excluded.
 
 <!-- AUTOGEN:env START - generated by scripts/skills/generate.ts, do not edit by hand -->
-Configuration is read from the environment and from `~/.agentmemory/.env` (no `export` prefix). 34 recognized variables:
+Configuration is read from the environment and from `~/.agentmemory/.env` (no `export` prefix). 39 recognized variables:
 
 - `AGENTMEMORY_AGENT_SCOPE`
 - `AGENTMEMORY_ALLOW_AGENT_SDK`
 - `AGENTMEMORY_AUTO_COMPRESS`
 - `AGENTMEMORY_COMMIT_SHA`
 - `AGENTMEMORY_COPILOT_MCP_BLOCK`
+- `AGENTMEMORY_COST_IN_PER_1M`
+- `AGENTMEMORY_COST_OUT_PER_1M`
 - `AGENTMEMORY_CWD`
 - `AGENTMEMORY_DEBUG`
 - `AGENTMEMORY_DROP_STALE_INDEX`
@@ -30,9 +32,12 @@ Configuration is read from the environment and from `~/.agentmemory/.env` (no `e
 - `AGENTMEMORY_REFLECT`
 - `AGENTMEMORY_SDK_CHILD`
 - `AGENTMEMORY_SECRET`
+- `AGENTMEMORY_SESSION_BUDGET_RETENTION_DAYS`
 - `AGENTMEMORY_SESSION_ID`
+- `AGENTMEMORY_SESSION_TOKEN_CAP`
 - `AGENTMEMORY_SLOTS`
 - `AGENTMEMORY_SUPPRESS_COST_WARNING`
+- `AGENTMEMORY_SYSTEM_TOKEN_CAP`
 - `AGENTMEMORY_TOOLS`
 - `AGENTMEMORY_URL`
 - `AGENTMEMORY_USE_DOCKER`

diff --git a/plugin/skills/agentmemory-rest-api/REFERENCE.md b/plugin/skills/agentmemory-rest-api/REFERENCE.md
@@ -5,7 +5,7 @@ Generated from `src/triggers/api.ts`. Do not edit the block below by hand; run `
 <!-- AUTOGEN:rest START - generated by scripts/skills/generate.ts, do not edit by hand -->
 The REST API is the primary surface. All paths are under `http://localhost:3111` (override with `--port`). When `AGENTMEMORY_SECRET` is set, send `Authorization: Bearer $AGENTMEMORY_SECRET`; localhost is otherwise open.
 
-117 registered endpoints:
+118 registered endpoints:
 
 | Method | Path |
 | --- | --- |
@@ -96,6 +96,7 @@ The REST API is the primary surface. All paths are under `http://localhost:3111`
 | POST | `/agentmemory/sentinels/cancel` |
 | POST | `/agentmemory/sentinels/check` |
 | POST | `/agentmemory/sentinels/trigger` |
+| GET | `/agentmemory/session/budget` |
 | GET | `/agentmemory/session/by-commit` |
 | POST | `/agentmemory/session/commit` |
 | POST | `/agentmemory/session/end` |

diff --git a/src/cli.ts b/src/cli.ts
@@ -1345,13 +1345,14 @@ async function runStatus() {
   }
 
   try {
-    const [healthRes, sessionsRes, graphRes, memoriesRes, flagsRes, followupRes] = await Promise.all([
+    const [healthRes, sessionsRes, graphRes, memoriesRes, flagsRes, followupRes, budgetsRes] = await Promise.all([
       apiFetch<any>(base, "health"),
       apiFetch<any>(base, "sessions"),
       apiFetch<any>(base, "graph/stats"),
       apiFetch<any>(base, "memories?count=true"),
       apiFetch<any>(base, "config/flags"),
       apiFetch<any>(base, "diagnostics/followup"),
+      apiFetch<any>(base, "session/budget").catch(() => null),
     ]);
 
     if (typeof healthRes?.viewerPort === "number") {
@@ -1422,6 +1423,33 @@ async function runStatus() {
       );
     }
 
+    const budgetList = Array.isArray(budgetsRes?.budgets) ? budgetsRes.budgets : [];
+    const sessionBudgets = budgetList.filter(
+      (b: any) => b && b.sessionId && b.sessionId !== "__system__",
+    );
+    if (sessionBudgets.length > 0) {
+      let exhausted = 0;
+      let nearCap = 0;
+      let active = 0;
+      for (const b of sessionBudgets) {
+        const used = Number(b.tokensUsed) || 0;
+        const cap = Number(b.tokenCap) || 0;
+        const isExhausted = Boolean(b.exhaustedAt) || (cap > 0 && used >= cap);
+        if (isExhausted) {
+          exhausted++;
+        } else if (b.warnEmittedAt || (cap > 0 && used >= cap * 0.8)) {
+          nearCap++;
+          active++;
+        } else {
+          active++;
+        }
+      }
+      lines.push("");
+      lines.push(
+        `Token budgets: ${active} active, ${nearCap} near-cap, ${exhausted} exhausted`,
+      );
+    }
+
     p.note(lines.join("\n"), "agentmemory");
   } catch (err) {
     p.log.error(err instanceof Error ? err.message : String(err));

diff --git a/src/config.ts b/src/config.ts
@@ -403,6 +403,55 @@ export function getConsolidationDecayDays(): number {
   return safeParseInt(getMergedEnv()["CONSOLIDATION_DECAY_DAYS"], 30);
 }
 
+// Per-session LLM token budget. Hard cap default is 100k
+// estimated tokens per session. AGENTMEMORY_SESSION_TOKEN_CAP overrides the
+// global default; mem::session::start can override per-session.
+const SESSION_TOKEN_CAP_DEFAULT = 100_000; // fallback for AGENTMEMORY_SESSION_TOKEN_CAP
+const SYSTEM_TOKEN_CAP_DEFAULT = 1_000_000; // fallback for AGENTMEMORY_SYSTEM_TOKEN_CAP
+const SESSION_BUDGET_RETENTION_DAYS_DEFAULT = 7; // fallback for AGENTMEMORY_SESSION_BUDGET_RETENTION_DAYS
+// Soft warning fires at this fraction of the cap.
+export const SESSION_BUDGET_WARN_RATIO = 0.8;
+
+export function getSessionTokenCap(): number {
+  // Global per-session cap from the env; non-positive values use the default.
+  const raw = getMergedEnv()["AGENTMEMORY_SESSION_TOKEN_CAP"];
+  const parsed = safeParseInt(raw, SESSION_TOKEN_CAP_DEFAULT);
+  if (parsed > 0) return parsed;
+  return SESSION_TOKEN_CAP_DEFAULT;
+}
+
+// Cron-fired / cross-session LLM calls (consolidation, reflect, graph
+// extraction) have no active session; they bill the "__system__" sentinel
+// against this separate, larger cap.
+export function getSystemTokenCap(): number {
+  // Env override for the system cap; non-positive values use the default.
+  const raw = getMergedEnv()["AGENTMEMORY_SYSTEM_TOKEN_CAP"];
+  const parsed = safeParseInt(raw, SYSTEM_TOKEN_CAP_DEFAULT);
+  if (parsed > 0) return parsed;
+  return SYSTEM_TOKEN_CAP_DEFAULT;
+}
+
+export function getSessionBudgetRetentionDays(): number {
+  // Retention window in days from the env; non-positive values use the default.
+  const raw = getMergedEnv()["AGENTMEMORY_SESSION_BUDGET_RETENTION_DAYS"];
+  const parsed = safeParseInt(raw, SESSION_BUDGET_RETENTION_DAYS_DEFAULT);
+  if (parsed > 0) return parsed;
+  return SESSION_BUDGET_RETENTION_DAYS_DEFAULT;
+}
+
+// USD-per-1M-token rates used to normalize raw token counts into a rough
+// costEstimate at record time. Override with
+// AGENTMEMORY_COST_IN_PER_1M / AGENTMEMORY_COST_OUT_PER_1M.
+export function getCostRatesPer1M(): { input: number; output: number } {
+  const merged = getMergedEnv();
+  const inRate = parseFloat(merged["AGENTMEMORY_COST_IN_PER_1M"] ?? "");
+  const outRate = parseFloat(merged["AGENTMEMORY_COST_OUT_PER_1M"] ?? "");
+  // A rate is used only when it parses to a finite, non-negative number.
+  const input = !Number.isFinite(inRate) || inRate < 0 ? 0.14 : inRate;
+  const output = !Number.isFinite(outRate) || outRate < 0 ? 0.28 : outRate;
+  return { input, output };
+}
+
 export function isStandaloneMcp(): boolean {
   return getMergedEnv()["STANDALONE_MCP"] === "true";
 }

diff --git a/src/functions/compress.ts b/src/functions/compress.ts
@@ -16,6 +16,8 @@ import {
 import { VISION_DESCRIPTION_PROMPT } from "../prompts/vision.js";
 import { getXmlTag, getXmlChildren } from "../prompts/xml.js";
 import { getSearchIndex, vectorIndexAddGuarded } from "./search.js";
+import { buildSyntheticCompression } from "./compress-synthetic.js";
+import { withSession } from "../state/session-context.js";
 import { CompressOutputSchema } from "../eval/schemas.js";
 import { validateOutput } from "../eval/validator.js";
 import { scoreCompression } from "../eval/quality.js";
@@ -75,7 +77,7 @@ export function registerCompressFunction(
       observationId: string;
       sessionId: string;
       raw: RawObservation;
-    }) => {
+    }) => withSession(data.sessionId, async () => {
       const startMs = Date.now();
 
       let imageDescription: string | undefined;
@@ -253,6 +255,43 @@ export function registerCompressFunction(
       } catch (err) {
         const msg = err instanceof Error ? err.message : String(err);
         const latencyMs = Date.now() - startMs;
+
+        // The session's token cap is exhausted. Don't drop
+        // the observation — fall back to zero-LLM synthetic compression so
+        // recall/search still work, exactly as if AGENTMEMORY_AUTO_COMPRESS
+        // were off, until the budget is reset or the session ends.
+        if (msg === "session_budget_exhausted") {
+          const synthetic = buildSyntheticCompression(data.raw);
+          synthetic.id = data.observationId;
+          synthetic.sessionId = data.sessionId;
+          await kv.set(
+            KV.observations(data.sessionId),
+            data.observationId,
+            synthetic,
+          );
+          try {
+            getSearchIndex().add(synthetic);
+          } catch {}
+          await vectorIndexAddGuarded(
+            synthetic.id,
+            synthetic.sessionId,
+            synthetic.title + " " + (synthetic.narrative || ""),
+            { kind: "synthetic", logId: synthetic.id },
+          ).catch(() => {});
+          if (metricsStore) {
+            await metricsStore.record("mem::compress", latencyMs, false);
+          }
+          logger.warn("Compression budget exhausted; stored synthetic", {
+            obsId: data.observationId,
+            sessionId: data.sessionId,
+          });
+          return {
+            success: true,
+            compressed: synthetic,
+            budgetExhausted: true,
+          };
+        }
+
         if (metricsStore) {
           await metricsStore.record("mem::compress", latencyMs, false);
         }
@@ -262,6 +301,6 @@ export function registerCompressFunction(
         });
         return { success: false, error: "compression_failed" };
       }
-    },
+    }),
   );
 }
diff --git a/src/functions/observe.ts b/src/functions/observe.ts
@@ -272,6 +272,14 @@ export function registerObserveFunction(
               ? { firstPrompt: trimmedPrompt }
               : {}),
           });
+          // OpenCode and other plugins that skip
+          // POST /session/start create the session here; seed its token
+          // budget too so background compress/summarize spend is capped.
+          sdk.trigger({
+            function_id: "mem::session::budget::init",
+            payload: { sessionId: payload.sessionId },
+            action: TriggerAction.Void(),
+          });
         }
 
         // Per-observation LLM compression is opt-in as of 0.8.8 (#138).