diff --git a/README.md b/README.md index 0abbe19d..5401d34a 100644 --- a/README.md +++ b/README.md @@ -1815,8 +1815,10 @@ The `/agentmemory/health` evaluator uses conservative defaults, and operators ca | `AGENTMEMORY_HEALTH_MEM_WARN_PCT` | `80` | Heap usage warning threshold percent | | `AGENTMEMORY_HEALTH_MEM_CRITICAL_PCT` | `95` | Heap usage critical threshold percent | | `AGENTMEMORY_HEALTH_MEM_RSS_FLOOR_MB` | `512` | Minimum RSS in MB before heap warning/critical alerts fire | +| `AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB` | `4096` | Process RSS in MB that counts as critical memory pressure | +| `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO` | `0.1` | Host free-memory ratio below which high heap usage may become critical; set to `0` to disable this host-memory gate | -Unset, empty, invalid, zero, or negative values fall back to the defaults above. +Unset, empty, invalid, zero, or negative values fall back to the defaults above, except `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO=0`, which disables the host free-memory pressure gate. ### Scheduled backups diff --git a/docs/todos/2026-06-18-issue-491-memory-critical-pressure/plan.md b/docs/todos/2026-06-18-issue-491-memory-critical-pressure/plan.md new file mode 100644 index 00000000..206ad1f8 --- /dev/null +++ b/docs/todos/2026-06-18-issue-491-memory-critical-pressure/plan.md @@ -0,0 +1,202 @@ +# Memory Critical Pressure Gate Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make `memory_critical` require real memory pressure instead of only high heap ratio plus the existing RSS floor. + +**Architecture:** Keep the health evaluator as the single policy owner in `src/health/thresholds.ts`. 
Add critical-pressure thresholds to the existing config, collect host memory in `src/health/monitor.ts`, and preserve warn-level memory semantics. + +**Tech Stack:** TypeScript ESM, Node built-ins, Vitest, existing health monitor/evaluator modules. + +--- + +## File Structure + +- Modify `src/types.ts`: allow optional host memory fields in `HealthSnapshot.memory`. +- Modify `src/health/thresholds.ts`: add critical RSS ceiling and system-free ratio floor config, env parsing, and critical memory gate logic. +- Modify `src/health/monitor.ts`: collect `os.freemem()` and `os.totalmem()` in the snapshot. +- Modify `test/health-thresholds.test.ts`: add red/green coverage for false-positive suppression, process RSS critical pressure, host low-memory critical pressure, env overrides, and invalid fallback. +- Modify `README.md`: document new health threshold environment variables. +- Update `docs/todos/2026-06-18-issue-491-memory-critical-pressure/todo.md`: record evidence, verification, reviews, and final status. + +## Task 1: Write Failing Threshold Tests + +**Files:** +- Modify: `test/health-thresholds.test.ts` + +- [x] **Step 1: Add new env keys to test isolation** + +Add `AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB` and `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO` to `HEALTH_ENV_KEYS`. 
+ +- [x] **Step 2: Add regression tests for pressure gating** + +Add tests in `describe("evaluateHealth memory severity", ...)`: + +```ts +it("does not go critical for high heap ratio without real memory pressure", () => { + const s = snap({ + memory: { + heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 1100 * 1024 * 1024, + external: 0, + systemFree: 8 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + const { status, alerts, notes } = evaluateWithSnapshotHeapTotal(s); + expect(status).toBe("degraded"); + expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(false); + expect(alerts.some((a) => a.startsWith("memory_warn_"))).toBe(true); + expect(notes.some((n) => n.startsWith("memory_heap_tight_"))).toBe(false); +}); + +it("goes critical when high heap ratio has critical process RSS pressure", () => { + const s = snap({ + memory: { + heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 5 * 1024 * 1024 * 1024, + external: 0, + systemFree: 8 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + const { status, alerts } = evaluateWithSnapshotHeapTotal(s); + expect(status).toBe("critical"); + expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(true); +}); + +it("goes critical when high heap ratio has low host free memory", () => { + const s = snap({ + memory: { + heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 1100 * 1024 * 1024, + external: 0, + systemFree: 1 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + const { status, alerts } = evaluateWithSnapshotHeapTotal(s); + expect(status).toBe("critical"); + expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(true); +}); +``` + +- [x] **Step 3: Run red test** + +Run: `corepack pnpm exec vitest run test/health-thresholds.test.ts --exclude test/integration.test.ts` + +Expected before implementation: false-positive suppression test fails because status is 
`critical`. + +## Task 2: Implement Minimal Pressure Gate + +**Files:** +- Modify: `src/types.ts` +- Modify: `src/health/thresholds.ts` +- Modify: `src/health/monitor.ts` + +- [x] **Step 1: Extend snapshot memory type** + +Add optional `systemFree?: number` and `systemTotal?: number` to `HealthSnapshot.memory` in `src/types.ts`. + +- [x] **Step 2: Extend threshold config** + +In `src/health/thresholds.ts`, add `memoryCriticalRssBytes` and `memorySystemFreeFloorRatio` to `ThresholdConfig`, default to `4096 * 1024 * 1024` and `0.1`, and parse env vars `AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB` plus `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO`. + +- [x] **Step 3: Gate only critical memory alerts** + +Compute: + +```ts +const criticalRssPressure = rss >= cfg.memoryCriticalRssBytes; +const systemFreeRatio = + snapshot.memory.systemTotal && snapshot.memory.systemTotal > 0 + ? (snapshot.memory.systemFree ?? snapshot.memory.systemTotal) / + snapshot.memory.systemTotal + : 1; +const systemMemoryPressure = + cfg.memorySystemFreeFloorRatio > 0 && + systemFreeRatio < cfg.memorySystemFreeFloorRatio; +const realMemoryPressure = criticalRssPressure || systemMemoryPressure; +``` + +Then require `realMemoryPressure` only in the `memory_critical` branch: + +```ts +if (memPercent > cfg.memoryCriticalPercent && rssAboveFloor && realMemoryPressure) { + alerts.push(`memory_critical_${Math.round(memPercent)}%_rss${memMb}mb`); + critical = true; +} else if (memPercent > cfg.memoryWarnPercent && rssAboveFloor) { + alerts.push(`memory_warn_${Math.round(memPercent)}%_rss${memMb}mb`); + degraded = true; +} else if (memPercent > cfg.memoryWarnPercent) { + notes.push(`memory_heap_tight_${Math.round(memPercent)}%_rss${memMb}mb`); +} +``` + +- [x] **Step 4: Collect host memory** + +In `src/health/monitor.ts`, import `freemem` and `totalmem` from `node:os`, call them once during snapshot collection, and include `systemFree` and `systemTotal` in `snapshot.memory`. 
+ +- [x] **Step 5: Run green targeted test** + +Run: `corepack pnpm exec vitest run test/health-thresholds.test.ts --exclude test/integration.test.ts` + +Expected: health threshold tests pass. + +## Task 3: Cover Env Overrides And Docs + +**Files:** +- Modify: `test/health-thresholds.test.ts` +- Modify: `README.md` +- Modify: `docs/todos/2026-06-18-issue-491-memory-critical-pressure/todo.md` + +- [x] **Step 1: Add env override assertions** + +Extend the existing env override/fallback tests to prove: +- `AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB=2048` allows a 3 GB RSS critical sample. +- `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO=0.25` treats 20% host free memory as pressure. +- Invalid values for both new vars fall back to defaults. + +- [x] **Step 2: Update README** + +Add rows under "Health Thresholds": +- `AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB` default `4096`, "Process RSS in MB that counts as critical memory pressure." +- `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO` default `0.1`, "Host free-memory ratio below which high heap usage may become critical; set to `0` to disable this host-memory gate." + +- [x] **Step 3: Run targeted checks** + +Run: +- `corepack pnpm exec vitest run test/health-thresholds.test.ts test/health-monitor.test.ts --exclude test/integration.test.ts` +- `git diff --check` +- `rg -n "AGENTMEMORY_HEALTH_MEM_(CRITICAL_RSS_MB|SYSTEM_FREE_FLOOR_RATIO)|memory_critical" README.md src test` + +Expected: tests pass, diff check exits 0, references are consistent. + +## Task 4: Review, Security Gates, Commit, Push, PR, Merge + +**Files:** +- All task-owned files above + +- [ ] **Step 1: Focused simplification and review** + +Inspect the diff for unnecessary abstraction, stale tests, warning behavior regressions, and boundary creep. Preserve the existing health API shape and alert strings except for when `memory_critical` is emitted. 
+ +- [x] **Step 2: Final verification** + +Run the smallest covering repo-native checks first, then broader checks if feasible: +- `corepack pnpm exec vitest run test/health-thresholds.test.ts test/health-monitor.test.ts --exclude test/integration.test.ts` +- `corepack pnpm run lint` +- `corepack pnpm run build` +- `semgrep scan --config p/default --error --metrics=off src/health/thresholds.ts src/health/monitor.ts src/types.ts test/health-thresholds.test.ts` +- `git diff --check` + +- [ ] **Step 3: Secret gate and commit** + +Stage only task-owned paths, inspect staged diff, run `gitleaks protect --staged --redact`, then commit with `fix: gate memory critical on real pressure`. + +- [ ] **Step 4: GitHub PR flow** + +Fetch `origin main`, verify branch diff against refreshed `origin/main`, push `issue/491-memory-critical-pressure` to `origin`, create PR against `main`, monitor CI, merge when checks pass, then verify final target state and archive the source thread if the thread tool is available. 
diff --git a/docs/todos/2026-06-18-issue-491-memory-critical-pressure/todo.md b/docs/todos/2026-06-18-issue-491-memory-critical-pressure/todo.md new file mode 100644 index 00000000..49f236b3 --- /dev/null +++ b/docs/todos/2026-06-18-issue-491-memory-critical-pressure/todo.md @@ -0,0 +1,103 @@ +# Issue #491 Memory Critical Pressure Gate + +## Scope + +- Repository: `/Users/A1538552/.codex/worktrees/0aeb/agentmemory` +- Working branch: `issue/491-memory-critical-pressure` +- Origin target: `https://github.com/wbugitlab1/agentmemory.git`, base `origin/main` +- Issue: GitHub #491, upstream PR evidence only; no upstream PR target +- Initial checkout: detached `HEAD` at `a029b7e1`, clean working tree before branch creation + +## Assumptions + +- Issue #491 is partly valid: the old `heapUsed / heapTotal` framing is stale because current code uses the effective V8/Node heap ceiling first, but local health evaluation can still emit `memory_critical` when heap usage is above the critical ratio and RSS is merely above the existing 512 MB floor, with no high process RSS or low host free memory signal. +- The existing RSS floor remains useful for warn/degraded behavior and for suppressing tiny-process heap-tight false positives. +- A real-pressure gate should affect only critical memory alerts, not CPU, event loop, connection, KV, or warn-level memory behavior. +- New process env knobs are acceptable only as optional health evaluator thresholds; no API, schema, endpoint, auth, persistence, dependency, or remote/system boundary changes are intended. + +## Sprint Contract + +- **Goal:** Gate `memory_critical` on real memory pressure while preserving existing warning behavior and health API shape. +- **Scope:** Modify health threshold evaluation, monitor snapshot collection, health threshold tests, and README health-threshold docs. 
+- **Non-goals:** Import upstream code directly, change REST/MCP surfaces, change persisted schema contracts beyond optional snapshot fields, add dependencies, alter auth/security behavior, or target upstream PRs/remotes. +- **Acceptance criteria:** + - High heap ratio plus ordinary RSS no longer emits `memory_critical` when host free memory is not low. + - High heap ratio emits `memory_critical` when process RSS crosses a high critical ceiling. + - High heap ratio emits `memory_critical` when host free memory ratio is below the configured floor. + - Existing warning/degraded memory threshold behavior remains unchanged. + - Environment override behavior is covered for new critical pressure gates and invalid values fall back safely. + - README documents the new operator-facing knobs. +- **Intended verification:** TDD red/green targeted health tests, focused health-monitor tests if monitor shape changes, lint/build where feasible, security/secret gates before commit. +- **Known boundaries:** Health alerts are operational behavior; no state migration or endpoint contract change is planned. `origin/main` is the only PR base. +- **Stop conditions:** Fix requires external API/schema/auth/persistence boundary changes, private registry/dependency changes, destructive cleanup, force-push/rebase, or unresolved verification/security findings. 
+ +## Feature / Verification Matrix + +| Change | Verification method | Status | Evidence | +| --- | --- | --- | --- | +| Validate issue #491 | Main inspection plus read-only explorer | Done | Explorer: the exact old claim is stale due to the effective heap ceiling, but the issue remains valid because the real-pressure gate is missing; main RED test reproduced current false-positive `critical` status | +| Real-pressure critical gate | Red/green `test/health-thresholds.test.ts` | Done | RED: targeted test failed with expected `degraded`, received `critical`; GREEN: targeted health tests passed 18/18 after implementation | +| Monitor host-memory collection | Focused source/test inspection and health-monitor test | Done | `src/health/monitor.ts` now records `systemFree`/`systemTotal`; `test/health-monitor.test.ts` asserts latest snapshot includes both fields | +| README/skill threshold docs | Diff review, generated reference check, and stale-reference search | Done | README and generated agentmemory-config reference document critical RSS and system-free ratio knobs, including `0` disabling only the host-memory gate | +| Final branch readiness | Targeted tests, lint/build/security gates, staged secret scan | In progress | Targeted tests, lint, build, diff-check, Semgrep, staged Gitleaks, local commit, post-base targeted checks, generated reference check, and PR CI passed; final merge pending | + +## Subagent Ledger + +| Workstream | Scope | Edits allowed | Expected output | Result | Residual risk | +| --- | --- | --- | --- | --- | --- | +| Validity explorer | Issue #491 evidence, `src/health`, health tests/docs | No | Validate whether `memory_critical` false-positive exists and recommend minimal fix | Complete: exact issue text is stale around heap ceiling, but current code still lacks real-pressure gate; recommended high-RSS or low-host-free gate | Did not run tests because the explorer was read-only and dependencies were absent | +| Final security/boundary reviewer | Current working-tree diff and task record | No 
| Critical/Important security or boundary findings | ACCEPT: no Critical/Important actionable issue found | Did not run tests or scanners; main verification covers them | +| Final test coverage reviewer | Current working-tree diff and task record | No | Critical/Important test coverage findings | Complete: found invalid-env fallback assertions for new knobs were not discriminating | Fixed by adding no-pressure degraded and default-system-pressure critical assertions; focused health tests reran green | +| Final maintainability/integration reviewer | Current working-tree diff and task record | No | Critical/Important maintainability/integration findings | ACCEPT: no Critical/Important maintainability/integration issue found | Did not rerun tests or scanners; main verification covers them | + +## Progress + +- Read AGENTS instructions, GitHub feature loop, writing-plans, review-and-implement, github-push-prepare, and verification-before-completion skills. +- Read GitHub Issue #491 via origin repo API. Issue is open and tracks upstream PR #735 as evidence only. +- Confirmed working tree clean and detached before creating branch `issue/491-memory-critical-pressure`. +- Inspected `src/health/thresholds.ts`, `src/health/monitor.ts`, `test/health-thresholds.test.ts`, README health threshold docs, package scripts, worktree state, and prior health task notes. +- Root cause: `evaluateHealth` treated `memPercent > critical` plus `rss >= memoryRssFloorBytes` as sufficient for `memory_critical`; the existing RSS floor suppresses tiny-process heap-tight noise but is not a real pressure signal. +- TDD RED: `corepack pnpm exec vitest run test/health-thresholds.test.ts --exclude test/integration.test.ts` initially hit documented pnpm ignored-build hardening; after `corepack pnpm install --frozen-lockfile --ignore-scripts`, the same test command failed on the new false-positive test with expected `degraded`, received `critical`. 
+- Implemented critical-only pressure gating in `src/health/thresholds.ts`: critical memory now also requires process RSS above the critical RSS ceiling or host free-memory ratio below the configured floor. Warn/degraded behavior still uses the existing RSS floor. +- Extended `HealthSnapshot.memory` with optional `systemFree` and `systemTotal`, and `registerHealthMonitor` now populates them from `node:os`. +- Added tests for no-pressure degradation, critical process RSS pressure, low host free-memory pressure, caller config overrides, env overrides, invalid fallback, and monitor snapshot collection. +- Updated README health threshold docs with `AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB` and `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO`. +- CI on PR #1006 failed `pnpm run skills:check` on both Ubuntu and macOS because `plugin/skills/agentmemory-config/REFERENCE.md` was stale. Ran `corepack pnpm run skills:gen` to update the generated env reference from 70 to 72 variables, then `corepack pnpm run skills:check` passed locally. + +## Verification Evidence + +- `corepack pnpm install --frozen-lockfile --ignore-scripts`: passed after initial pnpm ignored-build hardening blocked `pnpm exec`. +- `corepack pnpm exec vitest run test/health-thresholds.test.ts --exclude test/integration.test.ts`: RED failed before implementation on `does not go critical for high heap ratio without real memory pressure`, expected `degraded`, received `critical`. +- `corepack pnpm exec vitest run test/health-thresholds.test.ts --exclude test/integration.test.ts`: passed after implementation, 1 file / 18 tests. +- `corepack pnpm exec vitest run test/health-thresholds.test.ts test/health-monitor.test.ts --exclude test/integration.test.ts`: passed, 2 files / 26 tests. +- `git diff --check`: passed. +- `corepack pnpm run lint`: passed. +- `corepack pnpm run build`: passed; emitted existing tsdown plugin timing and ineffective dynamic import warnings. 
+- `semgrep scan --config p/default --error --metrics=off src/health/thresholds.ts src/health/monitor.ts src/types.ts test/health-thresholds.test.ts test/health-monitor.test.ts`: passed with 0 findings over 5 tracked files. +- OSV not run: no dependency, lockfile, package-manager, container, vendored, or third-party package surface changed after removing pnpm's generated `allowBuilds` placeholder from `pnpm-workspace.yaml`. +- After test-coverage review fix, `corepack pnpm exec vitest run test/health-thresholds.test.ts test/health-monitor.test.ts --exclude test/integration.test.ts`: passed, 2 files / 26 tests. +- After test-coverage review fix, `git diff --check`: passed. +- Final post-review `corepack pnpm run lint`: passed. +- Final post-review `corepack pnpm run build`: passed; emitted existing tsdown plugin timing and ineffective dynamic import warnings. +- Final post-review `semgrep scan --config p/default --error --metrics=off src/health/thresholds.ts src/health/monitor.ts src/types.ts test/health-thresholds.test.ts test/health-monitor.test.ts`: passed with 0 findings. +- `gitleaks protect --staged --redact`: passed before implementation commit, no leaks found. +- Commit `2012fed4` created with message `fix: gate memory critical on real pressure`. +- `git fetch origin main`: refreshed PR base to `ee72dba7b487fa23ef9e4b7c7ec5f6cc0fe9005c`. +- Base merge: local merge commit `9f184862` merged captured base `ee72dba7b487fa23ef9e4b7c7ec5f6cc0fe9005c` into `issue/491-memory-critical-pressure`; no conflicts. +- Post-base `corepack pnpm exec vitest run test/health-thresholds.test.ts test/health-monitor.test.ts --exclude test/integration.test.ts`: passed, 2 files / 26 tests. +- Post-base `git diff --check $(git merge-base HEAD refs/remotes/origin/main)...HEAD`: passed. 
+- Post-base `semgrep scan --config p/default --error --metrics=off src/health/thresholds.ts src/health/monitor.ts src/types.ts test/health-thresholds.test.ts test/health-monitor.test.ts`: passed with 0 findings. +- CI log inspection: `gh run view 27784788798 --repo wbugitlab1/agentmemory --log-failed` showed only `plugin/skills/agentmemory-config/REFERENCE.md` env drift in `pnpm run skills:check`. +- `corepack pnpm run skills:gen`: passed and regenerated `plugin/skills/agentmemory-config/REFERENCE.md`. +- `corepack pnpm run skills:check`: passed, 15 skills checked. +- PR #1006 CI rerun for head `b89505f061077aaf8e15f7fa2137172997d358c0`: passed on `test (macos-latest, 22)` in 56s and `test (ubuntu-latest, 22)` in 1m13s. + +## Review Notes + +- Sprint Contract status: implementation criteria met locally; remote PR/merge steps remain. +- Passive TypeScript/Node security-best-practices pass found no critical or major issue: the diff adds no secrets, network calls, subprocesses, auth changes, filesystem access, or dependency surface. +- Final security/boundary reviewer accepted the diff with no Critical/Important findings. +- Final test coverage reviewer found one valid Important gap in invalid-env fallback assertions for the two new knobs; fixed and re-verified. +- Final maintainability/integration reviewer accepted the diff with no Critical/Important findings. +- Focused simplification pass found no safe code deletion or simpler shape that would preserve the explicit critical-RSS/system-free domain concepts more clearly. +- Local PR-prep notes: current branch is `issue/491-memory-critical-pressure`; refreshed base is `ee72dba7b487fa23ef9e4b7c7ec5f6cc0fe9005c`; branch diff against `origin/main` contains only Issue #491 task-owned files. 
diff --git a/plugin/skills/agentmemory-config/REFERENCE.md b/plugin/skills/agentmemory-config/REFERENCE.md index ccb17668..7fed1246 100644 --- a/plugin/skills/agentmemory-config/REFERENCE.md +++ b/plugin/skills/agentmemory-config/REFERENCE.md @@ -3,7 +3,7 @@ Generated by scanning `src/` for `AGENTMEMORY_*` usage. Do not edit the block below by hand; run `corepack pnpm run skills:gen` after adding or removing a variable. Internal markers ending in two underscores are excluded. -Configuration is read from the environment and from `~/.agentmemory/.env` (no `export` prefix). 70 recognized variables: +Configuration is read from the environment and from `~/.agentmemory/.env` (no `export` prefix). 72 recognized variables: - `AGENTMEMORY_AGENT_ID` - `AGENTMEMORY_AGENT_SCOPE` @@ -38,7 +38,9 @@ Configuration is read from the environment and from `~/.agentmemory/.env` (no `e - `AGENTMEMORY_HEALTH_EVENTLOOP_CRITICAL_MS` - `AGENTMEMORY_HEALTH_EVENTLOOP_WARN_MS` - `AGENTMEMORY_HEALTH_MEM_CRITICAL_PCT` +- `AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB` - `AGENTMEMORY_HEALTH_MEM_RSS_FLOOR_MB` +- `AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO` - `AGENTMEMORY_HEALTH_MEM_WARN_PCT` - `AGENTMEMORY_HOST` - `AGENTMEMORY_III_CONFIG` diff --git a/src/health/monitor.ts b/src/health/monitor.ts index 759fae8c..07fb1c34 100644 --- a/src/health/monitor.ts +++ b/src/health/monitor.ts @@ -1,4 +1,5 @@ import { TriggerAction, type ISdk } from "iii-sdk"; +import { freemem, totalmem } from "node:os"; import type { HealthSnapshot } from "../types.js"; import type { StateKV } from "../state/kv.js"; import { KV, STREAM } from "../state/schema.js"; @@ -97,6 +98,8 @@ export function registerHealthMonitor( async function collectHealth(): Promise { const mem = process.memoryUsage(); + const systemFree = freemem(); + const systemTotal = totalmem(); const currentCpu = process.cpuUsage(); const now = Date.now(); const uptime = process.uptime(); @@ -132,6 +135,8 @@ export function registerHealthMonitor( heapTotal: 
mem.heapTotal, rss: mem.rss, external: mem.external, + systemFree, + systemTotal, }, cpu: { userMicros: currentCpu.user, diff --git a/src/health/thresholds.ts b/src/health/thresholds.ts index 2aeb00b0..2929f787 100644 --- a/src/health/thresholds.ts +++ b/src/health/thresholds.ts @@ -9,6 +9,8 @@ interface ThresholdConfig { memoryWarnPercent: number; memoryCriticalPercent: number; memoryRssFloorBytes: number; + memoryCriticalRssBytes: number; + memorySystemFreeFloorRatio: number; memoryHeapCeilingBytes?: number; } @@ -27,6 +29,8 @@ const HARD_DEFAULTS: ThresholdConfig = { memoryWarnPercent: 80, memoryCriticalPercent: 95, memoryRssFloorBytes: 512 * 1024 * 1024, + memoryCriticalRssBytes: 4096 * 1024 * 1024, + memorySystemFreeFloorRatio: 0.1, }; function readPositiveNumberEnv(name: string, fallback: number): number { @@ -36,6 +40,13 @@ function readPositiveNumberEnv(name: string, fallback: number): number { return Number.isFinite(value) && value > 0 ? value : fallback; } +function readNonNegativeNumberEnv(name: string, fallback: number): number { + const raw = process.env[name]?.trim(); + if (!raw) return fallback; + const value = Number(raw); + return Number.isFinite(value) && value >= 0 ? value : fallback; +} + function getDefaultThresholds(): ThresholdConfig { return { eventLoopLagWarnMs: readPositiveNumberEnv( @@ -69,6 +80,17 @@ function getDefaultThresholds(): ThresholdConfig { ) * 1024 * 1024, + memoryCriticalRssBytes: + readPositiveNumberEnv( + "AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB", + HARD_DEFAULTS.memoryCriticalRssBytes / (1024 * 1024), + ) * + 1024 * + 1024, + memorySystemFreeFloorRatio: readNonNegativeNumberEnv( + "AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO", + HARD_DEFAULTS.memorySystemFreeFloorRatio, + ), }; } @@ -121,8 +143,22 @@ export function evaluateHealth( : 0; const rss = snapshot.memory.rss ?? 
0; const rssAboveFloor = rss >= cfg.memoryRssFloorBytes; + const criticalRssPressure = rss >= cfg.memoryCriticalRssBytes; + const systemFreeRatio = + snapshot.memory.systemTotal && snapshot.memory.systemTotal > 0 + ? (snapshot.memory.systemFree ?? snapshot.memory.systemTotal) / + snapshot.memory.systemTotal + : 1; + const systemMemoryPressure = + cfg.memorySystemFreeFloorRatio > 0 && + systemFreeRatio < cfg.memorySystemFreeFloorRatio; + const realMemoryPressure = criticalRssPressure || systemMemoryPressure; const memMb = Math.round(rss / (1024 * 1024)); - if (memPercent > cfg.memoryCriticalPercent && rssAboveFloor) { + if ( + memPercent > cfg.memoryCriticalPercent && + rssAboveFloor && + realMemoryPressure + ) { alerts.push(`memory_critical_${Math.round(memPercent)}%_rss${memMb}mb`); critical = true; } else if (memPercent > cfg.memoryWarnPercent && rssAboveFloor) { diff --git a/src/types.ts b/src/types.ts index a26babbb..08b045ad 100644 --- a/src/types.ts +++ b/src/types.ts @@ -322,6 +322,8 @@ export interface HealthSnapshot { heapTotal: number; rss: number; external: number; + systemFree?: number; + systemTotal?: number; }; cpu: { userMicros: number; systemMicros: number; percent: number }; eventLoopLagMs: number; diff --git a/test/health-monitor.test.ts b/test/health-monitor.test.ts index 4f8cd55e..b2158cc4 100644 --- a/test/health-monitor.test.ts +++ b/test/health-monitor.test.ts @@ -89,7 +89,13 @@ describe("registerHealthMonitor", () => { expect(kv.set).toHaveBeenCalledWith( KV.health, "latest", - expect.objectContaining({ status: "healthy" }), + expect.objectContaining({ + memory: expect.objectContaining({ + systemFree: expect.any(Number), + systemTotal: expect.any(Number), + }), + status: "healthy", + }), ); const firstLatestWrites = kv.set.mock.calls.filter( diff --git a/test/health-thresholds.test.ts b/test/health-thresholds.test.ts index 41884b9f..940da05c 100644 --- a/test/health-thresholds.test.ts +++ b/test/health-thresholds.test.ts @@ -13,6 +13,8 @@ 
const HEALTH_ENV_KEYS = [ "AGENTMEMORY_HEALTH_MEM_WARN_PCT", "AGENTMEMORY_HEALTH_MEM_CRITICAL_PCT", "AGENTMEMORY_HEALTH_MEM_RSS_FLOOR_MB", + "AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB", + "AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO", ] as const; const originalHealthEnv = new Map(); @@ -89,7 +91,7 @@ describe("evaluateHealth memory severity", () => { expect(notes.find((n) => n.startsWith("memory_heap_tight_"))).toBeDefined(); }); - it("goes critical when heap ratio is high AND RSS is above the floor", () => { + it("goes degraded when heap ratio is high but RSS only clears the warning floor", () => { const s = snap({ memory: { heapUsed: 970 * 1024 * 1024, @@ -99,6 +101,63 @@ describe("evaluateHealth memory severity", () => { }, }); const { status, alerts } = evaluateWithSnapshotHeapTotal(s); + expect(status).toBe("degraded"); + expect(alerts.some((a) => a.startsWith("memory_warn_"))).toBe(true); + expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(false); + }); + + it("does not go critical for high heap ratio without real memory pressure", () => { + const s = snap({ + memory: { + heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 1100 * 1024 * 1024, + external: 0, + systemFree: 8 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + + const { status, alerts, notes } = evaluateWithSnapshotHeapTotal(s); + + expect(status).toBe("degraded"); + expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(false); + expect(alerts.some((a) => a.startsWith("memory_warn_"))).toBe(true); + expect(notes.some((n) => n.startsWith("memory_heap_tight_"))).toBe(false); + }); + + it("goes critical when high heap ratio has critical process RSS pressure", () => { + const s = snap({ + memory: { + heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 5 * 1024 * 1024 * 1024, + external: 0, + systemFree: 8 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + + const { status, alerts } = 
evaluateWithSnapshotHeapTotal(s); + + expect(status).toBe("critical"); + expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(true); + }); + + it("goes critical when high heap ratio has low host free memory", () => { + const s = snap({ + memory: { + heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 1100 * 1024 * 1024, + external: 0, + systemFree: 1 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + + const { status, alerts } = evaluateWithSnapshotHeapTotal(s); + expect(status).toBe("critical"); expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(true); }); @@ -146,7 +205,7 @@ describe("evaluateHealth memory severity", () => { }, }); const loose = evaluateWithSnapshotHeapTotal(s, { memoryRssFloorBytes: 10 * 1024 * 1024 }); - expect(loose.status).toBe("critical"); + expect(loose.status).toBe("degraded"); const strict = evaluateWithSnapshotHeapTotal(s, { memoryRssFloorBytes: 1024 * 1024 * 1024, }); @@ -277,6 +336,8 @@ describe("evaluateHealth env threshold overrides", () => { process.env.AGENTMEMORY_HEALTH_MEM_WARN_PCT = "90"; process.env.AGENTMEMORY_HEALTH_MEM_CRITICAL_PCT = "99"; process.env.AGENTMEMORY_HEALTH_MEM_RSS_FLOOR_MB = "1024"; + process.env.AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB = "2048"; + process.env.AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO = "0.25"; expect(evaluateHealth(snap({ eventLoopLagMs: 150 })).status).toBe("healthy"); expect(evaluateHealth(snap({ eventLoopLagMs: 600 }))).toMatchObject({ @@ -329,6 +390,30 @@ describe("evaluateHealth env threshold overrides", () => { }, }); expect(evaluateWithSnapshotHeapTotal(highHeapAboveEnvRssFloor).status).toBe("degraded"); + + const highHeapAboveEnvCriticalRss = snap({ + memory: { + heapUsed: 995 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 3 * 1024 * 1024 * 1024, + external: 0, + systemFree: 8 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + 
expect(evaluateWithSnapshotHeapTotal(highHeapAboveEnvCriticalRss).status).toBe("critical"); + + const highHeapWithEnvSystemPressure = snap({ + memory: { + heapUsed: 995 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 1100 * 1024 * 1024, + external: 0, + systemFree: 3 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + expect(evaluateWithSnapshotHeapTotal(highHeapWithEnvSystemPressure).status).toBe("critical"); }); it("falls back to default thresholds for invalid environment values", () => { @@ -340,6 +425,8 @@ describe("evaluateHealth env threshold overrides", () => { process.env.AGENTMEMORY_HEALTH_MEM_WARN_PCT = "NaN"; process.env.AGENTMEMORY_HEALTH_MEM_CRITICAL_PCT = "Infinity"; process.env.AGENTMEMORY_HEALTH_MEM_RSS_FLOOR_MB = "-1"; + process.env.AGENTMEMORY_HEALTH_MEM_CRITICAL_RSS_MB = "0"; + process.env.AGENTMEMORY_HEALTH_MEM_SYSTEM_FREE_FLOOR_RATIO = "-0.25"; expect(evaluateHealth(snap({ eventLoopLagMs: 150 }))).toMatchObject({ status: "degraded", @@ -398,12 +485,38 @@ describe("evaluateHealth env threshold overrides", () => { }); expect(evaluateWithSnapshotHeapTotal(highHeapBelowDefaultRssFloor).status).toBe("healthy"); - const highCriticalHeap = snap({ + const highCriticalHeapWithoutDefaultPressure = snap({ memory: { heapUsed: 970 * 1024 * 1024, heapTotal: 1000 * 1024 * 1024, rss: 1100 * 1024 * 1024, external: 0, + systemFree: 8 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + expect(evaluateWithSnapshotHeapTotal(highCriticalHeapWithoutDefaultPressure).status).toBe("degraded"); + + const highCriticalHeapWithDefaultSystemPressure = snap({ + memory: { + heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 1100 * 1024 * 1024, + external: 0, + systemFree: 1 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, + }, + }); + expect(evaluateWithSnapshotHeapTotal(highCriticalHeapWithDefaultSystemPressure).status).toBe("critical"); + + const highCriticalHeap = snap({ + memory: { + 
heapUsed: 970 * 1024 * 1024, + heapTotal: 1000 * 1024 * 1024, + rss: 5 * 1024 * 1024 * 1024, + external: 0, + systemFree: 8 * 1024 * 1024 * 1024, + systemTotal: 16 * 1024 * 1024 * 1024, }, }); expect(evaluateWithSnapshotHeapTotal(highCriticalHeap).status).toBe("critical"); @@ -424,8 +537,10 @@ describe("evaluateHealth env threshold overrides", () => { expect(evaluateWithSnapshotHeapTotal(s).status).toBe("healthy"); expect( - evaluateWithSnapshotHeapTotal(s, { memoryRssFloorBytes: 10 * 1024 * 1024 }) - .status, + evaluateWithSnapshotHeapTotal(s, { + memoryCriticalRssBytes: 10 * 1024 * 1024, + memoryRssFloorBytes: 10 * 1024 * 1024, + }).status, ).toBe("critical"); }); });