diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cacfbbc7e..7e1e49821 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,3 +68,38 @@ jobs: path: coverage/ if-no-files-found: error retention-days: 7 + + restart-retest: + name: "Issue #349 restart retest (${{ matrix.os }})" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v6 + with: + persist-credentials: false + - uses: actions/setup-node@v6 + with: + node-version: 22 + - run: corepack enable + - run: pnpm install --frozen-lockfile --ignore-scripts + - run: pnpm run build + - run: node scripts/github/issue-349-restart-retest.mjs + + engine-state-probe: + name: "Issue #349 engine state probe (${{ matrix.os }})" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v6 + with: + persist-credentials: false + - uses: actions/setup-node@v6 + with: + node-version: 22 + - run: node scripts/github/issue-349-engine-state-probe.mjs diff --git a/docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md new file mode 100644 index 000000000..4cf6f967e --- /dev/null +++ b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md @@ -0,0 +1,102 @@ +# Arena Synthesis: Issue 349 + +## Rubric + +1. Uses current repo evidence for restart, persistence, and stop behavior. +2. Distinguishes issue #349's laptop restart / v0.9.27 screenshot from issue + #338's CLI stop path and Windows residual. +3. Correctly identifies duplicate, stale, already-fixed, or remaining valid + scope and the Human Checkpoint requirement. +4. Proposes the smallest testable next step without broad persistence or + iii-engine boundary changes. +5. Names inspected sources, commands, files, and residual uncertainty. 
+ +## Scores + +| Candidate | Repo evidence | Issue distinction | Classification / checkpoint | Next step | Sources / uncertainty | Total | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| A | 5 | 5 | 5 | 4 | 5 | 24 | +| B | 5 | 5 | 5 | 5 | 5 | 25 | +| C | 4 | 5 | 5 | 3 | 3 | 20 | + +## Decision + +Base: Candidate B. + +Candidate B is the strongest base because it cleanly decomposes "restarted +laptop" into separate possible paths: supported `agentmemory stop`, OS reboot +while the daemon is running, forceful power loss, or a startup/catalog issue. +That keeps the conclusion precise: issue #349 overlaps with the #338 data-loss +family, but it is not implementation-ready and should not be closed or mutated +without a Human Checkpoint. + +Grafts: +- From Candidate A: current app-level index rebuilds use iii `state::list`; they + do not scan raw iii state files. If iii's catalog returns empty after boot, + patching around raw state files would cross engine/persistence boundaries and + needs approval. +- From Candidate A: the #338 path/data-dir class is stale on current code, but a + literal OS/laptop restart is broader than the CLI stop path. +- From Candidate C: compact final framing: #349 is stale or likely duplicate + only for the #338 `agentmemory stop` interpretation and is not independently + valid for implementation without a Human Checkpoint. + +Rejected: +- Closing #349 now as already fixed. The public issue action requires approval, + and #349 says "restarted laptop", not confirmed `agentmemory stop`. +- Implementing now. There is no current-main reproduction and the likely + distinct paths cross restart, persistence, iii-engine lifecycle, or startup + reconciliation boundaries. +- Treating #1034 as a persistence change. The diff from the #338 merge to + current `origin/main` is iii runtime compatibility diagnostics and task docs. +- Claiming the CLI stop fix covers arbitrary OS reboot. 
PR #1033 invokes the + checkpoint through `agentmemory stop`; a laptop reboot may bypass that endpoint + and rely on worker process signals or platform shutdown ordering. + +## Validity Finding + +Issue #349 requires a Human Checkpoint. + +Current evidence supports **already fixed / stale / likely duplicate only for +the #338 class**: `agentmemory stop` now checkpoints the worker before native +signals through `postShutdownFlush()`, `executeResponsiveNativeStop()`, +`mem::shutdown-flush`, and authenticated `POST /agentmemory/shutdown/flush`. + +Current evidence does **not** prove a literal laptop or OS restart is fixed. +The issue body and upstream source provide no commands, OS, logs, data-dir +details, or current-version reproduction. The screenshot shows v0.9.27 before +PR #1033 merged. The worker still has a normal `SIGINT`/`SIGTERM` shutdown path +for non-CLI process termination, so a non-CLI reboot can bypass the #1033 CLI +checkpoint. + +## Recommended Checkpoint Options + +Recommended: keep the issue open and post a clarification/retest comment asking +for OS, current version, whether `agentmemory stop` was used before reboot, +whether the issue reproduces on a build containing PR #1033, and whether old +state files remain under the data directory after restart. + +Other options: +- Close as covered by #338 / PR #1033 if the user accepts the ambiguity and + wants to treat the v0.9.27 report as stale or duplicate. +- Approve a narrow validation task that first builds a reproduction harness for + OS/laptop restart behavior separately from the already-fixed CLI stop path. + +## Verification + +Arena verification completed by reading every candidate report and comparing +the judge verdict with a parent source inspection: +- Public issue #349 and upstream #876 contain the same sparse v0.9.27 + laptop-restart report and no comments. +- Public issue #338 is closed completed by PR #1033, merge commit + `2ecbe54aa822462c5480beb59ac0f391723dfabd`. 
+- Current `origin/main` is + `257238ab1c318b2e9ae5efcbe72863b99c41ee35`. +- `git diff --quiet 2ecbe54aa822462c5480beb59ac0f391723dfabd..origin/main --` + the shutdown, index-persistence, API flush, and relevant test files returned + `0`, meaning #1034 did not change those surfaces. +- `rg` confirms the #1033 shutdown flush path and the remaining worker signal + shutdown path. + +No implementation tests were run because the current outcome is a read-only +validity checkpoint, not a code change. diff --git a/docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md new file mode 100644 index 000000000..cba9edb04 --- /dev/null +++ b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md @@ -0,0 +1,180 @@ +# Issue 349 GitHub Restart Retest Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a GitHub Actions retest harness that validates the current #338 restart fix in clean GitHub runners for issue #349. + +**Architecture:** Keep production code unchanged. Add a small Node.js script under `scripts/github/` that starts the built CLI in an isolated temp `HOME`, writes a sentinel memory through REST, stops via the supported CLI path, restarts, and verifies the sentinel survives via REST search/list endpoints. Wire it into the existing CI workflow as a dedicated job on Ubuntu and macOS so the workflow runs from the normal PR path. + +**Tech Stack:** GitHub Actions, Node.js 22, pnpm 11, existing built `dist/cli.mjs`, built-in `fetch`, `node:child_process`, and repository REST endpoints. 
+ +--- + +## Files + +- Create: `scripts/github/issue-349-restart-retest.mjs` +- Modify: `.github/workflows/ci.yml` +- Modify: `docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md` + +## Task 1: Add The Retest Harness Script + +**Files:** +- Create: `scripts/github/issue-349-restart-retest.mjs` + +- [x] **Step 1: Write the harness script** + +Create `scripts/github/issue-349-restart-retest.mjs` with these responsibilities: +- Create a temp root, temp `HOME`, temp data dir, and temp invocation cwd. +- Start `node dist/cli.mjs --data-dir <temp-data-dir>` with `HOME` and `AGENTMEMORY_READY_TIMEOUT_MS=120000`. +- Wait for `/agentmemory/health`. +- POST a unique sentinel to `/agentmemory/remember`. +- Verify the sentinel appears via `/agentmemory/search` and `/agentmemory/memories`. +- POST `/agentmemory/shutdown/flush`. +- Run `node dist/cli.mjs stop` with the same temp `HOME` and data dir. +- Restart the server with the same temp `HOME` and data dir. +- Verify the sentinel still appears via `/agentmemory/search` and `/agentmemory/memories`. +- Stop the restarted server. +- Print structured progress lines and fail fast with safe diagnostics if any step fails. + +- [x] **Step 2: Run the script against a freshly built `dist/`** + +Run after build only: + +```bash +corepack pnpm run build +node scripts/github/issue-349-restart-retest.mjs +``` + +Expected on a clean runner: PASS. Expected locally in this worktree: may fail or be skipped if the default iii ports are already occupied by the user's daemon. Do not stop the user's daemon. + +Actual: `node --check scripts/github/issue-349-restart-retest.mjs` passed. The live harness was intentionally not run locally because the user's existing daemon is already listening on the default iii/REST ports; it will run on clean GitHub Actions runners after push. 
+ +## Task 2: Wire The Harness Into Existing CI + +**Files:** +- Modify: `.github/workflows/ci.yml` + +- [x] **Step 1: Add a dedicated job** + +Add a `restart-retest` job after the existing test job: +- `runs-on: ${{ matrix.os }}` +- matrix `os: [ubuntu-latest, macos-latest]` +- checkout with `persist-credentials: false` +- setup Node 22 +- enable corepack +- `pnpm install --frozen-lockfile --ignore-scripts` +- `pnpm run build` +- `node scripts/github/issue-349-restart-retest.mjs` + +Keep it separate from the existing `test` job so failures point directly at issue #349 restart behavior. + +- [x] **Step 2: Verify the workflow text** + +Run: + +```bash +git diff --check +``` + +Expected: no whitespace errors. + +Actual: `git diff --check` passed. + +## Task 3: Local Verification + +**Files:** +- All touched files + +- [x] **Step 1: Run focused tests** + +Run: + +```bash +corepack pnpm exec vitest run test/index-persistence.test.ts test/search.test.ts test/shutdown-flush.test.ts test/api-boundary-coverage.test.ts test/cli-stop-port-detection.test.ts test/reconnect-registration.test.ts test/engine-launch.test.ts test/runtime-config.test.ts test/cli-iii-config.test.ts test/consistency.test.ts +``` + +Expected: all targeted tests pass. + +Actual: passed, 10 test files / 138 tests. + +- [x] **Step 2: Run build** + +Run: + +```bash +corepack pnpm run build +``` + +Expected: build exits 0 and produces `dist/cli.mjs`. + +Actual: `corepack pnpm run build` passed and produced `dist/cli.mjs`. + +- [x] **Step 3: Run local harness only if safe** + +Before running the live harness locally, verify no existing iii/agentmemory process is listening on `49134` or `3111`: + +```bash +lsof -nP -iTCP:49134 -sTCP:LISTEN +lsof -nP -iTCP:3111 -sTCP:LISTEN +``` + +If those ports are occupied, do not run the local live harness. Record the blocker and rely on GitHub Actions clean runners after push. 
+ +Actual: ports `49134` and `3111` are occupied by the user's existing daemon, so the live harness was not run locally. + +## Task 4: Publish For GitHub Retest + +**Files:** +- Git branch / PR metadata + +- [x] **Step 1: Stage and commit task-owned files** + +Run: + +```bash +git add .github/workflows/ci.yml scripts/github/issue-349-restart-retest.mjs docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md +git commit -m "test: add issue 349 restart retest harness" +``` + +Actual: staged paths were limited to the workflow, GitHub retest script, and issue #349 task notes. `git diff --cached --check` passed and `gitleaks protect --staged --redact` found no leaks across about 30 KB of staged content before commit. + +- [x] **Step 2: Push to origin** + +Run only after local verification: + +```bash +git push -u origin issue/349-lost-data-after-restart +``` + +Actual: pushed branch `issue/349-lost-data-after-restart` to `origin`. + +- [x] **Step 3: Create PR against `origin/main`** + +Use a PR body that states: +- This is a retest harness for #349, not a product fix. +- It compares #349 against #338 / PR #1033. +- Local targeted tests and build passed. +- Local live harness was blocked by an existing user daemon on default ports. +- GitHub Actions clean runners are expected to run the restart harness. + +Actual: created PR #1038 against `origin/main`. + +- [x] **Step 4: Monitor GitHub Actions** + +Fetch PR check status and inspect failed job logs if any. Do not merge until the retest result is understood and the user approves the final issue outcome. + +Actual: first GitHub Actions run `27859863679` completed with both normal `test` jobs green and both new `Issue #349 restart retest` jobs red. 
Logs show the sentinel was visible before stop, `agentmemory stop` reported persistence, and the second start rebuilt the search index with zero entries before failing to find the sentinel. + +Follow-up: pushed diagnostic commit `78dd8f48` and reran GitHub Actions as +`27859962631`. Normal test jobs passed again. Both restart-retest jobs failed +again, now with explicit evidence that after restart both search and memory list +lost the sentinel: `search=false memories=false`, +`search={"format":"full","results":[],"tokens_used":0,"truncated":false}`, +and `memories={"limit":null,"memories":[],"offset":0,"total":0}`. + +## Self-Review + +- The plan does not change production TypeScript runtime behavior. +- The workflow change is isolated to a dedicated CI job for a user-approved GitHub retest. +- The live harness uses temp `HOME`/data directories and no credentials. +- The local daemon is explicitly protected from stop/reuse. diff --git a/docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md new file mode 100644 index 000000000..238e1575e --- /dev/null +++ b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md @@ -0,0 +1,484 @@ +# Issue 349 Lost Data After Restart + +Task id: `2026-06-20-issue-349-lost-data-after-restart` + +## Scope + +Handle fork issue #349 on branch `issue/349-lost-data-after-restart` from +requested start ref `2ecbe54aa822462c5480beb59ac0f391723dfabd`, the merge +commit for PR #1033 / issue #338. + +## Sprint Contract + +Goal: validate issue #349 against the merged #338 fix and current `origin/main` +behavior, then either stop for the required Human Checkpoint or implement a +surgical, tested fix if a distinct valid scope remains approved. + +Scope: +- Validate issue #349 using public unauthenticated issue evidence and repo + evidence. +- Compare against issue #338, PR #1033, merge commit + `2ecbe54aa822462c5480beb59ac0f391723dfabd`, and current `origin/main`. 
+- Use `$arena` before implementation. +- If a valid non-blocked scope remains, add focused red/green tests around the + reproduced restart/data-loss path before production edits. +- Target only `origin` (`https://github.com/wbugitlab1/agentmemory.git`) for any + later PR workflow. + +Non-goals: +- Do not target `https://github.com/rohitg00/agentmemory/` for branches, PRs, or + remote writes. +- Do not import broad upstream patches or unrelated persistence behavior. +- Do not change public APIs, tool counts, schemas, auth, dependencies, storage + model, iii-engine boundaries, or project policy without explicit approval. +- Do not close, comment on, or otherwise mutate the issue before the required + checkpoint. + +Acceptance criteria: +- Branch/worktree state is confirmed and recorded. +- Issue #349 evidence is compared with #338 and current `origin/main`. +- Arena synthesis records whether #349 is valid, duplicate, stale, already + fixed, or blocked by insufficient evidence. +- Human checkpoint happens before duplicate/stale/already-fixed closure, scope + expansion, or persistence/engine-boundary implementation. +- If implementation proceeds, verification covers every touched scope and + required security gates are run or blockers are recorded. + +Intended verification: +- `git status -sb --untracked-files=all` +- `git remote -v` +- `git worktree list --porcelain` +- Public unauthenticated issue/PR reads for #349, upstream #876, #338, and + PR #1033. +- Focused source/test inspection around shutdown, persistence, and restart + behavior. +- If implementation proceeds: targeted red/green tests, repo-native checks, and + required security scans. + +Known boundaries: +- `origin/main` advanced to `257238ab1c318b2e9ae5efcbe72863b99c41ee35` after + issue #337 merged, but this branch was created from the requested #338 merge + commit `2ecbe54aa822462c5480beb59ac0f391723dfabd`. +- The fork's `upstream` remote exists locally but is out of bounds for task + writes. 
+- Issue #349 is sparse and imported from upstream #876; its screenshot shows + agentmemory v0.9.27 with zero dashboard counts after a laptop restart. +- Issue #338 was closed by PR #1033 after adding an explicit shutdown flush + path for native stop and Windows fail-closed behavior. + +Stop conditions: +- Stop before closing issue #349 as duplicate, stale, invalid, unreproducible, or + already fixed. +- Stop before expanding scope beyond issue #349's evidence. +- Stop before persistence, restart, iii-engine lifecycle, public API/tool/schema, + auth, dependency, storage-boundary, or project-policy changes. +- Stop if arena conclusions diverge materially. +- Stop if required verification or security gates fail and the finding is not + fixed. + +## Evidence + +- Active instructions: `AGENTS.md` and the project-local + `triage-next-github-issues` skill were read. +- `$arena` skill was read and started with the required phase checklist. +- `git status -sb --untracked-files=all`: clean before branch creation and clean + after branch creation. +- `git remote -v`: `origin` targets `https://github.com/wbugitlab1/agentmemory.git`; + `upstream` exists but is out of bounds for task writes. +- `git worktree list --porcelain`: this isolated worktree is + `/Users/A1538552/.codex/worktrees/3d23/agentmemory`. +- Branch `issue/349-lost-data-after-restart` did not exist locally or on + `origin`; it was created from + `2ecbe54aa822462c5480beb59ac0f391723dfabd`. +- Public issue #349: open, no comments, imports upstream #876, body says the + reporter restarted a laptop and lost all memories. +- Upstream #876: open, same one-line body and screenshot, no comments. +- Screenshot: dashboard from agentmemory v0.9.27 on June 9, 2026, with sessions, + memories, graph nodes, and function calls all at zero. +- Public issue #338: closed completed at 2026-06-20T03:21:45Z. 
+- PR #1033: merged at 2026-06-20T03:21:44Z with merge commit + `2ecbe54aa822462c5480beb59ac0f391723dfabd`; summary says it adds + `mem::shutdown-flush`, `POST /agentmemory/shutdown/flush`, pre-stop + checkpointing, Windows fail-closed behavior, and readiness gating. + +## Feature / Verification Matrix + +| Change / Decision | Verification method | Status | Evidence | +| --- | --- | --- | --- | +| Branch/worktree setup | Git commands | Done | Branch created from requested #338 merge commit; status clean. | +| Issue evidence read | Public unauthenticated GitHub API and screenshot inspection | Done | #349 and upstream #876 contain the same sparse v0.9.27 restart-loss report and no comments. | +| #338 comparison | Public PR/issue evidence plus local task docs/source | Done | #338 fix artifacts and shutdown code inspected; PR #1033 covers the CLI `agentmemory stop` path. | +| Current `origin/main` comparison | Git diff/log from #338 merge to `origin/main` | Done | Current `origin/main` is #1034; no changes to shutdown flush, index persistence, API flush, core shutdown, or relevant tests. | +| Arena validity synthesis | `$arena` | Done | Synthesis saved in `arena-synthesis.md`; Candidate B selected as base with A/C grafts. | +| Local retest | Focused regression tests, build, safe live-harness feasibility check | Done | Targeted #338/#349-relevant tests passed 10 files / 138 tests; build passed; built CLI reports `0.9.28`. Isolated native live harness blocked by existing iii process on default ports. | +| GitHub restart retest harness | Script/workflow inspection, syntax check, workflow-focused tests, build, GitHub Actions after push | Remote reproduced restart data loss | Added a dedicated GitHub Actions restart-retest job for Ubuntu and macOS plus `scripts/github/issue-349-restart-retest.mjs`. Local `node --check`, quality/consistency tests, `git diff --check`, and build passed. 
GitHub runs `27859863679` and `27859962631` had normal test jobs green and restart-retest jobs red. The second run shows after restart both search and memory list lost the sentinel (`search=false memories=false`, `/memories` returned `total:0`). | +| Security gates | Semgrep and staged Gitleaks | Done | Initial Semgrep flagged the local loopback health checks; narrow `nosemgrep` rationales were added for those two CI harness fetches. Final `semgrep scan --config p/default --error --metrics=off .` passed with 0 findings across 997 targets; `gitleaks protect --staged --redact` found no leaks across about 30 KB staged content. | +| Human checkpoint | User decision | Diagnostic pivot approved; public issue actions still pending | User approved the engine-boundary diagnostic pivot with "gut, tue es". Public issue comments/closure and stopping/reusing the user's live daemon remain unapproved. | +| Persistence fix attempt 1 | Red/green tests, source inspection, build, full test suite, GitHub Actions | Remote retest failed | Added a shutdown `KV.state` checkpoint with readback after index flush and a confirmed-checkpoint settle phase before signaling iii-engine. RED run failed on the missing state checkpoint and missing settle event; GREEN run passed. GitHub Actions run `27862336799` still lost memories after restart, disproving this as sufficient. | +| Persistence fix attempt 2 | Red/green tests, focused restart/flush tests, GitHub Actions | Remote retest failed | Diagnostic run `27862428578` showed the sentinel and shutdown checkpoint visible in live `state::list` / `state::get` after flush, but no persisted state after the engine was stopped with `SIGTERM`. Added a test-backed change to signal iii-engine with `SIGINT` after a confirmed checkpoint while preserving `SIGTERM` for unconfirmed and force paths. GitHub Actions run `27862548284` still lost memories after restart, disproving this as sufficient. 
| +| Arena next-step synthesis | `$arena` with three candidates and cross-judge | Done | All candidates converged on stopping production fixes and adding an engine-only iii-state persistence probe. Candidate C was selected as base with B/A grafts; synthesis saved at `/tmp/arena-issue349-next/synthesis.md`. | +| Engine-only iii-state probe | Red/green static tests, syntax check, CI matrix after push | Engine-boundary failure proven; Human Checkpoint | Added `scripts/github/issue-349-engine-state-probe.mjs` and CI job `engine-state-probe` to test direct `state::set/get/list/list_groups` persistence across an iii-engine restart without agentmemory REST, worker, shutdown flush, or `agentmemory stop` in the path. After harness fixes, GitHub run `27865569335` reached the actual state check on Ubuntu and macOS: direct `state::set/get/list/list_groups` verified the sentinel while the first engine was live, the engine stopped cleanly with `SIGINT` and exit code 0, then the second engine returned empty `state::get`, empty `state::list`, and no `probe:issue-349` group. This proves #349 at the iii-engine file-based state boundary. | + +## Subagent Ledger + +| Workstream | Scope | Edits allowed | Expected output | Result | Residual risk | +| --- | --- | --- | --- | --- | --- | +| Arena candidate A | Issue #349 validity | No | Validity report with #338/current-main comparison | Done | Strong repo evidence and boot-reconcile warning; selected for grafts. | +| Arena candidate B | Issue #349 validity | No | Validity report with #338/current-main comparison | Done | Selected as base; best scenario decomposition and next-step framing. | +| Arena candidate C | Issue #349 validity | No | Validity report with #338/current-main comparison | Done | Concise checkpoint framing; selected for graft. | +| Arena judge | Candidate reports and rubric | No | Scores and recommended base | Done | Recommended Candidate B with A/C grafts; agreed Human Checkpoint is required. 
| + +## Progress Notes + +- 2026-06-20: Read active repo instructions, the project-local triage workflow, + `$arena`, and `using-superpowers`. +- 2026-06-20: Confirmed this worktree initially had detached `HEAD` and no dirty + files. Fetched `origin`; `origin/main` advanced beyond the requested #338 + merge commit to `257238ab1c318b2e9ae5efcbe72863b99c41ee35`. +- 2026-06-20: Verified requested start commit + `2ecbe54aa822462c5480beb59ac0f391723dfabd` exists and is an ancestor of + current `origin/main`; created branch `issue/349-lost-data-after-restart` from + that commit. +- 2026-06-20: Public unauthenticated issue reads show #349/upstream #876 are + sparse restart-loss reports with no reproduction commands, no comments, and a + v0.9.27 zero-count dashboard screenshot. +- 2026-06-20: Started `$arena` validity pass before implementation or closure. +- 2026-06-20: `$arena` completed. All candidates and the judge converged that + #349 is likely stale/duplicate only for the #338 `agentmemory stop` path, but + the literal laptop/OS restart wording is not proven fixed and is not + implementation-ready. Synthesis saved in `arena-synthesis.md`. +- 2026-06-20: Parent verification confirmed current `origin/main` + (`257238ab1c318b2e9ae5efcbe72863b99c41ee35`) changes only issue #337 iii + runtime compatibility surfaces after #338; the shutdown flush, index + persistence, API flush, core shutdown, and relevant tests are unchanged from + the #338 merge. +- 2026-06-20: Current state is Human Checkpoint. Recommended action is to keep + #349 open and, if approved, post a public clarification/retest comment. + Closing as duplicate/stale or starting OS-restart implementation both require + explicit approval. +- 2026-06-20: User approved the retest path with "mach den retest". Installed + locked dependencies with + `NPM_CONFIG_USERCONFIG=/dev/null corepack pnpm install --frozen-lockfile --ignore-scripts` + after confirming no repo-local `.npmrc`/`.pnpmrc`. 
The install reused cached + packages, did not change manifests or lockfiles, and ran no lifecycle scripts. +- 2026-06-20: Focused regression retest passed: + `corepack pnpm exec vitest run test/index-persistence.test.ts test/search.test.ts test/shutdown-flush.test.ts test/api-boundary-coverage.test.ts test/cli-stop-port-detection.test.ts test/reconnect-registration.test.ts test/engine-launch.test.ts test/runtime-config.test.ts test/cli-iii-config.test.ts test/consistency.test.ts` + reported 10 test files and 138 tests passed. +- 2026-06-20: Build retest passed with `corepack pnpm run build`; generated + ignored `dist/` output and the built CLI returned `0.9.28` for + `node dist/cli.mjs --version`. +- 2026-06-20: Isolated native live restart harness was not run. `lsof` showed an + existing iii/agentmemory process listening on `*:49134` and + `127.0.0.1:3111`; repo docs and `/Users/A1538552/.agentmemory/bin/iii --help` + confirm bundled native iii v0.11.2 exposes `--config` but no verified listen + port relocation. Starting a second native engine is therefore not safe, and + stopping/reusing the existing daemon would affect real user memory state + outside this worktree. +- 2026-06-20: User approved the GitHub retest path. Added + `scripts/github/issue-349-restart-retest.mjs`, which starts the built CLI in + a temp `HOME` and data directory, writes a sentinel memory through REST, + verifies it, calls `/agentmemory/shutdown/flush`, stops through the supported + CLI `stop` path, restarts, and verifies the sentinel survived. +- 2026-06-20: Added a dedicated `.github/workflows/ci.yml` job named + `restart-retest` with Ubuntu and macOS runners. Local harness syntax check + passed with `node --check scripts/github/issue-349-restart-retest.mjs`. + Workflow-focused tests passed with + `corepack pnpm exec vitest run test/quality-gates.test.ts test/consistency.test.ts`; + `git diff --check` passed; `corepack pnpm run build` passed. 
The actual live + restart retest is pending GitHub Actions on a clean runner after push. +- 2026-06-20: Security gates completed for the GitHub retest harness. The first + Semgrep pass reported two HTTP findings for local loopback health checks in + the CI harness; added narrow `nosemgrep` comments explaining the loopback-only + daemon boundary and reran `semgrep scan --config p/default --error --metrics=off .`, + which passed with 0 findings across 997 targets. Staged Gitleaks passed with + no leaks across about 30 KB of staged content. +- 2026-06-20: Created PR #1038 from + `issue/349-lost-data-after-restart` to `origin/main`. GitHub Actions run + `27859863679` completed with both normal `test` jobs green and both new + `Issue #349 restart retest` jobs red. Ubuntu and macOS logs both show the + sentinel visible before stop, `agentmemory stop` reporting persistence, and + second start rebuilding the search index with `entries:0` before the sentinel + search failed. This makes #349 valid enough to block closure as duplicate or + already-fixed; any production persistence/restart fix remains behind the + required Human Checkpoint. +- 2026-06-20: The first macOS failed log also showed cleanup masking risk after + the second-start failure. Hardened the harness to fetch both `/search` and + `/memories` before throwing, wait for stopped child processes, and avoid + cleanup failures masking the original restart result. +- 2026-06-20: Pushed diagnostic commit `78dd8f48` and monitored GitHub Actions + run `27859962631`. Normal `test` jobs passed on Ubuntu and macOS. Both + `Issue #349 restart retest` jobs failed again. The improved error on both + platforms was + `after-restart: sentinel verification failed search=false memories=false`; + `/search` returned no results and `/memories` returned + `{"limit":null,"memories":[],"offset":0,"total":0}`. This reproduces #349 as + restart data loss in clean GitHub runners after the merged #338 fix. 
+- 2026-06-20: Current checkpoint: issue #349 is valid; it is not a duplicate, + already fixed, or stale. Production remediation would cross the persistence / + restart boundary and requires explicit Human Checkpoint approval before + implementation. Public issue comments also require explicit approval before + posting. +- 2026-06-20: User approved retrying the remediation path with "versuchs + erneut". Public issue comments/closure were not posted. Root-cause pass found + that the #338 shutdown flush only saved BM25/vector index state and did not + create an application-level `state::set` checkpoint after the memory write + path; `agentmemory stop` then signaled the worker and immediately signaled + iii-engine. The local SDK exposes no dedicated engine state flush. +- 2026-06-20: Added RED tests for the missing `KV.state` shutdown checkpoint + and the missing confirmed-checkpoint settle phase before the engine signal. + The RED command + `corepack pnpm exec vitest run test/shutdown-flush.test.ts test/cli-stop-port-detection.test.ts` + failed as expected with three assertions: `statePersistence.set` was never + called, state checkpoint mismatch did not reject, and the stop event sequence + lacked `settle`. +- 2026-06-20: Implemented a surgical fix: `mem::shutdown-flush` now saves the + search index, writes `KV.state/system:shutdownFlush`, reads it back, and + rejects if the checkpoint is not confirmed. The responsive native stop path + waits one second after a confirmed pre-stop checkpoint and worker signal + before signaling iii-engine. No public REST/MCP/tool schema or dependency + surface changed. 
+- 2026-06-20: Local verification after the fix passed: + `corepack pnpm exec vitest run test/shutdown-flush.test.ts test/cli-stop-port-detection.test.ts` + reported 2 files / 19 tests passed; the broader focused set reported + 10 files / 140 tests passed; `corepack pnpm run build`, `node --check + scripts/github/issue-349-restart-retest.mjs`, and `git diff --check` passed; + full `corepack pnpm test` reported 221 files / 3023 tests passed. +- 2026-06-20: Security gate for the persistence implementation passed: + `semgrep scan --config p/default --error --metrics=off .` completed with + 0 findings across 997 tracked targets. OSV was not run because no dependency, + lockfile, container, vendored, package-manager, or third-party package + surface changed. +- 2026-06-20: Pushed fix commit `89484acd`. GitHub Actions run + `27862336799` passed the normal Ubuntu/macOS test jobs but both restart + retest jobs still failed. Logs confirm the new settle delay happened between + the worker stop and iii-engine stop, yet second start rebuilt the search index + with `entries:0` and `/memories` returned `total:0`. This disproves the + "missing application-level state barrier plus immediate engine SIGTERM" as a + sufficient fix. +- 2026-06-20: Added CI-harness diagnostics before attempting a second + production fix. The harness now prints generated iii config files, the + isolated data directory tree, and direct `iii trigger state::list_groups`, + `state::list mem:memories`, and `state::get mem:state/system:shutdownFlush` + output after the flush and after the second start. Local verification for + this diagnostic change passed: `node --check + scripts/github/issue-349-restart-retest.mjs`, + `corepack pnpm exec vitest run test/quality-gates.test.ts + test/consistency.test.ts`, `git diff --check`, and + `semgrep scan --config p/default --error --metrics=off .`. 
+- 2026-06-20: GitHub diagnostic run `27862428578` passed the normal Ubuntu and + macOS test jobs and failed both restart retest jobs. Diagnostics narrowed the + failure to the iii-engine file-based KV shutdown path: after the explicit + flush, direct `iii trigger state::list mem:memories` returned the sentinel + memory and `state::get mem:state/system:shutdownFlush` returned the checkpoint; + after `agentmemory stop` and second start, `state::list_groups` returned only + `mem:health`, `state::list mem:memories` was empty, and the isolated data tree + still showed only empty top-level `state_store.db` and `stream_store` + directories. +- 2026-06-20: Second production hypothesis: the confirmed app-level checkpoint + is live in iii-engine, but `SIGTERM` does not give the bundled file-based KV + adapter a graceful shutdown path. Added RED expectations that confirmed + responsive stops signal iii-engine with `SIGINT` after the worker signal and + settle phase, while unconfirmed and force stops continue to use `SIGTERM`. + The RED run of + `corepack pnpm exec vitest run test/cli-stop-port-detection.test.ts` failed + exactly on `SIGTERM` versus expected `SIGINT`. Implemented the minimal signal + selection in `executeResponsiveNativeStop`; the GREEN run reported 15 tests + passed, and the adjacent focused set + `corepack pnpm exec vitest run test/shutdown-flush.test.ts + test/cli-stop-port-detection.test.ts test/reconnect-registration.test.ts` + reported 3 files / 27 tests passed. +- 2026-06-20: Local verification for the second production attempt passed: + `git diff --check`, `node --check scripts/github/issue-349-restart-retest.mjs`, + `corepack pnpm run build`, full `corepack pnpm test` (221 files / 3023 + tests), `semgrep scan --config p/default --error --metrics=off .` (0 findings + across 997 tracked targets), and `gitleaks protect --staged --redact` (no + leaks across about 2.67 KB staged content). 
OSV was not run because no + dependency, lockfile, container, vendored, package-manager, or third-party + package surface changed. +- 2026-06-20: Pushed second fix attempt commit `8eb5cfc4`. GitHub Actions run + `27862548284` passed the normal Ubuntu/macOS test jobs but both Issue #349 + restart retest jobs failed again. The filtered logs show the same persistence + boundary failure after the stop: the configured `file_path` still points at + the isolated temp `data/state_store.db`, the data tree still contains only the + empty top-level `state_store.db` and `stream_store` directories, direct + `state::list_groups` after restart returns only `mem:health`, direct + `state::list mem:memories` returns `[]`, the shutdown checkpoint read returns + empty stdout, and `/search` plus `/memories` both miss the sentinel. This + disproves both implemented production hypotheses so far: adding an app-level + state checkpoint plus settle delay, and using `SIGINT` for confirmed engine + stop. Stop before a third production attempt; next step needs a Human + Checkpoint / architecture decision on the iii-engine file-based persistence + boundary rather than another blind stop-path tweak. +- 2026-06-20: Ran `$arena` for the next-step recommendation. Candidate C was + selected as the base by the cross-judge, with Candidate B's decision table / + boundary wording and Candidate A's identity-proof requirements grafted in. + The synthesized recommendation is to stop production patching and add a + diagnostic-only engine boundary proof that bypasses agentmemory REST handlers, + memory write paths, shutdown flush, worker execution, and `agentmemory stop`. + Synthesis was saved at `/tmp/arena-issue349-next/synthesis.md`. +- 2026-06-20: User approved the diagnostic pivot with "gut, tue es". Added RED + static tests requiring an engine-only Issue #349 probe script and CI wiring. 
+ The RED command + `corepack pnpm exec vitest run test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts` + failed as expected because the script and workflow job were missing. Added + `scripts/github/issue-349-engine-state-probe.mjs`, which downloads pinned + iii v0.11.2, starts a minimal `iii-state` `file_based` engine with a temp + absolute `state_store.db` path, writes/verifies a direct sentinel through + `state::set/get/list/list_groups`, stops/restarts the same engine with + `SIGINT`, and logs version, binary path, config, cwd, pid, signal, exit + status, and data-tree evidence. Added CI job `engine-state-probe` for Ubuntu + and macOS. GREEN verification for the static probe tests passed 2 files / 24 + tests, and `node --check scripts/github/issue-349-engine-state-probe.mjs` + passed. The probe itself was not run locally because it intentionally refuses + an existing iii-engine on the default port; GitHub runners provide the + isolated proof loop. +- 2026-06-20: Final local verification before commit for the engine-only probe + passed: `corepack pnpm exec vitest run + test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts + test/consistency.test.ts` reported 3 files / 34 tests passed; `node --check` + passed for both Issue #349 GitHub harness scripts; `git diff --check` passed; + full `corepack pnpm test` reported 222 files / 3027 tests passed; `semgrep + scan --config p/default --error --metrics=off .` passed with 0 findings + across 998 tracked targets; `gitleaks protect --staged --redact` found no + leaks across about 18.09 KB staged content. OSV was not run because no + dependency, lockfile, container, vendored, package-manager, or third-party + package surface changed. +- 2026-06-20: Pushed engine-only probe commit `1384de31`. GitHub Actions run + `27865094228` passed the normal Ubuntu/macOS test jobs, and both existing + restart-retest jobs still failed with `search=false memories=false`. 
The new + engine-only probe jobs were not yet valid evidence: unquoted YAML job names + containing `#349` were parsed as comments and displayed only as `Issue + (ubuntu-latest)` / `Issue (macos-latest)`, making `gh` job-log retrieval + ambiguous with the restart retest jobs; the macOS engine probe also failed + before exercising state persistence with `Error: spawn tar ENOENT`. +- 2026-06-20: Hardened the diagnostic harness without touching product + persistence behavior: quoted the Issue #349 CI job names, used an absolute + Unix `/usr/bin/tar` path for the iii release extraction, and added static + tests for both requirements. Local verification for this harness correction + passed with `corepack pnpm exec vitest run + test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts` + reporting 2 files / 25 tests passed and `node --check + scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is + pending after push. +- 2026-06-20: Pushed harness fix commit `999f7513`. GitHub Actions run + `27865198014` had unambiguous job names and passed both normal Ubuntu/macOS + test jobs. Both existing restart-retest jobs still failed. The engine-only + probe still did not reach the state-persistence check: Ubuntu and macOS both + failed while extracting the iii release with `spawn /usr/bin/tar ENOENT`. + Added a RED static test requiring Node-based `.tar.gz` extraction without an + external `tar` binary; the RED run failed as expected against the current + script. Replaced the external extraction with a small Node `gunzipSync` tar + reader that writes regular file entries under the probe bin directory with + path-escape checks. GREEN verification passed with `corepack pnpm exec vitest + run test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts` + reporting 2 files / 25 tests passed and `node --check + scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is + pending after push. 
+- 2026-06-20: Pushed Node extraction commit `c067fea1`. GitHub Actions run + `27865292639` passed both normal Ubuntu/macOS test jobs; both restart-retest + jobs still failed. The engine-only probe advanced past the external `tar` + dependency but still did not reach the state-persistence check: both OSes + failed with `spawn /bin/iii ENOENT` when running `iii --version`. + A local Node-only archive parse of the same iii v0.11.2 release asset on this + Mac lists and executes `iii` successfully, so the next harness correction is + to log the extracted tree and locate the binary from the archive contents + instead of assuming a fixed `bin/iii` path. Added RED static expectations for + `after-download` tree logging and `findIiiBinary`; the RED run failed as + expected. Implemented recursive binary discovery and post-download tree + logging. GREEN verification passed with `corepack pnpm exec vitest run + test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts` + reporting 2 files / 25 tests passed and `node --check + scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is + pending after push. +- 2026-06-20: Pushed binary discovery commit `cb3eeb46`. GitHub Actions run + `27865399960` passed both normal Ubuntu/macOS test jobs; both restart-retest + jobs still failed. The engine-only probe now logs that `bin/iii` exists after + extraction on both OSes (`33220928b` on Ubuntu, `28371136b` on macOS), but + spawning it from the OS temp probe directory still fails with `ENOENT`. This + keeps the run in harness-failure territory rather than engine-state evidence. + Added a RED static expectation that executable probe files live under the + checkout workspace by default (`process.cwd()`) or an explicit + `ISSUE_349_PROBE_PARENT`, not OS temp. The RED run failed as expected. + Implemented checkout-local probe workspaces with `.agentmemory-issue-349-engine-*` + directories and cleanup. 
GREEN verification passed with `corepack pnpm exec + vitest run test/issue-349-engine-state-probe.test.ts + test/quality-gates.test.ts` reporting 2 files / 26 tests passed and + `node --check scripts/github/issue-349-engine-state-probe.mjs` passing. + Remote retest is pending after push. +- 2026-06-20: Pushed checkout-workspace commit `55dd073e`. GitHub Actions run + `27865479396` still failed before the state-persistence check. The logs + showed the extracted `iii` existed under the checkout-local probe workspace, + but spawning it still returned `ENOENT`. Root cause in the harness: `run()` + defaults to `engineCwd`, and `downloadIii()` ran `iii --version` before + `engineCwd` was created; the ENOENT was the missing cwd, not the binary. + Added a RED static test that `engineCwd` is created before the version + check; the RED run failed as expected. Moved `mkdir(engineCwd)` before the + first `run()` call in `downloadIii()`. GREEN verification passed with + `corepack pnpm exec vitest run + test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts` + reporting 2 files / 27 tests passed and `node --check + scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is + pending after push. +- 2026-06-20: Pushed cwd-creation harness fix commit `cdbbf0ae`. GitHub + Actions run `27865569335` passed the normal Ubuntu/macOS test jobs and still + failed both existing restart-retest jobs with the sentinel missing after + restart. The engine-only probe jobs now reached the direct iii-state + persistence check on both OSes and failed there, making the result valid + engine-boundary evidence rather than another harness failure. On Ubuntu and + macOS, pinned iii v0.11.2 started with a minimal `iii-state` `file_based` + config using an absolute checkout-local `state_store.db` path; direct + `state::set/get/list/list_groups` verified `probe:issue-349` while the first + engine was live; the engine then stopped via `SIGINT` with exit code 0. 
After + restart, direct `state::get` returned empty stdout, `state::list` returned + `[]`, and `state::list_groups` no longer included `probe:issue-349`. This + proves #349 is not confined to agentmemory REST handlers, memory writers, + shutdown flush, worker lifecycle, or `agentmemory stop`; the current blocker + is iii-engine file-based state persistence across restart. Stop at Human + Checkpoint before any third production attempt, engine-boundary workaround, + public issue/PR comment, or upstream issue filing. +- 2026-06-20: Rechecked the local Mac after the user restarted agentmemory. A + deliberately saved `memory_save` sentinel (`mem_mqmb6npc_d32cbc726ecd`) + survived restart via REST and direct iii `state::get`, with + `/Users/A1538552/.agentmemory/data/state_store.db/mem%3Amemories.bin` present. + Local state already contained hundreds of persisted observation/session scope + files, but no historical explicit `mem:memories` entries before the sentinel; + consolidation is disabled locally because no LLM provider is configured. + Follow-up discrepancy probe: local direct `state::set` into a new + `probe:issue-349-local-timing` scope did not create its `.bin` file + immediately, but the file appeared after a 3-second wait while the engine + stayed running. CI run `27865672399` stops 12-40 ms after the live state proof, + and the data tree shows `state_store.db` remains an empty directory on both + Ubuntu and macOS. Current working hypothesis: the CI failure is exposing the + iii file-based adapter's delayed disk flush window; it does not prove that all + previously flushed local memories are lost on restart. Next remote diagnostic, + before any production fix, should add an intentional pre-stop wait or poll for + the expected scope file and compare the restart result. +- 2026-06-20: Added the pre-stop scope-file materialization diagnostic to the + engine-only probe. 
RED verification: + `corepack pnpm exec vitest run test/issue-349-engine-state-probe.test.ts` + failed 1/7 on the missing `encodedScopeFilePath` assertion. GREEN change: + `scripts/github/issue-349-engine-state-probe.mjs` now computes the expected + encoded scope file path, waits up to 15 seconds for that `.bin` file to + materialize after live `state::set/get/list/list_groups` verification, logs + `scope file materialized`, and only then stops the first engine. GREEN + verification passed with the same Vitest target reporting 7/7 tests and + `node --check scripts/github/issue-349-engine-state-probe.mjs` exiting 0. + The CI result should classify the remaining branch: if restart passes after + the file appears, #349 is a delayed flush window; if restart still fails after + the file appears, the reload/read path is suspect; if the file never appears, + the file-based adapter is not writing the fresh scope in the CI setup. +- 2026-06-20: Pushed diagnostic commit `d5061f94`. GitHub Actions run + `27872251631` completed with normal test jobs green, both original + restart-retest jobs still red, and both engine-only probe jobs green. Engine + probe evidence: Ubuntu materialized + `data/state_store.db/probe%3Aissue-349.bin` after 5014 ms, then direct + `state::get/list/list_groups` survived restart and the job logged + `PASS direct iii state survived engine restart`; macOS materialized the same + scope file after 5075 ms and also survived restart. The unchanged app-level + restart retests still stopped before any memory scope file appeared; their + `state_store.db` directories remained empty and sentinel verification failed + after restart. Classification updated: #349 is now best explained as a + delayed iii file-based state flush/materialization window, not universal loss + of already-materialized state. A product-side mitigation should wait for + durable scope materialization or otherwise force/observe iii persistence + before reporting shutdown complete. 
diff --git a/scripts/github/issue-349-engine-state-probe.mjs b/scripts/github/issue-349-engine-state-probe.mjs new file mode 100644 index 000000000..9d0175fbc --- /dev/null +++ b/scripts/github/issue-349-engine-state-probe.mjs @@ -0,0 +1,443 @@ +#!/usr/bin/env node + +import { spawn } from "node:child_process"; +import { createConnection } from "node:net"; +import { chmod, mkdir, mkdtemp, readFile, readdir, rm, stat, writeFile } from "node:fs/promises"; +import { dirname, join, relative, resolve, sep } from "node:path"; +import { gunzipSync } from "node:zlib"; + +const III_VERSION = process.env.AGENTMEMORY_III_VERSION ?? "0.11.2"; +const ENGINE_PORT = 49134; +const probeParent = process.env.ISSUE_349_PROBE_PARENT ?? process.cwd(); +const root = await mkdtemp(join(probeParent, ".agentmemory-issue-349-engine-")); +const binDir = join(root, "bin"); +const dataDir = join(root, "data"); +const engineCwd = join(root, "engine-cwd"); +const configPath = join(root, "iii-config.yaml"); +const stateStorePath = join(dataDir, "state_store.db"); +const sentinel = `ISSUE_349_ENGINE_STATE_PROBE_${Date.now()}_${Math.random() + .toString(36) + .slice(2)}`; +const scope = "probe:issue-349"; +const key = "sentinel"; +const stopSignal = "SIGINT"; +const probeEnv = { + ...process.env, + HOME: join(root, "home"), + USERPROFILE: join(root, "home"), + CI: "true", +}; + +let engine = null; +let iiiBin = join(binDir, process.platform === "win32" ? 
"iii.exe" : "iii"); + +function log(message) { + console.log(`[issue-349-engine-probe] ${message}`); +} + +function yamlSingleQuote(value) { + return `'${value.replace(/'/g, "''")}'`; +} + +function delay(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function encodedScopeFilePath(scopeName) { + return join(stateStorePath, `${encodeURIComponent(scopeName)}.bin`); +} + +function releaseAsset() { + if (process.platform === "darwin" && process.arch === "arm64") { + return "iii-aarch64-apple-darwin.tar.gz"; + } + if (process.platform === "darwin" && process.arch === "x64") { + return "iii-x86_64-apple-darwin.tar.gz"; + } + if (process.platform === "linux" && process.arch === "x64") { + return "iii-x86_64-unknown-linux-gnu.tar.gz"; + } + if (process.platform === "linux" && process.arch === "arm64") { + return "iii-aarch64-unknown-linux-gnu.tar.gz"; + } + return null; +} + +function releaseUrl() { + const asset = releaseAsset(); + if (!asset) { + throw new Error(`unsupported iii probe platform ${process.platform}/${process.arch}`); + } + return `https://github.com/iii-hq/iii/releases/download/iii/v${III_VERSION}/${asset}`; +} + +async function run(command, args, options = {}) { + const child = spawn(command, args, { + cwd: options.cwd ?? engineCwd, + env: options.env ?? probeEnv, + stdio: ["ignore", "pipe", "pipe"], + }); + let stdout = ""; + let stderr = ""; + child.stdout.on("data", (chunk) => { + stdout += chunk.toString(); + }); + child.stderr.on("data", (chunk) => { + stderr += chunk.toString(); + }); + const { code, signal } = await new Promise((resolve) => { + child.on("close", (code, signal) => resolve({ code, signal })); + }); + return { code, signal, stdout, stderr }; +} + +function tarString(header, start, length) { + return header + .subarray(start, start + length) + .toString("utf8") + .replace(/\0.*$/s, "") + .trim(); +} + +function tarOctal(header, start, length) { + const raw = tarString(header, start, length); + return raw ? 
Number.parseInt(raw, 8) : 0;
+}
+
+/**
+ * Resolves an archive entry name to a path under `destination`.
+ *
+ * @returns null for an empty name or ".", otherwise the resolved path.
+ * @throws Error when the resolved path is neither the destination itself nor
+ *   located beneath it.
+ */
+function tarTarget(destination, name) {
+  if (!name || name === ".") return null;
+  const destinationRoot = resolve(destination);
+  const target = resolve(destinationRoot, name);
+  if (target !== destinationRoot && !target.startsWith(`${destinationRoot}${sep}`)) {
+    throw new Error(`archive entry escapes destination: ${name}`);
+  }
+  return target;
+}
+
+/**
+ * Gunzips `archiveBytes` and unpacks the contained tar stream into
+ * `destination` without calling an external `tar` binary.
+ *
+ * Entries of type "5" are created as directories; entries of type "0" or NUL
+ * are written as files. Every other entry type is skipped. File modes from
+ * the archive are not applied.
+ */
+async function extractTarGz(archiveBytes, destination) {
+  const archive = gunzipSync(archiveBytes);
+  let offset = 0;
+
+  // Each entry is one 512-byte header followed by its data, padded to a
+  // multiple of 512 bytes.
+  while (offset + 512 <= archive.length) {
+    const header = archive.subarray(offset, offset + 512);
+    // An all-zero header block ends the walk.
+    if (header.every((byte) => byte === 0)) break;
+
+    // Field offsets: name at 0, size at 124, type flag at 156, prefix at 345.
+    // NOTE(review): these match the POSIX ustar layout; the header magic is
+    // not checked, so confirm the release archives use that format.
+    const name = tarString(header, 0, 100);
+    const prefix = tarString(header, 345, 155);
+    const fullName = prefix ? `${prefix}/${name}` : name;
+    const type = String.fromCharCode(header[156] || 0);
+    const size = tarOctal(header, 124, 12);
+    offset += 512;
+
+    const target = tarTarget(destination, fullName);
+    if (target && type === "5") {
+      await mkdir(target, { recursive: true });
+    } else if (target && (type === "0" || type === "\0")) {
+      await mkdir(dirname(target), { recursive: true });
+      await writeFile(target, archive.subarray(offset, offset + size));
+    }
+
+    // Skip the entry data, rounded up to the next 512-byte boundary.
+    offset += Math.ceil(size / 512) * 512;
+  }
+}
+
+/**
+ * Searches `dir` recursively for a file named `iii` (`iii.exe` on win32),
+ * visiting entries in name order.
+ *
+ * @returns The first matching path, or null when nothing matches.
+ */
+async function findIiiBinary(dir) {
+  const expectedName = process.platform === "win32" ? "iii.exe" : "iii";
+  const entries = await readdir(dir, { withFileTypes: true });
+
+  for (const entry of entries.sort((a, b) => a.name.localeCompare(b.name))) {
+    const fullPath = join(dir, entry.name);
+    if (entry.isFile() && entry.name === expectedName) return fullPath;
+    if (entry.isDirectory()) {
+      // A subdirectory that cannot be searched counts as "not found".
+      const nested = await findIiiBinary(fullPath).catch(() => null);
+      if (nested) return nested;
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Downloads the release archive, unpacks it into `binDir`, locates the
+ * binary, marks it executable, and logs the output of `iii --version`.
+ *
+ * @throws Error when the HTTP response is not ok or `iii --version` exits
+ *   with a non-zero code.
+ */
+async function downloadIii() {
+  const url = releaseUrl();
+  log(`downloading iii v${III_VERSION} from ${url}`);
+  const response = await fetch(url);
+  if (!response.ok) {
+    throw new Error(`download failed ${response.status} ${response.statusText}`);
+  }
+  await mkdir(binDir, { recursive: true });
+  // run() defaults its working directory to engineCwd, so the directory must
+  // exist before the version check below.
+  await mkdir(engineCwd, { recursive: true });
+  await extractTarGz(Buffer.from(await response.arrayBuffer()), binDir);
+  await listTree("after-download", binDir);
+  // Keep the default path when the search finds nothing.
+  iiiBin = (await findIiiBinary(binDir)) ?? iiiBin;
+  await chmod(iiiBin, 0o755);
+  const version = await run(iiiBin, ["--version"]);
+  if (version.code !== 0) {
+    throw new Error(`iii --version failed code=${version.code}\n${version.stderr}`);
+  }
+  log(`iii version ${version.stdout.trim() || "(empty)"}`);
+  log(`binary path ${iiiBin}`);
+}
+
+/**
+ * Writes the engine config to `configPath`, creating `dataDir` and
+ * `engineCwd` first, and logs the rendered text.
+ */
+async function writeConfig() {
+  // NOTE(review): the leading whitespace inside these YAML string literals
+  // appears collapsed to one space in this copy of the patch; verify the
+  // nesting indentation against the committed script.
+  const config = [
+    "workers:",
+    " - name: iii-state",
+    " config:",
+    " adapter:",
+    " name: kv",
+    " config:",
+    " store_method: file_based",
+    ` file_path: ${yamlSingleQuote(stateStorePath)}`,
+    "",
+  ].join("\n");
+  await mkdir(dataDir, { recursive: true });
+  await mkdir(engineCwd, { recursive: true });
+  await writeFile(configPath, config, "utf-8");
+  log(`config path ${configPath}`);
+  log(`engine cwd ${engineCwd}`);
+  log(`rendered config\n${config}`);
+}
+
+/** Logs the text of `filePath` under `label`, or logs why it could not be read. */
+async function readTextIfExists(label, filePath) {
+  try {
+    const text = await readFile(filePath, "utf-8");
+    log(`${label} ${filePath}\n${text}`);
+  } catch (err) {
+    log(`${label} ${filePath} unavailable: ${err instanceof Error ?
err.message : String(err)}`); + } +} + +async function listTree(label, dir, maxEntries = 120) { + const rows = []; + + async function walk(current, depth) { + if (rows.length >= maxEntries || depth > 6) return; + let entries = []; + try { + entries = await readdir(current, { withFileTypes: true }); + } catch (err) { + rows.push(`${relative(root, current) || "."} `); + return; + } + + for (const entry of entries.sort((a, b) => a.name.localeCompare(b.name))) { + if (rows.length >= maxEntries) break; + const fullPath = join(current, entry.name); + const info = await stat(fullPath).catch(() => null); + const kind = entry.isDirectory() ? "dir" : entry.isFile() ? "file" : "other"; + rows.push( + `${relative(root, fullPath)} ${kind} ${info ? `${info.size}b ${info.mtime.toISOString()}` : "stat-unavailable"}`, + ); + if (entry.isDirectory()) await walk(fullPath, depth + 1); + } + } + + await walk(dir, 0); + log(`${label}: data dir tree\n${rows.length > 0 ? rows.join("\n") : "(empty)"}`); +} + +async function portOpen() { + return new Promise((resolve) => { + const socket = createConnection({ host: "127.0.0.1", port: ENGINE_PORT }); + socket.once("connect", () => { + socket.destroy(); + resolve(true); + }); + socket.once("error", () => { + socket.destroy(); + resolve(false); + }); + socket.setTimeout(1000, () => { + socket.destroy(); + resolve(false); + }); + }); +} + +async function assertEnginePortFree() { + if (await portOpen()) { + throw new Error(`iii-engine port ${ENGINE_PORT} is already in use; refusing ambiguous engine-only probe`); + } +} + +async function runIiiTrigger(stage, functionId, payload) { + const result = await run(iiiBin, [ + "trigger", + "--function-id", + functionId, + "--payload", + JSON.stringify(payload), + ]); + log( + `${stage}: iii trigger ${functionId} exit status code=${result.code ?? "null"} signal=${result.signal ?? 
"null"}\nstdout=${result.stdout.trim() || "(empty)"}\nstderr=${result.stderr.trim() || "(empty)"}`, + ); + return result; +} + +async function waitForState(timeoutMs = 30_000) { + const deadline = Date.now() + timeoutMs; + let last = null; + while (Date.now() < deadline) { + const result = await runIiiTrigger("ready-check", "state::list_groups", {}); + if (result.code === 0) return; + last = result.stderr || result.stdout; + await delay(1000); + } + throw new Error(`state::list_groups did not become ready within ${timeoutMs}ms: ${last ?? "(no output)"}`); +} + +async function waitForScopeFile(scopeName, timeoutMs = 15_000) { + const filePath = encodedScopeFilePath(scopeName); + const startedAt = Date.now(); + const deadline = startedAt + timeoutMs; + let lastError = null; + + while (Date.now() < deadline) { + const info = await stat(filePath).catch((err) => { + lastError = err; + return null; + }); + if (info?.isFile()) { + log(`scope file materialized after ${Date.now() - startedAt}ms ${filePath} ${info.size}b`); + return; + } + await delay(250); + } + + throw new Error( + `scope file did not materialize within ${timeoutMs}ms at ${filePath}: ${ + lastError instanceof Error ? lastError.message : "no stat error" + }`, + ); +} + +async function startEngine(label) { + log(`starting ${label}`); + engine = spawn(iiiBin, ["--config", configPath], { + cwd: engineCwd, + env: probeEnv, + stdio: ["ignore", "pipe", "pipe"], + }); + log(`engine pid ${engine.pid ?? "(unavailable)"}`); + engine.stdout.on("data", (chunk) => { + process.stdout.write(`[${label}:stdout] ${chunk.toString()}`); + }); + engine.stderr.on("data", (chunk) => { + process.stderr.write(`[${label}:stderr] ${chunk.toString()}`); + }); + engine.on("exit", (code, signal) => { + log(`${label} exit status code=${code ?? "null"} signal=${signal ?? 
"null"}`); + }); + await waitForState(); +} + +async function waitForClose(child, label, timeoutMs = 5000) { + if (child.exitCode !== null || child.signalCode !== null) { + return { code: child.exitCode, signal: child.signalCode }; + } + + let timeout = null; + try { + return await Promise.race([ + new Promise((resolve) => { + child.once("close", (code, signal) => resolve({ code, signal })); + }), + new Promise((_, reject) => { + timeout = setTimeout(() => { + reject(new Error(`${label} did not exit within ${timeoutMs}ms`)); + }, timeoutMs); + }), + ]); + } finally { + if (timeout) clearTimeout(timeout); + } +} + +async function stopEngine(label) { + if (!engine) return; + const active = engine; + engine = null; + log(`stop signal ${stopSignal} for ${label}`); + active.kill(stopSignal); + const status = await waitForClose(active, label).catch(async (err) => { + log(`exit status timeout for ${label}: ${err instanceof Error ? err.message : String(err)}`); + active.kill("SIGKILL"); + return waitForClose(active, `${label} cleanup`, 3000); + }); + log(`exit status ${label} code=${status.code ?? "null"} signal=${status.signal ?? 
"null"}`); +} + +function outputHasSentinel(result) { + return result.stdout.toLowerCase().includes(sentinel.toLowerCase()); +} + +async function verifyDirectState(stage) { + const [getResult, listResult, groupsResult] = await Promise.all([ + runIiiTrigger(stage, "state::get", { scope, key }), + runIiiTrigger(stage, "state::list", { scope }), + runIiiTrigger(stage, "state::list_groups", {}), + ]); + const hasGet = outputHasSentinel(getResult); + const hasList = outputHasSentinel(listResult); + const hasGroup = groupsResult.stdout.includes(scope); + if (!hasGet || !hasList || !hasGroup) { + throw new Error( + `${stage}: direct state verification failed get=${hasGet} list=${hasList} group=${hasGroup}\n` + + `get=${getResult.stdout.trim() || "(empty)"}\n` + + `list=${listResult.stdout.trim() || "(empty)"}\n` + + `groups=${groupsResult.stdout.trim() || "(empty)"}`, + ); + } + log(`${stage}: direct state verified`); +} + +try { + log(`workspace ${root}`); + await assertEnginePortFree(); + await downloadIii(); + await writeConfig(); + await readTextIfExists("config readback", configPath); + + await startEngine("first-engine"); + const setResult = await runIiiTrigger("after-set", "state::set", { + scope, + key, + value: { sentinel, writtenAt: new Date().toISOString() }, + }); + if (setResult.code !== 0) { + throw new Error(`state::set failed code=${setResult.code}\n${setResult.stderr}`); + } + await verifyDirectState("after-set"); + await listTree("after-set", dataDir); + await waitForScopeFile(scope); + await listTree("after-scope-file-wait", dataDir); + await stopEngine("first-engine"); + await listTree("after-stop", dataDir); + + await assertEnginePortFree(); + await startEngine("second-engine"); + await verifyDirectState("after-restart"); + await listTree("after-restart", dataDir); + await stopEngine("second-engine"); + + log("PASS direct iii state survived engine restart"); +} catch (err) { + log(`ENGINE_PROBE_RESULT=failed ${err instanceof Error ? 
err.message : String(err)}`);
+  throw err;
+} finally {
+  // Kill an engine that is still tracked and has not exited.
+  const active = engine;
+  engine = null;
+  if (active && active.exitCode === null && active.signalCode === null) {
+    active.kill("SIGKILL");
+    await waitForClose(active, "cleanup", 3000).catch((err) => {
+      log(`cleanup warning: ${err instanceof Error ? err.message : String(err)}`);
+    });
+  }
+  // The workspace is removed unless ISSUE_349_KEEP_PROBE_DIR is "1"; removal
+  // failures are logged as warnings instead of thrown.
+  if (process.env.ISSUE_349_KEEP_PROBE_DIR !== "1") {
+    await rm(root, { recursive: true, force: true, maxRetries: 3, retryDelay: 500 }).catch((err) => {
+      log(`cleanup warning: ${err instanceof Error ? err.message : String(err)}`);
+    });
+  } else {
+    log(`kept workspace ${root}`);
+  }
+}
diff --git a/scripts/github/issue-349-restart-retest.mjs b/scripts/github/issue-349-restart-retest.mjs
new file mode 100644
index 000000000..6b2ce06c2
--- /dev/null
+++ b/scripts/github/issue-349-restart-retest.mjs
@@ -0,0 +1,341 @@
+#!/usr/bin/env node
+
+import { spawn } from "node:child_process";
+import { mkdir, mkdtemp, readFile, readdir, rm, stat } from "node:fs/promises";
+import { existsSync } from "node:fs";
+import { join, relative } from "node:path";
+import { tmpdir } from "node:os";
+
+// All retest paths live in a fresh directory under the OS temp directory.
+const root = await mkdtemp(join(tmpdir(), "agentmemory-issue-349-"));
+const home = join(root, "home");
+const dataDir = join(root, "data");
+const cwd = join(root, "cwd");
+const baseUrl = "http://127.0.0.1:3111";
+// Built CLI entry point, resolved relative to the current working directory.
+const cli = join(process.cwd(), "dist", "cli.mjs");
+// Path where runIiiTrigger() looks for the iii binary inside the isolated home.
+const iiiBin = join(
+  home,
+  ".local",
+  "bin",
+  "agentmemory",
+  process.platform === "win32" ? "iii.exe" : "iii",
+);
+// Per-run value so a match cannot come from an earlier run.
+const sentinel = `ISSUE_349_RESTART_RETEST_${Date.now()}_${Math.random()
+  .toString(36)
+  .slice(2)}`;
+
+// Fail before any other work when the CLI has not been built.
+if (!existsSync(cli)) {
+  throw new Error(`Built CLI missing at ${cli}; run pnpm run build first`);
+}
+
+// Environment for CLI and iii child processes: HOME/USERPROFILE and the data
+// directory point inside `root`, and several feature variables are "false".
+const serverEnv = {
+  ...process.env,
+  HOME: home,
+  USERPROFILE: home,
+  AGENTMEMORY_DATA_DIR: dataDir,
+  AGENTMEMORY_READY_TIMEOUT_MS: "120000",
+  AGENTMEMORY_AUTO_COMPRESS: "false",
+  GRAPH_EXTRACTION_ENABLED: "false",
+  CONSOLIDATION_ENABLED: "false",
+  AGENTMEMORY_SLOTS: "false",
+  AGENTMEMORY_REFLECT: "false",
+  CI: "true",
+};
+
+let server = null;
+
+/** Prints `message` prefixed with `[issue-349-retest]`. */
+function log(message) {
+  console.log(`[issue-349-retest] ${message}`);
+}
+
+/**
+ * Spawns the built CLI under the current Node executable with `args`.
+ *
+ * @param options.stdio Optional stdio configuration; defaults to ignoring
+ *   stdin and piping stdout and stderr.
+ */
+function spawnCli(args, options = {}) {
+  return spawn(process.execPath, [cli, ...args], {
+    cwd,
+    env: serverEnv,
+    stdio: options.stdio ?? ["ignore", "pipe", "pipe"],
+  });
+}
+
+/**
+ * Mirrors the child's stdout/stderr to this process with a `[label:stream]`
+ * prefix while accumulating both into one string.
+ *
+ * @returns A function returning everything captured so far.
+ */
+function collectOutput(child, label) {
+  let output = "";
+  child.stdout?.on("data", (chunk) => {
+    const text = chunk.toString();
+    output += text;
+    process.stdout.write(`[${label}:stdout] ${text}`);
+  });
+  child.stderr?.on("data", (chunk) => {
+    const text = chunk.toString();
+    output += text;
+    process.stderr.write(`[${label}:stderr] ${text}`);
+  });
+  return () => output;
+}
+
+/** Resolves after `ms` milliseconds. */
+async function delay(ms) {
+  await new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/** Logs the text of `filePath` under `label`, or logs why it could not be read. */
+async function readTextIfExists(label, filePath) {
+  try {
+    const text = await readFile(filePath, "utf-8");
+    log(`${label} ${filePath}\n${text}`);
+  } catch (err) {
+    log(`${label} ${filePath} unavailable: ${err instanceof Error ? err.message : String(err)}`);
+  }
+}
+
+/**
+ * Logs a recursive listing of `dir` with paths relative to `root`.
+ *
+ * @param maxEntries Maximum number of rows collected (default 80).
+ */
+async function listTree(label, dir, maxEntries = 80) {
+  const rows = [];
+
+  // Stops descending below depth 4 or once `maxEntries` rows are collected.
+  async function walk(current, depth) {
+    if (rows.length >= maxEntries || depth > 4) return;
+    let entries = [];
+    try {
+      entries = await readdir(current, { withFileTypes: true });
+    } catch (err) {
+      // NOTE(review): the error is discarded and the row holds only the path
+      // followed by a space; confirm whether a marker was intended here.
+      rows.push(`${relative(root, current) || "."} `);
+      return;
+    }
+
+    for (const entry of entries.sort((a, b) => a.name.localeCompare(b.name))) {
+      if (rows.length >= maxEntries) break;
+      const fullPath = join(current, entry.name);
+      const info = await stat(fullPath).catch(() => null);
+      const kind = entry.isDirectory() ? "dir" : entry.isFile() ? "file" : "other";
+      rows.push(
+        `${relative(root, fullPath)} ${kind} ${info ? `${info.size}b ${info.mtime.toISOString()}` : "stat-unavailable"}`,
+      );
+      if (entry.isDirectory()) await walk(fullPath, depth + 1);
+    }
+  }
+
+  await walk(dir, 0);
+  log(`${label} tree for ${dir}\n${rows.length > 0 ? rows.join("\n") : "(empty)"}`);
+}
+
+/**
+ * Runs `iii trigger --function-id <functionId> --payload <json>` and logs its
+ * exit code, stdout, and stderr. Logs a skip notice and returns when the
+ * binary does not exist at `iiiBin`.
+ */
+async function runIiiTrigger(stage, functionId, payload) {
+  if (!existsSync(iiiBin)) {
+    log(`${stage}: iii trigger skipped; binary missing at ${iiiBin}`);
+    return;
+  }
+
+  const child = spawn(iiiBin, [
+    "trigger",
+    "--function-id",
+    functionId,
+    "--payload",
+    JSON.stringify(payload),
+  ], {
+    cwd,
+    env: serverEnv,
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+
+  let stdout = "";
+  let stderr = "";
+  child.stdout.on("data", (chunk) => {
+    stdout += chunk.toString();
+  });
+  child.stderr.on("data", (chunk) => {
+    stderr += chunk.toString();
+  });
+  const code = await new Promise((resolve) => child.on("close", resolve));
+  log(
+    `${stage}: iii trigger ${functionId} exited ${code}\nstdout=${stdout.trim() || "(empty)"}\nstderr=${stderr.trim() || "(empty)"}`,
+  );
+}
+
+async function inspectPersistence(stage) {
+  await readTextIfExists(`${stage}: user config`, join(home, ".agentmemory", "config", "iii-config.yaml"));
+  await readTextIfExists(`${stage}: data runtime config`,
join(dataDir, "iii-config.yaml")); + await listTree(`${stage}: data dir`, dataDir); + await runIiiTrigger(stage, "state::list_groups", {}); + await runIiiTrigger(stage, "state::list", { scope: "mem:memories" }); + await runIiiTrigger(stage, "state::get", { + scope: "mem:state", + key: "system:shutdownFlush", + }); +} + +async function waitForChildClose(child, label, timeoutMs = 5000) { + if (child.exitCode !== null || child.signalCode !== null) return; + + let timeout = null; + await Promise.race([ + new Promise((resolve) => child.once("close", resolve)), + new Promise((_, reject) => { + timeout = setTimeout(() => { + reject(new Error(`${label} did not exit within ${timeoutMs}ms`)); + }, timeoutMs); + }), + ]).finally(() => { + if (timeout) clearTimeout(timeout); + }); +} + +async function waitForHealth(timeoutMs = 120_000) { + const deadline = Date.now() + timeoutMs; + let lastError = null; + while (Date.now() < deadline) { + try { + // nosemgrep: typescript.react.security.react-insecure-request.react-insecure-request -- CI harness talks only to the loopback-only local daemon it starts in this job. + const res = await fetch(`${baseUrl}/agentmemory/health`); + if (res.ok) { + const body = await res.json(); + if (body?.service === "agentmemory") return; + } + } catch (err) { + lastError = err; + } + await delay(1000); + } + throw new Error( + `agentmemory health did not become ready within ${timeoutMs}ms${ + lastError instanceof Error ? `: ${lastError.message}` : "" + }`, + ); +} + +async function assertNoExistingServer() { + try { + // nosemgrep: typescript.react.security.react-insecure-request.react-insecure-request -- Presence check is intentionally limited to the local loopback REST port before test data is written. 
+ const res = await fetch(`${baseUrl}/agentmemory/health`, { + signal: AbortSignal.timeout(1000), + }); + if (res.ok) { + throw new Error( + `agentmemory is already responding at ${baseUrl}; refusing to write retest data into an existing daemon`, + ); + } + } catch (err) { + if (err instanceof Error && err.message.includes("already responding")) { + throw err; + } + } +} + +async function request(path, init = {}) { + const res = await fetch(`${baseUrl}${path}`, { + ...init, + headers: { + "Content-Type": "application/json", + ...(init.headers ?? {}), + }, + }); + const text = await res.text(); + let body = null; + if (text.length > 0) { + try { + body = JSON.parse(text); + } catch { + body = text; + } + } + if (!res.ok) { + throw new Error(`${init.method ?? "GET"} ${path} failed ${res.status}: ${text}`); + } + return body; +} + +async function startServer(label) { + log(`starting ${label}`); + server = spawnCli(["--data-dir", dataDir]); + const getOutput = collectOutput(server, label); + server.on("exit", (code, signal) => { + if (server) { + log(`${label} exited unexpectedly code=${code ?? "null"} signal=${signal ?? 
"null"}`); + } + }); + await waitForHealth(); + log(`${label} is healthy`); + return getOutput; +} + +async function stopViaCli(label) { + log(`stopping ${label} via CLI`); + const activeServer = server; + const stop = spawnCli(["stop", "--data-dir", dataDir]); + const getOutput = collectOutput(stop, `${label}-stop`); + const code = await new Promise((resolve) => stop.on("close", resolve)); + if (code !== 0) { + throw new Error(`${label} stop exited ${code}; output:\n${getOutput()}`); + } + if (activeServer) { + await waitForChildClose(activeServer, label); + } + server = null; + log(`${label} stopped`); +} + +function resultText(result) { + return JSON.stringify(result).toLowerCase(); +} + +async function verifySentinel(stage) { + const [search, memories] = await Promise.all([ + request("/agentmemory/search", { + method: "POST", + body: JSON.stringify({ query: sentinel, limit: 5 }), + }), + request("/agentmemory/memories"), + ]); + const lowerSentinel = sentinel.toLowerCase(); + const searchHasSentinel = resultText(search).includes(lowerSentinel); + const memoriesHasSentinel = resultText(memories).includes(lowerSentinel); + if (!searchHasSentinel || !memoriesHasSentinel) { + throw new Error( + `${stage}: sentinel verification failed search=${searchHasSentinel} memories=${memoriesHasSentinel}\n` + + `search=${JSON.stringify(search)}\nmemories=${JSON.stringify(memories)}`, + ); + } + log(`${stage}: sentinel verified`); +} + +try { + log(`workspace ${root}`); + await mkdir(home, { recursive: true }); + await mkdir(dataDir, { recursive: true }); + await mkdir(cwd, { recursive: true }); + await assertNoExistingServer(); + await startServer("first-start"); + + const remember = await request("/agentmemory/remember", { + method: "POST", + body: JSON.stringify({ + content: `Issue 349 restart retest sentinel ${sentinel}`, + type: "fact", + concepts: ["issue-349", "restart-retest"], + project: "github-actions-issue-349", + }), + }); + if (remember?.success !== true) { + 
throw new Error(`remember did not report success: ${JSON.stringify(remember)}`); + } + log(`saved sentinel memory ${remember.memory?.id ?? "(id unavailable)"}`); + + await verifySentinel("before-stop"); + await request("/agentmemory/shutdown/flush", { method: "POST", body: "{}" }); + await inspectPersistence("after-flush"); + await stopViaCli("first-start"); + await listTree("after-stop: data dir", dataDir); + + await startServer("second-start"); + await inspectPersistence("after-second-start"); + await verifySentinel("after-restart"); + await stopViaCli("second-start"); + + log("PASS sentinel survived supported stop/restart"); +} finally { + const activeServer = server; + server = null; + if (activeServer && activeServer.exitCode === null && activeServer.signalCode === null) { + activeServer.kill("SIGTERM"); + await waitForChildClose(activeServer, "cleanup").catch((err) => { + log(`cleanup warning: ${err instanceof Error ? err.message : String(err)}`); + }); + } + try { + await rm(root, { recursive: true, force: true, maxRetries: 3, retryDelay: 500 }); + } catch (err) { + log(`cleanup warning: ${err instanceof Error ? 
err.message : String(err)}`); + } +} diff --git a/src/cli.ts b/src/cli.ts index e9c9c06f3..3480fffc6 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -252,6 +252,7 @@ applyRuntimeHostArgs(args); const READY_TIMEOUT_MS = readyTimeoutMsFromEnv(); const readyTimeoutLabel = () => formatReadyTimeout(READY_TIMEOUT_MS); +const STATE_PERSISTENCE_SETTLE_MS = 1000; const skipEngine = args.includes("--no-engine"); @@ -263,6 +264,10 @@ function getBaseUrl(): string { return getCliBaseUrl(); } +function waitForStatePersistenceSettle(): Promise { + return new Promise((resolve) => setTimeout(resolve, STATE_PERSISTENCE_SETTLE_MS)); +} + let discoveredViewerPort: number | null = null; export async function discoverViewerPort(): Promise { @@ -2882,6 +2887,7 @@ async function runStop(): Promise { timeoutMs: 5000, warn: (message) => p.log.warn(message), }), + settleStatePersistence: () => waitForStatePersistenceSettle(), signal: async (pid, signal, timeoutMs, role) => { const s = p.spinner(); if (role === "worker") { diff --git a/src/cli/stop-processes.ts b/src/cli/stop-processes.ts index 497ec2b3a..33480c8f0 100644 --- a/src/cli/stop-processes.ts +++ b/src/cli/stop-processes.ts @@ -24,6 +24,7 @@ export type ResponsiveNativeStopEffects = { isWindows: boolean; force: boolean; flush: () => Promise; + settleStatePersistence?: () => Promise; signal: ( pid: number, signal: NodeJS.Signals, @@ -142,8 +143,12 @@ export async function executeResponsiveNativeStop( const ok = await effects.signal(pid, "SIGTERM", 5000, "worker"); if (!ok) allStopped = false; } + if (checkpointConfirmed && dedupedEnginePids.length > 0) { + await effects.settleStatePersistence?.(); + } + const engineSignal = checkpointConfirmed ? 
"SIGINT" : "SIGTERM"; for (const pid of dedupedEnginePids) { - const ok = await effects.signal(pid, "SIGTERM", 3000, "engine"); + const ok = await effects.signal(pid, engineSignal, 3000, "engine"); if (!ok) allStopped = false; } diff --git a/src/functions/shutdown-flush.ts b/src/functions/shutdown-flush.ts index b44932bc1..80c82b333 100644 --- a/src/functions/shutdown-flush.ts +++ b/src/functions/shutdown-flush.ts @@ -1,18 +1,44 @@ import type { ISdk } from "iii-sdk"; import type { IndexPersistence } from "../state/index-persistence.js"; +import type { StateKV } from "../state/kv.js"; +import { KV } from "../state/schema.js"; +import type { StateScope, StateScopeKey } from "../types.js"; type ShutdownFlushIndexPersistence = Pick; +type ShutdownFlushStatePersistence = Pick; + +const SHUTDOWN_FLUSH_STATE_KEY: StateScopeKey = "system:shutdownFlush"; + +function isConfirmedCheckpoint( + value: unknown, + checkpoint: StateScope[typeof SHUTDOWN_FLUSH_STATE_KEY], +): value is StateScope[typeof SHUTDOWN_FLUSH_STATE_KEY] { + return ( + typeof value === "object" && + value !== null && + !Array.isArray(value) && + (value as { flushedAt?: unknown }).flushedAt === checkpoint.flushedAt + ); +} export function registerShutdownFlushFunction( sdk: ISdk, indexPersistence: ShutdownFlushIndexPersistence, readiness: { isReady: () => boolean }, + statePersistence: ShutdownFlushStatePersistence, ): void { sdk.registerFunction("mem::shutdown-flush", async () => { if (!readiness.isReady()) { return { success: false, error: "index_not_ready" }; } + const flushedAt = new Date().toISOString(); await indexPersistence.saveOrThrow(); - return { success: true, flushedAt: new Date().toISOString() }; + const checkpoint: StateScope[typeof SHUTDOWN_FLUSH_STATE_KEY] = { flushedAt }; + await statePersistence.set(KV.state, SHUTDOWN_FLUSH_STATE_KEY, checkpoint); + const confirmed = await statePersistence.get(KV.state, SHUTDOWN_FLUSH_STATE_KEY); + if (!isConfirmedCheckpoint(confirmed, checkpoint)) { + 
throw new Error("shutdown state checkpoint was not confirmed"); + } + return { success: true, flushedAt }; }); } diff --git a/src/index.ts b/src/index.ts index c69e48a57..875be13d9 100644 --- a/src/index.ts +++ b/src/index.ts @@ -368,9 +368,12 @@ async function main() { let shutdownFlushState: "loading" | "rebuilding" | "ready" | "unavailable" = "loading"; const registerAllFunctions = () => { - registerShutdownFlushFunction(sdk, indexPersistence, { - isReady: () => shutdownFlushState === "ready", - }); + registerShutdownFlushFunction( + sdk, + indexPersistence, + { isReady: () => shutdownFlushState === "ready" }, + kv, + ); registerSessionBudgetFunctions(sdk, kv); registerPrivacyFunction(sdk); registerObserveFunction(sdk, kv, dedupMap, config.maxObservationsPerSession); diff --git a/src/types.ts b/src/types.ts index b76f3fc56..08570bf50 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1162,6 +1162,9 @@ export interface DecayConfig { */ export interface StateScope { "system:currentDiskSize": number; + "system:shutdownFlush": { + flushedAt: string; + }; } export type StateScopeKey = keyof StateScope; diff --git a/test/cli-stop-port-detection.test.ts b/test/cli-stop-port-detection.test.ts index 01230f40e..1e5eea627 100644 --- a/test/cli-stop-port-detection.test.ts +++ b/test/cli-stop-port-detection.test.ts @@ -195,6 +195,9 @@ describe("Windows stop port detection (#550)", () => { events.push(`${role}:${pid}:${signal}:${timeoutMs}`); return true; }, + settleStatePersistence: async () => { + events.push("settle"); + }, clear: () => events.push("clear"), warn: (message) => events.push(`warn:${message}`), error: (message) => events.push(`error:${message}`), @@ -206,12 +209,53 @@ describe("Windows stop port detection (#550)", () => { expect(events).toEqual([ "flush", "worker:40000:SIGTERM:5000", - "engine:39672:SIGTERM:3000", + "settle", + "engine:39672:SIGINT:3000", "clear", "outro:Stopped. 
Memories persisted to disk; restart anytime with: npx @agentmemory/agentmemory", ]); }); + it("skips the state settle phase when checkpointing was not confirmed", async () => { + const events: string[] = []; + + const result = await executeResponsiveNativeStop( + { + workerPids: [40000], + enginePids: [39672], + }, + { + isWindows: false, + force: false, + flush: async () => { + events.push("flush"); + return false; + }, + signal: async (pid, signal, timeoutMs, role) => { + events.push(`${role}:${pid}:${signal}:${timeoutMs}`); + return true; + }, + settleStatePersistence: async () => { + events.push("settle"); + }, + clear: () => events.push("clear"), + warn: (message) => events.push(`warn:${message}`), + error: (message) => events.push(`error:${message}`), + outro: (message) => events.push(`outro:${message}`), + }, + ); + + expect(result).toEqual({ action: "stopped", allStopped: true, exitCode: 0 }); + expect(events).toEqual([ + "flush", + "warn:pre-stop flush failed; relying on the worker shutdown signal path", + "worker:40000:SIGTERM:5000", + "engine:39672:SIGTERM:3000", + "clear", + "outro:Stopped. Persistence was not confirmed before termination; restart anytime with: npx @agentmemory/agentmemory", + ]); + }); + it("checkpoints responsive engine-only stop when the worker pidfile is missing", async () => { const events: string[] = []; @@ -241,7 +285,7 @@ describe("Windows stop port detection (#550)", () => { expect(result).toEqual({ action: "stopped", allStopped: true, exitCode: 0 }); expect(events).toEqual([ "flush", - "engine:39672:SIGTERM:3000", + "engine:39672:SIGINT:3000", "clear", "outro:Stopped. 
Memories persisted to disk; restart anytime with: npx @agentmemory/agentmemory", ]); diff --git a/test/issue-349-engine-state-probe.test.ts b/test/issue-349-engine-state-probe.test.ts new file mode 100644 index 000000000..edfe6c2ad --- /dev/null +++ b/test/issue-349-engine-state-probe.test.ts @@ -0,0 +1,89 @@ +import { existsSync, readFileSync } from "node:fs"; +import { describe, expect, it } from "vitest"; + +const SCRIPT = "scripts/github/issue-349-engine-state-probe.mjs"; + +function readScript(): string { + return readFileSync(SCRIPT, "utf-8"); +} + +describe("Issue #349 engine state probe", () => { + it("exists as a dedicated engine-only diagnostic harness", () => { + expect(existsSync(SCRIPT)).toBe(true); + }); + + it("uses the checkout workspace for executable probe files", () => { + const source = readScript(); + + expect(source).toContain("ISSUE_349_PROBE_PARENT"); + expect(source).toContain("process.cwd()"); + expect(source).toContain(".agentmemory-issue-349-engine-"); + }); + + it("uses direct iii state operations without agentmemory app paths", () => { + const source = readScript(); + + expect(source).toContain("state::set"); + expect(source).toContain("state::get"); + expect(source).toContain("state::list"); + expect(source).toContain("state::list_groups"); + expect(source).toContain("ISSUE_349_ENGINE_STATE_PROBE"); + + expect(source).not.toContain("/agentmemory/"); + expect(source).not.toContain("shutdown/flush"); + expect(source).not.toContain("dist/index.mjs"); + expect(source).not.toContain("agentmemory stop"); + }); + + it("records enough process and storage identity to classify the boundary", () => { + const source = readScript(); + + expect(source).toContain("iii version"); + expect(source).toContain("binary path"); + expect(source).toContain("config path"); + expect(source).toContain("engine cwd"); + expect(source).toContain("engine pid"); + expect(source).toContain("stop signal"); + expect(source).toContain("exit status"); + 
expect(source).toContain("after-download"); + expect(source).toContain("data dir tree"); + }); + + it("extracts iii release archives without an external tar binary", () => { + const source = readScript(); + + expect(source).toContain("gunzipSync"); + expect(source).toContain("extractTarGz"); + expect(source).toContain("findIiiBinary"); + expect(source).not.toContain("TAR_BIN"); + expect(source).not.toContain('await run("tar"'); + }); + + it("creates the engine cwd before spawning the extracted binary", () => { + const source = readScript(); + const mkdirEngineCwd = source.indexOf("await mkdir(engineCwd, { recursive: true });"); + const versionCheck = source.indexOf('const version = await run(iiiBin, ["--version"]);'); + + expect(mkdirEngineCwd).toBeGreaterThanOrEqual(0); + expect(versionCheck).toBeGreaterThanOrEqual(0); + expect(mkdirEngineCwd).toBeLessThan(versionCheck); + }); + + it("waits for the expected scope file before stopping the first engine", () => { + const source = readScript(); + + expect(source).toContain("encodedScopeFilePath"); + expect(source).toContain("waitForScopeFile"); + expect(source).toContain("scope file materialized"); + + const verifyBeforeStop = source.indexOf('await verifyDirectState("after-set");'); + const waitBeforeStop = source.indexOf("await waitForScopeFile(scope);"); + const stopFirstEngine = source.indexOf('await stopEngine("first-engine");'); + + expect(verifyBeforeStop).toBeGreaterThanOrEqual(0); + expect(waitBeforeStop).toBeGreaterThanOrEqual(0); + expect(stopFirstEngine).toBeGreaterThanOrEqual(0); + expect(verifyBeforeStop).toBeLessThan(waitBeforeStop); + expect(waitBeforeStop).toBeLessThan(stopFirstEngine); + }); +}); diff --git a/test/quality-gates.test.ts b/test/quality-gates.test.ts index 1de6e8cc4..b4604ee23 100644 --- a/test/quality-gates.test.ts +++ b/test/quality-gates.test.ts @@ -267,6 +267,17 @@ describe("root quality gates", () => { expect(ci).toContain("matrix.os == 'ubuntu-latest' && matrix.node-version == 
22"); }); + it("wires the issue 349 engine-state diagnostic probe into CI", () => { + const ci = readText(".github/workflows/ci.yml"); + + expect(ci).toContain("engine-state-probe:"); + expect(ci).toContain('name: "Issue #349 restart retest (${{ matrix.os }})"'); + expect(ci).toContain('name: "Issue #349 engine state probe (${{ matrix.os }})"'); + expect(ci).toContain("os: [ubuntu-latest, macos-latest]"); + expect(ci).toContain("node-version: 22"); + expect(ci).toContain("run: node scripts/github/issue-349-engine-state-probe.mjs"); + }); + it("builds publish artifacts from the committed pnpm lockfile before npm publish", () => { const publish = readText(".github/workflows/publish.yml"); diff --git a/test/reconnect-registration.test.ts b/test/reconnect-registration.test.ts index 784a89210..56009e6b2 100644 --- a/test/reconnect-registration.test.ts +++ b/test/reconnect-registration.test.ts @@ -61,7 +61,8 @@ describe("registerWithReconnectReplay", () => { expect(replayCall).toBeGreaterThan(closureStart); const replayedSource = source.slice(closureStart, replayCall); - expect(replayedSource).toContain("registerShutdownFlushFunction(sdk, indexPersistence"); + expect(replayedSource).toContain("registerShutdownFlushFunction("); + expect(replayedSource).toContain("indexPersistence"); expect(replayedSource).toContain("registerApiTriggers(sdk, kv, secret, metricsStore, provider)"); expect(replayedSource).toContain("registerEventTriggers(sdk, kv)"); expect(replayedSource).toContain("registerMcpEndpoints(sdk, kv, secret)"); diff --git a/test/shutdown-flush.test.ts b/test/shutdown-flush.test.ts index f911e9c76..727a12dea 100644 --- a/test/shutdown-flush.test.ts +++ b/test/shutdown-flush.test.ts @@ -12,15 +12,32 @@ describe("registerShutdownFlushFunction", () => { ), }; const indexPersistence = { saveOrThrow: vi.fn(async () => {}) }; + let checkpoint: unknown = null; + const statePersistence = { + set: vi.fn(async (_scope: string, _key: string, value: unknown) => { + checkpoint 
= value; + return value; + }), + get: vi.fn(async () => checkpoint), + }; - registerShutdownFlushFunction(sdk as never, indexPersistence, { - isReady: () => true, - }); + registerShutdownFlushFunction( + sdk as never, + indexPersistence, + { isReady: () => true }, + statePersistence as never, + ); const handler = handlers.get("mem::shutdown-flush"); expect(handler).toBeDefined(); await expect(handler!({})).resolves.toMatchObject({ success: true }); expect(indexPersistence.saveOrThrow).toHaveBeenCalledTimes(1); + expect(statePersistence.set).toHaveBeenCalledWith( + "mem:state", + "system:shutdownFlush", + expect.objectContaining({ flushedAt: expect.any(String) }), + ); + expect(statePersistence.get).toHaveBeenCalledWith("mem:state", "system:shutdownFlush"); }); it("propagates checkpoint failures so callers can fail closed", async () => { @@ -37,16 +54,50 @@ describe("registerShutdownFlushFunction", () => { throw new Error("state::set failed"); }), }; + const statePersistence = { + set: vi.fn(async (_scope: string, _key: string, value: unknown) => value), + get: vi.fn(async () => null), + }; - registerShutdownFlushFunction(sdk as never, indexPersistence, { - isReady: () => true, - }); + registerShutdownFlushFunction( + sdk as never, + indexPersistence, + { isReady: () => true }, + statePersistence as never, + ); await expect(handlers.get("mem::shutdown-flush")!({})).rejects.toThrow( "state::set failed", ); }); + it("propagates state checkpoint failures so stop does not claim persistence", async () => { + const handlers = new Map Promise>(); + const sdk = { + registerFunction: vi.fn( + (id: string, handler: (payload: unknown) => Promise) => { + handlers.set(id, handler); + }, + ), + }; + const indexPersistence = { saveOrThrow: vi.fn(async () => {}) }; + const statePersistence = { + set: vi.fn(async (_scope: string, _key: string, value: unknown) => value), + get: vi.fn(async () => null), + }; + + registerShutdownFlushFunction( + sdk as never, + indexPersistence, + { 
isReady: () => true }, + statePersistence as never, + ); + + await expect(handlers.get("mem::shutdown-flush")!({})).rejects.toThrow( + "shutdown state checkpoint was not confirmed", + ); + }); + it("does not save before restored indexes are ready", async () => { const handlers = new Map Promise>(); const sdk = { @@ -57,10 +108,17 @@ describe("registerShutdownFlushFunction", () => { ), }; const indexPersistence = { saveOrThrow: vi.fn(async () => {}) }; + const statePersistence = { + set: vi.fn(async (_scope: string, _key: string, value: unknown) => value), + get: vi.fn(async () => null), + }; - registerShutdownFlushFunction(sdk as never, indexPersistence, { - isReady: () => false, - }); + registerShutdownFlushFunction( + sdk as never, + indexPersistence, + { isReady: () => false }, + statePersistence as never, + ); await expect(handlers.get("mem::shutdown-flush")!({})).resolves.toEqual({ success: false,