diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cacfbbc7e..7e1e49821 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -68,3 +68,38 @@ jobs:
           path: coverage/
           if-no-files-found: error
           retention-days: 7
+
+  restart-retest:
+    name: "Issue #349 restart retest (${{ matrix.os }})"
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22
+      - run: corepack enable
+      - run: pnpm install --frozen-lockfile --ignore-scripts
+      - run: pnpm run build
+      - run: node scripts/github/issue-349-restart-retest.mjs
+
+  engine-state-probe:
+    name: "Issue #349 engine state probe (${{ matrix.os }})"
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22
+      - run: node scripts/github/issue-349-engine-state-probe.mjs
diff --git a/docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md
new file mode 100644
index 000000000..4cf6f967e
--- /dev/null
+++ b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md
@@ -0,0 +1,102 @@
+# Arena Synthesis: Issue 349
+
+## Rubric
+
+1. Uses current repo evidence for restart, persistence, and stop behavior.
+2. Distinguishes issue #349's laptop restart / v0.9.27 screenshot from issue
+   #338's CLI stop path and Windows residual.
+3. Correctly identifies duplicate, stale, already-fixed, or remaining valid
+   scope and the Human Checkpoint requirement.
+4. Proposes the smallest testable next step without broad persistence or
+   iii-engine boundary changes.
+5. Names inspected sources, commands, files, and residual uncertainty.
+
+## Scores
+
+| Candidate | Repo evidence | Issue distinction | Classification / checkpoint | Next step | Sources / uncertainty | Total |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| A | 5 | 5 | 5 | 4 | 5 | 24 |
+| B | 5 | 5 | 5 | 5 | 5 | 25 |
+| C | 4 | 5 | 5 | 3 | 3 | 20 |
+
+## Decision
+
+Base: Candidate B.
+
+Candidate B is the strongest base because it cleanly decomposes "restarted
+laptop" into separate possible paths: supported `agentmemory stop`, OS reboot
+while the daemon is running, forceful power loss, or a startup/catalog issue.
+That keeps the conclusion precise: issue #349 overlaps with the #338 data-loss
+family, but it is not implementation-ready and should not be closed or mutated
+without a Human Checkpoint.
+
+Grafts:
+- From Candidate A: current app-level index rebuilds use iii `state::list`; they
+  do not scan raw iii state files. If iii's catalog returns empty after boot,
+  patching around raw state files would cross engine/persistence boundaries and
+  needs approval.
+- From Candidate A: the #338 path/data-dir class is stale on current code, but a
+  literal OS/laptop restart is broader than the CLI stop path.
+- From Candidate C: compact final framing: #349 is stale or likely duplicate
+  only for the #338 `agentmemory stop` interpretation and is not independently
+  valid for implementation without a Human Checkpoint.
+
+Rejected:
+- Closing #349 now as already fixed. The public issue action requires approval,
+  and #349 says "restarted laptop", not confirmed `agentmemory stop`.
+- Implementing now. There is no current-main reproduction and the likely
+  distinct paths cross restart, persistence, iii-engine lifecycle, or startup
+  reconciliation boundaries.
+- Treating #1034 as a persistence change. The diff from the #338 merge to
+  current `origin/main` is iii runtime compatibility diagnostics and task docs.
+- Claiming the CLI stop fix covers arbitrary OS reboot. PR #1033 invokes the
+  checkpoint through `agentmemory stop`; a laptop reboot may bypass that endpoint
+  and rely on worker process signals or platform shutdown ordering.
+
+## Validity Finding
+
+Issue #349 requires a Human Checkpoint.
+
+Current evidence supports **already fixed / stale / likely duplicate only for
+the #338 class**: `agentmemory stop` now checkpoints the worker before native
+signals through `postShutdownFlush()`, `executeResponsiveNativeStop()`,
+`mem::shutdown-flush`, and authenticated `POST /agentmemory/shutdown/flush`.
+
+Current evidence does **not** prove a literal laptop or OS restart is fixed.
+The issue body and upstream source provide no commands, OS, logs, data-dir
+details, or current-version reproduction. The screenshot shows v0.9.27 before
+PR #1033 merged. The worker still has a normal `SIGINT`/`SIGTERM` shutdown path
+for non-CLI process termination, so a non-CLI reboot can bypass the #1033 CLI
+checkpoint.
+
+## Recommended Checkpoint Options
+
+Recommended: keep the issue open and post a clarification/retest comment asking
+for OS, current version, whether `agentmemory stop` was used before reboot,
+whether the issue reproduces on a build containing PR #1033, and whether old
+state files remain under the data directory after restart.
+
+Other options:
+- Close as covered by #338 / PR #1033 if the user accepts the ambiguity and
+  wants to treat the v0.9.27 report as stale or duplicate.
+- Approve a narrow validation task that first builds a reproduction harness for
+  OS/laptop restart behavior separately from the already-fixed CLI stop path.
+
+## Verification
+
+Arena verification completed by reading every candidate report and comparing
+the judge verdict with a parent source inspection:
+- Public issue #349 and upstream #876 contain the same sparse v0.9.27
+  laptop-restart report and no comments.
+- Public issue #338 is closed completed by PR #1033, merge commit
+  `2ecbe54aa822462c5480beb59ac0f391723dfabd`.
+- Current `origin/main` is
+  `257238ab1c318b2e9ae5efcbe72863b99c41ee35`.
+- `git diff --quiet 2ecbe54aa822462c5480beb59ac0f391723dfabd..origin/main --`
+  limited to the shutdown, index-persistence, API flush, and relevant test
+  files exited with status `0`, meaning #1034 did not change those surfaces.
+- `rg` confirms the #1033 shutdown flush path and the remaining worker signal
+  shutdown path.
+
+No implementation tests were run because the current outcome is a read-only
+validity checkpoint, not a code change.
diff --git a/docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md
new file mode 100644
index 000000000..cba9edb04
--- /dev/null
+++ b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md
@@ -0,0 +1,180 @@
+# Issue 349 GitHub Restart Retest Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add a GitHub Actions retest harness that validates the current #338 restart fix in clean GitHub runners for issue #349.
+
+**Architecture:** Keep production code unchanged. Add a small Node.js script under `scripts/github/` that starts the built CLI in an isolated temp `HOME`, writes a sentinel memory through REST, stops via the supported CLI path, restarts, and verifies the sentinel survives via REST search/list endpoints. Wire it into the existing CI workflow as a dedicated job on Ubuntu and macOS so the workflow runs from the normal PR path.
+
+**Tech Stack:** GitHub Actions, Node.js 22, pnpm 11, existing built `dist/cli.mjs`, built-in `fetch`, `node:child_process`, and repository REST endpoints.
+
+---
+
+## Files
+
+- Create: `scripts/github/issue-349-restart-retest.mjs`
+- Modify: `.github/workflows/ci.yml`
+- Modify: `docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md`
+
+## Task 1: Add The Retest Harness Script
+
+**Files:**
+- Create: `scripts/github/issue-349-restart-retest.mjs`
+
+- [x] **Step 1: Write the harness script**
+
+Create `scripts/github/issue-349-restart-retest.mjs` with these responsibilities:
+- Create a temp root, temp `HOME`, temp data dir, and temp invocation cwd.
+- Start `node dist/cli.mjs --data-dir <temp-data-dir>` with `HOME` and `AGENTMEMORY_READY_TIMEOUT_MS=120000`.
+- Wait for `/agentmemory/health`.
+- POST a unique sentinel to `/agentmemory/remember`.
+- Verify the sentinel appears via `/agentmemory/search` and `/agentmemory/memories`.
+- POST `/agentmemory/shutdown/flush`.
+- Run `node dist/cli.mjs stop` with the same temp `HOME` and data dir.
+- Restart the server with the same temp `HOME` and data dir.
+- Verify the sentinel still appears via `/agentmemory/search` and `/agentmemory/memories`.
+- Stop the restarted server.
+- Print structured progress lines and fail fast with safe diagnostics if any step fails.
+
+- [x] **Step 2: Run the script against a freshly built `dist/`**
+
+Run after build only:
+
+```bash
+corepack pnpm run build
+node scripts/github/issue-349-restart-retest.mjs
+```
+
+Expected on a clean runner: PASS. Expected locally in this worktree: may fail or be skipped if the default iii ports are already occupied by the user's daemon. Do not stop the user's daemon.
+
+Actual: `node --check scripts/github/issue-349-restart-retest.mjs` passed. The live harness was intentionally not run locally because the user's existing daemon is already listening on the default iii/REST ports; it will run on clean GitHub Actions runners after push.
+
+## Task 2: Wire The Harness Into Existing CI
+
+**Files:**
+- Modify: `.github/workflows/ci.yml`
+
+- [x] **Step 1: Add a dedicated job**
+
+Add a `restart-retest` job after the existing test job:
+- `runs-on: ${{ matrix.os }}`
+- matrix `os: [ubuntu-latest, macos-latest]`
+- checkout with `persist-credentials: false`
+- setup Node 22
+- enable corepack
+- `pnpm install --frozen-lockfile --ignore-scripts`
+- `pnpm run build`
+- `node scripts/github/issue-349-restart-retest.mjs`
+
+Keep it separate from the existing `test` job so failures point directly at issue #349 restart behavior.
+
+- [x] **Step 2: Verify the workflow text**
+
+Run:
+
+```bash
+git diff --check
+```
+
+Expected: no whitespace errors.
+
+Actual: `git diff --check` passed.
+
+## Task 3: Local Verification
+
+**Files:**
+- All touched files
+
+- [x] **Step 1: Run focused tests**
+
+Run:
+
+```bash
+corepack pnpm exec vitest run test/index-persistence.test.ts test/search.test.ts test/shutdown-flush.test.ts test/api-boundary-coverage.test.ts test/cli-stop-port-detection.test.ts test/reconnect-registration.test.ts test/engine-launch.test.ts test/runtime-config.test.ts test/cli-iii-config.test.ts test/consistency.test.ts
+```
+
+Expected: all targeted tests pass.
+
+Actual: passed, 10 test files / 138 tests.
+
+- [x] **Step 2: Run build**
+
+Run:
+
+```bash
+corepack pnpm run build
+```
+
+Expected: build exits 0 and produces `dist/cli.mjs`.
+
+Actual: `corepack pnpm run build` passed and produced `dist/cli.mjs`.
+
+- [x] **Step 3: Run local harness only if safe**
+
+Before running the live harness locally, verify no existing iii/agentmemory process is listening on `49134` or `3111`:
+
+```bash
+lsof -nP -iTCP:49134 -sTCP:LISTEN
+lsof -nP -iTCP:3111 -sTCP:LISTEN
+```
+
+If those ports are occupied, do not run the local live harness. Record the blocker and rely on GitHub Actions clean runners after push.
+
+Actual: ports `49134` and `3111` are occupied by the user's existing daemon, so the live harness was not run locally.
+
+## Task 4: Publish For GitHub Retest
+
+**Files:**
+- Git branch / PR metadata
+
+- [x] **Step 1: Stage and commit task-owned files**
+
+Run:
+
+```bash
+git add .github/workflows/ci.yml scripts/github/issue-349-restart-retest.mjs docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md docs/todos/2026-06-20-issue-349-lost-data-after-restart/arena-synthesis.md docs/todos/2026-06-20-issue-349-lost-data-after-restart/plan.md
+git commit -m "test: add issue 349 restart retest harness"
+```
+
+Actual: staged paths were limited to the workflow, GitHub retest script, and issue #349 task notes. `git diff --cached --check` passed and `gitleaks protect --staged --redact` found no leaks across about 30 KB of staged content before commit.
+
+- [x] **Step 2: Push to origin**
+
+Run only after local verification:
+
+```bash
+git push -u origin issue/349-lost-data-after-restart
+```
+
+Actual: pushed branch `issue/349-lost-data-after-restart` to `origin`.
+
+- [x] **Step 3: Create PR against `origin/main`**
+
+Use a PR body that states:
+- This is a retest harness for #349, not a product fix.
+- It compares #349 against #338 / PR #1033.
+- Local targeted tests and build passed.
+- Local live harness was blocked by an existing user daemon on default ports.
+- GitHub Actions clean runners are expected to run the restart harness.
+
+Actual: created PR #1038 against `origin/main`.
+
+- [x] **Step 4: Monitor GitHub Actions**
+
+Fetch PR check status and inspect failed job logs if any. Do not merge until the retest result is understood and the user approves the final issue outcome.
+
+Actual: first GitHub Actions run `27859863679` completed with both normal `test` jobs green and both new `Issue #349 restart retest` jobs red. Logs show the sentinel was visible before stop, `agentmemory stop` reported persistence, and the second start rebuilt the search index with zero entries before failing to find the sentinel.
+
+Follow-up: pushed diagnostic commit `78dd8f48` and reran GitHub Actions as
+`27859962631`. Normal test jobs passed again. Both restart-retest jobs failed
+again, now with explicit evidence that after restart both search and memory list
+lost the sentinel: `search=false memories=false`,
+`search={"format":"full","results":[],"tokens_used":0,"truncated":false}`,
+and `memories={"limit":null,"memories":[],"offset":0,"total":0}`.
+
+## Self-Review
+
+- The plan does not change production TypeScript runtime behavior.
+- The workflow change is isolated to a dedicated CI job for a user-approved GitHub retest.
+- The live harness uses temp `HOME`/data directories and no credentials.
+- The local daemon is explicitly protected from stop/reuse.
diff --git a/docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md
new file mode 100644
index 000000000..238e1575e
--- /dev/null
+++ b/docs/todos/2026-06-20-issue-349-lost-data-after-restart/todo.md
@@ -0,0 +1,484 @@
+# Issue 349 Lost Data After Restart
+
+Task id: `2026-06-20-issue-349-lost-data-after-restart`
+
+## Scope
+
+Handle fork issue #349 on branch `issue/349-lost-data-after-restart` from
+requested start ref `2ecbe54aa822462c5480beb59ac0f391723dfabd`, the merge
+commit for PR #1033 / issue #338.
+
+## Sprint Contract
+
+Goal: validate issue #349 against the merged #338 fix and current `origin/main`
+behavior, then either stop for the required Human Checkpoint or implement a
+surgical, tested fix if a distinct valid scope remains approved.
+
+Scope:
+- Validate issue #349 using public unauthenticated issue evidence and repo
+  evidence.
+- Compare against issue #338, PR #1033, merge commit
+  `2ecbe54aa822462c5480beb59ac0f391723dfabd`, and current `origin/main`.
+- Use `$arena` before implementation.
+- If a valid non-blocked scope remains, add focused red/green tests around the
+  reproduced restart/data-loss path before production edits.
+- Target only `origin` (`https://github.com/wbugitlab1/agentmemory.git`) for any
+  later PR workflow.
+
+Non-goals:
+- Do not target `https://github.com/rohitg00/agentmemory/` for branches, PRs, or
+  remote writes.
+- Do not import broad upstream patches or unrelated persistence behavior.
+- Do not change public APIs, tool counts, schemas, auth, dependencies, storage
+  model, iii-engine boundaries, or project policy without explicit approval.
+- Do not close, comment on, or otherwise mutate the issue before the required
+  checkpoint.
+
+Acceptance criteria:
+- Branch/worktree state is confirmed and recorded.
+- Issue #349 evidence is compared with #338 and current `origin/main`.
+- Arena synthesis records whether #349 is valid, duplicate, stale, already
+  fixed, or blocked by insufficient evidence.
+- Human checkpoint happens before duplicate/stale/already-fixed closure, scope
+  expansion, or persistence/engine-boundary implementation.
+- If implementation proceeds, verification covers every touched scope and
+  required security gates are run or blockers are recorded.
+
+Intended verification:
+- `git status -sb --untracked-files=all`
+- `git remote -v`
+- `git worktree list --porcelain`
+- Public unauthenticated issue/PR reads for #349, upstream #876, #338, and
+  PR #1033.
+- Focused source/test inspection around shutdown, persistence, and restart
+  behavior.
+- If implementation proceeds: targeted red/green tests, repo-native checks, and
+  required security scans.
+
+Known boundaries:
+- `origin/main` advanced to `257238ab1c318b2e9ae5efcbe72863b99c41ee35` after
+  issue #337 merged, but this branch was created from the requested #338 merge
+  commit `2ecbe54aa822462c5480beb59ac0f391723dfabd`.
+- The fork's `upstream` remote exists locally but is out of bounds for task
+  writes.
+- Issue #349 is sparse and imported from upstream #876; its screenshot shows
+  agentmemory v0.9.27 with zero dashboard counts after a laptop restart.
+- Issue #338 was closed by PR #1033 after adding an explicit shutdown flush
+  path for native stop and Windows fail-closed behavior.
+
+Stop conditions:
+- Stop before closing issue #349 as duplicate, stale, invalid, unreproducible, or
+  already fixed.
+- Stop before expanding scope beyond issue #349's evidence.
+- Stop before persistence, restart, iii-engine lifecycle, public API/tool/schema,
+  auth, dependency, storage-boundary, or project-policy changes.
+- Stop if arena conclusions diverge materially.
+- Stop if required verification or security gates fail and the finding is not
+  fixed.
+
+## Evidence
+
+- Active instructions: `AGENTS.md` and the project-local
+  `triage-next-github-issues` skill were read.
+- `$arena` skill was read and started with the required phase checklist.
+- `git status -sb --untracked-files=all`: clean before branch creation and clean
+  after branch creation.
+- `git remote -v`: `origin` targets `https://github.com/wbugitlab1/agentmemory.git`;
+  `upstream` exists but is out of bounds for task writes.
+- `git worktree list --porcelain`: this isolated worktree is
+  `/Users/A1538552/.codex/worktrees/3d23/agentmemory`.
+- Branch `issue/349-lost-data-after-restart` did not exist locally or on
+  `origin`; it was created from
+  `2ecbe54aa822462c5480beb59ac0f391723dfabd`.
+- Public issue #349: open, no comments, imports upstream #876, body says the
+  reporter restarted a laptop and lost all memories.
+- Upstream #876: open, same one-line body and screenshot, no comments.
+- Screenshot: dashboard from agentmemory v0.9.27 on June 9, 2026, with sessions,
+  memories, graph nodes, and function calls all at zero.
+- Public issue #338: closed completed at 2026-06-20T03:21:45Z.
+- PR #1033: merged at 2026-06-20T03:21:44Z with merge commit
+  `2ecbe54aa822462c5480beb59ac0f391723dfabd`; summary says it adds
+  `mem::shutdown-flush`, `POST /agentmemory/shutdown/flush`, pre-stop
+  checkpointing, Windows fail-closed behavior, and readiness gating.
+
+## Feature / Verification Matrix
+
+| Change / Decision | Verification method | Status | Evidence |
+| --- | --- | --- | --- |
+| Branch/worktree setup | Git commands | Done | Branch created from requested #338 merge commit; status clean. |
+| Issue evidence read | Public unauthenticated GitHub API and screenshot inspection | Done | #349 and upstream #876 contain the same sparse v0.9.27 restart-loss report and no comments. |
+| #338 comparison | Public PR/issue evidence plus local task docs/source | Done | #338 fix artifacts and shutdown code inspected; PR #1033 covers the CLI `agentmemory stop` path. |
+| Current `origin/main` comparison | Git diff/log from #338 merge to `origin/main` | Done | Current `origin/main` includes #1034; since the #338 merge there are no changes to shutdown flush, index persistence, API flush, core shutdown, or relevant tests. |
+| Arena validity synthesis | `$arena` | Done | Synthesis saved in `arena-synthesis.md`; Candidate B selected as base with A/C grafts. |
+| Local retest | Focused regression tests, build, safe live-harness feasibility check | Done | Targeted #338/#349-relevant tests passed 10 files / 138 tests; build passed; built CLI reports `0.9.28`. Isolated native live harness blocked by existing iii process on default ports. |
+| GitHub restart retest harness | Script/workflow inspection, syntax check, workflow-focused tests, build, GitHub Actions after push | Remote reproduced restart data loss | Added a dedicated GitHub Actions restart-retest job for Ubuntu and macOS plus `scripts/github/issue-349-restart-retest.mjs`. Local `node --check`, quality/consistency tests, `git diff --check`, and build passed. GitHub runs `27859863679` and `27859962631` had normal test jobs green and restart-retest jobs red. The second run shows after restart both search and memory list lost the sentinel (`search=false memories=false`, `/memories` returned `total:0`). |
+| Security gates | Semgrep and staged Gitleaks | Done | Initial Semgrep flagged the local loopback health checks; narrow `nosemgrep` rationales were added for those two CI harness fetches. Final `semgrep scan --config p/default --error --metrics=off .` passed with 0 findings across 997 targets; `gitleaks protect --staged --redact` found no leaks across about 30 KB staged content. |
+| Human checkpoint | User decision | Diagnostic pivot approved; public issue actions still pending | User approved the engine-boundary diagnostic pivot with "gut, tue es". Public issue comments/closure and stopping/reusing the user's live daemon remain unapproved. |
+| Persistence fix attempt 1 | Red/green tests, source inspection, build, full test suite, GitHub Actions | Remote retest failed | Added a shutdown `KV.state` checkpoint with readback after index flush and a confirmed-checkpoint settle phase before signaling iii-engine. RED run failed on the missing state checkpoint and missing settle event; GREEN run passed. GitHub Actions run `27862336799` still lost memories after restart, disproving this as sufficient. |
+| Persistence fix attempt 2 | Red/green tests, focused restart/flush tests, GitHub Actions | Remote retest failed | Diagnostic run `27862428578` showed the sentinel and shutdown checkpoint visible in live `state::list` / `state::get` after flush, but no persisted state after the engine was stopped with `SIGTERM`. Added a test-backed change to signal iii-engine with `SIGINT` after a confirmed checkpoint while preserving `SIGTERM` for unconfirmed and force paths. GitHub Actions run `27862548284` still lost memories after restart, disproving this as sufficient. |
+| Arena next-step synthesis | `$arena` with three candidates and cross-judge | Done | All candidates converged on stopping production fixes and adding an engine-only iii-state persistence probe. Candidate C was selected as base with B/A grafts; synthesis saved at `/tmp/arena-issue349-next/synthesis.md`. |
+| Engine-only iii-state probe | Red/green static tests, syntax check, CI matrix after push | Engine-boundary failure proven; Human Checkpoint | Added `scripts/github/issue-349-engine-state-probe.mjs` and CI job `engine-state-probe` to test direct `state::set/get/list/list_groups` persistence across an iii-engine restart without agentmemory REST, worker, shutdown flush, or `agentmemory stop` in the path. After harness fixes, GitHub run `27865569335` reached the actual state check on Ubuntu and macOS: direct `state::set/get/list/list_groups` verified the sentinel while the first engine was live, the engine stopped cleanly with `SIGINT` and exit code 0, then the second engine returned empty `state::get`, empty `state::list`, and no `probe:issue-349` group. This proves #349 at the iii-engine file-based state boundary. |
+
+## Subagent Ledger
+
+| Workstream | Scope | Edits allowed | Expected output | Result | Residual risk |
+| --- | --- | --- | --- | --- | --- |
+| Arena candidate A | Issue #349 validity | No | Validity report with #338/current-main comparison | Done | Strong repo evidence and boot-reconcile warning; selected for grafts. |
+| Arena candidate B | Issue #349 validity | No | Validity report with #338/current-main comparison | Done | Selected as base; best scenario decomposition and next-step framing. |
+| Arena candidate C | Issue #349 validity | No | Validity report with #338/current-main comparison | Done | Concise checkpoint framing; selected for graft. |
+| Arena judge | Candidate reports and rubric | No | Scores and recommended base | Done | Recommended Candidate B with A/C grafts; agreed Human Checkpoint is required. |
+
+## Progress Notes
+
+- 2026-06-20: Read active repo instructions, the project-local triage workflow,
+  `$arena`, and `using-superpowers`.
+- 2026-06-20: Confirmed this worktree initially had a detached `HEAD` and no dirty
+  files. Fetched `origin`; `origin/main` advanced beyond the requested #338
+  merge commit to `257238ab1c318b2e9ae5efcbe72863b99c41ee35`.
+- 2026-06-20: Verified requested start commit
+  `2ecbe54aa822462c5480beb59ac0f391723dfabd` exists and is an ancestor of
+  current `origin/main`; created branch `issue/349-lost-data-after-restart` from
+  that commit.
+- 2026-06-20: Public unauthenticated issue reads show #349/upstream #876 are
+  sparse restart-loss reports with no reproduction commands, no comments, and a
+  v0.9.27 zero-count dashboard screenshot.
+- 2026-06-20: Started `$arena` validity pass before implementation or closure.
+- 2026-06-20: `$arena` completed. All candidates and the judge converged that
+  #349 is likely stale/duplicate only for the #338 `agentmemory stop` path, but
+  the literal laptop/OS restart wording is not proven fixed and is not
+  implementation-ready. Synthesis saved in `arena-synthesis.md`.
+- 2026-06-20: Parent verification confirmed current `origin/main`
+  (`257238ab1c318b2e9ae5efcbe72863b99c41ee35`) changes only issue #337 iii
+  runtime compatibility surfaces after #338; the shutdown flush, index
+  persistence, API flush, core shutdown, and relevant tests are unchanged from
+  the #338 merge.
+- 2026-06-20: Current state is Human Checkpoint. Recommended action is to keep
+  #349 open and, if approved, post a public clarification/retest comment.
+  Closing as duplicate/stale or starting OS-restart implementation both require
+  explicit approval.
+- 2026-06-20: User approved the retest path with "mach den retest". Installed
+  locked dependencies with
+  `NPM_CONFIG_USERCONFIG=/dev/null corepack pnpm install --frozen-lockfile --ignore-scripts`
+  after confirming no repo-local `.npmrc`/`.pnpmrc`. The install reused cached
+  packages, did not change manifests or lockfiles, and ran no lifecycle scripts.
+- 2026-06-20: Focused regression retest passed:
+  `corepack pnpm exec vitest run test/index-persistence.test.ts test/search.test.ts test/shutdown-flush.test.ts test/api-boundary-coverage.test.ts test/cli-stop-port-detection.test.ts test/reconnect-registration.test.ts test/engine-launch.test.ts test/runtime-config.test.ts test/cli-iii-config.test.ts test/consistency.test.ts`
+  reported 10 test files and 138 tests passed.
+- 2026-06-20: Build retest passed with `corepack pnpm run build`; generated
+  ignored `dist/` output and the built CLI returned `0.9.28` for
+  `node dist/cli.mjs --version`.
+- 2026-06-20: Isolated native live restart harness was not run. `lsof` showed an
+  existing iii/agentmemory process listening on `*:49134` and
+  `127.0.0.1:3111`; repo docs and `/Users/A1538552/.agentmemory/bin/iii --help`
+  confirm bundled native iii v0.11.2 exposes `--config` but no verified listen
+  port relocation. Starting a second native engine is therefore not safe, and
+  stopping/reusing the existing daemon would affect real user memory state
+  outside this worktree.
+- 2026-06-20: User approved the GitHub retest path. Added
+  `scripts/github/issue-349-restart-retest.mjs`, which starts the built CLI in
+  a temp `HOME` and data directory, writes a sentinel memory through REST,
+  verifies it, calls `/agentmemory/shutdown/flush`, stops through the supported
+  CLI `stop` path, restarts, and verifies the sentinel survived.
+- 2026-06-20: Added a dedicated `.github/workflows/ci.yml` job named
+  `restart-retest` with Ubuntu and macOS runners. Local harness syntax check
+  passed with `node --check scripts/github/issue-349-restart-retest.mjs`.
+  Workflow-focused tests passed with
+  `corepack pnpm exec vitest run test/quality-gates.test.ts test/consistency.test.ts`;
+  `git diff --check` passed; `corepack pnpm run build` passed. The actual live
+  restart retest is pending GitHub Actions on a clean runner after push.
+- 2026-06-20: Security gates completed for the GitHub retest harness. The first
+  Semgrep pass reported two HTTP findings for local loopback health checks in
+  the CI harness; added narrow `nosemgrep` comments explaining the loopback-only
+  daemon boundary and reran `semgrep scan --config p/default --error --metrics=off .`,
+  which passed with 0 findings across 997 targets. Staged Gitleaks passed with
+  no leaks across about 30 KB of staged content.
+- 2026-06-20: Created PR #1038 from
+  `issue/349-lost-data-after-restart` to `origin/main`. GitHub Actions run
+  `27859863679` completed with both normal `test` jobs green and both new
+  `Issue #349 restart retest` jobs red. Ubuntu and macOS logs both show the
+  sentinel visible before stop, `agentmemory stop` reporting persistence, and
+  second start rebuilding the search index with `entries:0` before the sentinel
+  search failed. This makes #349 valid enough to block closure as duplicate or
+  already-fixed; any production persistence/restart fix remains behind the
+  required Human Checkpoint.
+- 2026-06-20: The first macOS failed log also showed cleanup masking risk after
+  the second-start failure. Hardened the harness to fetch both `/search` and
+  `/memories` before throwing, wait for stopped child processes, and avoid
+  cleanup failures masking the original restart result.
+- 2026-06-20: Pushed diagnostic commit `78dd8f48` and monitored GitHub Actions
+  run `27859962631`. Normal `test` jobs passed on Ubuntu and macOS. Both
+  `Issue #349 restart retest` jobs failed again. The improved error on both
+  platforms was
+  `after-restart: sentinel verification failed search=false memories=false`;
+  `/search` returned no results and `/memories` returned
+  `{"limit":null,"memories":[],"offset":0,"total":0}`. This reproduces #349 as
+  restart data loss in clean GitHub runners after the merged #338 fix.
+- 2026-06-20: Current checkpoint: issue #349 is valid; it is not a duplicate,
+  already fixed, or stale. Production remediation would cross the persistence /
+  restart boundary and requires explicit Human Checkpoint approval before
+  implementation. Public issue comments also require explicit approval before
+  posting.
+- 2026-06-20: User approved retrying the remediation path with "versuchs
+  erneut". Public issue comments/closure were not posted. Root-cause pass found
+  that the #338 shutdown flush only saved BM25/vector index state and did not
+  create an application-level `state::set` checkpoint after the memory write
+  path; `agentmemory stop` then signaled the worker and immediately signaled
+  iii-engine. The local SDK exposes no dedicated engine state flush.
+- 2026-06-20: Added RED tests for the missing `KV.state` shutdown checkpoint
+  and the missing confirmed-checkpoint settle phase before the engine signal.
+  The RED command
+  `corepack pnpm exec vitest run test/shutdown-flush.test.ts test/cli-stop-port-detection.test.ts`
+  failed as expected with three assertions: `statePersistence.set` was never
+  called, state checkpoint mismatch did not reject, and the stop event sequence
+  lacked `settle`.
+- 2026-06-20: Implemented a surgical fix: `mem::shutdown-flush` now saves the
+  search index, writes `KV.state/system:shutdownFlush`, reads it back, and
+  rejects if the checkpoint is not confirmed. The responsive native stop path
+  waits one second after a confirmed pre-stop checkpoint and worker signal
+  before signaling iii-engine. No public REST/MCP/tool schema or dependency
+  surface changed.
+- 2026-06-20: Local verification after the fix passed:
+  `corepack pnpm exec vitest run test/shutdown-flush.test.ts test/cli-stop-port-detection.test.ts`
+  reported 2 files / 19 tests passed; the broader focused set reported
+  10 files / 140 tests passed; `corepack pnpm run build`, `node --check
+  scripts/github/issue-349-restart-retest.mjs`, and `git diff --check` passed;
+  full `corepack pnpm test` reported 221 files / 3023 tests passed.
+- 2026-06-20: Security gate for the persistence implementation passed:
+  `semgrep scan --config p/default --error --metrics=off .` completed with
+  0 findings across 997 tracked targets. OSV was not run because no dependency,
+  lockfile, container, vendored, package-manager, or third-party package
+  surface changed.
+- 2026-06-20: Pushed fix commit `89484acd`. GitHub Actions run
+  `27862336799` passed the normal Ubuntu/macOS test jobs but both restart
+  retest jobs still failed. Logs confirm the new settle delay happened between
+  the worker stop and iii-engine stop, yet second start rebuilt the search index
+  with `entries:0` and `/memories` returned `total:0`. This disproves the
+  "missing application-level state barrier plus immediate engine SIGTERM"
+  hypothesis: closing that gap was not a sufficient fix.
+- 2026-06-20: Added CI-harness diagnostics before attempting a second
+  production fix. The harness now prints generated iii config files, the
+  isolated data directory tree, and direct `iii trigger state::list_groups`,
+  `state::list mem:memories`, and `state::get mem:state/system:shutdownFlush`
+  output after the flush and after the second start. Local verification for
+  this diagnostic change passed: `node --check
+  scripts/github/issue-349-restart-retest.mjs`,
+  `corepack pnpm exec vitest run test/quality-gates.test.ts
+  test/consistency.test.ts`, `git diff --check`, and
+  `semgrep scan --config p/default --error --metrics=off .`.
+- 2026-06-20: GitHub diagnostic run `27862428578` passed the normal Ubuntu and
+  macOS test jobs and failed both restart retest jobs. Diagnostics narrowed the
+  failure to the iii-engine file-based KV shutdown path: after the explicit
+  flush, direct `iii trigger state::list mem:memories` returned the sentinel
+  memory and `state::get mem:state/system:shutdownFlush` returned the checkpoint;
+  after `agentmemory stop` and second start, `state::list_groups` returned only
+  `mem:health`, `state::list mem:memories` was empty, and the isolated data tree
+  still showed only empty top-level `state_store.db` and `stream_store`
+  directories.
+- 2026-06-20: Second production hypothesis: the confirmed app-level checkpoint
+  is live in iii-engine, but `SIGTERM` does not give the bundled file-based KV
+  adapter a graceful shutdown path. Added RED expectations that confirmed
+  responsive stops signal iii-engine with `SIGINT` after the worker signal and
+  settle phase, while unconfirmed and force stops continue to use `SIGTERM`.
+  The RED run of
+  `corepack pnpm exec vitest run test/cli-stop-port-detection.test.ts` failed
+  exactly on `SIGTERM` versus expected `SIGINT`. Implemented the minimal signal
+  selection in `executeResponsiveNativeStop`; the GREEN run reported 15 tests
+  passed, and the adjacent focused set
+  `corepack pnpm exec vitest run test/shutdown-flush.test.ts
+  test/cli-stop-port-detection.test.ts test/reconnect-registration.test.ts`
+  reported 3 files / 27 tests passed.
+- 2026-06-20: Local verification for the second production attempt passed:
+  `git diff --check`, `node --check scripts/github/issue-349-restart-retest.mjs`,
+  `corepack pnpm run build`, full `corepack pnpm test` (221 files / 3023
+  tests), `semgrep scan --config p/default --error --metrics=off .` (0 findings
+  across 997 tracked targets), and `gitleaks protect --staged --redact` (no
+  leaks across about 2.67 KB staged content). OSV was not run because no
+  dependency, lockfile, container, vendored, package-manager, or third-party
+  package surface changed.
+- 2026-06-20: Pushed second fix attempt commit `8eb5cfc4`. GitHub Actions run
+  `27862548284` passed the normal Ubuntu/macOS test jobs but both Issue #349
+  restart retest jobs failed again. The filtered logs show the same persistence
+  boundary failure after the stop: the configured `file_path` still points at
+  the isolated temp `data/state_store.db`, the data tree still contains only the
+  empty top-level `state_store.db` and `stream_store` directories, direct
+  `state::list_groups` after restart returns only `mem:health`, direct
+  `state::list mem:memories` returns `[]`, the shutdown checkpoint read returns
+  empty stdout, and `/search` plus `/memories` both miss the sentinel. This
+  disproves both implemented production hypotheses so far: adding an app-level
+  state checkpoint plus settle delay, and using `SIGINT` for confirmed engine
+  stop. Stop before a third production attempt; next step needs a Human
+  Checkpoint / architecture decision on the iii-engine file-based persistence
+  boundary rather than another blind stop-path tweak.
+- 2026-06-20: Ran `$arena` for the next-step recommendation. Candidate C was
+  selected as the base by the cross-judge, with Candidate B's decision table /
+  boundary wording and Candidate A's identity-proof requirements grafted in.
+  The synthesized recommendation is to stop production patching and add a
+  diagnostic-only engine boundary proof that bypasses agentmemory REST handlers,
+  memory write paths, shutdown flush, worker execution, and `agentmemory stop`.
+  Synthesis was saved at `/tmp/arena-issue349-next/synthesis.md`.
+- 2026-06-20: User approved the diagnostic pivot with "gut, tue es". Added RED
+  static tests requiring an engine-only Issue #349 probe script and CI wiring.
+  The RED command
+  `corepack pnpm exec vitest run test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts`
+  failed as expected because the script and workflow job were missing. Added
+  `scripts/github/issue-349-engine-state-probe.mjs`, which downloads pinned
+  iii v0.11.2, starts a minimal `iii-state` `file_based` engine with a temp
+  absolute `state_store.db` path, writes/verifies a direct sentinel through
+  `state::set/get/list/list_groups`, stops/restarts the same engine with
+  `SIGINT`, and logs version, binary path, config, cwd, pid, signal, exit
+  status, and data-tree evidence. Added CI job `engine-state-probe` for Ubuntu
+  and macOS. GREEN verification for the static probe tests passed 2 files / 24
+  tests, and `node --check scripts/github/issue-349-engine-state-probe.mjs`
+  passed. The probe itself was not run locally because it intentionally refuses
+  an existing iii-engine on the default port; GitHub runners provide the
+  isolated proof loop.
+- 2026-06-20: Final local verification before commit for the engine-only probe
+  passed: `corepack pnpm exec vitest run
+  test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts
+  test/consistency.test.ts` reported 3 files / 34 tests passed; `node --check`
+  passed for both Issue #349 GitHub harness scripts; `git diff --check` passed;
+  full `corepack pnpm test` reported 222 files / 3027 tests passed; `semgrep
+  scan --config p/default --error --metrics=off .` passed with 0 findings
+  across 998 tracked targets; `gitleaks protect --staged --redact` found no
+  leaks across about 18.09 KB staged content. OSV was not run because no
+  dependency, lockfile, container, vendored, package-manager, or third-party
+  package surface changed.
+- 2026-06-20: Pushed engine-only probe commit `1384de31`. GitHub Actions run
+  `27865094228` passed the normal Ubuntu/macOS test jobs, and both existing
+  restart-retest jobs still failed with `search=false memories=false`. The new
+  engine-only probe jobs were not yet valid evidence: unquoted YAML job names
+  containing `#349` were parsed as comments and displayed only as `Issue
+  (ubuntu-latest)` / `Issue (macos-latest)`, making `gh` job-log retrieval
+  ambiguous with the restart retest jobs; the macOS engine probe also failed
+  before exercising state persistence with `Error: spawn tar ENOENT`.
+- 2026-06-20: Hardened the diagnostic harness without touching product
+  persistence behavior: quoted the Issue #349 CI job names, used an absolute
+  Unix `/usr/bin/tar` path for the iii release extraction, and added static
+  tests for both requirements. Local verification for this harness correction
+  passed with `corepack pnpm exec vitest run
+  test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts`
+  reporting 2 files / 25 tests passed and `node --check
+  scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is
+  pending after push.
+- 2026-06-20: Pushed harness fix commit `999f7513`. GitHub Actions run
+  `27865198014` had unambiguous job names and passed both normal Ubuntu/macOS
+  test jobs. Both existing restart-retest jobs still failed. The engine-only
+  probe still did not reach the state-persistence check: Ubuntu and macOS both
+  failed while extracting the iii release with `spawn /usr/bin/tar ENOENT`.
+  Added a RED static test requiring Node-based `.tar.gz` extraction without an
+  external `tar` binary; the RED run failed as expected against the current
+  script. Replaced the external extraction with a small Node `gunzipSync` tar
+  reader that writes regular file entries under the probe bin directory with
+  path-escape checks. GREEN verification passed with `corepack pnpm exec vitest
+  run test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts`
+  reporting 2 files / 25 tests passed and `node --check
+  scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is
+  pending after push.
+- 2026-06-20: Pushed Node extraction commit `c067fea1`. GitHub Actions run
+  `27865292639` passed both normal Ubuntu/macOS test jobs; both restart-retest
+  jobs still failed. The engine-only probe advanced past the external `tar`
+  dependency but still did not reach the state-persistence check: both OSes
+  failed with `spawn <temp>/bin/iii ENOENT` when running `iii --version`.
+  A local Node-only archive parse of the same iii v0.11.2 release asset on this
+  Mac lists and executes `iii` successfully, so the next harness correction is
+  to log the extracted tree and locate the binary from the archive contents
+  instead of assuming a fixed `bin/iii` path. Added RED static expectations for
+  `after-download` tree logging and `findIiiBinary`; the RED run failed as
+  expected. Implemented recursive binary discovery and post-download tree
+  logging. GREEN verification passed with `corepack pnpm exec vitest run
+  test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts`
+  reporting 2 files / 25 tests passed and `node --check
+  scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is
+  pending after push.
+- 2026-06-20: Pushed binary discovery commit `cb3eeb46`. GitHub Actions run
+  `27865399960` passed both normal Ubuntu/macOS test jobs; both restart-retest
+  jobs still failed. The engine-only probe now logs that `bin/iii` exists after
+  extraction on both OSes (`33220928b` on Ubuntu, `28371136b` on macOS), but
+  spawning it from the OS temp probe directory still fails with `ENOENT`. This
+  keeps the run in harness-failure territory rather than engine-state evidence.
+  Added a RED static expectation that executable probe files live under the
+  checkout workspace by default (`process.cwd()`) or an explicit
+  `ISSUE_349_PROBE_PARENT`, not OS temp. The RED run failed as expected.
+  Implemented checkout-local probe workspaces with `.agentmemory-issue-349-engine-*`
+  directories and cleanup. GREEN verification passed with `corepack pnpm exec
+  vitest run test/issue-349-engine-state-probe.test.ts
+  test/quality-gates.test.ts` reporting 2 files / 26 tests passed and
+  `node --check scripts/github/issue-349-engine-state-probe.mjs` passing.
+  Remote retest is pending after push.
+- 2026-06-20: Pushed checkout-workspace commit `55dd073e`. GitHub Actions run
+  `27865479396` still failed before the state-persistence check. The logs
+  showed the extracted `iii` existed under the checkout-local probe workspace,
+  but spawning it still returned `ENOENT`. Root cause in the harness: `run()`
+  defaults to `engineCwd`, and `downloadIii()` ran `iii --version` before
+  `engineCwd` was created; the ENOENT was the missing cwd, not the binary.
+  Added a RED static test that `engineCwd` is created before the version
+  check; the RED run failed as expected. Moved `mkdir(engineCwd)` before the
+  first `run()` call in `downloadIii()`. GREEN verification passed with
+  `corepack pnpm exec vitest run
+  test/issue-349-engine-state-probe.test.ts test/quality-gates.test.ts`
+  reporting 2 files / 27 tests passed and `node --check
+  scripts/github/issue-349-engine-state-probe.mjs` passing. Remote retest is
+  pending after push.
+- 2026-06-20: Pushed cwd-creation harness fix commit `cdbbf0ae`. GitHub
+  Actions run `27865569335` passed the normal Ubuntu/macOS test jobs and still
+  failed both existing restart-retest jobs with the sentinel missing after
+  restart. The engine-only probe jobs now reached the direct iii-state
+  persistence check on both OSes and failed there, making the result valid
+  engine-boundary evidence rather than another harness failure. On Ubuntu and
+  macOS, pinned iii v0.11.2 started with a minimal `iii-state` `file_based`
+  config using an absolute checkout-local `state_store.db` path; direct
+  `state::set/get/list/list_groups` verified `probe:issue-349` while the first
+  engine was live; the engine then stopped via `SIGINT` with exit code 0. After
+  restart, direct `state::get` returned empty stdout, `state::list` returned
+  `[]`, and `state::list_groups` no longer included `probe:issue-349`. This
+  proves #349 is not confined to agentmemory REST handlers, memory writers,
+  shutdown flush, worker lifecycle, or `agentmemory stop`; the current blocker
+  is iii-engine file-based state persistence across restart. Stop at Human
+  Checkpoint before any third production attempt, engine-boundary workaround,
+  public issue/PR comment, or upstream issue filing.
+- 2026-06-20: Rechecked the local Mac after the user restarted agentmemory. A
+  deliberately saved `memory_save` sentinel (`mem_mqmb6npc_d32cbc726ecd`)
+  survived restart via REST and direct iii `state::get`, with
+  `/Users/A1538552/.agentmemory/data/state_store.db/mem%3Amemories.bin` present.
+  Local state already contained hundreds of persisted observation/session scope
+  files, but no historical explicit `mem:memories` entries before the sentinel;
+  consolidation is disabled locally because no LLM provider is configured.
+  Follow-up discrepancy probe: local direct `state::set` into a new
+  `probe:issue-349-local-timing` scope did not create its `.bin` file
+  immediately, but the file appeared after a 3-second wait while the engine
+  stayed running. CI run `27865672399` stops 12-40 ms after the live state proof,
+  and the data tree shows `state_store.db` remains an empty directory on both
+  Ubuntu and macOS. Current working hypothesis: the CI failure is exposing the
+  iii file-based adapter's delayed disk flush window; it does not prove that all
+  previously flushed local memories are lost on restart. Next remote diagnostic,
+  before any production fix, should add an intentional pre-stop wait or poll for
+  the expected scope file and compare the restart result.
+- 2026-06-20: Added the pre-stop scope-file materialization diagnostic to the
+  engine-only probe. RED verification:
+  `corepack pnpm exec vitest run test/issue-349-engine-state-probe.test.ts`
+  failed 1/7 on the missing `encodedScopeFilePath` assertion. GREEN change:
+  `scripts/github/issue-349-engine-state-probe.mjs` now computes the expected
+  encoded scope file path, waits up to 15 seconds for that `.bin` file to
+  materialize after live `state::set/get/list/list_groups` verification, logs
+  `scope file materialized`, and only then stops the first engine. GREEN
+  verification passed with the same Vitest target reporting 7/7 tests and
+  `node --check scripts/github/issue-349-engine-state-probe.mjs` exiting 0.
+  The CI result should classify the remaining branch: if restart passes after
+  the file appears, #349 is a delayed flush window; if restart still fails after
+  the file appears, the reload/read path is suspect; if the file never appears,
+  the file-based adapter is not writing the fresh scope in the CI setup.
+- 2026-06-20: Pushed diagnostic commit `d5061f94`. GitHub Actions run
+  `27872251631` completed with normal test jobs green, both original
+  restart-retest jobs still red, and both engine-only probe jobs green. Engine
+  probe evidence: Ubuntu materialized
+  `data/state_store.db/probe%3Aissue-349.bin` after 5014 ms, then direct
+  `state::get/list/list_groups` survived restart and the job logged
+  `PASS direct iii state survived engine restart`; macOS materialized the same
+  scope file after 5075 ms and also survived restart. The unchanged app-level
+  restart retests still stopped before any memory scope file appeared; their
+  `state_store.db` directories remained empty and sentinel verification failed
+  after restart. Classification updated: #349 is now best explained as a
+  delayed iii file-based state flush/materialization window, not universal loss
+  of already-materialized state. A product-side mitigation should wait for
+  durable scope materialization or otherwise force/observe iii persistence
+  before reporting shutdown complete.
diff --git a/scripts/github/issue-349-engine-state-probe.mjs b/scripts/github/issue-349-engine-state-probe.mjs
new file mode 100644
index 000000000..9d0175fbc
--- /dev/null
+++ b/scripts/github/issue-349-engine-state-probe.mjs
@@ -0,0 +1,443 @@
+#!/usr/bin/env node
+
+import { spawn } from "node:child_process";
+import { createConnection } from "node:net";
+import { chmod, mkdir, mkdtemp, readFile, readdir, rm, stat, writeFile } from "node:fs/promises";
+import { dirname, join, relative, resolve, sep } from "node:path";
+import { gunzipSync } from "node:zlib";
+
+const III_VERSION = process.env.AGENTMEMORY_III_VERSION ?? "0.11.2"; // iii release to download; override via AGENTMEMORY_III_VERSION
+const ENGINE_PORT = 49134; // TCP port probed on 127.0.0.1 by portOpen before each engine start
+const probeParent = process.env.ISSUE_349_PROBE_PARENT ?? process.cwd(); // parent directory of the throwaway workspace
+const root = await mkdtemp(join(probeParent, ".agentmemory-issue-349-engine-")); // unique per-run workspace; removed at exit unless ISSUE_349_KEEP_PROBE_DIR=1
+const binDir = join(root, "bin"); // release archive is unpacked here
+const dataDir = join(root, "data");
+const engineCwd = join(root, "engine-cwd"); // working directory for the engine and for every run() child
+const configPath = join(root, "iii-config.yaml");
+const stateStorePath = join(dataDir, "state_store.db"); // written into the config as the adapter's file_path
+const sentinel = `ISSUE_349_ENGINE_STATE_PROBE_${Date.now()}_${Math.random()
+  .toString(36)
+  .slice(2)}`; // unique per-run marker stored in state and searched for in trigger output
+const scope = "probe:issue-349";
+const key = "sentinel";
+const stopSignal = "SIGINT"; // signal stopEngine sends to the engine
+const probeEnv = { // environment for all child processes
+  ...process.env,
+  HOME: join(root, "home"), // home directory redirected into the workspace
+  USERPROFILE: join(root, "home"),
+  CI: "true",
+};
+
+let engine = null; // engine child process currently tracked for cleanup, or null
+let iiiBin = join(binDir, process.platform === "win32" ? "iii.exe" : "iii"); // default location; downloadIii replaces it with the discovered path
+
+function log(message) {
+  console.log(`[issue-349-engine-probe] ${message}`);
+}
+
+function yamlSingleQuote(value) {
+  return `'${value.replace(/'/g, "''")}'`;
+}
+
+function delay(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+function encodedScopeFilePath(scopeName) {
+  return join(stateStorePath, `${encodeURIComponent(scopeName)}.bin`);
+}
+
+function releaseAsset() {
+  if (process.platform === "darwin" && process.arch === "arm64") {
+    return "iii-aarch64-apple-darwin.tar.gz";
+  }
+  if (process.platform === "darwin" && process.arch === "x64") {
+    return "iii-x86_64-apple-darwin.tar.gz";
+  }
+  if (process.platform === "linux" && process.arch === "x64") {
+    return "iii-x86_64-unknown-linux-gnu.tar.gz";
+  }
+  if (process.platform === "linux" && process.arch === "arm64") {
+    return "iii-aarch64-unknown-linux-gnu.tar.gz";
+  }
+  return null;
+}
+
+function releaseUrl() {
+  const asset = releaseAsset();
+  if (!asset) {
+    throw new Error(`unsupported iii probe platform ${process.platform}/${process.arch}`);
+  }
+  return `https://github.com/iii-hq/iii/releases/download/iii/v${III_VERSION}/${asset}`;
+}
+
+async function run(command, args, options = {}) {
+  const child = spawn(command, args, {
+    cwd: options.cwd ?? engineCwd,
+    env: options.env ?? probeEnv,
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+  let stdout = "";
+  let stderr = "";
+  child.stdout.on("data", (chunk) => {
+    stdout += chunk.toString();
+  });
+  child.stderr.on("data", (chunk) => {
+    stderr += chunk.toString();
+  });
+  const { code, signal } = await new Promise((resolve) => {
+    child.on("close", (code, signal) => resolve({ code, signal }));
+  });
+  return { code, signal, stdout, stderr };
+}
+
+function tarString(header, start, length) {
+  return header
+    .subarray(start, start + length)
+    .toString("utf8")
+    .replace(/\0.*$/s, "")
+    .trim();
+}
+
+function tarOctal(header, start, length) {
+  const raw = tarString(header, start, length);
+  return raw ? Number.parseInt(raw, 8) : 0;
+}
+
+function tarTarget(destination, name) {
+  if (!name || name === ".") return null;
+  const destinationRoot = resolve(destination);
+  const target = resolve(destinationRoot, name);
+  if (target !== destinationRoot && !target.startsWith(`${destinationRoot}${sep}`)) {
+    throw new Error(`archive entry escapes destination: ${name}`);
+  }
+  return target;
+}
+
+async function extractTarGz(archiveBytes, destination) {
+  const archive = gunzipSync(archiveBytes);
+  let offset = 0;
+
+  while (offset + 512 <= archive.length) {
+    const header = archive.subarray(offset, offset + 512);
+    if (header.every((byte) => byte === 0)) break;
+
+    const name = tarString(header, 0, 100);
+    const prefix = tarString(header, 345, 155);
+    const fullName = prefix ? `${prefix}/${name}` : name;
+    const type = String.fromCharCode(header[156] || 0);
+    const size = tarOctal(header, 124, 12);
+    offset += 512;
+
+    const target = tarTarget(destination, fullName);
+    if (target && type === "5") {
+      await mkdir(target, { recursive: true });
+    } else if (target && (type === "0" || type === "\0")) {
+      await mkdir(dirname(target), { recursive: true });
+      await writeFile(target, archive.subarray(offset, offset + size));
+    }
+
+    offset += Math.ceil(size / 512) * 512;
+  }
+}
+
+async function findIiiBinary(dir) {
+  const expectedName = process.platform === "win32" ? "iii.exe" : "iii";
+  const entries = await readdir(dir, { withFileTypes: true });
+
+  for (const entry of entries.sort((a, b) => a.name.localeCompare(b.name))) {
+    const fullPath = join(dir, entry.name);
+    if (entry.isFile() && entry.name === expectedName) return fullPath;
+    if (entry.isDirectory()) {
+      const nested = await findIiiBinary(fullPath).catch(() => null);
+      if (nested) return nested;
+    }
+  }
+
+  return null;
+}
+
+async function downloadIii() { // download, unpack, locate and smoke-test the iii binary
+  const url = releaseUrl();
+  log(`downloading iii v${III_VERSION} from ${url}`);
+  const response = await fetch(url);
+  if (!response.ok) {
+    throw new Error(`download failed ${response.status} ${response.statusText}`);
+  }
+  await mkdir(binDir, { recursive: true });
+  await mkdir(engineCwd, { recursive: true }); // run() spawns with cwd=engineCwd, so it must exist before the version check
+  await extractTarGz(Buffer.from(await response.arrayBuffer()), binDir);
+  await listTree("after-download", binDir);
+  iiiBin = (await findIiiBinary(binDir)) ?? iiiBin; // keep the default path when the search finds nothing
+  await chmod(iiiBin, 0o755); // extractTarGz writes files without archive modes; mark the binary executable
+  const version = await run(iiiBin, ["--version"]);
+  if (version.code !== 0) {
+    throw new Error(`iii --version failed code=${version.code}\n${version.stderr}`);
+  }
+  log(`iii version ${version.stdout.trim() || "(empty)"}`);
+  log(`binary path ${iiiBin}`);
+}
+
+async function writeConfig() { // render the engine config, write it to configPath and log it
+  const config = [
+    "workers:",
+    "  - name: iii-state",
+    "    config:",
+    "      adapter:",
+    "        name: kv",
+    "        config:",
+    "          store_method: file_based",
+    `          file_path: ${yamlSingleQuote(stateStorePath)}`, // quoted so the path stays one YAML scalar
+    "", // yields a trailing newline after join
+  ].join("\n");
+  await mkdir(dataDir, { recursive: true }); // parent directory of stateStorePath
+  await mkdir(engineCwd, { recursive: true });
+  await writeFile(configPath, config, "utf-8");
+  log(`config path ${configPath}`);
+  log(`engine cwd ${engineCwd}`);
+  log(`rendered config\n${config}`);
+}
+
+async function readTextIfExists(label, filePath) {
+  try {
+    const text = await readFile(filePath, "utf-8");
+    log(`${label} ${filePath}\n${text}`);
+  } catch (err) {
+    log(`${label} ${filePath} unavailable: ${err instanceof Error ? err.message : String(err)}`);
+  }
+}
+
+async function listTree(label, dir, maxEntries = 120) {
+  const rows = [];
+
+  async function walk(current, depth) {
+    if (rows.length >= maxEntries || depth > 6) return;
+    let entries = [];
+    try {
+      entries = await readdir(current, { withFileTypes: true });
+    } catch (err) {
+      rows.push(`${relative(root, current) || "."} <unavailable: ${err instanceof Error ? err.message : String(err)}>`);
+      return;
+    }
+
+    for (const entry of entries.sort((a, b) => a.name.localeCompare(b.name))) {
+      if (rows.length >= maxEntries) break;
+      const fullPath = join(current, entry.name);
+      const info = await stat(fullPath).catch(() => null);
+      const kind = entry.isDirectory() ? "dir" : entry.isFile() ? "file" : "other";
+      rows.push(
+        `${relative(root, fullPath)} ${kind} ${info ? `${info.size}b ${info.mtime.toISOString()}` : "stat-unavailable"}`,
+      );
+      if (entry.isDirectory()) await walk(fullPath, depth + 1);
+    }
+  }
+
+  await walk(dir, 0);
+  log(`${label}: data dir tree\n${rows.length > 0 ? rows.join("\n") : "(empty)"}`);
+}
+
+async function portOpen() {
+  return new Promise((resolve) => {
+    const socket = createConnection({ host: "127.0.0.1", port: ENGINE_PORT });
+    socket.once("connect", () => {
+      socket.destroy();
+      resolve(true);
+    });
+    socket.once("error", () => {
+      socket.destroy();
+      resolve(false);
+    });
+    socket.setTimeout(1000, () => {
+      socket.destroy();
+      resolve(false);
+    });
+  });
+}
+
+async function assertEnginePortFree() {
+  if (await portOpen()) {
+    throw new Error(`iii-engine port ${ENGINE_PORT} is already in use; refusing ambiguous engine-only probe`);
+  }
+}
+
+async function runIiiTrigger(stage, functionId, payload) {
+  const result = await run(iiiBin, [
+    "trigger",
+    "--function-id",
+    functionId,
+    "--payload",
+    JSON.stringify(payload),
+  ]);
+  log(
+    `${stage}: iii trigger ${functionId} exit status code=${result.code ?? "null"} signal=${result.signal ?? "null"}\nstdout=${result.stdout.trim() || "(empty)"}\nstderr=${result.stderr.trim() || "(empty)"}`,
+  );
+  return result;
+}
+
+async function waitForState(timeoutMs = 30_000) {
+  const deadline = Date.now() + timeoutMs;
+  let last = null;
+  while (Date.now() < deadline) {
+    const result = await runIiiTrigger("ready-check", "state::list_groups", {});
+    if (result.code === 0) return;
+    last = result.stderr || result.stdout;
+    await delay(1000);
+  }
+  throw new Error(`state::list_groups did not become ready within ${timeoutMs}ms: ${last ?? "(no output)"}`);
+}
+
+async function waitForScopeFile(scopeName, timeoutMs = 15_000) {
+  const filePath = encodedScopeFilePath(scopeName);
+  const startedAt = Date.now();
+  const deadline = startedAt + timeoutMs;
+  let lastError = null;
+
+  while (Date.now() < deadline) {
+    const info = await stat(filePath).catch((err) => {
+      lastError = err;
+      return null;
+    });
+    if (info?.isFile()) {
+      log(`scope file materialized after ${Date.now() - startedAt}ms ${filePath} ${info.size}b`);
+      return;
+    }
+    await delay(250);
+  }
+
+  throw new Error(
+    `scope file did not materialize within ${timeoutMs}ms at ${filePath}: ${
+      lastError instanceof Error ? lastError.message : "no stat error"
+    }`,
+  );
+}
+
+async function startEngine(label) {
+  log(`starting ${label}`);
+  engine = spawn(iiiBin, ["--config", configPath], {
+    cwd: engineCwd,
+    env: probeEnv,
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+  log(`engine pid ${engine.pid ?? "(unavailable)"}`);
+  engine.stdout.on("data", (chunk) => {
+    process.stdout.write(`[${label}:stdout] ${chunk.toString()}`);
+  });
+  engine.stderr.on("data", (chunk) => {
+    process.stderr.write(`[${label}:stderr] ${chunk.toString()}`);
+  });
+  engine.on("exit", (code, signal) => {
+    log(`${label} exit status code=${code ?? "null"} signal=${signal ?? "null"}`);
+  });
+  await waitForState();
+}
+
+async function waitForClose(child, label, timeoutMs = 5000) {
+  if (child.exitCode !== null || child.signalCode !== null) {
+    return { code: child.exitCode, signal: child.signalCode };
+  }
+
+  let timeout = null;
+  try {
+    return await Promise.race([
+      new Promise((resolve) => {
+        child.once("close", (code, signal) => resolve({ code, signal }));
+      }),
+      new Promise((_, reject) => {
+        timeout = setTimeout(() => {
+          reject(new Error(`${label} did not exit within ${timeoutMs}ms`));
+        }, timeoutMs);
+      }),
+    ]);
+  } finally {
+    if (timeout) clearTimeout(timeout);
+  }
+}
+
+async function stopEngine(label) {
+  if (!engine) return;
+  const active = engine;
+  engine = null;
+  log(`stop signal ${stopSignal} for ${label}`);
+  active.kill(stopSignal);
+  const status = await waitForClose(active, label).catch(async (err) => {
+    log(`exit status timeout for ${label}: ${err instanceof Error ? err.message : String(err)}`);
+    active.kill("SIGKILL");
+    return waitForClose(active, `${label} cleanup`, 3000);
+  });
+  log(`exit status ${label} code=${status.code ?? "null"} signal=${status.signal ?? "null"}`);
+}
+
+function outputHasSentinel(result) {
+  return result.stdout.toLowerCase().includes(sentinel.toLowerCase());
+}
+
+async function verifyDirectState(stage) {
+  const [getResult, listResult, groupsResult] = await Promise.all([
+    runIiiTrigger(stage, "state::get", { scope, key }),
+    runIiiTrigger(stage, "state::list", { scope }),
+    runIiiTrigger(stage, "state::list_groups", {}),
+  ]);
+  const hasGet = outputHasSentinel(getResult);
+  const hasList = outputHasSentinel(listResult);
+  const hasGroup = groupsResult.stdout.includes(scope);
+  if (!hasGet || !hasList || !hasGroup) {
+    throw new Error(
+      `${stage}: direct state verification failed get=${hasGet} list=${hasList} group=${hasGroup}\n` +
+        `get=${getResult.stdout.trim() || "(empty)"}\n` +
+        `list=${listResult.stdout.trim() || "(empty)"}\n` +
+        `groups=${groupsResult.stdout.trim() || "(empty)"}`,
+    );
+  }
+  log(`${stage}: direct state verified`);
+}
+
+try {
+  log(`workspace ${root}`);
+  await assertEnginePortFree(); // refuse to run when something already listens on the engine port
+  await downloadIii();
+  await writeConfig();
+  await readTextIfExists("config readback", configPath);
+
+  await startEngine("first-engine");
+  const setResult = await runIiiTrigger("after-set", "state::set", {
+    scope,
+    key,
+    value: { sentinel, writtenAt: new Date().toISOString() },
+  });
+  if (setResult.code !== 0) {
+    throw new Error(`state::set failed code=${setResult.code}\n${setResult.stderr}`);
+  }
+  await verifyDirectState("after-set"); // the value must be readable while the first engine is live
+  await listTree("after-set", dataDir);
+  await waitForScopeFile(scope); // do not stop the engine until the scope's .bin file exists
+  await listTree("after-scope-file-wait", dataDir);
+  await stopEngine("first-engine");
+  await listTree("after-stop", dataDir);
+
+  await assertEnginePortFree(); // the port must be free again before the second start
+  await startEngine("second-engine");
+  await verifyDirectState("after-restart"); // the persistence check: same sentinel after a restart
+  await listTree("after-restart", dataDir);
+  await stopEngine("second-engine");
+
+  log("PASS direct iii state survived engine restart");
+} catch (err) {
+  log(`ENGINE_PROBE_RESULT=failed ${err instanceof Error ? err.message : String(err)}`);
+  throw err; // rethrow after logging the failure marker
+} finally {
+  const active = engine;
+  engine = null;
+  if (active && active.exitCode === null && active.signalCode === null) { // an engine is still running
+    active.kill("SIGKILL");
+    await waitForClose(active, "cleanup", 3000).catch((err) => {
+      log(`cleanup warning: ${err instanceof Error ? err.message : String(err)}`);
+    });
+  }
+  if (process.env.ISSUE_349_KEEP_PROBE_DIR !== "1") { // set to "1" to keep the workspace
+    await rm(root, { recursive: true, force: true, maxRetries: 3, retryDelay: 500 }).catch((err) => {
+      log(`cleanup warning: ${err instanceof Error ? err.message : String(err)}`);
+    });
+  } else {
+    log(`kept workspace ${root}`);
+  }
+}
diff --git a/scripts/github/issue-349-restart-retest.mjs b/scripts/github/issue-349-restart-retest.mjs
new file mode 100644
index 000000000..6b2ce06c2
--- /dev/null
+++ b/scripts/github/issue-349-restart-retest.mjs
@@ -0,0 +1,341 @@
+#!/usr/bin/env node
+
+import { spawn } from "node:child_process";
+import { mkdir, mkdtemp, readFile, readdir, rm, stat } from "node:fs/promises";
+import { existsSync } from "node:fs";
+import { join, relative } from "node:path";
+import { tmpdir } from "node:os";
+
+const root = await mkdtemp(join(tmpdir(), "agentmemory-issue-349-"));
+const home = join(root, "home");
+const dataDir = join(root, "data");
+const cwd = join(root, "cwd");
+const baseUrl = "http://127.0.0.1:3111";
+const cli = join(process.cwd(), "dist", "cli.mjs");
+const iiiBin = join(
+  home,
+  ".local",
+  "bin",
+  "agentmemory",
+  process.platform === "win32" ? "iii.exe" : "iii",
+);
+const sentinel = `ISSUE_349_RESTART_RETEST_${Date.now()}_${Math.random()
+  .toString(36)
+  .slice(2)}`;
+
+if (!existsSync(cli)) {
+  throw new Error(`Built CLI missing at ${cli}; run pnpm run build first`);
+}
+
+const serverEnv = {
+  ...process.env,
+  HOME: home,
+  USERPROFILE: home,
+  AGENTMEMORY_DATA_DIR: dataDir,
+  AGENTMEMORY_READY_TIMEOUT_MS: "120000",
+  AGENTMEMORY_AUTO_COMPRESS: "false",
+  GRAPH_EXTRACTION_ENABLED: "false",
+  CONSOLIDATION_ENABLED: "false",
+  AGENTMEMORY_SLOTS: "false",
+  AGENTMEMORY_REFLECT: "false",
+  CI: "true",
+};
+
+let server = null;
+
+function log(message) {
+  console.log(`[issue-349-retest] ${message}`);
+}
+
+function spawnCli(args, options = {}) {
+  return spawn(process.execPath, [cli, ...args], {
+    cwd,
+    env: serverEnv,
+    stdio: options.stdio ?? ["ignore", "pipe", "pipe"],
+  });
+}
+
+// Mirrors a child's stdout/stderr to this process (tagged with label) and returns a getter for the combined text.
+function collectOutput(child, label) {
+  const chunks = [];
+  const mirror = (stream, sink, channel) => {
+    stream?.on("data", (chunk) => {
+      const text = chunk.toString();
+      chunks.push(text);
+      sink.write(`[${label}:${channel}] ${text}`);
+    });
+  };
+  mirror(child.stdout, process.stdout, "stdout");
+  mirror(child.stderr, process.stderr, "stderr");
+  return () => chunks.join("");
+}
+
+async function delay(ms) {
+  await new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+// Logs the file's UTF-8 contents under label, or a note that it could not be read; a read failure never throws.
+async function readTextIfExists(label, filePath) {
+  const outcome = await readFile(filePath, "utf-8").then(
+    (text) => `\n${text}`,
+    (err) => ` unavailable: ${err instanceof Error ? err.message : String(err)}`,
+  );
+  log(`${label} ${filePath}${outcome}`);
+}
+
+// Logs a name-sorted listing of dir (paths relative to root, with size and mtime), capped at maxEntries rows.
+async function listTree(label, dir, maxEntries = 80) {
+  const rows = [];
+  // Depth-first walk; directories more than four levels below dir are not opened.
+  async function walk(current, depth) {
+    if (rows.length >= maxEntries || depth > 4) return;
+    let entries;
+    try {
+      entries = await readdir(current, { withFileTypes: true });
+    } catch (err) {
+      rows.push(`${relative(root, current) || "."} <unavailable: ${err instanceof Error ? err.message : String(err)}>`);
+      return;
+    }
+    entries.sort((a, b) => a.name.localeCompare(b.name));
+    for (const entry of entries) {
+      if (rows.length >= maxEntries) break;
+      const fullPath = join(current, entry.name);
+      const info = await stat(fullPath).catch(() => null);
+      const kind = entry.isDirectory() ? "dir" : entry.isFile() ? "file" : "other";
+      const details = info ? `${info.size}b ${info.mtime.toISOString()}` : "stat-unavailable";
+      rows.push(`${relative(root, fullPath)} ${kind} ${details}`);
+      if (entry.isDirectory()) await walk(fullPath, depth + 1);
+    }
+  }
+
+  await walk(dir, 0);
+  log(`${label} tree for ${dir}\n${rows.length > 0 ? rows.join("\n") : "(empty)"}`);
+}
+
+// Diagnostic only: runs `iii trigger` for functionId with a JSON payload and logs how it ended plus its output.
+// A missing binary is logged and skipped; a binary that cannot be started is logged instead of crashing the harness.
+async function runIiiTrigger(stage, functionId, payload) {
+  if (!existsSync(iiiBin)) {
+    log(`${stage}: iii trigger skipped; binary missing at ${iiiBin}`);
+    return;
+  }
+
+  const args = ["trigger", "--function-id", functionId, "--payload", JSON.stringify(payload)];
+  const child = spawn(iiiBin, args, {
+    cwd,
+    env: serverEnv,
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+  let stdout = "";
+  let stderr = "";
+  child.stdout.on("data", (chunk) => {
+    stdout += chunk.toString();
+  });
+  child.stderr.on("data", (chunk) => {
+    stderr += chunk.toString();
+  });
+  // A failed spawn (e.g. EACCES) emits "error"; with no listener Node rethrows it as an uncaught exception.
+  const outcome = await new Promise((resolve) => {
+    child.on("error", (err) => resolve(`failed to start: ${err.message}`));
+    child.on("close", (code) => resolve(`exited ${code}`));
+  });
+  log(
+    `${stage}: iii trigger ${functionId} ${outcome}\nstdout=${stdout.trim() || "(empty)"}\nstderr=${stderr.trim() || "(empty)"}`,
+  );
+}
+
+// Dumps both config files, the data-dir tree, and raw engine state for the given stage (diagnostics only).
+async function inspectPersistence(stage) {
+  const userConfig = join(home, ".agentmemory", "config", "iii-config.yaml");
+  await readTextIfExists(`${stage}: user config`, userConfig);
+  await readTextIfExists(`${stage}: data runtime config`, join(dataDir, "iii-config.yaml"));
+  await listTree(`${stage}: data dir`, dataDir);
+  const shutdownKey = { scope: "mem:state", key: "system:shutdownFlush" };
+  await runIiiTrigger(stage, "state::list_groups", {});
+  await runIiiTrigger(stage, "state::list", { scope: "mem:memories" });
+  await runIiiTrigger(stage, "state::get", shutdownKey);
+}
+
+// Resolves once child has closed (immediately if it already exited); rejects if that takes longer than timeoutMs.
+async function waitForChildClose(child, label, timeoutMs = 5000) {
+  const alreadyGone = child.exitCode !== null || child.signalCode !== null;
+  if (alreadyGone) return;
+  let timer = null;
+  const closed = new Promise((resolve) => child.once("close", resolve));
+  const expired = new Promise((_, reject) => {
+    timer = setTimeout(() => reject(new Error(`${label} did not exit within ${timeoutMs}ms`)), timeoutMs);
+  });
+  try {
+    await Promise.race([closed, expired]);
+  } finally {
+    if (timer) clearTimeout(timer);
+  }
+}
+
+// Polls the health endpoint once per second until it reports service "agentmemory"; throws after timeoutMs.
+// Each probe is capped at 5s so a connection that is accepted but never answered cannot block the loop forever.
+async function waitForHealth(timeoutMs = 120_000) {
+  const deadline = Date.now() + timeoutMs;
+  let lastError = null;
+  while (Date.now() < deadline) {
+    try {
+      // nosemgrep: typescript.react.security.react-insecure-request.react-insecure-request -- CI harness talks only to the loopback-only local daemon it starts in this job.
+      const res = await fetch(`${baseUrl}/agentmemory/health`, {
+        signal: AbortSignal.timeout(5000),
+      });
+      const body = res.ok ? await res.json() : null;
+      if (body?.service === "agentmemory") return;
+      lastError = new Error(res.ok ? "health body did not identify agentmemory" : `health returned HTTP ${res.status}`);
+    } catch (err) {
+      lastError = err;
+    }
+    await delay(1000);
+  }
+  const detail = lastError instanceof Error ? `: ${lastError.message}` : "";
+  throw new Error(`agentmemory health did not become ready within ${timeoutMs}ms${detail}`);
+}
+
+// Guards against writing retest data into a daemon this script did not start.
+// Probes the health endpoint with a 1s timeout and throws only when it answers with an ok status.
+// A refused, failed, or timed-out probe is treated as "nothing is listening" and is ignored,
+// as is a response with a non-ok status.
+async function assertNoExistingServer() {
+  // nosemgrep: typescript.react.security.react-insecure-request.react-insecure-request -- Presence check is intentionally limited to the local loopback REST port before test data is written.
+  const probe = fetch(`${baseUrl}/agentmemory/health`, {
+    signal: AbortSignal.timeout(1000),
+  });
+  // Decide on the response itself rather than by matching text inside a caught error's message.
+  const res = await probe.catch(() => null);
+  if (res?.ok) {
+    throw new Error(
+      `agentmemory is already responding at ${baseUrl}; refusing to write retest data into an existing daemon`,
+    );
+  }
+}
+
+// Sends a request to the local daemon with a JSON Content-Type (headers supplied by the caller win on conflict).
+// Returns the parsed JSON body, the raw text when the body is not JSON, or null when the body is empty.
+// Throws an Error carrying method, path, status, and body text when the response is not ok.
+async function request(path, init = {}) {
+  const headers = {
+    "Content-Type": "application/json",
+    ...(init.headers ?? {}),
+  };
+  const res = await fetch(`${baseUrl}${path}`, { ...init, headers });
+  // The body is read before the status check so a failure can quote it.
+  const text = await res.text();
+  if (!res.ok) {
+    throw new Error(`${init.method ?? "GET"} ${path} failed ${res.status}: ${text}`);
+  }
+  if (text.length === 0) return null;
+  try {
+    return JSON.parse(text);
+  } catch {
+    // Not JSON: hand the raw text back unchanged.
+    return text;
+  }
+}
+
+// Launches the server CLI, mirrors its output, waits for health, and returns a getter for the captured output.
+async function startServer(label) {
+  log(`starting ${label}`);
+  const child = spawnCli(["--data-dir", dataDir]);
+  server = child;
+  const readOutput = collectOutput(child, label);
+  child.on("exit", (code, signal) => {
+    if (server) log(`${label} exited unexpectedly code=${code ?? "null"} signal=${signal ?? "null"}`);
+  });
+  await waitForHealth();
+  log(`${label} is healthy`);
+  return readOutput;
+}
+
+async function stopViaCli(label) { // runs the CLI `stop` command, then waits for the tracked server child to close
+  log(`stopping ${label} via CLI`);
+  const activeServer = server;
+  server = null; // cleared up front so the exit listener from startServer does not log this requested exit as unexpected
+  try {
+    const stop = spawnCli(["stop", "--data-dir", dataDir]);
+    const getOutput = collectOutput(stop, `${label}-stop`);
+    const code = await new Promise((resolve) => stop.on("close", resolve));
+    if (code !== 0) throw new Error(`${label} stop exited ${code}; output:\n${getOutput()}`);
+    if (activeServer) await waitForChildClose(activeServer, label);
+  } finally {
+    if (activeServer?.exitCode === null && activeServer.signalCode === null) server = activeServer; // stop failed: hand the live child back for final cleanup
+  }
+  log(`${label} stopped`);
+}
+
+function resultText(result) {
+  return JSON.stringify(result).toLowerCase();
+}
+
+// Requires the sentinel to appear (case-insensitively) in both the search results and the memories listing.
+async function verifySentinel(stage) {
+  const needle = sentinel.toLowerCase();
+  const searchBody = JSON.stringify({ query: sentinel, limit: 5 });
+  const [search, memories] = await Promise.all([
+    request("/agentmemory/search", { method: "POST", body: searchBody }),
+    request("/agentmemory/memories"),
+  ]);
+  const searchHasSentinel = resultText(search).includes(needle);
+  const memoriesHasSentinel = resultText(memories).includes(needle);
+  if (searchHasSentinel && memoriesHasSentinel) {
+    log(`${stage}: sentinel verified`);
+    return;
+  }
+  throw new Error(
+    `${stage}: sentinel verification failed search=${searchHasSentinel} memories=${memoriesHasSentinel}\n` +
+      `search=${JSON.stringify(search)}\nmemories=${JSON.stringify(memories)}`,
+  );
+}
+
+// End-to-end retest: start, remember a sentinel, flush, stop via the CLI, restart, and require the sentinel back.
+// The finally block always terminates a leftover server and deletes the temporary workspace.
+try {
+  log(`workspace ${root}`);
+  await mkdir(home, { recursive: true });
+  await mkdir(dataDir, { recursive: true });
+  await mkdir(cwd, { recursive: true });
+  await assertNoExistingServer();
+  await startServer("first-start");
+
+  const remember = await request("/agentmemory/remember", {
+    method: "POST",
+    body: JSON.stringify({
+      content: `Issue 349 restart retest sentinel ${sentinel}`,
+      type: "fact",
+      concepts: ["issue-349", "restart-retest"],
+      project: "github-actions-issue-349",
+    }),
+  });
+  if (remember?.success !== true) throw new Error(`remember did not report success: ${JSON.stringify(remember)}`);
+  log(`saved sentinel memory ${remember.memory?.id ?? "(id unavailable)"}`);
+
+  await verifySentinel("before-stop");
+  await request("/agentmemory/shutdown/flush", { method: "POST", body: "{}" });
+  await inspectPersistence("after-flush");
+  await stopViaCli("first-start");
+  await listTree("after-stop: data dir", dataDir);
+
+  await startServer("second-start");
+  await inspectPersistence("after-second-start");
+  await verifySentinel("after-restart");
+  await stopViaCli("second-start");
+
+  log("PASS sentinel survived supported stop/restart");
+} finally {
+  const activeServer = server;
+  server = null;
+  if (activeServer && activeServer.exitCode === null && activeServer.signalCode === null) {
+    activeServer.kill("SIGTERM");
+    await waitForChildClose(activeServer, "cleanup").catch((err) => {
+      log(`cleanup warning: ${err instanceof Error ? err.message : String(err)}`);
+      // SIGTERM was not honoured within the grace period; force-kill so port 3111 is not left occupied.
+      activeServer.kill("SIGKILL");
+    });
+  }
+  await rm(root, { recursive: true, force: true, maxRetries: 3, retryDelay: 500 }).catch((err) => {
+    log(`cleanup warning: ${err instanceof Error ? err.message : String(err)}`);
+  });
+}
diff --git a/src/cli.ts b/src/cli.ts
index e9c9c06f3..3480fffc6 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -252,6 +252,7 @@ applyRuntimeHostArgs(args);
 
 const READY_TIMEOUT_MS = readyTimeoutMsFromEnv();
 const readyTimeoutLabel = () => formatReadyTimeout(READY_TIMEOUT_MS);
+const STATE_PERSISTENCE_SETTLE_MS = 1000;
 
 const skipEngine = args.includes("--no-engine");
 
@@ -263,6 +264,10 @@ function getBaseUrl(): string {
   return getCliBaseUrl();
 }
 
+function waitForStatePersistenceSettle(): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, STATE_PERSISTENCE_SETTLE_MS));
+}
+
 let discoveredViewerPort: number | null = null;
 
 export async function discoverViewerPort(): Promise<void> {
@@ -2882,6 +2887,7 @@ async function runStop(): Promise<void> {
         timeoutMs: 5000,
         warn: (message) => p.log.warn(message),
       }),
+      settleStatePersistence: () => waitForStatePersistenceSettle(),
       signal: async (pid, signal, timeoutMs, role) => {
         const s = p.spinner();
         if (role === "worker") {
diff --git a/src/cli/stop-processes.ts b/src/cli/stop-processes.ts
index 497ec2b3a..33480c8f0 100644
--- a/src/cli/stop-processes.ts
+++ b/src/cli/stop-processes.ts
@@ -24,6 +24,7 @@ export type ResponsiveNativeStopEffects = {
   isWindows: boolean;
   force: boolean;
   flush: () => Promise<boolean>;
+  settleStatePersistence?: () => Promise<void>;
   signal: (
     pid: number,
     signal: NodeJS.Signals,
@@ -142,8 +143,12 @@ export async function executeResponsiveNativeStop(
     const ok = await effects.signal(pid, "SIGTERM", 5000, "worker");
     if (!ok) allStopped = false;
   }
+  if (checkpointConfirmed && dedupedEnginePids.length > 0) {
+    await effects.settleStatePersistence?.();
+  }
+  const engineSignal = checkpointConfirmed ? "SIGINT" : "SIGTERM";
   for (const pid of dedupedEnginePids) {
-    const ok = await effects.signal(pid, "SIGTERM", 3000, "engine");
+    const ok = await effects.signal(pid, engineSignal, 3000, "engine");
     if (!ok) allStopped = false;
   }
 
diff --git a/src/functions/shutdown-flush.ts b/src/functions/shutdown-flush.ts
index b44932bc1..80c82b333 100644
--- a/src/functions/shutdown-flush.ts
+++ b/src/functions/shutdown-flush.ts
@@ -1,18 +1,44 @@
 import type { ISdk } from "iii-sdk";
 import type { IndexPersistence } from "../state/index-persistence.js";
+import type { StateKV } from "../state/kv.js";
+import { KV } from "../state/schema.js";
+import type { StateScope, StateScopeKey } from "../types.js";
 
 type ShutdownFlushIndexPersistence = Pick<IndexPersistence, "saveOrThrow">;
+type ShutdownFlushStatePersistence = Pick<StateKV, "get" | "set">;
+
+const SHUTDOWN_FLUSH_STATE_KEY: StateScopeKey = "system:shutdownFlush";
+
+/** Type guard: true when `value` is a non-array object whose `flushedAt` equals that of `checkpoint`. */
+function isConfirmedCheckpoint(
+  value: unknown,
+  checkpoint: StateScope[typeof SHUTDOWN_FLUSH_STATE_KEY],
+): value is StateScope[typeof SHUTDOWN_FLUSH_STATE_KEY] {
+  if (typeof value !== "object" || value === null || Array.isArray(value)) {
+    return false;
+  }
+  const { flushedAt } = value as { flushedAt?: unknown };
+  return flushedAt === checkpoint.flushedAt;
+}
 
 export function registerShutdownFlushFunction(
   sdk: ISdk,
   indexPersistence: ShutdownFlushIndexPersistence,
   readiness: { isReady: () => boolean },
+  statePersistence: ShutdownFlushStatePersistence,
 ): void {
   sdk.registerFunction("mem::shutdown-flush", async () => {
     if (!readiness.isReady()) {
       return { success: false, error: "index_not_ready" };
     }
+    const flushedAt = new Date().toISOString();
     await indexPersistence.saveOrThrow();
-    return { success: true, flushedAt: new Date().toISOString() };
+    const checkpoint: StateScope[typeof SHUTDOWN_FLUSH_STATE_KEY] = { flushedAt };
+    await statePersistence.set(KV.state, SHUTDOWN_FLUSH_STATE_KEY, checkpoint);
+    const confirmed = await statePersistence.get(KV.state, SHUTDOWN_FLUSH_STATE_KEY);
+    if (!isConfirmedCheckpoint(confirmed, checkpoint)) {
+      throw new Error("shutdown state checkpoint was not confirmed");
+    }
+    return { success: true, flushedAt };
   });
 }
diff --git a/src/index.ts b/src/index.ts
index c69e48a57..875be13d9 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -368,9 +368,12 @@ async function main() {
   let shutdownFlushState: "loading" | "rebuilding" | "ready" | "unavailable" = "loading";
 
   const registerAllFunctions = () => {
-    registerShutdownFlushFunction(sdk, indexPersistence, {
-      isReady: () => shutdownFlushState === "ready",
-    });
+    registerShutdownFlushFunction(
+      sdk,
+      indexPersistence,
+      { isReady: () => shutdownFlushState === "ready" },
+      kv,
+    );
     registerSessionBudgetFunctions(sdk, kv);
     registerPrivacyFunction(sdk);
     registerObserveFunction(sdk, kv, dedupMap, config.maxObservationsPerSession);
diff --git a/src/types.ts b/src/types.ts
index b76f3fc56..08570bf50 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1162,6 +1162,9 @@ export interface DecayConfig {
  */
 export interface StateScope {
   "system:currentDiskSize": number;
+  "system:shutdownFlush": {
+    flushedAt: string;
+  };
 }
 
 export type StateScopeKey = keyof StateScope;
diff --git a/test/cli-stop-port-detection.test.ts b/test/cli-stop-port-detection.test.ts
index 01230f40e..1e5eea627 100644
--- a/test/cli-stop-port-detection.test.ts
+++ b/test/cli-stop-port-detection.test.ts
@@ -195,6 +195,9 @@ describe("Windows stop port detection (#550)", () => {
           events.push(`${role}:${pid}:${signal}:${timeoutMs}`);
           return true;
         },
+        settleStatePersistence: async () => {
+          events.push("settle");
+        },
         clear: () => events.push("clear"),
         warn: (message) => events.push(`warn:${message}`),
         error: (message) => events.push(`error:${message}`),
@@ -206,12 +209,53 @@ describe("Windows stop port detection (#550)", () => {
     expect(events).toEqual([
       "flush",
       "worker:40000:SIGTERM:5000",
-      "engine:39672:SIGTERM:3000",
+      "settle",
+      "engine:39672:SIGINT:3000",
       "clear",
       "outro:Stopped. Memories persisted to disk; restart anytime with: npx @agentmemory/agentmemory",
     ]);
   });
 
+  it("skips the state settle phase when checkpointing was not confirmed", async () => {
+    const events: string[] = [];
+
+    const result = await executeResponsiveNativeStop(
+      {
+        workerPids: [40000],
+        enginePids: [39672],
+      },
+      {
+        isWindows: false,
+        force: false,
+        flush: async () => {
+          events.push("flush");
+          return false;
+        },
+        signal: async (pid, signal, timeoutMs, role) => {
+          events.push(`${role}:${pid}:${signal}:${timeoutMs}`);
+          return true;
+        },
+        settleStatePersistence: async () => {
+          events.push("settle");
+        },
+        clear: () => events.push("clear"),
+        warn: (message) => events.push(`warn:${message}`),
+        error: (message) => events.push(`error:${message}`),
+        outro: (message) => events.push(`outro:${message}`),
+      },
+    );
+
+    expect(result).toEqual({ action: "stopped", allStopped: true, exitCode: 0 });
+    expect(events).toEqual([
+      "flush",
+      "warn:pre-stop flush failed; relying on the worker shutdown signal path",
+      "worker:40000:SIGTERM:5000",
+      "engine:39672:SIGTERM:3000",
+      "clear",
+      "outro:Stopped. Persistence was not confirmed before termination; restart anytime with: npx @agentmemory/agentmemory",
+    ]);
+  });
+
   it("checkpoints responsive engine-only stop when the worker pidfile is missing", async () => {
     const events: string[] = [];
 
@@ -241,7 +285,7 @@ describe("Windows stop port detection (#550)", () => {
     expect(result).toEqual({ action: "stopped", allStopped: true, exitCode: 0 });
     expect(events).toEqual([
       "flush",
-      "engine:39672:SIGTERM:3000",
+      "engine:39672:SIGINT:3000",
       "clear",
       "outro:Stopped. Memories persisted to disk; restart anytime with: npx @agentmemory/agentmemory",
     ]);
diff --git a/test/issue-349-engine-state-probe.test.ts b/test/issue-349-engine-state-probe.test.ts
new file mode 100644
index 000000000..edfe6c2ad
--- /dev/null
+++ b/test/issue-349-engine-state-probe.test.ts
@@ -0,0 +1,89 @@
+import { existsSync, readFileSync } from "node:fs";
+import { describe, expect, it } from "vitest";
+
+const SCRIPT = "scripts/github/issue-349-engine-state-probe.mjs";
+
+function readScript(): string {
+  return readFileSync(SCRIPT, "utf-8");
+}
+
+describe("Issue #349 engine state probe", () => {
+  it("exists as a dedicated engine-only diagnostic harness", () => {
+    expect(existsSync(SCRIPT)).toBe(true);
+  });
+
+  it("uses the checkout workspace for executable probe files", () => {
+    const source = readScript();
+
+    expect(source).toContain("ISSUE_349_PROBE_PARENT");
+    expect(source).toContain("process.cwd()");
+    expect(source).toContain(".agentmemory-issue-349-engine-");
+  });
+
+  it("uses direct iii state operations without agentmemory app paths", () => {
+    const source = readScript();
+
+    expect(source).toContain("state::set");
+    expect(source).toContain("state::get");
+    expect(source).toContain("state::list");
+    expect(source).toContain("state::list_groups");
+    expect(source).toContain("ISSUE_349_ENGINE_STATE_PROBE");
+
+    expect(source).not.toContain("/agentmemory/");
+    expect(source).not.toContain("shutdown/flush");
+    expect(source).not.toContain("dist/index.mjs");
+    expect(source).not.toContain("agentmemory stop");
+  });
+
+  it("records enough process and storage identity to classify the boundary", () => {
+    const source = readScript();
+
+    expect(source).toContain("iii version");
+    expect(source).toContain("binary path");
+    expect(source).toContain("config path");
+    expect(source).toContain("engine cwd");
+    expect(source).toContain("engine pid");
+    expect(source).toContain("stop signal");
+    expect(source).toContain("exit status");
+    expect(source).toContain("after-download");
+    expect(source).toContain("data dir tree");
+  });
+
+  it("extracts iii release archives without an external tar binary", () => {
+    const source = readScript();
+
+    expect(source).toContain("gunzipSync");
+    expect(source).toContain("extractTarGz");
+    expect(source).toContain("findIiiBinary");
+    expect(source).not.toContain("TAR_BIN");
+    expect(source).not.toContain('await run("tar"');
+  });
+
+  it("creates the engine cwd before spawning the extracted binary", () => {
+    const source = readScript();
+    const mkdirEngineCwd = source.indexOf("await mkdir(engineCwd, { recursive: true });");
+    const versionCheck = source.indexOf('const version = await run(iiiBin, ["--version"]);');
+
+    expect(mkdirEngineCwd).toBeGreaterThanOrEqual(0);
+    expect(versionCheck).toBeGreaterThanOrEqual(0);
+    expect(mkdirEngineCwd).toBeLessThan(versionCheck);
+  });
+
+  it("waits for the expected scope file before stopping the first engine", () => {
+    const source = readScript();
+
+    expect(source).toContain("encodedScopeFilePath");
+    expect(source).toContain("waitForScopeFile");
+    expect(source).toContain("scope file materialized");
+
+    const verifyBeforeStop = source.indexOf('await verifyDirectState("after-set");');
+    const waitBeforeStop = source.indexOf("await waitForScopeFile(scope);");
+    const stopFirstEngine = source.indexOf('await stopEngine("first-engine");');
+
+    expect(verifyBeforeStop).toBeGreaterThanOrEqual(0);
+    expect(waitBeforeStop).toBeGreaterThanOrEqual(0);
+    expect(stopFirstEngine).toBeGreaterThanOrEqual(0);
+    expect(verifyBeforeStop).toBeLessThan(waitBeforeStop);
+    expect(waitBeforeStop).toBeLessThan(stopFirstEngine);
+  });
+});
diff --git a/test/quality-gates.test.ts b/test/quality-gates.test.ts
index 1de6e8cc4..b4604ee23 100644
--- a/test/quality-gates.test.ts
+++ b/test/quality-gates.test.ts
@@ -267,6 +267,17 @@ describe("root quality gates", () => {
     expect(ci).toContain("matrix.os == 'ubuntu-latest' && matrix.node-version == 22");
   });
 
+  it("wires the issue 349 engine-state diagnostic probe into CI", () => {
+    const ci = readText(".github/workflows/ci.yml");
+
+    expect(ci).toContain("engine-state-probe:");
+    expect(ci).toContain('name: "Issue #349 restart retest (${{ matrix.os }})"');
+    expect(ci).toContain('name: "Issue #349 engine state probe (${{ matrix.os }})"');
+    expect(ci).toContain("os: [ubuntu-latest, macos-latest]");
+    expect(ci).toContain("node-version: 22");
+    expect(ci).toContain("run: node scripts/github/issue-349-engine-state-probe.mjs");
+  });
+
   it("builds publish artifacts from the committed pnpm lockfile before npm publish", () => {
     const publish = readText(".github/workflows/publish.yml");
 
diff --git a/test/reconnect-registration.test.ts b/test/reconnect-registration.test.ts
index 784a89210..56009e6b2 100644
--- a/test/reconnect-registration.test.ts
+++ b/test/reconnect-registration.test.ts
@@ -61,7 +61,8 @@ describe("registerWithReconnectReplay", () => {
     expect(replayCall).toBeGreaterThan(closureStart);
 
     const replayedSource = source.slice(closureStart, replayCall);
-    expect(replayedSource).toContain("registerShutdownFlushFunction(sdk, indexPersistence");
+    expect(replayedSource).toContain("registerShutdownFlushFunction(");
+    expect(replayedSource).toContain("indexPersistence");
     expect(replayedSource).toContain("registerApiTriggers(sdk, kv, secret, metricsStore, provider)");
     expect(replayedSource).toContain("registerEventTriggers(sdk, kv)");
     expect(replayedSource).toContain("registerMcpEndpoints(sdk, kv, secret)");
diff --git a/test/shutdown-flush.test.ts b/test/shutdown-flush.test.ts
index f911e9c76..727a12dea 100644
--- a/test/shutdown-flush.test.ts
+++ b/test/shutdown-flush.test.ts
@@ -12,15 +12,32 @@ describe("registerShutdownFlushFunction", () => {
       ),
     };
     const indexPersistence = { saveOrThrow: vi.fn(async () => {}) };
+    let checkpoint: unknown = null;
+    const statePersistence = {
+      set: vi.fn(async (_scope: string, _key: string, value: unknown) => {
+        checkpoint = value;
+        return value;
+      }),
+      get: vi.fn(async () => checkpoint),
+    };
 
-    registerShutdownFlushFunction(sdk as never, indexPersistence, {
-      isReady: () => true,
-    });
+    registerShutdownFlushFunction(
+      sdk as never,
+      indexPersistence,
+      { isReady: () => true },
+      statePersistence as never,
+    );
 
     const handler = handlers.get("mem::shutdown-flush");
     expect(handler).toBeDefined();
     await expect(handler!({})).resolves.toMatchObject({ success: true });
     expect(indexPersistence.saveOrThrow).toHaveBeenCalledTimes(1);
+    expect(statePersistence.set).toHaveBeenCalledWith(
+      "mem:state",
+      "system:shutdownFlush",
+      expect.objectContaining({ flushedAt: expect.any(String) }),
+    );
+    expect(statePersistence.get).toHaveBeenCalledWith("mem:state", "system:shutdownFlush");
   });
 
   it("propagates checkpoint failures so callers can fail closed", async () => {
@@ -37,16 +54,50 @@ describe("registerShutdownFlushFunction", () => {
         throw new Error("state::set failed");
       }),
     };
+    const statePersistence = {
+      set: vi.fn(async (_scope: string, _key: string, value: unknown) => value),
+      get: vi.fn(async () => null),
+    };
 
-    registerShutdownFlushFunction(sdk as never, indexPersistence, {
-      isReady: () => true,
-    });
+    registerShutdownFlushFunction(
+      sdk as never,
+      indexPersistence,
+      { isReady: () => true },
+      statePersistence as never,
+    );
 
     await expect(handlers.get("mem::shutdown-flush")!({})).rejects.toThrow(
       "state::set failed",
     );
   });
 
+  it("propagates state checkpoint failures so stop does not claim persistence", async () => {
+    const handlers = new Map<string, (payload: unknown) => Promise<unknown>>();
+    const sdk = {
+      registerFunction: vi.fn(
+        (id: string, handler: (payload: unknown) => Promise<unknown>) => {
+          handlers.set(id, handler);
+        },
+      ),
+    };
+    const indexPersistence = { saveOrThrow: vi.fn(async () => {}) };
+    const statePersistence = {
+      set: vi.fn(async (_scope: string, _key: string, value: unknown) => value),
+      get: vi.fn(async () => null),
+    };
+
+    registerShutdownFlushFunction(
+      sdk as never,
+      indexPersistence,
+      { isReady: () => true },
+      statePersistence as never,
+    );
+
+    await expect(handlers.get("mem::shutdown-flush")!({})).rejects.toThrow(
+      "shutdown state checkpoint was not confirmed",
+    );
+  });
+
   it("does not save before restored indexes are ready", async () => {
     const handlers = new Map<string, (payload: unknown) => Promise<unknown>>();
     const sdk = {
@@ -57,10 +108,17 @@ describe("registerShutdownFlushFunction", () => {
       ),
     };
     const indexPersistence = { saveOrThrow: vi.fn(async () => {}) };
+    const statePersistence = {
+      set: vi.fn(async (_scope: string, _key: string, value: unknown) => value),
+      get: vi.fn(async () => null),
+    };
 
-    registerShutdownFlushFunction(sdk as never, indexPersistence, {
-      isReady: () => false,
-    });
+    registerShutdownFlushFunction(
+      sdk as never,
+      indexPersistence,
+      { isReady: () => false },
+      statePersistence as never,
+    );
 
     await expect(handlers.get("mem::shutdown-flush")!({})).resolves.toEqual({
       success: false,