finedesignz · finedesignz · May 27, 2026 · May 27, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -69,6 +69,14 @@ cd supervisor/tauri && cargo tauri build
 
 The legacy `npx remo-code-agent` / `claude-remote` shell-alias flow is retired as of 2026-05-26. Install the Tauri Supervisor MSI from https://github.com/finedesignz/remo-code/releases/latest instead.
 
+**Phase 09 follow-up (2026-05-27): legacy spawn path is hard-disabled in supervisor.** The supervisor's `process-manager.ts:spawn()` no longer invokes the retired CLI agent at all — it immediately finalizes every `session.start` as `stopped` with `exit_reason='legacy_agent_spawn_disabled'`. The in-process claude-runner that will replace it (direct `claude --input-format stream-json` spawn bridged over the supervisor WS) is a separate follow-up phase. Until that lands, `session.start` rolls cleanly to stopped instead of trapping the supervisor in a respawn loop against a cached buggy v0.4.1 agent.
+
+**Guards added in the same fix:**
+
+- Supervisor restart attempts (paced by `BACKOFF_SCHEDULE`) are now capped at `MAX_RESTART_COUNT = 10` — after 10 consecutive restart attempts the run finalizes as `max_restarts_exceeded` and stops respawning.
+- Hub auto-resume on `supervisor.hello` (in `hub/src/ws/agent.ts`) now (a) filters orphan `session_runs` to rows newer than 24h, sweeping older rows as `exit_reason='stale'`, and (b) finalizes any orphan with `restart_count >= 10` as `max_restarts_exceeded` and skips the replay.
+- Canary test `supervisor/test/no-legacy-agent-spawn.test.ts` greps `supervisor/src/**` for the retired `remo-code-agent` package name and `--append-system-prompt` flag; the build FAILS if either reappears.
+
 ## Local Supervisor (only supported connection)
 
 The supervisor (`supervisor/src/index.ts`, compiled into the Tauri sidecar binary) runs on the dev machine as a tray app. It:

diff --git a/hub/src/ws/agent.ts b/hub/src/ws/agent.ts
@@ -512,17 +512,53 @@ async function handleSupervisorMessage(ws: ServerWebSocket<AgentWsData>, msg: an
     // These were orphaned by a reboot/restart. We end the old run row and send a
     // fresh session.start to the now-online supervisor. The new run reuses the
     // same project_dir, so the UI session row is reused and history persists.
+    //
+    // Guards added 2026-05-27 after the autonomous-loop RCA:
+    //   1. AGE CAP — only resume runs that started in the last 24h. Older
+    //      open rows are stale carryovers from a long-gone session; replaying
+    //      them produces zombie sessions the user has forgotten about.
+    //   2. RESTART CAP — when restart_count >= 10, finalize the run as
+    //      `max_restarts_exceeded` and skip the replay. Prevents the hub from
+    //      feeding the same broken spawn back into the supervisor over and
+    //      over after a reconnect.
     try {
       const { sql } = await import('../db/postgres')
+      const MAX_RESTART_COUNT = 10
       const orphans = await sql`
-        SELECT id, repo_path, branch, initial_prompt
+        SELECT id, repo_path, branch, initial_prompt, restart_count, started_at
         FROM session_runs
-        WHERE supervisor_id = ${row.id} AND ended_at IS NULL
+        WHERE supervisor_id = ${row.id}
+          AND ended_at IS NULL
+          AND started_at > now() - interval '24 hours'
         ORDER BY started_at ASC
       `
+      // Sweep any open rows that fell outside the 24h window — finalize them
+      // as `stale` so they don't reappear on the next reconnect.
+      const staleSweep = await sql`
+        UPDATE session_runs
+        SET ended_at = now(), exit_reason = 'stale'
+        WHERE supervisor_id = ${row.id}
+          AND ended_at IS NULL
+          AND started_at <= now() - interval '24 hours'
+        RETURNING id
+      `
+      if (staleSweep.length > 0) {
+        console.log(`[supervisor] auto-resume finalized ${staleSweep.length} stale run(s) older than 24h`)
+      }
       if (orphans.length > 0) {
         console.log(`[supervisor] auto-resuming ${orphans.length} orphan session(s)`)
         for (const o of orphans) {
+          // Restart-count cap — if this run has already been restarted too
+          // many times, finalize it instead of replaying. Stops runaway loops.
+          if (typeof o.restart_count === 'number' && o.restart_count >= MAX_RESTART_COUNT) {
+            await sql`
+              UPDATE session_runs
+              SET ended_at = now(), exit_reason = 'max_restarts_exceeded'
+              WHERE id = ${o.id}
+            `
+            console.warn(`[supervisor] auto-resume skipped run=${o.id} reason=max_restarts_exceeded restart_count=${o.restart_count}`)
+            continue
+          }
           // End the orphan FIRST so it doesn't count against the cap when we
           // reserve a slot for its replacement.
           await sql`UPDATE session_runs SET ended_at = now(), exit_reason = 'reboot' WHERE id = ${o.id}`

diff --git a/hub/test/auto-resume-caps.test.ts b/hub/test/auto-resume-caps.test.ts
@@ -0,0 +1,176 @@
+/**
+ * Auto-resume guards (Phase 09 follow-up, 2026-05-27 RCA).
+ *
+ * Mirrors the SQL of the two new caps in the hub/src/ws/agent.ts supervisor.hello auto-resume (the handler itself is not invoked):
+ *
+ *   1. AGE CAP — only resume `session_runs` rows where `started_at > now() - 24h`.
+ *      Older open rows are swept and finalized as `exit_reason='stale'`.
+ *   2. RESTART CAP — when `restart_count >= 10`, finalize the run as
+ *      `exit_reason='max_restarts_exceeded'` and skip the replay.
+ *
+ * Gated on REMO_E2E_DB_URL so the rest of `bun test` stays green without a DB.
+ */
+
+process.env.JWT_SECRET = process.env.JWT_SECRET || 'test-secret-at-least-32-chars-long-aaaaaaaa'
+process.env.SESSION_SECRET = process.env.SESSION_SECRET || 'session-secret-at-least-32-chars-long-x'
+process.env.MAGIC_LINK_SECRET = process.env.MAGIC_LINK_SECRET || 'magic-link-secret-at-least-32-chars-x'
+process.env.TITANIUM_KEYGEN_API_URL = process.env.TITANIUM_KEYGEN_API_URL || 'https://keygen.titaniumlabs.us'
+process.env.TITANIUM_KEYGEN_ACCOUNT_ID = process.env.TITANIUM_KEYGEN_ACCOUNT_ID || 'acct_test_0000000000'
+process.env.TITANIUM_KEYGEN_PRODUCT_ID = process.env.TITANIUM_KEYGEN_PRODUCT_ID || 'prod_test_remo'
+if (process.env.REMO_E2E_DB_URL) process.env.DATABASE_URL = process.env.REMO_E2E_DB_URL
+
+import { describe, test, expect, beforeAll, afterAll, beforeEach } from 'bun:test'
+
+const HAS_TEST_DB = !!process.env.REMO_E2E_DB_URL
+const maybe = HAS_TEST_DB ? describe : describe.skip
+
+const TEST_USER_ID = '00000000-0000-0000-0000-0000000ar001'
+const TEST_API_KEY_ID = 'apikey_ar001'
+const TEST_SUPERVISOR_ID = 'sup_ar001'
+
+let sql: any
+
+async function seed() {
+  await sql.unsafe(`
+    INSERT INTO users (id, email, password_hash, role)
+    VALUES ('${TEST_USER_ID}', 'ar001+autoresume@test.local', 'x', 'user')
+    ON CONFLICT (id) DO NOTHING;
+  `)
+  await sql.unsafe(`
+    INSERT INTO api_keys (id, user_id, key_hash, capabilities, name)
+    VALUES ('${TEST_API_KEY_ID}', '${TEST_USER_ID}', 'ar001-hash', '["supervisor"]'::jsonb, 'ar001')
+    ON CONFLICT (id) DO NOTHING;
+  `)
+  await sql.unsafe(`
+    INSERT INTO supervisors (id, user_id, api_key_id, hostname, roots, concurrency_budget, last_seen_at)
+    VALUES ('${TEST_SUPERVISOR_ID}', '${TEST_USER_ID}', '${TEST_API_KEY_ID}', 'ar001-host', '{}'::text[], 4, now())
+    ON CONFLICT (id) DO NOTHING;
+  `)
+}
+
+async function cleanupRuns() {
+  await sql`DELETE FROM session_runs WHERE supervisor_id = ${TEST_SUPERVISOR_ID}`
+}
+
+maybe('auto-resume age + restart-count caps', () => {
+  beforeAll(async () => {
+    ;({ sql } = await import('../src/db/postgres'))
+    await seed()
+  })
+
+  afterAll(async () => {
+    await cleanupRuns()
+    await sql`DELETE FROM supervisors WHERE id = ${TEST_SUPERVISOR_ID}`
+    await sql`DELETE FROM api_keys WHERE id = ${TEST_API_KEY_ID}`
+    await sql`DELETE FROM users WHERE id = ${TEST_USER_ID}`
+  })
+
+  beforeEach(async () => {
+    await cleanupRuns()
+  })
+
+  test('fresh run (<24h old, restart_count=0) IS picked up by the orphan query', async () => {
+    await sql`
+      INSERT INTO session_runs (id, user_id, supervisor_id, repo_path, started_at, ended_at, restart_count)
+      VALUES ('run_fresh_1', ${TEST_USER_ID}, ${TEST_SUPERVISOR_ID}, 'C:/x/fresh', now() - interval '2 hours', NULL, 0)
+    `
+    const rows = await sql`
+      SELECT id, restart_count, started_at
+      FROM session_runs
+      WHERE supervisor_id = ${TEST_SUPERVISOR_ID}
+        AND ended_at IS NULL
+        AND started_at > now() - interval '24 hours'
+      ORDER BY started_at ASC
+    `
+    expect(rows.length).toBe(1)
+    expect(rows[0].id).toBe('run_fresh_1')
+    expect(rows[0].restart_count).toBe(0)
+  })
+
+  test('stale run (>24h old) is EXCLUDED from the orphan query AND finalized by the sweep', async () => {
+    await sql`
+      INSERT INTO session_runs (id, user_id, supervisor_id, repo_path, started_at, ended_at)
+      VALUES ('run_stale_1', ${TEST_USER_ID}, ${TEST_SUPERVISOR_ID}, 'C:/x/stale', now() - interval '48 hours', NULL)
+    `
+    // Mirror agent.ts: SELECT excludes >24h
+    const orphans = await sql`
+      SELECT id FROM session_runs
+      WHERE supervisor_id = ${TEST_SUPERVISOR_ID}
+        AND ended_at IS NULL
+        AND started_at > now() - interval '24 hours'
+    `
+    expect(orphans.length).toBe(0)
+
+    // Mirror agent.ts: UPDATE finalizes anything older than 24h as stale.
+    const swept = await sql`
+      UPDATE session_runs
+      SET ended_at = now(), exit_reason = 'stale'
+      WHERE supervisor_id = ${TEST_SUPERVISOR_ID}
+        AND ended_at IS NULL
+        AND started_at <= now() - interval '24 hours'
+      RETURNING id, exit_reason
+    `
+    expect(swept.length).toBe(1)
+    expect(swept[0].id).toBe('run_stale_1')
+    expect(swept[0].exit_reason).toBe('stale')
+  })
+
+  test('run with restart_count >= 10 is identified by the cap check (skipped + finalized)', async () => {
+    await sql`
+      INSERT INTO session_runs (id, user_id, supervisor_id, repo_path, started_at, ended_at, restart_count)
+      VALUES ('run_loopy_1', ${TEST_USER_ID}, ${TEST_SUPERVISOR_ID}, 'C:/x/loopy', now() - interval '1 hour', NULL, 10)
+    `
+    const orphans = await sql`
+      SELECT id, restart_count FROM session_runs
+      WHERE supervisor_id = ${TEST_SUPERVISOR_ID}
+        AND ended_at IS NULL
+        AND started_at > now() - interval '24 hours'
+    `
+    expect(orphans.length).toBe(1)
+    expect(orphans[0].restart_count).toBe(10)
+
+    // The replay code finalizes these and continues. Simulate that step.
+    await sql`
+      UPDATE session_runs
+      SET ended_at = now(), exit_reason = 'max_restarts_exceeded'
+      WHERE id = 'run_loopy_1'
+    `
+    const after = await sql`
+      SELECT ended_at, exit_reason FROM session_runs WHERE id = 'run_loopy_1'
+    `
+    expect(after[0].exit_reason).toBe('max_restarts_exceeded')
+    expect(after[0].ended_at).not.toBeNull()
+  })
+
+  test('combination: stale-and-loopy is finalized via the stale sweep (age guard wins)', async () => {
+    // A run that's both >24h old AND has restart_count=10. In agent.ts the
+    // age filter excludes it from the orphan SELECT, so the restart-count gate
+    // never sees it; the stale sweep then finalizes the row as 'stale'.
+    await sql`
+      INSERT INTO session_runs (id, user_id, supervisor_id, repo_path, started_at, ended_at, restart_count)
+      VALUES ('run_stale_loopy', ${TEST_USER_ID}, ${TEST_SUPERVISOR_ID}, 'C:/x/sl', now() - interval '30 hours', NULL, 10)
+    `
+    const orphans = await sql`
+      SELECT id FROM session_runs
+      WHERE supervisor_id = ${TEST_SUPERVISOR_ID}
+        AND ended_at IS NULL
+        AND started_at > now() - interval '24 hours'
+    `
+    expect(orphans.length).toBe(0)
+    const swept = await sql`
+      UPDATE session_runs
+      SET ended_at = now(), exit_reason = 'stale'
+      WHERE supervisor_id = ${TEST_SUPERVISOR_ID}
+        AND ended_at IS NULL
+        AND started_at <= now() - interval '24 hours'
+      RETURNING id, exit_reason
+    `
+    expect(swept.length).toBe(1)
+    expect(swept[0].exit_reason).toBe('stale')
+  })
+})
+
+if (!HAS_TEST_DB) {
+  // Sanity skip so the suite doesn't appear empty in CI.
+  test.skip('auto-resume caps test suite skipped — set REMO_E2E_DB_URL to run', () => {})
+}
diff --git a/supervisor/src/hub-client.ts b/supervisor/src/hub-client.ts
@@ -9,7 +9,7 @@ import { getHandler, nativeSupervisorCommands } from './commands/index'
 import { CONFIG_PATH, saveConfig, type SupervisorConfig } from './config'
 
 // Keep in sync with supervisor/tauri/src-tauri/tauri.conf.json version
-const VERSION = '0.5.1'
+const VERSION = '0.5.2'
 
 type OutboundMsg =
   | { type: 'auth'; api_key: string; project_dir: string; hostname: string; role: 'supervisor' }

diff --git a/supervisor/src/index.ts b/supervisor/src/index.ts
@@ -7,7 +7,7 @@ import { join } from 'path'
 import { homedir } from 'os'
 
 // Keep in sync with supervisor/tauri/src-tauri/tauri.conf.json version
-const VERSION = '0.5.1'
+const VERSION = '0.5.2'
 
 function logDir(): string {
   if (process.platform === 'win32') {