diff --git a/AGENTS.md b/AGENTS.md index 883d0152..50aff2ee 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -214,6 +214,7 @@ Namespaces (`src/cli/log.ts`): |---|---|---| | `mcp` | McpSource construction; per-call dispatch showing `taskSupport` / `path=task-augmented\|inline` / cached tool count | "Why is my tool going inline vs task-augmented?" "Is my tool cache populated?" | | `sse` | Every `tool.progress` / `tool.done` entering the runtime sink wrap; every `data.changed` broadcast with client count | "Are progress events reaching the SSE layer?" "Are broadcasts happening, to how many clients?" | +| `registry` | Per-source skips inside `ToolRegistry.availableTools` (one line per stuck source per enumeration; operator-facing "first failure" signal lives at the lifecycle transition site as a warn — see issue #194) | "Which sources are being skipped on each chat turn?" "Is enumeration silently dropping a connector?" | Add a namespace by calling `log.debug("ns", "message")` (from `src/cli/log.ts`). Keep this table and the `log.ts` doc comment in sync. diff --git a/src/api/server.ts b/src/api/server.ts index 8a5ce362..5ea3cd5f 100644 --- a/src/api/server.ts +++ b/src/api/server.ts @@ -62,7 +62,36 @@ export function startServer(options: ServerOptions): ServerHandle { const internalToken = runtime.getInternalToken(); const mcpSources = runtime.mcpSources(); - const healthMonitor = new HealthMonitor(mcpSources, runtime.getEventSink()); + // Wire HealthMonitor into the bundle lifecycle so a detected stdio + // crash transitions BundleInstance.state through the lifecycle's + // recordCrash/Recovery/Dead funnel. This is what gives the operator + // a single warn-on-first-failure log instead of one warn per chat + // turn forever (issue #194), and keeps the Configure UI's state pill + // accurate when a subprocess dies mid-session. + const lifecycle = runtime.getLifecycle(); + const healthMonitor = new HealthMonitor(mcpSources, runtime.getEventSink(), { + reportSourceTransition: (source, to) => { + // Resolve source → BundleInstance by (serverName, wsId). McpSource + // instances stamped with workspaceId via bundleContext at + // construction time — see `McpSource.getWorkspaceId`. The wsId + // disambiguates the same connector installed in multiple + // workspaces; without it a crash in workspace A could land on + // workspace B's BundleInstance (whichever the iteration hit first), + // leaving the actual crashed bundle stuck at `running` in its + // workspace's Configure UI. In-process platform sources and + // sources missing a bundleContext (rare) return undefined and we + // silently skip — they don't back a BundleInstance. + const wsId = source.getWorkspaceId(); + if (!wsId) return; + const inst = runtime + .getBundleInstances() + .find((i) => i.serverName === source.name && i.wsId === wsId); + if (!inst) return; + if (to === "crashed") lifecycle.recordCrash(inst.serverName, inst.wsId); + else if (to === "running") lifecycle.recordRecovery(inst.serverName, inst.wsId); + else lifecycle.recordDead(inst.serverName, inst.wsId); + }, + }); healthMonitor.start(); // SSE event manager — listens to runtime events and broadcasts to clients diff --git a/src/bundles/connection.ts b/src/bundles/connection.ts index c93d379e..d13360a7 100644 --- a/src/bundles/connection.ts +++ b/src/bundles/connection.ts @@ -105,3 +105,30 @@ export function summarizeConnectionState(connections: Map): if (states.includes("crashed")) return "crashed"; return states[0]!; } + +/** + * States that warrant operator attention when entered. Transitioning INTO + * one of these is the "first failure" signal worth a warn log; staying in + * one across many tool-list enumerations is not (that's the spam this set + * eliminates — see issue #194). + * + * - `dead` — process / transport gave up + * - `crashed` — transport closed unexpectedly mid-session + * - `reauth_required` — refresh token rejected; user must reconnect + * + * Excluded deliberately: + * - `not_authenticated` — silent resting state (fresh install, post-Disconnect) + * - `pending_auth` — in-flight OAuth, expected during normal Connect + * - `starting` — transient boot state + * - `stopped` — operator intent (stopBundle / uninstall) + * - `running` — healthy + */ +const BROKEN_STATES: ReadonlySet = new Set([ + "dead", + "crashed", + "reauth_required", +]); + +export function isBrokenState(s: BundleState): boolean { + return BROKEN_STATES.has(s); +} diff --git a/src/bundles/lifecycle.ts b/src/bundles/lifecycle.ts index 984a1fd8..9d97786e 100644 --- a/src/bundles/lifecycle.ts +++ b/src/bundles/lifecycle.ts @@ -17,6 +17,7 @@ import { connectorSlug, hasPersistedComposioConnection } from "./composio-connec import { type Connection, type ConnectionState, + isBrokenState, summarizeConnectionState, WORKSPACE_PRINCIPAL_ID, } from "./connection.ts"; @@ -878,9 +879,37 @@ export class BundleLifecycleManager { /** * Update state on a BundleInstance. Public so HealthMonitor can * report crashed/recovered/dead transitions. + * + * Edge-triggered logging (issue #194): + * - non-broken → broken: one `log.warn` at the moment of transition. + * Subsequent enumerations of the now-stuck source produce zero + * operator-visible noise. + * - broken → running: one `log.info` ("recovered") at the moment of + * recovery, symmetric with the warn. + * - No-op (same state in, same state out): silent. + * + * "Broken" set lives in `connection.ts::isBrokenState` = + * { dead, crashed, reauth_required }. Excludes `pending_auth` (in-flight + * OAuth) and `not_authenticated` (resting state). */ transition(instance: BundleInstance, newState: BundleState): void { + const oldState = instance.state; instance.state = newState; + if (oldState === newState) return; + + updateLastBrokenState(instance, oldState, newState); + + if (isBrokenState(newState) && !isBrokenState(oldState)) { + log.warn( + `[bundles] ${instance.serverName} (ws ${instance.wsId}) entered '${newState}' from '${oldState}'. ` + + `Tools unavailable until recovered. ${transitionRecoveryHint(newState)}`, + ); + } else if (newState === "running" && instance.lastBrokenState !== undefined) { + log.info( + `[bundles] ${instance.serverName} (ws ${instance.wsId}) recovered: '${instance.lastBrokenState}' → 'running'`, + ); + instance.lastBrokenState = undefined; + } } /** @@ -972,7 +1001,9 @@ export class BundleLifecycleManager { // Recompute summary state so legacy consumers (HealthMonitor, // briefing-collector, runtime status API) see the right surface. - instance.state = summarizeConnectionState(instance.connections); + // Routed through transition() so edge-triggered logging fires on + // non-broken → broken / broken → running transitions (issue #194). + this.transition(instance, summarizeConnectionState(instance.connections)); this.eventSink.emit({ type: "connection.state_changed", @@ -1741,6 +1772,56 @@ interface UpjackScheduleDeclaration { // Internal helpers // --------------------------------------------------------------------------- +/** + * Maintain `instance.lastBrokenState` for one transition. Three rules, + * in priority order: + * + * 1. Entering / staying broken → set to newState (latest wins) + * 2. Leaving broken with no label → capture oldState (handles + * instances constructed directly into a broken state, never + * having flowed through this funnel before) + * 3. Operator-stop → clear (explicit end of episode; + * a later boot → running must not emit a stale recovery log) + * + * The matching "leaving broken to running → clear after logging" case + * lives in `transition()` itself because it needs to read the value to + * format the recovery message before clearing. + * + * Split out from `transition()` so the funnel reads top-down as + * state-write → sticky maintenance → log decision, without sticky-bit + * branching obscuring the log policy. + */ +function updateLastBrokenState( + instance: BundleInstance, + oldState: BundleState, + newState: BundleState, +): void { + if (isBrokenState(newState)) { + instance.lastBrokenState = newState; + } else if (isBrokenState(oldState) && instance.lastBrokenState === undefined) { + instance.lastBrokenState = oldState; + } else if (newState === "stopped") { + instance.lastBrokenState = undefined; + } +} + +/** + * One-line operator hint appended to the broken-state warn. Tells the + * reader what recovery path applies without forcing them to look it up. + */ +function transitionRecoveryHint(state: BundleState): string { + switch (state) { + case "reauth_required": + return "User must reconnect via Connectors page."; + case "dead": + return "Use startBundle to restart."; + case "crashed": + return "HealthMonitor will attempt auto-restart with exponential backoff."; + default: + return ""; + } +} + function createInstance( serverName: string, bundleName: string, diff --git a/src/bundles/types.ts b/src/bundles/types.ts index 27108143..c9cd5246 100644 --- a/src/bundles/types.ts +++ b/src/bundles/types.ts @@ -351,6 +351,19 @@ export interface BundleInstance { description?: string; /** Current lifecycle state. */ state: BundleState; + /** + * Sticky breadcrumb of the most recent broken state observed since + * the last successful recovery to `running`. Owned by + * `BundleLifecycleManager.transition()` — set on any transition where + * the new (or old, for boot-into-broken instances) state is in the + * broken set, cleared on reaching `running` or an operator-driven + * `stopped`. Lets the recovery info log fire on multi-step paths the + * direct edge check misses — notably the URL bundle reconnect flow + * `reauth_required → pending_auth → running`, where neither + * transition is `broken → running`. Process-local; no persistence + * needed because a fresh process starts a fresh episode. + */ + lastBrokenState?: BundleState; /** MTF trust score from mpak (0-100), or null if unavailable. */ trustScore: number | null; /** UI placement metadata from _meta["ai.nimblebrain/host"]. */ diff --git a/src/cli/log.ts b/src/cli/log.ts index b73bed3c..15b8db39 100644 --- a/src/cli/log.ts +++ b/src/cli/log.ts @@ -14,6 +14,8 @@ * Known namespaces: * - `mcp` — McpSource construction, dispatch decisions (task-augmented vs inline) * - `sse` — Runtime event sink → SSE broadcast (tool.progress, data.changed) + * - `registry` — ToolRegistry.availableTools per-source skips (one line per stuck + * source per enumeration; operator-facing signal lives at lifecycle transition) * * Keep this list in sync with the CLAUDE.md "Debugging" section so it's * discoverable without reading source. diff --git a/src/runtime/runtime.ts b/src/runtime/runtime.ts index 0b449bb6..c4f318cf 100644 --- a/src/runtime/runtime.ts +++ b/src/runtime/runtime.ts @@ -1647,19 +1647,27 @@ export class Runtime { return [...names]; } - /** Get MCP sources across all workspace registries (for health monitoring). */ + /** Get MCP sources across all workspace registries (for health monitoring). + * + * Returns every McpSource instance — no dedupe by `source.name`. + * The same bundle (e.g. `linear-mcp`) installed in workspaces A and B + * produces two distinct McpSource processes, both of which need + * monitoring; collapsing them would leave one workspace's + * `BundleInstance.state` permanently stale on a crash. + * + * `Set` keys on instance identity so any McpSource + * registered directly in multiple workspace registries is monitored + * once. `SharedSourceRef` wrappers don't satisfy `instanceof McpSource` + * and are passed over here — their inner source is monitored via the + * native registry it was constructed in. */ mcpSources(): McpSource[] { - const sources: McpSource[] = []; - const seen = new Set(); + const sources = new Set(); for (const reg of this._workspaceRegistries.values()) { for (const s of reg.getSources()) { - if (s instanceof McpSource && !seen.has(s.name)) { - seen.add(s.name); - sources.push(s); - } + if (s instanceof McpSource) sources.add(s); } } - return sources; + return [...sources]; } /** Get all tracked bundle instances (unfiltered — use getBundleInstancesForWorkspace for scoped access). */ diff --git a/src/tools/health-monitor.ts b/src/tools/health-monitor.ts index 3d1266e9..0ed502d9 100644 --- a/src/tools/health-monitor.ts +++ b/src/tools/health-monitor.ts @@ -1,18 +1,18 @@ import type { EventSink } from "../engine/types.ts"; import type { McpSource } from "./mcp-source.ts"; -export type BundleState = "healthy" | "restarting" | "dead"; +export type HealthRecordState = "healthy" | "restarting" | "dead"; export interface BundleHealth { name: string; - state: BundleState; + state: HealthRecordState; uptime: number | null; restartCount: number; } interface BundleRecord { source: McpSource; - state: BundleState; + state: HealthRecordState; restartCount: number; } @@ -20,9 +20,31 @@ const MAX_RESTARTS = 5; const DEFAULT_BASE_DELAY_MS = 1000; const DEFAULT_CHECK_INTERVAL_MS = 30_000; +/** + * Reported HealthMonitor transition for the BundleInstance underlying a + * source. `crashed` and `dead` map to `lifecycle.recordCrash` / + * `recordDead`; `running` maps to `recordRecovery`. The single shape + * pushes the per-method switch + source→instance lookup into the + * caller (`startServer`) so HealthMonitor stays unaware of lifecycle + * internals — useful when the caller wants to inject a different + * backing system (tests, alternative observability paths). + */ +export type HealthMonitorTransition = "crashed" | "running" | "dead"; + export interface HealthMonitorOptions { checkIntervalMs?: number; baseDelayMs?: number; + /** + * Propagate a detected health transition back to `BundleInstance.state` + * via the bundle lifecycle. Without this hook the URL bundle path + * (which funnels through `recordConnectionStateChange`) still works, + * but stdio subprocess crashes leave the user-facing state stuck at + * `running`. Hook is no-op safe; the caller is responsible for the + * source → instance resolution (returning early if the source doesn't + * back any BundleInstance, e.g. shared or in-process platform sources). + * See issue #194 for the operator log story this enables. + */ + reportSourceTransition?: (source: McpSource, to: HealthMonitorTransition) => void; } /** @@ -34,6 +56,7 @@ export class HealthMonitor { private timer: ReturnType | null = null; private checkIntervalMs: number; private baseDelayMs: number; + private reportSourceTransition: HealthMonitorOptions["reportSourceTransition"]; constructor( sources: McpSource[], @@ -42,9 +65,10 @@ export class HealthMonitor { ) { this.checkIntervalMs = opts.checkIntervalMs ?? DEFAULT_CHECK_INTERVAL_MS; this.baseDelayMs = opts.baseDelayMs ?? DEFAULT_BASE_DELAY_MS; + this.reportSourceTransition = opts.reportSourceTransition; this.records = sources.map((source) => ({ source, - state: "healthy" as BundleState, + state: "healthy" as HealthRecordState, restartCount: 0, })); } @@ -98,6 +122,14 @@ export class HealthMonitor { }, }); + // Propagate to BundleInstance.state on first detection only. The + // `record.state === "healthy"` guard distinguishes the first failure + // from subsequent sweeps that find us in "restarting" — we don't want + // to report crashed on every poll cycle while a restart is pending. + if (record.state === "healthy") { + this.reportSourceTransition?.(record.source, "crashed"); + } + // Check if we've exhausted restart attempts if (record.restartCount >= MAX_RESTARTS) { record.state = "dead"; @@ -109,6 +141,7 @@ export class HealthMonitor { ...(remote ? { remote: true } : {}), }, }); + this.reportSourceTransition?.(record.source, "dead"); return; } @@ -149,6 +182,7 @@ export class HealthMonitor { ...(remote ? { remote: true } : {}), }, }); + this.reportSourceTransition?.(record.source, "running"); } else { // Restart failed — check again on next cycle (might hit max) record.state = "restarting"; diff --git a/src/tools/mcp-source.ts b/src/tools/mcp-source.ts index d3764e90..b696cd0b 100644 --- a/src/tools/mcp-source.ts +++ b/src/tools/mcp-source.ts @@ -321,6 +321,24 @@ export class McpSource implements ToolSource { return this.mode.type === "remote"; } + /** + * Workspace this source was constructed for, when available. Bundle- + * spawned sources (URL, stdio, named, local) always carry a workspace + * via `bundleContext.workspaceId` — that is the field returned here. + * In-process platform sources and short-lived test sources don't pass + * a `bundleContext` and return `undefined`. + * + * Used by `HealthMonitor`'s `reportSourceTransition` hook so the + * source → BundleInstance lookup is keyed on `(serverName, wsId)` + * rather than name alone. Without the wsId disambiguation, the same + * bundle (e.g. `linear-mcp`) installed in two workspaces would all + * route their crash detection to whichever instance happened to be + * first in iteration order, leaving the other stuck at `running`. + */ + getWorkspaceId(): string | undefined { + return this.bundleContext?.workspaceId; + } + async start(): Promise { // Fresh stderr state on every start. Restart cycles must not bleed // a dead instance's tail into the new instance's crash report. diff --git a/src/tools/registry.ts b/src/tools/registry.ts index 8b99159d..2d45fb8e 100644 --- a/src/tools/registry.ts +++ b/src/tools/registry.ts @@ -109,15 +109,22 @@ export class ToolRegistry implements ToolRouter { // workspace source. Surface the failure in the source's own // status (Configure page renders it from BundleInstance.state) // and leave the chat usable. + // + // Operator-facing signal lives at the lifecycle transition site + // (one warn per non-broken → broken edge — see + // `BundleLifecycleManager.transition` and issue #194). This skip + // path fires per chat turn; logging at warn level here would + // spam stderr for any source stuck in a broken state. + // Gated behind `NB_DEBUG=registry` for trace-level diagnosis. let tools: Tool[]; try { tools = await source.tools(); } catch (err) { const msg = err instanceof Error ? err.message : String(err); - log.warn( - `[registry] availableTools: skipping source "${source.name}" — ${msg}. ` + - `The bundle's own state surface (Connectors page) reflects this; the chat list ` + - `omits its tools until the source recovers.`, + log.debug( + "registry", + `availableTools: skipping source "${source.name}" — ${msg}. ` + + `Operator signal lives at lifecycle transition (warn on enter-broken).`, ); continue; } diff --git a/test/integration/health-monitor-lifecycle.test.ts b/test/integration/health-monitor-lifecycle.test.ts new file mode 100644 index 00000000..429ec0ce --- /dev/null +++ b/test/integration/health-monitor-lifecycle.test.ts @@ -0,0 +1,280 @@ +import { afterAll, afterEach, beforeEach, describe, expect, it } from "bun:test"; +import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { BundleLifecycleManager } from "../../src/bundles/lifecycle.ts"; +import { log } from "../../src/cli/log.ts"; +import type { EngineEvent, EventSink } from "../../src/engine/types.ts"; +import { HealthMonitor } from "../../src/tools/health-monitor.ts"; +import { McpSource } from "../../src/tools/mcp-source.ts"; +import { ToolRegistry } from "../../src/tools/registry.ts"; + +/** + * End-to-end integration of HealthMonitor's crash detection with the + * bundle lifecycle's `transition()` funnel (issue #194). + * + * Unit tests cover `transition()` in isolation. This file exercises + * the wiring that lives in `src/api/server.ts`: + * + * HealthMonitor.check() observes !isAlive() + * → reportSourceTransition(source, "crashed") + * → source.getWorkspaceId() lookup + * → lifecycle.recordCrash(name, wsId) + * → transition(running → crashed) + * → log.warn fires once + * → instance.state === "crashed" + * + * Repeats the resolution to verify the (name, wsId) keying — same + * server name in two workspaces, only the crashed one transitions. + */ + +const testDir = join(tmpdir(), `nb-health-monitor-lifecycle-${Date.now()}`); + +function setupTestDir() { + if (existsSync(testDir)) rmSync(testDir, { recursive: true }); + mkdirSync(testDir, { recursive: true }); +} + +afterAll(() => { + if (existsSync(testDir)) rmSync(testDir, { recursive: true }); +}); + +function makeSink(): EventSink & { events: EngineEvent[] } { + const events: EngineEvent[] = []; + return { + events, + emit(event) { + events.push(event); + }, + }; +} + +function createEchoBundleOnDisk(dir: string): string { + mkdirSync(dir, { recursive: true }); + const nodeModulesPath = join(import.meta.dir, "../..", "node_modules"); + const serverCode = ` +const { Server } = require("${nodeModulesPath}/@modelcontextprotocol/sdk/dist/cjs/server/index.js"); +const { StdioServerTransport } = require("${nodeModulesPath}/@modelcontextprotocol/sdk/dist/cjs/server/stdio.js"); +const { ListToolsRequestSchema, CallToolRequestSchema } = require("${nodeModulesPath}/@modelcontextprotocol/sdk/dist/cjs/types.js"); + +async function main() { + const server = new Server({ name: "echo-test", version: "0.1.0" }, { capabilities: { tools: {} } }); + server.setRequestHandler(ListToolsRequestSchema, async () => ({ + tools: [{ name: "echo", description: "Echo input", inputSchema: { type: "object" } }], + })); + server.setRequestHandler(CallToolRequestSchema, async () => ({ content: [{ type: "text", text: "ok" }] })); + const transport = new StdioServerTransport(); + await server.connect(transport); +} +main(); +`; + writeFileSync(join(dir, "server.cjs"), serverCode); + const manifest = { + manifest_version: "0.4", + name: "@test/echo", + version: "1.0.0", + description: "Echo test bundle", + author: { name: "Test Author" }, + server: { + type: "node", + entry_point: "server.cjs", + mcp_config: { command: "node", args: ["${__dirname}/server.cjs"] }, + }, + }; + writeFileSync(join(dir, "manifest.json"), JSON.stringify(manifest, null, 2)); + return dir; +} + +/** + * Wire a HealthMonitor exactly the way `startServer` does — single hook + * doing (name, wsId) resolution against the lifecycle. Mirroring the + * production adapter shape is the point of this test file. + */ +function buildAdaptedMonitor( + sources: McpSource[], + lifecycle: BundleLifecycleManager, + eventSink: EventSink, +): HealthMonitor { + return new HealthMonitor(sources, eventSink, { + // Short check interval so the test doesn't have to wait the 30s default, + // though tests drive `check()` manually below; setting this defensively + // in case anyone later switches to timer-driven assertions. + checkIntervalMs: 1000, + baseDelayMs: 1, // skip the real exponential backoff in restart attempts + reportSourceTransition: (source, to) => { + const wsId = source.getWorkspaceId(); + if (!wsId) return; + const inst = lifecycle.getInstances().find((i) => i.serverName === source.name && i.wsId === wsId); + if (!inst) return; + if (to === "crashed") lifecycle.recordCrash(inst.serverName, inst.wsId); + else if (to === "running") lifecycle.recordRecovery(inst.serverName, inst.wsId); + else lifecycle.recordDead(inst.serverName, inst.wsId); + }, + }); +} + +/** Force-mark an McpSource dead so HealthMonitor's `isAlive()` check + * returns false on the next sweep. Mirrors what `transport.onclose` + * does on a real subprocess crash, without the test having to find + * and SIGKILL the child PID. + * + * Also stubs `restart()` to fail so the HealthMonitor's restart + * attempt doesn't immediately bounce the source back to healthy + * (which would flip `BundleInstance.state` back to running before + * the test's assertions can observe `crashed`). Production behavior + * on a real fatal crash with no recovery path matches this. */ +function markSourceCrashed(source: McpSource): void { + // McpSource keeps `dead` and `restart` accessible; casts here let the + // test simulate a transport-level crash plus an irrecoverable bundle + // (e.g. config invalid, vendor outage) without dragging in real + // subprocess kill semantics or vendor-dependent retry behavior. + const internal = source as unknown as { + dead: boolean; + restart: () => Promise; + }; + internal.dead = true; + internal.restart = async () => false; +} + +describe("HealthMonitor ↔ BundleLifecycleManager — end-to-end crash chain", () => { + let originalWorkDir: string | undefined; + let originalWarn: (msg: string) => void; + let originalInfo: (msg: string) => void; + let warnCalls: string[]; + let infoCalls: string[]; + + beforeEach(() => { + setupTestDir(); + originalWorkDir = process.env.NB_WORK_DIR; + process.env.NB_WORK_DIR = testDir; + originalWarn = log.warn; + originalInfo = log.info; + warnCalls = []; + infoCalls = []; + log.warn = (msg) => { + warnCalls.push(msg); + }; + log.info = (msg) => { + infoCalls.push(msg); + }; + }); + + afterEach(() => { + if (originalWorkDir === undefined) delete process.env.NB_WORK_DIR; + else process.env.NB_WORK_DIR = originalWorkDir; + log.warn = originalWarn; + log.info = originalInfo; + }); + + it( + "detected crash flows through reportSourceTransition → recordCrash → transition → warn", + async () => { + const bundleDir = createEchoBundleOnDisk(join(testDir, "echo-chain")); + const registry = new ToolRegistry(); + const sink = makeSink(); + const lifecycle = new BundleLifecycleManager(sink, undefined); + // Provide the wsId-bearing deps factory so the spawned McpSource + // carries a workspaceId. Resolver/rateLimit are never called + // because the test never executes a tool call. + lifecycle.setBundleMcpDepsFactory((wsId) => ({ + workspaceId: wsId, + // biome-ignore lint/suspicious/noExplicitAny: test stubs, never invoked + hostResources: {} as any, + // biome-ignore lint/suspicious/noExplicitAny: test stubs, never invoked + rateLimit: {} as any, + })); + + const instance = await lifecycle.installLocal(bundleDir, registry, "ws_test"); + expect(instance.state).toBe("running"); + + const source = registry.getSources().find((s) => s.name === instance.serverName); + expect(source).toBeInstanceOf(McpSource); + expect((source as McpSource).getWorkspaceId()).toBe("ws_test"); + + const monitor = buildAdaptedMonitor([source as McpSource], lifecycle, sink); + + // Reset any boot-time logs so the assertions below see only the + // transition output we're testing. + warnCalls.length = 0; + infoCalls.length = 0; + + // Simulate the transport-level crash that production HealthMonitor + // would see when a subprocess dies (transport.onclose → dead=true). + markSourceCrashed(source as McpSource); + + await monitor.check(); + + // BundleInstance.state reflects the crash, NOT a stale "running". + expect(instance.state).toBe("crashed"); + // Exactly one operator-facing warn fired (the first-failure signal + // — subsequent sweeps must NOT re-warn). + expect(warnCalls.length).toBe(1); + expect(warnCalls[0]).toContain("crashed"); + expect(warnCalls[0]).toContain(instance.serverName); + + // Subsequent sweeps with no recovery must not produce additional warns + // (record.state guard inside checkOne — already in restarting/dead). + await monitor.check(); + await monitor.check(); + expect(warnCalls.length).toBe(1); + + await registry.removeSource(instance.serverName); + }, + 30_000, + ); + + it( + "same serverName in two workspaces: crash only updates the matching wsId's instance", + async () => { + const bundleDirA = createEchoBundleOnDisk(join(testDir, "echo-wsA")); + const bundleDirB = createEchoBundleOnDisk(join(testDir, "echo-wsB")); + + const registryA = new ToolRegistry(); + const registryB = new ToolRegistry(); + const sink = makeSink(); + const lifecycle = new BundleLifecycleManager(sink, undefined); + lifecycle.setBundleMcpDepsFactory((wsId) => ({ + workspaceId: wsId, + // biome-ignore lint/suspicious/noExplicitAny: test stubs, never invoked + hostResources: {} as any, + // biome-ignore lint/suspicious/noExplicitAny: test stubs, never invoked + rateLimit: {} as any, + })); + + const instA = await lifecycle.installLocal(bundleDirA, registryA, "ws_alpha"); + const instB = await lifecycle.installLocal(bundleDirB, registryB, "ws_beta"); + + // Both instances share the same serverName because the manifest + // names match. The pre-#194 lookup keyed on name alone would + // collapse them; the (name, wsId) key keeps them distinct. + expect(instA.serverName).toBe(instB.serverName); + + const sourceA = registryA.getSources().find((s) => s.name === instA.serverName) as McpSource; + const sourceB = registryB.getSources().find((s) => s.name === instB.serverName) as McpSource; + expect(sourceA.getWorkspaceId()).toBe("ws_alpha"); + expect(sourceB.getWorkspaceId()).toBe("ws_beta"); + + const monitor = buildAdaptedMonitor([sourceA, sourceB], lifecycle, sink); + + warnCalls.length = 0; + infoCalls.length = 0; + + // Crash ONLY workspace alpha's instance. + markSourceCrashed(sourceA); + + await monitor.check(); + + expect(instA.state).toBe("crashed"); + // The cross-workspace bug pre-fix: instB.state would also flip + // because the lookup matched on name only and hit whichever + // instance came first in iteration. With (name, wsId) keying, B + // stays running. + expect(instB.state).toBe("running"); + expect(warnCalls.length).toBe(1); + + await registryA.removeSource(instA.serverName); + await registryB.removeSource(instB.serverName); + }, + 30_000, + ); +}); diff --git a/test/unit/bundle-transition-logging.test.ts b/test/unit/bundle-transition-logging.test.ts new file mode 100644 index 00000000..b6086c76 --- /dev/null +++ b/test/unit/bundle-transition-logging.test.ts @@ -0,0 +1,303 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { BundleLifecycleManager } from "../../src/bundles/lifecycle.ts"; +import { log } from "../../src/cli/log.ts"; +import type { EngineEvent, EventSink } from "../../src/engine/types.ts"; +import type { BundleInstance, BundleState } from "../../src/bundles/types.ts"; + +/** + * Edge-triggered logging on BundleInstance.state transitions. Issue #194. + * + * Per-turn `availableTools()` enumeration of a stuck source must NOT emit + * any operator-visible log. The single warn lives at the lifecycle + * transition site, fires once per non-broken → broken edge, and a + * matching info fires once per broken → running recovery edge. + * + * Broken set = { dead, crashed, reauth_required }. Excludes pending_auth + * (in-flight OAuth — expected during normal Connect) and not_authenticated + * (resting state on fresh install / after Disconnect). + */ + +function makeInstance(state: BundleState = "running"): BundleInstance { + return { + serverName: "test-src", + bundleName: "@scope/test", + version: "1.0.0", + state, + trustScore: null, + ui: null, + briefing: null, + httpProxy: null, + protected: false, + type: "plain", + wsId: "ws_test", + }; +} + +function makeSink(): EventSink & { events: EngineEvent[] } { + const events: EngineEvent[] = []; + return { + events, + emit(event) { + events.push(event); + }, + }; +} + +describe("BundleLifecycleManager.transition — edge-triggered logging", () => { + let lifecycle: BundleLifecycleManager; + let warnCalls: string[]; + let infoCalls: string[]; + let originalWarn: (msg: string) => void; + let originalInfo: (msg: string) => void; + + beforeEach(() => { + lifecycle = new BundleLifecycleManager(makeSink(), undefined); + warnCalls = []; + infoCalls = []; + originalWarn = log.warn; + originalInfo = log.info; + log.warn = (msg) => { + warnCalls.push(msg); + }; + log.info = (msg) => { + infoCalls.push(msg); + }; + }); + + afterEach(() => { + log.warn = originalWarn; + log.info = originalInfo; + }); + + test("running → dead emits exactly one warn", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + expect(warnCalls.length).toBe(1); + expect(warnCalls[0]).toContain("test-src"); + expect(warnCalls[0]).toContain("dead"); + }); + + test("running → crashed emits one warn", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "crashed"); + expect(warnCalls.length).toBe(1); + expect(warnCalls[0]).toContain("crashed"); + }); + + test("running → reauth_required emits one warn", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "reauth_required"); + expect(warnCalls.length).toBe(1); + expect(warnCalls[0]).toContain("reauth_required"); + }); + + test("repeated dead → dead does NOT warn again (no-op transition)", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + lifecycle.transition(inst, "dead"); + lifecycle.transition(inst, "dead"); + expect(warnCalls.length).toBe(1); + }); + + test("broken → broken (dead → crashed) does NOT warn (already broken)", () => { + const inst = makeInstance("dead"); + lifecycle.transition(inst, "crashed"); + expect(warnCalls.length).toBe(0); + }); + + test("running → pending_auth does NOT warn (in-flight OAuth, not broken)", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "pending_auth"); + expect(warnCalls.length).toBe(0); + }); + + test("running → not_authenticated does NOT warn (resting state)", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "not_authenticated"); + expect(warnCalls.length).toBe(0); + }); + + test("running → starting does NOT warn (transient)", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "starting"); + expect(warnCalls.length).toBe(0); + }); + + test("running → stopped does NOT warn (operator intent)", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "stopped"); + expect(warnCalls.length).toBe(0); + }); + + test("dead → running emits one info (recovery edge)", () => { + const inst = makeInstance("dead"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(1); + expect(infoCalls[0]).toContain("recovered"); + expect(infoCalls[0]).toContain("test-src"); + }); + + test("reauth_required → running emits one info", () => { + const inst = makeInstance("reauth_required"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(1); + }); + + test("crashed → running emits one info", () => { + const inst = makeInstance("crashed"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(1); + }); + + test("broken → non-running (dead → not_authenticated) emits no info", () => { + const inst = makeInstance("dead"); + lifecycle.transition(inst, "not_authenticated"); + expect(infoCalls.length).toBe(0); + expect(warnCalls.length).toBe(0); + }); + + test("not_authenticated → running emits no info (was not broken)", () => { + const inst = makeInstance("not_authenticated"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(0); + }); + + test("full cycle: running → dead → running → dead = 2 warns + 1 info", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + lifecycle.transition(inst, "running"); + lifecycle.transition(inst, "dead"); + expect(warnCalls.length).toBe(2); + expect(infoCalls.length).toBe(1); + }); + + test("instance.state field still updates on every call", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + expect(inst.state).toBe("dead"); + lifecycle.transition(inst, "running"); + expect(inst.state).toBe("running"); + lifecycle.transition(inst, "pending_auth"); + expect(inst.state).toBe("pending_auth"); + }); +}); + +/** + * Multi-step recovery via sticky-bit. The direct broken → running edge + * is rare in production — URL bundles recover through `pending_auth` + * (user clicks Reconnect, completes OAuth) and stdio bundles recover + * via a `restarting` intermediate. The sticky `lastBrokenState` + * breadcrumb on `BundleInstance` carries the broken signal across + * those intermediates so the recovery info still fires on the final + * `→ running` hop. + */ +describe("BundleLifecycleManager.transition — multi-step recovery (sticky-bit)", () => { + let lifecycle: BundleLifecycleManager; + let warnCalls: string[]; + let infoCalls: string[]; + let originalWarn: (msg: string) => void; + let originalInfo: (msg: string) => void; + + beforeEach(() => { + lifecycle = new BundleLifecycleManager(makeSink(), undefined); + warnCalls = []; + infoCalls = []; + originalWarn = log.warn; + originalInfo = log.info; + log.warn = (msg) => { + warnCalls.push(msg); + }; + log.info = (msg) => { + infoCalls.push(msg); + }; + }); + + afterEach(() => { + log.warn = originalWarn; + log.info = originalInfo; + }); + + test("URL bundle reconnect: reauth_required → pending_auth → running emits one info", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "reauth_required"); + lifecycle.transition(inst, "pending_auth"); + lifecycle.transition(inst, "running"); + expect(warnCalls.length).toBe(1); + expect(infoCalls.length).toBe(1); + expect(infoCalls[0]).toContain("reauth_required"); + expect(infoCalls[0]).toContain("recovered"); + }); + + test("dead → not_authenticated → running still emits info (sticky preserved across intermediate)", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + lifecycle.transition(inst, "not_authenticated"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(1); + expect(infoCalls[0]).toContain("dead"); + }); + + test("recovery clears sticky-bit: second running after second-broken episode also fires info", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + lifecycle.transition(inst, "running"); + lifecycle.transition(inst, "reauth_required"); + lifecycle.transition(inst, "pending_auth"); + lifecycle.transition(inst, "running"); + expect(warnCalls.length).toBe(2); + expect(infoCalls.length).toBe(2); + expect(infoCalls[0]).toContain("dead"); + expect(infoCalls[1]).toContain("reauth_required"); + }); + + test("sticky-bit tracks LATEST broken state across broken→broken (crashed → dead → running labels 'dead')", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "crashed"); + lifecycle.transition(inst, "dead"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(1); + expect(infoCalls[0]).toContain("dead"); + }); + + test("instance constructed in broken state: dead → running captures dead as label", () => { + // Mirrors HealthMonitor crash recovery on a bundle that came up + // dead before any transition flowed through this lifecycle instance. + const inst = makeInstance("dead"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(1); + expect(infoCalls[0]).toContain("dead"); + }); + + test("instance constructed in broken state: reauth_required → pending_auth → running captures reauth_required", () => { + const inst = makeInstance("reauth_required"); + lifecycle.transition(inst, "pending_auth"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(1); + expect(infoCalls[0]).toContain("reauth_required"); + }); + + test("operator-stop clears sticky-bit: dead → stopped → starting → running emits no info", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + lifecycle.transition(inst, "stopped"); + lifecycle.transition(inst, "starting"); + lifecycle.transition(inst, "running"); + expect(warnCalls.length).toBe(1); + expect(infoCalls.length).toBe(0); + }); + + test("starting → running on fresh boot emits no info (never broken)", () => { + const inst = makeInstance("starting"); + lifecycle.transition(inst, "running"); + expect(infoCalls.length).toBe(0); + expect(warnCalls.length).toBe(0); + }); + + test("lastBrokenState field is undefined after recovery", () => { + const inst = makeInstance("running"); + lifecycle.transition(inst, "dead"); + expect(inst.lastBrokenState).toBe("dead"); + lifecycle.transition(inst, "running"); + expect(inst.lastBrokenState).toBeUndefined(); + }); +}); diff --git a/test/unit/registry-tool-enumeration-resilience.test.ts b/test/unit/registry-tool-enumeration-resilience.test.ts index a70356ee..68808ab6 100644 --- a/test/unit/registry-tool-enumeration-resilience.test.ts +++ b/test/unit/registry-tool-enumeration-resilience.test.ts @@ -1,4 +1,5 @@ -import { describe, expect, test } from "bun:test"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { log } from "../../src/cli/log.ts"; import { textContent } from "../../src/engine/content-helpers.ts"; import type { ToolResult } from "../../src/engine/types.ts"; import { ToolRegistry } from "../../src/tools/registry.ts"; @@ -77,3 +78,52 @@ describe("ToolRegistry.availableTools — error containment", () => { await expect(registry.availableTools()).resolves.toEqual([]); }); }); + +/** + * Issue #194: the per-skip log must not be `log.warn`. A connector + * stuck in a broken state would otherwise emit one warn per + * `availableTools()` call — once per chat turn — flooding operator + * stderr and any alerting that hooks the warn rate. The operator + * signal lives at the lifecycle transition site (one warn per + * non-broken → broken edge); this site is per-turn and must stay + * silent at warn level. + */ +describe("ToolRegistry.availableTools — log noise", () => { + let warnCalls: string[]; + let originalWarn: (msg: string) => void; + + beforeEach(() => { + warnCalls = []; + originalWarn = log.warn; + log.warn = (msg) => { + warnCalls.push(msg); + }; + }); + + afterEach(() => { + log.warn = originalWarn; + }); + + test("repeated enumeration of a broken source emits zero warns", async () => { + const registry = new ToolRegistry(); + registry.addSource(new BrokenSource()); + + for (let i = 0; i < 100; i++) { + await registry.availableTools(); + } + + expect(warnCalls.length).toBe(0); + }); + + test("mixed healthy + broken source enumeration also stays quiet at warn", async () => { + const registry = new ToolRegistry(); + registry.addSource(new HealthySource()); + registry.addSource(new BrokenSource()); + + for (let i = 0; i < 25; i++) { + await registry.availableTools(); + } + + expect(warnCalls.length).toBe(0); + }); +});