From 5811f8d5a3327e07e126109875c8ba9256504454 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Thu, 28 May 2026 02:37:59 +0000 Subject: [PATCH 1/9] fix(cli): emit docker_unreachable header upfront in sandbox status Signed-off-by: Tinson Lai --- src/lib/actions/sandbox/status.test.ts | 38 ++++++++- src/lib/actions/sandbox/status.ts | 42 ++++++++-- test/cli.test.ts | 108 +++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 6 deletions(-) diff --git a/src/lib/actions/sandbox/status.test.ts b/src/lib/actions/sandbox/status.test.ts index b75864a586..7508dea609 100644 --- a/src/lib/actions/sandbox/status.test.ts +++ b/src/lib/actions/sandbox/status.test.ts @@ -4,7 +4,10 @@ import { describe, expect, it } from "vitest"; import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health"; -import { getSandboxStatusInferenceHealth } from "../../../../dist/lib/actions/sandbox/status"; +import { + getSandboxStatusInferenceHealth, + isDockerDaemonUnreachableForStatus, +} from "../../../../dist/lib/actions/sandbox/status"; describe("sandbox status inference health", () => { it("passes the current model with the current provider", () => { @@ -50,3 +53,36 @@ describe("sandbox status inference health", () => { expect(called).toBe(false); }); }); + +describe("isDockerDaemonUnreachableForStatus", () => { + it("returns false when sandbox entry is null", () => { + expect(isDockerDaemonUnreachableForStatus(null, () => false)).toBe(false); + }); + + it("returns false when the openshell driver is not docker", () => { + expect( + isDockerDaemonUnreachableForStatus( + { name: "alpha", openshellDriver: "vm" } as never, + () => false, + ), + ).toBe(false); + }); + + it("returns true when driver is docker and the probe reports unreachable", () => { + expect( + isDockerDaemonUnreachableForStatus( + { name: "alpha", openshellDriver: "docker" } as never, + () => false, + ), + ).toBe(true); + }); + + it("returns false when driver is docker and the probe reports reachable", () => { + expect( + isDockerDaemonUnreachableForStatus( + { name: "alpha", openshellDriver: "docker" } as never, + () => true, + ), + ).toBe(false); + }); +}); diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index 1592249dc2..817d4e50ab 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 +import { dockerInfo } from "../../adapters/docker/info"; import { detectOpenShellStateRpcResultIssue, printOpenShellStateRpcIssue, @@ -111,10 +112,31 @@ async function printGatewayFailureLayerHeader(sandboxName: string): Promise boolean; + +const defaultDockerInfoProbe: DockerInfoProbe = () => + dockerInfo({ ignoreError: true, timeout: DOCKER_INFO_PROBE_TIMEOUT_MS }).length > 0; + +export function isDockerDaemonUnreachableForStatus( + sb: registry.SandboxEntry | null, + probe: DockerInfoProbe = defaultDockerInfoProbe, +): boolean { + if (!sb || sb.openshellDriver !== "docker") return false; + return !probe(); +} + // eslint-disable-next-line complexity export async function showSandboxStatus(sandboxName: string): Promise { const sb = registry.getSandbox(sandboxName); maybeEnsureHermesToolGatewayBroker(sb); + // #4313: surface docker_unreachable header upfront when the host Docker + // daemon is stopped on a docker-driver sandbox. Without this, the cached + // sandbox metadata renders as a "healthy" report even though the local + // container stack is down, and the Inference probe hits the remote provider + // directly so it falsely shows healthy too. + const dockerUnreachable = isDockerDaemonUnreachableForStatus(sb); // #2666: never let an unexpected throw from the gateway probe (e.g. openshell // hanging when its container is stopped and the published port is held by a // foreign listener) suppress the sandbox header. The downstream switch @@ -154,11 +176,17 @@ export async function showSandboxStatus(sandboxName: string): Promise { liveResult && !isCommandTimeout(liveResult) ? parseGatewayInference(liveResult.output) : null; const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; - const inferenceHealth = getSandboxStatusInferenceHealth( - lookup.state === "present", - currentProvider, - currentModel, - ); + // When docker is unreachable on a docker-driver sandbox, host-side probes + // misrepresent the sandbox state — the remote-provider reachability check + // doesn't go through the local stack. Suppress the probe so the output + // doesn't conflict with the failure-layer header. + const inferenceHealth = dockerUnreachable + ? null + : getSandboxStatusInferenceHealth( + lookup.state === "present", + currentProvider, + currentModel, + ); // #3265 optional 3rd line: probe the full inference chain (openclaw gateway // → auth proxy → backend) from inside the sandbox so a broken hop the // host-side probes can't see still surfaces in `status`. @@ -183,6 +211,10 @@ export async function showSandboxStatus(sandboxName: string): Promise { } if (sb) { console.log(""); + if (dockerUnreachable) { + console.log(` ${getLayerHeader("docker_unreachable")}`); + process.exitCode = 1; + } console.log(` Sandbox: ${sb.name}`); console.log(` Model: ${currentModel}`); console.log(` Provider: ${currentProvider}`); diff --git a/test/cli.test.ts b/test/cli.test.ts index 5499e61f84..26aab031d7 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -943,6 +943,114 @@ describe("CLI dispatch", () => { }); }); + it("sandbox status surfaces docker_unreachable header and suppresses stale Inference probe", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-docker-unreachable-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + ["#!/usr/bin/env bash", "exit 1"].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect(r.out).toContain( + "Failure layer: docker_unreachable — Docker daemon is not reachable.", + ); + expect(r.out).not.toContain("Inference: healthy"); + const headerIdx = r.out.indexOf("Failure layer: docker_unreachable"); + const sandboxIdx = r.out.indexOf("Sandbox: alpha"); + expect(headerIdx).toBeGreaterThanOrEqual(0); + expect(sandboxIdx).toBeGreaterThan(headerIdx); + }); + + it("sandbox status preserves Inference probe when openshellDriver is not docker", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-non-docker-driver-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "vm", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + ["#!/usr/bin/env bash", "exit 1"].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.out).not.toContain( + "Failure layer: docker_unreachable", + ); + }); + it("status rejects unknown flags through current dispatch path", () => { const r = run("status --bogus"); expect(r.code).toBe(2); From 2da0d96682cf4f98ce9e44d2a5c7dca4c73bc331 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Thu, 28 May 2026 03:01:00 +0000 Subject: [PATCH 2/9] fix(cli): print docker_unreachable header verbatim and dedupe in status Signed-off-by: Tinson Lai --- docs/reference/commands.mdx | 2 + .../sandbox/gateway-failure-classifier.ts | 4 ++ src/lib/actions/sandbox/status.ts | 49 ++++++++++--------- test/cli.test.ts | 18 ++++--- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx index 85451dca94..90f33ee28f 100644 --- a/docs/reference/commands.mdx +++ b/docs/reference/commands.mdx @@ -396,6 +396,8 @@ Use that line to distinguish a healthy backend from a broken proxy path that the For cloud-only providers, the output omits the NIM status line unless a NIM container is registered or an unexpected NIM container is running. +When the sandbox's recorded driver is `docker` and the host Docker daemon is not reachable, the command prints `Failure layer: docker_unreachable — Docker daemon is not reachable.` as the first line of stdout, suppresses the host-side `Inference` probe (which otherwise hits the remote provider directly and is misleading when the local stack is down), and exits with a non-zero status. + If the sandbox or gateway cannot be verified, the command exits non-zero instead of reporting healthy inference from stale registry state. Gateway and dashboard health checks treat HTTP `401` from device auth as a live service, not as an offline gateway. diff --git a/src/lib/actions/sandbox/gateway-failure-classifier.ts b/src/lib/actions/sandbox/gateway-failure-classifier.ts index dff2e3971f..a85481013a 100644 --- a/src/lib/actions/sandbox/gateway-failure-classifier.ts +++ b/src/lib/actions/sandbox/gateway-failure-classifier.ts @@ -34,6 +34,10 @@ function defaultDockerInfo(): boolean { return dockerInfo({ ignoreError: true, timeout: DOCKER_TIMEOUT_MS }).length > 0; } +export function isDockerDaemonReachable(): boolean { + return defaultDockerInfo(); +} + function dockerContainerListed(container: string, allFlag: boolean): boolean { const args = ["ps"]; if (allFlag) args.push("-a"); diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index 817d4e50ab..4fe22410ff 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 -import { dockerInfo } from "../../adapters/docker/info"; import { detectOpenShellStateRpcResultIssue, printOpenShellStateRpcIssue, @@ -34,7 +33,11 @@ import { getActiveSandboxSessions, } from "../../state/sandbox-session"; import { getSandboxDockerHealth } from "./docker-health"; -import { classifyGatewayFailure, getLayerHeader } from "./gateway-failure-classifier"; +import { + classifyGatewayFailure, + getLayerHeader, + isDockerDaemonReachable, +} from "./gateway-failure-classifier"; import type { SandboxGatewayState } from "./gateway-state"; import { getReconciledSandboxGatewayState, @@ -107,21 +110,22 @@ function maybeEnsureHermesToolGatewayBroker(sb: registry.SandboxEntry | null): v } } -async function printGatewayFailureLayerHeader(sandboxName: string): Promise { +async function printGatewayFailureLayerHeader( + sandboxName: string, + alreadyPrintedDockerUnreachable = false, +): Promise { const failure = await classifyGatewayFailure(sandboxName); + if (alreadyPrintedDockerUnreachable && failure.layer === "docker_unreachable") { + return; + } console.log(` ${getLayerHeader(failure.layer)}`); } -const DOCKER_INFO_PROBE_TIMEOUT_MS = 3000; - type DockerInfoProbe = () => boolean; -const defaultDockerInfoProbe: DockerInfoProbe = () => - dockerInfo({ ignoreError: true, timeout: DOCKER_INFO_PROBE_TIMEOUT_MS }).length > 0; - export function isDockerDaemonUnreachableForStatus( sb: registry.SandboxEntry | null, - probe: DockerInfoProbe = defaultDockerInfoProbe, + probe: DockerInfoProbe = isDockerDaemonReachable, ): boolean { if (!sb || sb.openshellDriver !== "docker") return false; return !probe(); @@ -131,12 +135,17 @@ export function isDockerDaemonUnreachableForStatus( export async function showSandboxStatus(sandboxName: string): Promise { const sb = registry.getSandbox(sandboxName); maybeEnsureHermesToolGatewayBroker(sb); - // #4313: surface docker_unreachable header upfront when the host Docker - // daemon is stopped on a docker-driver sandbox. Without this, the cached - // sandbox metadata renders as a "healthy" report even though the local - // container stack is down, and the Inference probe hits the remote provider - // directly so it falsely shows healthy too. + // When the host Docker daemon is stopped on a docker-driver sandbox, the + // cached sandbox metadata renders as a "healthy" report even though the + // local container stack is down, and the host-side Inference probe hits the + // remote provider directly so it falsely shows healthy too. Probe the + // daemon once upfront so the failure-layer header is the first thing the + // user sees and the misleading probe is suppressed. const dockerUnreachable = isDockerDaemonUnreachableForStatus(sb); + if (dockerUnreachable) { + console.log(getLayerHeader("docker_unreachable")); + process.exitCode = 1; + } // #2666: never let an unexpected throw from the gateway probe (e.g. openshell // hanging when its container is stopped and the published port is held by a // foreign listener) suppress the sandbox header. The downstream switch @@ -211,10 +220,6 @@ export async function showSandboxStatus(sandboxName: string): Promise { } if (sb) { console.log(""); - if (dockerUnreachable) { - console.log(` ${getLayerHeader("docker_unreachable")}`); - process.exitCode = 1; - } console.log(` Sandbox: ${sb.name}`); console.log(` Model: ${currentModel}`); console.log(` Provider: ${currentProvider}`); @@ -326,7 +331,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { if (guard.state === "connected_other") { printWrongGatewayActiveGuidance(sandboxName, guard.activeGateway, console.log); } else { - await printGatewayFailureLayerHeader(sandboxName); + await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); printGatewayLifecycleHint(guard.status || "", sandboxName, console.log); } } else { @@ -360,7 +365,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { process.exit(1); } else if (lookup.state === "gateway_unreachable_after_restart") { console.log(""); - await printGatewayFailureLayerHeader(sandboxName); + await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); console.log( ` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`, ); @@ -376,7 +381,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { process.exit(1); } else if (lookup.state === "gateway_missing_after_restart") { console.log(""); - await printGatewayFailureLayerHeader(sandboxName); + await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); console.log( ` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`, ); @@ -396,7 +401,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { if (lookup.output) { console.log(lookup.output); } - await printGatewayFailureLayerHeader(sandboxName); + await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); printGatewayLifecycleHint(lookup.output, sandboxName, console.log); process.exit(1); } diff --git a/test/cli.test.ts b/test/cli.test.ts index 26aab031d7..bc557c55c5 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -990,17 +990,20 @@ describe("CLI dispatch", () => { }); expect(r.code).toBe(1); - expect(r.out).toContain( + expect(r.out.startsWith( "Failure layer: docker_unreachable — Docker daemon is not reachable.", - ); + )).toBe(true); expect(r.out).not.toContain("Inference: healthy"); const headerIdx = r.out.indexOf("Failure layer: docker_unreachable"); const sandboxIdx = r.out.indexOf("Sandbox: alpha"); expect(headerIdx).toBeGreaterThanOrEqual(0); expect(sandboxIdx).toBeGreaterThan(headerIdx); + expect( + (r.out.match(/Failure layer: docker_unreachable/g) || []).length, + ).toBe(1); }); - it("sandbox status preserves Inference probe when openshellDriver is not docker", () => { + it("sandbox status preserves Inference probe and exits 0 when openshellDriver is not docker", () => { const home = fs.mkdtempSync( path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-non-docker-driver-"), ); @@ -1046,9 +1049,12 @@ describe("CLI dispatch", () => { PATH: `${localBin}:${process.env.PATH || ""}`, }); - expect(r.out).not.toContain( - "Failure layer: docker_unreachable", - ); + expect(r.code).toBe(0); + expect(r.out).not.toContain("Failure layer: docker_unreachable"); + expect(r.out).toContain("Sandbox: alpha"); + expect(r.out).toContain("Provider: openai-api"); + expect(r.out).toContain("Model: gpt-4o-mini"); + expect(r.out).toMatch(/Inference:/); }); it("status rejects unknown flags through current dispatch path", () => { From 5a98bdf9b0c7b04e1a5c7430d10212014a210b71 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 30 May 2026 06:27:22 +0000 Subject: [PATCH 3/9] fix(cli): surface sandbox container stopped and dashboard port conflict in status Signed-off-by: Tinson Lai --- docs/reference/commands.mdx | 3 + .../sandbox/gateway-failure-classifier.ts | 91 ++++++++++- src/lib/actions/sandbox/status.test.ts | 66 ++++++++ src/lib/actions/sandbox/status.ts | 46 +++++- test/cli.test.ts | 152 ++++++++++++++++++ test/gateway-failure-classifier.test.ts | 140 ++++++++++++++++ 6 files changed, 491 insertions(+), 7 deletions(-) diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx index b26a070014..e170876237 100644 --- a/docs/reference/commands.mdx +++ b/docs/reference/commands.mdx @@ -427,6 +427,9 @@ For cloud-only providers, the output omits the NIM status line unless a NIM cont When the sandbox's recorded driver is `docker` and the host Docker daemon is not reachable, the command prints `Failure layer: docker_unreachable — Docker daemon is not reachable.` as the first line of stdout, suppresses the host-side `Inference` probe (which otherwise hits the remote provider directly and is misleading when the local stack is down), and exits with a non-zero status. +When the host Docker daemon is reachable but the per-sandbox container is stopped, the command prints `Failure layer: sandbox_container_stopped — sandbox container exists but is not running.` as the first line of stdout, suppresses the host-side `Inference` probe, and exits with a non-zero status. +If the sandbox's recorded dashboard port is also held by a foreign listener, the header escalates to `Failure layer: sandbox_dashboard_port_conflict — sandbox container is stopped and the dashboard port is held by a foreign listener.` so the operator can recover the port before restarting the sandbox. + If the sandbox or gateway cannot be verified, the command exits non-zero instead of reporting healthy inference from stale registry state. Gateway and dashboard health checks treat HTTP `401` from device auth as a live service, not as an offline gateway. diff --git a/src/lib/actions/sandbox/gateway-failure-classifier.ts b/src/lib/actions/sandbox/gateway-failure-classifier.ts index a85481013a..8b563deadd 100644 --- a/src/lib/actions/sandbox/gateway-failure-classifier.ts +++ b/src/lib/actions/sandbox/gateway-failure-classifier.ts @@ -16,7 +16,9 @@ export type GatewayFailureLayer = | "container_missing" | "container_exited_port_conflict" | "container_exited" - | "gateway_unreachable"; + | "gateway_unreachable" + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; export type GatewayFailureResult = { layer: GatewayFailureLayer; @@ -30,6 +32,21 @@ export type GatewayFailureRunners = { portProbe: (port: number) => Promise; }; +export type SandboxContainerFailureLayer = + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; + +export type SandboxContainerFailureResult = { + layer: SandboxContainerFailureLayer; + detail: string; +}; + +export type SandboxContainerFailureRunners = { + listAllContainerNames: () => string; + listRunningContainerNames: () => string; + portProbe: (port: number) => Promise; +}; + function defaultDockerInfo(): boolean { return dockerInfo({ ignoreError: true, timeout: DOCKER_TIMEOUT_MS }).length > 0; } @@ -131,8 +148,80 @@ const LAYER_HEADERS: Record = { container_exited: "Failure layer: container_exited — container exited.", gateway_unreachable: "Failure layer: gateway_unreachable — container running but gateway API unresponsive.", + sandbox_container_stopped: + "Failure layer: sandbox_container_stopped — sandbox container exists but is not running.", + sandbox_dashboard_port_conflict: + "Failure layer: sandbox_dashboard_port_conflict — sandbox container is stopped and the dashboard port is held by a foreign listener.", }; export function getLayerHeader(layer: GatewayFailureLayer): string { return LAYER_HEADERS[layer]; } + +function defaultListAllContainerNames(): string { + return dockerCapture(["ps", "-a", "--format", "{{.Names}}"], { + ignoreError: true, + timeout: DOCKER_TIMEOUT_MS, + }); +} + +function defaultListRunningContainerNames(): string { + return dockerCapture(["ps", "--format", "{{.Names}}"], { + ignoreError: true, + timeout: DOCKER_TIMEOUT_MS, + }); +} + +const defaultSandboxContainerRunners: SandboxContainerFailureRunners = { + listAllContainerNames: defaultListAllContainerNames, + listRunningContainerNames: defaultListRunningContainerNames, + portProbe: defaultPortProbe, +}; + +function sandboxContainerNamePrefix(sandboxName: string): string { + return `openshell-${sandboxName}`; +} + +function findSandboxContainerName( + names: string, + sandboxName: string, +): string | null { + const prefix = sandboxContainerNamePrefix(sandboxName); + for (const line of names.split("\n")) { + const trimmed = line.trim(); + if (!trimmed) continue; + if (trimmed === prefix || trimmed.startsWith(`${prefix}-`)) return trimmed; + } + return null; +} + +export async function classifySandboxContainerFailure( + sandboxName: string, + opts: { + dashboardPort?: number | null; + runners?: SandboxContainerFailureRunners; + } = {}, +): Promise { + const runners = opts.runners ?? defaultSandboxContainerRunners; + const running = findSandboxContainerName( + runners.listRunningContainerNames(), + sandboxName, + ); + if (running) return null; + const present = findSandboxContainerName( + runners.listAllContainerNames(), + sandboxName, + ); + if (!present) return null; + const dashboardPort = opts.dashboardPort ?? null; + if (dashboardPort && (await runners.portProbe(dashboardPort))) { + return { + layer: "sandbox_dashboard_port_conflict", + detail: `Sandbox container '${present}' is stopped and dashboard port ${dashboardPort} is held by another process.`, + }; + } + return { + layer: "sandbox_container_stopped", + detail: `Sandbox container '${present}' exists but is not running.`, + }; +} diff --git a/src/lib/actions/sandbox/status.test.ts b/src/lib/actions/sandbox/status.test.ts index 7508dea609..c992b5a91d 100644 --- a/src/lib/actions/sandbox/status.test.ts +++ b/src/lib/actions/sandbox/status.test.ts @@ -5,6 +5,7 @@ import { describe, expect, it } from "vitest"; import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health"; import { + classifySandboxContainerFailureForStatus, getSandboxStatusInferenceHealth, isDockerDaemonUnreachableForStatus, } from "../../../../dist/lib/actions/sandbox/status"; @@ -86,3 +87,68 @@ describe("isDockerDaemonUnreachableForStatus", () => { ).toBe(false); }); }); + +describe("classifySandboxContainerFailureForStatus", () => { + it("returns null when sandbox entry is null", async () => { + const probe = async () => { + throw new Error("probe should not be invoked"); + }; + await expect( + classifySandboxContainerFailureForStatus(null, probe), + ).resolves.toBeNull(); + }); + + it("returns null when the openshell driver is not docker", async () => { + let called = false; + const probe = async () => { + called = true; + return null; + }; + await expect( + classifySandboxContainerFailureForStatus( + { name: "alpha", openshellDriver: "vm" } as never, + probe, + ), + ).resolves.toBeNull(); + expect(called).toBe(false); + }); + + it("forwards the sandbox name and dashboard port to the probe and propagates its verdict", async () => { + const observed: { sandboxName: string; port: number | null }[] = []; + const probe = async (sandboxName: string, dashboardPort: number | null) => { + observed.push({ sandboxName, port: dashboardPort }); + return { + layer: "sandbox_dashboard_port_conflict" as const, + detail: "stub failure", + }; + }; + const result = await classifySandboxContainerFailureForStatus( + { + name: "alpha", + openshellDriver: "docker", + dashboardPort: 18900, + } as never, + probe, + ); + expect(result).toEqual({ + layer: "sandbox_dashboard_port_conflict", + detail: "stub failure", + }); + expect(observed).toEqual([{ sandboxName: "alpha", port: 18900 }]); + }); + + it("passes null when the sandbox entry has no dashboard port recorded", async () => { + const observed: { sandboxName: string; port: number | null }[] = []; + const probe = async (sandboxName: string, dashboardPort: number | null) => { + observed.push({ sandboxName, port: dashboardPort }); + return null; + }; + await expect( + classifySandboxContainerFailureForStatus( + { name: "alpha", openshellDriver: "docker" } as never, + probe, + ), + ).resolves.toBeNull(); + expect(observed).toEqual([{ sandboxName: "alpha", port: null }]); + }); +}); diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index 73347b0182..ab075451aa 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -35,8 +35,10 @@ import { import { getSandboxDockerHealth } from "./docker-health"; import { classifyGatewayFailure, + classifySandboxContainerFailure, getLayerHeader, isDockerDaemonReachable, + type SandboxContainerFailureResult, } from "./gateway-failure-classifier"; import type { SandboxGatewayState } from "./gateway-state"; import { @@ -259,6 +261,24 @@ export function isDockerDaemonUnreachableForStatus( return !probe(); } +type SandboxContainerFailureProbe = ( + sandboxName: string, + dashboardPort: number | null, +) => Promise; + +const defaultSandboxContainerFailureProbe: SandboxContainerFailureProbe = ( + sandboxName, + dashboardPort, +) => classifySandboxContainerFailure(sandboxName, { dashboardPort }); + +export async function classifySandboxContainerFailureForStatus( + sb: registry.SandboxEntry | null, + probe: SandboxContainerFailureProbe = defaultSandboxContainerFailureProbe, +): Promise { + if (!sb || sb.openshellDriver !== "docker") return null; + return probe(sb.name, sb.dashboardPort ?? null); +} + // eslint-disable-next-line complexity export async function showSandboxStatus(sandboxName: string): Promise { // When the host Docker daemon is stopped on a docker-driver sandbox, the @@ -267,11 +287,24 @@ export async function showSandboxStatus(sandboxName: string): Promise { // remote provider directly so it falsely shows healthy too. Probe the // daemon once upfront so the failure-layer header is the first thing the // user sees and the misleading probe is suppressed. - const dockerUnreachable = isDockerDaemonUnreachableForStatus(registry.getSandbox(sandboxName)); + const sandboxEntryEarly = registry.getSandbox(sandboxName); + const dockerUnreachable = isDockerDaemonUnreachableForStatus(sandboxEntryEarly); if (dockerUnreachable) { console.log(getLayerHeader("docker_unreachable")); process.exitCode = 1; } + // #4515: when the per-sandbox container is stopped (and optionally its + // dashboard port is held by a foreign listener) the host-side Inference + // probe still hits the remote provider directly and falsely shows healthy. + // Probe the sandbox container upfront so the failure layer is the first + // user-visible signal and the misleading Inference line is suppressed. + const sandboxFailure = dockerUnreachable + ? null + : await classifySandboxContainerFailureForStatus(sandboxEntryEarly); + if (sandboxFailure) { + console.log(getLayerHeader(sandboxFailure.layer)); + process.exitCode = 1; + } // #2666: never let an unexpected throw from the gateway probe (e.g. openshell // hanging when its container is stopped and the published port is held by a // foreign listener) suppress the sandbox header. The downstream switch @@ -279,11 +312,12 @@ export async function showSandboxStatus(sandboxName: string): Promise { // synthesized fallback keeps the user-visible contract intact. const snapshot = await collectSandboxStatusSnapshot(sandboxName); const { sb, lookup, rpcIssue, currentModel, currentProvider } = snapshot; - // When docker is unreachable on a docker-driver sandbox, host-side probes - // misrepresent the sandbox state — the remote-provider reachability check - // doesn't go through the local stack. Suppress the probe so the output - // doesn't conflict with the failure-layer header. - const inferenceHealth = dockerUnreachable ? null : snapshot.inferenceHealth; + // When docker is unreachable or the sandbox container itself is stopped, + // host-side probes misrepresent the sandbox state — the remote-provider + // reachability check doesn't go through the local stack. Suppress the probe + // so the output doesn't conflict with the failure-layer header. + const inferenceHealth = + dockerUnreachable || sandboxFailure ? null : snapshot.inferenceHealth; maybeEnsureHermesToolGatewayBroker(sb); if (rpcIssue) { printOpenShellStateRpcIssue(rpcIssue, { diff --git a/test/cli.test.ts b/test/cli.test.ts index 8acf34dcdc..d2d45feec4 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -5,6 +5,7 @@ import { describe, it, expect } from "vitest"; import { execSync, spawn, spawnSync } from "node:child_process"; import type { ChildProcess } from "node:child_process"; import fs from "node:fs"; +import net from "node:net"; import os from "node:os"; import path from "node:path"; @@ -1057,6 +1058,157 @@ describe("CLI dispatch", () => { expect(r.out).toMatch(/Inference:/); }); + it("sandbox status surfaces sandbox_container_stopped when the per-sandbox container exists but is not running", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-container-stopped-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect( + r.out.startsWith( + "Failure layer: sandbox_container_stopped — sandbox container exists but is not running.", + ), + ).toBe(true); + expect(r.out).not.toContain("Inference: healthy"); + expect(r.out).not.toContain("Failure layer: docker_unreachable"); + expect(r.out).not.toContain("Failure layer: sandbox_dashboard_port_conflict"); + const headerIdx = r.out.indexOf("Failure layer: sandbox_container_stopped"); + const sandboxIdx = r.out.indexOf("Sandbox: alpha"); + expect(headerIdx).toBeGreaterThanOrEqual(0); + expect(sandboxIdx).toBeGreaterThan(headerIdx); + }); + + it("sandbox status surfaces sandbox_dashboard_port_conflict when the sandbox container is stopped and the dashboard port is held by a foreign listener", async () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-port-conflict-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + + const server = net.createServer(); + await new Promise((resolve, reject) => { + server.once("error", reject); + server.listen(0, "127.0.0.1", () => resolve()); + }); + const address = server.address(); + if (!address || typeof address === "string") { + server.close(); + throw new Error("failed to bind foreign listener on a free port"); + } + const dashboardPort = address.port; + + try { + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + dashboardPort, + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect( + r.out.startsWith( + "Failure layer: sandbox_dashboard_port_conflict — sandbox container is stopped and the dashboard port is held by a foreign listener.", + ), + ).toBe(true); + expect(r.out).not.toContain("Inference: healthy"); + expect(r.out).not.toContain("Failure layer: sandbox_container_stopped —"); + const headerIdx = r.out.indexOf("Failure layer: sandbox_dashboard_port_conflict"); + const sandboxIdx = r.out.indexOf("Sandbox: alpha"); + expect(headerIdx).toBeGreaterThanOrEqual(0); + expect(sandboxIdx).toBeGreaterThan(headerIdx); + } finally { + await new Promise((resolve) => server.close(() => resolve())); + } + }); + it("sandbox status --json emits structured per-sandbox report", () => { const home = fs.mkdtempSync( path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-"), diff --git a/test/gateway-failure-classifier.test.ts b/test/gateway-failure-classifier.test.ts index 6420e3c97a..50d386e0b3 100644 --- a/test/gateway-failure-classifier.test.ts +++ b/test/gateway-failure-classifier.test.ts @@ -5,8 +5,10 @@ import { describe, expect, it } from "vitest"; import { classifyGatewayFailure, + classifySandboxContainerFailure, getLayerHeader, type GatewayFailureRunners, + type SandboxContainerFailureRunners, } from "../dist/lib/actions/sandbox/gateway-failure-classifier.js"; function makeRunners(overrides: Partial = {}): GatewayFailureRunners { @@ -128,5 +130,143 @@ describe("getLayerHeader", () => { ); expect(getLayerHeader("container_exited")).toContain("container_exited"); expect(getLayerHeader("gateway_unreachable")).toContain("gateway_unreachable"); + expect(getLayerHeader("sandbox_container_stopped")).toContain( + "sandbox_container_stopped", + ); + expect(getLayerHeader("sandbox_dashboard_port_conflict")).toContain( + "sandbox_dashboard_port_conflict", + ); + }); +}); + +function makeSandboxRunners( + overrides: Partial = {}, +): SandboxContainerFailureRunners { + return { + listAllContainerNames: () => "", + listRunningContainerNames: () => "", + portProbe: async () => false, + ...overrides, + }; +} + +describe("classifySandboxContainerFailure", () => { + it("returns null when the sandbox container is running", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listRunningContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-cluster-nemoclaw", + listAllContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-cluster-nemoclaw", + }), + }); + expect(result).toBeNull(); + }); + + it("returns null when the sandbox container is not present anywhere", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-cluster-nemoclaw\n", + }), + }); + expect(result).toBeNull(); + }); + + it("returns sandbox_container_stopped when the container exists but is not running and no port is recorded", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-cluster-nemoclaw", + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(result?.detail).toContain("openshell-my-assistant-7616dcb1"); + }); + + it("returns sandbox_container_stopped when the container exists, is stopped, and the dashboard port is free", async () => { + let probedPort: number | null = null; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async (port) => { + probedPort = port; + return false; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(probedPort).toBe(18789); + }); + + it("returns sandbox_dashboard_port_conflict when the container exists, is stopped, and the dashboard port is held", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => true, + }), + }); + expect(result?.layer).toBe("sandbox_dashboard_port_conflict"); + expect(result?.detail).toContain("18789"); + expect(result?.detail).toContain("openshell-my-assistant-7616dcb1"); + }); + + it("does not probe the port when the container is running", async () => { + let portProbeCalled = false; + await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listRunningContainerNames: () => "openshell-my-assistant-7616dcb1", + listAllContainerNames: () => "openshell-my-assistant-7616dcb1", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(portProbeCalled).toBe(false); + }); + + it("does not probe the port when the container is not present at all", async () => { + let portProbeCalled = false; + await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-cluster-nemoclaw\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(portProbeCalled).toBe(false); + }); + + it("matches the exact prefix and the uuid-suffixed shape but not unrelated containers", async () => { + const exactResult = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant\n", + }), + }); + expect(exactResult?.layer).toBe("sandbox_container_stopped"); + expect(exactResult?.detail).toContain("openshell-my-assistant"); + + const otherSandbox = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-evil\nopenshell-different-sandbox-abc", + }), + }); + expect(otherSandbox?.layer).toBe("sandbox_container_stopped"); + expect(otherSandbox?.detail).toContain("openshell-my-assistant-evil"); + + const unrelated = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-cluster-nemoclaw\nopenshell-my-assistantextra\n", + }), + }); + expect(unrelated).toBeNull(); }); }); From b0b0fd448fe497af40c13a99f219d2a08cd74f37 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 30 May 2026 08:15:34 +0000 Subject: [PATCH 4/9] fix(cli): resolve sandbox container by longest-owner and validate dashboardPort Signed-off-by: Tinson Lai --- .../sandbox/gateway-failure-classifier.ts | 72 ++++++++++++--- src/lib/actions/sandbox/status.ts | 10 +-- test/gateway-failure-classifier.test.ts | 90 +++++++++++++++++-- 3 files changed, 151 insertions(+), 21 deletions(-) diff --git a/src/lib/actions/sandbox/gateway-failure-classifier.ts b/src/lib/actions/sandbox/gateway-failure-classifier.ts index 8b563deadd..c771bb16ac 100644 --- a/src/lib/actions/sandbox/gateway-failure-classifier.ts +++ b/src/lib/actions/sandbox/gateway-failure-classifier.ts @@ -6,6 +6,7 @@ import net from "node:net"; import { dockerInfo } from "../../adapters/docker/info"; import { dockerCapture } from "../../adapters/docker/run"; import { GATEWAY_PORT } from "../../core/ports"; +import * as registry from "../../state/registry"; const DEFAULT_CONTAINER = "openshell-cluster-nemoclaw"; const DOCKER_TIMEOUT_MS = 3000; @@ -44,6 +45,7 @@ export type SandboxContainerFailureResult = { export type SandboxContainerFailureRunners = { listAllContainerNames: () => string; listRunningContainerNames: () => string; + listSandboxNames: () => string[]; portProbe: (port: number) => Promise; }; @@ -172,29 +174,71 @@ function defaultListRunningContainerNames(): string { }); } +function defaultListSandboxNames(): string[] { + try { + return registry.listSandboxes().sandboxes.map((entry) => entry.name); + } catch { + return []; + } +} + const defaultSandboxContainerRunners: SandboxContainerFailureRunners = { listAllContainerNames: defaultListAllContainerNames, listRunningContainerNames: defaultListRunningContainerNames, + listSandboxNames: defaultListSandboxNames, portProbe: defaultPortProbe, }; -function sandboxContainerNamePrefix(sandboxName: string): string { - return `openshell-${sandboxName}`; -} - +// OpenShell names sandbox containers either as `openshell-` (no +// suffix) or `openshell--`, where `` is appended by +// openshell at runtime. Two prefix collisions are possible: +// +// 1. A sandbox name can be a prefix of another sandbox name +// (`my` vs `my-assistant`). +// 2. Even with a hyphen-free ``, a sandbox name can be a prefix +// of another sandbox name whose own suffix is hyphen-free +// (`my-assistant` vs `my-assistant-prod`). +// +// Resolve each candidate to the LONGEST registered sandbox name it could +// belong to and only accept candidates that resolve to the sandbox we are +// looking up. The exact-name form is preferred before suffixed forms so +// that an `openshell-` container always wins over an unrelated +// `openshell--` co-tenant. Mirrors the resolver in +// `docker-health.ts`. function findSandboxContainerName( names: string, sandboxName: string, + registeredSandboxNames: readonly string[], ): string | null { - const prefix = sandboxContainerNamePrefix(sandboxName); - for (const line of names.split("\n")) { - const trimmed = line.trim(); - if (!trimmed) continue; - if (trimmed === prefix || trimmed.startsWith(`${prefix}-`)) return trimmed; + const ourPrefix = `openshell-${sandboxName}-`; + const ourExact = `openshell-${sandboxName}`; + const known = new Set(registeredSandboxNames); + known.add(sandboxName); + const candidates = names + .split("\n") + .map((line) => line.trim()) + .filter((line) => line === ourExact || line.startsWith(ourPrefix)); + if (candidates.includes(ourExact)) return ourExact; + const knownArr = [...known]; + for (const candidate of candidates) { + const stripped = candidate.replace(/^openshell-/, ""); + const owner = knownArr + .filter((name) => stripped === name || stripped.startsWith(`${name}-`)) + .sort((a, b) => b.length - a.length)[0]; + if (owner === sandboxName) return candidate; } return null; } +function isValidDashboardPort(port: number | null | undefined): port is number { + return ( + typeof port === "number" && + Number.isInteger(port) && + port >= 1 && + port <= 65535 + ); +} + export async function classifySandboxContainerFailure( sandboxName: string, opts: { @@ -203,18 +247,24 @@ export async function classifySandboxContainerFailure( } = {}, ): Promise { const runners = opts.runners ?? defaultSandboxContainerRunners; + const registeredSandboxNames = runners.listSandboxNames(); const running = findSandboxContainerName( runners.listRunningContainerNames(), sandboxName, + registeredSandboxNames, ); if (running) return null; const present = findSandboxContainerName( runners.listAllContainerNames(), sandboxName, + registeredSandboxNames, ); if (!present) return null; - const dashboardPort = opts.dashboardPort ?? null; - if (dashboardPort && (await runners.portProbe(dashboardPort))) { + const dashboardPort = opts.dashboardPort; + if ( + isValidDashboardPort(dashboardPort) && + (await runners.portProbe(dashboardPort)) + ) { return { layer: "sandbox_dashboard_port_conflict", detail: `Sandbox container '${present}' is stopped and dashboard port ${dashboardPort} is held by another process.`, diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index ab075451aa..aa41dfc913 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -293,11 +293,11 @@ export async function showSandboxStatus(sandboxName: string): Promise { console.log(getLayerHeader("docker_unreachable")); process.exitCode = 1; } - // #4515: when the per-sandbox container is stopped (and optionally its - // dashboard port is held by a foreign listener) the host-side Inference - // probe still hits the remote provider directly and falsely shows healthy. - // Probe the sandbox container upfront so the failure layer is the first - // user-visible signal and the misleading Inference line is suppressed. + // When the per-sandbox container is stopped (and optionally its dashboard + // port is held by a foreign listener) the host-side Inference probe still + // hits the remote provider directly and falsely shows healthy. Probe the + // sandbox container upfront so the failure layer is the first user-visible + // signal and the misleading Inference line is suppressed. const sandboxFailure = dockerUnreachable ? null : await classifySandboxContainerFailureForStatus(sandboxEntryEarly); diff --git a/test/gateway-failure-classifier.test.ts b/test/gateway-failure-classifier.test.ts index 50d386e0b3..c5aafc7b6c 100644 --- a/test/gateway-failure-classifier.test.ts +++ b/test/gateway-failure-classifier.test.ts @@ -145,6 +145,7 @@ function makeSandboxRunners( return { listAllContainerNames: () => "", listRunningContainerNames: () => "", + listSandboxNames: () => [], portProbe: async () => false, ...overrides, }; @@ -243,7 +244,7 @@ describe("classifySandboxContainerFailure", () => { expect(portProbeCalled).toBe(false); }); - it("matches the exact prefix and the uuid-suffixed shape but not unrelated containers", async () => { + it("matches the exact prefix and accepts uuid-suffixed shapes that resolve back to the queried sandbox", async () => { const exactResult = await classifySandboxContainerFailure("my-assistant", { runners: makeSandboxRunners({ listAllContainerNames: () => "openshell-my-assistant\n", @@ -252,14 +253,17 @@ describe("classifySandboxContainerFailure", () => { expect(exactResult?.layer).toBe("sandbox_container_stopped"); expect(exactResult?.detail).toContain("openshell-my-assistant"); - const otherSandbox = await classifySandboxContainerFailure("my-assistant", { + // `openshell-my-assistant-7616dcb1` belongs to `my-assistant` because no + // other registered sandbox name claims it via the longest-owner rule. + const uuidResult = await classifySandboxContainerFailure("my-assistant", { runners: makeSandboxRunners({ listAllContainerNames: () => - "openshell-my-assistant-evil\nopenshell-different-sandbox-abc", + "openshell-my-assistant-7616dcb1\nopenshell-different-sandbox-abc", + listSandboxNames: () => ["my-assistant", "different-sandbox"], }), }); - expect(otherSandbox?.layer).toBe("sandbox_container_stopped"); - expect(otherSandbox?.detail).toContain("openshell-my-assistant-evil"); + expect(uuidResult?.layer).toBe("sandbox_container_stopped"); + expect(uuidResult?.detail).toContain("openshell-my-assistant-7616dcb1"); const unrelated = await classifySandboxContainerFailure("my-assistant", { runners: makeSandboxRunners({ @@ -269,4 +273,80 @@ describe("classifySandboxContainerFailure", () => { }); expect(unrelated).toBeNull(); }); + + it("rejects a longer registered sandbox's container even when the literal prefix matches the queried name", async () => { + // `openshell-my-assistant-prod-7616dcb1` resolves to the longer + // `my-assistant-prod` sandbox via the longest-owner rule; the query for + // `my-assistant` must not consume it. Mirrors the docker-health.ts + // resolver and prevents the prefix-collision bug. + const collision = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-prod-7616dcb1\nopenshell-cluster-nemoclaw", + listSandboxNames: () => ["my-assistant", "my-assistant-prod"], + }), + }); + expect(collision).toBeNull(); + }); + + it("matches an `openshell-` exact container even when a co-tenant `openshell--` exists in the same listing", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-my-assistant", + listSandboxNames: () => ["my-assistant"], + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(result?.detail).toContain("openshell-my-assistant"); + expect(result?.detail).not.toContain("openshell-my-assistant-7616dcb1"); + }); + + it("ignores an out-of-range dashboardPort and falls back to sandbox_container_stopped", async () => { + let portProbeCalled = false; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 70000, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(portProbeCalled).toBe(false); + }); + + it("ignores a non-integer dashboardPort and falls back to sandbox_container_stopped", async () => { + let portProbeCalled = false; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789.5 as unknown as number, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(portProbeCalled).toBe(false); + }); + + it("ignores a zero dashboardPort and falls back to sandbox_container_stopped", async () => { + let portProbeCalled = false; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 0, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(portProbeCalled).toBe(false); + }); }); From 39ee9e45b509fef4e6cead1b280c0a96a12305f3 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 30 May 2026 09:12:23 +0000 Subject: [PATCH 5/9] fix(cli): suppress inference probe upfront on docker/sandbox failure layers and expose failureLayer to status --json Signed-off-by: Tinson Lai --- src/commands/sandbox/status.ts | 7 +- src/lib/actions/sandbox/docker-health.ts | 48 +--- .../sandbox/gateway-failure-classifier.ts | 45 +--- .../sandbox/sandbox-container-owner.ts | 46 ++++ src/lib/actions/sandbox/status.test.ts | 124 +++++++++ src/lib/actions/sandbox/status.ts | 141 +++++++--- test/cli.test.ts | 243 ++++++++++++++++++ test/sandbox-container-owner.test.ts | 88 +++++++ 8 files changed, 626 insertions(+), 116 deletions(-) create mode 100644 src/lib/actions/sandbox/sandbox-container-owner.ts create mode 100644 test/sandbox-container-owner.test.ts diff --git a/src/commands/sandbox/status.ts b/src/commands/sandbox/status.ts index d081d5842f..8110dba44e 100644 --- a/src/commands/sandbox/status.ts +++ b/src/commands/sandbox/status.ts @@ -28,7 +28,12 @@ export default class SandboxStatusCommand extends NemoClawCommand { const { args } = await this.parse(SandboxStatusCommand); if (this.jsonEnabled()) { const report = await getSandboxStatusReport(args.sandboxName); - if (!report.found || report.gatewayState !== "present" || report.rpcIssue) { + if ( + !report.found || + report.gatewayState !== "present" || + report.rpcIssue || + report.failureLayer + ) { process.exitCode = 1; } // #4310: route the machine-readable report through the centralized diff --git a/src/lib/actions/sandbox/docker-health.ts b/src/lib/actions/sandbox/docker-health.ts index 0ac55223b2..6d8a8a0400 100644 --- a/src/lib/actions/sandbox/docker-health.ts +++ b/src/lib/actions/sandbox/docker-health.ts @@ -4,6 +4,7 @@ import { dockerContainerInspectFormat } from "../../adapters/docker/inspect"; import { dockerCapture } from "../../adapters/docker/run"; import * as registry from "../../state/registry"; +import { resolveSandboxContainerOwner } from "./sandbox-container-owner"; export type DockerHealthState = | "healthy" @@ -48,48 +49,11 @@ function resolveDockerDriverSandboxContainer( } catch { return null; } - // OpenShell names sandbox containers either as `openshell-` - // (no suffix) or `openshell--` where is a runtime - // identifier appended by openshell. Two ambiguities to avoid: - // - // 1. A sandbox name can be a prefix of another sandbox name - // (`my` vs `my-assistant`). - // 2. Even with a hyphen-free suffix, a sandbox name can be a prefix - // of another sandbox name whose own suffix is hyphen-free - // (`my-assistant` vs `my-assistant-prod`). - // - // To disambiguate, resolve each candidate to the LONGEST registered - // sandbox name it could belong to. We only accept a candidate when - // that resolved owner is the sandbox we are looking up. This also - // gives the right answer for the `openshell-` exact form. - const ourPrefix = `openshell-${sandboxName}-`; - const ourExact = `openshell-${sandboxName}`; - const knownSandboxes = new Set(deps.listSandboxNames()); - knownSandboxes.add(sandboxName); - const candidates = deps - .dockerPsNames() - .split("\n") - .map((line) => line.trim()) - .filter((line) => line === ourExact || line.startsWith(ourPrefix)); - // Prefer the exact-name container before considering suffixed ones. - // Without this short-circuit, a suffixed `openshell--` whose - // is a docker runtime suffix (not a registered sandbox name) would - // resolve to our sandbox via the longest-match heuristic and beat the - // co-existing exact `openshell-` if it appeared earlier in - // `docker ps`. - if (candidates.includes(ourExact)) return ourExact; - for (const candidate of candidates) { - const stripped = candidate.replace(/^openshell-/, ""); - // Find the longest known sandbox whose container name pattern - // matches this candidate. Longest-first defeats prefix collisions. - const owner = [...knownSandboxes] - .filter( - (name) => stripped === name || stripped.startsWith(`${name}-`), - ) - .sort((a, b) => b.length - a.length)[0]; - if (owner === sandboxName) return candidate; - } - return null; + return resolveSandboxContainerOwner( + deps.dockerPsNames(), + sandboxName, + deps.listSandboxNames(), + ); } function normalizeHealthState(raw: string): DockerHealthState { diff --git a/src/lib/actions/sandbox/gateway-failure-classifier.ts b/src/lib/actions/sandbox/gateway-failure-classifier.ts index c771bb16ac..156ee2555e 100644 --- a/src/lib/actions/sandbox/gateway-failure-classifier.ts +++ b/src/lib/actions/sandbox/gateway-failure-classifier.ts @@ -7,6 +7,7 @@ import { dockerInfo } from "../../adapters/docker/info"; import { dockerCapture } from "../../adapters/docker/run"; import { GATEWAY_PORT } from "../../core/ports"; import * as registry from "../../state/registry"; +import { resolveSandboxContainerOwner } from "./sandbox-container-owner"; const DEFAULT_CONTAINER = "openshell-cluster-nemoclaw"; const DOCKER_TIMEOUT_MS = 3000; @@ -189,46 +190,6 @@ const defaultSandboxContainerRunners: SandboxContainerFailureRunners = { portProbe: defaultPortProbe, }; -// OpenShell names sandbox containers either as `openshell-` (no -// suffix) or `openshell--`, where `` is appended by -// openshell at runtime. Two prefix collisions are possible: -// -// 1. A sandbox name can be a prefix of another sandbox name -// (`my` vs `my-assistant`). -// 2. Even with a hyphen-free ``, a sandbox name can be a prefix -// of another sandbox name whose own suffix is hyphen-free -// (`my-assistant` vs `my-assistant-prod`). -// -// Resolve each candidate to the LONGEST registered sandbox name it could -// belong to and only accept candidates that resolve to the sandbox we are -// looking up. The exact-name form is preferred before suffixed forms so -// that an `openshell-` container always wins over an unrelated -// `openshell--` co-tenant. Mirrors the resolver in -// `docker-health.ts`. -function findSandboxContainerName( - names: string, - sandboxName: string, - registeredSandboxNames: readonly string[], -): string | null { - const ourPrefix = `openshell-${sandboxName}-`; - const ourExact = `openshell-${sandboxName}`; - const known = new Set(registeredSandboxNames); - known.add(sandboxName); - const candidates = names - .split("\n") - .map((line) => line.trim()) - .filter((line) => line === ourExact || line.startsWith(ourPrefix)); - if (candidates.includes(ourExact)) return ourExact; - const knownArr = [...known]; - for (const candidate of candidates) { - const stripped = candidate.replace(/^openshell-/, ""); - const owner = knownArr - .filter((name) => stripped === name || stripped.startsWith(`${name}-`)) - .sort((a, b) => b.length - a.length)[0]; - if (owner === sandboxName) return candidate; - } - return null; -} function isValidDashboardPort(port: number | null | undefined): port is number { return ( @@ -248,13 +209,13 @@ export async function classifySandboxContainerFailure( ): Promise { const runners = opts.runners ?? defaultSandboxContainerRunners; const registeredSandboxNames = runners.listSandboxNames(); - const running = findSandboxContainerName( + const running = resolveSandboxContainerOwner( runners.listRunningContainerNames(), sandboxName, registeredSandboxNames, ); if (running) return null; - const present = findSandboxContainerName( + const present = resolveSandboxContainerOwner( runners.listAllContainerNames(), sandboxName, registeredSandboxNames, diff --git a/src/lib/actions/sandbox/sandbox-container-owner.ts b/src/lib/actions/sandbox/sandbox-container-owner.ts new file mode 100644 index 0000000000..326649836d --- /dev/null +++ b/src/lib/actions/sandbox/sandbox-container-owner.ts @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Resolve which OpenShell container owns a given sandbox name. + * + * OpenShell names sandbox containers either as `openshell-` (no + * suffix) or `openshell--`, where `` is appended by openshell + * at runtime. Two prefix collisions are possible: + * + * 1. A sandbox name can be a prefix of another sandbox name + * (`my` vs `my-assistant`). + * 2. Even with a hyphen-free ``, a sandbox name can be a prefix + * of another sandbox name whose own suffix is hyphen-free + * (`my-assistant` vs `my-assistant-prod`). + * + * The longest-owner rule resolves each candidate to the longest registered + * sandbox name that could claim it, then only accepts candidates that resolve + * back to the queried sandbox. The exact-name form is preferred before + * suffixed forms so `openshell-` always wins over an unrelated + * `openshell--` co-tenant. + */ +export function resolveSandboxContainerOwner( + containerNamesRaw: string, + sandboxName: string, + registeredSandboxNames: Iterable, +): string | null { + const ourPrefix = `openshell-${sandboxName}-`; + const ourExact = `openshell-${sandboxName}`; + const known = new Set(registeredSandboxNames); + known.add(sandboxName); + const candidates = containerNamesRaw + .split("\n") + .map((line) => line.trim()) + .filter((line) => line === ourExact || line.startsWith(ourPrefix)); + if (candidates.includes(ourExact)) return ourExact; + const knownArr = [...known]; + for (const candidate of candidates) { + const stripped = candidate.replace(/^openshell-/, ""); + const owner = knownArr + .filter((name) => stripped === name || stripped.startsWith(`${name}-`)) + .sort((a, b) => b.length - a.length)[0]; + if (owner === sandboxName) return candidate; + } + return null; +} diff --git a/src/lib/actions/sandbox/status.test.ts b/src/lib/actions/sandbox/status.test.ts index c992b5a91d..9722fc0399 100644 --- a/src/lib/actions/sandbox/status.test.ts +++ b/src/lib/actions/sandbox/status.test.ts @@ -6,8 +6,10 @@ import { describe, expect, it } from "vitest"; import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health"; import { classifySandboxContainerFailureForStatus, + classifySandboxStatusPreflightFailure, getSandboxStatusInferenceHealth, isDockerDaemonUnreachableForStatus, + maybeGetSandboxStatusInferenceHealth, } from "../../../../dist/lib/actions/sandbox/status"; describe("sandbox status inference health", () => { @@ -152,3 +154,125 @@ describe("classifySandboxContainerFailureForStatus", () => { expect(observed).toEqual([{ sandboxName: "alpha", port: null }]); }); }); + +describe("maybeGetSandboxStatusInferenceHealth", () => { + it("does not invoke the provider probe when suppressInferenceProbe is true even with a present gateway and string provider", () => { + let probeCalls = 0; + const result = maybeGetSandboxStatusInferenceHealth( + true, + true, + "nvidia-prod", + "nvidia/nemotron", + (...args) => { + probeCalls += 1; + throw new Error(`probeProviderHealth should not be invoked (args=${JSON.stringify(args)})`); + }, + ); + expect(result).toBeNull(); + expect(probeCalls).toBe(0); + }); + + it("delegates to the probe when suppressInferenceProbe is false", () => { + const calls: { provider: string; options?: ProviderHealthProbeOptions }[] = []; + const result = maybeGetSandboxStatusInferenceHealth( + false, + true, + "nvidia-prod", + "nvidia/nemotron", + (provider, options) => { + calls.push({ provider, options }); + return { + ok: true, + probed: true, + providerLabel: "NVIDIA Endpoints", + endpoint: "https://integrate.api.nvidia.com/v1/chat/completions", + detail: "healthy", + }; + }, + ); + expect(result?.ok).toBe(true); + expect(calls).toEqual([ + { provider: "nvidia-prod", options: { model: "nvidia/nemotron" } }, + ]); + }); +}); + +describe("classifySandboxStatusPreflightFailure", () => { + it("returns docker_unreachable when the daemon probe reports unreachable", async () => { + let sandboxProbeCalled = false; + const result = await classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "docker" } as never, + { + dockerProbe: () => false, + sandboxContainerProbe: async () => { + sandboxProbeCalled = true; + return null; + }, + }, + ); + expect(result).toEqual({ layer: "docker_unreachable", dockerUnreachable: true }); + // Short-circuits: a daemon that is already known to be down must not + // trigger a follow-up `docker ps` round trip. + expect(sandboxProbeCalled).toBe(false); + }); + + it("returns the sandbox container failure when the daemon is reachable", async () => { + const result = await classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "docker", dashboardPort: 18789 } as never, + { + dockerProbe: () => true, + sandboxContainerProbe: async (sandboxName, dashboardPort) => { + expect(sandboxName).toBe("alpha"); + expect(dashboardPort).toBe(18789); + return { + layer: "sandbox_dashboard_port_conflict", + detail: "stub failure", + }; + }, + }, + ); + expect(result).toEqual({ + layer: "sandbox_dashboard_port_conflict", + dockerUnreachable: false, + }); + }); + + it("returns null when the sandbox container probe finds no failure", async () => { + const result = await classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "docker" } as never, + { + dockerProbe: () => true, + sandboxContainerProbe: async () => null, + }, + ); + expect(result).toBeNull(); + }); + + it("returns null when the sandbox is not on the docker driver", async () => { + let dockerCalled = false; + let sandboxCalled = false; + const result = await classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "vm" } as never, + { + dockerProbe: () => { + dockerCalled = true; + return false; + }, + sandboxContainerProbe: async () => { + sandboxCalled = true; + return null; + }, + }, + ); + expect(result).toBeNull(); + // Both gates are docker-driver-only; a vm sandbox must not provoke + // either probe. + expect(dockerCalled).toBe(false); + expect(sandboxCalled).toBe(false); + }); + + it("returns null when the sandbox entry is null", async () => { + const result = await classifySandboxStatusPreflightFailure(null); + expect(result).toBeNull(); + }); +}); diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index aa41dfc913..73be7bf3e4 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -69,6 +69,29 @@ export function getSandboxStatusInferenceHealth( }); } +/** + * Gate around `getSandboxStatusInferenceHealth` that short-circuits when the + * caller has already classified a pre-snapshot failure (docker daemon down, + * sandbox container stopped, dashboard port held). Returns null without + * touching the provider probe so the remote-provider reachability request is + * never issued in those cases. Exported so tests can drive the seam directly. + */ +export function maybeGetSandboxStatusInferenceHealth( + suppressInferenceProbe: boolean, + gatewayPresent: boolean, + currentProvider: unknown, + currentModel: unknown, + probeProviderHealthImpl?: ProbeProviderHealth, +): ProviderHealthStatus | null { + if (suppressInferenceProbe) return null; + return getSandboxStatusInferenceHealth( + gatewayPresent, + currentProvider, + currentModel, + probeProviderHealthImpl, + ); +} + export interface SandboxStatusReport { schemaVersion: 1; name: string; @@ -86,6 +109,11 @@ export interface SandboxStatusReport { openshellDriver: string; openshellVersion: string; policies: string[]; + failureLayer: + | "docker_unreachable" + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict" + | null; } interface SandboxStatusSnapshot { @@ -97,8 +125,16 @@ interface SandboxStatusSnapshot { inferenceHealth: ProviderHealthStatus | null; } -async function collectSandboxStatusSnapshot( +interface CollectSandboxStatusSnapshotDeps { + probeProviderHealthImpl?: ProbeProviderHealth; +} + +export async function collectSandboxStatusSnapshot( sandboxName: string, + opts: { + suppressInferenceProbe?: boolean; + deps?: CollectSandboxStatusSnapshotDeps; + } = {}, ): Promise { const sb = registry.getSandbox(sandboxName); let lookup: SandboxGatewayState; @@ -136,10 +172,18 @@ async function collectSandboxStatusSnapshot( liveResult && !isCommandTimeout(liveResult) ? parseGatewayInference(liveResult.output) : null; const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; - const inferenceHealth = getSandboxStatusInferenceHealth( + // When the caller has already determined that the local stack is failed + // (docker daemon down, sandbox container stopped, dashboard port held), + // skip the provider probe entirely. Without this gate + // `getSandboxStatusInferenceHealth` would still issue the remote-provider + // reachability request even though the caller would overwrite the returned + // value to null afterwards. + const inferenceHealth = maybeGetSandboxStatusInferenceHealth( + opts.suppressInferenceProbe === true, lookup.state === "present", currentProvider, currentModel, + opts.deps?.probeProviderHealthImpl, ); if ( inferenceHealth && @@ -163,10 +207,55 @@ async function collectSandboxStatusSnapshot( return { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth }; } +export type SandboxStatusFailureLayer = + | "docker_unreachable" + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; + +export interface SandboxStatusPreflightFailure { + layer: SandboxStatusFailureLayer; + dockerUnreachable: boolean; +} + +export interface ClassifySandboxStatusPreflightFailureDeps { + dockerProbe?: DockerInfoProbe; + sandboxContainerProbe?: SandboxContainerFailureProbe; +} + +/** + * Classify pre-snapshot failure layers (host docker daemon down, per-sandbox + * container stopped, dashboard port held by foreign listener). Returns null + * when none apply, including when the sandbox is not on the docker driver or + * the registry has no entry. Shared between the human-readable status + * renderer and the `--json` report so both paths gate the inference probe + * consistently and the JSON path can surface the same failure layer. + */ +export async function classifySandboxStatusPreflightFailure( + sb: registry.SandboxEntry | null, + deps: ClassifySandboxStatusPreflightFailureDeps = {}, +): Promise { + if (isDockerDaemonUnreachableForStatus(sb, deps.dockerProbe)) { + return { layer: "docker_unreachable", dockerUnreachable: true }; + } + const sandboxFailure = await classifySandboxContainerFailureForStatus( + sb, + deps.sandboxContainerProbe, + ); + if (sandboxFailure) { + return { layer: sandboxFailure.layer, dockerUnreachable: false }; + } + return null; +} + export async function getSandboxStatusReport( sandboxName: string, ): Promise { - const snapshot = await collectSandboxStatusSnapshot(sandboxName); + const sandboxEntryEarly = registry.getSandbox(sandboxName); + const preflightFailure = + await classifySandboxStatusPreflightFailure(sandboxEntryEarly); + const snapshot = await collectSandboxStatusSnapshot(sandboxName, { + suppressInferenceProbe: preflightFailure !== null, + }); const { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth } = snapshot; const phase = lookup.state === "present" ? parseSandboxPhase(lookup.output || "") : null; @@ -194,6 +283,7 @@ export async function getSandboxStatusReport( openshellDriver: (sb && sb.openshellDriver) || "unknown", openshellVersion: (sb && sb.openshellVersion) || "unknown", policies, + failureLayer: preflightFailure ? preflightFailure.layer : null, }; } @@ -281,28 +371,21 @@ export async function classifySandboxContainerFailureForStatus( // eslint-disable-next-line complexity export async function showSandboxStatus(sandboxName: string): Promise { - // When the host Docker daemon is stopped on a docker-driver sandbox, the + // When the host Docker daemon is stopped, or the per-sandbox container is + // stopped (with or without a foreign listener on the dashboard port), the // cached sandbox metadata renders as a "healthy" report even though the - // local container stack is down, and the host-side Inference probe hits the - // remote provider directly so it falsely shows healthy too. Probe the - // daemon once upfront so the failure-layer header is the first thing the - // user sees and the misleading probe is suppressed. + // local container stack is down, and the host-side Inference probe hits + // the remote provider directly so it falsely shows healthy too. Classify + // these failure layers upfront so the header is the first thing the user + // sees, the remote provider probe is never issued, and downstream + // gateway-state branches can suppress duplicate `docker_unreachable` + // headers via `alreadyPrintedDockerUnreachable`. const sandboxEntryEarly = registry.getSandbox(sandboxName); - const dockerUnreachable = isDockerDaemonUnreachableForStatus(sandboxEntryEarly); - if (dockerUnreachable) { - console.log(getLayerHeader("docker_unreachable")); - process.exitCode = 1; - } - // When the per-sandbox container is stopped (and optionally its dashboard - // port is held by a foreign listener) the host-side Inference probe still - // hits the remote provider directly and falsely shows healthy. Probe the - // sandbox container upfront so the failure layer is the first user-visible - // signal and the misleading Inference line is suppressed. - const sandboxFailure = dockerUnreachable - ? null - : await classifySandboxContainerFailureForStatus(sandboxEntryEarly); - if (sandboxFailure) { - console.log(getLayerHeader(sandboxFailure.layer)); + const preflightFailure = + await classifySandboxStatusPreflightFailure(sandboxEntryEarly); + const dockerUnreachable = preflightFailure?.dockerUnreachable === true; + if (preflightFailure) { + console.log(getLayerHeader(preflightFailure.layer)); process.exitCode = 1; } // #2666: never let an unexpected throw from the gateway probe (e.g. openshell @@ -310,14 +393,10 @@ export async function showSandboxStatus(sandboxName: string): Promise { // foreign listener) suppress the sandbox header. The downstream switch // handles `gateway_error` by printing an actionable block + exit(1), so a // synthesized fallback keeps the user-visible contract intact. - const snapshot = await collectSandboxStatusSnapshot(sandboxName); - const { sb, lookup, rpcIssue, currentModel, currentProvider } = snapshot; - // When docker is unreachable or the sandbox container itself is stopped, - // host-side probes misrepresent the sandbox state — the remote-provider - // reachability check doesn't go through the local stack. Suppress the probe - // so the output doesn't conflict with the failure-layer header. - const inferenceHealth = - dockerUnreachable || sandboxFailure ? null : snapshot.inferenceHealth; + const snapshot = await collectSandboxStatusSnapshot(sandboxName, { + suppressInferenceProbe: preflightFailure !== null, + }); + const { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth } = snapshot; maybeEnsureHermesToolGatewayBroker(sb); if (rpcIssue) { printOpenShellStateRpcIssue(rpcIssue, { diff --git a/test/cli.test.ts b/test/cli.test.ts index d2d45feec4..adc305be5f 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1228,6 +1228,17 @@ describe("CLI dispatch", () => { openshellDriver: "docker", openshellVersion: "0.0.44", } as unknown as Partial); + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + `if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-cluster-nemoclaw"; echo "openshell-${sandboxName}-7616dcb1"; exit 0; fi`, + `if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; echo "openshell-${sandboxName}-7616dcb1"; exit 0; fi`, + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); fs.writeFileSync( path.join(localBin, "openshell"), [ @@ -1456,6 +1467,238 @@ describe("CLI dispatch", () => { expect(parsed.inferenceHealth).toBeNull(); }); + it("sandbox status --json sets failureLayer=docker_unreachable, suppresses inferenceHealth, and exits 1 when the host Docker daemon is unreachable", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-docker-unreachable-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + ["#!/usr/bin/env bash", "exit 1"].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBe("docker_unreachable"); + expect(parsed.inferenceHealth).toBeNull(); + expect(parsed.name).toBe("alpha"); + expect(parsed.found).toBe(true); + }); + + it("sandbox status --json sets failureLayer=sandbox_container_stopped when the per-sandbox container is stopped", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-container-stopped-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBe("sandbox_container_stopped"); + expect(parsed.inferenceHealth).toBeNull(); + }); + + it("sandbox status --json sets failureLayer=sandbox_dashboard_port_conflict when the dashboard port is held by a foreign listener", async () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-port-conflict-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + + const server = net.createServer(); + await new Promise((resolve, reject) => { + server.once("error", reject); + server.listen(0, "127.0.0.1", () => resolve()); + }); + const address = server.address(); + if (!address || typeof address === "string") { + server.close(); + throw new Error("failed to bind foreign listener on a free port"); + } + const dashboardPort = address.port; + + try { + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + dashboardPort, + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBe("sandbox_dashboard_port_conflict"); + expect(parsed.inferenceHealth).toBeNull(); + } finally { + await new Promise((resolve) => server.close(() => resolve())); + } + }); + + it("sandbox status --json sets failureLayer=null when no preflight failure applies", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-failure-layer-null-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "vm", + } as unknown as Partial); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBeNull(); + }); + it("sandbox status --help advertises --json flag", () => { const home = fs.mkdtempSync( path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-help-json-"), diff --git a/test/sandbox-container-owner.test.ts b/test/sandbox-container-owner.test.ts new file mode 100644 index 0000000000..4974db2d26 --- /dev/null +++ b/test/sandbox-container-owner.test.ts @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; + +import { resolveSandboxContainerOwner } from "../dist/lib/actions/sandbox/sandbox-container-owner.js"; + +describe("resolveSandboxContainerOwner", () => { + it("returns null when no candidate matches the sandbox prefix", () => { + expect( + resolveSandboxContainerOwner( + "openshell-cluster-nemoclaw\nopenshell-different-sandbox-7616dcb1", + "my-assistant", + ["my-assistant", "different-sandbox"], + ), + ).toBeNull(); + }); + + it("prefers the exact-name container even when a co-tenant suffixed candidate exists in the same listing", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-7616dcb1\nopenshell-my-assistant", + "my-assistant", + ["my-assistant"], + ), + ).toBe("openshell-my-assistant"); + }); + + it("accepts a uuid-suffixed container that resolves to the queried sandbox via the longest-owner rule", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-7616dcb1\nopenshell-different-sandbox-abc", + "my-assistant", + ["my-assistant", "different-sandbox"], + ), + ).toBe("openshell-my-assistant-7616dcb1"); + }); + + it("rejects a container whose longest-owner is a different registered sandbox name", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-prod-7616dcb1\nopenshell-cluster-nemoclaw", + "my-assistant", + ["my-assistant", "my-assistant-prod"], + ), + ).toBeNull(); + }); + + it("rejects a container whose stripped name is not separated from the queried sandbox by a hyphen", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistantextra\nopenshell-cluster-nemoclaw", + "my-assistant", + ["my-assistant"], + ), + ).toBeNull(); + }); + + it("includes the queried sandbox in the known-owner set even when listSandboxNames omits it", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-7616dcb1", + "my-assistant", + [], + ), + ).toBe("openshell-my-assistant-7616dcb1"); + }); + + it("trims whitespace and ignores blank lines from the docker ps stream", () => { + expect( + resolveSandboxContainerOwner( + " openshell-my-assistant-7616dcb1 \n\n openshell-cluster-nemoclaw \n", + "my-assistant", + ["my-assistant"], + ), + ).toBe("openshell-my-assistant-7616dcb1"); + }); + + it("matches an exact-name container even when listSandboxNames is empty", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant", + "my-assistant", + [], + ), + ).toBe("openshell-my-assistant"); + }); +}); From 9a8a3661a042371148132d84ebef9dfa7134bd50 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 30 May 2026 09:36:15 +0000 Subject: [PATCH 6/9] docs: document failureLayer in status --json field list and exit conditions Signed-off-by: Tinson Lai --- docs/reference/commands.mdx | 5 +++-- src/lib/actions/sandbox/status.ts | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx index e170876237..bb285b67e8 100644 --- a/docs/reference/commands.mdx +++ b/docs/reference/commands.mdx @@ -396,9 +396,10 @@ $ nemoclaw my-assistant recover Show sandbox status, health, and inference configuration. Pass `--json` to emit a structured per-sandbox report instead of the text renderer. -The JSON output includes at least `schemaVersion`, `name`, `found`, `model`, `provider`, `phase`, `gatewayState`, `inferenceHealth`, `rpcIssue`, `hostGpuDetected`, `sandboxGpuEnabled`, `sandboxGpuMode`, `sandboxGpuDevice`, `openshellDriver`, `openshellVersion`, and `policies`. +The JSON output includes at least `schemaVersion`, `name`, `found`, `model`, `provider`, `phase`, `gatewayState`, `inferenceHealth`, `rpcIssue`, `hostGpuDetected`, `sandboxGpuEnabled`, `sandboxGpuMode`, `sandboxGpuDevice`, `openshellDriver`, `openshellVersion`, `policies`, and `failureLayer`. `openshellDriver` and `openshellVersion` are always strings (falling back to `"unknown"` when the registry has no value), so consumers can rely on `typeof` checks. -The command exits non-zero when the sandbox is missing locally, the gateway state is not `present`, or the gateway reports a schema/protobuf mismatch (mirrored as `rpcIssue`). +`failureLayer` is `null` when no preflight failure was detected and otherwise one of `docker_unreachable`, `sandbox_container_stopped`, or `sandbox_dashboard_port_conflict`; when set, `inferenceHealth` is suppressed to `null` so automation does not see a stale remote-provider healthy status during a local outage. +The command exits non-zero when the sandbox is missing locally, the gateway state is not `present`, the gateway reports a schema/protobuf mismatch (mirrored as `rpcIssue`), or `failureLayer` is non-null. The alias form `nemoclaw status --json` requires the sandbox to be registered locally; the canonical form `nemoclaw sandbox status --json` is the one to use from automation that may run against an unknown sandbox name, since it still emits a JSON document with `found: false` instead of a text error. ```console diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index 73be7bf3e4..bf65a44944 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -74,7 +74,7 @@ export function getSandboxStatusInferenceHealth( * caller has already classified a pre-snapshot failure (docker daemon down, * sandbox container stopped, dashboard port held). Returns null without * touching the provider probe so the remote-provider reachability request is - * never issued in those cases. Exported so tests can drive the seam directly. + * never issued in those cases. */ export function maybeGetSandboxStatusInferenceHealth( suppressInferenceProbe: boolean, @@ -129,7 +129,7 @@ interface CollectSandboxStatusSnapshotDeps { probeProviderHealthImpl?: ProbeProviderHealth; } -export async function collectSandboxStatusSnapshot( +async function collectSandboxStatusSnapshot( sandboxName: string, opts: { suppressInferenceProbe?: boolean; From 9631e887ac0031ee9018de0bd76060141c054f28 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 30 May 2026 10:06:04 +0000 Subject: [PATCH 7/9] fix(cli): suppress duplicate Failure layer header on any preflight-printed layer Signed-off-by: Tinson Lai --- src/lib/actions/sandbox/status.ts | 41 +++++++++++++++---------------- test/cli.test.ts | 8 ++++++ 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index bf65a44944..2d0313dbf4 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -92,6 +92,11 @@ export function maybeGetSandboxStatusInferenceHealth( ); } +export type SandboxStatusFailureLayer = + | "docker_unreachable" + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; + export interface SandboxStatusReport { schemaVersion: 1; name: string; @@ -109,11 +114,7 @@ export interface SandboxStatusReport { openshellDriver: string; openshellVersion: string; policies: string[]; - failureLayer: - | "docker_unreachable" - | "sandbox_container_stopped" - | "sandbox_dashboard_port_conflict" - | null; + failureLayer: SandboxStatusFailureLayer | null; } interface SandboxStatusSnapshot { @@ -207,11 +208,6 @@ async function collectSandboxStatusSnapshot( return { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth }; } -export type SandboxStatusFailureLayer = - | "docker_unreachable" - | "sandbox_container_stopped" - | "sandbox_dashboard_port_conflict"; - export interface SandboxStatusPreflightFailure { layer: SandboxStatusFailureLayer; dockerUnreachable: boolean; @@ -332,12 +328,15 @@ function maybeEnsureHermesToolGatewayBroker(sb: registry.SandboxEntry | null): v async function printGatewayFailureLayerHeader( sandboxName: string, - alreadyPrintedDockerUnreachable = false, + alreadyPrintedPreflightLayer: SandboxStatusFailureLayer | null = null, ): Promise { + // The preflight classifier (docker_unreachable, sandbox_container_stopped, + // sandbox_dashboard_port_conflict) is more specific than the downstream + // gateway-state classifier. When it already emitted a header, skip the + // gateway-level fallback entirely to avoid a duplicate `Failure layer:` + // line in the user-visible output. + if (alreadyPrintedPreflightLayer !== null) return; const failure = await classifyGatewayFailure(sandboxName); - if (alreadyPrintedDockerUnreachable && failure.layer === "docker_unreachable") { - return; - } console.log(` ${getLayerHeader(failure.layer)}`); } @@ -378,12 +377,12 @@ export async function showSandboxStatus(sandboxName: string): Promise { // the remote provider directly so it falsely shows healthy too. Classify // these failure layers upfront so the header is the first thing the user // sees, the remote provider probe is never issued, and downstream - // gateway-state branches can suppress duplicate `docker_unreachable` - // headers via `alreadyPrintedDockerUnreachable`. + // gateway-state branches can suppress their own `Failure layer:` line via + // `alreadyPrintedPreflightLayer`. const sandboxEntryEarly = registry.getSandbox(sandboxName); const preflightFailure = await classifySandboxStatusPreflightFailure(sandboxEntryEarly); - const dockerUnreachable = preflightFailure?.dockerUnreachable === true; + const preflightLayer = preflightFailure ? preflightFailure.layer : null; if (preflightFailure) { console.log(getLayerHeader(preflightFailure.layer)); process.exitCode = 1; @@ -518,7 +517,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { if (guard.state === "connected_other") { printWrongGatewayActiveGuidance(sandboxName, guard.activeGateway, console.log); } else { - await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); + await printGatewayFailureLayerHeader(sandboxName, preflightLayer); printGatewayLifecycleHint(guard.status || "", sandboxName, console.log); } } else { @@ -552,7 +551,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { process.exit(1); } else if (lookup.state === "gateway_unreachable_after_restart") { console.log(""); - await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); + await printGatewayFailureLayerHeader(sandboxName, preflightLayer); console.log( ` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`, ); @@ -568,7 +567,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { process.exit(1); } else if (lookup.state === "gateway_missing_after_restart") { console.log(""); - await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); + await printGatewayFailureLayerHeader(sandboxName, preflightLayer); console.log( ` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`, ); @@ -588,7 +587,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { if (lookup.output) { console.log(lookup.output); } - await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable); + await printGatewayFailureLayerHeader(sandboxName, preflightLayer); printGatewayLifecycleHint(lookup.output, sandboxName, console.log); process.exit(1); } diff --git a/test/cli.test.ts b/test/cli.test.ts index adc305be5f..f2260ed90e 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1123,6 +1123,11 @@ describe("CLI dispatch", () => { const sandboxIdx = r.out.indexOf("Sandbox: alpha"); expect(headerIdx).toBeGreaterThanOrEqual(0); expect(sandboxIdx).toBeGreaterThan(headerIdx); + // The downstream gateway-state fallback header (`Failure layer: ...`) + // must be suppressed once preflight has already emitted its own. + // Otherwise a non-`present` gateway lookup would print a redundant + // second `Failure layer:` line later in the output. + expect((r.out.match(/Failure layer:/g) || []).length).toBe(1); }); it("sandbox status surfaces sandbox_dashboard_port_conflict when the sandbox container is stopped and the dashboard port is held by a foreign listener", async () => { @@ -1204,6 +1209,9 @@ describe("CLI dispatch", () => { const sandboxIdx = r.out.indexOf("Sandbox: alpha"); expect(headerIdx).toBeGreaterThanOrEqual(0); expect(sandboxIdx).toBeGreaterThan(headerIdx); + // Downstream gateway-state fallback must not print a second + // `Failure layer:` line when preflight already emitted one. + expect((r.out.match(/Failure layer:/g) || []).length).toBe(1); } finally { await new Promise((resolve) => server.close(() => resolve())); } From 7b2e44ca98be4df2cf0ee0a86e091858eecd09bc Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sun, 31 May 2026 05:54:14 +0000 Subject: [PATCH 8/9] refactor(cli): extract sandbox status preflight classifier into status-preflight.ts Signed-off-by: Tinson Lai --- src/lib/actions/sandbox/status-preflight.ts | 117 ++++++++++++++++++++ src/lib/actions/sandbox/status.ts | 105 +++--------------- 2 files changed, 133 insertions(+), 89 deletions(-) create mode 100644 src/lib/actions/sandbox/status-preflight.ts diff --git a/src/lib/actions/sandbox/status-preflight.ts b/src/lib/actions/sandbox/status-preflight.ts new file mode 100644 index 0000000000..3d0749b6cf --- /dev/null +++ b/src/lib/actions/sandbox/status-preflight.ts @@ -0,0 +1,117 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type * as registry from "../../state/registry"; +import { + classifyGatewayFailure, + classifySandboxContainerFailure, + getLayerHeader, + isDockerDaemonReachable, + type SandboxContainerFailureResult, +} from "./gateway-failure-classifier"; + +// Source-of-truth boundary: this module owns the host-level preflight +// classification for `sandbox status` — host docker daemon reachability, +// per-sandbox container state, and dashboard-port conflict. It is the +// durable owner because: +// - cached registry metadata, the in-sandbox gateway probe, and the +// host-side provider probe are all unreliable when the local stack +// is partly down (the provider probe in particular hits the remote +// endpoint directly and falsely shows "healthy"); +// - OpenShell does not currently expose a single host-driver health +// RPC that distinguishes these layers, so the classifier has to +// consult docker info + container state directly; +// - status.ts intentionally stays the renderer/report adapter; the +// classifier here lets the human-readable and `--json` paths share +// identical failure-layer decisions and inference-probe gating. +// Retire this module when OpenShell exposes a unified host-driver +// preflight RPC that returns the same `docker_unreachable`, +// `sandbox_container_stopped`, and `sandbox_dashboard_port_conflict` +// layers; at that point this classifier becomes a thin adapter over +// that RPC and `printGatewayFailureLayerHeader` can move into the +// renderer. + +export type SandboxStatusFailureLayer = + | "docker_unreachable" + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; + +export interface SandboxStatusPreflightFailure { + layer: SandboxStatusFailureLayer; + dockerUnreachable: boolean; +} + +export type DockerInfoProbe = () => boolean; + +export type SandboxContainerFailureProbe = ( + sandboxName: string, + dashboardPort: number | null, +) => Promise; + +const defaultSandboxContainerFailureProbe: SandboxContainerFailureProbe = ( + sandboxName, + dashboardPort, +) => classifySandboxContainerFailure(sandboxName, { dashboardPort }); + +export interface ClassifySandboxStatusPreflightFailureDeps { + dockerProbe?: DockerInfoProbe; + sandboxContainerProbe?: SandboxContainerFailureProbe; +} + +export function isDockerDaemonUnreachableForStatus( + sb: registry.SandboxEntry | null, + probe: DockerInfoProbe = isDockerDaemonReachable, +): boolean { + if (!sb || sb.openshellDriver !== "docker") return false; + return !probe(); +} + +export async function classifySandboxContainerFailureForStatus( + sb: registry.SandboxEntry | null, + probe: SandboxContainerFailureProbe = defaultSandboxContainerFailureProbe, +): Promise { + if (!sb || sb.openshellDriver !== "docker") return null; + return probe(sb.name, sb.dashboardPort ?? null); +} + +/** + * Classify pre-snapshot failure layers (host docker daemon down, per-sandbox + * container stopped, dashboard port held by foreign listener). Returns null + * when none apply, including when the sandbox is not on the docker driver or + * the registry has no entry. Shared between the human-readable status + * renderer and the `--json` report so both paths gate the inference probe + * consistently and the JSON path can surface the same failure layer. + */ +export async function classifySandboxStatusPreflightFailure( + sb: registry.SandboxEntry | null, + deps: ClassifySandboxStatusPreflightFailureDeps = {}, +): Promise { + if (isDockerDaemonUnreachableForStatus(sb, deps.dockerProbe)) { + return { layer: "docker_unreachable", dockerUnreachable: true }; + } + const sandboxFailure = await classifySandboxContainerFailureForStatus( + sb, + deps.sandboxContainerProbe, + ); + if (sandboxFailure) { + return { layer: sandboxFailure.layer, dockerUnreachable: false }; + } + return null; +} + +/** + * Print the gateway-level failure-layer header for `sandbox status`. The + * preflight classifier (docker_unreachable, sandbox_container_stopped, + * sandbox_dashboard_port_conflict) is more specific than the downstream + * gateway-state classifier. When it already emitted a header, skip the + * gateway-level fallback entirely to avoid a duplicate `Failure layer:` + * line in the user-visible output. + */ +export async function printGatewayFailureLayerHeader( + sandboxName: string, + alreadyPrintedPreflightLayer: SandboxStatusFailureLayer | null = null, +): Promise { + if (alreadyPrintedPreflightLayer !== null) return; + const failure = await classifyGatewayFailure(sandboxName); + console.log(` ${getLayerHeader(failure.layer)}`); +} diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index 2d0313dbf4..1d86ddc6ab 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -33,13 +33,7 @@ import { getActiveSandboxSessions, } from "../../state/sandbox-session"; import { getSandboxDockerHealth } from "./docker-health"; -import { - classifyGatewayFailure, - classifySandboxContainerFailure, - getLayerHeader, - isDockerDaemonReachable, - type SandboxContainerFailureResult, -} from "./gateway-failure-classifier"; +import { getLayerHeader } from "./gateway-failure-classifier"; import type { SandboxGatewayState } from "./gateway-state"; import { getReconciledSandboxGatewayState, @@ -51,6 +45,21 @@ import { isSandboxGatewayRunningForStatus, probeSandboxInferenceGatewayHealth, } from "./process-recovery"; +import { + classifySandboxStatusPreflightFailure, + printGatewayFailureLayerHeader, + type SandboxStatusFailureLayer, +} from "./status-preflight"; + +export { + classifySandboxContainerFailureForStatus, + classifySandboxStatusPreflightFailure, + isDockerDaemonUnreachableForStatus, + printGatewayFailureLayerHeader, + type ClassifySandboxStatusPreflightFailureDeps, + type SandboxStatusFailureLayer, + type SandboxStatusPreflightFailure, +} from "./status-preflight"; type ProbeProviderHealth = ( provider: string, @@ -92,11 +101,6 @@ export function maybeGetSandboxStatusInferenceHealth( ); } -export type SandboxStatusFailureLayer = - | "docker_unreachable" - | "sandbox_container_stopped" - | "sandbox_dashboard_port_conflict"; - export interface SandboxStatusReport { schemaVersion: 1; name: string; @@ -208,41 +212,6 @@ async function collectSandboxStatusSnapshot( return { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth }; } -export interface SandboxStatusPreflightFailure { - layer: SandboxStatusFailureLayer; - dockerUnreachable: boolean; -} - -export interface ClassifySandboxStatusPreflightFailureDeps { - dockerProbe?: DockerInfoProbe; - sandboxContainerProbe?: SandboxContainerFailureProbe; -} - -/** - * Classify pre-snapshot failure layers (host docker daemon down, per-sandbox - * container stopped, dashboard port held by foreign listener). Returns null - * when none apply, including when the sandbox is not on the docker driver or - * the registry has no entry. Shared between the human-readable status - * renderer and the `--json` report so both paths gate the inference probe - * consistently and the JSON path can surface the same failure layer. - */ -export async function classifySandboxStatusPreflightFailure( - sb: registry.SandboxEntry | null, - deps: ClassifySandboxStatusPreflightFailureDeps = {}, -): Promise { - if (isDockerDaemonUnreachableForStatus(sb, deps.dockerProbe)) { - return { layer: "docker_unreachable", dockerUnreachable: true }; - } - const sandboxFailure = await classifySandboxContainerFailureForStatus( - sb, - deps.sandboxContainerProbe, - ); - if (sandboxFailure) { - return { layer: sandboxFailure.layer, dockerUnreachable: false }; - } - return null; -} - export async function getSandboxStatusReport( sandboxName: string, ): Promise { @@ -326,48 +295,6 @@ function maybeEnsureHermesToolGatewayBroker(sb: registry.SandboxEntry | null): v } } -async function printGatewayFailureLayerHeader( - sandboxName: string, - alreadyPrintedPreflightLayer: SandboxStatusFailureLayer | null = null, -): Promise { - // The preflight classifier (docker_unreachable, sandbox_container_stopped, - // sandbox_dashboard_port_conflict) is more specific than the downstream - // gateway-state classifier. When it already emitted a header, skip the - // gateway-level fallback entirely to avoid a duplicate `Failure layer:` - // line in the user-visible output. - if (alreadyPrintedPreflightLayer !== null) return; - const failure = await classifyGatewayFailure(sandboxName); - console.log(` ${getLayerHeader(failure.layer)}`); -} - -type DockerInfoProbe = () => boolean; - -export function isDockerDaemonUnreachableForStatus( - sb: registry.SandboxEntry | null, - probe: DockerInfoProbe = isDockerDaemonReachable, -): boolean { - if (!sb || sb.openshellDriver !== "docker") return false; - return !probe(); -} - -type SandboxContainerFailureProbe = ( - sandboxName: string, - dashboardPort: number | null, -) => Promise; - -const defaultSandboxContainerFailureProbe: SandboxContainerFailureProbe = ( - sandboxName, - dashboardPort, -) => classifySandboxContainerFailure(sandboxName, { dashboardPort }); - -export async function classifySandboxContainerFailureForStatus( - sb: registry.SandboxEntry | null, - probe: SandboxContainerFailureProbe = defaultSandboxContainerFailureProbe, -): Promise { - if (!sb || sb.openshellDriver !== "docker") return null; - return probe(sb.name, sb.dashboardPort ?? null); -} - // eslint-disable-next-line complexity export async function showSandboxStatus(sandboxName: string): Promise { // When the host Docker daemon is stopped, or the per-sandbox container is From 46b16be2fc2bc432b030525ab4d41a34c70b3d08 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Mon, 1 Jun 2026 17:48:20 -0700 Subject: [PATCH 9/9] fix(cli): preserve status container preflight on terminal phase --- src/lib/actions/sandbox/status-preflight.ts | 15 ++++++++--- test/cli.test.ts | 28 +++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/lib/actions/sandbox/status-preflight.ts b/src/lib/actions/sandbox/status-preflight.ts index d46bf0457c..ac34bb46a0 100644 --- a/src/lib/actions/sandbox/status-preflight.ts +++ b/src/lib/actions/sandbox/status-preflight.ts @@ -127,15 +127,19 @@ export async function getSandboxStatusPreflight( } /** - * Print the exact first-line preflight header. Unlike gateway-level fallback - * headers this intentionally has no leading indentation because users and - * tests rely on `docker_unreachable` being the first bytes of status output. + * Preserve terminal OpenShell sandbox phases as the primary user-facing cause + * only for host-wide Docker daemon outages. A terminal `Failed`/`Error` phase + * is authoritative enough that docker_unreachable guidance would be misleading + * (#4428). Per-sandbox stopped-container and dashboard-port-conflict failures + * are more specific local delivery failures and must remain visible even when + * OpenShell reports `Phase: Error` (#4515). */ export function withoutTerminalPhasePreflight( preflight: SandboxStatusPreflightResult, phase: string | null, ): SandboxStatusPreflightResult { if (!phase || !isTerminalSandboxPhase(phase)) return preflight; + if (preflight.failureLayer !== "docker_unreachable") return preflight; return { failure: null, failureLayer: null, @@ -144,6 +148,11 @@ export function withoutTerminalPhasePreflight( }; } +/** + * Print the exact first-line preflight header. Unlike gateway-level fallback + * headers this intentionally has no leading indentation because users and + * tests rely on `docker_unreachable` being the first bytes of status output. + */ export function printSandboxStatusPreflightHeader( preflight: SandboxStatusPreflightResult, writer: (message: string) => void = console.log, diff --git a/test/cli.test.ts b/test/cli.test.ts index bad2d477d7..5b2b7de06e 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1250,6 +1250,12 @@ describe("CLI dispatch", () => { path.join(localBin, "openshell"), [ "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', " echo 'Gateway inference:'", " echo ' Provider: openai-api'", @@ -1282,6 +1288,7 @@ describe("CLI dispatch", () => { ), ).toBe(true); expect(r.out).not.toContain("Inference: healthy"); + expect(r.out).toContain("Phase: Error"); expect(r.out).not.toContain("Failure layer: docker_unreachable"); expect(r.out).not.toContain("Failure layer: sandbox_dashboard_port_conflict"); const headerIdx = r.out.indexOf("Failure layer: sandbox_container_stopped"); @@ -1337,6 +1344,12 @@ describe("CLI dispatch", () => { path.join(localBin, "openshell"), [ "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', " echo 'Gateway inference:'", " echo ' Provider: openai-api'", @@ -1369,6 +1382,7 @@ describe("CLI dispatch", () => { ), ).toBe(true); expect(r.out).not.toContain("Inference: healthy"); + expect(r.out).toContain("Phase: Error"); expect(r.out).not.toContain("Failure layer: sandbox_container_stopped —"); const headerIdx = r.out.indexOf("Failure layer: sandbox_dashboard_port_conflict"); const sandboxIdx = r.out.indexOf("Sandbox: alpha"); @@ -1816,6 +1830,12 @@ describe("CLI dispatch", () => { path.join(localBin, "openshell"), [ "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', " echo 'Gateway inference:'", " echo ' Provider: openai-api'", @@ -1844,6 +1864,7 @@ describe("CLI dispatch", () => { expect(r.code).toBe(1); const parsed = JSON.parse(r.out); expect(parsed.failureLayer).toBe("sandbox_container_stopped"); + expect(parsed.phase).toBe("Error"); expect(parsed.inferenceHealth).toBeNull(); }); @@ -1889,6 +1910,12 @@ describe("CLI dispatch", () => { path.join(localBin, "openshell"), [ "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', " echo 'Gateway inference:'", " echo ' Provider: openai-api'", @@ -1917,6 +1944,7 @@ describe("CLI dispatch", () => { expect(r.code).toBe(1); const parsed = JSON.parse(r.out); expect(parsed.failureLayer).toBe("sandbox_dashboard_port_conflict"); + expect(parsed.phase).toBe("Error"); expect(parsed.inferenceHealth).toBeNull(); } finally { await new Promise((resolve) => server.close(() => resolve()));