diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx index c303b3a780..ddcb99d617 100644 --- a/docs/reference/commands.mdx +++ b/docs/reference/commands.mdx @@ -402,9 +402,10 @@ $ nemoclaw my-assistant recover Show sandbox status, health, and inference configuration. Pass `--json` to emit a structured per-sandbox report instead of the text renderer. -The JSON output includes at least `schemaVersion`, `name`, `found`, `model`, `provider`, `phase`, `gatewayState`, `inferenceHealth`, `rpcIssue`, `hostGpuDetected`, `sandboxGpuEnabled`, `sandboxGpuMode`, `sandboxGpuDevice`, `openshellDriver`, `openshellVersion`, and `policies`. +The JSON output includes at least `schemaVersion`, `name`, `found`, `model`, `provider`, `phase`, `gatewayState`, `inferenceHealth`, `rpcIssue`, `hostGpuDetected`, `sandboxGpuEnabled`, `sandboxGpuMode`, `sandboxGpuDevice`, `openshellDriver`, `openshellVersion`, `policies`, `failureLayer`, and `dockerPaused`. `openshellDriver` and `openshellVersion` are always strings (falling back to `"unknown"` when the registry has no value), so consumers can rely on `typeof` checks. -The command exits non-zero when the sandbox is missing locally, the gateway state is not `present`, or the gateway reports a schema/protobuf mismatch (mirrored as `rpcIssue`). +`failureLayer` is `null` when no preflight failure was detected and otherwise one of `docker_unreachable`, `sandbox_container_stopped`, or `sandbox_dashboard_port_conflict`; when set, `inferenceHealth` is suppressed to `null` so automation does not see a stale remote-provider healthy status during a local outage. +The command exits non-zero when the sandbox is missing locally, the gateway state is not `present`, the gateway reports a schema/protobuf mismatch (mirrored as `rpcIssue`), or `failureLayer` is non-null. 
The alias form `nemoclaw status --json` requires the sandbox to be registered locally; the canonical form `nemoclaw sandbox status --json` is the one to use from automation that may run against an unknown sandbox name, since it still emits a JSON document with `found: false` instead of a text error. ```console @@ -431,6 +432,11 @@ Use that line to distinguish a healthy backend from a broken proxy path that the For cloud-only providers, the output omits the NIM status line unless a NIM container is registered or an unexpected NIM container is running. +When the sandbox's recorded driver is `docker` and the host Docker daemon is not reachable, the command prints `Failure layer: docker_unreachable — Docker daemon is not reachable.` as the first line of stdout, suppresses the host-side `Inference` probe (which otherwise hits the remote provider directly and is misleading when the local stack is down), and exits with a non-zero status. + +When the host Docker daemon is reachable but the per-sandbox container is stopped, the command prints `Failure layer: sandbox_container_stopped — sandbox container exists but is not running.` as the first line of stdout, suppresses the host-side `Inference` probe, and exits with a non-zero status. +If the sandbox's recorded dashboard port is also held by a foreign listener, the header escalates to `Failure layer: sandbox_dashboard_port_conflict — sandbox container is stopped and the dashboard port is held by a foreign listener.` so the operator can recover the port before restarting the sandbox. + If the sandbox or gateway cannot be verified, the command exits non-zero instead of reporting healthy inference from stale registry state. Gateway and dashboard health checks treat HTTP `401` from device auth as a live service, not as an offline gateway. 
diff --git a/src/commands/sandbox/status.ts b/src/commands/sandbox/status.ts index d081d5842f..8110dba44e 100644 --- a/src/commands/sandbox/status.ts +++ b/src/commands/sandbox/status.ts @@ -28,7 +28,12 @@ export default class SandboxStatusCommand extends NemoClawCommand { const { args } = await this.parse(SandboxStatusCommand); if (this.jsonEnabled()) { const report = await getSandboxStatusReport(args.sandboxName); - if (!report.found || report.gatewayState !== "present" || report.rpcIssue) { + if ( + !report.found || + report.gatewayState !== "present" || + report.rpcIssue || + report.failureLayer + ) { process.exitCode = 1; } // #4310: route the machine-readable report through the centralized diff --git a/src/lib/actions/sandbox/docker-health.ts b/src/lib/actions/sandbox/docker-health.ts index fb5e3cfd12..4b46ddfb7b 100644 --- a/src/lib/actions/sandbox/docker-health.ts +++ b/src/lib/actions/sandbox/docker-health.ts @@ -4,6 +4,7 @@ import { dockerContainerInspectFormat } from "../../adapters/docker/inspect"; import { dockerCapture } from "../../adapters/docker/run"; import * as registry from "../../state/registry"; +import { resolveSandboxContainerOwner } from "./sandbox-container-owner"; export type DockerHealthState = | "healthy" @@ -68,48 +69,11 @@ function resolveDockerDriverSandboxContainer( } catch { return null; } - // OpenShell names sandbox containers either as `openshell-` - // (no suffix) or `openshell--` where is a runtime - // identifier appended by openshell. Two ambiguities to avoid: - // - // 1. A sandbox name can be a prefix of another sandbox name - // (`my` vs `my-assistant`). - // 2. Even with a hyphen-free suffix, a sandbox name can be a prefix - // of another sandbox name whose own suffix is hyphen-free - // (`my-assistant` vs `my-assistant-prod`). - // - // To disambiguate, resolve each candidate to the LONGEST registered - // sandbox name it could belong to. 
We only accept a candidate when - // that resolved owner is the sandbox we are looking up. This also - // gives the right answer for the `openshell-` exact form. - const ourPrefix = `openshell-${sandboxName}-`; - const ourExact = `openshell-${sandboxName}`; - const knownSandboxes = new Set(deps.listSandboxNames()); - knownSandboxes.add(sandboxName); - const candidates = deps - .dockerPsNames() - .split("\n") - .map((line) => line.trim()) - .filter((line) => line === ourExact || line.startsWith(ourPrefix)); - // Prefer the exact-name container before considering suffixed ones. - // Without this short-circuit, a suffixed `openshell--` whose - // is a docker runtime suffix (not a registered sandbox name) would - // resolve to our sandbox via the longest-match heuristic and beat the - // co-existing exact `openshell-` if it appeared earlier in - // `docker ps`. - if (candidates.includes(ourExact)) return ourExact; - for (const candidate of candidates) { - const stripped = candidate.replace(/^openshell-/, ""); - // Find the longest known sandbox whose container name pattern - // matches this candidate. Longest-first defeats prefix collisions. 
- const owner = [...knownSandboxes] - .filter( - (name) => stripped === name || stripped.startsWith(`${name}-`), - ) - .sort((a, b) => b.length - a.length)[0]; - if (owner === sandboxName) return candidate; - } - return null; + return resolveSandboxContainerOwner( + deps.dockerPsNames(), + sandboxName, + deps.listSandboxNames(), + ); } function normalizeHealthState(raw: string): DockerHealthState { diff --git a/src/lib/actions/sandbox/gateway-failure-classifier.ts b/src/lib/actions/sandbox/gateway-failure-classifier.ts index 5ba0928910..605d5a9b46 100644 --- a/src/lib/actions/sandbox/gateway-failure-classifier.ts +++ b/src/lib/actions/sandbox/gateway-failure-classifier.ts @@ -8,6 +8,7 @@ import { dockerCapture } from "../../adapters/docker/run"; import { CLI_NAME } from "../../cli/branding"; import { GATEWAY_PORT } from "../../core/ports"; import * as registry from "../../state/registry"; +import { resolveSandboxContainerOwner } from "./sandbox-container-owner"; const DEFAULT_CONTAINER = "openshell-cluster-nemoclaw"; const DOCKER_TIMEOUT_MS = 3000; @@ -18,7 +19,9 @@ export type GatewayFailureLayer = | "container_missing" | "container_exited_port_conflict" | "container_exited" - | "gateway_unreachable"; + | "gateway_unreachable" + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; export type GatewayFailureResult = { layer: GatewayFailureLayer; @@ -32,10 +35,30 @@ export type GatewayFailureRunners = { portProbe: (port: number) => Promise; }; +export type SandboxContainerFailureLayer = + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; + +export type SandboxContainerFailureResult = { + layer: SandboxContainerFailureLayer; + detail: string; +}; + +export type SandboxContainerFailureRunners = { + listAllContainerNames: () => string; + listRunningContainerNames: () => string; + listSandboxNames: () => string[]; + portProbe: (port: number) => Promise; +}; + function defaultDockerInfo(): boolean { return dockerInfo({ ignoreError: 
true, timeout: DOCKER_TIMEOUT_MS }).length > 0; } +export function isDockerDaemonReachable(): boolean { + return defaultDockerInfo(); +} + function dockerContainerListed(container: string, allFlag: boolean): boolean { const args = ["ps"]; if (allFlag) args.push("-a"); @@ -129,12 +152,85 @@ const LAYER_HEADERS: Record = { container_exited: "Failure layer: container_exited — container exited.", gateway_unreachable: "Failure layer: gateway_unreachable — container running but gateway API unresponsive.", + sandbox_container_stopped: + "Failure layer: sandbox_container_stopped — sandbox container exists but is not running.", + sandbox_dashboard_port_conflict: + "Failure layer: sandbox_dashboard_port_conflict — sandbox container is stopped and the dashboard port is held by a foreign listener.", }; export function getLayerHeader(layer: GatewayFailureLayer): string { return LAYER_HEADERS[layer]; } +function defaultListAllContainerNames(): string { + return dockerCapture(["ps", "-a", "--format", "{{.Names}}"], { + ignoreError: true, + timeout: DOCKER_TIMEOUT_MS, + }); +} + +function defaultListRunningContainerNames(): string { + return dockerCapture(["ps", "--format", "{{.Names}}"], { + ignoreError: true, + timeout: DOCKER_TIMEOUT_MS, + }); +} + +function defaultListSandboxNames(): string[] { + try { + return registry.listSandboxes().sandboxes.map((entry) => entry.name); + } catch { + return []; + } +} + +const defaultSandboxContainerRunners: SandboxContainerFailureRunners = { + listAllContainerNames: defaultListAllContainerNames, + listRunningContainerNames: defaultListRunningContainerNames, + listSandboxNames: defaultListSandboxNames, + portProbe: defaultPortProbe, +}; + +function isValidDashboardPort(port: number | null | undefined): port is number { + return ( + typeof port === "number" && Number.isInteger(port) && port >= 1 && port <= 65535 + ); +} + +export async function classifySandboxContainerFailure( + sandboxName: string, + opts: { + dashboardPort?: number | null; 
+ runners?: SandboxContainerFailureRunners; + } = {}, +): Promise { + const runners = opts.runners ?? defaultSandboxContainerRunners; + const registeredSandboxNames = runners.listSandboxNames(); + const running = resolveSandboxContainerOwner( + runners.listRunningContainerNames(), + sandboxName, + registeredSandboxNames, + ); + if (running) return null; + const present = resolveSandboxContainerOwner( + runners.listAllContainerNames(), + sandboxName, + registeredSandboxNames, + ); + if (!present) return null; + const dashboardPort = opts.dashboardPort; + if (isValidDashboardPort(dashboardPort) && (await runners.portProbe(dashboardPort))) { + return { + layer: "sandbox_dashboard_port_conflict", + detail: `Sandbox container '${present}' is stopped and dashboard port ${dashboardPort} is held by another process.`, + }; + } + return { + layer: "sandbox_container_stopped", + detail: `Sandbox container '${present}' exists but is not running.`, + }; +} + type SandboxDriverLookup = ( name: string, ) => { openshellDriver?: string | null } | null | undefined; @@ -160,7 +256,10 @@ const NON_DOCKER_DRIVERS = new Set(["vm"]); * guidance on a Docker-less host; that is preferable to silently regressing * every legacy Docker sandbox. (#4428) */ -function isDockerBackedSandbox(sandboxName: string, getSandbox: SandboxDriverLookup): boolean { +function isDockerBackedSandbox( + sandboxName: string, + getSandbox: SandboxDriverLookup, +): boolean { const driver = getSandbox(sandboxName)?.openshellDriver; return !(typeof driver === "string" && NON_DOCKER_DRIVERS.has(driver.toLowerCase())); } @@ -170,10 +269,9 @@ function isDockerBackedSandbox(sandboxName: string, getSandbox: SandboxDriverLoo * `docker_unreachable` layer of {@link classifyGatewayFailure}). Sandbox * commands use this as a fast preflight so a transient Docker daemon outage is * classified as a host runtime problem rather than a stuck sandbox phase or a - * connect timeout (#4428). 
Returns `false` for non-Docker drivers so VM/ - * Kubernetes sandboxes are never misclassified. `docker info` is a `spawnSync` - * call, so this stays synchronous and can run from non-async call sites such - * as `logs` and `policy-list`. + * connect timeout (#4428). Returns `false` for VM sandboxes so they are never + * misclassified. `docker info` is a `spawnSync` call, so this stays synchronous + * and can run from non-async call sites such as `logs` and `policy-list`. */ export function isDockerRuntimeDown( sandboxName: string, diff --git a/src/lib/actions/sandbox/sandbox-container-owner.ts b/src/lib/actions/sandbox/sandbox-container-owner.ts new file mode 100644 index 0000000000..326649836d --- /dev/null +++ b/src/lib/actions/sandbox/sandbox-container-owner.ts @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Resolve which OpenShell container owns a given sandbox name. + * + * OpenShell names sandbox containers either as `openshell-<name>` (no + * suffix) or `openshell-<name>-<suffix>`, where `<suffix>` is appended by openshell + * at runtime. Two prefix collisions are possible: + * + * 1. A sandbox name can be a prefix of another sandbox name + * (`my` vs `my-assistant`). + * 2. Even with a hyphen-free `<suffix>`, a sandbox name can be a prefix + * of another sandbox name whose own suffix is hyphen-free + * (`my-assistant` vs `my-assistant-prod`). + * + * The longest-owner rule resolves each candidate to the longest registered + * sandbox name that could claim it, then only accepts candidates that resolve + * back to the queried sandbox. The exact-name form is preferred before + * suffixed forms so `openshell-<name>` always wins over an unrelated + * `openshell-<name>-<suffix>` co-tenant.
+ */ +export function resolveSandboxContainerOwner( + containerNamesRaw: string, + sandboxName: string, + registeredSandboxNames: Iterable, +): string | null { + const ourPrefix = `openshell-${sandboxName}-`; + const ourExact = `openshell-${sandboxName}`; + const known = new Set(registeredSandboxNames); + known.add(sandboxName); + const candidates = containerNamesRaw + .split("\n") + .map((line) => line.trim()) + .filter((line) => line === ourExact || line.startsWith(ourPrefix)); + if (candidates.includes(ourExact)) return ourExact; + const knownArr = [...known]; + for (const candidate of candidates) { + const stripped = candidate.replace(/^openshell-/, ""); + const owner = knownArr + .filter((name) => stripped === name || stripped.startsWith(`${name}-`)) + .sort((a, b) => b.length - a.length)[0]; + if (owner === sandboxName) return candidate; + } + return null; +} diff --git a/src/lib/actions/sandbox/status-preflight.ts b/src/lib/actions/sandbox/status-preflight.ts new file mode 100644 index 0000000000..ac34bb46a0 --- /dev/null +++ b/src/lib/actions/sandbox/status-preflight.ts @@ -0,0 +1,180 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { isTerminalSandboxPhase } from "../../state/gateway"; +import type * as registry from "../../state/registry"; +import { + classifyGatewayFailure, + classifySandboxContainerFailure, + getLayerHeader, + isDockerDaemonReachable, + type SandboxContainerFailureResult, +} from "./gateway-failure-classifier"; + +// Source-of-truth boundary: this module owns the host-level preflight +// classification for `sandbox status` — host docker daemon reachability, +// per-sandbox container state, and dashboard-port conflict. 
It is the +// durable owner because: +// - cached registry metadata, the in-sandbox gateway probe, and the +// host-side provider probe are all unreliable when the local stack +// is partly down (the provider probe in particular hits the remote +// endpoint directly and falsely shows "healthy"); +// - OpenShell does not currently expose a single host-driver health +// RPC that distinguishes these layers, so the classifier has to +// consult docker info + container state directly; +// - status.ts intentionally stays the renderer/report adapter; the +// classifier here lets the human-readable and `--json` paths share +// identical failure-layer decisions and inference-probe gating. +// Retire this module when OpenShell exposes a unified host-driver +// preflight RPC that returns the same `docker_unreachable`, +// `sandbox_container_stopped`, and `sandbox_dashboard_port_conflict` +// layers; at that point this classifier becomes a thin adapter over +// that RPC and `printGatewayFailureLayerHeader` can move into the +// renderer. 
+ +export type SandboxStatusFailureLayer = + | "docker_unreachable" + | "sandbox_container_stopped" + | "sandbox_dashboard_port_conflict"; + +export interface SandboxStatusPreflightFailure { + layer: SandboxStatusFailureLayer; + dockerUnreachable: boolean; +} + +export interface SandboxStatusPreflightResult { + failure: SandboxStatusPreflightFailure | null; + failureLayer: SandboxStatusFailureLayer | null; + suppressInferenceProbe: boolean; + exitCode: 0 | 1; +} + +export type DockerInfoProbe = () => boolean; + +export type SandboxContainerFailureProbe = ( + sandboxName: string, + dashboardPort: number | null, +) => Promise<SandboxContainerFailureResult | null>; + +const defaultSandboxContainerFailureProbe: SandboxContainerFailureProbe = ( + sandboxName, + dashboardPort, +) => classifySandboxContainerFailure(sandboxName, { dashboardPort }); + +export interface ClassifySandboxStatusPreflightFailureDeps { + dockerProbe?: DockerInfoProbe; + sandboxContainerProbe?: SandboxContainerFailureProbe; +} + +export function isDockerDaemonUnreachableForStatus( + sb: registry.SandboxEntry | null, + probe: DockerInfoProbe = isDockerDaemonReachable, +): boolean { + if (!sb || sb.openshellDriver !== "docker") return false; + return !probe(); +} + +export async function classifySandboxContainerFailureForStatus( + sb: registry.SandboxEntry | null, + probe: SandboxContainerFailureProbe = defaultSandboxContainerFailureProbe, +): Promise<SandboxContainerFailureResult | null> { + if (!sb || sb.openshellDriver !== "docker") return null; + return probe(sb.name, sb.dashboardPort ?? null); +} + +/** + * Classify pre-snapshot failure layers (host docker daemon down, per-sandbox + * container stopped, dashboard port held by foreign listener). Returns null + * when none apply, including when the sandbox is not on the docker driver or + * the registry has no entry. Shared between the human-readable status + * renderer and the `--json` report so both paths gate the inference probe + * consistently and the JSON path can surface the same failure layer.
+ */ +export async function classifySandboxStatusPreflightFailure( + sb: registry.SandboxEntry | null, + deps: ClassifySandboxStatusPreflightFailureDeps = {}, +): Promise<SandboxStatusPreflightFailure | null> { + if (isDockerDaemonUnreachableForStatus(sb, deps.dockerProbe)) { + return { layer: "docker_unreachable", dockerUnreachable: true }; + } + const sandboxFailure = await classifySandboxContainerFailureForStatus( + sb, + deps.sandboxContainerProbe, + ); + if (sandboxFailure) { + return { layer: sandboxFailure.layer, dockerUnreachable: false }; + } + return null; +} + +/** + * Shared text/JSON adapter for preflight failures. It owns the projection from + * classifier result to JSON `failureLayer`, inference-probe suppression, and + * the text renderer's non-zero exit decision so `status.ts` only renders the + * returned contract. + */ +export async function getSandboxStatusPreflight( + sb: registry.SandboxEntry | null, + deps: ClassifySandboxStatusPreflightFailureDeps = {}, +): Promise<SandboxStatusPreflightResult> { + const failure = await classifySandboxStatusPreflightFailure(sb, deps); + return { + failure, + failureLayer: failure ? failure.layer : null, + suppressInferenceProbe: failure !== null, + exitCode: failure ? 1 : 0, + }; +} + +/** + * Preserve terminal OpenShell sandbox phases as the primary user-facing cause + * only for host-wide Docker daemon outages. A terminal `Failed`/`Error` phase + * is authoritative enough that docker_unreachable guidance would be misleading + * (#4428). Per-sandbox stopped-container and dashboard-port-conflict failures + * are more specific local delivery failures and must remain visible even when + * OpenShell reports `Phase: Error` (#4515).
+ */ +export function withoutTerminalPhasePreflight( + preflight: SandboxStatusPreflightResult, + phase: string | null, +): SandboxStatusPreflightResult { + if (!phase || !isTerminalSandboxPhase(phase)) return preflight; + if (preflight.failureLayer !== "docker_unreachable") return preflight; + return { + failure: null, + failureLayer: null, + suppressInferenceProbe: preflight.suppressInferenceProbe, + exitCode: 0, + }; +} + +/** + * Print the exact first-line preflight header. Unlike gateway-level fallback + * headers this intentionally has no leading indentation because users and + * tests rely on the `Failure layer:` header being the first bytes of status output. + */ +export function printSandboxStatusPreflightHeader( + preflight: SandboxStatusPreflightResult, + writer: (message: string) => void = console.log, +): void { + if (preflight.failure) { + writer(getLayerHeader(preflight.failure.layer)); + } +} + +/** + * Print the gateway-level failure-layer header for `sandbox status`. The + * preflight classifier (docker_unreachable, sandbox_container_stopped, + * sandbox_dashboard_port_conflict) is more specific than the downstream + * gateway-state classifier. When it already emitted a header, skip the + * gateway-level fallback entirely to avoid a duplicate `Failure layer:` + * line in the user-visible output. + */ +export async function printGatewayFailureLayerHeader( + sandboxName: string, + alreadyPrintedPreflightLayer: SandboxStatusFailureLayer | null = null, +): Promise<void> { + if (alreadyPrintedPreflightLayer !== null) return; + const failure = await classifyGatewayFailure(sandboxName); + console.log(` ${getLayerHeader(failure.layer)}`); +} diff --git a/src/lib/actions/sandbox/status-snapshot.ts b/src/lib/actions/sandbox/status-snapshot.ts new file mode 100644 index 0000000000..6073fc5237 --- /dev/null +++ b/src/lib/actions/sandbox/status-snapshot.ts @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +import { + detectOpenShellStateRpcResultIssue, + type OpenShellStateRpcIssue, +} from "../../adapters/openshell/gateway-drift"; +import { + captureOpenshellForStatus, + isCommandTimeout, +} from "../../adapters/openshell/runtime"; +import { parseGatewayInference } from "../../inference/config"; +import { + type ProviderHealthProbeOptions, + type ProviderHealthStatus, + probeProviderHealth, +} from "../../inference/health"; +import { parseSandboxPhase } from "../../state/gateway"; +import * as registry from "../../state/registry"; +import { getSandboxDockerRuntime } from "./docker-health"; +import type { SandboxGatewayState } from "./gateway-state"; +import { + getReconciledSandboxGatewayState, + getSandboxGatewayStateForStatus, +} from "./gateway-state"; +import { probeSandboxInferenceGatewayHealth } from "./process-recovery"; +import { + getSandboxStatusPreflight, + withoutTerminalPhasePreflight, + type SandboxStatusFailureLayer, +} from "./status-preflight"; + +type ProbeProviderHealth = ( + provider: string, + options?: ProviderHealthProbeOptions, +) => ProviderHealthStatus | null; + +export function getSandboxStatusInferenceHealth( + gatewayPresent: boolean, + currentProvider: unknown, + currentModel: unknown, + probeProviderHealthImpl: ProbeProviderHealth = probeProviderHealth, +): ProviderHealthStatus | null { + if (!gatewayPresent || typeof currentProvider !== "string") return null; + return probeProviderHealthImpl(currentProvider, { + model: typeof currentModel === "string" ? currentModel : undefined, + }); +} + +/** + * Gate around `getSandboxStatusInferenceHealth` that short-circuits when the + * caller has already classified a pre-snapshot failure (docker daemon down, + * sandbox container stopped, dashboard port held). Returns null without + * touching the provider probe so the remote-provider reachability request is + * never issued in those cases. 
+ */ +export function maybeGetSandboxStatusInferenceHealth( + suppressInferenceProbe: boolean, + gatewayPresent: boolean, + currentProvider: unknown, + currentModel: unknown, + probeProviderHealthImpl?: ProbeProviderHealth, +): ProviderHealthStatus | null { + if (suppressInferenceProbe) return null; + return getSandboxStatusInferenceHealth( + gatewayPresent, + currentProvider, + currentModel, + probeProviderHealthImpl, + ); +} + +export interface SandboxStatusReport { + schemaVersion: 1; + name: string; + found: boolean; + model: string; + provider: string; + phase: string | null; + gatewayState: string; + inferenceHealth: ProviderHealthStatus | null; + rpcIssue: { kind: "image_drift" | "host_process_drift" | "protobuf_mismatch" } | null; + hostGpuDetected: boolean; + sandboxGpuEnabled: boolean; + sandboxGpuMode: string | null; + sandboxGpuDevice: string | null; + openshellDriver: string; + openshellVersion: string; + policies: string[]; + failureLayer: SandboxStatusFailureLayer | null; + /** + * Whether the resolved docker-driver sandbox container is paused + * (`docker pause`). `false` for non-docker-driver sandboxes or when no + * container is found. A paused container can report `Phase: Error` + * upstream while the sandbox is intact — see #4495. 
+ */ + dockerPaused: boolean; +} + +export interface SandboxStatusSnapshot { + sb: registry.SandboxEntry | null; + lookup: SandboxGatewayState; + rpcIssue: OpenShellStateRpcIssue | null; + currentModel: string; + currentProvider: string; + inferenceHealth: ProviderHealthStatus | null; +} + +interface CollectSandboxStatusSnapshotDeps { + probeProviderHealthImpl?: ProbeProviderHealth; +} + +export async function collectSandboxStatusSnapshot( + sandboxName: string, + opts: { + suppressInferenceProbe?: boolean; + deps?: CollectSandboxStatusSnapshotDeps; + } = {}, +): Promise { + const sb = registry.getSandbox(sandboxName); + let lookup: SandboxGatewayState; + try { + lookup = await getReconciledSandboxGatewayState(sandboxName, { + getState: getSandboxGatewayStateForStatus, + }); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + lookup = { + state: "gateway_error", + output: ` Could not probe live gateway state: ${message}`, + }; + } + let liveResult: Awaited> | null = null; + if (lookup.state === "present") { + try { + liveResult = await captureOpenshellForStatus(["inference", "get"]); + } catch { + liveResult = null; + } + } + const rpcIssue = liveResult ? detectOpenShellStateRpcResultIssue(liveResult) : null; + if (rpcIssue) { + return { + sb, + lookup, + rpcIssue, + currentModel: "unknown", + currentProvider: "unknown", + inferenceHealth: null, + }; + } + const live = + liveResult && !isCommandTimeout(liveResult) ? parseGatewayInference(liveResult.output) : null; + const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; + const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; + // When the caller has already determined that the local stack is failed + // (docker daemon down, sandbox container stopped, dashboard port held), + // skip the provider probe entirely. 
Without this gate + // `getSandboxStatusInferenceHealth` would still issue the remote-provider + // reachability request even though the caller would overwrite the returned + // value to null afterwards. + const inferenceHealth = maybeGetSandboxStatusInferenceHealth( + opts.suppressInferenceProbe === true, + lookup.state === "present", + currentProvider, + currentModel, + opts.deps?.probeProviderHealthImpl, + ); + if ( + inferenceHealth && + lookup.state === "present" && + (currentProvider === "ollama-local" || currentProvider === "vllm-local") + ) { + const gatewayChain = await probeSandboxInferenceGatewayHealth(sandboxName); + if (gatewayChain) { + const gatewaySubprobe: ProviderHealthStatus = { + ok: gatewayChain.ok, + probed: true, + providerLabel: "Inference gateway chain", + endpoint: gatewayChain.endpoint, + detail: gatewayChain.detail, + probeLabel: "gateway", + ...(gatewayChain.ok ? {} : { failureLabel: "unreachable" as const }), + }; + inferenceHealth.subprobes = [...(inferenceHealth.subprobes ?? []), gatewaySubprobe]; + } + } + return { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth }; +} + +export async function getSandboxStatusReport( + sandboxName: string, +): Promise { + const preflight = await getSandboxStatusPreflight(registry.getSandbox(sandboxName)); + const snapshot = await collectSandboxStatusSnapshot(sandboxName, { + suppressInferenceProbe: preflight.suppressInferenceProbe, + }); + const { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth } = snapshot; + const dockerRuntime = + lookup.state === "present" ? getSandboxDockerRuntime(sandboxName) : null; + const phase = + lookup.state === "present" ? parseSandboxPhase(lookup.output || "") : null; + const effectivePreflight = withoutTerminalPhasePreflight(preflight, phase); + const sandboxGpuEnabled = sb + ? (sb.sandboxGpuEnabled ?? (sb.gpuEnabled === true)) + : false; + const policies = + sb && Array.isArray(sb.policies) + ? 
sb.policies.filter((policy): policy is string => typeof policy === "string") + : []; + return { + schemaVersion: 1, + name: sandboxName, + found: !!sb, + model: currentModel, + provider: currentProvider, + phase, + gatewayState: lookup.state, + inferenceHealth, + rpcIssue: rpcIssue ? { kind: rpcIssue.kind } : null, + hostGpuDetected: !!(sb && sb.hostGpuDetected), + sandboxGpuEnabled, + sandboxGpuMode: (sb && sb.sandboxGpuMode) || null, + sandboxGpuDevice: (sb && sb.sandboxGpuDevice) || null, + openshellDriver: (sb && sb.openshellDriver) || "unknown", + openshellVersion: (sb && sb.openshellVersion) || "unknown", + policies, + failureLayer: effectivePreflight.failureLayer, + dockerPaused: !!dockerRuntime?.paused, + }; +} diff --git a/src/lib/actions/sandbox/status.test.ts b/src/lib/actions/sandbox/status.test.ts index b75864a586..9722fc0399 100644 --- a/src/lib/actions/sandbox/status.test.ts +++ b/src/lib/actions/sandbox/status.test.ts @@ -4,7 +4,13 @@ import { describe, expect, it } from "vitest"; import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health"; -import { getSandboxStatusInferenceHealth } from "../../../../dist/lib/actions/sandbox/status"; +import { + classifySandboxContainerFailureForStatus, + classifySandboxStatusPreflightFailure, + getSandboxStatusInferenceHealth, + isDockerDaemonUnreachableForStatus, + maybeGetSandboxStatusInferenceHealth, +} from "../../../../dist/lib/actions/sandbox/status"; describe("sandbox status inference health", () => { it("passes the current model with the current provider", () => { @@ -50,3 +56,223 @@ describe("sandbox status inference health", () => { expect(called).toBe(false); }); }); + +describe("isDockerDaemonUnreachableForStatus", () => { + it("returns false when sandbox entry is null", () => { + expect(isDockerDaemonUnreachableForStatus(null, () => false)).toBe(false); + }); + + it("returns false when the openshell driver is not docker", () => { + expect( + 
isDockerDaemonUnreachableForStatus( + { name: "alpha", openshellDriver: "vm" } as never, + () => false, + ), + ).toBe(false); + }); + + it("returns true when driver is docker and the probe reports unreachable", () => { + expect( + isDockerDaemonUnreachableForStatus( + { name: "alpha", openshellDriver: "docker" } as never, + () => false, + ), + ).toBe(true); + }); + + it("returns false when driver is docker and the probe reports reachable", () => { + expect( + isDockerDaemonUnreachableForStatus( + { name: "alpha", openshellDriver: "docker" } as never, + () => true, + ), + ).toBe(false); + }); +}); + +describe("classifySandboxContainerFailureForStatus", () => { + it("returns null when sandbox entry is null", async () => { + const probe = async () => { + throw new Error("probe should not be invoked"); + }; + await expect( + classifySandboxContainerFailureForStatus(null, probe), + ).resolves.toBeNull(); + }); + + it("returns null when the openshell driver is not docker", async () => { + let called = false; + const probe = async () => { + called = true; + return null; + }; + await expect( + classifySandboxContainerFailureForStatus( + { name: "alpha", openshellDriver: "vm" } as never, + probe, + ), + ).resolves.toBeNull(); + expect(called).toBe(false); + }); + + it("forwards the sandbox name and dashboard port to the probe and propagates its verdict", async () => { + const observed: { sandboxName: string; port: number | null }[] = []; + const probe = async (sandboxName: string, dashboardPort: number | null) => { + observed.push({ sandboxName, port: dashboardPort }); + return { + layer: "sandbox_dashboard_port_conflict" as const, + detail: "stub failure", + }; + }; + const result = await classifySandboxContainerFailureForStatus( + { + name: "alpha", + openshellDriver: "docker", + dashboardPort: 18900, + } as never, + probe, + ); + expect(result).toEqual({ + layer: "sandbox_dashboard_port_conflict", + detail: "stub failure", + }); + expect(observed).toEqual([{ sandboxName: 
"alpha", port: 18900 }]); + }); + + it("passes null when the sandbox entry has no dashboard port recorded", async () => { + const observed: { sandboxName: string; port: number | null }[] = []; + const probe = async (sandboxName: string, dashboardPort: number | null) => { + observed.push({ sandboxName, port: dashboardPort }); + return null; + }; + await expect( + classifySandboxContainerFailureForStatus( + { name: "alpha", openshellDriver: "docker" } as never, + probe, + ), + ).resolves.toBeNull(); + expect(observed).toEqual([{ sandboxName: "alpha", port: null }]); + }); +}); + +describe("maybeGetSandboxStatusInferenceHealth", () => { + it("does not invoke the provider probe when suppressInferenceProbe is true even with a present gateway and string provider", () => { + let probeCalls = 0; + const result = maybeGetSandboxStatusInferenceHealth( + true, + true, + "nvidia-prod", + "nvidia/nemotron", + (...args) => { + probeCalls += 1; + throw new Error(`probeProviderHealth should not be invoked (args=${JSON.stringify(args)})`); + }, + ); + expect(result).toBeNull(); + expect(probeCalls).toBe(0); + }); + + it("delegates to the probe when suppressInferenceProbe is false", () => { + const calls: { provider: string; options?: ProviderHealthProbeOptions }[] = []; + const result = maybeGetSandboxStatusInferenceHealth( + false, + true, + "nvidia-prod", + "nvidia/nemotron", + (provider, options) => { + calls.push({ provider, options }); + return { + ok: true, + probed: true, + providerLabel: "NVIDIA Endpoints", + endpoint: "https://integrate.api.nvidia.com/v1/chat/completions", + detail: "healthy", + }; + }, + ); + expect(result?.ok).toBe(true); + expect(calls).toEqual([ + { provider: "nvidia-prod", options: { model: "nvidia/nemotron" } }, + ]); + }); +}); + +describe("classifySandboxStatusPreflightFailure", () => { + it("returns docker_unreachable when the daemon probe reports unreachable", async () => { + let sandboxProbeCalled = false; + const result = await 
classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "docker" } as never, + { + dockerProbe: () => false, + sandboxContainerProbe: async () => { + sandboxProbeCalled = true; + return null; + }, + }, + ); + expect(result).toEqual({ layer: "docker_unreachable", dockerUnreachable: true }); + // Short-circuits: a daemon that is already known to be down must not + // trigger a follow-up `docker ps` round trip. + expect(sandboxProbeCalled).toBe(false); + }); + + it("returns the sandbox container failure when the daemon is reachable", async () => { + const result = await classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "docker", dashboardPort: 18789 } as never, + { + dockerProbe: () => true, + sandboxContainerProbe: async (sandboxName, dashboardPort) => { + expect(sandboxName).toBe("alpha"); + expect(dashboardPort).toBe(18789); + return { + layer: "sandbox_dashboard_port_conflict", + detail: "stub failure", + }; + }, + }, + ); + expect(result).toEqual({ + layer: "sandbox_dashboard_port_conflict", + dockerUnreachable: false, + }); + }); + + it("returns null when the sandbox container probe finds no failure", async () => { + const result = await classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "docker" } as never, + { + dockerProbe: () => true, + sandboxContainerProbe: async () => null, + }, + ); + expect(result).toBeNull(); + }); + + it("returns null when the sandbox is not on the docker driver", async () => { + let dockerCalled = false; + let sandboxCalled = false; + const result = await classifySandboxStatusPreflightFailure( + { name: "alpha", openshellDriver: "vm" } as never, + { + dockerProbe: () => { + dockerCalled = true; + return false; + }, + sandboxContainerProbe: async () => { + sandboxCalled = true; + return null; + }, + }, + ); + expect(result).toBeNull(); + // Both gates are docker-driver-only; a vm sandbox must not provoke + // either probe. 
+ expect(dockerCalled).toBe(false); + expect(sandboxCalled).toBe(false); + }); + + it("returns null when the sandbox entry is null", async () => { + const result = await classifySandboxStatusPreflightFailure(null); + expect(result).toBeNull(); + }); +}); diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index 4b91da4dcc..99667784ed 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -2,24 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 -import { - detectOpenShellStateRpcResultIssue, - printOpenShellStateRpcIssue, -} from "../../adapters/openshell/gateway-drift"; +import { printOpenShellStateRpcIssue } from "../../adapters/openshell/gateway-drift"; import { resolveOpenshell } from "../../adapters/openshell/resolve"; -import { - captureOpenshellForStatus, - isCommandTimeout, -} from "../../adapters/openshell/runtime"; import * as agentRuntime from "../../agent/runtime"; import { CLI_DISPLAY_NAME, CLI_NAME } from "../../cli/branding"; import { D, G, R, RD, YW } from "../../cli/terminal-style"; -import { parseGatewayInference } from "../../inference/config"; -import { - type ProviderHealthProbeOptions, - type ProviderHealthStatus, - probeProviderHealth, -} from "../../inference/health"; +import { type ProviderHealthStatus } from "../../inference/health"; import * as nim from "../../inference/nim"; import * as sandboxVersion from "../../sandbox/version"; import * as shields from "../../shields"; @@ -31,27 +19,44 @@ import { } from "../../state/sandbox-session"; import { getSandboxDockerRuntime } from "./docker-health"; import { - classifyGatewayFailure, - getLayerHeader, isDockerRuntimeDown, printDockerRuntimeDownGuidance, } from "./gateway-failure-classifier"; import type { SandboxGatewayState } from "./gateway-state"; import { - getReconciledSandboxGatewayState, - getSandboxGatewayStateForStatus, printGatewayLifecycleHint, printWrongGatewayActiveGuidance, } from "./gateway-state"; +import { 
isSandboxGatewayRunningForStatus } from "./process-recovery"; +import { collectSandboxStatusSnapshot } from "./status-snapshot"; import { - isSandboxGatewayRunningForStatus, - probeSandboxInferenceGatewayHealth, -} from "./process-recovery"; + getSandboxStatusPreflight, + printGatewayFailureLayerHeader, + printSandboxStatusPreflightHeader, + withoutTerminalPhasePreflight, +} from "./status-preflight"; -type ProbeProviderHealth = ( - provider: string, - options?: ProviderHealthProbeOptions, -) => ProviderHealthStatus | null; +export { + classifySandboxContainerFailureForStatus, + classifySandboxStatusPreflightFailure, + isDockerDaemonUnreachableForStatus, + getSandboxStatusPreflight, + printGatewayFailureLayerHeader, + printSandboxStatusPreflightHeader, + withoutTerminalPhasePreflight, + type ClassifySandboxStatusPreflightFailureDeps, + type SandboxStatusFailureLayer, + type SandboxStatusPreflightFailure, + type SandboxStatusPreflightResult, +} from "./status-preflight"; +export { + collectSandboxStatusSnapshot, + getSandboxStatusInferenceHealth, + getSandboxStatusReport, + maybeGetSandboxStatusInferenceHealth, + type SandboxStatusReport, + type SandboxStatusSnapshot, +} from "./status-snapshot"; /** * Returns true when status can validate a cached agent version against the running sandbox. @@ -63,156 +68,6 @@ function shouldProbeSandboxRuntimeVersion( return lookup.state === "present" && Boolean(sandbox.agentVersion); } -export function getSandboxStatusInferenceHealth( - gatewayPresent: boolean, - currentProvider: unknown, - currentModel: unknown, - probeProviderHealthImpl: ProbeProviderHealth = probeProviderHealth, -): ProviderHealthStatus | null { - if (!gatewayPresent || typeof currentProvider !== "string") return null; - return probeProviderHealthImpl(currentProvider, { - model: typeof currentModel === "string" ? 
currentModel : undefined, - }); -} - -export interface SandboxStatusReport { - schemaVersion: 1; - name: string; - found: boolean; - model: string; - provider: string; - phase: string | null; - gatewayState: string; - inferenceHealth: ProviderHealthStatus | null; - rpcIssue: { kind: "image_drift" | "host_process_drift" | "protobuf_mismatch" } | null; - hostGpuDetected: boolean; - sandboxGpuEnabled: boolean; - sandboxGpuMode: string | null; - sandboxGpuDevice: string | null; - openshellDriver: string; - openshellVersion: string; - policies: string[]; - /** - * Whether the resolved docker-driver sandbox container is paused - * (`docker pause`). `false` for non-docker-driver sandboxes or when no - * container is found. A paused container can report `Phase: Error` - * upstream while the sandbox is intact — see #4495. - */ - dockerPaused: boolean; -} - -interface SandboxStatusSnapshot { - sb: registry.SandboxEntry | null; - lookup: SandboxGatewayState; - rpcIssue: ReturnType; - currentModel: string; - currentProvider: string; - inferenceHealth: ProviderHealthStatus | null; -} - -async function collectSandboxStatusSnapshot( - sandboxName: string, -): Promise { - const sb = registry.getSandbox(sandboxName); - let lookup: SandboxGatewayState; - try { - lookup = await getReconciledSandboxGatewayState(sandboxName, { - getState: getSandboxGatewayStateForStatus, - }); - } catch (err) { - const message = err instanceof Error ? err.message : String(err); - lookup = { - state: "gateway_error", - output: ` Could not probe live gateway state: ${message}`, - }; - } - let liveResult: Awaited> | null = null; - if (lookup.state === "present") { - try { - liveResult = await captureOpenshellForStatus(["inference", "get"]); - } catch { - liveResult = null; - } - } - const rpcIssue = liveResult ? 
detectOpenShellStateRpcResultIssue(liveResult) : null; - if (rpcIssue) { - return { - sb, - lookup, - rpcIssue, - currentModel: "unknown", - currentProvider: "unknown", - inferenceHealth: null, - }; - } - const live = - liveResult && !isCommandTimeout(liveResult) ? parseGatewayInference(liveResult.output) : null; - const currentModel = (live && live.model) || (sb && sb.model) || "unknown"; - const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown"; - const inferenceHealth = getSandboxStatusInferenceHealth( - lookup.state === "present", - currentProvider, - currentModel, - ); - if ( - inferenceHealth && - lookup.state === "present" && - (currentProvider === "ollama-local" || currentProvider === "vllm-local") - ) { - const gatewayChain = await probeSandboxInferenceGatewayHealth(sandboxName); - if (gatewayChain) { - const gatewaySubprobe: ProviderHealthStatus = { - ok: gatewayChain.ok, - probed: true, - providerLabel: "Inference gateway chain", - endpoint: gatewayChain.endpoint, - detail: gatewayChain.detail, - probeLabel: "gateway", - ...(gatewayChain.ok ? {} : { failureLabel: "unreachable" as const }), - }; - inferenceHealth.subprobes = [...(inferenceHealth.subprobes ?? []), gatewaySubprobe]; - } - } - return { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth }; -} - -export async function getSandboxStatusReport( - sandboxName: string, -): Promise { - const snapshot = await collectSandboxStatusSnapshot(sandboxName); - const { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth } = snapshot; - const dockerRuntime = - lookup.state === "present" ? getSandboxDockerRuntime(sandboxName) : null; - const phase = - lookup.state === "present" ? parseSandboxPhase(lookup.output || "") : null; - const sandboxGpuEnabled = sb - ? (sb.sandboxGpuEnabled ?? (sb.gpuEnabled === true)) - : false; - const policies = - sb && Array.isArray(sb.policies) - ? 
sb.policies.filter((policy): policy is string => typeof policy === "string") - : []; - return { - schemaVersion: 1, - name: sandboxName, - found: !!sb, - model: currentModel, - provider: currentProvider, - phase, - gatewayState: lookup.state, - inferenceHealth, - rpcIssue: rpcIssue ? { kind: rpcIssue.kind } : null, - hostGpuDetected: !!(sb && sb.hostGpuDetected), - sandboxGpuEnabled, - sandboxGpuMode: (sb && sb.sandboxGpuMode) || null, - sandboxGpuDevice: (sb && sb.sandboxGpuDevice) || null, - openshellDriver: (sb && sb.openshellDriver) || "unknown", - openshellVersion: (sb && sb.openshellVersion) || "unknown", - policies, - dockerPaused: !!dockerRuntime?.paused, - }; -} - /** * Render one Inference status line. The main probe and each subprobe go * through this helper so multi-hop providers (e.g. ollama-local backend + @@ -256,11 +111,6 @@ function maybeEnsureHermesToolGatewayBroker(sb: registry.SandboxEntry | null): v } } -async function printGatewayFailureLayerHeader(sandboxName: string): Promise { - const failure = await classifyGatewayFailure(sandboxName); - console.log(` ${getLayerHeader(failure.layer)}`); -} - function printMissingLiveSandboxStatusGuidance( sandboxName: string, lookup: SandboxGatewayState, @@ -286,17 +136,26 @@ function printMissingLiveSandboxStatusGuidance( // eslint-disable-next-line complexity export async function showSandboxStatus(sandboxName: string): Promise { + const preflight = await getSandboxStatusPreflight(registry.getSandbox(sandboxName)); // #2666: never let an unexpected throw from the gateway probe (e.g. openshell // hanging when its container is stopped and the published port is held by a // foreign listener) suppress the sandbox header. The downstream switch // handles `gateway_error` by printing an actionable block + exit(1), so a // synthesized fallback keeps the user-visible contract intact. 
- const snapshot = await collectSandboxStatusSnapshot(sandboxName); + const snapshot = await collectSandboxStatusSnapshot(sandboxName, { + suppressInferenceProbe: preflight.suppressInferenceProbe, + }); const { sb, lookup, rpcIssue, currentModel, currentProvider, inferenceHealth } = snapshot; // Resolve the docker-driver container once: reused for the paused-container // recovery hint (#4495) and the Docker health line below (#3975). const dockerRuntime = lookup.state === "present" ? getSandboxDockerRuntime(sandboxName) : null; + const phase = lookup.state === "present" ? parseSandboxPhase(lookup.output || "") : null; + const effectivePreflight = withoutTerminalPhasePreflight(preflight, phase); + printSandboxStatusPreflightHeader(effectivePreflight); + if (effectivePreflight.exitCode !== 0) { + process.exitCode = effectivePreflight.exitCode; + } maybeEnsureHermesToolGatewayBroker(sb); if (rpcIssue) { printOpenShellStateRpcIssue(rpcIssue, { @@ -400,7 +259,6 @@ export async function showSandboxStatus(sandboxName: string): Promise { console.log(""); } console.log(lookup.output); - const phase = parseSandboxPhase(lookup.output || ""); if (phase && phase !== "Ready") { // A non-ready, non-terminal phase can mean two very different things. 
If // the Docker daemon is down, OpenShell can still return a present-but- @@ -476,7 +334,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { process.exit(1); } else if (lookup.state === "gateway_unreachable_after_restart") { console.log(""); - await printGatewayFailureLayerHeader(sandboxName); + await printGatewayFailureLayerHeader(sandboxName, effectivePreflight.failureLayer); console.log( ` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`, ); @@ -492,7 +350,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { process.exit(1); } else if (lookup.state === "gateway_missing_after_restart") { console.log(""); - await printGatewayFailureLayerHeader(sandboxName); + await printGatewayFailureLayerHeader(sandboxName, effectivePreflight.failureLayer); console.log( ` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`, ); @@ -512,7 +370,7 @@ export async function showSandboxStatus(sandboxName: string): Promise { if (lookup.output) { console.log(lookup.output); } - await printGatewayFailureLayerHeader(sandboxName); + await printGatewayFailureLayerHeader(sandboxName, effectivePreflight.failureLayer); printGatewayLifecycleHint(lookup.output, sandboxName, console.log); process.exit(1); } diff --git a/test/cli.test.ts b/test/cli.test.ts index e615fd2b01..5b2b7de06e 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -5,6 +5,7 @@ import { describe, it, expect } from "vitest"; import { execSync, spawn, spawnSync } from "node:child_process"; import type { ChildProcess } from "node:child_process"; import fs from "node:fs"; +import net from "node:net"; import os from "node:os"; import path from "node:path"; import { parse as parseYaml } from "yaml"; @@ -1108,6 +1109,293 @@ describe("CLI dispatch", () => { }); }); + it("sandbox status surfaces docker_unreachable header 
and suppresses stale Inference probe", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-docker-unreachable-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + ["#!/usr/bin/env bash", "exit 1"].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect(r.out.startsWith( + "Failure layer: docker_unreachable — Docker daemon is not reachable.", + )).toBe(true); + expect(r.out).not.toContain("Inference: healthy"); + const headerIdx = r.out.indexOf("Failure layer: docker_unreachable"); + const sandboxIdx = r.out.indexOf("Sandbox: alpha"); + expect(headerIdx).toBeGreaterThanOrEqual(0); + expect(sandboxIdx).toBeGreaterThan(headerIdx); + expect( + (r.out.match(/Failure layer: docker_unreachable/g) || []).length, + ).toBe(1); + }); + + it("sandbox status preserves Inference probe and exits 0 when openshellDriver is not docker", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-non-docker-driver-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true 
}); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "vm", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + ["#!/usr/bin/env bash", "exit 1"].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).not.toContain("Failure layer: docker_unreachable"); + expect(r.out).toContain("Sandbox: alpha"); + expect(r.out).toContain("Provider: openai-api"); + expect(r.out).toContain("Model: gpt-4o-mini"); + expect(r.out).toContain("Inference: healthy"); + }); + + it("sandbox status surfaces sandbox_container_stopped when the per-sandbox container exists but is not running", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-container-stopped-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo 
"openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect( + r.out.startsWith( + "Failure layer: sandbox_container_stopped — sandbox container exists but is not running.", + ), + ).toBe(true); + expect(r.out).not.toContain("Inference: healthy"); + expect(r.out).toContain("Phase: Error"); + expect(r.out).not.toContain("Failure layer: docker_unreachable"); + expect(r.out).not.toContain("Failure layer: sandbox_dashboard_port_conflict"); + const headerIdx = r.out.indexOf("Failure layer: sandbox_container_stopped"); + const sandboxIdx = r.out.indexOf("Sandbox: alpha"); + expect(headerIdx).toBeGreaterThanOrEqual(0); + expect(sandboxIdx).toBeGreaterThan(headerIdx); + // The downstream gateway-state fallback header (`Failure layer: ...`) + // must be suppressed once preflight has already emitted its own. + // Otherwise a non-`present` gateway lookup would print a redundant + // second `Failure layer:` line later in the output. 
+ expect((r.out.match(/Failure layer:/g) || []).length).toBe(1); + }); + + it("sandbox status surfaces sandbox_dashboard_port_conflict when the sandbox container is stopped and the dashboard port is held by a foreign listener", async () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-port-conflict-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + + const server = net.createServer(); + await new Promise((resolve, reject) => { + server.once("error", reject); + server.listen(0, "127.0.0.1", () => resolve()); + }); + const address = server.address(); + if (!address || typeof address === "string") { + server.close(); + throw new Error("failed to bind foreign listener on a free port"); + } + const dashboardPort = address.port; + + try { + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + dashboardPort, + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: 
nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect( + r.out.startsWith( + "Failure layer: sandbox_dashboard_port_conflict — sandbox container is stopped and the dashboard port is held by a foreign listener.", + ), + ).toBe(true); + expect(r.out).not.toContain("Inference: healthy"); + expect(r.out).toContain("Phase: Error"); + expect(r.out).not.toContain("Failure layer: sandbox_container_stopped —"); + const headerIdx = r.out.indexOf("Failure layer: sandbox_dashboard_port_conflict"); + const sandboxIdx = r.out.indexOf("Sandbox: alpha"); + expect(headerIdx).toBeGreaterThanOrEqual(0); + expect(sandboxIdx).toBeGreaterThan(headerIdx); + // Downstream gateway-state fallback must not print a second + // `Failure layer:` line when preflight already emitted one. + expect((r.out.match(/Failure layer:/g) || []).length).toBe(1); + } finally { + await new Promise((resolve) => server.close(() => resolve())); + } + }); + it("sandbox status --json emits structured per-sandbox report", () => { const home = fs.mkdtempSync( path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-"), @@ -1127,6 +1415,17 @@ describe("CLI dispatch", () => { openshellDriver: "docker", openshellVersion: "0.0.44", } as unknown as Partial); + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + `if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-cluster-nemoclaw"; echo "openshell-${sandboxName}-7616dcb1"; exit 0; fi`, + `if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; echo "openshell-${sandboxName}-7616dcb1"; exit 0; fi`, + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); fs.writeFileSync( path.join(localBin, "openshell"), [ @@ -1450,6 +1749,252 @@ describe("CLI dispatch", () => { 
expect(parsed.inferenceHealth).toBeNull(); }); + it("sandbox status --json sets failureLayer=docker_unreachable, suppresses inferenceHealth, and exits 1 when the host Docker daemon is unreachable", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-docker-unreachable-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + ["#!/usr/bin/env bash", "exit 1"].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBe("docker_unreachable"); + expect(parsed.inferenceHealth).toBeNull(); + expect(parsed.name).toBe("alpha"); + expect(parsed.found).toBe(true); + }); + + it("sandbox status --json sets failureLayer=sandbox_container_stopped when the per-sandbox container is stopped", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-container-stopped-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: 
"gpt-4o-mini", + openshellDriver: "docker", + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBe("sandbox_container_stopped"); + expect(parsed.phase).toBe("Error"); + expect(parsed.inferenceHealth).toBeNull(); + }); + + it("sandbox status --json sets failureLayer=sandbox_dashboard_port_conflict when the dashboard port is held by a foreign listener", async () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-port-conflict-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + + const server = net.createServer(); + await new Promise((resolve, reject) => { + server.once("error", reject); + server.listen(0, "127.0.0.1", () => 
resolve()); + }); + const address = server.address(); + if (!address || typeof address === "string") { + server.close(); + throw new Error("failed to bind foreign listener on a free port"); + } + const dashboardPort = address.port; + + try { + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "docker", + dashboardPort, + } as unknown as Partial); + + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "info" ]; then echo "Server: docker"; exit 0; fi', + 'if [ "$1" = "ps" ] && [ "$2" = "-a" ]; then echo "openshell-alpha-7616dcb1"; exit 0; fi', + 'if [ "$1" = "ps" ]; then echo "openshell-cluster-nemoclaw"; exit 0; fi', + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo ' Name: alpha'", + " echo ' Phase: Error'", + " exit 0", + "fi", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBe("sandbox_dashboard_port_conflict"); + expect(parsed.phase).toBe("Error"); + expect(parsed.inferenceHealth).toBeNull(); + } finally { + await new Promise((resolve) => server.close(() => resolve())); + } + }); + + it("sandbox status --json sets failureLayer=null when no preflight failure 
applies", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-json-failure-layer-null-"), + ); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, "alpha", { + provider: "openai-api", + model: "gpt-4o-mini", + openshellDriver: "vm", + } as unknown as Partial); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + 'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then', + " echo 'Gateway inference:'", + " echo ' Provider: openai-api'", + " echo ' Model: gpt-4o-mini'", + " exit 0", + "fi", + 'if [ "$1" = "status" ]; then', + " echo 'Gateway: nemoclaw'", + " echo 'Status: Connected'", + " exit 0", + "fi", + 'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then', + " echo 'Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha status --json", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + const parsed = JSON.parse(r.out); + expect(parsed.failureLayer).toBeNull(); + }); + it("sandbox status --help advertises --json flag", () => { const home = fs.mkdtempSync( path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-help-json-"), diff --git a/test/gateway-failure-classifier.test.ts b/test/gateway-failure-classifier.test.ts index b87fa3d487..73ecbd0656 100644 --- a/test/gateway-failure-classifier.test.ts +++ b/test/gateway-failure-classifier.test.ts @@ -5,10 +5,12 @@ import { describe, expect, it } from "vitest"; import { classifyGatewayFailure, + classifySandboxContainerFailure, getLayerHeader, type GatewayFailureRunners, isDockerRuntimeDown, printDockerRuntimeDownGuidance, + type SandboxContainerFailureRunners, } from "../dist/lib/actions/sandbox/gateway-failure-classifier.js"; function makeRunners(overrides: Partial = {}): GatewayFailureRunners { @@ -227,5 +229,223 @@ describe("getLayerHeader", () => { ); 
expect(getLayerHeader("container_exited")).toContain("container_exited"); expect(getLayerHeader("gateway_unreachable")).toContain("gateway_unreachable"); + expect(getLayerHeader("sandbox_container_stopped")).toContain( + "sandbox_container_stopped", + ); + expect(getLayerHeader("sandbox_dashboard_port_conflict")).toContain( + "sandbox_dashboard_port_conflict", + ); + }); +}); + +function makeSandboxRunners( + overrides: Partial<SandboxContainerFailureRunners> = {}, +): SandboxContainerFailureRunners { + return { + listAllContainerNames: () => "", + listRunningContainerNames: () => "", + listSandboxNames: () => [], + portProbe: async () => false, + ...overrides, + }; +} + +describe("classifySandboxContainerFailure", () => { + it("returns null when the sandbox container is running", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listRunningContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-cluster-nemoclaw", + listAllContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-cluster-nemoclaw", + }), + }); + expect(result).toBeNull(); + }); + + it("returns null when the sandbox container is not present anywhere", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-cluster-nemoclaw\n", + }), + }); + expect(result).toBeNull(); + }); + + it("returns sandbox_container_stopped when the container exists but is not running and no port is recorded", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-cluster-nemoclaw", + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(result?.detail).toContain("openshell-my-assistant-7616dcb1"); + }); + + it("returns sandbox_container_stopped when the container exists, is stopped, and the dashboard 
port is free", async () => { + let probedPort: number | null = null; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async (port) => { + probedPort = port; + return false; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(probedPort).toBe(18789); + }); + + it("returns sandbox_dashboard_port_conflict when the container exists, is stopped, and the dashboard port is held", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => true, + }), + }); + expect(result?.layer).toBe("sandbox_dashboard_port_conflict"); + expect(result?.detail).toContain("18789"); + expect(result?.detail).toContain("openshell-my-assistant-7616dcb1"); + }); + + it("does not probe the port when the container is running", async () => { + let portProbeCalled = false; + await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listRunningContainerNames: () => "openshell-my-assistant-7616dcb1", + listAllContainerNames: () => "openshell-my-assistant-7616dcb1", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(portProbeCalled).toBe(false); + }); + + it("does not probe the port when the container is not present at all", async () => { + let portProbeCalled = false; + await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-cluster-nemoclaw\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(portProbeCalled).toBe(false); + }); + + it("matches the exact prefix and accepts uuid-suffixed shapes that 
resolve back to the queried sandbox", async () => { + const exactResult = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant\n", + }), + }); + expect(exactResult?.layer).toBe("sandbox_container_stopped"); + expect(exactResult?.detail).toContain("openshell-my-assistant"); + + // `openshell-my-assistant-7616dcb1` belongs to `my-assistant` because no + // other registered sandbox name claims it via the longest-owner rule. + const uuidResult = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-different-sandbox-abc", + listSandboxNames: () => ["my-assistant", "different-sandbox"], + }), + }); + expect(uuidResult?.layer).toBe("sandbox_container_stopped"); + expect(uuidResult?.detail).toContain("openshell-my-assistant-7616dcb1"); + + const unrelated = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-cluster-nemoclaw\nopenshell-my-assistantextra\n", + }), + }); + expect(unrelated).toBeNull(); + }); + + it("rejects a longer registered sandbox's container even when the literal prefix matches the queried name", async () => { + // `openshell-my-assistant-prod-7616dcb1` resolves to the longer + // `my-assistant-prod` sandbox via the longest-owner rule; the query for + // `my-assistant` must not consume it. Mirrors the docker-health.ts + // resolver and prevents the prefix-collision bug. 
+ const collision = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-prod-7616dcb1\nopenshell-cluster-nemoclaw", + listSandboxNames: () => ["my-assistant", "my-assistant-prod"], + }), + }); + expect(collision).toBeNull(); + }); + + it("matches an `openshell-<name>` exact container even when a co-tenant `openshell-<name>-<uuid>` exists in the same listing", async () => { + const result = await classifySandboxContainerFailure("my-assistant", { + runners: makeSandboxRunners({ + listAllContainerNames: () => + "openshell-my-assistant-7616dcb1\nopenshell-my-assistant", + listSandboxNames: () => ["my-assistant"], + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(result?.detail).toContain("openshell-my-assistant"); + expect(result?.detail).not.toContain("openshell-my-assistant-7616dcb1"); + }); + + it("ignores an out-of-range dashboardPort and falls back to sandbox_container_stopped", async () => { + let portProbeCalled = false; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 70000, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(portProbeCalled).toBe(false); + }); + + it("ignores a non-integer dashboardPort and falls back to sandbox_container_stopped", async () => { + let portProbeCalled = false; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 18789.5 as unknown as number, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(portProbeCalled).toBe(false); + }); + + it("ignores a zero 
dashboardPort and falls back to sandbox_container_stopped", async () => { + let portProbeCalled = false; + const result = await classifySandboxContainerFailure("my-assistant", { + dashboardPort: 0, + runners: makeSandboxRunners({ + listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n", + portProbe: async () => { + portProbeCalled = true; + return true; + }, + }), + }); + expect(result?.layer).toBe("sandbox_container_stopped"); + expect(portProbeCalled).toBe(false); }); }); diff --git a/test/sandbox-container-owner.test.ts b/test/sandbox-container-owner.test.ts new file mode 100644 index 0000000000..4974db2d26 --- /dev/null +++ b/test/sandbox-container-owner.test.ts @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; + +import { resolveSandboxContainerOwner } from "../dist/lib/actions/sandbox/sandbox-container-owner.js"; + +describe("resolveSandboxContainerOwner", () => { + it("returns null when no candidate matches the sandbox prefix", () => { + expect( + resolveSandboxContainerOwner( + "openshell-cluster-nemoclaw\nopenshell-different-sandbox-7616dcb1", + "my-assistant", + ["my-assistant", "different-sandbox"], + ), + ).toBeNull(); + }); + + it("prefers the exact-name container even when a co-tenant suffixed candidate exists in the same listing", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-7616dcb1\nopenshell-my-assistant", + "my-assistant", + ["my-assistant"], + ), + ).toBe("openshell-my-assistant"); + }); + + it("accepts a uuid-suffixed container that resolves to the queried sandbox via the longest-owner rule", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-7616dcb1\nopenshell-different-sandbox-abc", + "my-assistant", + ["my-assistant", "different-sandbox"], + ), + ).toBe("openshell-my-assistant-7616dcb1"); + }); + + it("rejects a 
container whose longest-owner is a different registered sandbox name", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-prod-7616dcb1\nopenshell-cluster-nemoclaw", + "my-assistant", + ["my-assistant", "my-assistant-prod"], + ), + ).toBeNull(); + }); + + it("rejects a container whose stripped name is not separated from the queried sandbox by a hyphen", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistantextra\nopenshell-cluster-nemoclaw", + "my-assistant", + ["my-assistant"], + ), + ).toBeNull(); + }); + + it("includes the queried sandbox in the known-owner set even when listSandboxNames omits it", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant-7616dcb1", + "my-assistant", + [], + ), + ).toBe("openshell-my-assistant-7616dcb1"); + }); + + it("trims whitespace and ignores blank lines from the docker ps stream", () => { + expect( + resolveSandboxContainerOwner( + " openshell-my-assistant-7616dcb1 \n\n openshell-cluster-nemoclaw \n", + "my-assistant", + ["my-assistant"], + ), + ).toBe("openshell-my-assistant-7616dcb1"); + }); + + it("matches an exact-name container even when listSandboxNames is empty", () => { + expect( + resolveSandboxContainerOwner( + "openshell-my-assistant", + "my-assistant", + [], + ), + ).toBe("openshell-my-assistant"); + }); +});