From a7c7789163466a8d2c83a68145b3339d34135abc Mon Sep 17 00:00:00 2001 From: Yimo Jiang Date: Tue, 26 May 2026 08:09:19 +0000 Subject: [PATCH] fix(onboard): make timeout/killed DNS probes fatal and add Jetson bridge preflight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DNS preflight probe funneled docker+busybox `nslookup` through a 20s spawn-level timeout. On a host with outbound DNS blocked, the child was killed and the result became `reason="no_output"`, which the onboarding gate treated as inconclusive — onboarding proceeded until later docker build commands hung for ~15 minutes (#3630). Plumb structured execution metadata (timedOut, signal, exitCode, errorCode) through the probe so a killed child becomes `reason="timeout"`/`"killed"` with full execution context. Treat those reasons as fatal alongside `servers_unreachable`/`resolution_failed` via new `isFatalContainerDnsProbeFailure`. `image_pull_failed` is fatal only when the daemon error matches a registry-DNS signature (`lookup ...: no such host`, `getaddrinfo`, etc.); TCP-connectivity errors (`dial tcp <ip>:443: i/o timeout`) and other pull failures stay inconclusive. Decouple pull from probe with `ensureProbeImageCached` (10s inspect, 60s pull). A slow-registry cold-cache pull no longer gets charged against the 20s probe budget and falsely classified as a fatal probe timeout. A wedged daemon (`Cannot connect to the Docker daemon` in inspect stderr) surfaces as the distinct `docker_daemon_unreachable` reason — fatal, with a Docker restart hint. Test seams (`runCaptureImpl`/`runProbeImpl`/`runImpl`/`executionOverride`) skip the pre-pull so hermetic tests on clean CI workers still work. Add `probeDockerBridgeContainerStart` that catches Jetson Thor "failed to add the host <=> sandbox veth pair interfaces: operation not supported" before long sandbox builds (#3508). The gate exits on veth/timeout/killed/docker_daemon_unreachable; generic `error` (e.g. 
a daemon with the default bridge disabled but using a managed network) and `image_pull_failed` stay inconclusive and fall through to the DNS probe. Tighten `isProbeTimeout` in gateway reachability to spawn-level ETIMEDOUT / killed-by-signal so BusyBox `nc` connection-level "Operation timed out" stderr keeps falling through to `tcp_failed` with the UFW remediation path. Add a new `docker_daemon_unreachable` gateway reason so `inspect_unavailable` upgrades to fatal instead of the warn-only `probe_unavailable`. Extract the bridge + DNS gate into `assertDockerBridgeAndContainerDnsHealthy` and call it both from `preflight()` and from the `--resume` branch (mirroring the `assertCdiNvidiaGpuSpecPresent` #3152 resume backstop) so sessions cached by an older NemoClaw cannot skip the new fatal checks. Refs #3630, #3508. Signed-off-by: Yimo Jiang --- src/lib/onboard.ts | 206 +----- src/lib/onboard/bridge-dns-preflight.test.ts | 205 ++++++ src/lib/onboard/bridge-dns-preflight.ts | 393 ++++++++++ .../gateway-sandbox-reachability.test.ts | 284 ++++++++ .../onboard/gateway-sandbox-reachability.ts | 149 +++- src/lib/onboard/machine/handlers/preflight.ts | 27 +- src/lib/onboard/preflight.test.ts | 511 ++++++++++++- src/lib/onboard/preflight.ts | 679 ++++++++++++++++-- 8 files changed, 2227 insertions(+), 227 deletions(-) create mode 100644 src/lib/onboard/bridge-dns-preflight.test.ts create mode 100644 src/lib/onboard/bridge-dns-preflight.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 518a276843..dbc6eb288f 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -452,11 +452,12 @@ const { assessHost, checkPortAvailable, ensureSwap, - getDockerBridgeGatewayIp, getMemoryInfo, planHostRemediation, - probeContainerDns, } = preflightUtils; +const { + assertDockerBridgeAndContainerDnsHealthy, +}: typeof import("./onboard/bridge-dns-preflight") = require("./onboard/bridge-dns-preflight"); const agentOnboard = require("./agent/onboard"); const agentDefs = 
require("./agent/defs"); @@ -1803,6 +1804,7 @@ function assertCdiNvidiaGpuSpecPresent( process.exit(1); } + type PreflightOptions = Pick< OnboardOptions, "sandboxGpu" | "sandboxGpuDevice" | "gpu" | "noGpu" @@ -1810,6 +1812,20 @@ type PreflightOptions = Pick< optedOutGpuPassthrough?: boolean; }; +// Reject unsupported container runtimes (currently only Podman with the +// Linux Docker-driver gateway) before any Docker-specific probes. Both +// the fresh preflight and `--resume` backstop call this — if `docker` +// resolves to Podman, surface the unsupported-runtime message instead of +// running bridge/DNS diagnostics that would be misleading. +function rejectUnsupportedContainerRuntime(host: ReturnType): void { + if (isLinuxDockerDriverGatewayEnabled() && host.runtime === "podman") { + console.error(` ✗ ${cliDisplayName()} onboarding now uses OpenShell's Docker driver.`); + console.error(` Podman is not supported for this ${cliDisplayName()} integration path.`); + console.error(" Switch to Docker Engine and rerun onboarding."); + process.exit(1); + } +} + async function preflight( preflightOpts: PreflightOptions = {}, ): Promise> { @@ -1823,6 +1839,10 @@ async function preflight( printRemediationActions(planHostRemediation(host)); process.exit(1); } + // Reject unsupported runtimes (Podman) BEFORE the success log so + // Podman users do not see a misleading `✓ Docker is running` line + // immediately followed by a fatal unsupported-runtime exit. + rejectUnsupportedContainerRuntime(host); console.log(" ✓ Docker is running"); require("./onboard/http-proxy-preflight").warnIfHostProxyMissesLoopback(); const gpu = nim.detectGpu(); @@ -1837,183 +1857,11 @@ async function preflight( !sandboxGpuConfig.sandboxGpuEnabled; assertCdiNvidiaGpuSpecPresent(host, optedOutGpuPassthrough, sandboxGpuConfig.hostGpuPlatform); - // DNS resolution from inside containers (#2101). 
A corp firewall that - // blocks outbound UDP:53 to public resolvers leaves the sandbox build - // unable to resolve registry.npmjs.org; npm then retries for ~15 min and - // prints the cryptic `Exit handler never called`. - const dns = probeContainerDns(); - // Only reasons where the probe actually *ran* nslookup and observed a DNS - // failure warrant blocking — other reasons are inconclusive (probe itself - // couldn't run, got killed, etc.) and shouldn't fail a valid environment. - const dnsIsFatal = dns.reason === "servers_unreachable" || dns.reason === "resolution_failed"; - - if (dns.ok) { - console.log(" ✓ Container DNS resolution works"); - } else if (!dnsIsFatal) { - // Inconclusive probe — warn but proceed. If the sandbox build really - // does hit a DNS issue, the user will see #2101 pointers in that layer. - if (dns.reason === "image_pull_failed") { - console.warn( - " ⚠ Container DNS probe inconclusive: docker couldn't pull the busybox test image.", - ); - console.warn(" This usually means the docker daemon itself can't reach Docker Hub,"); - console.warn( - " but doesn't prove container DNS is broken — the sandbox build may still succeed.", - ); - } else { - console.warn(` ⚠ Container DNS probe inconclusive (reason: ${dns.reason ?? "unknown"}).`); - } - if (dns.details) { - for (const line of String(dns.details).split("\n").slice(-3)) { - if (line.trim()) console.warn(` ${line.trim()}`); - } - } - console.warn(" Proceeding. If the sandbox build later hangs at `npm ci`, see issue #2101."); - } else { - console.error(" ✗ DNS resolution from inside a docker container failed."); - if (dns.details) { - for (const line of String(dns.details).split("\n").slice(-4)) { - if (line.trim()) console.error(` ${line.trim()}`); - } - } - console.error(""); - { - console.error(" The sandbox build runs `npm ci` inside a container and needs to resolve"); - console.error(" registry.npmjs.org. 
On networks that block outbound UDP:53 to public DNS"); - console.error(" (common in corporate environments that force DNS-over-TLS on the host),"); - console.error(" the build appears to hang for ~15 minutes and then prints the cryptic"); - console.error(" `npm error Exit handler never called`. See issue #2101."); - console.error(""); - console.error(" Fix options:"); - console.error(""); - - // Platform-aware remediation hints. The systemd-resolved fix is - // Linux-specific; macOS / Windows / WSL-backed-by-Docker-Desktop - // hosts configure DNS through Docker Desktop's GUI or a - // platform-specific daemon.json path, so we avoid printing shell - // commands that would mislead those users. - const isLinuxWithSystemd = - host.platform === "linux" && !host.isWsl && host.systemctlAvailable; - - const printLinuxFix = (bridgeIp: string, note: string | null) => { - if (note) console.error(note); - console.error(" sudo mkdir -p /etc/systemd/resolved.conf.d/"); - console.error( - ` printf '[Resolve]\\nDNSStubListenerExtra=${bridgeIp}\\n' | sudo tee /etc/systemd/resolved.conf.d/docker-bridge.conf`, - ); - console.error(" sudo systemctl restart systemd-resolved"); - console.error(""); - console.error( - " Then add the dns key to /etc/docker/daemon.json (safely merges with existing config if jq is installed):", - ); - console.error( - " sudo cp /etc/docker/daemon.json /etc/docker/daemon.json.bak-$(date +%s) 2>/dev/null", - ); - console.error( - ` { sudo jq '. 
+ {"dns":["${bridgeIp}"]}' /etc/docker/daemon.json 2>/dev/null || echo '{"dns":["${bridgeIp}"]}'; } | sudo tee /etc/docker/daemon.json.new >/dev/null`, - ); - console.error(" sudo mv /etc/docker/daemon.json.new /etc/docker/daemon.json"); - console.error(" sudo systemctl restart docker"); - }; - - if (isLinuxWithSystemd) { - const detectedBridgeIp = getDockerBridgeGatewayIp(); - const bridgeIp = detectedBridgeIp || "172.17.0.1"; - let bridgeNote: string | null = null; - if (detectedBridgeIp && detectedBridgeIp !== "172.17.0.1") { - bridgeNote = ` (detected your docker bridge gateway at ${detectedBridgeIp})`; - } else if (!detectedBridgeIp) { - bridgeNote = - " (could not auto-detect bridge IP; using docker's default — verify with:\n" + - " docker network inspect bridge --format '{{range .IPAM.Config}}{{.Gateway}}{{end}}')"; - } - console.error(" 1. Make systemd-resolved reachable from containers (recommended):"); - printLinuxFix(bridgeIp, bridgeNote); - console.error(""); - console.error(" 2. Configure an explicit UDP:53-capable DNS in /etc/docker/daemon.json"); - console.error(" (ask your IT team for an internal DNS server IP)."); - } else if (host.platform === "darwin") { - // On macOS, branch by the detected runtime (host.runtime) so users get - // shell commands they can actually paste, not a "click this GUI" hint. - if (host.runtime === "colima") { - console.error(" Configure Colima's DNS (macOS):"); - console.error(" colima stop"); - console.error(" colima start --dns "); - console.error(" (or edit ~/.colima/default/colima.yaml and `colima restart`)"); - } else if (host.runtime === "docker-desktop" || host.runtime === "docker") { - console.error(" Configure Docker Desktop's DNS (macOS):"); - console.error( - " cp ~/.docker/daemon.json ~/.docker/daemon.json.bak-$(date +%s) 2>/dev/null", - ); - console.error( - ` { jq '. 
+ {"dns":[""]}' ~/.docker/daemon.json 2>/dev/null || echo '{"dns":[""]}'; } > ~/.docker/daemon.json.new && mv ~/.docker/daemon.json.new ~/.docker/daemon.json`, - ); - console.error(" osascript -e 'quit app \"Docker\"' && sleep 3 && open -a Docker"); - console.error( - " (or do the same via the Docker Desktop UI: Settings → Docker Engine)", - ); - } else { - // Unknown / podman / other - console.error(" Configure your container runtime's DNS (macOS):"); - console.error(" - Docker Desktop:"); - console.error( - ' { jq \'. + {"dns":[""]}\' ~/.docker/daemon.json 2>/dev/null || echo \'{"dns":[""]}\'; } > ~/.docker/daemon.json.new && mv ~/.docker/daemon.json.new ~/.docker/daemon.json', - ); - console.error(" osascript -e 'quit app \"Docker\"' && sleep 3 && open -a Docker"); - console.error(" - Colima:"); - console.error(" colima stop && colima start --dns "); - console.error(" - Rancher Desktop / Podman: edit the runtime's DNS config"); - console.error(" and restart it."); - } - console.error(" Ask your IT team for an internal DNS server IP that accepts UDP:53."); - } else if (host.platform === "win32" || host.isWsl) { - console.error(" 1. Configure Docker Desktop's DNS (Windows / WSL via Docker Desktop):"); - console.error( - " Docker Desktop for Windows → Settings → Docker Engine — edit the JSON to add:", - ); - console.error(' { "dns": [""] }'); - console.error(" Then click Apply & Restart."); - console.error(""); - console.error( - " 2. If you run docker natively inside WSL (not Docker Desktop), apply the Linux fix:", - ); - // Reuse the same bridge-IP detection the Linux branch uses — a - // native-docker-in-WSL install can have a custom bridge subnet - // just like any other Linux host, so a hardcoded 172.17.0.1 - // would break those users' copy-paste. 
- const wslBridgeIp = getDockerBridgeGatewayIp(); - let wslBridgeNote: string | null = null; - if (wslBridgeIp && wslBridgeIp !== "172.17.0.1") { - wslBridgeNote = ` (detected your docker bridge gateway at ${wslBridgeIp})`; - } else if (!wslBridgeIp) { - wslBridgeNote = - " (could not auto-detect bridge IP — the snippet below uses docker's default; verify with:\n" + - " docker network inspect bridge --format '{{range .IPAM.Config}}{{.Gateway}}{{end}}')"; - } - printLinuxFix(wslBridgeIp || "172.17.0.1", wslBridgeNote); - } else { - console.error(" Configure your docker daemon to use a DNS server that accepts UDP:53."); - console.error( - ' Add { "dns": [""] } to your docker daemon.json and restart the daemon.', - ); - console.error(" Ask your IT team for an internal DNS server IP."); - } - console.error(""); - console.error(" Verify the fix worked:"); - console.error(" docker run --rm busybox nslookup registry.npmjs.org"); - } - process.exit(1); - } + assertDockerBridgeAndContainerDnsHealthy(host); if (host.runtime !== "unknown") { console.log(` ✓ Container runtime: ${host.runtime}`); } - if (isLinuxDockerDriverGatewayEnabled() && host.runtime === "podman") { - console.error(" ✗ NemoClaw onboarding now uses OpenShell's Docker driver."); - console.error(" Podman is not supported for this NemoClaw integration path."); - console.error(" Switch to Docker Engine and rerun onboarding."); - process.exit(1); - } if (host.notes.includes("Running under WSL")) { console.log(" ⓘ Running under WSL"); } @@ -7072,6 +6920,14 @@ async function onboard(opts: OnboardOptions = {}): Promise { runPreflight: (preflightOptions) => preflight({ ...opts, ...preflightOptions }), assessHost, assertCdiNvidiaGpuSpecPresent, + // Resume backstops for #3508/#3630/Podman: the cached preflight + // step does not capture host Docker/DNS state, and a session + // written by an older NemoClaw may not have run the new bridge/ + // DNS fatal checks (mirrors the assertCdiNvidiaGpuSpecPresent + // resume 
pattern). Podman rejection runs first so users on + // unsupported runtimes don't see Docker-specific diagnostics. + rejectUnsupportedContainerRuntime, + assertDockerBridgeAndContainerDnsHealthy, resolveSandboxGpuConfig, validateSandboxGpuPreflight, skippedStepMessage, diff --git a/src/lib/onboard/bridge-dns-preflight.test.ts b/src/lib/onboard/bridge-dns-preflight.test.ts new file mode 100644 index 0000000000..40c8128079 --- /dev/null +++ b/src/lib/onboard/bridge-dns-preflight.test.ts @@ -0,0 +1,205 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { afterEach, describe, expect, it, vi } from "vitest"; + +import { + printContainerDnsRemediation, + printDockerBridgeContainerStartFailure, +} from "../../../dist/lib/onboard/bridge-dns-preflight"; +import { setOnboardBrandingAgent } from "../../../dist/lib/onboard/branding"; + +describe("printDockerBridgeContainerStartFailure", () => { + const savedInvokedAs = process.env.NEMOCLAW_INVOKED_AS; + const savedAgent = process.env.NEMOCLAW_AGENT; + afterEach(() => { + setOnboardBrandingAgent(null); + if (savedInvokedAs === undefined) { + delete process.env.NEMOCLAW_INVOKED_AS; + } else { + process.env.NEMOCLAW_INVOKED_AS = savedInvokedAs; + } + if (savedAgent === undefined) { + delete process.env.NEMOCLAW_AGENT; + } else { + process.env.NEMOCLAW_AGENT = savedAgent; + } + vi.restoreAllMocks(); + }); + + it("uses the active CLI branding in the verify-outside hint (#3630 CodeRabbit)", () => { + setOnboardBrandingAgent("hermes"); + process.env.NEMOCLAW_AGENT = "hermes"; + process.env.NEMOCLAW_INVOKED_AS = "nemohermes"; + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? 
"")); + }); + printDockerBridgeContainerStartFailure({ + ok: false, + reason: "veth_unsupported", + details: "docker: failed to add the host <=> sandbox veth pair interfaces", + timedOut: false, + exitCode: 125, + signal: null, + }); + errSpy.mockRestore(); + const verifyLine = messages.find((line) => line.startsWith(" Verify outside")); + expect(verifyLine).toBeDefined(); + expect(verifyLine).toContain("NemoHermes"); + expect(verifyLine).not.toContain("Verify outside NemoClaw:"); + }); + + it("renders Linux daemon.json remediation without the bare-echo clobber fallback (#3630 CodeRabbit)", () => { + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? "")); + }); + // `printContainerDnsRemediation` only reads a few host fields; cast + // through `unknown` so the test doesn't have to build a full + // HostAssessment fixture. + printContainerDnsRemediation({ + platform: "linux", + isWsl: false, + systemctlAvailable: true, + runtime: "docker", + } as unknown as Parameters[0]); + errSpy.mockRestore(); + const blob = messages.join("\n"); + // Must NOT clobber an existing daemon.json via a bare-echo fallback. + expect(blob).not.toMatch(/\|\|\s*echo '?\{"dns"/); + expect(blob).not.toMatch(/echo '\{"dns":/); + // Must direct users to install jq if missing, and create the config dir. + expect(blob).toContain("mkdir -p /etc/docker"); + expect(blob).toContain("jq"); + expect(blob).toMatch(/install jq|apt-get install/i); + // Must surface a manual-edit path so users without jq can still proceed. + expect(blob).toMatch(/edit \/etc\/docker\/daemon\.json manually/); + // Sanity: still uses `jq -n` to create new daemon.json when missing. 
+ expect(blob).toContain("jq -n"); + expect(blob).toMatch(/\{"dns":\["[^"]+"\]\}/); + }); + + it("renders WSL-without-systemd remediation without using systemctl steps (#3630 CodeRabbit)", () => { + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? "")); + }); + printContainerDnsRemediation({ + platform: "linux", + isWsl: true, + systemctlAvailable: false, + runtime: "docker", + } as unknown as Parameters[0]); + errSpy.mockRestore(); + const blob = messages.join("\n"); + expect(blob).toContain("Docker Desktop"); // step 1 still mentions Docker Desktop + // Step 2 path on non-systemd WSL must NOT print systemctl commands. + expect(blob).not.toContain("sudo systemctl restart systemd-resolved"); + expect(blob).not.toContain("sudo systemctl restart docker"); + expect(blob).toMatch(/service docker restart|stop the dockerd process/); + // Still uses the safe jq merge — no bare-echo clobber. + expect(blob).not.toMatch(/\|\|\s*echo '?\{"dns"/); + expect(blob).toContain("mkdir -p /etc/docker"); + expect(blob).toContain("jq -n"); + }); + + it("renders WSL-with-systemd remediation with the Linux systemd path (#3630 CodeRabbit)", () => { + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? "")); + }); + printContainerDnsRemediation({ + platform: "linux", + isWsl: true, + systemctlAvailable: true, + runtime: "docker", + } as unknown as Parameters[0]); + errSpy.mockRestore(); + const blob = messages.join("\n"); + expect(blob).toContain("sudo systemctl restart systemd-resolved"); + expect(blob).toContain("sudo systemctl restart docker"); + }); + + it("uses the pinned BusyBox digest in the manual verify-fix commands (#3630 CodeRabbit)", () => { + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? 
"")); + }); + printContainerDnsRemediation({ + platform: "linux", + isWsl: false, + systemctlAvailable: true, + runtime: "docker", + } as unknown as Parameters[0]); + errSpy.mockRestore(); + const blob = messages.join("\n"); + // The manual nslookup verification must use the pinned digest, not + // the floating `busybox:latest` tag. + expect(blob).toMatch(/docker run --rm busybox@sha256:[0-9a-f]{64} nslookup/); + expect(blob).not.toMatch(/docker run --rm busybox\s+nslookup/); + }); + + it("uses the pinned BusyBox digest in the verify-outside hint after a bridge failure (#3630 CodeRabbit)", () => { + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? "")); + }); + printDockerBridgeContainerStartFailure({ + ok: false, + reason: "veth_unsupported", + details: "operation not supported", + timedOut: false, + exitCode: 125, + signal: null, + }); + errSpy.mockRestore(); + const blob = messages.join("\n"); + expect(blob).toMatch(/docker run --rm --network bridge busybox@sha256:[0-9a-f]{64} true/); + expect(blob).not.toMatch(/busybox:latest true/); + }); + + it("renders macOS Docker Desktop daemon.json remediation without bare-echo clobber (#3630 CodeRabbit)", () => { + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? 
"")); + }); + printContainerDnsRemediation({ + platform: "darwin", + isWsl: false, + systemctlAvailable: false, + runtime: "docker-desktop", + } as unknown as Parameters[0]); + errSpy.mockRestore(); + const blob = messages.join("\n"); + expect(blob).not.toMatch(/\|\|\s*echo '?\{"dns"/); + expect(blob).not.toMatch(/echo '\{"dns":/); + expect(blob).toContain("mkdir -p ~/.docker"); + expect(blob).toMatch(/brew install jq|install jq/i); + expect(blob).toMatch(/edit ~\/\.docker\/daemon\.json manually/); + expect(blob).toContain("jq -n"); + }); + + it("uses cliName() in the docker_daemon_unreachable rerun hint", () => { + setOnboardBrandingAgent("hermes"); + process.env.NEMOCLAW_AGENT = "hermes"; + process.env.NEMOCLAW_INVOKED_AS = "nemohermes"; + const messages: string[] = []; + const errSpy = vi.spyOn(console, "error").mockImplementation((arg?: unknown) => { + messages.push(String(arg ?? "")); + }); + printDockerBridgeContainerStartFailure({ + ok: false, + reason: "docker_daemon_unreachable", + details: "Cannot connect to the Docker daemon", + timedOut: false, + exitCode: null, + signal: null, + }); + errSpy.mockRestore(); + const rerunLine = messages.find((line) => line.includes("re-run")); + expect(rerunLine).toBeDefined(); + expect(rerunLine).toContain("nemohermes onboard"); + expect(rerunLine).not.toMatch(/\bnemoclaw onboard\b/); + }); +}); diff --git a/src/lib/onboard/bridge-dns-preflight.ts b/src/lib/onboard/bridge-dns-preflight.ts new file mode 100644 index 0000000000..1f2f6845c3 --- /dev/null +++ b/src/lib/onboard/bridge-dns-preflight.ts @@ -0,0 +1,393 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Bridge + DNS preflight gate, extracted from `onboard.ts` so it can be + * reused as a `--resume` backstop without growing the top-level file + * past the `onboard-entrypoint-budget` CI ceiling. 
+ * + * - `assertDockerBridgeAndContainerDnsHealthy(host)` runs the bridge + * container start probe (#3508 Jetson veth) and the DNS-from-inside- + * container probe (#3630), and exits with platform-aware remediation + * on the fatal reasons described in `[[isFatalContainerDnsProbeFailure]]`. + */ + +import { cliDisplayName, cliName } from "./branding"; + +interface DaemonJsonDnsPatchOpts { + /** daemon.json path to merge into (e.g. /etc/docker/daemon.json). */ + daemonJsonPath: string; + /** Containing directory; printed `mkdir -p` ensures it exists. */ + configDir: string; + /** DNS value to add (concrete IP or a ``). */ + dnsValue: string; + /** Prepend `sudo` to the printed commands (Linux daemon.json). */ + sudo: boolean; + /** Suggested jq install command shown when jq is missing. */ + installJqHint: string; + /** Leading whitespace for the printed lines. */ + indent: string; +} + +/** + * Print a copy-pastable shell snippet that adds a `dns` key to the + * given daemon.json safely. The snippet: + * - creates the containing directory, + * - backs up the existing daemon.json, + * - requires `jq` (prints an install hint and aborts if missing — no + * bare-echo fallback that would clobber an existing daemon.json), + * - merges into an existing JSON object via `jq '. + {...}'`, + * - creates a new JSON object via `jq -n {...}` when daemon.json is + * absent, + * - refuses to write if the existing file is not parseable, asking + * the user to fix it manually first. + * + * The snippet is printed verbatim; nothing here executes it. + */ +function printDaemonJsonDnsPatch(opts: DaemonJsonDnsPatchOpts): void { + const { daemonJsonPath, configDir, dnsValue, sudo, installJqHint, indent } = opts; + const sudoPrefix = sudo ? 
"sudo " : ""; + const dnsJsonLiteral = `{"dns":["${dnsValue}"]}`; + console.error(`${indent}${sudoPrefix}mkdir -p ${configDir}`); + console.error( + `${indent}${sudoPrefix}cp ${daemonJsonPath} ${daemonJsonPath}.bak-$(date +%s) 2>/dev/null || true`, + ); + // One copy-pastable `sh -c` block so the user runs it as a single + // unit. Single-quoted shell body uses '"'"' to embed double quotes + // for jq while keeping the JS template literal readable. + const shBody = [ + `if ! command -v jq >/dev/null 2>&1; then`, + ` echo "jq is required to safely merge ${daemonJsonPath}. Install jq (${installJqHint}) and re-run," >&2;`, + ` echo "or edit ${daemonJsonPath} manually to add: ${dnsJsonLiteral}" >&2;`, + ` exit 1;`, + `fi;`, + `TMP=$(mktemp);`, + `if [ -f ${daemonJsonPath} ]; then`, + ` if ! jq '. + ${dnsJsonLiteral}' ${daemonJsonPath} > "$TMP" 2>/dev/null; then`, + ` echo "${daemonJsonPath} is not valid JSON; fix it manually first" >&2;`, + ` rm -f "$TMP";`, + ` exit 1;`, + ` fi;`, + `else`, + ` jq -n '${dnsJsonLiteral}' > "$TMP";`, + `fi;`, + `mv "$TMP" ${daemonJsonPath};`, + ].join(" "); + console.error(`${indent}${sudoPrefix}sh -c '${shBody.replace(/'/g, "'\"'\"'")}'`); +} +import { + BUSYBOX_PROBE_IMAGE, + type DockerBridgeContainerStartProbeResult, + getDockerBridgeGatewayIp, + type HostAssessment, + isFatalContainerDnsProbeFailure, + probeContainerDns, + probeDockerBridgeContainerStart, +} from "./preflight"; + +type Host = HostAssessment; + +export function printDockerBridgeContainerStartFailure( + result: DockerBridgeContainerStartProbeResult, +): void { + console.error(" ✗ Docker could not start a bridge-network test container."); + if (result.details) { + for (const line of String(result.details).split("\n").slice(-4)) { + if (line.trim()) console.error(` ${line.trim()}`); + } + } + console.error(""); + if (result.reason === "veth_unsupported") { + console.error( + " Docker reported that creating the container veth pair is not supported.", + ); + 
console.error( + " This matches the Jetson kernel/Docker bridge failure seen before long sandbox builds.", + ); + console.error( + ` Update the Jetson Linux kernel/Docker bridge networking support, or run ${cliDisplayName()} on`, + ); + console.error(" a host whose Docker bridge networking can create veth interfaces."); + } else if (result.reason === "timeout" || result.reason === "killed") { + console.error(" Docker did not complete a minimal bridge container start probe in time."); + console.error(" Restart Docker and check for stuck container/network operations before retrying."); + } else if (result.reason === "docker_daemon_unreachable") { + console.error(" The Docker CLI cannot reach the Docker daemon (dockerd is down or wedged)."); + console.error( + " Restart the Docker daemon (`sudo systemctl restart docker`, or restart Docker Desktop/Colima)", + ); + console.error(` and re-run \`${cliName()} onboard\`.`); + } else if (result.reason === "image_pull_failed") { + console.error(" Docker could not pull the busybox test image needed for the preflight probe."); + console.error(" Ensure the Docker daemon can reach its registry, then retry onboarding."); + } else { + console.error(" Docker returned an unexpected failure for a minimal bridge container."); + console.error(" Restart Docker and retry onboarding after verifying bridge networking."); + } + console.error(""); + console.error(` Verify outside ${cliDisplayName()}:`); + // Reuse the same pinned BusyBox digest as the automated probe so the + // command the user copies matches what NemoClaw actually runs. + console.error(` docker run --rm --network bridge ${BUSYBOX_PROBE_IMAGE} true`); +} + +/** + * Bridge + DNS preflight checks. Call from both `preflight()` and the + * `--resume` branch. The cached preflight step doesn't capture host + * Docker/DNS state, and the original attempt that wrote the cache may + * have aborted later at sandbox build with exactly the #3508/#3630 + * failure modes. 
Resuming without re-checking would walk into the same + * wall (mirroring the [[assertCdiNvidiaGpuSpecPresent]] resume backstop + * pattern at #3152). + */ +export function assertDockerBridgeAndContainerDnsHealthy(host: Host): void { + // A minimal bridge-backed container start catches Docker/kernel failures + // (notably Jetson veth "operation not supported") before longer gateway or + // sandbox build work starts. Only veth/timeout/killed/daemon-unreachable + // reasons are definitively a bridge problem; image_pull_failed (e.g. Hub + // rate limit, proxy outage, registry DNS — handled in the DNS probe below) + // and bare `error` (e.g. a daemon that disabled the default bridge but + // uses a managed one) stay inconclusive. + const bridgeStart = probeDockerBridgeContainerStart(); + if (bridgeStart.ok) { + console.log(" ✓ Docker can start bridge containers"); + } else if ( + bridgeStart.reason === "veth_unsupported" || + bridgeStart.reason === "timeout" || + bridgeStart.reason === "killed" || + bridgeStart.reason === "docker_daemon_unreachable" + ) { + printDockerBridgeContainerStartFailure(bridgeStart); + process.exit(1); + } else { + console.warn( + ` ⚠ Bridge container start probe inconclusive (reason: ${bridgeStart.reason ?? "unknown"}).`, + ); + if (bridgeStart.details) { + for (const line of String(bridgeStart.details).split("\n").slice(-3)) { + if (line.trim()) console.warn(` ${line.trim()}`); + } + } + console.warn(" Continuing to DNS probe for more specific diagnosis."); + } + + // DNS resolution from inside containers (#2101). A corp firewall that + // blocks outbound UDP:53 to public resolvers leaves the sandbox build + // unable to resolve registry.npmjs.org; npm then retries for ~15 min and + // prints the cryptic `Exit handler never called`. 
+ const dns = probeContainerDns(); + const dnsIsFatal = isFatalContainerDnsProbeFailure(dns); + + if (dns.ok) { + console.log(" ✓ Container DNS resolution works"); + return; + } + if (!dnsIsFatal) { + if (dns.reason === "image_pull_failed") { + console.warn( + " ⚠ Container DNS probe inconclusive: docker couldn't pull the busybox test image.", + ); + console.warn(" This usually means the docker daemon itself can't reach Docker Hub,"); + console.warn( + " but doesn't prove container DNS is broken — the sandbox build may still succeed.", + ); + } else { + console.warn(` ⚠ Container DNS probe inconclusive (reason: ${dns.reason ?? "unknown"}).`); + } + if (dns.details) { + for (const line of String(dns.details).split("\n").slice(-3)) { + if (line.trim()) console.warn(` ${line.trim()}`); + } + } + console.warn(" Proceeding. If the sandbox build later hangs at `npm ci`, see issue #2101."); + return; + } + + if (dns.reason === "veth_unsupported") { + printDockerBridgeContainerStartFailure({ + ok: false, + reason: "veth_unsupported", + details: dns.details, + timedOut: dns.timedOut, + exitCode: dns.exitCode, + signal: dns.signal, + }); + process.exit(1); + } + if (dns.reason === "docker_daemon_unreachable") { + printDockerBridgeContainerStartFailure({ + ok: false, + reason: "docker_daemon_unreachable", + details: dns.details, + timedOut: dns.timedOut, + exitCode: dns.exitCode, + signal: dns.signal, + }); + process.exit(1); + } + if (dns.reason === "timeout" || dns.reason === "killed") { + console.error(" ✗ Container DNS probe did not complete."); + } else if (dns.reason === "image_pull_failed") { + console.error(" ✗ Docker could not resolve or pull the DNS probe image."); + } else { + console.error(" ✗ DNS resolution from inside a docker container failed."); + } + if (dns.details) { + for (const line of String(dns.details).split("\n").slice(-4)) { + if (line.trim()) console.error(` ${line.trim()}`); + } + } + console.error(""); + printContainerDnsRemediation(host); + 
process.exit(1); +} + +export function printContainerDnsRemediation(host: Host): void { + console.error(" The sandbox build runs `npm ci` inside a container and needs to resolve"); + console.error(" registry.npmjs.org. On networks that block outbound UDP:53 to public DNS"); + console.error(" (common in corporate environments that force DNS-over-TLS on the host),"); + console.error(" the build appears to hang for ~15 minutes and then prints the cryptic"); + console.error(" `npm error Exit handler never called`. See issue #2101."); + console.error(""); + console.error(" Fix options:"); + console.error(""); + + // Platform-aware remediation hints. The systemd-resolved fix is + // Linux-specific; macOS / Windows / WSL-backed-by-Docker-Desktop + // hosts configure DNS through Docker Desktop's GUI or a + // platform-specific daemon.json path, so we avoid printing shell + // commands that would mislead those users. + const isLinuxWithSystemd = + host.platform === "linux" && !host.isWsl && host.systemctlAvailable; + + const printLinuxFix = (bridgeIp: string, note: string | null) => { + if (note) console.error(note); + console.error(" sudo mkdir -p /etc/systemd/resolved.conf.d/"); + console.error( + ` printf '[Resolve]\\nDNSStubListenerExtra=${bridgeIp}\\n' | sudo tee /etc/systemd/resolved.conf.d/docker-bridge.conf`, + ); + console.error(" sudo systemctl restart systemd-resolved"); + console.error(""); + console.error( + " Then merge the dns key into /etc/docker/daemon.json (jq required for safe merge; no bare-echo fallback so an existing file is not clobbered):", + ); + printDaemonJsonDnsPatch({ + daemonJsonPath: "/etc/docker/daemon.json", + configDir: "/etc/docker", + dnsValue: bridgeIp, + sudo: true, + installJqHint: "sudo apt-get install -y jq", + indent: " ", + }); + console.error(" sudo systemctl restart docker"); + }; + + if (isLinuxWithSystemd) { + const detectedBridgeIp = getDockerBridgeGatewayIp(); + const bridgeIp = detectedBridgeIp || "172.17.0.1"; + let 
bridgeNote: string | null = null; + if (detectedBridgeIp && detectedBridgeIp !== "172.17.0.1") { + bridgeNote = ` (detected your docker bridge gateway at ${detectedBridgeIp})`; + } else if (!detectedBridgeIp) { + bridgeNote = + " (could not auto-detect bridge IP; using docker's default — verify with:\n" + + " docker network inspect bridge --format '{{range .IPAM.Config}}{{.Gateway}}{{end}}')"; + } + console.error(" 1. Make systemd-resolved reachable from containers (recommended):"); + printLinuxFix(bridgeIp, bridgeNote); + console.error(""); + console.error(" 2. Configure an explicit UDP:53-capable DNS in /etc/docker/daemon.json"); + console.error(" (ask your IT team for an internal DNS server IP)."); + } else if (host.platform === "darwin") { + if (host.runtime === "colima") { + console.error(" Configure Colima's DNS (macOS):"); + console.error(" colima stop"); + console.error(" colima start --dns "); + console.error(" (or edit ~/.colima/default/colima.yaml and `colima restart`)"); + } else if (host.runtime === "docker-desktop" || host.runtime === "docker") { + console.error(" Configure Docker Desktop's DNS (macOS):"); + console.error(" Merge the dns key into ~/.docker/daemon.json (jq required for safe merge):"); + printDaemonJsonDnsPatch({ + daemonJsonPath: "~/.docker/daemon.json", + configDir: "~/.docker", + dnsValue: "", + sudo: false, + installJqHint: "brew install jq", + indent: " ", + }); + console.error(" osascript -e 'quit app \"Docker\"' && sleep 3 && open -a Docker"); + console.error( + " (or do the same via the Docker Desktop UI: Settings → Docker Engine)", + ); + } else { + console.error(" Configure your container runtime's DNS (macOS):"); + console.error(" - Docker Desktop (jq required for safe daemon.json merge):"); + printDaemonJsonDnsPatch({ + daemonJsonPath: "~/.docker/daemon.json", + configDir: "~/.docker", + dnsValue: "", + sudo: false, + installJqHint: "brew install jq", + indent: " ", + }); + console.error(" osascript -e 'quit app \"Docker\"' 
&& sleep 3 && open -a Docker"); + console.error(" - Colima:"); + console.error(" colima stop && colima start --dns "); + console.error(" - Rancher Desktop / Podman: edit the runtime's DNS config"); + console.error(" and restart it."); + } + console.error(" Ask your IT team for an internal DNS server IP that accepts UDP:53."); + } else if (host.platform === "win32" || host.isWsl) { + console.error(" 1. Configure Docker Desktop's DNS (Windows / WSL via Docker Desktop):"); + console.error( + " Docker Desktop for Windows → Settings → Docker Engine — edit the JSON to add:", + ); + console.error(' { "dns": [""] }'); + console.error(" Then click Apply & Restart."); + console.error(""); + console.error( + " 2. If you run docker natively inside WSL (not Docker Desktop), apply the Linux fix:", + ); + const wslBridgeIp = getDockerBridgeGatewayIp(); + let wslBridgeNote: string | null = null; + if (wslBridgeIp && wslBridgeIp !== "172.17.0.1") { + wslBridgeNote = ` (detected your docker bridge gateway at ${wslBridgeIp})`; + } else if (!wslBridgeIp) { + wslBridgeNote = + " (could not auto-detect bridge IP — the snippet below uses docker's default; verify with:\n" + + " docker network inspect bridge --format '{{range .IPAM.Config}}{{.Gateway}}{{end}}')"; + } + const wslIp = wslBridgeIp || "172.17.0.1"; + if (host.systemctlAvailable) { + // Native WSL with systemd enabled (`/etc/wsl.conf [boot] + // systemd=true`): the same systemd-resolved + docker daemon.json + // remediation works. + printLinuxFix(wslIp, wslBridgeNote); + } else { + // WSL without systemd — `systemctl` isn't available, so don't + // print steps that depend on it. Show the daemon.json safe-merge + // and a non-systemctl restart hint instead. 
+ if (wslBridgeNote) console.error(wslBridgeNote); + console.error(" Merge the dns key into /etc/docker/daemon.json (jq required for safe merge):"); + printDaemonJsonDnsPatch({ + daemonJsonPath: "/etc/docker/daemon.json", + configDir: "/etc/docker", + dnsValue: wslIp, + sudo: true, + installJqHint: "sudo apt-get install -y jq", + indent: " ", + }); + console.error(" Restart the Docker daemon however your WSL distro launches it"); + console.error(" (e.g. `sudo service docker restart`, or stop the dockerd process and rerun it)."); + } + } else { + console.error(" Configure your docker daemon to use a DNS server that accepts UDP:53."); + console.error( + ' Add { "dns": [""] } to your docker daemon.json and restart the daemon.', + ); + console.error(" Ask your IT team for an internal DNS server IP."); + } + console.error(""); + console.error(" Verify the fix worked:"); + console.error(` docker run --rm ${BUSYBOX_PROBE_IMAGE} nslookup registry.npmjs.org`); +} diff --git a/src/lib/onboard/gateway-sandbox-reachability.test.ts b/src/lib/onboard/gateway-sandbox-reachability.test.ts index 9599f93687..7043e01ce1 100644 --- a/src/lib/onboard/gateway-sandbox-reachability.test.ts +++ b/src/lib/onboard/gateway-sandbox-reachability.test.ts @@ -108,6 +108,207 @@ describe("isSandboxBridgeGatewayReachable", () => { expect(result.reason).toBe("probe_unavailable"); }); + it("flags veth operation-not-supported as a fatal bridge failure", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: 125, + stderr: + "docker: Error response from daemon: failed to add the host <=> sandbox veth pair interfaces: operation not supported.", + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("veth_unsupported"); + expect(result.detail).toContain("operation not supported"); + }); + + it("does not misclassify 
unrelated 'veth' or 'operation not supported' output as veth_unsupported (#3630 CodeRabbit)", async () => { + // Generic veth status lines, or `operation not supported` from + // other syscalls (mount, ioctl, etc.) must fall through to the + // existing inconclusive path, not be reported as fatal Jetson veth. + const vethMention = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: 1, + stderr: "veth1234: mtu 1500\n", + }), + }); + expect(vethMention.reason).not.toBe("veth_unsupported"); + + const genericOps = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: 1, + stderr: "mount: operation not supported on /sys/fs/cgroup\n", + }), + }); + expect(genericOps.reason).not.toBe("veth_unsupported"); + }); + + it("flags docker probe timeouts separately from inconclusive probe failures", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: null, + signal: "SIGTERM", + error: "spawnSync docker ETIMEDOUT", + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("probe_timeout"); + expect(result.detail).toContain("ETIMEDOUT"); + }); + + it("flags spawn-level timeouts via explicit timedOut flag (preferred runner channel)", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: null, + signal: "SIGTERM", + timedOut: true, + errorCode: "ETIMEDOUT", + error: "spawnSync docker ETIMEDOUT", + }), + }); + expect(result.ok).toBe(false); + 
expect(result.reason).toBe("probe_timeout"); + }); + + it("does not treat arbitrary signal-killed exits as spawn timeouts when timedOut is false", async () => { + // If the runner explicitly says timedOut=false and errorCode is not + // ETIMEDOUT, the probe must not be classified as probe_timeout. + // status: null routes through the `status !== 1` branch to the + // inconclusive probe_unavailable bucket — pin that explicitly so a + // future refactor can't silently promote it to a fatal reason. + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: null, + signal: "SIGTERM", + timedOut: false, + errorCode: "EPIPE", + error: "spawnSync docker EPIPE", + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("probe_unavailable"); + }); + + it("keeps tcp_failed for BusyBox nc connection-level 'Operation timed out' stderr (UFW remediation path)", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: 1, + stderr: "nc: host.openshell.internal (172.19.0.1:8080): Operation timed out", + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("tcp_failed"); + }); + + it("downgrades a slow-registry pre-pull timeout to probe_unavailable (not fatal probe_timeout) (#3630 codex review)", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ status: 0 }), + ensureImageCachedOverride: { + ok: false, + reason: "pull_timeout", + details: "docker pull timed out after 60s", + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("probe_unavailable"); + 
expect(result.detail).toContain("timed out"); + }); + + it("classifies docker-daemon-connect failures from the probe run as fatal docker_daemon_unreachable (#3630 CodeRabbit)", async () => { + // The image-cache pre-pull succeeded (or was bypassed), but the + // actual `docker run` probe failed with the daemon-down signature. + // This must surface as docker_daemon_unreachable (fatal), not slip + // into the warn-only probe_unavailable bucket. + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: 1, + stderr: + "Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?", + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("docker_daemon_unreachable"); + expect(result.detail).toContain("Cannot connect to the Docker daemon"); + }); + + it("classifies BusyBox 'bad address' name-resolution failures as probe_unavailable (not tcp_failed)", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: 1, + stderr: "nc: bad address 'host.openshell.internal'", + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("probe_unavailable"); + }); + + it("prefers docker_daemon_unreachable over name-resolution when stderr contains both signatures (precedence)", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ + status: 1, + stderr: + "Cannot connect to the Docker daemon at unix:///var/run/docker.sock.\n" + + "nc: bad address 'host.openshell.internal'", + }), + }); + expect(result.ok).toBe(false); + 
expect(result.reason).toBe("docker_daemon_unreachable"); + }); + + it("escalates inspect_unavailable to fatal docker_daemon_unreachable (#3630 codex review)", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ status: 0 }), + ensureImageCachedOverride: { + ok: false, + reason: "inspect_unavailable", + details: "Cannot connect to the Docker daemon at unix:///var/run/docker.sock", + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("docker_daemon_unreachable"); + expect(result.detail).toContain("Cannot connect to the Docker daemon"); + }); + + it("uses inspect-specific fallback detail when inspect_unavailable has no details (#3630 CodeRabbit)", async () => { + const result = await isSandboxBridgeGatewayReachable({ + inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), + usesHostGatewayRouteImpl: () => false, + runImpl: () => ({ status: 0 }), + ensureImageCachedOverride: { + ok: false, + reason: "inspect_unavailable", + // No `details` — exercise the fallback branch. 
+ }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("docker_daemon_unreachable"); + expect(result.detail).toContain("inspect"); + expect(result.detail).not.toContain("docker pull"); + }); + it("flags tcp_failed only after the OpenShell route was modeled", async () => { const result = await isSandboxBridgeGatewayReachable({ inspectNetworkImpl: () => ({ subnet: "172.19.0.0/16", gatewayIp: "172.19.0.1" }), @@ -156,6 +357,89 @@ describe("formatSandboxBridgeUnreachableMessage", () => { expect(msg).not.toContain("ufw allow"); }); + it("emits a fatal veth message without treating it as inconclusive", () => { + const msg = formatSandboxBridgeUnreachableMessage({ + ok: false, + reason: "veth_unsupported", + detail: + "docker: Error response from daemon: failed to add the host <=> sandbox veth pair interfaces: operation not supported.", + }); + expect(msg).toContain("could not create the sandbox bridge veth pair"); + expect(msg).toContain("operation not supported"); + expect(msg).not.toContain("continuing"); + }); + + it("emits a fatal timeout message without treating it as inconclusive", () => { + const msg = formatSandboxBridgeUnreachableMessage({ + ok: false, + reason: "probe_timeout", + detail: "spawnSync docker ETIMEDOUT", + }); + expect(msg).toContain("probe timed out"); + expect(msg).toContain("ETIMEDOUT"); + expect(msg).not.toContain("continuing"); + }); + + it("emits a fatal docker_daemon_unreachable message with daemon restart hint", () => { + const msg = formatSandboxBridgeUnreachableMessage({ + ok: false, + reason: "docker_daemon_unreachable", + detail: "Cannot connect to the Docker daemon at unix:///var/run/docker.sock", + }); + expect(msg).toContain("Docker daemon is not reachable"); + expect(msg).toContain("Cannot connect to the Docker daemon"); + expect(msg).toMatch(/Restart the Docker daemon|systemctl restart docker|Docker Desktop/); + expect(msg).not.toContain("continuing"); + }); + + it("uses cliDisplayName() and cliName() in fatal 
messages instead of hardcoded NemoClaw branding (#3630 CodeRabbit)", () => { + const savedAgent = process.env.NEMOCLAW_AGENT; + const savedInvoked = process.env.NEMOCLAW_INVOKED_AS; + process.env.NEMOCLAW_AGENT = "hermes"; + process.env.NEMOCLAW_INVOKED_AS = "nemohermes"; + try { + const veth = formatSandboxBridgeUnreachableMessage({ + ok: false, + reason: "veth_unsupported", + detail: "operation not supported", + }); + expect(veth).toContain("NemoHermes"); + expect(veth).not.toContain("run NemoClaw on"); + + const timeout = formatSandboxBridgeUnreachableMessage({ + ok: false, + reason: "probe_timeout", + detail: "spawnSync docker ETIMEDOUT", + }); + expect(timeout).toContain("`nemohermes onboard`"); + expect(timeout).not.toMatch(/`nemoclaw onboard`/); + + const daemon = formatSandboxBridgeUnreachableMessage({ + ok: false, + reason: "docker_daemon_unreachable", + detail: "Cannot connect to the Docker daemon", + }); + expect(daemon).toContain("`nemohermes onboard`"); + expect(daemon).not.toMatch(/`nemoclaw onboard`/); + + const tcp = formatSandboxBridgeUnreachableMessage({ + ok: false, + reason: "tcp_failed", + routeKind: "bridge_gateway", + networkName: "openshell-docker", + subnet: "172.19.0.0/16", + gatewayIp: "172.19.0.1", + }); + expect(tcp).toContain("`nemohermes onboard`"); + expect(tcp).not.toMatch(/`nemoclaw onboard`/); + } finally { + if (savedAgent === undefined) delete process.env.NEMOCLAW_AGENT; + else process.env.NEMOCLAW_AGENT = savedAgent; + if (savedInvoked === undefined) delete process.env.NEMOCLAW_INVOKED_AS; + else process.env.NEMOCLAW_INVOKED_AS = savedInvoked; + } + }); + it("does not emit a UFW command for host-gateway routing failures", () => { const msg = formatSandboxBridgeUnreachableMessage({ ok: false, diff --git a/src/lib/onboard/gateway-sandbox-reachability.ts b/src/lib/onboard/gateway-sandbox-reachability.ts index 01641da477..b780d78c0b 100644 --- a/src/lib/onboard/gateway-sandbox-reachability.ts +++ 
b/src/lib/onboard/gateway-sandbox-reachability.ts @@ -12,6 +12,8 @@ import { dockerCapture, dockerRun } from "../adapters/docker/run"; import { GATEWAY_PORT } from "../core/ports"; +import { cliDisplayName, cliName } from "./branding"; +import { ensureProbeImageCached, isDockerDaemonUnreachable } from "./preflight"; const DEFAULT_PROBE_IMAGE = "busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662"; @@ -21,7 +23,13 @@ const HOST_DOCKER_INTERNAL_NAME = "host.docker.internal"; const DEFAULT_PROBE_TIMEOUT_SEC = 5; const PROBE_RUN_OVERHEAD_MS = 10_000; -export type SandboxBridgeReachabilityReason = "ok" | "tcp_failed" | "probe_unavailable"; +export type SandboxBridgeReachabilityReason = + | "ok" + | "tcp_failed" + | "probe_unavailable" + | "probe_timeout" + | "veth_unsupported" + | "docker_daemon_unreachable"; export type SandboxBridgeRouteKind = "bridge_gateway" | "host_gateway"; export interface DockerBridgeNetworkInfo { @@ -43,6 +51,10 @@ interface SandboxBridgeProbeRunResult { status: number | null; signal?: NodeJS.Signals | null; error?: string; + /** Explicit timeout flag from the runner (e.g. spawnSync ETIMEDOUT). */ + timedOut?: boolean; + /** Explicit error code from the runner (e.g. "ETIMEDOUT", "ENOENT"). */ + errorCode?: string | null; stderr?: string | Buffer | null; stdout?: string | Buffer | null; } @@ -63,6 +75,8 @@ export interface SandboxBridgeReachabilityOptions { runImpl?: (args: readonly string[], timeoutMs: number) => SandboxBridgeProbeRunResult; inspectNetworkImpl?: (networkName: string) => DockerBridgeNetworkInfo | undefined; usesHostGatewayRouteImpl?: () => boolean; + /** Inject a precomputed image-cache result; bypasses real pre-pull. 
*/ + ensureImageCachedOverride?: import("./preflight").EnsureProbeImageCachedResult; } function parseDockerNetworkIpamConfig(raw: string): DockerBridgeNetworkInfo | undefined { @@ -113,10 +127,13 @@ function defaultRunImpl(args: readonly string[], timeoutMs: number): SandboxBrid ignoreError: true, suppressOutput: true, }); + const error = result.error as NodeJS.ErrnoException | undefined; return { status: result.status ?? null, signal: result.signal, - error: result.error?.message, + error: error?.message, + timedOut: error?.code === "ETIMEDOUT", + errorCode: error?.code ?? null, stderr: result.stderr, stdout: result.stdout, }; @@ -174,6 +191,29 @@ function isNameResolutionFailure(detail: string): boolean { ); } +function isProbeTimeout(result: SandboxBridgeProbeRunResult): boolean { + // Only spawn-level timeouts qualify here. BusyBox `nc` exits with + // status 1 and prints "Operation timed out" on connection-level + // timeouts (firewalled gateway port) — those must fall through to + // `tcp_failed` so the user gets the UFW/firewall remediation, not a + // Docker restart hint. We honor explicit timedOut/errorCode flags + // from the runner when present, and fall back to scanning the error + // message for the ETIMEDOUT signature. + if (result.timedOut === true) return true; + if (result.errorCode && /^ETIMEDOUT$/i.test(result.errorCode)) return true; + return /\bETIMEDOUT\b/i.test(result.error ?? ""); +} + +function isVethUnsupported(detail: string): boolean { + // Specific Jetson bridge-create signature only. Generic "veth" + // mentions or unrelated "operation not supported" errors must not be + // classified as veth_unsupported (which is fatal in onboarding) — + // require the veth-pair-create wording together with the OS error. 
+ return /failed to add the host .* sandbox veth pair interfaces: operation not supported|veth pair[^.]*?operation not supported/i.test( + detail, + ); +} + function buildProbeArgs( route: OpenShellDockerRoute, probeImage: string, @@ -222,6 +262,43 @@ export async function isSandboxBridgeGatewayReachable( }; } + // Pre-pull the pinned probe image so a slow-registry cold-cache pull + // does not get charged against the (much shorter) probe budget and + // misclassified as a fatal probe_timeout. Image-cache failures stay + // inconclusive (probe_unavailable), matching pre-#3630 semantics. + // + // Test seams that inject a probe runImpl bypass real Docker entirely; + // skip the pre-pull there unless the test supplies an explicit + // ensureImageCachedOverride. + if (opts.ensureImageCachedOverride !== undefined || opts.runImpl === undefined) { + const cached = opts.ensureImageCachedOverride ?? ensureProbeImageCached(probeImage); + if (!cached.ok) { + // A wedged docker daemon (inspect_unavailable) is a fatal Docker + // outage, not a probe/pull uncertainty — keep onboarding from + // proceeding into sandbox work that will hang. Pull failures + // (rate limit / slow registry) remain probe_unavailable. + const reason: SandboxBridgeReachabilityReason = + cached.reason === "inspect_unavailable" ? "docker_daemon_unreachable" : "probe_unavailable"; + // Use an inspect-specific fallback when the image-cache check + // never reached a pull (daemon down at `docker image inspect`), + // so the printed detail does not mislead users into chasing a + // registry/pull issue. + const fallbackDetail = + cached.reason === "inspect_unavailable" + ? "docker image inspect did not complete (daemon unreachable)" + : `docker pull ${probeImage} did not complete`; + return { + ok: false, + reason, + networkName, + subnet: route.subnet, + gatewayIp: route.gatewayIp, + routeKind: route.routeKind, + detail: cached.details ?? 
fallbackDetail, + }; + } + } + const result = runImpl( buildProbeArgs(route, probeImage, timeoutSec, port), timeoutSec * 1000 + PROBE_RUN_OVERHEAD_MS, @@ -238,6 +315,44 @@ export async function isSandboxBridgeGatewayReachable( } const detail = summarizeProbeResult(result); + if (isVethUnsupported(detail)) { + return { + ok: false, + reason: "veth_unsupported", + networkName, + subnet: route.subnet, + gatewayIp: route.gatewayIp, + routeKind: route.routeKind, + detail, + }; + } + if (isProbeTimeout(result)) { + return { + ok: false, + reason: "probe_timeout", + networkName, + subnet: route.subnet, + gatewayIp: route.gatewayIp, + routeKind: route.routeKind, + detail, + }; + } + // Daemon-connect failures from the docker CLI (e.g. "Cannot connect + // to the Docker daemon" after the image-cache check happened to + // succeed) must surface as fatal docker_daemon_unreachable, not the + // warn-only probe_unavailable, so onboarding stops here rather than + // proceeding into sandbox work that will fail later. + if (isDockerDaemonUnreachable(detail)) { + return { + ok: false, + reason: "docker_daemon_unreachable", + networkName, + subnet: route.subnet, + gatewayIp: route.gatewayIp, + routeKind: route.routeKind, + detail, + }; + } if (result.status !== 1 || isNameResolutionFailure(detail)) { return { ok: false, @@ -274,11 +389,37 @@ export function formatSandboxBridgeUnreachableMessage( ].filter((line): line is string => Boolean(line)).join("\n"); } + if (result.reason === "veth_unsupported") { + return [ + " ✗ Docker could not create the sandbox bridge veth pair.", + result.detail ? 
` ${result.detail}` : undefined, + " This matches Jetson kernel/Docker bridge environments where veth creation returns `operation not supported`.", + ` Update the host kernel/Docker bridge networking support, or run ${cliDisplayName()} on a host whose Docker bridge networking can create veth interfaces.`, + ].filter((line): line is string => Boolean(line)).join("\n"); + } + + if (result.reason === "probe_timeout") { + return [ + " ✗ Docker-driver sandbox bridge reachability probe timed out.", + result.detail ? ` ${result.detail}` : undefined, + ` Restart Docker and check for stuck container/network operations before retrying \`${cliName()} onboard\`.`, + ].filter((line): line is string => Boolean(line)).join("\n"); + } + + if (result.reason === "docker_daemon_unreachable") { + return [ + " ✗ Docker daemon is not reachable for the sandbox bridge probe.", + result.detail ? ` ${result.detail}` : undefined, + " Restart the Docker daemon (e.g. `sudo systemctl restart docker`, or restart Docker Desktop/Colima)", + ` and re-run \`${cliName()} onboard\`.`, + ].filter((line): line is string => Boolean(line)).join("\n"); + } + if (result.routeKind === "host_gateway") { return [ ` ✗ Sandbox containers cannot reach the gateway at ${HOST_INTERNAL_NAME}:${port}.`, " The probe used Docker's host-gateway route, matching Docker Desktop/VM-backed Docker.", - " Restart Docker and the OpenShell gateway, then re-run `nemoclaw onboard`.", + ` Restart Docker and the OpenShell gateway, then re-run \`${cliName()} onboard\`.`, ].join("\n"); } @@ -299,7 +440,7 @@ export function formatSandboxBridgeUnreachableMessage( " A host firewall may be blocking traffic from the OpenShell Docker bridge.", " To allow it:", allowCmd, - " Then re-run `nemoclaw onboard`.", + ` Then re-run \`${cliName()} onboard\`.`, ].join("\n"); } diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts index 80a648c204..17224c862f 100644 --- 
a/src/lib/onboard/machine/handlers/preflight.ts +++ b/src/lib/onboard/machine/handlers/preflight.ts @@ -47,6 +47,21 @@ export interface PreflightStateOptions< optedOutGpuPassthrough: boolean, hostGpuPlatform?: string | null, ): void; + /** + * Resume backstop for #3508/#3630. Runs the same bridge+DNS fatal + * gate that `preflight()` does, so a cached preflight step cannot + * skip the new fatal checks for hosts where Docker bridge networking + * or container DNS is broken. Optional for back-compat with callers + * that haven't been updated yet. + */ + assertDockerBridgeAndContainerDnsHealthy?(host: Host): void; + /** + * Resume backstop for unsupported container runtimes (e.g. Podman + * with the Linux Docker-driver gateway). Must run before the bridge/ + * DNS backstop above so Podman hosts see the unsupported-runtime + * message instead of Docker-specific diagnostics. + */ + rejectUnsupportedContainerRuntime?(host: Host): void; resolveSandboxGpuConfig( gpu: Gpu, options: { flag: PreflightSandboxGpuFlag; device: string | null | undefined }, @@ -121,11 +136,21 @@ export async function handlePreflightState< deps.validateSandboxGpuPreflight(resumeSandboxGpuConfig); const resumeOptedOutGpuPassthrough = noGpu || (!gpuRequested && session?.gpuPassthrough === false) || !resumeSandboxGpuConfig.sandboxGpuEnabled; + const resumeHost = deps.assessHost(); + // Reject unsupported runtimes (Podman) BEFORE the CDI GPU-spec + // backstop and the Docker-specific bridge/DNS probes so Podman + // hosts always hit the unsupported-runtime message (#3630 + // CodeRabbit). + deps.rejectUnsupportedContainerRuntime?.(resumeHost); deps.assertCdiNvidiaGpuSpecPresent( - deps.assessHost(), + resumeHost, resumeOptedOutGpuPassthrough, resumeSandboxGpuConfig.hostGpuPlatform, ); + // Resume backstop for #3508/#3630. Cached preflight does not capture + // host Docker/DNS state, and a session written by an older NemoClaw + // may have skipped the new bridge/DNS fatal checks. 
+ deps.assertDockerBridgeAndContainerDnsHealthy?.(resumeHost); } else { await deps.startRecordedStep("preflight"); gpu = await deps.runPreflight({ optedOutGpuPassthrough: noGpu }); diff --git a/src/lib/onboard/preflight.test.ts b/src/lib/onboard/preflight.test.ts index 7c87d8d381..8381a67ecc 100644 --- a/src/lib/onboard/preflight.test.ts +++ b/src/lib/onboard/preflight.test.ts @@ -22,7 +22,10 @@ import { parseDockerUsesContainerdSnapshotter, planHostRemediation, dnsProbeName, + ensureProbeImageCached, + isFatalContainerDnsProbeFailure, probeContainerDns, + probeDockerBridgeContainerStart, } from "../../../dist/lib/onboard/preflight"; function requireMemoryInfo(result: ReturnType) { @@ -1309,6 +1312,19 @@ describe("probeContainerDns", () => { }); expect(result.ok).toBe(false); expect(result.reason).toBe("resolution_failed"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(true); + }); + + it("downgrades unrelated docker output (no resolver evidence) from fatal resolution_failed to inconclusive error (#3630 CodeRabbit)", () => { + // No "Server:" header — nslookup never produced a resolver response. + // The output is some docker-side message unrelated to DNS, so we + // must not abort onboarding with the systemd-resolved remediation. 
+ const result = probeContainerDns({ + outputOverride: "docker: random unrelated diagnostic output that mentions nothing DNS related\n", + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("error"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(false); }); it("flags no_output when docker run returns empty", () => { @@ -1329,6 +1345,144 @@ describe("probeContainerDns", () => { expect(result.reason).toBe("no_output"); }); + it("flags timeout when the docker DNS probe is killed by the spawn timeout (#3630)", () => { + const result = probeContainerDns({ + executionOverride: { + stdout: "", + stderr: "", + exitCode: null, + signal: "SIGTERM", + timedOut: true, + error: "spawnSync sh ETIMEDOUT", + errorCode: "ETIMEDOUT", + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + expect(result.timedOut).toBe(true); + expect(result.details).toContain("timed out"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(true); + }); + + it("flags killed when the docker DNS probe exits from a signal without timing out", () => { + const result = probeContainerDns({ + executionOverride: { + stdout: "", + stderr: "", + exitCode: null, + signal: "SIGKILL", + timedOut: false, + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("killed"); + expect(result.signal).toBe("SIGKILL"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(true); + }); + + it("keeps generic no_output nonfatal when there is no timeout, signal, or nonzero exit metadata", () => { + const result = probeContainerDns({ + executionOverride: { + stdout: "", + stderr: "", + exitCode: 0, + signal: null, + timedOut: false, + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("no_output"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(false); + }); + + it("treats docker registry DNS failures during image pull as fatal", () => { + const result = probeContainerDns({ + outputOverride: + 'docker: Error response from 
daemon: Head "https://registry-1.docker.io/v2/library/busybox/manifests/latest": dial tcp: lookup registry-1.docker.io: no such host.\n', + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("image_pull_failed"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(true); + }); + + it("does not make authorization-only image pull failures fatal DNS failures", () => { + const result = probeContainerDns({ + outputOverride: + "docker: Error response from daemon: pull access denied for busybox, repository does not exist.\n", + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("image_pull_failed"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(false); + }); + + it("does not classify a successful cold-pull as image_pull_failed when followed by servers_unreachable nslookup", () => { + const coldPull = + "Unable to find image 'busybox:latest' locally\n" + + "latest: Pulling from library/busybox\n" + + "Status: Downloaded newer image for busybox:latest\n" + + "Server:\t\t10.0.0.1\n" + + "Address:\t10.0.0.1:53\n" + + ";; connection timed out; no servers could be reached\n"; + const result = probeContainerDns({ outputOverride: coldPull }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("servers_unreachable"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(true); + }); + + it("reports a slow registry pre-pull timeout as nonfatal image_pull_failed, not a fatal probe timeout", () => { + const result = probeContainerDns({ + ensureImageCachedOverride: { + ok: false, + reason: "pull_timeout", + details: "docker pull timed out after 60s", + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("image_pull_failed"); + expect(result.timedOut).toBe(true); + expect(isFatalContainerDnsProbeFailure(result)).toBe(false); + }); + + it("treats a pre-pull DNS-failure as fatal image_pull_failed via the registry-DNS signature", () => { + const result = probeContainerDns({ + ensureImageCachedOverride: { + ok: false, 
+ reason: "pull_failed", + details: + 'docker: Error response from daemon: Head "https://registry-1.docker.io/v2/library/busybox/manifests/latest": dial tcp: lookup registry-1.docker.io: no such host.', + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("image_pull_failed"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(true); + }); + + it("classifies a wedged Docker daemon (inspect_unavailable) as fatal docker_daemon_unreachable (#3630 codex review)", () => { + const result = probeContainerDns({ + ensureImageCachedOverride: { + ok: false, + reason: "inspect_unavailable", + details: "docker image inspect did not complete", + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("docker_daemon_unreachable"); + expect(isFatalContainerDnsProbeFailure(result)).toBe(true); + }); + + it("does not treat a registry TCP timeout (i/o timeout on :443) as a fatal DNS failure (#3630 codex review)", () => { + // dial tcp :443 errors are TCP connectivity, NOT DNS — must not + // be routed to UDP:53/systemd-resolved remediation. + const result = probeContainerDns({ + outputOverride: + 'docker: Error response from daemon: Head "https://registry-1.docker.io/v2/library/busybox/manifests/latest": dial tcp 3.94.224.37:443: i/o timeout.\n', + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("image_pull_failed"); + // Inconclusive — not a DNS resolution failure. + expect(isFatalContainerDnsProbeFailure(result)).toBe(false); + }); + it("captures the spawned command for runCapture override", () => { const captured: string[][] = []; const result = probeContainerDns({ @@ -1344,13 +1498,47 @@ describe("probeContainerDns", () => { expect(captured[0].slice(0, 2)).toEqual(["sh", "-c"]); const script = captured[0][2]; expect(script).toContain("docker run --rm"); - expect(script).toContain("busybox:latest"); + // Image must be pinned to an immutable digest so nslookup output + // parsing cannot drift (#3630 CodeRabbit). 
+ expect(script).toMatch(/busybox@sha256:[0-9a-f]{64}/); // Probe queries a random `.invalid` subdomain (#3630), not a real // domain — cache-bypass guarantee. Stable prefix is asserted instead. expect(script).toMatch(/nslookup nemoclaw-dns-probe-[0-9a-f]+\.invalid /); expect(script).toContain("2>&1"); }); + it("skips real-docker pre-pull when runCaptureImpl or runProbeImpl is injected (hermetic test isolation)", () => { + // If pre-pull leaks through to real Docker, on a clean CI worker the + // probe would short-circuit with image_pull_failed before reaching + // the injected runner. Assert that the injected runner is actually + // called and that the probe's success/failure tracks it. + let runCaptureCalled = false; + const r1 = probeContainerDns({ + runCaptureImpl: () => { + runCaptureCalled = true; + return BUSYBOX_SUCCESS; + }, + }); + expect(runCaptureCalled).toBe(true); + expect(r1.ok).toBe(true); + + let runProbeCalled = false; + const r2 = probeContainerDns({ + runProbeImpl: () => { + runProbeCalled = true; + return { + stdout: BUSYBOX_SUCCESS, + stderr: "", + exitCode: 0, + signal: null, + timedOut: false, + }; + }, + }); + expect(runProbeCalled).toBe(true); + expect(r2.ok).toBe(true); + }); + it("allows the command to be overridden", () => { let seen: readonly string[] = []; probeContainerDns({ @@ -1389,7 +1577,31 @@ describe("probeContainerDns", () => { expect(seenScript).toContain("nslookup pinned-test.invalid"); }); - it("treats thrown runCapture errors as error reason", () => { + it("rejects shell metacharacters in probeName to prevent sh -c injection (#3630 CodeRabbit)", () => { + const injections = [ + "x; touch /tmp/pwned", + "x && touch /tmp/pwned", + "x`whoami`", + "x$(whoami)", + "x|whoami", + "x\nwhoami", + "x \"; rm -rf /\"", + ]; + for (const probeName of injections) { + expect(() => probeContainerDns({ probeName })).toThrow(/probeName must be a plain DNS name/); + } + }); + + it("accepts plain DNS labels (RFC 1035 chars only) as 
probeName", () => { + expect(() => + probeContainerDns({ + probeName: "nemoclaw-dns-probe-abc123.invalid", + runCaptureImpl: () => "Server:\t1.1.1.1\nAddress:\t1.1.1.1:53\n** server can't find x: NXDOMAIN\n", + }), + ).not.toThrow(); + }); + + it("treats thrown runCapture errors as error reason", () => { const result = probeContainerDns({ runCaptureImpl: () => { throw new Error("docker daemon unreachable"); @@ -1398,6 +1610,10 @@ describe("probeContainerDns", () => { expect(result.ok).toBe(false); expect(result.reason).toBe("error"); expect(result.details).toContain("docker daemon unreachable"); + // Generic `error` is inconclusive — the probe never proved DNS is + // broken, so we must not abort onboarding. Daemon-specific outages + // route through docker_daemon_unreachable instead. + expect(isFatalContainerDnsProbeFailure(result)).toBe(false); }); it("truncates long failure details to the last 400 bytes", () => { @@ -1441,6 +1657,297 @@ describe("probeContainerDns", () => { }); }); +describe("probeDockerBridgeContainerStart", () => { + it("passes when a bridge container exits successfully with no output", () => { + const result = probeDockerBridgeContainerStart({ + executionOverride: { + stdout: "", + stderr: "", + exitCode: 0, + signal: null, + timedOut: false, + }, + }); + expect(result).toEqual({ ok: true, exitCode: 0, signal: null, timedOut: false }); + }); + + it("flags Jetson-style veth operation-not-supported failures (#3508)", () => { + const result = probeDockerBridgeContainerStart({ + executionOverride: { + stdout: "", + stderr: + "docker: Error response from daemon: failed to add the host <=> sandbox veth pair interfaces: operation not supported.\n", + exitCode: 125, + signal: null, + timedOut: false, + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("veth_unsupported"); + expect(result.details).toContain("operation not supported"); + expect(result.exitCode).toBe(125); + }); + + it("does not misclassify unrelated 'veth' mentions 
as fatal veth_unsupported (#3630 CodeRabbit)", () => { + // Output references "veth" in passing — without the bridge-create + // signature, it must stay on the generic-error path, not the fatal + // Jetson remediation path. + const result = probeDockerBridgeContainerStart({ + executionOverride: { + stdout: "", + stderr: "Created veth veth1234@if4: \n", + exitCode: 1, + signal: null, + timedOut: false, + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).not.toBe("veth_unsupported"); + }); + + it("does not misclassify generic 'operation not supported' errors as veth_unsupported (#3630 CodeRabbit)", () => { + // Generic OS-level "operation not supported" (e.g., from a cgroup + // mount or unrelated syscall) must not be promoted to fatal veth. + const result = probeDockerBridgeContainerStart({ + executionOverride: { + stdout: "", + stderr: "docker: Error: mount: operation not supported.\n", + exitCode: 1, + signal: null, + timedOut: false, + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).not.toBe("veth_unsupported"); + }); + + it("flags bridge container kill-by-signal (no timeout) as reason 'killed' (#3630 CodeRabbit)", () => { + const result = probeDockerBridgeContainerStart({ + executionOverride: { + stdout: "", + stderr: "", + exitCode: null, + signal: "SIGKILL", + timedOut: false, + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("killed"); + expect(result.signal).toBe("SIGKILL"); + expect(result.timedOut).toBe(false); + }); + + it("flags bridge container start timeouts with execution metadata", () => { + const result = probeDockerBridgeContainerStart({ + executionOverride: { + stdout: "", + stderr: "", + exitCode: null, + signal: "SIGTERM", + timedOut: true, + error: "spawnSync docker ETIMEDOUT", + errorCode: "ETIMEDOUT", + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + expect(result.timedOut).toBe(true); + expect(result.details).toContain("timed out"); + }); + + 
it("runs a minimal docker bridge command with a spawn timeout", () => { + let captured: readonly string[] = []; + let seenOpts: { timeout?: number } | undefined; + const result = probeDockerBridgeContainerStart({ + runProbeImpl: (command, opts) => { + captured = command; + seenOpts = opts; + return { stdout: "", stderr: "", exitCode: 0, signal: null, timedOut: false }; + }, + }); + expect(result.ok).toBe(true); + expect(captured.slice(0, 6)).toEqual([ + "docker", + "run", + "--rm", + "--pull=missing", + "--network", + "bridge", + ]); + // Image pinned to an immutable digest (#3630 CodeRabbit). + expect(captured[6]).toMatch(/^busybox@sha256:[0-9a-f]{64}$/); + expect(captured[7]).toBe("true"); + expect(seenOpts?.timeout).toBe(20_000); + }); + + it("reports image_pull_failed (not bridge timeout) when the busybox pre-pull times out (#3630 codex review)", () => { + const result = probeDockerBridgeContainerStart({ + ensureImageCachedOverride: { + ok: false, + reason: "pull_timeout", + details: "docker pull timed out after 60s", + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("image_pull_failed"); + expect(result.timedOut).toBe(true); + expect(result.details).toContain("timed out"); + }); + + it("skips real-docker pre-pull when runProbeImpl is injected (hermetic test isolation)", () => { + let probeCalled = false; + const result = probeDockerBridgeContainerStart({ + runProbeImpl: (_command) => { + probeCalled = true; + return { stdout: "", stderr: "", exitCode: 0, signal: null, timedOut: false }; + }, + }); + expect(probeCalled).toBe(true); + expect(result.ok).toBe(true); + }); + + it("reports a wedged Docker daemon (inspect_unavailable) as fatal docker_daemon_unreachable (#3630 codex review)", () => { + const result = probeDockerBridgeContainerStart({ + ensureImageCachedOverride: { + ok: false, + reason: "inspect_unavailable", + details: "docker image inspect did not complete", + }, + }); + expect(result.ok).toBe(false); + 
expect(result.reason).toBe("docker_daemon_unreachable"); + expect(result.details).toContain("inspect"); + }); +}); + +describe("ensureProbeImageCached", () => { + it("returns ok when docker image inspect exits 0", () => { + const result = ensureProbeImageCached("busybox:latest", { + inspectProbeImpl: () => ({ + stdout: "[]", + stderr: "", + exitCode: 0, + signal: null, + timedOut: false, + }), + pullProbeImpl: () => { + throw new Error("pull should not run when inspect succeeds"); + }, + }); + expect(result.ok).toBe(true); + expect(result.alreadyCached).toBe(true); + }); + + it("classifies an inspect spawn timeout (ETIMEDOUT) as inspect_unavailable without falling through to pull (#3630 CodeRabbit)", () => { + const result = ensureProbeImageCached("busybox:latest", { + inspectProbeImpl: () => ({ + stdout: "", + stderr: "", + exitCode: null, + signal: "SIGTERM", + timedOut: true, + error: "spawnSync docker ETIMEDOUT", + errorCode: "ETIMEDOUT", + }), + pullProbeImpl: () => { + throw new Error("pull should not run when inspect times out"); + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("inspect_unavailable"); + }); + + it("classifies 'Cannot connect to the Docker daemon' inspect stderr as inspect_unavailable (#3630 codex review)", () => { + const result = ensureProbeImageCached("busybox:latest", { + inspectProbeImpl: () => ({ + stdout: "", + stderr: + "Cannot connect to the Docker daemon at unix:///var/run/docker.sock. 
Is the docker daemon running?", + exitCode: 1, + signal: null, + timedOut: false, + }), + pullProbeImpl: () => { + throw new Error("pull should not run when daemon is unreachable"); + }, + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("inspect_unavailable"); + expect(result.details).toContain("Cannot connect to the Docker daemon"); + }); + + it("falls back to docker pull when inspect exits 1 without daemon-down signature", () => { + let pullCalled = false; + const result = ensureProbeImageCached("busybox:latest", { + inspectProbeImpl: () => ({ + stdout: "", + stderr: "Error: No such image: busybox:latest", + exitCode: 1, + signal: null, + timedOut: false, + }), + pullProbeImpl: () => { + pullCalled = true; + return { + stdout: "Status: Downloaded newer image for busybox:latest", + stderr: "", + exitCode: 0, + signal: null, + timedOut: false, + }; + }, + }); + expect(pullCalled).toBe(true); + expect(result.ok).toBe(true); + expect(result.alreadyCached).toBe(false); + }); + + it("classifies a pull-time daemon outage as inspect_unavailable (not pull_failed)", () => { + const result = ensureProbeImageCached("busybox:latest", { + inspectProbeImpl: () => ({ + stdout: "", + stderr: "Error: No such image", + exitCode: 1, + signal: null, + timedOut: false, + }), + pullProbeImpl: () => ({ + stdout: "", + stderr: + "Cannot connect to the Docker daemon at unix:///var/run/docker.sock. 
Is the docker daemon running?", + exitCode: 1, + signal: null, + timedOut: false, + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("inspect_unavailable"); + }); + + it("classifies a pull timeout as pull_timeout (inconclusive, not docker outage)", () => { + const result = ensureProbeImageCached("busybox:latest", { + inspectProbeImpl: () => ({ + stdout: "", + stderr: "No such image", + exitCode: 1, + signal: null, + timedOut: false, + }), + pullProbeImpl: () => ({ + stdout: "", + stderr: "", + exitCode: null, + signal: "SIGTERM", + timedOut: true, + error: "spawnSync docker ETIMEDOUT", + errorCode: "ETIMEDOUT", + }), + }); + expect(result.ok).toBe(false); + expect(result.reason).toBe("pull_timeout"); + }); +}); + describe("getDockerBridgeGatewayIp", () => { it("returns the parsed IPv4 address from docker network inspect", () => { const result = getDockerBridgeGatewayIp(() => "172.17.0.1\n"); diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index bff95ed83c..9380f812b7 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -18,14 +18,16 @@ import path from "node:path"; import { DASHBOARD_PORT } from "../core/ports"; // runner.ts still uses CommonJS-style exports — use require here. -const { runCapture } = require("../runner"); +const { run, runCapture } = require("../runner"); type RunCaptureFn = typeof import("../runner").runCapture; +type RunFn = typeof import("../runner").run; type RunCaptureOpts = Parameters[1]; type NullableRunCaptureFn = ( command: Parameters[0], options?: RunCaptureOpts, ) => string | null; +type ProbeRunOpts = { timeout?: number }; // ── Types ──────────────────────────────────────────────────────── @@ -1209,15 +1211,37 @@ export function ensureSwap(minTotalMB?: number, opts: EnsureSwapOpts = {}): Swap // prints the cryptic `Exit handler never called`. This probe catches that // state in a few seconds so the user gets a targeted error up front. 
+type ProbeFailureReason = + | "no_output" + | "timeout" + | "killed" + | "resolution_failed" + | "servers_unreachable" + | "image_pull_failed" + | "veth_unsupported" + | "docker_daemon_unreachable" + | "error"; + +export interface ProbeExecutionResult { + stdout?: string | Buffer | null; + stderr?: string | Buffer | null; + exitCode?: number | null; + status?: number | null; + signal?: NodeJS.Signals | string | null; + timedOut?: boolean; + error?: string | Error | null; + errorCode?: string | null; +} + +type RunProbeFn = (command: readonly string[], options?: ProbeRunOpts) => ProbeExecutionResult; + export interface DnsProbeResult { ok: boolean; - reason?: - | "no_output" - | "resolution_failed" - | "servers_unreachable" - | "image_pull_failed" - | "error"; + reason?: ProbeFailureReason; details?: string; + timedOut?: boolean; + exitCode?: number | null; + signal?: string | null; } export interface ProbeContainerDnsOpts { @@ -1225,10 +1249,45 @@ export interface ProbeContainerDnsOpts { command?: readonly string[]; /** Inject captured output (bypasses execution). */ outputOverride?: string | null; + /** Inject structured execution metadata (bypasses execution). */ + executionOverride?: ProbeExecutionResult; /** Override runCapture. */ runCaptureImpl?: NullableRunCaptureFn; + /** Override structured probe execution. */ + runProbeImpl?: RunProbeFn; /** Override the probe name (test seam; pinned name for stable assertions). */ probeName?: string; + /** Inject a precomputed image-cache result; skips the pre-pull. 
*/ + ensureImageCachedOverride?: EnsureProbeImageCachedResult; +} + +export interface DockerBridgeContainerStartProbeResult { + ok: boolean; + reason?: Extract< + ProbeFailureReason, + | "no_output" + | "timeout" + | "killed" + | "image_pull_failed" + | "veth_unsupported" + | "docker_daemon_unreachable" + | "error" + >; + details?: string; + timedOut?: boolean; + exitCode?: number | null; + signal?: string | null; +} + +export interface ProbeDockerBridgeContainerStartOpts { + /** Override the docker run command. */ + command?: readonly string[]; + /** Inject structured execution metadata (bypasses execution). */ + executionOverride?: ProbeExecutionResult; + /** Override structured probe execution. */ + runProbeImpl?: RunProbeFn; + /** Inject a precomputed image-cache result; skips the pre-pull. */ + ensureImageCachedOverride?: EnsureProbeImageCachedResult; } /** @@ -1238,6 +1297,300 @@ export interface ProbeContainerDnsOpts { * letting a wedged docker daemon stall preflight forever. */ const PROBE_TIMEOUT_MS = 20_000; +// Pinned to an immutable digest so the BusyBox `nslookup` output shape +// the parser below depends on cannot drift over time. Mirrors the same +// digest used by the sandbox-bridge gateway probe so both probes pull +// the exact same blob and share its Docker image cache. +export const BUSYBOX_PROBE_IMAGE = + "busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662"; + +/** + * Longer ceiling for image pulls. Decoupled from PROBE_TIMEOUT_MS so a + * cold-cache pull on a slow registry does not get charged against the + * shorter probe budget and falsely classified as a fatal probe timeout. + */ +const PROBE_IMAGE_PULL_TIMEOUT_MS = 60_000; + +export interface EnsureProbeImageCachedResult { + ok: boolean; + alreadyCached?: boolean; + reason?: "pull_failed" | "pull_timeout" | "inspect_unavailable"; + details?: string; +} + +export interface EnsureProbeImageCachedOpts { + /** Override the docker image-inspect probe (test seam). 
*/ + inspectProbeImpl?: RunProbeFn; + /** Override the docker pull probe (test seam). */ + pullProbeImpl?: RunProbeFn; + /** Pull-time budget (ms). Defaults to PROBE_IMAGE_PULL_TIMEOUT_MS. */ + pullTimeoutMs?: number; +} + +/** + * Make sure `image` is in the local docker image cache before a timed + * probe runs. Returns `{ ok: true, alreadyCached }` when the image was + * already present or was pulled successfully; otherwise returns a + * structured reason describing why the pull could not be completed. + * + * Decoupling pull from probe lets callers report a slow/blocked registry + * pull as an inconclusive image_pull_failed (not as a fatal probe + * timeout / Docker-restart hint). + */ +export function ensureProbeImageCached( + image: string, + opts: EnsureProbeImageCachedOpts = {}, +): EnsureProbeImageCachedResult { + const inspectImpl = opts.inspectProbeImpl ?? defaultRunProbe; + const pullImpl = opts.pullProbeImpl ?? defaultRunProbe; + const pullTimeoutMs = opts.pullTimeoutMs ?? PROBE_IMAGE_PULL_TIMEOUT_MS; + + const inspect = normalizeProbeExecution( + inspectImpl(["docker", "image", "inspect", image], { timeout: 10_000 }), + ); + if (inspect.exitCode === 0) { + return { ok: true, alreadyCached: true }; + } + // Inspect couldn't run (docker missing/down). Don't mask the underlying + // docker outage as an image-pull issue. The CLI can also exit 1 with a + // "Cannot connect to the Docker daemon" stderr when dockerd is down, + // so we sniff that signature in addition to spawn-level errors. 
+ const inspectOutput = probeCombinedOutput(inspect); + if ( + (inspect.exitCode === null && (inspect.error || inspect.timedOut)) || + isDockerDaemonUnreachable(inspectOutput) + ) { + return { + ok: false, + reason: "inspect_unavailable", + details: + (inspectOutput.trim() && outputTail(inspectOutput)) || + inspect.error || + "docker image inspect did not complete", + }; + } + + const pull = normalizeProbeExecution( + pullImpl(["docker", "pull", image], { timeout: pullTimeoutMs }), + ); + const combined = probeCombinedOutput(pull); + if (pull.exitCode === 0) { + return { ok: true, alreadyCached: false }; + } + if (pull.timedOut || (pull.signal && pull.exitCode === null)) { + return { + ok: false, + reason: "pull_timeout", + details: probeExecutionDetails("docker pull", pull, pullTimeoutMs, combined), + }; + } + // A pull that fails with the daemon-unreachable signature is a docker + // outage, not a registry/cache problem. Promote it so callers can treat + // it as a fatal probe error instead of an inconclusive image_pull. + if (isDockerDaemonUnreachable(combined)) { + return { + ok: false, + reason: "inspect_unavailable", + details: outputTail(combined), + }; + } + return { + ok: false, + reason: "pull_failed", + details: combined.trim() ? outputTail(combined) : (pull.error ?? 
"docker pull failed"), + }; +} + +export function isDockerDaemonUnreachable(output: string): boolean { + return /Cannot connect to the Docker daemon|Is the docker daemon running\??|docker daemon is not running|error during connect.*Get .*docker.*open .*dial unix/i.test( + output, + ); +} + +function probeText(value: unknown): string { + if (value == null) return ""; + if (Buffer.isBuffer(value)) return value.toString("utf-8"); + return String(value); +} + +function normalizeError(value: unknown): string | null { + if (!value) return null; + if (value instanceof Error) return value.message; + return String(value); +} + +function normalizeProbeExecution(result: ProbeExecutionResult): Required< + Pick +> & { + error: string | null; + errorCode: string | null; +} { + const error = normalizeError(result.error); + const errorCode = + result.errorCode ?? + (typeof result.error === "object" && result.error && "code" in result.error + ? String((result.error as NodeJS.ErrnoException).code) + : null); + return { + stdout: probeText(result.stdout), + stderr: probeText(result.stderr), + exitCode: + typeof result.exitCode === "number" || result.exitCode === null + ? result.exitCode + : typeof result.status === "number" || result.status === null + ? result.status + : null, + signal: result.signal ? String(result.signal) : null, + timedOut: + result.timedOut === true || + errorCode === "ETIMEDOUT" || + (error ? /ETIMEDOUT|timed out/i.test(error) : false), + error, + errorCode, + }; +} + +function defaultRunProbe(command: readonly string[], options?: ProbeRunOpts): ProbeExecutionResult { + const result = (run as RunFn)(command, { + ignoreError: true, + suppressOutput: true, + timeout: options?.timeout, + encoding: "utf-8", + }); + const error = result.error as NodeJS.ErrnoException | undefined; + return { + stdout: result.stdout, + stderr: result.stderr, + exitCode: result.status ?? null, + signal: result.signal ?? 
null, + timedOut: error?.code === "ETIMEDOUT", + error: error?.message ?? null, + errorCode: error?.code ?? null, + }; +} + +function outputOverrideExecution(output: string | null): ProbeExecutionResult { + return { + stdout: output ?? "", + stderr: "", + exitCode: 0, + signal: null, + timedOut: false, + }; +} + +function captureProbeExecution( + command: readonly string[], + timeoutMs: number, + opts: { + outputOverride?: string | null; + executionOverride?: ProbeExecutionResult; + runCaptureImpl?: NullableRunCaptureFn; + runProbeImpl?: RunProbeFn; + }, +): ReturnType { + if (opts.executionOverride) { + return normalizeProbeExecution(opts.executionOverride); + } + if (opts.outputOverride !== undefined) { + return normalizeProbeExecution(outputOverrideExecution(opts.outputOverride)); + } + if (opts.runProbeImpl) { + return normalizeProbeExecution(opts.runProbeImpl(command, { timeout: timeoutMs })); + } + if (opts.runCaptureImpl) { + return normalizeProbeExecution( + outputOverrideExecution( + opts.runCaptureImpl(command, { + ignoreError: true, + timeout: timeoutMs, + }), + ), + ); + } + return normalizeProbeExecution(defaultRunProbe(command, { timeout: timeoutMs })); +} + +function probeCombinedOutput(execution: ReturnType): string { + return [execution.stdout, execution.stderr].filter((part) => String(part || "").trim()).join("\n"); +} + +function outputTail(output: string, maxLength = 400): string { + return output.trim().slice(-maxLength); +} + +function probeExecutionDetails( + label: string, + execution: ReturnType, + timeoutMs: number, + output: string, +): string { + const details = [ + execution.timedOut ? `${label} timed out after ${Math.ceil(timeoutMs / 1000)}s` : null, + execution.signal ? `${label} was killed by signal ${execution.signal}` : null, + execution.exitCode !== null && execution.exitCode !== 0 + ? `${label} exited with status ${execution.exitCode}` + : null, + execution.error, + output.trim() ? 
outputTail(output) : null, + ].filter((line): line is string => Boolean(line)); + return details.length > 0 ? details.join("\n") : `${label} produced no output`; +} + +function executionFailureReason( + label: string, + execution: ReturnType, + timeoutMs: number, + output: string, +): Pick | null { + if (execution.timedOut) { + return { + reason: "timeout", + details: probeExecutionDetails(label, execution, timeoutMs, output), + timedOut: true, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } + if (execution.signal) { + return { + reason: "killed", + details: probeExecutionDetails(label, execution, timeoutMs, output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } + return null; +} + +function isImagePullFailure(output: string): boolean { + // Note: "Unable to find image" is the normal cold-pull banner Docker + // prints before a successful pull, so it is not a failure signature. + return /Error response from daemon:.*(pull|manifest|not found)|pull access denied|manifest.*unknown|unauthorized: authentication required|Head.*https?:\/\/.*: dial/i.test( + output, + ); +} + +function isRegistryResolutionFailure(output: string): boolean { + // DNS-resolution signatures only. A "dial tcp ip:port: i/o timeout" is + // a TCP-connectivity failure, not a DNS failure, and must not be + // routed to the UDP:53/systemd-resolved remediation path. + return /lookup .*: no such host|temporary failure in name resolution|could not resolve|getaddrinfo|server misbehaving|dial tcp: lookup|no such host/i.test( + output, + ); +} + +function isVethUnsupported(output: string): boolean { + // The Jetson signature is specifically "failed to add the host <…> + // sandbox veth pair interfaces: operation not supported". Generic + // "veth" mentions or unrelated "operation not supported" errors must + // NOT be classified as veth_unsupported (which is fatal), so require + // the veth-pair-create wording together with the OS error. 
+ return /failed to add the host .* sandbox veth pair interfaces: operation not supported|veth pair[^.]*?operation not supported/i.test( + output, + ); +} /** * Random subdomain of the RFC 6761 reserved .invalid TLD. Every compliant @@ -1311,57 +1664,126 @@ export function getDockerBridgeGatewayIp( * The typical #2101 signature on corp-firewalled hosts. * - `resolution_failed` — resolver answered but lookup failed (NXDOMAIN * or similar). Unusual. - * - `no_output` / `error` — probe couldn't run at all. + * - `timeout` / `killed` / `error` — probe couldn't complete. + * - `no_output` — probe exited cleanly but produced no parseable output. */ export function probeContainerDns(opts: ProbeContainerDnsOpts = {}): DnsProbeResult { - // Cap the whole probe via Node's spawn-level timeout (works on every - // platform Node supports — no dependency on a host-side `timeout` - // binary). Child process is killed, runCapture returns "" under - // ignoreError, and we fall through to the `no_output` branch. - // // We funnel through `sh -c` so we can `2>&1` the docker pull progress // and busybox nslookup diagnostics into stdout — both write the // signatures the parser below depends on (`Error response from daemon`, - // `no servers could be reached`) to stderr. Every token in the script - // is a fixed constant, so no shell injection surface. + // `no servers could be reached`) to stderr. probeName is the only + // non-constant token interpolated into the shell script: validate it + // as a plain DNS name (RFC 1035 label chars) so a crafted override + // cannot inject arbitrary shell tokens. const probeName = opts.probeName ?? dnsProbeName(); + if (!/^[a-z0-9]([a-z0-9.-]{0,253})$/i.test(probeName)) { + throw new Error( + `probeName must be a plain DNS name (RFC 1035 label characters), got: ${JSON.stringify(probeName)}`, + ); + } const command = opts.command ?? 
[ "sh", "-c", - `docker run --rm --pull=missing busybox:latest nslookup ${probeName} 2>&1`, + `docker run --rm --pull=missing ${BUSYBOX_PROBE_IMAGE} nslookup ${probeName} 2>&1`, ]; - let output: string | null | undefined = opts.outputOverride; - if (output === undefined) { - try { - const runCaptureImpl = - opts.runCaptureImpl ?? - ((cmd: readonly string[], o?: RunCaptureOpts) => - runCapture(cmd, { - ignoreError: o?.ignoreError ?? false, - timeout: o?.timeout, - })); - output = runCaptureImpl(command, { - ignoreError: true, - timeout: PROBE_TIMEOUT_MS, - }); - } catch (e) { + // Pre-pull the busybox image so the timed probe below measures only + // probe time, not registry pull time. A cold-cache pull that times out + // here surfaces as an inconclusive image_pull_failed (registry-DNS + // signature still routes through isRegistryResolutionFailure), not as + // a fatal probe timeout with a misleading "restart Docker" hint. + // + // Any test seam that injects probe execution (output/execution/command + // overrides or runCapture/runProbe replacements) implies the caller is + // staying off the real Docker CLI — skip pre-pull so hermetic tests on + // hosts without Docker/busybox keep working. + const bypassRealDocker = + opts.executionOverride !== undefined || + opts.outputOverride !== undefined || + opts.command !== undefined || + opts.runCaptureImpl !== undefined || + opts.runProbeImpl !== undefined; + if (!bypassRealDocker || opts.ensureImageCachedOverride !== undefined) { + const cached = opts.ensureImageCachedOverride ?? ensureProbeImageCached(BUSYBOX_PROBE_IMAGE); + if (!cached.ok) { + // inspect_unavailable means the docker daemon itself is wedged + // (assessHost said it was reachable, but image-inspect now hangs + // or returns "Cannot connect to the Docker daemon"). Treat that as + // a fatal docker_daemon_unreachable — distinct from generic + // probe `error` reasons that callers may want to keep inconclusive. 
+ if (cached.reason === "inspect_unavailable") { + return { + ok: false, + reason: "docker_daemon_unreachable", + details: cached.details ?? "docker image inspect did not complete", + }; + } return { ok: false, - reason: "error", - details: String((e as Error)?.message ?? e), + reason: "image_pull_failed", + details: cached.details ?? `docker pull ${BUSYBOX_PROBE_IMAGE} did not complete`, + timedOut: cached.reason === "pull_timeout", }; } } + let execution: ReturnType; + try { + execution = captureProbeExecution(command, PROBE_TIMEOUT_MS, opts); + } catch (e) { + return { + ok: false, + reason: "error", + details: String((e as Error)?.message ?? e), + }; + } + + const output = probeCombinedOutput(execution); + const executionFailure = executionFailureReason( + "docker DNS probe", + execution, + PROBE_TIMEOUT_MS, + output, + ); + if (executionFailure) { + return { + ok: false, + ...executionFailure, + }; + } + // Treat whitespace-only output (e.g., bare newlines left by a killed // child) the same as empty — otherwise the subsequent regex checks all // miss and we'd mis-report it as `resolution_failed`. 
- if (!output || !output.trim()) { + if (!output.trim()) { + if (execution.exitCode !== null && execution.exitCode !== 0) { + return { + ok: false, + reason: "error", + details: probeExecutionDetails("docker DNS probe", execution, PROBE_TIMEOUT_MS, output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } return { ok: false, reason: "no_output", - details: "docker run produced no output (timed out or failed to start)", + details: "docker DNS probe produced no output", + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } + + if (isVethUnsupported(output)) { + return { + ok: false, + reason: "veth_unsupported", + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, }; } @@ -1407,15 +1829,14 @@ export function probeContainerDns(opts: ProbeContainerDnsOpts = {}): DnsProbeRes // Docker image-pull failure — the probe never got to run nslookup, so // framing this as a DNS problem would mislead. Signatures from // `docker run --pull=missing` when the daemon can't fetch the image. - if ( - /Error response from daemon:.*(pull|manifest|not found)|pull access denied|manifest.*unknown|unauthorized: authentication required|Head.*https?:\/\/.*: dial/i.test( - output, - ) - ) { + if (isImagePullFailure(output)) { return { ok: false, reason: "image_pull_failed", - details: output.slice(-400), + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, }; } @@ -1425,14 +1846,182 @@ export function probeContainerDns(opts: ProbeContainerDnsOpts = {}): DnsProbeRes return { ok: false, reason: "servers_unreachable", - details: output.slice(-400), + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } + + // Resolver responded but couldn't answer. 
Only report resolution_failed + // (fatal) when we actually saw the resolver-identification block from + // nslookup — otherwise the probe never proved DNS is broken (e.g. + // unrelated docker daemon output where nslookup never ran), so fall + // through to inconclusive `error` so onboarding does not falsely abort. + if (hasResolverHeader) { + return { + ok: false, + reason: "resolution_failed", + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } + return { + ok: false, + reason: "error", + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; +} + +export function isFatalContainerDnsProbeFailure(result: DnsProbeResult): boolean { + if (result.ok) return false; + if ( + result.reason === "servers_unreachable" || + result.reason === "resolution_failed" || + result.reason === "timeout" || + result.reason === "killed" || + result.reason === "veth_unsupported" || + result.reason === "docker_daemon_unreachable" + ) { + return true; + } + // Generic `error` (runner/transport failures, unexpected output) stays + // inconclusive — the probe never established that container DNS is + // broken, so aborting onboarding would be wrong. Daemon outages route + // through `docker_daemon_unreachable` above; pull failures through the + // image_pull_failed branch below. + return result.reason === "image_pull_failed" && isRegistryResolutionFailure(result.details ?? ""); +} + +export function probeDockerBridgeContainerStart( + opts: ProbeDockerBridgeContainerStartOpts = {}, +): DockerBridgeContainerStartProbeResult { + const command = opts.command ?? [ + "docker", + "run", + "--rm", + "--pull=missing", + "--network", + "bridge", + BUSYBOX_PROBE_IMAGE, + "true", + ]; + + // Pre-pull so a slow-registry cold-cache pull does not get charged + // against the bridge probe budget and falsely reported as a Jetson/ + // bridge timeout (see issue #3630 codex review). 
Test seams that + // bypass real Docker (executionOverride/command/runProbeImpl) skip the + // pre-pull so hermetic tests on hosts without Docker keep working. + const bypassRealDocker = + opts.executionOverride !== undefined || + opts.command !== undefined || + opts.runProbeImpl !== undefined; + if (!bypassRealDocker || opts.ensureImageCachedOverride !== undefined) { + const cached = opts.ensureImageCachedOverride ?? ensureProbeImageCached(BUSYBOX_PROBE_IMAGE); + if (!cached.ok) { + // inspect_unavailable means docker daemon is wedged — emit the + // distinct docker_daemon_unreachable reason so onboard preflight + // can fail fast while still leaving generic bridge probe `error` + // reasons (e.g. a daemon with no default bridge network) on the + // inconclusive path. + if (cached.reason === "inspect_unavailable") { + return { + ok: false, + reason: "docker_daemon_unreachable", + details: cached.details ?? "docker image inspect did not complete", + }; + } + return { + ok: false, + reason: "image_pull_failed", + details: cached.details ?? `docker pull ${BUSYBOX_PROBE_IMAGE} did not complete`, + timedOut: cached.reason === "pull_timeout", + }; + } + } + + let execution: ReturnType; + try { + execution = captureProbeExecution(command, PROBE_TIMEOUT_MS, opts); + } catch (e) { + return { + ok: false, + reason: "error", + details: String((e as Error)?.message ?? 
e), + }; + } + + const output = probeCombinedOutput(execution); + const executionFailure = executionFailureReason( + "docker bridge container start probe", + execution, + PROBE_TIMEOUT_MS, + output, + ); + if (executionFailure) { + return { + ok: false, + reason: executionFailure.reason as DockerBridgeContainerStartProbeResult["reason"], + details: executionFailure.details, + timedOut: executionFailure.timedOut, + exitCode: executionFailure.exitCode, + signal: executionFailure.signal, + }; + } + + if (execution.exitCode === 0) { + return { ok: true, exitCode: 0, signal: null, timedOut: false }; + } + + if (isVethUnsupported(output)) { + return { + ok: false, + reason: "veth_unsupported", + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } + + if (isImagePullFailure(output)) { + return { + ok: false, + reason: "image_pull_failed", + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, + }; + } + + if (!output.trim()) { + return { + ok: false, + reason: execution.exitCode === null ? "no_output" : "error", + details: probeExecutionDetails( + "docker bridge container start probe", + execution, + PROBE_TIMEOUT_MS, + output, + ), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, }; } - // Something else — resolver responded but couldn't answer. return { ok: false, - reason: "resolution_failed", - details: output.slice(-400), + reason: "error", + details: outputTail(output), + timedOut: false, + exitCode: execution.exitCode, + signal: execution.signal, }; }