diff --git a/docs/reference/troubleshooting.mdx b/docs/reference/troubleshooting.mdx index 6d23f1f476..8d57d3f8cd 100644 --- a/docs/reference/troubleshooting.mdx +++ b/docs/reference/troubleshooting.mdx @@ -1254,6 +1254,11 @@ Fix the NVIDIA Container Toolkit or CDI configuration reported in the diagnostic If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`. Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting. +If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal. +The reconnect wait debounces Error-phase polls and fast-fails only after several consecutive ones; the default is five consecutive polls, about 10 seconds in total. +Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example on slow WSL2 + Docker Desktop setups. +The variable counts polls, not seconds: set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`. + ### `pip install` fails with a system-packages error Recent Ubuntu releases (including DGX Spark's Ubuntu 24.04) mark the system Python install as externally managed, so `pip install` without a virtual environment fails. 
diff --git a/skills/nemoclaw-user-reference/references/troubleshooting.md b/skills/nemoclaw-user-reference/references/troubleshooting.md index bcee680bfe..7ee345ab8f 100644 --- a/skills/nemoclaw-user-reference/references/troubleshooting.md +++ b/skills/nemoclaw-user-reference/references/troubleshooting.md @@ -1244,6 +1244,11 @@ Fix the NVIDIA Container Toolkit or CDI configuration reported in the diagnostic If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`. Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting. +If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal. +The reconnect wait debounces Error-phase polls and fast-fails only after several consecutive ones; the default is five consecutive polls, about 10 seconds in total. +Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example on slow WSL2 + Docker Desktop setups. +The variable counts polls, not seconds: set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`. + ### `pip install` fails with a system-packages error Recent Ubuntu releases (including DGX Spark's Ubuntu 24.04) mark the system Python install as externally managed, so `pip install` without a virtual environment fails. 
diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts index cf560e7e4f..88223f119f 100644 --- a/src/lib/onboard/docker-gpu-patch.test.ts +++ b/src/lib/onboard/docker-gpu-patch.test.ts @@ -837,7 +837,10 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { it("short-circuits the supervisor-reconnect wait when the sandbox enters Error phase", () => { // Without the short-circuit, a patched container that crashes on startup // leaves users waiting the full 900s+ supervisor-reconnect timeout before - // any Error-phase diagnostics run (#4316). + // any Error-phase diagnostics run. With the debounce now in place, this + // test asserts the K=1 (no-debounce) behavior explicitly so the original + // fast-fail intent is preserved when the operator opts out of the + // debounce. const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); const listOutputs = [ "alpha Provisioning 1s ago", @@ -853,10 +856,11 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { runOpenshell, runCaptureOpenshell, sleep, + errorPhaseDebouncePolls: 1, }); expect(ok).toBe(false); - // Without short-circuit we'd loop ~300 iterations. With it, the second + // Without short-circuit we'd loop ~300 iterations. With K=1 the second // iteration's list output shows Error and the wait bails out. 
expect(runOpenshell).toHaveBeenCalledTimes(2); expect(sleep).toHaveBeenCalledTimes(1); diff --git a/src/lib/onboard/docker-gpu-patch.ts b/src/lib/onboard/docker-gpu-patch.ts index 5e9dd16a13..d46705ebc5 100644 --- a/src/lib/onboard/docker-gpu-patch.ts +++ b/src/lib/onboard/docker-gpu-patch.ts @@ -14,7 +14,22 @@ import { dockerRunDetached, dockerStop, } from "../adapters/docker"; -import { envInt } from "./env"; +import { + type DockerGpuSupervisorReconnectDeps, + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV, + DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, + getDockerGpuSupervisorReconnectErrorDebouncePolls, + getDockerGpuSupervisorReconnectTimeoutSecs, + waitForOpenShellSupervisorReconnect, +} from "./docker-gpu-supervisor-reconnect"; +export { + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV, + DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, + getDockerGpuSupervisorReconnectErrorDebouncePolls, + getDockerGpuSupervisorReconnectTimeoutSecs, + waitForOpenShellSupervisorReconnect, +}; +export type { DockerGpuSupervisorReconnectDeps }; export const OPENSHELL_MANAGED_BY_LABEL = "openshell.ai/managed-by"; export const OPENSHELL_MANAGED_BY_VALUE = "openshell"; @@ -23,9 +38,6 @@ const OPENSHELL_SANDBOX_COMMAND_ENV = "OPENSHELL_SANDBOX_COMMAND"; const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000; const DOCKER_GPU_PATCH_WAIT_SECS = 180; -const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900; -export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV = - "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT"; export const DOCKER_GPU_PATCH_NETWORK_ENV = "NEMOCLAW_DOCKER_GPU_PATCH_NETWORK"; const MAX_DOCKER_CONTAINER_NAME_LENGTH = 253; const GPU_ENV_KEYS = new Set([ @@ -70,6 +82,11 @@ export type DockerGpuPatchDeps = { readDir?: (dirPath: string) => string[] | null; /** Injectable file reader for unit testing CDI spec content checks. */ readFile?: (filePath: string) => string | null; + /** + * Forwarded to the supervisor-reconnect wait. 
See + * `DockerGpuSupervisorReconnectDeps.errorPhaseDebouncePolls`. + */ + errorPhaseDebouncePolls?: number; }; export type DockerGpuPatchModeKind = "gpus" | "nvidia-runtime" | "cdi"; @@ -833,72 +850,6 @@ function waitForNewContainerId( return null; } -function sandboxListShowsErrorPhase( - sandboxName: string, - runCaptureOpenshell: NonNullable, -): boolean { - try { - const list = runCaptureOpenshell(["sandbox", "list"], { - ignoreError: true, - suppressOutput: true, - timeout: DOCKER_GPU_PATCH_TIMEOUT_MS, - }); - return SANDBOX_FAILURE_PHASE_TOKENS.has( - parseSandboxPhaseFromListOutput(list, sandboxName) ?? "", - ); - } catch { - return false; - } -} - -function waitForOpenShellSandboxExec( - sandboxName: string, - timeoutSecs: number, - deps: DockerGpuPatchDeps, -): boolean { - if (!deps.runOpenshell) return true; - const d = depsWithDefaults(deps); - const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000; - while (Date.now() <= deadline) { - const result = deps.runOpenshell( - ["sandbox", "exec", "-n", sandboxName, "--", "true"], - { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS }, - ); - if (isZeroStatus(result)) return true; - // Short-circuit the supervisor-reconnect wait when the sandbox enters a - // terminal failure phase. Without this, a patched container that exits - // on startup leaves the user staring at the supervisor-reconnect - // timeout (default 900s) before any Error-phase diagnostics run (#4316). - if ( - deps.runCaptureOpenshell && - sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell) - ) { - return false; - } - d.sleep(2); - } - return false; -} - -export const waitForOpenShellSupervisorReconnect = waitForOpenShellSandboxExec; - -export function getDockerGpuSupervisorReconnectTimeoutSecs( - sandboxReadyTimeoutSecs: number, - env: Record = process.env, -): number { - const readyTimeoutSecs = Number.isFinite(sandboxReadyTimeoutSecs) - ? 
Math.max(1, Math.round(sandboxReadyTimeoutSecs)) - : 1; - const fallback = Math.max( - readyTimeoutSecs, - DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS, - ); - return Math.max( - 1, - envInt(DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, fallback, env), - ); -} - function decoratePatchError( error: T, context: DockerGpuPatchFailureContext, @@ -1017,7 +968,7 @@ export function recreateOpenShellDockerSandboxWithGpu( }); if (options.waitForSupervisor !== false) { - const execReady = waitForOpenShellSandboxExec( + const execReady = waitForOpenShellSupervisorReconnect( options.sandboxName, options.timeoutSecs ?? DOCKER_GPU_PATCH_WAIT_SECS, deps, diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts new file mode 100644 index 0000000000..62976067f0 --- /dev/null +++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts @@ -0,0 +1,159 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { + getDockerGpuSupervisorReconnectErrorDebouncePolls, + waitForOpenShellSupervisorReconnect, +} from "../../../dist/lib/onboard/docker-gpu-supervisor-reconnect"; + +// The Docker GPU patch supervisor-reconnect wait must absorb a transient +// Error phase reported while OpenShell's sandbox-list cache catches up to +// the newly-recreated GPU container. The old-container teardown briefly +// marks the row Error before the host re-registers the new container. +// Without debouncing, the fast-fail short-circuits within ~12s on a healthy +// GPU sandbox whose container is running and whose supervisor has already +// logged `LIFECYCLE:INSTALL OpenShell Sandbox Supervisor success`. 
+describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => { + it("absorbs a transient Error phase shorter than the debounce window", () => { + const execOutputs = [ + { status: 1, stderr: "sandbox not ready" }, + { status: 1, stderr: "sandbox not ready" }, + { status: 1, stderr: "sandbox not ready" }, + { status: 0, stdout: "" }, + ]; + let execIdx = 0; + const runOpenshell = vi.fn( + () => execOutputs[Math.min(execIdx++, execOutputs.length - 1)], + ); + const listOutputs = [ + "alpha Error 1s ago", + "alpha Error 3s ago", + "alpha Provisioning 5s ago", + "alpha Ready 7s ago", + ]; + let listIdx = 0; + const runCaptureOpenshell = vi.fn( + () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], + ); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 5, + }); + + expect(ok).toBe(true); + expect(runOpenshell).toHaveBeenCalledTimes(4); + }); + + it("still fast-fails when Error phase persists for the full debounce window", () => { + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 3, + }); + + expect(ok).toBe(false); + // Three consecutive Error polls trigger the short-circuit on poll 3. + // Sleeps happen only between polls 1->2 and 2->3, so two sleeps total. + expect(runOpenshell).toHaveBeenCalledTimes(3); + expect(sleep).toHaveBeenCalledTimes(2); + }); + + it("resets the consecutive-Error counter when the phase recovers", () => { + // Error, Error, Provisioning (counter resets), Error, Error, Error + // -> bails out on the 3rd post-recovery Error, not earlier. 
+ const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const listOutputs = [ + "alpha Error 1s ago", + "alpha Error 3s ago", + "alpha Provisioning 5s ago", + "alpha Error 7s ago", + "alpha Error 9s ago", + "alpha Error 11s ago", + ]; + let listIdx = 0; + const runCaptureOpenshell = vi.fn( + () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], + ); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 3, + }); + + expect(ok).toBe(false); + expect(runOpenshell).toHaveBeenCalledTimes(6); + }); + + it("defaults the debounce to 5 polls and honors the env override", () => { + expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5); + expect( + getDockerGpuSupervisorReconnectErrorDebouncePolls({ + NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2", + }), + ).toBe(2); + // Non-positive values are clamped to a minimum of 1. + expect( + getDockerGpuSupervisorReconnectErrorDebouncePolls({ + NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "0", + }), + ).toBe(1); + }); + + it("clamps an injected debounce override to the same minimum as the env path", () => { + // 0 / negative / fractional overrides must not bypass the ≥1 contract that + // the env-backed helper enforces. + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 0, + }); + + expect(ok).toBe(false); + // Clamped to K=1: first Error poll short-circuits with no preceding sleep. 
+ expect(runOpenshell).toHaveBeenCalledTimes(1); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("falls back to the env-backed default when an injected override is non-finite", () => { + // NaN / +Infinity / -Infinity overrides must not silently neutralise the + // fast-fail loop. A NaN comparison would always be false and `Infinity` + // would never satisfy `>= debouncePolls`, leaving the wait to burn the + // full timeout window. + for (const bogus of [Number.NaN, Number.POSITIVE_INFINITY, Number.NEGATIVE_INFINITY]) { + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: bogus, + }); + + expect(ok).toBe(false); + // Default K=5 from the env-backed helper: 5 polls + 4 sleeps before fast-fail. + expect(runOpenshell).toHaveBeenCalledTimes(5); + expect(sleep).toHaveBeenCalledTimes(4); + } + }); +}); diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts new file mode 100644 index 0000000000..c8906e9501 --- /dev/null +++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts @@ -0,0 +1,158 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Supervisor-reconnect wait for the Docker GPU patch path. + * + * Source-of-truth boundary + * ------------------------ + * The transient Error phase this module debounces is observed in the + * `openshell sandbox list` cache while the OpenShell host re-registers the + * newly-recreated GPU container after `docker stop` + `docker run`. 
The + * preferred fix lives at the OpenShell gateway: `sandbox list` should not + * report a terminal phase for a sandbox whose Docker container is being + * recreated by the GPU patch path. Until that upstream change ships, + * NemoClaw tolerates the transient Error at this layer via a + * consecutive-poll debounce. + * + * Removal condition + * ----------------- + * Delete this debounce once OpenShell guarantees `sandbox list` skips the + * brief Error transition during a known recreate. A real-Docker GPU E2E + * reproduction (e.g. `e2e-branch-validation:gpu`, + * `gpu-repo-local-ollama-openclaw`) showing a transient teardown-Error that + * recovers to Ready is the runtime evidence required. + */ + +import { envInt } from "./env"; + +const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000; +const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900; +// Default consecutive Error-phase polls required before fast-fail. With a +// 2-second poll interval this is ~10s of sustained Error, which absorbs the +// transient Error reported during container recreation while still bailing +// fast on a patched container that crashed on startup. 
const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5;

/** Name of the env var that overrides the supervisor-reconnect timeout (seconds). */
export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV =
  "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT";
/** Name of the env var that overrides the consecutive Error-phase poll debounce. */
export const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV =
  "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE";

/** Phase tokens in a `sandbox list` row that count as a failure phase. */
const TERMINAL_SANDBOX_FAILURE_PHASES = new Set(["Error", "Failed", "CrashLoopBackOff"]);

type DockerRunResult = {
  status?: number | null;
  stdout?: string | Buffer | null;
  stderr?: string | Buffer | null;
};

// NOTE(review): the type arguments of `Record` below are reconstructed; confirm
// they match the signatures used by the callers' `runOpenshell` helpers.
type RunOpenshellFn = (args: string[], opts?: Record<string, unknown>) => DockerRunResult;
type RunCaptureOpenshellFn = (args: string[], opts?: Record<string, unknown>) => string;

/** Injectable dependencies of the supervisor-reconnect wait. */
export type DockerGpuSupervisorReconnectDeps = {
  /** Runs an openshell command; when absent the wait reports success immediately. */
  runOpenshell?: RunOpenshellFn;
  /** Runs an openshell command and returns its output; when absent no Error-phase check runs. */
  runCaptureOpenshell?: RunCaptureOpenshellFn;
  /** Blocking sleep in seconds; defaults to an `Atomics.wait`-based sleep. */
  sleep?: (seconds: number) => void;
  /**
   * Consecutive failure-phase polls required before fast-failing. Truncated
   * and clamped to at least 1; `null`/`undefined`/non-finite values fall back
   * to the env-backed default.
   */
  errorPhaseDebouncePolls?: number;
};

/** Blocks the current thread for roughly `seconds` seconds. */
function defaultSleep(seconds: number): void {
  // Atomics.wait treats a NaN/Infinity timeout as "wait forever"; on a buffer
  // nobody notifies that would hang the process, so non-finite input means no wait.
  const waitMs = Number.isFinite(seconds) ? Math.max(0, seconds) * 1000 : 0;
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, waitMs);
}

/**
 * Returns true when a command result represents success.
 *
 * A missing result or an `undefined` status still counts as success (kept for
 * minimal stubs). An explicit `null` status does not.
 */
function isZeroStatus(result: DockerRunResult | null | undefined): boolean {
  const status = result?.status;
  // NOTE(review): assumes a spawnSync-shaped result, where `status: null`
  // means the child was terminated by a signal (for example the `timeout`
  // kill) - confirm against the real `runOpenshell`. Counting that as exit
  // code 0 would report a reconnect that never happened.
  if (status === null) return false;
  return Number(status ?? 0) === 0;
}

const ANSI_RE = /\x1b\[[0-9;]*m/g;

/**
 * Finds the first row of `sandbox list` output whose first column equals
 * `sandboxName` and returns the failure-phase token on that row, or `null`
 * when there is no such row or it carries no failure token.
 */
function parseSandboxListFailurePhase(output: string, sandboxName: string): string | null {
  if (typeof output !== "string" || !output.includes(sandboxName)) return null;
  for (const line of output.replace(ANSI_RE, "").split(/\r?\n/)) {
    const [name, ...rest] = line.trim().split(/\s+/);
    if (name !== sandboxName) continue;
    // Only the columns after the name are searched, so a sandbox that is
    // itself called "Error" or "Failed" is not reported as failed.
    return rest.find((col) => TERMINAL_SANDBOX_FAILURE_PHASES.has(col)) ?? null;
  }
  return null;
}

/** Runs `sandbox list` and reports whether the sandbox row shows a failure phase. */
function sandboxListShowsErrorPhase(
  sandboxName: string,
  runCaptureOpenshell: RunCaptureOpenshellFn,
): boolean {
  try {
    const list = runCaptureOpenshell(["sandbox", "list"], {
      ignoreError: true,
      suppressOutput: true,
      timeout: DOCKER_GPU_PATCH_TIMEOUT_MS,
    });
    return parseSandboxListFailurePhase(list, sandboxName) !== null;
  } catch {
    // Best effort: a failing list call is not evidence of a failure phase.
    return false;
  }
}

/**
 * Polls `sandbox exec -n <sandboxName> -- true` until it succeeds, the
 * deadline passes, or the sandbox stays in a failure phase for the configured
 * number of consecutive polls.
 *
 * @param sandboxName Sandbox to probe.
 * @param timeoutSecs Overall wait budget in seconds; values below 1 and NaN
 *   are treated as 1 so at least one poll always runs.
 * @param deps Injectable command runners, sleep and debounce override.
 * @returns `true` when the exec probe succeeded or no `runOpenshell` was
 *   supplied; `false` on timeout or on a sustained failure phase.
 */
export function waitForOpenShellSupervisorReconnect(
  sandboxName: string,
  timeoutSecs: number,
  deps: DockerGpuSupervisorReconnectDeps,
): boolean {
  if (!deps.runOpenshell) return true;
  const sleep = deps.sleep ?? defaultSleep;
  // Math.max(1, NaN) is NaN, which would make the deadline NaN and skip the
  // loop without a single poll.
  const budgetSecs = Number.isNaN(timeoutSecs) ? 1 : Math.max(1, timeoutSecs);
  const deadline = Date.now() + budgetSecs * 1000;
  const errorPhaseDebouncePolls =
    deps.errorPhaseDebouncePolls == null || !Number.isFinite(deps.errorPhaseDebouncePolls)
      ? getDockerGpuSupervisorReconnectErrorDebouncePolls()
      : Math.max(1, Math.trunc(deps.errorPhaseDebouncePolls));
  let consecutiveErrorPolls = 0;
  while (Date.now() <= deadline) {
    const result = deps.runOpenshell(
      ["sandbox", "exec", "-n", sandboxName, "--", "true"],
      { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS },
    );
    if (isZeroStatus(result)) return true;
    if (
      deps.runCaptureOpenshell &&
      sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell)
    ) {
      consecutiveErrorPolls += 1;
      // Fast-fail only on a sustained failure phase; the check happens before
      // the sleep so the final poll does not add an extra delay.
      if (consecutiveErrorPolls >= errorPhaseDebouncePolls) return false;
    } else {
      // Any poll that does not show a failure phase restarts the count.
      consecutiveErrorPolls = 0;
    }
    sleep(2);
  }
  return false;
}

/**
 * Resolves the supervisor-reconnect timeout in seconds.
 *
 * The fallback is the larger of the rounded sandbox-ready timeout (non-finite
 * input counts as 1) and the module minimum; the env var named by
 * `DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV` is read through `envInt` with
 * that fallback, and the result is clamped to at least 1.
 */
export function getDockerGpuSupervisorReconnectTimeoutSecs(
  sandboxReadyTimeoutSecs: number,
  env: Record<string, string | undefined> = process.env,
): number {
  const readyTimeoutSecs = Number.isFinite(sandboxReadyTimeoutSecs)
    ? Math.max(1, Math.round(sandboxReadyTimeoutSecs))
    : 1;
  const fallback = Math.max(readyTimeoutSecs, DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS);
  return Math.max(1, envInt(DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, fallback, env));
}

/**
 * Resolves the number of consecutive failure-phase polls required before the
 * reconnect wait fast-fails: the env var named by
 * `DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV` read through `envInt`
 * with a default of 5, clamped to at least 1.
 */
export function getDockerGpuSupervisorReconnectErrorDebouncePolls(
  env: Record<string, string | undefined> = process.env,
): number {
  return Math.max(
    1,
    envInt(
      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV,
      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS,
      env,
    ),
  );
}