From 13abdbab27e8ef5379bdd7f1ab29c7d9c62ce9e8 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Tue, 2 Jun 2026 12:03:09 +0000 Subject: [PATCH 1/4] fix(onboard): debounce Docker GPU patch supervisor reconnect Error-phase short-circuit Signed-off-by: Tinson Lai --- src/lib/onboard/docker-gpu-patch.test.ts | 116 ++++++++++++++++++++++- src/lib/onboard/docker-gpu-patch.ts | 45 ++++++++- 2 files changed, 154 insertions(+), 7 deletions(-) diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts index cf560e7e4f..e8ae5e799d 100644 --- a/src/lib/onboard/docker-gpu-patch.test.ts +++ b/src/lib/onboard/docker-gpu-patch.test.ts @@ -20,6 +20,7 @@ import { dockerReportsNvidiaCdiDevices, formatDockerInspectNetworkSummary, getDockerGpuPatchNetworkMode, + getDockerGpuSupervisorReconnectErrorDebouncePolls, getDockerGpuSupervisorReconnectTimeoutSecs, recreateOpenShellDockerSandboxWithGpu, selectDockerGpuPatchMode, @@ -837,7 +838,10 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { it("short-circuits the supervisor-reconnect wait when the sandbox enters Error phase", () => { // Without the short-circuit, a patched container that crashes on startup // leaves users waiting the full 900s+ supervisor-reconnect timeout before - // any Error-phase diagnostics run (#4316). + // any Error-phase diagnostics run (#4316). With the #4664 debounce now in + // place, this test asserts the K=1 (no-debounce) behaviour explicitly so + // the original fast-fail intent is preserved when the operator opts out + // of the debounce. const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); const listOutputs = [ "alpha Provisioning 1s ago", @@ -853,10 +857,11 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { runOpenshell, runCaptureOpenshell, sleep, + errorPhaseDebouncePolls: 1, }); expect(ok).toBe(false); - // Without short-circuit we'd loop ~300 iterations. 
With it, the second + // Without short-circuit we'd loop ~300 iterations. With K=1 the second // iteration's list output shows Error and the wait bails out. expect(runOpenshell).toHaveBeenCalledTimes(2); expect(sleep).toHaveBeenCalledTimes(1); @@ -1201,3 +1206,110 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { } }); }); + +// Regression coverage for NemoClaw issue #4664: the Docker GPU patch +// supervisor-reconnect wait must absorb a transient Error phase reported +// while OpenShell's sandbox-list cache catches up to the newly-recreated +// GPU container (old-container teardown briefly marks the row Error before +// the host re-registers the new container). Without debouncing, the +// #4316 fast-fail short-circuits within ~12s on a healthy GPU sandbox +// whose container is running and whose supervisor has already logged +// `LIFECYCLE:INSTALL OpenShell Sandbox Supervisor success`. +describe("docker-gpu-patch supervisor-reconnect Error-phase debounce (#4664)", () => { + it("absorbs a transient Error phase shorter than the debounce window", () => { + const execOutputs = [ + { status: 1, stderr: "sandbox not ready" }, + { status: 1, stderr: "sandbox not ready" }, + { status: 1, stderr: "sandbox not ready" }, + { status: 0, stdout: "" }, + ]; + let execIdx = 0; + const runOpenshell = vi.fn( + () => execOutputs[Math.min(execIdx++, execOutputs.length - 1)], + ); + const listOutputs = [ + "alpha Error 1s ago", + "alpha Error 3s ago", + "alpha Provisioning 5s ago", + "alpha Ready 7s ago", + ]; + let listIdx = 0; + const runCaptureOpenshell = vi.fn( + () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], + ); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 5, + }); + + expect(ok).toBe(true); + expect(runOpenshell).toHaveBeenCalledTimes(4); + }); + + it("still fast-fails when Error phase persists for the full debounce 
window", () => { + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 3, + }); + + expect(ok).toBe(false); + // Three consecutive Error polls trigger the short-circuit on poll #3. + // Sleeps happen only between polls 1->2 and 2->3, so two sleeps total. + expect(runOpenshell).toHaveBeenCalledTimes(3); + expect(sleep).toHaveBeenCalledTimes(2); + }); + + it("resets the consecutive-Error counter when the phase recovers", () => { + // Error, Error, Provisioning (counter resets), Error, Error, Error + // -> bails out on the 3rd post-recovery Error, not on the 2nd overall. + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const listOutputs = [ + "alpha Error 1s ago", + "alpha Error 3s ago", + "alpha Provisioning 5s ago", + "alpha Error 7s ago", + "alpha Error 9s ago", + "alpha Error 11s ago", + ]; + let listIdx = 0; + const runCaptureOpenshell = vi.fn( + () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], + ); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 3, + }); + + expect(ok).toBe(false); + expect(runOpenshell).toHaveBeenCalledTimes(6); + }); + + it("defaults the debounce to 5 polls and honors the env override", () => { + expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5); + expect( + getDockerGpuSupervisorReconnectErrorDebouncePolls({ + NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2", + }), + ).toBe(2); + // Non-positive values are clamped to a minimum of 1. 
+ expect( + getDockerGpuSupervisorReconnectErrorDebouncePolls({ + NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "0", + }), + ).toBe(1); + }); +}); diff --git a/src/lib/onboard/docker-gpu-patch.ts b/src/lib/onboard/docker-gpu-patch.ts index 5e9dd16a13..21c83db59c 100644 --- a/src/lib/onboard/docker-gpu-patch.ts +++ b/src/lib/onboard/docker-gpu-patch.ts @@ -24,8 +24,17 @@ const OPENSHELL_SANDBOX_COMMAND_ENV = "OPENSHELL_SANDBOX_COMMAND"; const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000; const DOCKER_GPU_PATCH_WAIT_SECS = 180; const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900; +// Default number of consecutive Error-phase polls required before the +// supervisor-reconnect wait short-circuits. With a 2-second poll interval this +// is ~10s of sustained Error before fast-fail, which absorbs the transient +// Error reported while OpenShell's sandbox-list cache catches up to the +// newly-recreated GPU container (#4664) while still bailing fast on a +// patched container that actually crashed on startup (#4316). +const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5; export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV = "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT"; +export const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV = + "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE"; export const DOCKER_GPU_PATCH_NETWORK_ENV = "NEMOCLAW_DOCKER_GPU_PATCH_NETWORK"; const MAX_DOCKER_CONTAINER_NAME_LENGTH = 253; const GPU_ENV_KEYS = new Set([ @@ -70,6 +79,12 @@ export type DockerGpuPatchDeps = { readDir?: (dirPath: string) => string[] | null; /** Injectable file reader for unit testing CDI spec content checks. */ readFile?: (filePath: string) => string | null; + /** + * Number of consecutive Error-phase polls required before the + * supervisor-reconnect wait short-circuits. Omit to use the + * env-configurable default (#4664). 
+ */ + errorPhaseDebouncePolls?: number; }; export type DockerGpuPatchModeKind = "gpus" | "nvidia-runtime" | "cdi"; @@ -859,21 +874,28 @@ function waitForOpenShellSandboxExec( if (!deps.runOpenshell) return true; const d = depsWithDefaults(deps); const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000; + const errorPhaseDebouncePolls = + deps.errorPhaseDebouncePolls ?? getDockerGpuSupervisorReconnectErrorDebouncePolls(); + let consecutiveErrorPolls = 0; while (Date.now() <= deadline) { const result = deps.runOpenshell( ["sandbox", "exec", "-n", sandboxName, "--", "true"], { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS }, ); if (isZeroStatus(result)) return true; - // Short-circuit the supervisor-reconnect wait when the sandbox enters a - // terminal failure phase. Without this, a patched container that exits - // on startup leaves the user staring at the supervisor-reconnect - // timeout (default 900s) before any Error-phase diagnostics run (#4316). + // Debounce the terminal-phase short-circuit. A patched container that + // crashes on startup still fast-fails (#4316), but a transient Error + // reported while OpenShell's sandbox-list cache catches up to the + // newly-recreated GPU container is not treated as fatal (#4664). The + // poll count required is configurable via env for operator tuning. 
if ( deps.runCaptureOpenshell && sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell) ) { - return false; + consecutiveErrorPolls += 1; + if (consecutiveErrorPolls >= errorPhaseDebouncePolls) return false; + } else { + consecutiveErrorPolls = 0; } d.sleep(2); } @@ -899,6 +921,19 @@ export function getDockerGpuSupervisorReconnectTimeoutSecs( ); } +export function getDockerGpuSupervisorReconnectErrorDebouncePolls( + env: Record = process.env, +): number { + return Math.max( + 1, + envInt( + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV, + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS, + env, + ), + ); +} + function decoratePatchError( error: T, context: DockerGpuPatchFailureContext, From c8bc1c44cbbb4c9ce8dca86310e4d375c9b21d7e Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Tue, 2 Jun 2026 12:31:13 +0000 Subject: [PATCH 2/4] refactor(onboard): extract Docker GPU supervisor-reconnect debounce module + document env Signed-off-by: Tinson Lai --- docs/reference/troubleshooting.mdx | 5 + .../references/troubleshooting.md | 5 + src/lib/onboard/docker-gpu-patch.test.ts | 115 +------------ src/lib/onboard/docker-gpu-patch.ts | 122 +++----------- .../docker-gpu-supervisor-reconnect.test.ts | 115 +++++++++++++ .../docker-gpu-supervisor-reconnect.ts | 156 ++++++++++++++++++ 6 files changed, 304 insertions(+), 214 deletions(-) create mode 100644 src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts create mode 100644 src/lib/onboard/docker-gpu-supervisor-reconnect.ts diff --git a/docs/reference/troubleshooting.mdx b/docs/reference/troubleshooting.mdx index 6d23f1f476..8d57d3f8cd 100644 --- a/docs/reference/troubleshooting.mdx +++ b/docs/reference/troubleshooting.mdx @@ -1254,6 +1254,11 @@ Fix the NVIDIA Container Toolkit or CDI configuration reported in the diagnostic If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`. 
Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting. +If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal. +The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to five consecutive polls of about 10 seconds in total. +Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example slow WSL2 + Docker Desktop setups. +Set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`. + ### `pip install` fails with a system-packages error Recent Ubuntu releases (including DGX Spark's Ubuntu 24.04) mark the system Python install as externally managed, so `pip install` without a virtual environment fails. diff --git a/skills/nemoclaw-user-reference/references/troubleshooting.md b/skills/nemoclaw-user-reference/references/troubleshooting.md index bcee680bfe..7ee345ab8f 100644 --- a/skills/nemoclaw-user-reference/references/troubleshooting.md +++ b/skills/nemoclaw-user-reference/references/troubleshooting.md @@ -1244,6 +1244,11 @@ Fix the NVIDIA Container Toolkit or CDI configuration reported in the diagnostic If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`. Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting. 
+If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal. +The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to five consecutive polls of about 10 seconds in total. +Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example slow WSL2 + Docker Desktop setups. +Set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`. + ### `pip install` fails with a system-packages error Recent Ubuntu releases (including DGX Spark's Ubuntu 24.04) mark the system Python install as externally managed, so `pip install` without a virtual environment fails. diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts index e8ae5e799d..260b16be16 100644 --- a/src/lib/onboard/docker-gpu-patch.test.ts +++ b/src/lib/onboard/docker-gpu-patch.test.ts @@ -20,7 +20,6 @@ import { dockerReportsNvidiaCdiDevices, formatDockerInspectNetworkSummary, getDockerGpuPatchNetworkMode, - getDockerGpuSupervisorReconnectErrorDebouncePolls, getDockerGpuSupervisorReconnectTimeoutSecs, recreateOpenShellDockerSandboxWithGpu, selectDockerGpuPatchMode, @@ -838,10 +837,10 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { it("short-circuits the supervisor-reconnect wait when the sandbox enters Error phase", () => { // Without the short-circuit, a patched container that crashes on startup // leaves users waiting the full 900s+ supervisor-reconnect timeout before - // any Error-phase diagnostics run (#4316). 
With the #4664 debounce now in - // place, this test asserts the K=1 (no-debounce) behaviour explicitly so - // the original fast-fail intent is preserved when the operator opts out - // of the debounce. + // any Error-phase diagnostics run. With the debounce now in place, this + // test asserts the K=1 (no-debounce) behavior explicitly so the original + // fast-fail intent is preserved when the operator opts out of the + // debounce. const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); const listOutputs = [ "alpha Provisioning 1s ago", @@ -1207,109 +1206,3 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { }); }); -// Regression coverage for NemoClaw issue #4664: the Docker GPU patch -// supervisor-reconnect wait must absorb a transient Error phase reported -// while OpenShell's sandbox-list cache catches up to the newly-recreated -// GPU container (old-container teardown briefly marks the row Error before -// the host re-registers the new container). Without debouncing, the -// #4316 fast-fail short-circuits within ~12s on a healthy GPU sandbox -// whose container is running and whose supervisor has already logged -// `LIFECYCLE:INSTALL OpenShell Sandbox Supervisor success`. 
-describe("docker-gpu-patch supervisor-reconnect Error-phase debounce (#4664)", () => { - it("absorbs a transient Error phase shorter than the debounce window", () => { - const execOutputs = [ - { status: 1, stderr: "sandbox not ready" }, - { status: 1, stderr: "sandbox not ready" }, - { status: 1, stderr: "sandbox not ready" }, - { status: 0, stdout: "" }, - ]; - let execIdx = 0; - const runOpenshell = vi.fn( - () => execOutputs[Math.min(execIdx++, execOutputs.length - 1)], - ); - const listOutputs = [ - "alpha Error 1s ago", - "alpha Error 3s ago", - "alpha Provisioning 5s ago", - "alpha Ready 7s ago", - ]; - let listIdx = 0; - const runCaptureOpenshell = vi.fn( - () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], - ); - const sleep = vi.fn(); - - const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { - runOpenshell, - runCaptureOpenshell, - sleep, - errorPhaseDebouncePolls: 5, - }); - - expect(ok).toBe(true); - expect(runOpenshell).toHaveBeenCalledTimes(4); - }); - - it("still fast-fails when Error phase persists for the full debounce window", () => { - const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); - const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); - const sleep = vi.fn(); - - const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { - runOpenshell, - runCaptureOpenshell, - sleep, - errorPhaseDebouncePolls: 3, - }); - - expect(ok).toBe(false); - // Three consecutive Error polls trigger the short-circuit on poll #3. - // Sleeps happen only between polls 1->2 and 2->3, so two sleeps total. - expect(runOpenshell).toHaveBeenCalledTimes(3); - expect(sleep).toHaveBeenCalledTimes(2); - }); - - it("resets the consecutive-Error counter when the phase recovers", () => { - // Error, Error, Provisioning (counter resets), Error, Error, Error - // -> bails out on the 3rd post-recovery Error, not on the 2nd overall. 
- const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); - const listOutputs = [ - "alpha Error 1s ago", - "alpha Error 3s ago", - "alpha Provisioning 5s ago", - "alpha Error 7s ago", - "alpha Error 9s ago", - "alpha Error 11s ago", - ]; - let listIdx = 0; - const runCaptureOpenshell = vi.fn( - () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], - ); - const sleep = vi.fn(); - - const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { - runOpenshell, - runCaptureOpenshell, - sleep, - errorPhaseDebouncePolls: 3, - }); - - expect(ok).toBe(false); - expect(runOpenshell).toHaveBeenCalledTimes(6); - }); - - it("defaults the debounce to 5 polls and honors the env override", () => { - expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5); - expect( - getDockerGpuSupervisorReconnectErrorDebouncePolls({ - NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2", - }), - ).toBe(2); - // Non-positive values are clamped to a minimum of 1. 
- expect( - getDockerGpuSupervisorReconnectErrorDebouncePolls({ - NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "0", - }), - ).toBe(1); - }); -}); diff --git a/src/lib/onboard/docker-gpu-patch.ts b/src/lib/onboard/docker-gpu-patch.ts index 21c83db59c..d46705ebc5 100644 --- a/src/lib/onboard/docker-gpu-patch.ts +++ b/src/lib/onboard/docker-gpu-patch.ts @@ -14,7 +14,22 @@ import { dockerRunDetached, dockerStop, } from "../adapters/docker"; -import { envInt } from "./env"; +import { + type DockerGpuSupervisorReconnectDeps, + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV, + DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, + getDockerGpuSupervisorReconnectErrorDebouncePolls, + getDockerGpuSupervisorReconnectTimeoutSecs, + waitForOpenShellSupervisorReconnect, +} from "./docker-gpu-supervisor-reconnect"; +export { + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV, + DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, + getDockerGpuSupervisorReconnectErrorDebouncePolls, + getDockerGpuSupervisorReconnectTimeoutSecs, + waitForOpenShellSupervisorReconnect, +}; +export type { DockerGpuSupervisorReconnectDeps }; export const OPENSHELL_MANAGED_BY_LABEL = "openshell.ai/managed-by"; export const OPENSHELL_MANAGED_BY_VALUE = "openshell"; @@ -23,18 +38,6 @@ const OPENSHELL_SANDBOX_COMMAND_ENV = "OPENSHELL_SANDBOX_COMMAND"; const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000; const DOCKER_GPU_PATCH_WAIT_SECS = 180; -const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900; -// Default number of consecutive Error-phase polls required before the -// supervisor-reconnect wait short-circuits. With a 2-second poll interval this -// is ~10s of sustained Error before fast-fail, which absorbs the transient -// Error reported while OpenShell's sandbox-list cache catches up to the -// newly-recreated GPU container (#4664) while still bailing fast on a -// patched container that actually crashed on startup (#4316). 
-const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5; -export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV = - "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT"; -export const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV = - "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE"; export const DOCKER_GPU_PATCH_NETWORK_ENV = "NEMOCLAW_DOCKER_GPU_PATCH_NETWORK"; const MAX_DOCKER_CONTAINER_NAME_LENGTH = 253; const GPU_ENV_KEYS = new Set([ @@ -80,9 +83,8 @@ export type DockerGpuPatchDeps = { /** Injectable file reader for unit testing CDI spec content checks. */ readFile?: (filePath: string) => string | null; /** - * Number of consecutive Error-phase polls required before the - * supervisor-reconnect wait short-circuits. Omit to use the - * env-configurable default (#4664). + * Forwarded to the supervisor-reconnect wait. See + * `DockerGpuSupervisorReconnectDeps.errorPhaseDebouncePolls`. */ errorPhaseDebouncePolls?: number; }; @@ -848,92 +850,6 @@ function waitForNewContainerId( return null; } -function sandboxListShowsErrorPhase( - sandboxName: string, - runCaptureOpenshell: NonNullable, -): boolean { - try { - const list = runCaptureOpenshell(["sandbox", "list"], { - ignoreError: true, - suppressOutput: true, - timeout: DOCKER_GPU_PATCH_TIMEOUT_MS, - }); - return SANDBOX_FAILURE_PHASE_TOKENS.has( - parseSandboxPhaseFromListOutput(list, sandboxName) ?? "", - ); - } catch { - return false; - } -} - -function waitForOpenShellSandboxExec( - sandboxName: string, - timeoutSecs: number, - deps: DockerGpuPatchDeps, -): boolean { - if (!deps.runOpenshell) return true; - const d = depsWithDefaults(deps); - const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000; - const errorPhaseDebouncePolls = - deps.errorPhaseDebouncePolls ?? 
getDockerGpuSupervisorReconnectErrorDebouncePolls(); - let consecutiveErrorPolls = 0; - while (Date.now() <= deadline) { - const result = deps.runOpenshell( - ["sandbox", "exec", "-n", sandboxName, "--", "true"], - { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS }, - ); - if (isZeroStatus(result)) return true; - // Debounce the terminal-phase short-circuit. A patched container that - // crashes on startup still fast-fails (#4316), but a transient Error - // reported while OpenShell's sandbox-list cache catches up to the - // newly-recreated GPU container is not treated as fatal (#4664). The - // poll count required is configurable via env for operator tuning. - if ( - deps.runCaptureOpenshell && - sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell) - ) { - consecutiveErrorPolls += 1; - if (consecutiveErrorPolls >= errorPhaseDebouncePolls) return false; - } else { - consecutiveErrorPolls = 0; - } - d.sleep(2); - } - return false; -} - -export const waitForOpenShellSupervisorReconnect = waitForOpenShellSandboxExec; - -export function getDockerGpuSupervisorReconnectTimeoutSecs( - sandboxReadyTimeoutSecs: number, - env: Record = process.env, -): number { - const readyTimeoutSecs = Number.isFinite(sandboxReadyTimeoutSecs) - ? 
Math.max(1, Math.round(sandboxReadyTimeoutSecs)) - : 1; - const fallback = Math.max( - readyTimeoutSecs, - DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS, - ); - return Math.max( - 1, - envInt(DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, fallback, env), - ); -} - -export function getDockerGpuSupervisorReconnectErrorDebouncePolls( - env: Record = process.env, -): number { - return Math.max( - 1, - envInt( - DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV, - DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS, - env, - ), - ); -} - function decoratePatchError( error: T, context: DockerGpuPatchFailureContext, @@ -1052,7 +968,7 @@ export function recreateOpenShellDockerSandboxWithGpu( }); if (options.waitForSupervisor !== false) { - const execReady = waitForOpenShellSandboxExec( + const execReady = waitForOpenShellSupervisorReconnect( options.sandboxName, options.timeoutSecs ?? DOCKER_GPU_PATCH_WAIT_SECS, deps, diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts new file mode 100644 index 0000000000..07e47d17d3 --- /dev/null +++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { + getDockerGpuSupervisorReconnectErrorDebouncePolls, + waitForOpenShellSupervisorReconnect, +} from "../../../dist/lib/onboard/docker-gpu-supervisor-reconnect"; + +// The Docker GPU patch supervisor-reconnect wait must absorb a transient +// Error phase reported while OpenShell's sandbox-list cache catches up to +// the newly-recreated GPU container. The old-container teardown briefly +// marks the row Error before the host re-registers the new container. 
+// Without debouncing, the fast-fail short-circuits within ~12s on a healthy +// GPU sandbox whose container is running and whose supervisor has already +// logged `LIFECYCLE:INSTALL OpenShell Sandbox Supervisor success`. +describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => { + it("absorbs a transient Error phase shorter than the debounce window", () => { + const execOutputs = [ + { status: 1, stderr: "sandbox not ready" }, + { status: 1, stderr: "sandbox not ready" }, + { status: 1, stderr: "sandbox not ready" }, + { status: 0, stdout: "" }, + ]; + let execIdx = 0; + const runOpenshell = vi.fn( + () => execOutputs[Math.min(execIdx++, execOutputs.length - 1)], + ); + const listOutputs = [ + "alpha Error 1s ago", + "alpha Error 3s ago", + "alpha Provisioning 5s ago", + "alpha Ready 7s ago", + ]; + let listIdx = 0; + const runCaptureOpenshell = vi.fn( + () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], + ); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 5, + }); + + expect(ok).toBe(true); + expect(runOpenshell).toHaveBeenCalledTimes(4); + }); + + it("still fast-fails when Error phase persists for the full debounce window", () => { + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 3, + }); + + expect(ok).toBe(false); + // Three consecutive Error polls trigger the short-circuit on poll 3. + // Sleeps happen only between polls 1->2 and 2->3, so two sleeps total. 
+ expect(runOpenshell).toHaveBeenCalledTimes(3); + expect(sleep).toHaveBeenCalledTimes(2); + }); + + it("resets the consecutive-Error counter when the phase recovers", () => { + // Error, Error, Provisioning (counter resets), Error, Error, Error + // -> bails out on the 3rd post-recovery Error, not earlier. + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const listOutputs = [ + "alpha Error 1s ago", + "alpha Error 3s ago", + "alpha Provisioning 5s ago", + "alpha Error 7s ago", + "alpha Error 9s ago", + "alpha Error 11s ago", + ]; + let listIdx = 0; + const runCaptureOpenshell = vi.fn( + () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)], + ); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 3, + }); + + expect(ok).toBe(false); + expect(runOpenshell).toHaveBeenCalledTimes(6); + }); + + it("defaults the debounce to 5 polls and honors the env override", () => { + expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5); + expect( + getDockerGpuSupervisorReconnectErrorDebouncePolls({ + NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2", + }), + ).toBe(2); + // Non-positive values are clamped to a minimum of 1. + expect( + getDockerGpuSupervisorReconnectErrorDebouncePolls({ + NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "0", + }), + ).toBe(1); + }); +}); diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts new file mode 100644 index 0000000000..298ad300ec --- /dev/null +++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts @@ -0,0 +1,156 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Supervisor-reconnect wait for the Docker GPU patch path. 
+ * + * Source-of-truth boundary + * ------------------------ + * The transient Error phase this module debounces is observed in the + * `openshell sandbox list` cache while the OpenShell host re-registers the + * newly-recreated GPU container after `docker stop` + `docker run`. The + * preferred fix lives at the OpenShell gateway: `sandbox list` should not + * report a terminal phase for a sandbox whose Docker container is being + * recreated by the GPU patch path. Until that upstream change ships, + * NemoClaw tolerates the transient Error at this layer via a + * consecutive-poll debounce. + * + * Removal condition + * ----------------- + * Delete this debounce once OpenShell guarantees `sandbox list` skips the + * brief Error transition during a known recreate. A real-Docker GPU E2E + * reproduction (e.g. `e2e-branch-validation:gpu`, + * `gpu-repo-local-ollama-openclaw`) showing a transient teardown-Error that + * recovers to Ready is the runtime evidence required. + */ + +import { envInt } from "./env"; + +const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000; +const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900; +// Default consecutive Error-phase polls required before fast-fail. With a +// 2-second poll interval this is ~10s of sustained Error, which absorbs the +// transient Error reported during container recreation while still bailing +// fast on a patched container that crashed on startup. 
+const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5; + +export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV = + "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT"; +export const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV = + "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE"; + +const TERMINAL_SANDBOX_FAILURE_PHASES = new Set(["Error", "Failed", "CrashLoopBackOff"]); + +type DockerRunResult = { + status?: number | null; + stdout?: string | Buffer | null; + stderr?: string | Buffer | null; +}; + +type RunOpenshellFn = ( + args: string[], + opts?: Record, +) => DockerRunResult; +type RunCaptureOpenshellFn = ( + args: string[], + opts?: Record, +) => string; + +export type DockerGpuSupervisorReconnectDeps = { + runOpenshell?: RunOpenshellFn; + runCaptureOpenshell?: RunCaptureOpenshellFn; + sleep?: (seconds: number) => void; + errorPhaseDebouncePolls?: number; +}; + +function defaultSleep(seconds: number): void { + Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, Math.max(0, seconds) * 1000); +} + +function isZeroStatus(result: DockerRunResult | null | undefined): boolean { + return Number(result?.status ?? 0) === 0; +} + +const ANSI_RE = /\x1b\[[0-9;]*m/g; + +function parseSandboxListFailurePhase(output: string, sandboxName: string): string | null { + if (typeof output !== "string" || !output.includes(sandboxName)) return null; + for (const line of output.replace(ANSI_RE, "").split(/\r?\n/)) { + const cols = line.trim().split(/\s+/); + if (cols[0] === sandboxName) { + return cols.find((col) => TERMINAL_SANDBOX_FAILURE_PHASES.has(col)) ?? 
null; + } + } + return null; +} + +function sandboxListShowsErrorPhase( + sandboxName: string, + runCaptureOpenshell: RunCaptureOpenshellFn, +): boolean { + try { + const list = runCaptureOpenshell(["sandbox", "list"], { + ignoreError: true, + suppressOutput: true, + timeout: DOCKER_GPU_PATCH_TIMEOUT_MS, + }); + return parseSandboxListFailurePhase(list, sandboxName) !== null; + } catch { + return false; + } +} + +export function waitForOpenShellSupervisorReconnect( + sandboxName: string, + timeoutSecs: number, + deps: DockerGpuSupervisorReconnectDeps, +): boolean { + if (!deps.runOpenshell) return true; + const sleep = deps.sleep ?? defaultSleep; + const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000; + const errorPhaseDebouncePolls = + deps.errorPhaseDebouncePolls ?? getDockerGpuSupervisorReconnectErrorDebouncePolls(); + let consecutiveErrorPolls = 0; + while (Date.now() <= deadline) { + const result = deps.runOpenshell( + ["sandbox", "exec", "-n", sandboxName, "--", "true"], + { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS }, + ); + if (isZeroStatus(result)) return true; + if ( + deps.runCaptureOpenshell && + sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell) + ) { + consecutiveErrorPolls += 1; + if (consecutiveErrorPolls >= errorPhaseDebouncePolls) return false; + } else { + consecutiveErrorPolls = 0; + } + sleep(2); + } + return false; +} + +export function getDockerGpuSupervisorReconnectTimeoutSecs( + sandboxReadyTimeoutSecs: number, + env: Record<string, string | undefined> = process.env, +): number { + const readyTimeoutSecs = Number.isFinite(sandboxReadyTimeoutSecs) + ?
Math.max(1, Math.round(sandboxReadyTimeoutSecs)) + : 1; + const fallback = Math.max(readyTimeoutSecs, DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS); + return Math.max(1, envInt(DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, fallback, env)); +} + +export function getDockerGpuSupervisorReconnectErrorDebouncePolls( + env: Record<string, string | undefined> = process.env, +): number { + return Math.max( + 1, + envInt( + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV, + DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS, + env, + ), + ); +} From 21013fc441f44d2cacbab0fffecd7b6818adff61 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Tue, 2 Jun 2026 12:46:08 +0000 Subject: [PATCH 3/4] fix(onboard): clamp injected supervisor-reconnect debounce override to minimum 1 Signed-off-by: Tinson Lai --- .../docker-gpu-supervisor-reconnect.test.ts | 20 +++++++++++++++++++ .../docker-gpu-supervisor-reconnect.ts | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts index 07e47d17d3..e2cabda6b9 100644 --- a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts +++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts @@ -112,4 +112,24 @@ describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => { }), ).toBe(1); }); + + it("clamps an injected debounce override to the same minimum as the env path", () => { + // 0 / negative / fractional overrides must not bypass the ≥1 contract that + // the env-backed helper enforces. + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: 0, + }); + + expect(ok).toBe(false); + // Clamped to K=1: first Error poll short-circuits with no preceding sleep.
+ expect(runOpenshell).toHaveBeenCalledTimes(1); + expect(sleep).not.toHaveBeenCalled(); + }); }); diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts index 298ad300ec..3b052a84f8 100644 --- a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts +++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts @@ -109,7 +109,9 @@ export function waitForOpenShellSupervisorReconnect( const sleep = deps.sleep ?? defaultSleep; const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000; const errorPhaseDebouncePolls = - deps.errorPhaseDebouncePolls ?? getDockerGpuSupervisorReconnectErrorDebouncePolls(); + deps.errorPhaseDebouncePolls == null + ? getDockerGpuSupervisorReconnectErrorDebouncePolls() + : Math.max(1, Math.trunc(deps.errorPhaseDebouncePolls)); let consecutiveErrorPolls = 0; while (Date.now() <= deadline) { const result = deps.runOpenshell( From 1b70103e57963b21ea495b6488bba94f7866e654 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Tue, 2 Jun 2026 13:07:32 +0000 Subject: [PATCH 4/4] fix(onboard): reject non-finite supervisor-reconnect debounce overrides + trim EOF Signed-off-by: Tinson Lai --- src/lib/onboard/docker-gpu-patch.test.ts | 1 - .../docker-gpu-supervisor-reconnect.test.ts | 24 +++++++++++++++++++ .../docker-gpu-supervisor-reconnect.ts | 2 +- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts index 260b16be16..88223f119f 100644 --- a/src/lib/onboard/docker-gpu-patch.test.ts +++ b/src/lib/onboard/docker-gpu-patch.test.ts @@ -1205,4 +1205,3 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => { } }); }); - diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts index e2cabda6b9..62976067f0 100644 --- a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts +++ 
b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts @@ -132,4 +132,28 @@ describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => { expect(runOpenshell).toHaveBeenCalledTimes(1); expect(sleep).not.toHaveBeenCalled(); }); + + it("falls back to the env-backed default when an injected override is non-finite", () => { + // NaN / +Infinity / -Infinity overrides must not silently neutralise the + // fast-fail loop. A NaN comparison would always be false and `Infinity` + // would never satisfy `>= debouncePolls`, leaving the wait to burn the + // full timeout window. + for (const bogus of [Number.NaN, Number.POSITIVE_INFINITY, Number.NEGATIVE_INFINITY]) { + const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" })); + const runCaptureOpenshell = vi.fn(() => "alpha Error 1s ago"); + const sleep = vi.fn(); + + const ok = waitForOpenShellSupervisorReconnect("alpha", 600, { + runOpenshell, + runCaptureOpenshell, + sleep, + errorPhaseDebouncePolls: bogus, + }); + + expect(ok).toBe(false); + // Default K=5 from the env-backed helper: 5 polls + 4 sleeps before fast-fail. + expect(runOpenshell).toHaveBeenCalledTimes(5); + expect(sleep).toHaveBeenCalledTimes(4); + } + }); }); diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts index 3b052a84f8..c8906e9501 100644 --- a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts +++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts @@ -109,7 +109,7 @@ export function waitForOpenShellSupervisorReconnect( const sleep = deps.sleep ?? defaultSleep; const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000; const errorPhaseDebouncePolls = - deps.errorPhaseDebouncePolls == null + deps.errorPhaseDebouncePolls == null || !Number.isFinite(deps.errorPhaseDebouncePolls) ? getDockerGpuSupervisorReconnectErrorDebouncePolls() : Math.max(1, Math.trunc(deps.errorPhaseDebouncePolls)); let consecutiveErrorPolls = 0;