From 13abdbab27e8ef5379bdd7f1ab29c7d9c62ce9e8 Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Tue, 2 Jun 2026 12:03:09 +0000
Subject: [PATCH 1/4] fix(onboard): debounce Docker GPU patch supervisor
 reconnect Error-phase short-circuit

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 src/lib/onboard/docker-gpu-patch.test.ts | 116 ++++++++++++++++++++++-
 src/lib/onboard/docker-gpu-patch.ts      |  45 ++++++++-
 2 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts
index cf560e7e4f..e8ae5e799d 100644
--- a/src/lib/onboard/docker-gpu-patch.test.ts
+++ b/src/lib/onboard/docker-gpu-patch.test.ts
@@ -20,6 +20,7 @@ import {
   dockerReportsNvidiaCdiDevices,
   formatDockerInspectNetworkSummary,
   getDockerGpuPatchNetworkMode,
+  getDockerGpuSupervisorReconnectErrorDebouncePolls,
   getDockerGpuSupervisorReconnectTimeoutSecs,
   recreateOpenShellDockerSandboxWithGpu,
   selectDockerGpuPatchMode,
@@ -837,7 +838,10 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => {
   it("short-circuits the supervisor-reconnect wait when the sandbox enters Error phase", () => {
     // Without the short-circuit, a patched container that crashes on startup
     // leaves users waiting the full 900s+ supervisor-reconnect timeout before
-    // any Error-phase diagnostics run (#4316).
+    // any Error-phase diagnostics run (#4316). With the #4664 debounce now in
+    // place, this test asserts the K=1 (no-debounce) behaviour explicitly so
+    // the original fast-fail intent is preserved when the operator opts out
+    // of the debounce.
     const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
     const listOutputs = [
       "alpha   Provisioning   1s ago",
@@ -853,10 +857,11 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => {
       runOpenshell,
       runCaptureOpenshell,
       sleep,
+      errorPhaseDebouncePolls: 1,
     });
 
     expect(ok).toBe(false);
-    // Without short-circuit we'd loop ~300 iterations. With it, the second
+    // Without short-circuit we'd loop ~300 iterations. With K=1 the second
     // iteration's list output shows Error and the wait bails out.
     expect(runOpenshell).toHaveBeenCalledTimes(2);
     expect(sleep).toHaveBeenCalledTimes(1);
@@ -1201,3 +1206,110 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => {
     }
   });
 });
+
+// Regression coverage for NemoClaw issue #4664: the Docker GPU patch
+// supervisor-reconnect wait must absorb a transient Error phase reported
+// while OpenShell's sandbox-list cache catches up to the newly-recreated
+// GPU container (old-container teardown briefly marks the row Error before
+// the host re-registers the new container). Without debouncing, the
+// #4316 fast-fail short-circuits within ~12s on a healthy GPU sandbox
+// whose container is running and whose supervisor has already logged
+// `LIFECYCLE:INSTALL OpenShell Sandbox Supervisor success`.
+describe("docker-gpu-patch supervisor-reconnect Error-phase debounce (#4664)", () => {
+  it("absorbs a transient Error phase shorter than the debounce window", () => {
+    const execOutputs = [
+      { status: 1, stderr: "sandbox not ready" },
+      { status: 1, stderr: "sandbox not ready" },
+      { status: 1, stderr: "sandbox not ready" },
+      { status: 0, stdout: "" },
+    ];
+    let execIdx = 0;
+    const runOpenshell = vi.fn(
+      () => execOutputs[Math.min(execIdx++, execOutputs.length - 1)],
+    );
+    const listOutputs = [
+      "alpha   Error         1s ago",
+      "alpha   Error         3s ago",
+      "alpha   Provisioning  5s ago",
+      "alpha   Ready         7s ago",
+    ];
+    let listIdx = 0;
+    const runCaptureOpenshell = vi.fn(
+      () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)],
+    );
+    const sleep = vi.fn();
+
+    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+      runOpenshell,
+      runCaptureOpenshell,
+      sleep,
+      errorPhaseDebouncePolls: 5,
+    });
+
+    expect(ok).toBe(true);
+    expect(runOpenshell).toHaveBeenCalledTimes(4);
+  });
+
+  it("still fast-fails when Error phase persists for the full debounce window", () => {
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
+    const runCaptureOpenshell = vi.fn(() => "alpha   Error   1s ago");
+    const sleep = vi.fn();
+
+    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+      runOpenshell,
+      runCaptureOpenshell,
+      sleep,
+      errorPhaseDebouncePolls: 3,
+    });
+
+    expect(ok).toBe(false);
+    // Three consecutive Error polls trigger the short-circuit on poll #3.
+    // Sleeps happen only between polls 1->2 and 2->3, so two sleeps total.
+    expect(runOpenshell).toHaveBeenCalledTimes(3);
+    expect(sleep).toHaveBeenCalledTimes(2);
+  });
+
+  it("resets the consecutive-Error counter when the phase recovers", () => {
+    // Error, Error, Provisioning (counter resets), Error, Error, Error
+    // -> bails out on the 3rd post-recovery Error, not on the 2nd overall.
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
+    const listOutputs = [
+      "alpha   Error         1s ago",
+      "alpha   Error         3s ago",
+      "alpha   Provisioning  5s ago",
+      "alpha   Error         7s ago",
+      "alpha   Error         9s ago",
+      "alpha   Error         11s ago",
+    ];
+    let listIdx = 0;
+    const runCaptureOpenshell = vi.fn(
+      () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)],
+    );
+    const sleep = vi.fn();
+
+    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+      runOpenshell,
+      runCaptureOpenshell,
+      sleep,
+      errorPhaseDebouncePolls: 3,
+    });
+
+    expect(ok).toBe(false);
+    expect(runOpenshell).toHaveBeenCalledTimes(6);
+  });
+
+  it("defaults the debounce to 5 polls and honors the env override", () => {
+    expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5);
+    expect(
+      getDockerGpuSupervisorReconnectErrorDebouncePolls({
+        NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2",
+      }),
+    ).toBe(2);
+    // Non-positive values are clamped to a minimum of 1.
+    expect(
+      getDockerGpuSupervisorReconnectErrorDebouncePolls({
+        NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "0",
+      }),
+    ).toBe(1);
+  });
+});
diff --git a/src/lib/onboard/docker-gpu-patch.ts b/src/lib/onboard/docker-gpu-patch.ts
index 5e9dd16a13..21c83db59c 100644
--- a/src/lib/onboard/docker-gpu-patch.ts
+++ b/src/lib/onboard/docker-gpu-patch.ts
@@ -24,8 +24,17 @@ const OPENSHELL_SANDBOX_COMMAND_ENV = "OPENSHELL_SANDBOX_COMMAND";
 const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000;
 const DOCKER_GPU_PATCH_WAIT_SECS = 180;
 const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900;
+// Default number of consecutive Error-phase polls required before the
+// supervisor-reconnect wait short-circuits. With a 2-second poll interval this
+// is ~10s of sustained Error before fast-fail, which absorbs the transient
+// Error reported while OpenShell's sandbox-list cache catches up to the
+// newly-recreated GPU container (#4664) while still bailing fast on a
+// patched container that actually crashed on startup (#4316).
+const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5;
 export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV =
   "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT";
+export const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV =
+  "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE";
 export const DOCKER_GPU_PATCH_NETWORK_ENV = "NEMOCLAW_DOCKER_GPU_PATCH_NETWORK";
 const MAX_DOCKER_CONTAINER_NAME_LENGTH = 253;
 const GPU_ENV_KEYS = new Set([
@@ -70,6 +79,12 @@ export type DockerGpuPatchDeps = {
   readDir?: (dirPath: string) => string[] | null;
   /** Injectable file reader for unit testing CDI spec content checks. */
   readFile?: (filePath: string) => string | null;
+  /**
+   * Number of consecutive Error-phase polls required before the
+   * supervisor-reconnect wait short-circuits. Omit to use the
+   * env-configurable default (#4664).
+   */
+  errorPhaseDebouncePolls?: number;
 };
 
 export type DockerGpuPatchModeKind = "gpus" | "nvidia-runtime" | "cdi";
@@ -859,21 +874,28 @@ function waitForOpenShellSandboxExec(
   if (!deps.runOpenshell) return true;
   const d = depsWithDefaults(deps);
   const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000;
+  const errorPhaseDebouncePolls =
+    deps.errorPhaseDebouncePolls ?? getDockerGpuSupervisorReconnectErrorDebouncePolls();
+  let consecutiveErrorPolls = 0;
   while (Date.now() <= deadline) {
     const result = deps.runOpenshell(
       ["sandbox", "exec", "-n", sandboxName, "--", "true"],
       { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS },
     );
     if (isZeroStatus(result)) return true;
-    // Short-circuit the supervisor-reconnect wait when the sandbox enters a
-    // terminal failure phase. Without this, a patched container that exits
-    // on startup leaves the user staring at the supervisor-reconnect
-    // timeout (default 900s) before any Error-phase diagnostics run (#4316).
+    // Debounce the terminal-phase short-circuit. A patched container that
+    // crashes on startup still fast-fails (#4316), but a transient Error
+    // reported while OpenShell's sandbox-list cache catches up to the
+    // newly-recreated GPU container is not treated as fatal (#4664). The
+    // poll count required is configurable via env for operator tuning.
     if (
       deps.runCaptureOpenshell &&
       sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell)
     ) {
-      return false;
+      consecutiveErrorPolls += 1;
+      if (consecutiveErrorPolls >= errorPhaseDebouncePolls) return false;
+    } else {
+      consecutiveErrorPolls = 0;
     }
     d.sleep(2);
   }
@@ -899,6 +921,19 @@ export function getDockerGpuSupervisorReconnectTimeoutSecs(
   );
 }
 
+export function getDockerGpuSupervisorReconnectErrorDebouncePolls(
+  env: Record<string, string | undefined> = process.env,
+): number {
+  return Math.max(
+    1,
+    envInt(
+      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV,
+      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS,
+      env,
+    ),
+  );
+}
+
 function decoratePatchError<T extends Error>(
   error: T,
   context: DockerGpuPatchFailureContext,

From c8bc1c44cbbb4c9ce8dca86310e4d375c9b21d7e Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Tue, 2 Jun 2026 12:31:13 +0000
Subject: [PATCH 2/4] refactor(onboard): extract Docker GPU
 supervisor-reconnect debounce module + document env

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 docs/reference/troubleshooting.mdx            |   5 +
 .../references/troubleshooting.md             |   5 +
 src/lib/onboard/docker-gpu-patch.test.ts      | 115 +------------
 src/lib/onboard/docker-gpu-patch.ts           | 122 +++-----------
 .../docker-gpu-supervisor-reconnect.test.ts   | 115 +++++++++++++
 .../docker-gpu-supervisor-reconnect.ts        | 156 ++++++++++++++++++
 6 files changed, 304 insertions(+), 214 deletions(-)
 create mode 100644 src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
 create mode 100644 src/lib/onboard/docker-gpu-supervisor-reconnect.ts

diff --git a/docs/reference/troubleshooting.mdx b/docs/reference/troubleshooting.mdx
index 6d23f1f476..8d57d3f8cd 100644
--- a/docs/reference/troubleshooting.mdx
+++ b/docs/reference/troubleshooting.mdx
@@ -1254,6 +1254,11 @@ Fix the NVIDIA Container Toolkit or CDI configuration reported in the diagnostic
 If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`.
 Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting.
 
+If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal.
+By default, the reconnect wait fast-fails only after five consecutive Error-phase polls, which is about 10 seconds in total at the 2-second poll interval.
+Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example slow WSL2 + Docker Desktop setups.
+Set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
+
 ### `pip install` fails with a system-packages error
 
 Recent Ubuntu releases (including DGX Spark's Ubuntu 24.04) mark the system Python install as externally managed, so `pip install` without a virtual environment fails.
diff --git a/skills/nemoclaw-user-reference/references/troubleshooting.md b/skills/nemoclaw-user-reference/references/troubleshooting.md
index bcee680bfe..7ee345ab8f 100644
--- a/skills/nemoclaw-user-reference/references/troubleshooting.md
+++ b/skills/nemoclaw-user-reference/references/troubleshooting.md
@@ -1244,6 +1244,11 @@ Fix the NVIDIA Container Toolkit or CDI configuration reported in the diagnostic
 If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`.
 Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting.
 
+If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal.
+By default, the reconnect wait fast-fails only after five consecutive Error-phase polls, which is about 10 seconds in total at the 2-second poll interval.
+Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example slow WSL2 + Docker Desktop setups.
+Set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
+
 ### `pip install` fails with a system-packages error
 
 Recent Ubuntu releases (including DGX Spark's Ubuntu 24.04) mark the system Python install as externally managed, so `pip install` without a virtual environment fails.
diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts
index e8ae5e799d..260b16be16 100644
--- a/src/lib/onboard/docker-gpu-patch.test.ts
+++ b/src/lib/onboard/docker-gpu-patch.test.ts
@@ -20,7 +20,6 @@ import {
   dockerReportsNvidiaCdiDevices,
   formatDockerInspectNetworkSummary,
   getDockerGpuPatchNetworkMode,
-  getDockerGpuSupervisorReconnectErrorDebouncePolls,
   getDockerGpuSupervisorReconnectTimeoutSecs,
   recreateOpenShellDockerSandboxWithGpu,
   selectDockerGpuPatchMode,
@@ -838,10 +837,10 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => {
   it("short-circuits the supervisor-reconnect wait when the sandbox enters Error phase", () => {
     // Without the short-circuit, a patched container that crashes on startup
     // leaves users waiting the full 900s+ supervisor-reconnect timeout before
-    // any Error-phase diagnostics run (#4316). With the #4664 debounce now in
-    // place, this test asserts the K=1 (no-debounce) behaviour explicitly so
-    // the original fast-fail intent is preserved when the operator opts out
-    // of the debounce.
+    // any Error-phase diagnostics run. With the debounce now in place, this
+    // test asserts the K=1 (no-debounce) behavior explicitly so the original
+    // fast-fail intent is preserved when the operator opts out of the
+    // debounce.
     const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
     const listOutputs = [
       "alpha   Provisioning   1s ago",
@@ -1207,109 +1206,3 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => {
   });
 });
 
-// Regression coverage for NemoClaw issue #4664: the Docker GPU patch
-// supervisor-reconnect wait must absorb a transient Error phase reported
-// while OpenShell's sandbox-list cache catches up to the newly-recreated
-// GPU container (old-container teardown briefly marks the row Error before
-// the host re-registers the new container). Without debouncing, the
-// #4316 fast-fail short-circuits within ~12s on a healthy GPU sandbox
-// whose container is running and whose supervisor has already logged
-// `LIFECYCLE:INSTALL OpenShell Sandbox Supervisor success`.
-describe("docker-gpu-patch supervisor-reconnect Error-phase debounce (#4664)", () => {
-  it("absorbs a transient Error phase shorter than the debounce window", () => {
-    const execOutputs = [
-      { status: 1, stderr: "sandbox not ready" },
-      { status: 1, stderr: "sandbox not ready" },
-      { status: 1, stderr: "sandbox not ready" },
-      { status: 0, stdout: "" },
-    ];
-    let execIdx = 0;
-    const runOpenshell = vi.fn(
-      () => execOutputs[Math.min(execIdx++, execOutputs.length - 1)],
-    );
-    const listOutputs = [
-      "alpha   Error         1s ago",
-      "alpha   Error         3s ago",
-      "alpha   Provisioning  5s ago",
-      "alpha   Ready         7s ago",
-    ];
-    let listIdx = 0;
-    const runCaptureOpenshell = vi.fn(
-      () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)],
-    );
-    const sleep = vi.fn();
-
-    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
-      runOpenshell,
-      runCaptureOpenshell,
-      sleep,
-      errorPhaseDebouncePolls: 5,
-    });
-
-    expect(ok).toBe(true);
-    expect(runOpenshell).toHaveBeenCalledTimes(4);
-  });
-
-  it("still fast-fails when Error phase persists for the full debounce window", () => {
-    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
-    const runCaptureOpenshell = vi.fn(() => "alpha   Error   1s ago");
-    const sleep = vi.fn();
-
-    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
-      runOpenshell,
-      runCaptureOpenshell,
-      sleep,
-      errorPhaseDebouncePolls: 3,
-    });
-
-    expect(ok).toBe(false);
-    // Three consecutive Error polls trigger the short-circuit on poll #3.
-    // Sleeps happen only between polls 1->2 and 2->3, so two sleeps total.
-    expect(runOpenshell).toHaveBeenCalledTimes(3);
-    expect(sleep).toHaveBeenCalledTimes(2);
-  });
-
-  it("resets the consecutive-Error counter when the phase recovers", () => {
-    // Error, Error, Provisioning (counter resets), Error, Error, Error
-    // -> bails out on the 3rd post-recovery Error, not on the 2nd overall.
-    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
-    const listOutputs = [
-      "alpha   Error         1s ago",
-      "alpha   Error         3s ago",
-      "alpha   Provisioning  5s ago",
-      "alpha   Error         7s ago",
-      "alpha   Error         9s ago",
-      "alpha   Error         11s ago",
-    ];
-    let listIdx = 0;
-    const runCaptureOpenshell = vi.fn(
-      () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)],
-    );
-    const sleep = vi.fn();
-
-    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
-      runOpenshell,
-      runCaptureOpenshell,
-      sleep,
-      errorPhaseDebouncePolls: 3,
-    });
-
-    expect(ok).toBe(false);
-    expect(runOpenshell).toHaveBeenCalledTimes(6);
-  });
-
-  it("defaults the debounce to 5 polls and honors the env override", () => {
-    expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5);
-    expect(
-      getDockerGpuSupervisorReconnectErrorDebouncePolls({
-        NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2",
-      }),
-    ).toBe(2);
-    // Non-positive values are clamped to a minimum of 1.
-    expect(
-      getDockerGpuSupervisorReconnectErrorDebouncePolls({
-        NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "0",
-      }),
-    ).toBe(1);
-  });
-});
diff --git a/src/lib/onboard/docker-gpu-patch.ts b/src/lib/onboard/docker-gpu-patch.ts
index 21c83db59c..d46705ebc5 100644
--- a/src/lib/onboard/docker-gpu-patch.ts
+++ b/src/lib/onboard/docker-gpu-patch.ts
@@ -14,7 +14,22 @@ import {
   dockerRunDetached,
   dockerStop,
 } from "../adapters/docker";
-import { envInt } from "./env";
+import {
+  type DockerGpuSupervisorReconnectDeps,
+  DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV,
+  DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV,
+  getDockerGpuSupervisorReconnectErrorDebouncePolls,
+  getDockerGpuSupervisorReconnectTimeoutSecs,
+  waitForOpenShellSupervisorReconnect,
+} from "./docker-gpu-supervisor-reconnect";
+export {
+  DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV,
+  DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV,
+  getDockerGpuSupervisorReconnectErrorDebouncePolls,
+  getDockerGpuSupervisorReconnectTimeoutSecs,
+  waitForOpenShellSupervisorReconnect,
+};
+export type { DockerGpuSupervisorReconnectDeps };
 
 export const OPENSHELL_MANAGED_BY_LABEL = "openshell.ai/managed-by";
 export const OPENSHELL_MANAGED_BY_VALUE = "openshell";
@@ -23,18 +38,6 @@ const OPENSHELL_SANDBOX_COMMAND_ENV = "OPENSHELL_SANDBOX_COMMAND";
 
 const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000;
 const DOCKER_GPU_PATCH_WAIT_SECS = 180;
-const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900;
-// Default number of consecutive Error-phase polls required before the
-// supervisor-reconnect wait short-circuits. With a 2-second poll interval this
-// is ~10s of sustained Error before fast-fail, which absorbs the transient
-// Error reported while OpenShell's sandbox-list cache catches up to the
-// newly-recreated GPU container (#4664) while still bailing fast on a
-// patched container that actually crashed on startup (#4316).
-const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5;
-export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV =
-  "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT";
-export const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV =
-  "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE";
 export const DOCKER_GPU_PATCH_NETWORK_ENV = "NEMOCLAW_DOCKER_GPU_PATCH_NETWORK";
 const MAX_DOCKER_CONTAINER_NAME_LENGTH = 253;
 const GPU_ENV_KEYS = new Set([
@@ -80,9 +83,8 @@ export type DockerGpuPatchDeps = {
   /** Injectable file reader for unit testing CDI spec content checks. */
   readFile?: (filePath: string) => string | null;
   /**
-   * Number of consecutive Error-phase polls required before the
-   * supervisor-reconnect wait short-circuits. Omit to use the
-   * env-configurable default (#4664).
+   * Forwarded to the supervisor-reconnect wait. See
+   * `DockerGpuSupervisorReconnectDeps.errorPhaseDebouncePolls`.
    */
   errorPhaseDebouncePolls?: number;
 };
@@ -848,92 +850,6 @@ function waitForNewContainerId(
   return null;
 }
 
-function sandboxListShowsErrorPhase(
-  sandboxName: string,
-  runCaptureOpenshell: NonNullable<DockerGpuPatchDeps["runCaptureOpenshell"]>,
-): boolean {
-  try {
-    const list = runCaptureOpenshell(["sandbox", "list"], {
-      ignoreError: true,
-      suppressOutput: true,
-      timeout: DOCKER_GPU_PATCH_TIMEOUT_MS,
-    });
-    return SANDBOX_FAILURE_PHASE_TOKENS.has(
-      parseSandboxPhaseFromListOutput(list, sandboxName) ?? "",
-    );
-  } catch {
-    return false;
-  }
-}
-
-function waitForOpenShellSandboxExec(
-  sandboxName: string,
-  timeoutSecs: number,
-  deps: DockerGpuPatchDeps,
-): boolean {
-  if (!deps.runOpenshell) return true;
-  const d = depsWithDefaults(deps);
-  const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000;
-  const errorPhaseDebouncePolls =
-    deps.errorPhaseDebouncePolls ?? getDockerGpuSupervisorReconnectErrorDebouncePolls();
-  let consecutiveErrorPolls = 0;
-  while (Date.now() <= deadline) {
-    const result = deps.runOpenshell(
-      ["sandbox", "exec", "-n", sandboxName, "--", "true"],
-      { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS },
-    );
-    if (isZeroStatus(result)) return true;
-    // Debounce the terminal-phase short-circuit. A patched container that
-    // crashes on startup still fast-fails (#4316), but a transient Error
-    // reported while OpenShell's sandbox-list cache catches up to the
-    // newly-recreated GPU container is not treated as fatal (#4664). The
-    // poll count required is configurable via env for operator tuning.
-    if (
-      deps.runCaptureOpenshell &&
-      sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell)
-    ) {
-      consecutiveErrorPolls += 1;
-      if (consecutiveErrorPolls >= errorPhaseDebouncePolls) return false;
-    } else {
-      consecutiveErrorPolls = 0;
-    }
-    d.sleep(2);
-  }
-  return false;
-}
-
-export const waitForOpenShellSupervisorReconnect = waitForOpenShellSandboxExec;
-
-export function getDockerGpuSupervisorReconnectTimeoutSecs(
-  sandboxReadyTimeoutSecs: number,
-  env: Record<string, string | undefined> = process.env,
-): number {
-  const readyTimeoutSecs = Number.isFinite(sandboxReadyTimeoutSecs)
-    ? Math.max(1, Math.round(sandboxReadyTimeoutSecs))
-    : 1;
-  const fallback = Math.max(
-    readyTimeoutSecs,
-    DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS,
-  );
-  return Math.max(
-    1,
-    envInt(DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, fallback, env),
-  );
-}
-
-export function getDockerGpuSupervisorReconnectErrorDebouncePolls(
-  env: Record<string, string | undefined> = process.env,
-): number {
-  return Math.max(
-    1,
-    envInt(
-      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV,
-      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS,
-      env,
-    ),
-  );
-}
-
 function decoratePatchError<T extends Error>(
   error: T,
   context: DockerGpuPatchFailureContext,
@@ -1052,7 +968,7 @@ export function recreateOpenShellDockerSandboxWithGpu(
     });
 
     if (options.waitForSupervisor !== false) {
-      const execReady = waitForOpenShellSandboxExec(
+      const execReady = waitForOpenShellSupervisorReconnect(
         options.sandboxName,
         options.timeoutSecs ?? DOCKER_GPU_PATCH_WAIT_SECS,
         deps,
diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
new file mode 100644
index 0000000000..07e47d17d3
--- /dev/null
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
@@ -0,0 +1,115 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import {
+  getDockerGpuSupervisorReconnectErrorDebouncePolls,
+  waitForOpenShellSupervisorReconnect,
+} from "../../../dist/lib/onboard/docker-gpu-supervisor-reconnect";
+
+// The Docker GPU patch supervisor-reconnect wait must absorb a transient
+// Error phase reported while OpenShell's sandbox-list cache catches up to
+// the newly-recreated GPU container. The old-container teardown briefly
+// marks the row Error before the host re-registers the new container.
+// Without debouncing, the fast-fail short-circuits within ~12s on a healthy
+// GPU sandbox whose container is running and whose supervisor has already
+// logged `LIFECYCLE:INSTALL OpenShell Sandbox Supervisor success`.
+describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => {
+  it("absorbs a transient Error phase shorter than the debounce window", () => {
+    const execOutputs = [
+      { status: 1, stderr: "sandbox not ready" },
+      { status: 1, stderr: "sandbox not ready" },
+      { status: 1, stderr: "sandbox not ready" },
+      { status: 0, stdout: "" },
+    ];
+    let execIdx = 0;
+    const runOpenshell = vi.fn(
+      () => execOutputs[Math.min(execIdx++, execOutputs.length - 1)],
+    );
+    const listOutputs = [
+      "alpha   Error         1s ago",
+      "alpha   Error         3s ago",
+      "alpha   Provisioning  5s ago",
+      "alpha   Ready         7s ago",
+    ];
+    let listIdx = 0;
+    const runCaptureOpenshell = vi.fn(
+      () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)],
+    );
+    const sleep = vi.fn();
+
+    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+      runOpenshell,
+      runCaptureOpenshell,
+      sleep,
+      errorPhaseDebouncePolls: 5,
+    });
+
+    expect(ok).toBe(true);
+    expect(runOpenshell).toHaveBeenCalledTimes(4);
+  });
+
+  it("still fast-fails when Error phase persists for the full debounce window", () => {
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
+    const runCaptureOpenshell = vi.fn(() => "alpha   Error   1s ago");
+    const sleep = vi.fn();
+
+    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+      runOpenshell,
+      runCaptureOpenshell,
+      sleep,
+      errorPhaseDebouncePolls: 3,
+    });
+
+    expect(ok).toBe(false);
+    // Three consecutive Error polls trigger the short-circuit on poll 3.
+    // Sleeps happen only between polls 1->2 and 2->3, so two sleeps total.
+    expect(runOpenshell).toHaveBeenCalledTimes(3);
+    expect(sleep).toHaveBeenCalledTimes(2);
+  });
+
+  it("resets the consecutive-Error counter when the phase recovers", () => {
+    // Error, Error, Provisioning (counter resets), Error, Error, Error
+    // -> bails out on the 3rd post-recovery Error, not earlier.
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
+    const listOutputs = [
+      "alpha   Error         1s ago",
+      "alpha   Error         3s ago",
+      "alpha   Provisioning  5s ago",
+      "alpha   Error         7s ago",
+      "alpha   Error         9s ago",
+      "alpha   Error         11s ago",
+    ];
+    let listIdx = 0;
+    const runCaptureOpenshell = vi.fn(
+      () => listOutputs[Math.min(listIdx++, listOutputs.length - 1)],
+    );
+    const sleep = vi.fn();
+
+    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+      runOpenshell,
+      runCaptureOpenshell,
+      sleep,
+      errorPhaseDebouncePolls: 3,
+    });
+
+    expect(ok).toBe(false);
+    expect(runOpenshell).toHaveBeenCalledTimes(6);
+  });
+
+  it("defaults the debounce to 5 polls and honors the env override", () => {
+    expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5);
+    expect(
+      getDockerGpuSupervisorReconnectErrorDebouncePolls({
+        NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2",
+      }),
+    ).toBe(2);
+    // Non-positive values are clamped to a minimum of 1.
+    expect(
+      getDockerGpuSupervisorReconnectErrorDebouncePolls({
+        NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "0",
+      }),
+    ).toBe(1);
+  });
+});
diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
new file mode 100644
index 0000000000..298ad300ec
--- /dev/null
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
@@ -0,0 +1,156 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Supervisor-reconnect wait for the Docker GPU patch path.
+ *
+ * Source-of-truth boundary
+ * ------------------------
+ * The transient Error phase this module debounces is observed in the
+ * `openshell sandbox list` cache while the OpenShell host re-registers the
+ * newly-recreated GPU container after `docker stop` + `docker run`. The
+ * preferred fix lives at the OpenShell gateway: `sandbox list` should not
+ * report a terminal phase for a sandbox whose Docker container is being
+ * recreated by the GPU patch path. Until that upstream change ships,
+ * NemoClaw tolerates the transient Error at this layer via a
+ * consecutive-poll debounce.
+ *
+ * Removal condition
+ * -----------------
+ * Delete this debounce once OpenShell guarantees `sandbox list` skips the
+ * brief Error transition during a known recreate. A real-Docker GPU E2E
+ * reproduction (e.g. `e2e-branch-validation:gpu`,
+ * `gpu-repo-local-ollama-openclaw`) showing a transient teardown-Error that
+ * recovers to Ready is the runtime evidence required.
+ */
+
+import { envInt } from "./env";
+
+const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000; // per-call `timeout` option handed to the injected openshell runners
+const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900; // lower bound applied to the fallback reconnect timeout
+// Default consecutive Error-phase polls required before fast-fail. Polls are
+// 2s apart, so 5 polls span 4 sleeps (~8s plus command latency) of sustained
+// Error, which absorbs the transient Error reported during container
+// recreation while still bailing fast on a container that crashed on startup.
+const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5;
+
+export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV =
+  "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT"; // env var name read for the reconnect timeout
+export const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV =
+  "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE"; // env var name read for the debounce poll count
+
+const TERMINAL_SANDBOX_FAILURE_PHASES = new Set(["Error", "Failed", "CrashLoopBackOff"]); // `sandbox list` tokens counted as failure
+
+type DockerRunResult = {
+  status?: number | null; // NOTE: isZeroStatus treats a missing or null status as success
+  stdout?: string | Buffer | null;
+  stderr?: string | Buffer | null;
+};
+
+type RunOpenshellFn = (
+  args: string[],
+  opts?: Record<string, unknown>,
+) => DockerRunResult; // only `status` is inspected by the reconnect wait
+type RunCaptureOpenshellFn = (
+  args: string[],
+  opts?: Record<string, unknown>,
+) => string; // returned text is parsed as the `sandbox list` table
+
+export type DockerGpuSupervisorReconnectDeps = {
+  runOpenshell?: RunOpenshellFn; // when absent, the wait returns true without polling
+  runCaptureOpenshell?: RunCaptureOpenshellFn; // when absent, the failure-phase fast-fail is skipped
+  sleep?: (seconds: number) => void; // defaults to the blocking defaultSleep
+  errorPhaseDebouncePolls?: number; // overrides the env-backed debounce poll count
+};
+
+function defaultSleep(seconds: number): void { // synchronous sleep: blocks the calling thread until the timeout
+  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, Math.max(0, seconds) * 1000);
+}
+
+function isZeroStatus(result: DockerRunResult | null | undefined): boolean { // true when the exit status is 0
+  return Number(result?.status ?? 0) === 0; // a missing result or null/undefined status also counts as 0
+}
+
+const ANSI_RE = /\x1b\[[0-9;]*m/g; // ANSI SGR (color/style) escape sequences
+
+function parseSandboxListFailurePhase(output: string, sandboxName: string): string | null { // failure phase of the sandbox's row, else null
+  if (typeof output !== "string" || !output.includes(sandboxName)) return null; // cheap pre-check; also guards non-string output
+  for (const line of output.replace(ANSI_RE, "").split(/\r?\n/)) {
+    const cols = line.trim().split(/\s+/);
+    if (cols[0] === sandboxName) { // first column must equal the name exactly; only the first such row is used
+      return cols.find((col) => TERMINAL_SANDBOX_FAILURE_PHASES.has(col)) ?? null; // scans every column, not a fixed position
+    }
+  }
+  return null;
+}
+
+function sandboxListShowsErrorPhase(
+  sandboxName: string,
+  runCaptureOpenshell: RunCaptureOpenshellFn,
+): boolean { // true when the sandbox's list row shows any terminal failure phase, not only "Error"
+  try {
+    const list = runCaptureOpenshell(["sandbox", "list"], {
+      ignoreError: true,
+      suppressOutput: true,
+      timeout: DOCKER_GPU_PATCH_TIMEOUT_MS,
+    });
+    return parseSandboxListFailurePhase(list, sandboxName) !== null;
+  } catch {
+    return false; // a list call that throws is reported as "no failure phase seen"
+  }
+}
+
+export function waitForOpenShellSupervisorReconnect(
+  sandboxName: string,
+  timeoutSecs: number,
+  deps: DockerGpuSupervisorReconnectDeps,
+): boolean { // true once `sandbox exec ... true` exits 0; false on a sustained failure phase or timeout
+  if (!deps.runOpenshell) return true; // no exec runner injected: nothing to poll
+  const sleep = deps.sleep ?? defaultSleep;
+  const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000;
+  const errorPhaseDebouncePolls =
+    deps.errorPhaseDebouncePolls ?? getDockerGpuSupervisorReconnectErrorDebouncePolls();
+  let consecutiveErrorPolls = 0;
+  while (Date.now() <= deadline) {
+    const result = deps.runOpenshell(
+      ["sandbox", "exec", "-n", sandboxName, "--", "true"],
+      { ignoreError: true, suppressOutput: true, timeout: DOCKER_GPU_PATCH_TIMEOUT_MS },
+    );
+    if (isZeroStatus(result)) return true; // exec succeeded: treated as reconnected
+    if (
+      deps.runCaptureOpenshell &&
+      sandboxListShowsErrorPhase(sandboxName, deps.runCaptureOpenshell)
+    ) {
+      consecutiveErrorPolls += 1;
+      if (consecutiveErrorPolls >= errorPhaseDebouncePolls) return false; // failure phase persisted: fast-fail
+    } else {
+      consecutiveErrorPolls = 0; // any poll without a failure phase restarts the debounce
+    }
+    sleep(2); // fixed 2-second interval between polls
+  }
+  return false; // deadline passed without a successful exec
+}
+
+export function getDockerGpuSupervisorReconnectTimeoutSecs(
+  sandboxReadyTimeoutSecs: number,
+  env: Record<string, string | undefined> = process.env,
+): number { // reconnect timeout in seconds; the result is never below 1
+  const readyTimeoutSecs = Number.isFinite(sandboxReadyTimeoutSecs)
+    ? Math.max(1, Math.round(sandboxReadyTimeoutSecs))
+    : 1; // NaN/Infinity collapse to 1, so the MIN_SECS floor below decides the fallback
+  const fallback = Math.max(readyTimeoutSecs, DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS);
+  return Math.max(1, envInt(DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV, fallback, env)); // NOTE(review): presumably envInt yields the parsed override or `fallback` — confirm in ./env
+}
+
+export function getDockerGpuSupervisorReconnectErrorDebouncePolls(
+  env: Record<string, string | undefined> = process.env,
+): number { // consecutive failure-phase polls required before fast-fail; always >= 1
+  return Math.max(
+    1, // clamp: a zero or negative value still requires one poll
+    envInt(
+      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV,
+      DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS,
+      env,
+    ),
+  );
+}

From 21013fc441f44d2cacbab0fffecd7b6818adff61 Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Tue, 2 Jun 2026 12:46:08 +0000
Subject: [PATCH 3/4] fix(onboard): clamp injected supervisor-reconnect
 debounce override to minimum 1

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 .../docker-gpu-supervisor-reconnect.test.ts   | 20 +++++++++++++++++++
 .../docker-gpu-supervisor-reconnect.ts        |  4 +++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
index 07e47d17d3..e2cabda6b9 100644
--- a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
@@ -112,4 +112,24 @@ describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => {
       }),
     ).toBe(1);
   });
+
+  it("clamps an injected debounce override to the same minimum as the env path", () => {
+    // 0 / negative / fractional overrides must not bypass the ≥1 contract that
+    // the env-backed helper enforces.
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
+    const runCaptureOpenshell = vi.fn(() => "alpha   Error   1s ago");
+    const sleep = vi.fn();
+
+    const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+      runOpenshell,
+      runCaptureOpenshell,
+      sleep,
+      errorPhaseDebouncePolls: 0, // below the minimum; expected to behave as 1
+    });
+
+    expect(ok).toBe(false);
+    // Clamped to K=1: first Error poll short-circuits with no preceding sleep.
+    expect(runOpenshell).toHaveBeenCalledTimes(1);
+    expect(sleep).not.toHaveBeenCalled();
+  });
 });
diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
index 298ad300ec..3b052a84f8 100644
--- a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
@@ -109,7 +109,9 @@ export function waitForOpenShellSupervisorReconnect(
   const sleep = deps.sleep ?? defaultSleep;
   const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000;
   const errorPhaseDebouncePolls =
-    deps.errorPhaseDebouncePolls ?? getDockerGpuSupervisorReconnectErrorDebouncePolls();
+    deps.errorPhaseDebouncePolls == null
+      ? getDockerGpuSupervisorReconnectErrorDebouncePolls()
+      : Math.max(1, Math.trunc(deps.errorPhaseDebouncePolls));
   let consecutiveErrorPolls = 0;
   while (Date.now() <= deadline) {
     const result = deps.runOpenshell(

From 1b70103e57963b21ea495b6488bba94f7866e654 Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Tue, 2 Jun 2026 13:07:32 +0000
Subject: [PATCH 4/4] fix(onboard): reject non-finite supervisor-reconnect
 debounce overrides + trim EOF

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 src/lib/onboard/docker-gpu-patch.test.ts      |  1 -
 .../docker-gpu-supervisor-reconnect.test.ts   | 24 +++++++++++++++++++
 .../docker-gpu-supervisor-reconnect.ts        |  2 +-
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts
index 260b16be16..88223f119f 100644
--- a/src/lib/onboard/docker-gpu-patch.test.ts
+++ b/src/lib/onboard/docker-gpu-patch.test.ts
@@ -1205,4 +1205,3 @@ describe("docker-gpu-patch Error-phase diagnostics (#4316)", () => {
     }
   });
 });
-
diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
index e2cabda6b9..62976067f0 100644
--- a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
@@ -132,4 +132,28 @@ describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => {
     expect(runOpenshell).toHaveBeenCalledTimes(1);
     expect(sleep).not.toHaveBeenCalled();
   });
+
+  it("falls back to the env-backed default when an injected override is non-finite", () => {
+    // NaN / +Infinity / -Infinity overrides must not silently neutralise the
+    // fast-fail loop. Comparing against NaN is always false and the poll
+    // counter can never reach `>= Infinity`, leaving the wait to burn the
+    // full timeout window.
+    for (const bogus of [Number.NaN, Number.POSITIVE_INFINITY, Number.NEGATIVE_INFINITY]) {
+      const runOpenshell = vi.fn(() => ({ status: 1, stderr: "sandbox not ready" }));
+      const runCaptureOpenshell = vi.fn(() => "alpha   Error   1s ago");
+      const sleep = vi.fn();
+
+      const ok = waitForOpenShellSupervisorReconnect("alpha", 600, {
+        runOpenshell,
+        runCaptureOpenshell,
+        sleep,
+        errorPhaseDebouncePolls: bogus,
+      });
+
+      expect(ok).toBe(false);
+      // Default K=5 from the env-backed helper: 5 polls + 4 sleeps before fast-fail.
+      expect(runOpenshell).toHaveBeenCalledTimes(5);
+      expect(sleep).toHaveBeenCalledTimes(4);
+    }
+  });
 });
diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
index 3b052a84f8..c8906e9501 100644
--- a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
@@ -109,7 +109,7 @@ export function waitForOpenShellSupervisorReconnect(
   const sleep = deps.sleep ?? defaultSleep;
   const deadline = Date.now() + Math.max(1, timeoutSecs) * 1000;
   const errorPhaseDebouncePolls =
-    deps.errorPhaseDebouncePolls == null
+    deps.errorPhaseDebouncePolls == null || !Number.isFinite(deps.errorPhaseDebouncePolls)
       ? getDockerGpuSupervisorReconnectErrorDebouncePolls()
       : Math.max(1, Math.trunc(deps.errorPhaseDebouncePolls));
   let consecutiveErrorPolls = 0;