diff --git a/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md b/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md
index 355e732699..cdafbeb91a 100644
--- a/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md
+++ b/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md
@@ -1248,9 +1248,9 @@ If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`.
 Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting.
 
 If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal.
-The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to five consecutive polls of about 10 seconds in total.
+The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to fifteen consecutive polls of about 30 seconds in total.
 Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example slow WSL2 + Docker Desktop setups.
-Set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
+Set it to a higher integer such as `30` (about 60 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
 
 ### `pip install` fails with a system-packages error
 
diff --git a/Dockerfile b/Dockerfile
index 79666f5bfe..5d70c1e66c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -402,6 +402,7 @@ RUN mkdir -p /sandbox/.nemoclaw/blueprints/0.1.0 \
 # Copy startup script and shared sandbox initialisation library
 COPY scripts/lib/sandbox-init.sh /usr/local/lib/nemoclaw/sandbox-init.sh
 COPY scripts/lib/openclaw_device_approval_policy.py /usr/local/lib/nemoclaw/openclaw_device_approval_policy.py
+COPY scripts/lib/clean_runtime_shell_env_shim.py /usr/local/lib/nemoclaw/clean_runtime_shell_env_shim.py
 COPY scripts/nemoclaw-start.sh /usr/local/bin/nemoclaw-start
 # Copy NODE_OPTIONS preload modules to a Landlock-accessible path. OpenShell ≥0.0.36
 # blocks /opt/nemoclaw-blueprint/ from non-root users, but the entrypoint
@@ -418,6 +419,7 @@ RUN chmod 755 /usr/local/bin/nemoclaw-start /usr/local/bin/nemoclaw-codex-acp \
         /usr/local/lib/nemoclaw/openclaw-build-messaging-plugins.py \
         /usr/local/lib/nemoclaw/seed-wechat-accounts.py \
     && chmod 644 /usr/local/lib/nemoclaw/openclaw_device_approval_policy.py \
+        /usr/local/lib/nemoclaw/clean_runtime_shell_env_shim.py \
     && if [ -d /usr/local/lib/nemoclaw/preloads ]; then find /usr/local/lib/nemoclaw/preloads -type f -name '*.js' -exec chmod 644 {} +; fi \
     && chmod 755 /usr/local/share/nemoclaw \
         /usr/local/share/nemoclaw/openclaw-plugins \
diff --git a/docs/reference/troubleshooting.mdx b/docs/reference/troubleshooting.mdx
index 13111ed382..d9559bd475 100644
--- a/docs/reference/troubleshooting.mdx
+++ b/docs/reference/troubleshooting.mdx
@@ -1262,9 +1262,9 @@ If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`.
 Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting.
 
 If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal.
-The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to five consecutive polls of about 10 seconds in total.
+The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to fifteen consecutive polls of about 30 seconds in total.
 Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example slow WSL2 + Docker Desktop setups.
-Set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
+Set it to a higher integer such as `30` (about 60 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
 
 ### `pip install` fails with a system-packages error
 
diff --git a/scripts/lib/clean_runtime_shell_env_shim.py b/scripts/lib/clean_runtime_shell_env_shim.py
new file mode 100644
index 0000000000..37ed113e95
--- /dev/null
+++ b/scripts/lib/clean_runtime_shell_env_shim.py
@@ -0,0 +1,223 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Remove the legacy runtime shell-env shim from a sandbox user's rc file.
+
+Older base images and earlier entrypoints wrote a two-line stanza into
+.bashrc/.profile that sourced /tmp/nemoclaw-proxy-env.sh. The startup
+entrypoint now exports those variables in-process, so the legacy stanza is
+deleted before lock_rc_files makes the rc files read-only again.
+
+The script intentionally exits 0 in a small number of "leave-it-in-place"
+cases that are not safe to rewrite from a non-root entrypoint:
+
+* The rc file is not owned by the current uid (e.g. root-owned .bashrc in a
+  non-root sandbox). Rewriting it would need CAP_FOWNER, which the entrypoint
+  no longer has after process-capability drops. The leftover stanza only
+  sources /tmp/nemoclaw-proxy-env.sh if that file exists; that file's
+  permissions are hardened elsewhere in the startup sequence.
+
+* The rc file contents are already clean (no shim line).
+
+Invocation (from nemoclaw-start.sh):
+    python3 clean_runtime_shell_env_shim.py <rc_path> <shim_text> <uid>
+
+Source-of-truth: this script is a backwards-compatibility skip path. The
+invalid state it tolerates is "legacy base image planted a runtime shim into
+an rc file owned by a different uid than the entrypoint currently runs as".
+The preferred source boundary is the base image build: newer base images
+either own the rc file as the entrypoint user or do not plant the shim at
+all. Previously shipped sandboxes already have the mismatched-owner rc
+files on disk; crashing the entrypoint with exit code 1 on them is strictly
+worse than logging and skipping. Regression tests cover the direct fixture
+skip path and the composed startup invariant asserting
+/tmp/nemoclaw-proxy-env.sh stays mode 444. Removal condition: when no
+supported release ships a base image that plants the legacy shim AND every
+reachable sandbox has been rebuilt off a newer base image, drop the
+mismatched-owner branch and have the script exit 1 on EPERM again.
+"""
+
+import errno
+import os
+import stat
+import sys
+import tempfile
+
+
+def same_file(left, right):
+    return left.st_dev == right.st_dev and left.st_ino == right.st_ino
+
+
+def rewrite_open_rc_file(read_fd, original_stat, cleaned_lines, uid):
+    # The runtime test image can make /sandbox non-writable while leaving
+    # legacy shims in the rc files. In that case atomic rename into /sandbox
+    # fails, so rewrite the already-validated inode through /proc/self/fd
+    # instead.
+    final_mode = stat.S_IMODE(original_stat.st_mode)
+    if uid == 0:
+        os.fchown(read_fd, 0, 0)
+    os.fchmod(read_fd, 0o600)
+    write_fd = os.open(
+        f"/proc/self/fd/{read_fd}",
+        os.O_WRONLY | os.O_TRUNC | getattr(os, "O_CLOEXEC", 0),
+    )
+    try:
+        if not same_file(original_stat, os.fstat(write_fd)):
+            raise RuntimeError("rc file descriptor target changed during cleanup")
+        with os.fdopen(write_fd, "w", encoding="utf-8", errors="surrogateescape") as handle:
+            write_fd = None
+            handle.writelines(cleaned_lines)
+            handle.flush()
+            os.fsync(handle.fileno())
+    finally:
+        if write_fd is not None:
+            os.close(write_fd)
+        os.fchmod(read_fd, final_mode)
+
+
+def rewrite_by_rename(rc_path, original_stat, cleaned_lines, uid, tmp_paths):
+    tmp_fd, tmp_path = tempfile.mkstemp(prefix="nemoclaw-rc-clean.", dir="/tmp", text=True)
+    tmp_paths.append(tmp_path)
+    with os.fdopen(tmp_fd, "w", encoding="utf-8", errors="surrogateescape") as handle:
+        handle.writelines(cleaned_lines)
+        handle.flush()
+        os.fsync(handle.fileno())
+    if uid == 0:
+        os.chown(tmp_path, 0, 0)
+    # Mirror the original rc file's mode bits rather than fixing a permissive
+    # default. The pre-cleanup file's mode is the user-visible source of truth;
+    # widening it here would silently change rc file permissions.
+    os.chmod(tmp_path, stat.S_IMODE(original_stat.st_mode))
+    os.replace(tmp_path, rc_path)
+    tmp_paths.pop()
+
+
+def main(argv):
+    if len(argv) != 4:
+        print(
+            "[SECURITY] clean_runtime_shell_env_shim: expected <rc_path> <shim> <uid>",
+            file=sys.stderr,
+        )
+        return 1
+    rc_path = argv[1]
+    shim = argv[2]
+    uid = int(argv[3])
+    fd = None
+    tmp_paths = []
+
+    try:
+        flags = os.O_RDONLY | getattr(os, "O_CLOEXEC", 0) | getattr(os, "O_NOFOLLOW", 0)
+        try:
+            fd = os.open(rc_path, flags)
+        except OSError as exc:
+            if exc.errno == errno.ELOOP:
+                print(
+                    f"[SECURITY] refusing symlinked rc file during cleanup: {rc_path}",
+                    file=sys.stderr,
+                )
+            else:
+                print(
+                    f"[SECURITY] could not open rc file for cleanup: {rc_path}: {exc}",
+                    file=sys.stderr,
+                )
+            return 1
+
+        st = os.fstat(fd)
+        if not stat.S_ISREG(st.st_mode):
+            print(
+                f"[SECURITY] refusing non-regular rc file during cleanup: {rc_path}",
+                file=sys.stderr,
+            )
+            return 1
+        with os.fdopen(os.dup(fd), "r", encoding="utf-8", errors="surrogateescape") as handle:
+            lines = handle.readlines()
+
+        cleaned = []
+        index = 0
+        while index < len(lines):
+            line = lines[index]
+            bare = line.rstrip("\n")
+            if bare == "# Source runtime proxy config":
+                if index + 1 < len(lines):
+                    next_line = lines[index + 1]
+                    next_bare = next_line.rstrip("\n")
+                    if next_bare == shim or "/tmp/nemoclaw-proxy-env.sh" in next_line:
+                        index += 2
+                        continue
+                    cleaned.append(line)
+                    cleaned.append(next_line)
+                    index += 2
+                    continue
+            if bare == shim or "/tmp/nemoclaw-proxy-env.sh" in line:
+                index += 1
+                continue
+            cleaned.append(line)
+            index += 1
+
+        if any(
+            line.rstrip("\n") == shim or "/tmp/nemoclaw-proxy-env.sh" in line
+            for line in cleaned
+        ):
+            print(
+                f"[SECURITY] runtime env shim still present after cleanup: {rc_path}",
+                file=sys.stderr,
+            )
+            return 1
+        if cleaned == lines:
+            return 0
+
+        # When the rc file is not owned by us (and we are not root) we cannot
+        # safely rewrite it: fchmod would raise EPERM without CAP_FOWNER, and
+        # the in-place reopen via /proc/self/fd would fail anyway.
+        #
+        # Threat model: the legacy shim line we would have removed is still an
+        # active trust-boundary hook. It sources /tmp/nemoclaw-proxy-env.sh on
+        # every shell start and pulls in the proxy and gateway-token exports
+        # from that file. That file is written exclusively via
+        # `emit_sandbox_sourced_file` in scripts/lib/sandbox-init.sh, which
+        # forces mode 444 (and root ownership when the entrypoint runs as
+        # root) before placing the file. The startup sequence validates that
+        # invariant via `validate_tmp_permissions` before launching services.
+        # The composed test in test/service-env.test.ts proves the file stays
+        # at mode 444 through this skip path. As long as the proxy-env file
+        # remains non-user-writable, the leftover shim does not widen the
+        # sandbox's trust boundary; crashing the container under errexit
+        # (which the original code did) was the strictly worse outcome. A
+        # later root-mode boot can finish the cleanup.
+        if uid != 0 and st.st_uid != uid:
+            print(
+                f"[SECURITY] skipping rc cleanup for {rc_path}: not owned by uid={uid} "
+                f"(file uid={st.st_uid}); legacy shim left in place",
+                file=sys.stderr,
+            )
+            return 0
+
+        try:
+            rewrite_open_rc_file(fd, st, cleaned, uid)
+        except OSError as exc:
+            if exc.errno != errno.ENOENT:
+                raise
+            rewrite_by_rename(rc_path, st, cleaned, uid, tmp_paths)
+    except Exception as exc:
+        print(
+            f"[SECURITY] could not safely clean runtime env shim from {rc_path}: {exc}",
+            file=sys.stderr,
+        )
+        return 1
+    finally:
+        if fd is not None:
+            os.close(fd)
+        for tmp_path in tmp_paths:
+            try:
+                os.unlink(tmp_path)
+            except FileNotFoundError:
+                # The successful path in `rewrite_by_rename` removes the tmp
+                # path from `tmp_paths` before this finally block runs, so
+                # arriving here means the rename happened or the OS already
+                # reaped the file. Nothing left to clean up.
+                pass
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/scripts/nemoclaw-start.sh b/scripts/nemoclaw-start.sh
index 6f1a7e331e..1032ef971f 100755
--- a/scripts/nemoclaw-start.sh
+++ b/scripts/nemoclaw-start.sh
@@ -2444,9 +2444,33 @@ GUARDENVEOF
 # Keep per-user rc files out of runtime proxy wiring. Older images and prior
 # entrypoint versions wrote a two-line shim into .bashrc/.profile; remove that
 # managed stanza before lock_rc_files makes the files read-only again.
+#
+# The Python body lives in scripts/lib/clean_runtime_shell_env_shim.py so it
+# can be unit-tested with controlled rc fixtures. Installed location in the
+# sandbox image: /usr/local/lib/nemoclaw/clean_runtime_shell_env_shim.py.
 ensure_runtime_shell_env_shim() {
   local failed=0
   local rc_file
+  # Resolution order is deliberately fixed: the immutable installed helper at
+  # /usr/local/lib/nemoclaw/ ALWAYS wins when present. That path is set up
+  # by the Dockerfile, chmod 644, root-owned (or build-time owned), and lives
+  # under a system directory the sandbox user cannot write to. We refuse to
+  # honour any environment-supplied override when that file is in place so a
+  # malicious envvar cannot swap in arbitrary Python.
+  #
+  # The NEMOCLAW_RC_CLEAN_SCRIPT override is consulted ONLY when the installed
+  # helper is missing — i.e. running the unit-test wrappers against the
+  # repository tree, where the script lives at scripts/lib/ instead.
+  # The final fallback resolves the script relative to nemoclaw-start.sh so
+  # `bash scripts/nemoclaw-start.sh` works out-of-the-box for ad-hoc dev runs.
+  local clean_script="/usr/local/lib/nemoclaw/clean_runtime_shell_env_shim.py"
+  if [ ! -f "$clean_script" ]; then
+    if [ -n "${NEMOCLAW_RC_CLEAN_SCRIPT:-}" ] && [ -f "${NEMOCLAW_RC_CLEAN_SCRIPT}" ]; then
+      clean_script="${NEMOCLAW_RC_CLEAN_SCRIPT}"
+    else
+      clean_script="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/lib/clean_runtime_shell_env_shim.py"
+    fi
+  fi
 
   for rc_file in "${_SANDBOX_HOME}/.bashrc" "${_SANDBOX_HOME}/.profile"; do
     if [ -L "$rc_file" ]; then
@@ -2463,125 +2487,7 @@ ensure_runtime_shell_env_shim() {
       continue
     fi
 
-    if ! command python3 - "$rc_file" "$_RUNTIME_SHELL_ENV_SHIM" "$(id -u)" <<'PY'; then
-import errno
-import os
-import stat
-import sys
-import tempfile
-
-rc_path, shim, uid_text = sys.argv[1:4]
-uid = int(uid_text)
-fd = None
-tmp_path = None
-
-
-def same_file(left, right):
-    return left.st_dev == right.st_dev and left.st_ino == right.st_ino
-
-
-def rewrite_open_rc_file(read_fd, original_stat, cleaned_lines):
-    # The runtime test image can make /sandbox non-writable while leaving legacy
-    # shims in the rc files. In that case atomic rename into /sandbox fails, so
-    # rewrite the already-validated inode through /proc/self/fd instead.
-    if uid == 0:
-        os.fchown(read_fd, 0, 0)
-    os.fchmod(read_fd, 0o600)
-    write_fd = os.open(
-        f"/proc/self/fd/{read_fd}",
-        os.O_WRONLY | os.O_TRUNC | getattr(os, "O_CLOEXEC", 0),
-    )
-    try:
-        if not same_file(original_stat, os.fstat(write_fd)):
-            raise RuntimeError("rc file descriptor target changed during cleanup")
-        with os.fdopen(write_fd, "w", encoding="utf-8", errors="surrogateescape") as handle:
-            write_fd = None
-            handle.writelines(cleaned_lines)
-            handle.flush()
-            os.fsync(handle.fileno())
-    finally:
-        if write_fd is not None:
-            os.close(write_fd)
-        os.fchmod(read_fd, 0o644)
-
-
-def rewrite_by_rename(cleaned_lines):
-    global tmp_path
-    tmp_fd, tmp_path = tempfile.mkstemp(prefix="nemoclaw-rc-clean.", dir="/tmp", text=True)
-    with os.fdopen(tmp_fd, "w", encoding="utf-8", errors="surrogateescape") as handle:
-        handle.writelines(cleaned_lines)
-        handle.flush()
-        os.fsync(handle.fileno())
-    if uid == 0:
-        os.chown(tmp_path, 0, 0)
-    os.chmod(tmp_path, 0o644)
-    os.replace(tmp_path, rc_path)
-    tmp_path = None
-
-try:
-    flags = os.O_RDONLY | getattr(os, "O_CLOEXEC", 0) | getattr(os, "O_NOFOLLOW", 0)
-    try:
-        fd = os.open(rc_path, flags)
-    except OSError as exc:
-        if exc.errno == errno.ELOOP:
-            print(f"[SECURITY] refusing symlinked rc file during cleanup: {rc_path}", file=sys.stderr)
-        else:
-            print(f"[SECURITY] could not open rc file for cleanup: {rc_path}: {exc}", file=sys.stderr)
-        sys.exit(1)
-
-    st = os.fstat(fd)
-    if not stat.S_ISREG(st.st_mode):
-        print(f"[SECURITY] refusing non-regular rc file during cleanup: {rc_path}", file=sys.stderr)
-        sys.exit(1)
-    with os.fdopen(os.dup(fd), "r", encoding="utf-8", errors="surrogateescape") as handle:
-        lines = handle.readlines()
-
-    cleaned = []
-    index = 0
-    while index < len(lines):
-        line = lines[index]
-        bare = line.rstrip("\n")
-        if bare == "# Source runtime proxy config":
-            if index + 1 < len(lines):
-                next_line = lines[index + 1]
-                next_bare = next_line.rstrip("\n")
-                if next_bare == shim or "/tmp/nemoclaw-proxy-env.sh" in next_line:
-                    index += 2
-                    continue
-                cleaned.append(line)
-                cleaned.append(next_line)
-                index += 2
-                continue
-        if bare == shim or "/tmp/nemoclaw-proxy-env.sh" in line:
-            index += 1
-            continue
-        cleaned.append(line)
-        index += 1
-
-    if any(line.rstrip("\n") == shim or "/tmp/nemoclaw-proxy-env.sh" in line for line in cleaned):
-        print(f"[SECURITY] runtime env shim still present after cleanup: {rc_path}", file=sys.stderr)
-        sys.exit(1)
-    if cleaned == lines:
-        sys.exit(0)
-
-    try:
-        rewrite_open_rc_file(fd, st, cleaned)
-    except OSError as exc:
-        if exc.errno != errno.ENOENT:
-            raise
-        rewrite_by_rename(cleaned)
-except Exception as exc:
-    print(f"[SECURITY] could not safely clean runtime env shim from {rc_path}: {exc}", file=sys.stderr)
-    sys.exit(1)
-finally:
-    if fd is not None:
-        os.close(fd)
-    if tmp_path:
-        try:
-            os.unlink(tmp_path)
-        except FileNotFoundError:
-            pass
-PY
+    if ! command python3 "$clean_script" "$rc_file" "$_RUNTIME_SHELL_ENV_SHIM" "$(id -u)"; then
       failed=1
       continue
     fi
diff --git a/skills/nemoclaw-user-reference/references/troubleshooting.md b/skills/nemoclaw-user-reference/references/troubleshooting.md
index 355e732699..cdafbeb91a 100644
--- a/skills/nemoclaw-user-reference/references/troubleshooting.md
+++ b/skills/nemoclaw-user-reference/references/troubleshooting.md
@@ -1248,9 +1248,9 @@ If you do not need GPU access inside the sandbox, rerun with `--no-sandbox-gpu`.
 Set `NEMOCLAW_DOCKER_GPU_PATCH=0` only when you need to bypass this compatibility path during troubleshooting.
 
 If onboarding reports `OpenShell supervisor did not reconnect to the GPU-enabled container.` even though the diagnostic bundle shows the patched container is running and healthy, the supervisor-reconnect wait is treating a transient Error phase (reported while the OpenShell host re-registers the new container) as fatal.
-The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to five consecutive polls of about 10 seconds in total.
+The reconnect wait debounces consecutive Error-phase polls before fast-failing, defaulting to fifteen consecutive polls of about 30 seconds in total.
 Increase the debounce window with `NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE` if your host needs more time to re-register the patched container, for example slow WSL2 + Docker Desktop setups.
-Set it to a higher integer such as `15` (about 30 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
+Set it to a higher integer such as `30` (about 60 seconds) and rerun onboarding; the value is clamped to a minimum of `1`.
 
 ### `pip install` fails with a system-packages error
 
diff --git a/src/lib/adapters/docker/container.ts b/src/lib/adapters/docker/container.ts
index 2138e53df7..a4b521502f 100644
--- a/src/lib/adapters/docker/container.ts
+++ b/src/lib/adapters/docker/container.ts
@@ -13,6 +13,10 @@ export function dockerStop(containerName: string, opts: DockerRunOptions = {}):
   return dockerRun(["stop", containerName], opts);
 }
 
+export function dockerStart(containerName: string, opts: DockerRunOptions = {}): DockerRunResult {
+  return dockerRun(["start", containerName], opts);
+}
+
 // NIM (and most Python services) write errors to stderr, so a stdout-only
 // capture would silently lose the auth-error tail. Use dockerRun so we can
 // read both streams from the SpawnResult and concatenate them. Bounded by a
diff --git a/src/lib/onboard/docker-gpu-patch-finalize.test.ts b/src/lib/onboard/docker-gpu-patch-finalize.test.ts
new file mode 100644
index 0000000000..d21edb3092
--- /dev/null
+++ b/src/lib/onboard/docker-gpu-patch-finalize.test.ts
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import type { DockerGpuPatchResult } from "../../../dist/lib/onboard/docker-gpu-patch";
+import { finalizeDockerGpuPatchBackup } from "../../../dist/lib/onboard/docker-gpu-patch-finalize";
+
+function deferredCreateResult(): DockerGpuPatchResult {
+  return {
+    applied: true,
+    oldContainerId: "old-container-id",
+    newContainerId: "new-container-id",
+    originalName: "openshell-alpha",
+    backupContainerName: "openshell-alpha-nemoclaw-gpu-backup-1780491860342",
+    mode: {
+      kind: "gpus",
+      label: "--gpus all",
+      device: "all",
+      args: ["--gpus", "all"],
+    },
+    backupRemoved: false,
+  };
+}
+
+describe("finalizeDockerGpuPatchBackup", () => {
+  it("removes the backup container when supervisor reconnect succeeded", () => {
+    const dockerRm = vi.fn((_name: string) => ({ status: 0 }));
+    const outcome = finalizeDockerGpuPatchBackup(
+      { result: deferredCreateResult(), supervisorReady: true },
+      { dockerRm },
+    );
+    expect(outcome).toEqual({ backupRemoved: true, rolledBack: false });
+    expect(dockerRm).toHaveBeenCalledWith(
+      "openshell-alpha-nemoclaw-gpu-backup-1780491860342",
+      expect.objectContaining({ ignoreError: true }),
+    );
+  });
+
+  it("rolls back to the backup container when supervisor reconnect failed", () => {
+    const dockerStop = vi.fn(() => ({ status: 0 }));
+    const dockerRm = vi.fn((_name: string) => ({ status: 0 }));
+    const dockerRename = vi.fn((_old: string, _next: string) => ({ status: 0 }));
+    const dockerStart = vi.fn(() => ({ status: 0 }));
+    const outcome = finalizeDockerGpuPatchBackup(
+      { result: deferredCreateResult(), supervisorReady: false },
+      { dockerStop, dockerRm, dockerRename, dockerStart },
+    );
+    expect(outcome).toEqual({ backupRemoved: false, rolledBack: true });
+    expect(dockerStop).toHaveBeenCalledWith(
+      "new-container-id",
+      expect.objectContaining({ ignoreError: true }),
+    );
+    expect(dockerRename).toHaveBeenCalledWith(
+      "openshell-alpha-nemoclaw-gpu-backup-1780491860342",
+      "openshell-alpha",
+      expect.objectContaining({ ignoreError: true }),
+    );
+    expect(dockerStart).toHaveBeenCalledWith(
+      "openshell-alpha",
+      expect.objectContaining({ ignoreError: true }),
+    );
+    expect(
+      dockerRm.mock.calls.some((call) =>
+        String(call[0]).includes("nemoclaw-gpu-backup"),
+      ),
+    ).toBe(false);
+  });
+
+  it("reports rolledBack=false when restoring the backup fails", () => {
+    const dockerStop = vi.fn(() => ({ status: 0 }));
+    const dockerRm = vi.fn((_name: string) => ({ status: 0 }));
+    const dockerRename = vi.fn((_old: string, _next: string) => ({
+      status: 1,
+      stderr: "no such container",
+    }));
+    const dockerStart = vi.fn(() => ({ status: 0 }));
+    const outcome = finalizeDockerGpuPatchBackup(
+      { result: deferredCreateResult(), supervisorReady: false },
+      { dockerStop, dockerRm, dockerRename, dockerStart },
+    );
+    expect(outcome).toEqual({ backupRemoved: false, rolledBack: false });
+    expect(dockerStart).not.toHaveBeenCalled();
+  });
+
+  it("is a no-op when the backup was already removed by the patch helper", () => {
+    const dockerRm = vi.fn((_name: string) => ({ status: 0 }));
+    const result = { ...deferredCreateResult(), backupRemoved: true };
+    const outcome = finalizeDockerGpuPatchBackup(
+      { result, supervisorReady: true },
+      { dockerRm },
+    );
+    expect(outcome).toEqual({ backupRemoved: true, rolledBack: false });
+    expect(dockerRm).not.toHaveBeenCalled();
+  });
+
+  it("reports backupRemoved=false when supervisor reconnect succeeded but docker rm of the backup failed", () => {
+    const dockerRm = vi.fn((_name: string) => ({
+      status: 1,
+      stderr: "Error response from daemon: container is in use",
+    }));
+    const outcome = finalizeDockerGpuPatchBackup(
+      { result: deferredCreateResult(), supervisorReady: true },
+      { dockerRm },
+    );
+    expect(outcome).toEqual({ backupRemoved: false, rolledBack: false });
+    expect(dockerRm).toHaveBeenCalledWith(
+      "openshell-alpha-nemoclaw-gpu-backup-1780491860342",
+      expect.objectContaining({ ignoreError: true }),
+    );
+  });
+});
diff --git a/src/lib/onboard/docker-gpu-patch-finalize.ts b/src/lib/onboard/docker-gpu-patch-finalize.ts
new file mode 100644
index 0000000000..45fc1761d6
--- /dev/null
+++ b/src/lib/onboard/docker-gpu-patch-finalize.ts
@@ -0,0 +1,172 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Source-of-truth: this module is a NemoClaw-side workaround. The invalid
+// state it recovers from is "OpenShell Docker-driver GPU patch left the
+// sandbox in a deleted-backup / failed-new state when the post-recreate
+// supervisor reconnect could not confirm the GPU container". The preferred
+// source boundary for the fix is OpenShell: a Docker-driver sandbox create
+// that natively accepts NVIDIA GPU access would remove the need for the
+// post-create container recreation NemoClaw performs here. Until OpenShell
+// supports that natively, NemoClaw recreates the container with GPU access
+// and uses this module to either confirm the new container or restore the
+// pre-patch backup. Regression coverage:
+//   * src/lib/onboard/docker-gpu-patch-finalize.test.ts — direct unit tests
+//     for finalize success / rollback / no-op / rollback failure outcomes.
+//   * src/lib/onboard/docker-gpu-patch-rollback.test.ts — composed
+//     recreate-with-rollback scenarios.
+//   * src/lib/onboard/docker-gpu-sandbox-create.test.ts — composed create
+//     flow driving maybeApplyDuringCreate → waitForSupervisorReconnect →
+//     finalizeBackup.
+// Removal condition: when OpenShell supports native Docker-driver GPU
+// creation/reconnect, drop the NemoClaw post-create container recreation
+// and delete this module along with its callers in docker-gpu-patch.ts and
+// docker-gpu-sandbox-create.ts.
+
+import {
+  dockerRename as defaultDockerRename,
+  dockerRm as defaultDockerRm,
+  dockerStart as defaultDockerStart,
+  dockerStop as defaultDockerStop,
+} from "../adapters/docker";
+import type { DockerGpuPatchDeps, DockerGpuPatchResult } from "./docker-gpu-patch";
+
+const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000;
+
+type DockerRunResult = {
+  status?: number | null;
+  stdout?: string | Buffer | null;
+  stderr?: string | Buffer | null;
+};
+
+type DockerRunOptions = Record<string, unknown>;
+
+type DockerContainerFn = (containerName: string, opts?: DockerRunOptions) => DockerRunResult;
+type DockerRenameFn = (
+  oldContainerName: string,
+  newContainerName: string,
+  opts?: DockerRunOptions,
+) => DockerRunResult;
+
+type ResolvedRollbackDeps = {
+  dockerStop: DockerContainerFn;
+  dockerRm: DockerContainerFn;
+  dockerRename: DockerRenameFn;
+  dockerStart: DockerContainerFn;
+};
+
+function isZeroStatus(result: DockerRunResult | null | undefined): boolean {
+  return Number(result?.status ?? 0) === 0;
+}
+
+function resolveRollbackDeps(deps: DockerGpuPatchDeps): ResolvedRollbackDeps {
+  return {
+    dockerStop: deps.dockerStop ?? defaultDockerStop,
+    dockerRm: deps.dockerRm ?? defaultDockerRm,
+    dockerRename: deps.dockerRename ?? defaultDockerRename,
+    dockerStart: deps.dockerStart ?? defaultDockerStart,
+  };
+}
+
+export function rollbackToBackupContainer(
+  refs: { newContainerId: string; backupContainerName: string; originalName: string },
+  deps: ResolvedRollbackDeps,
+): boolean {
+  const containerOpts = {
+    ignoreError: true,
+    suppressOutput: true,
+    timeout: DOCKER_GPU_PATCH_TIMEOUT_MS,
+  };
+  deps.dockerStop(refs.newContainerId, containerOpts);
+  deps.dockerRm(refs.newContainerId, containerOpts);
+  const restored = deps.dockerRename(
+    refs.backupContainerName,
+    refs.originalName,
+    containerOpts,
+  );
+  if (!isZeroStatus(restored)) return false;
+  const started = deps.dockerStart(refs.originalName, containerOpts);
+  return isZeroStatus(started);
+}
+
+export type DockerGpuPatchFinalizeOptions = {
+  result: DockerGpuPatchResult;
+  supervisorReady: boolean;
+};
+
+export type DockerGpuPatchFinalizeOutcome = {
+  backupRemoved: boolean;
+  rolledBack: boolean;
+};
+
+export function finalizeDockerGpuPatchBackup(
+  options: DockerGpuPatchFinalizeOptions,
+  deps: DockerGpuPatchDeps = {},
+): DockerGpuPatchFinalizeOutcome {
+  const resolved = resolveRollbackDeps(deps);
+  const containerOpts = {
+    ignoreError: true,
+    suppressOutput: true,
+    timeout: DOCKER_GPU_PATCH_TIMEOUT_MS,
+  };
+  if (options.result.backupRemoved) {
+    return { backupRemoved: true, rolledBack: false };
+  }
+  if (options.supervisorReady) {
+    // Backup removal is best-effort: the supervisor probe already confirmed
+    // the new GPU container is reachable, so the backup is no longer needed
+    // even if `docker rm` cannot delete it (e.g. concurrent admin action,
+    // daemon timeout). Reflect the actual rm status in the outcome so
+    // diagnostics can flag a leaked backup container.
+    const rmResult = resolved.dockerRm(
+      options.result.backupContainerName,
+      containerOpts,
+    );
+    return { backupRemoved: isZeroStatus(rmResult), rolledBack: false };
+  }
+  const rolledBack = rollbackToBackupContainer(
+    {
+      newContainerId: options.result.newContainerId,
+      backupContainerName: options.result.backupContainerName,
+      originalName: options.result.originalName,
+    },
+    resolved,
+  );
+  return { backupRemoved: false, rolledBack };
+}
+
+export type SupervisorReconnectOutcome =
+  | { execReady: true; backupRemoved: boolean }
+  | { execReady: false; rolledBack: boolean; error: Error };
+
+export function reconcileSupervisorReconnect(
+  execReady: boolean,
+  refs: { newContainerId: string; backupContainerName: string; originalName: string },
+  deps: DockerGpuPatchDeps,
+): SupervisorReconnectOutcome {
+  const resolved = resolveRollbackDeps(deps);
+  const containerOpts = {
+    ignoreError: true,
+    suppressOutput: true,
+    timeout: DOCKER_GPU_PATCH_TIMEOUT_MS,
+  };
+  if (execReady) {
+    // Backup removal is best-effort here too: the supervisor probe already
+    // confirmed the new container is reachable, so a failed rm leaves a
+    // leaked backup container but the user-visible sandbox is healthy.
+    // Surface the actual rm status so callers can fold it into diagnostics
+    // alongside the deferred-finalize path in `finalizeDockerGpuPatchBackup`.
+    const rmResult = resolved.dockerRm(refs.backupContainerName, containerOpts);
+    return { execReady: true, backupRemoved: isZeroStatus(rmResult) };
+  }
+  const rolledBack = rollbackToBackupContainer(refs, resolved);
+  return {
+    execReady: false,
+    rolledBack,
+    error: new Error(
+      rolledBack
+        ? "OpenShell supervisor did not reconnect to the GPU-enabled container; pre-patch sandbox restored."
+        : "OpenShell supervisor did not reconnect to the GPU-enabled container and rollback failed; pre-patch sandbox was NOT restored.",
+    ),
+  };
+}
diff --git a/src/lib/onboard/docker-gpu-patch-rollback.test.ts b/src/lib/onboard/docker-gpu-patch-rollback.test.ts
new file mode 100644
index 0000000000..5e330942c1
--- /dev/null
+++ b/src/lib/onboard/docker-gpu-patch-rollback.test.ts
@@ -0,0 +1,197 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import {
+  type DockerContainerInspect,
+  recreateOpenShellDockerSandboxWithGpu,
+} from "../../../dist/lib/onboard/docker-gpu-patch";
+
+function inspectFixture(): DockerContainerInspect {
+  return {
+    Id: "old-container-id",
+    Name: "/openshell-alpha",
+    Config: {
+      Image: "openshell/sandbox:abc",
+      Env: [
+        "A=1",
+        "OPENSHELL_ENDPOINT=http://host.openshell.internal:8080/",
+        "OPENSHELL_TEST=1",
+        "OPENSHELL_SANDBOX_COMMAND=sleep infinity",
+        "NVIDIA_VISIBLE_DEVICES=void",
+      ],
+      Labels: {
+        "openshell.ai/managed-by": "openshell",
+        "openshell.ai/sandbox-name": "alpha",
+        "openshell.ai/sandbox-id": "sandbox-id",
+      },
+      Entrypoint: ["/opt/openshell/bin/openshell-sandbox"],
+      Cmd: [],
+      User: "0",
+      WorkingDir: "/workspace",
+      Hostname: "alpha-host",
+      Tty: true,
+    },
+    HostConfig: {
+      Binds: ["/host:/container:rw"],
+      NetworkMode: "openshell-docker",
+      RestartPolicy: { Name: "unless-stopped" },
+      CapAdd: ["SYS_ADMIN", "NET_ADMIN"],
+      SecurityOpt: ["apparmor=unconfined"],
+      ExtraHosts: ["host.openshell.internal:172.17.0.1"],
+      Memory: 8 * 1024 * 1024 * 1024,
+      NanoCpus: 2_500_000_000,
+    },
+    NetworkSettings: {
+      Networks: {
+        "openshell-docker": {
+          IPAddress: "172.18.0.2",
+          Gateway: "172.18.0.1",
+          Aliases: ["openshell-alpha"],
+        },
+      },
+    },
+  };
+}
+
+describe("recreateOpenShellDockerSandboxWithGpu rollback path", () => {
+  it("rolls back to the backup container when supervisor reconnect fails", () => {
+    const dockerCapture = vi.fn((args: readonly string[]) => {
+      if (args[0] === "ps") return "old-container-id\n";
+      if (args[0] === "inspect") return JSON.stringify([inspectFixture()]);
+      if (args[0] === "info") return "";
+      return "";
+    });
+    const dockerRun = vi.fn(() => ({ status: 0, stdout: "probe-id\n" }));
+    const dockerRunDetached = vi.fn(() => ({ status: 0, stdout: "new-container-id\n" }));
+    const dockerRename = vi.fn((_old: string, _next: string) => ({ status: 0 }));
+    const dockerStop = vi.fn(() => ({ status: 0 }));
+    const dockerStart = vi.fn(() => ({ status: 0 }));
+    const dockerRm = vi.fn((_name: string) => ({ status: 0 }));
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "supervisor unreachable" }));
+    const runCaptureOpenshell = vi.fn(() => "");
+
+    expect(() =>
+      recreateOpenShellDockerSandboxWithGpu(
+        { sandboxName: "alpha", timeoutSecs: 1 },
+        {
+          dockerCapture,
+          dockerRun,
+          dockerRunDetached,
+          dockerRename,
+          dockerStop,
+          dockerStart,
+          dockerRm,
+          runOpenshell,
+          runCaptureOpenshell,
+          sleep: vi.fn(),
+          now: () => new Date("2026-05-12T00:00:00Z"),
+          errorPhaseDebouncePolls: 1,
+        },
+      ),
+    ).toThrow(/pre-patch sandbox restored/);
+
+    const restoreRename = dockerRename.mock.calls.find(
+      (call) =>
+        String(call[0]).includes("nemoclaw-gpu-backup") && call[1] === "openshell-alpha",
+    );
+    expect(restoreRename).toBeDefined();
+    expect(dockerStart).toHaveBeenCalledWith(
+      "openshell-alpha",
+      expect.objectContaining({ ignoreError: true }),
+    );
+    expect(
+      dockerRm.mock.calls.some((call) => String(call[0]).includes("nemoclaw-gpu-backup")),
+    ).toBe(false);
+  });
+
+  it("reports rollback failure when restoring the backup container fails", () => {
+    const dockerCapture = vi.fn((args: readonly string[]) => {
+      if (args[0] === "ps") return "old-container-id\n";
+      if (args[0] === "inspect") return JSON.stringify([inspectFixture()]);
+      if (args[0] === "info") return "";
+      return "";
+    });
+    const dockerRun = vi.fn(() => ({ status: 0, stdout: "probe-id\n" }));
+    const dockerRunDetached = vi.fn(() => ({ status: 0, stdout: "new-container-id\n" }));
+    const dockerRename = vi.fn((oldName: string) => {
+      if (String(oldName).includes("nemoclaw-gpu-backup")) {
+        return { status: 1, stderr: "no such container" };
+      }
+      return { status: 0 };
+    });
+    const dockerStop = vi.fn(() => ({ status: 0 }));
+    const dockerStart = vi.fn(() => ({ status: 0 }));
+    const dockerRm = vi.fn(() => ({ status: 0 }));
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "supervisor unreachable" }));
+    const runCaptureOpenshell = vi.fn(() => "");
+
+    expect(() =>
+      recreateOpenShellDockerSandboxWithGpu(
+        { sandboxName: "alpha", timeoutSecs: 1 },
+        {
+          dockerCapture,
+          dockerRun,
+          dockerRunDetached,
+          dockerRename,
+          dockerStop,
+          dockerStart,
+          dockerRm,
+          runOpenshell,
+          runCaptureOpenshell,
+          sleep: vi.fn(),
+          now: () => new Date("2026-05-12T00:00:00Z"),
+          errorPhaseDebouncePolls: 1,
+        },
+      ),
+    ).toThrow(/rollback failed; pre-patch sandbox was NOT restored/);
+
+    expect(dockerStart).not.toHaveBeenCalled();
+  });
+
+  it("reports rollback failure when restarting the backup container fails", () => {
+    const dockerCapture = vi.fn((args: readonly string[]) => {
+      if (args[0] === "ps") return "old-container-id\n";
+      if (args[0] === "inspect") return JSON.stringify([inspectFixture()]);
+      if (args[0] === "info") return "";
+      return "";
+    });
+    const dockerRun = vi.fn(() => ({ status: 0, stdout: "probe-id\n" }));
+    const dockerRunDetached = vi.fn(() => ({ status: 0, stdout: "new-container-id\n" }));
+    const dockerRename = vi.fn((_old: string, _next: string) => ({ status: 0 }));
+    const dockerStop = vi.fn(() => ({ status: 0 }));
+    const dockerStart = vi.fn(() => ({ status: 1, stderr: "container start failed" }));
+    const dockerRm = vi.fn((_name: string) => ({ status: 0 }));
+    const runOpenshell = vi.fn(() => ({ status: 1, stderr: "supervisor unreachable" }));
+    const runCaptureOpenshell = vi.fn(() => "");
+
+    expect(() =>
+      recreateOpenShellDockerSandboxWithGpu(
+        { sandboxName: "alpha", timeoutSecs: 1 },
+        {
+          dockerCapture,
+          dockerRun,
+          dockerRunDetached,
+          dockerRename,
+          dockerStop,
+          dockerStart,
+          dockerRm,
+          runOpenshell,
+          runCaptureOpenshell,
+          sleep: vi.fn(),
+          now: () => new Date("2026-05-12T00:00:00Z"),
+          errorPhaseDebouncePolls: 1,
+        },
+      ),
+    ).toThrow(/rollback failed; pre-patch sandbox was NOT restored/);
+
+    expect(dockerStart).toHaveBeenCalledWith(
+      "openshell-alpha",
+      expect.objectContaining({ ignoreError: true }),
+    );
+    expect(
+      dockerRm.mock.calls.some((call) => String(call[0]).includes("nemoclaw-gpu-backup")),
+    ).toBe(false);
+  });
+});
diff --git a/src/lib/onboard/docker-gpu-patch.test.ts b/src/lib/onboard/docker-gpu-patch.test.ts
index 88223f119f..cfc21b552c 100644
--- a/src/lib/onboard/docker-gpu-patch.test.ts
+++ b/src/lib/onboard/docker-gpu-patch.test.ts
@@ -579,7 +579,11 @@ describe("docker-gpu-patch", () => {
       String(call[0]).includes("nemoclaw-gpu-backup"),
     );
     expect(backupRmCall).toBeGreaterThanOrEqual(0);
-    expect(dockerRm.mock.invocationCallOrder[backupRmCall]).toBeLessThan(
+    // Backup container is removed only AFTER supervisor reconnect confirms
+    // the GPU container is reachable. If reconnect fails the rollback path
+    // restores the backup under the original name (see the rollback test
+    // below), so the backup must outlive the supervisor probe.
+    expect(dockerRm.mock.invocationCallOrder[backupRmCall]).toBeGreaterThan(
       runOpenshell.mock.invocationCallOrder[0],
     );
   });
@@ -592,6 +596,7 @@ describe("docker-gpu-patch", () => {
       return "";
     });
     const dockerRunDetached = vi.fn(() => ({ status: 0, stdout: "new-container-id\n" }));
+    const dockerRm = vi.fn((_name: string) => ({ status: 0 }));
     const runOpenshell = vi.fn(() => ({ status: 1, stderr: "phase: Provisioning" }));
 
     const result = recreateOpenShellDockerSandboxWithGpu(
@@ -607,7 +612,7 @@ describe("docker-gpu-patch", () => {
         dockerRunDetached,
         dockerRename: vi.fn(() => ({ status: 0 })),
         dockerStop: vi.fn(() => ({ status: 0 })),
-        dockerRm: vi.fn(() => ({ status: 0 })),
+        dockerRm,
         runOpenshell,
         sleep: vi.fn(),
         now: () => new Date("2026-05-12T00:00:00Z"),
@@ -615,7 +620,16 @@ describe("docker-gpu-patch", () => {
     );
 
     expect(result.newContainerId).toBe("new-container-id");
+    expect(result.backupRemoved).toBe(false);
+    expect(result.originalName).toBe("openshell-alpha");
+    expect(result.backupContainerName).toContain("nemoclaw-gpu-backup");
     expect(runOpenshell).not.toHaveBeenCalled();
+    // The create path takes the supervisor wait into its own hands later in
+    // the flow. The patch helper must NOT remove the backup yet — that would
+    // re-introduce the deleted-backup / failed-new state #4664 fixes.
+    expect(
+      dockerRm.mock.calls.some((call) => String(call[0]).includes("nemoclaw-gpu-backup")),
+    ).toBe(false);
     expect(dockerRunDetached).toHaveBeenCalledWith(
       expect.arrayContaining([
         "--env",
diff --git a/src/lib/onboard/docker-gpu-patch.ts b/src/lib/onboard/docker-gpu-patch.ts
index d46705ebc5..7667447b28 100644
--- a/src/lib/onboard/docker-gpu-patch.ts
+++ b/src/lib/onboard/docker-gpu-patch.ts
@@ -14,6 +14,7 @@ import {
   dockerRunDetached,
   dockerStop,
 } from "../adapters/docker";
+import { reconcileSupervisorReconnect } from "./docker-gpu-patch-finalize";
 import {
   type DockerGpuSupervisorReconnectDeps,
   DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE_ENV,
@@ -70,6 +71,7 @@ export type DockerGpuPatchDeps = {
   dockerRunDetached?: DockerRunFn;
   dockerRename?: DockerRenameFn;
   dockerRm?: DockerContainerFn;
+  dockerStart?: DockerContainerFn;
   dockerStop?: DockerContainerFn;
   dockerLogs?: DockerLogsFn;
   runOpenshell?: (args: string[], opts?: Record<string, unknown>) => DockerRunResult;
@@ -112,14 +114,22 @@ export type DockerGpuPatchFailureContext = {
   backupContainerName?: string | null;
   selectedMode?: DockerGpuPatchMode | null;
   modeAttempts?: DockerGpuPatchModeAttempt[];
+  rolledBack?: boolean;
 };
 
 export type DockerGpuPatchResult = {
   applied: true;
   oldContainerId: string;
   newContainerId: string;
+  originalName: string;
   backupContainerName: string;
   mode: DockerGpuPatchMode;
+  // True when the patch path also confirmed supervisor reconnect AND removed
+  // the backup container. False when the caller deferred the reconnect wait
+  // (via `waitForSupervisor: false`); the backup is still in place and the
+  // caller is responsible for calling `finalizeDockerGpuPatchBackup` after
+  // its own supervisor wait completes.
+  backupRemoved: boolean;
 };
 
 export type DockerGpuCloneRunOptions = {
@@ -961,30 +971,38 @@ export function recreateOpenShellDockerSandboxWithGpu(
     }
     context.newContainerId = newContainerId;
 
-    d.dockerRm(backupContainerName, {
-      ignoreError: true,
-      suppressOutput: true,
-      timeout: DOCKER_GPU_PATCH_TIMEOUT_MS,
-    });
-
-    if (options.waitForSupervisor !== false) {
-      const execReady = waitForOpenShellSupervisorReconnect(
-        options.sandboxName,
-        options.timeoutSecs ?? DOCKER_GPU_PATCH_WAIT_SECS,
-        deps,
-      );
-      if (!execReady) {
-        throw new Error("OpenShell supervisor did not reconnect to the GPU-enabled container.");
-      }
-    }
-
-    return {
+    const selectedMode = selection.mode;
+    const buildPatchResult = (backupRemoved: boolean): DockerGpuPatchResult => ({
       applied: true,
       oldContainerId,
       newContainerId,
+      originalName,
       backupContainerName,
-      mode: selection.mode,
-    };
+      mode: selectedMode,
+      backupRemoved,
+    });
+
+    // Deferred: caller will run the supervisor wait and call
+    // `finalizeDockerGpuPatchBackup` (success → remove the backup, failure →
+    // roll back to it). Removing the backup here would strand the user with
+    // a deleted-backup / failed-new sandbox if the deferred reconnect fails.
+    if (options.waitForSupervisor === false) return buildPatchResult(false);
+
+    const execReady = waitForOpenShellSupervisorReconnect(
+      options.sandboxName,
+      options.timeoutSecs ?? DOCKER_GPU_PATCH_WAIT_SECS,
+      deps,
+    );
+    const reconcile = reconcileSupervisorReconnect(
+      execReady,
+      { newContainerId, backupContainerName, originalName },
+      deps,
+    );
+    if (!reconcile.execReady) {
+      context.rolledBack = reconcile.rolledBack;
+      throw reconcile.error;
+    }
+    return buildPatchResult(reconcile.backupRemoved);
   } catch (error) {
     const err = error instanceof Error ? error : new Error(String(error));
     throw decoratePatchError(err, context);
@@ -1498,6 +1516,7 @@ export function collectDockerGpuPatchDiagnostics(
     `old_container_id=${context?.oldContainerId ?? "unknown"}`,
     `new_container_id=${context?.newContainerId ?? "unknown"}`,
     `backup_container_name=${context?.backupContainerName ?? "none"}`,
+    `rolled_back=${context?.rolledBack === true ? "yes" : context?.rolledBack === false ? "failed" : "no"}`,
     "cleanup_commands:",
     ...cleanupCommands.map((command) => `  ${command}`),
   ];
diff --git a/src/lib/onboard/docker-gpu-sandbox-create.test.ts b/src/lib/onboard/docker-gpu-sandbox-create.test.ts
new file mode 100644
index 0000000000..281dbdbda7
--- /dev/null
+++ b/src/lib/onboard/docker-gpu-sandbox-create.test.ts
@@ -0,0 +1,228 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+
+import type {
+  DockerGpuPatchFailureContext,
+  DockerGpuPatchResult,
+} from "../../../dist/lib/onboard/docker-gpu-patch";
+import { createDockerGpuSandboxCreatePatch } from "../../../dist/lib/onboard/docker-gpu-sandbox-create";
+
+function deferredCreateResult(): DockerGpuPatchResult {
+  return {
+    applied: true,
+    oldContainerId: "old-container-id",
+    newContainerId: "new-container-id",
+    originalName: "openshell-alpha",
+    backupContainerName: "openshell-alpha-nemoclaw-gpu-backup-1780491860342",
+    mode: {
+      kind: "gpus",
+      label: "--gpus all",
+      device: "all",
+      args: ["--gpus", "all"],
+    },
+    backupRemoved: false,
+  };
+}
+
+function makeDeps() {
+  return {
+    runOpenshell: vi.fn(() => ({ status: 0 })),
+    runCaptureOpenshell: vi.fn(() => ""),
+    sleep: vi.fn(),
+    dockerCapture: vi.fn(() => ""),
+  };
+}
+
+describe("createDockerGpuSandboxCreatePatch composed flow", () => {
+  beforeEach(() => {
+    vi.spyOn(console, "log").mockImplementation(() => {});
+    vi.spyOn(console, "error").mockImplementation(() => {});
+  });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it("defers backup removal until waitForSupervisorReconnectIfNeeded sees supervisorReady=true", () => {
+    const deps = makeDeps();
+    const result = deferredCreateResult();
+    const recreatePatch = vi.fn(() => result);
+    const waitForSupervisor = vi.fn(() => true);
+    const finalizeBackup = vi.fn(() => ({ backupRemoved: true, rolledBack: false }));
+    const onPatchFailureExit = vi.fn();
+    const findContainerIds = vi.fn(() => ["existing-container"]);
+
+    const patch = createDockerGpuSandboxCreatePatch({
+      enabled: true,
+      sandboxName: "alpha",
+      timeoutSecs: 60,
+      deps,
+      overrides: {
+        findContainerIds,
+        recreatePatch,
+        waitForSupervisor,
+        finalizeBackup,
+        onPatchFailureExit,
+      },
+    });
+
+    patch.maybeApplyDuringCreate();
+    expect(recreatePatch).toHaveBeenCalledWith(
+      expect.objectContaining({ waitForSupervisor: false }),
+      expect.objectContaining({ runCaptureOpenshell: deps.runCaptureOpenshell }),
+    );
+    // Critical invariant: the patch helper must NOT remove the backup during
+    // create (recreatePatch was called with waitForSupervisor: false; the
+    // result still carries backupRemoved=false).
+    expect(finalizeBackup).not.toHaveBeenCalled();
+
+    patch.waitForSupervisorReconnectIfNeeded();
+    expect(waitForSupervisor).toHaveBeenCalledTimes(1);
+    expect(finalizeBackup).toHaveBeenCalledTimes(1);
+    expect(finalizeBackup).toHaveBeenCalledWith(
+      { result, supervisorReady: true },
+      deps,
+    );
+    expect(onPatchFailureExit).not.toHaveBeenCalled();
+  });
+
+  it("rolls back to the backup container and surfaces rolledBack=true diagnostics when supervisorReady=false", () => {
+    const deps = makeDeps();
+    const result = deferredCreateResult();
+    const recreatePatch = vi.fn(() => result);
+    const waitForSupervisor = vi.fn(() => false);
+    const finalizeBackup = vi.fn(() => ({ backupRemoved: false, rolledBack: true }));
+    const onPatchFailureExit = vi.fn();
+    const findContainerIds = vi.fn(() => ["existing-container"]);
+
+    const patch = createDockerGpuSandboxCreatePatch({
+      enabled: true,
+      sandboxName: "alpha",
+      timeoutSecs: 60,
+      deps,
+      overrides: {
+        findContainerIds,
+        recreatePatch,
+        waitForSupervisor,
+        finalizeBackup,
+        onPatchFailureExit,
+      },
+    });
+
+    patch.maybeApplyDuringCreate();
+    patch.waitForSupervisorReconnectIfNeeded();
+
+    expect(finalizeBackup).toHaveBeenCalledWith(
+      { result, supervisorReady: false },
+      deps,
+    );
+    expect(onPatchFailureExit).toHaveBeenCalledTimes(1);
+    const [sandboxName, error, exitDeps] = onPatchFailureExit.mock.calls[0];
+    expect(sandboxName).toBe("alpha");
+    expect((error as Error).message).toMatch(/pre-patch sandbox restored/);
+    const context = (exitDeps as { context: DockerGpuPatchFailureContext }).context;
+    expect(context.rolledBack).toBe(true);
+    expect(context.newContainerId).toBe("new-container-id");
+    expect(context.backupContainerName).toBe(result.backupContainerName);
+  });
+
+  it("reports rolledBack=false in diagnostics when rollback itself fails", () => {
+    const deps = makeDeps();
+    const result = deferredCreateResult();
+    const recreatePatch = vi.fn(() => result);
+    const waitForSupervisor = vi.fn(() => false);
+    const finalizeBackup = vi.fn(() => ({ backupRemoved: false, rolledBack: false }));
+    const onPatchFailureExit = vi.fn();
+    const findContainerIds = vi.fn(() => ["existing-container"]);
+
+    const patch = createDockerGpuSandboxCreatePatch({
+      enabled: true,
+      sandboxName: "alpha",
+      timeoutSecs: 60,
+      deps,
+      overrides: {
+        findContainerIds,
+        recreatePatch,
+        waitForSupervisor,
+        finalizeBackup,
+        onPatchFailureExit,
+      },
+    });
+
+    patch.maybeApplyDuringCreate();
+    patch.waitForSupervisorReconnectIfNeeded();
+
+    expect(onPatchFailureExit).toHaveBeenCalledTimes(1);
+    const [, error, exitDeps] = onPatchFailureExit.mock.calls[0];
+    expect((error as Error).message).toMatch(/rollback failed; pre-patch sandbox was NOT restored/);
+    const context = (exitDeps as { context: DockerGpuPatchFailureContext }).context;
+    expect(context.rolledBack).toBe(false);
+  });
+
+  it("skips both apply and supervisor wait when no OpenShell container is found", () => {
+    const deps = makeDeps();
+    const recreatePatch = vi.fn();
+    const waitForSupervisor = vi.fn();
+    const finalizeBackup = vi.fn();
+    const onPatchFailureExit = vi.fn();
+    const findContainerIds = vi.fn(() => []);
+
+    const patch = createDockerGpuSandboxCreatePatch({
+      enabled: true,
+      sandboxName: "alpha",
+      timeoutSecs: 60,
+      deps,
+      overrides: {
+        findContainerIds,
+        recreatePatch,
+        waitForSupervisor,
+        finalizeBackup,
+        onPatchFailureExit,
+      },
+    });
+
+    patch.maybeApplyDuringCreate();
+    patch.waitForSupervisorReconnectIfNeeded();
+
+    expect(recreatePatch).not.toHaveBeenCalled();
+    expect(waitForSupervisor).not.toHaveBeenCalled();
+    expect(finalizeBackup).not.toHaveBeenCalled();
+    expect(onPatchFailureExit).not.toHaveBeenCalled();
+  });
+
+  it("records patchError when recreate throws and exitOnPatchError reports it via printDockerGpuPatchFailureAndExit", () => {
+    const deps = makeDeps();
+    const recreatePatch = vi.fn(() => {
+      throw new Error("docker rename failed");
+    });
+    const waitForSupervisor = vi.fn();
+    const finalizeBackup = vi.fn();
+    const onPatchFailureExit = vi.fn();
+    const findContainerIds = vi.fn(() => ["existing-container"]);
+
+    const patch = createDockerGpuSandboxCreatePatch({
+      enabled: true,
+      sandboxName: "alpha",
+      timeoutSecs: 60,
+      deps,
+      overrides: {
+        findContainerIds,
+        recreatePatch,
+        waitForSupervisor,
+        finalizeBackup,
+        onPatchFailureExit,
+      },
+    });
+
+    patch.maybeApplyDuringCreate();
+    expect(patch.createFailureMessage()).toMatch(/Docker GPU patch failed/);
+    patch.exitOnPatchError();
+    expect(onPatchFailureExit).toHaveBeenCalledTimes(1);
+    // Supervisor wait must be skipped because needsSupervisorWait stayed false.
+    patch.waitForSupervisorReconnectIfNeeded();
+    expect(waitForSupervisor).not.toHaveBeenCalled();
+    expect(finalizeBackup).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/lib/onboard/docker-gpu-sandbox-create.ts b/src/lib/onboard/docker-gpu-sandbox-create.ts
index 2c22f46608..2ae979c2b2 100644
--- a/src/lib/onboard/docker-gpu-sandbox-create.ts
+++ b/src/lib/onboard/docker-gpu-sandbox-create.ts
@@ -21,12 +21,26 @@ import {
   shouldApplyDockerGpuPatch,
   waitForOpenShellSupervisorReconnect,
 } from "./docker-gpu-patch";
+import { finalizeDockerGpuPatchBackup } from "./docker-gpu-patch-finalize";
 
 type DockerGpuSandboxCreateDeps = Pick<
   DockerGpuPatchDeps,
   "runOpenshell" | "runCaptureOpenshell" | "sleep" | "dockerCapture"
 >;
 
+type RecreatePatchFn = typeof recreateOpenShellDockerSandboxWithGpu;
+type WaitSupervisorFn = typeof waitForOpenShellSupervisorReconnect;
+type FindContainerIdsFn = typeof findOpenShellDockerSandboxContainerIds;
+type FinalizeBackupFn = typeof finalizeDockerGpuPatchBackup;
+// Loosen the override return type from `never` to `void` so tests can pass a
+// plain `vi.fn()` mock. Production wires `printDockerGpuPatchFailureAndExit`
+// which has return type `never`; that is assignable to `void`.
+type PatchFailureExitFn = (
+  sandboxName: string,
+  error: unknown,
+  deps: Parameters<typeof printDockerGpuPatchFailureAndExit>[2],
+) => void;
+
 type DockerGpuSandboxCreatePatchOptions = {
   enabled: boolean;
   sandboxName: string;
@@ -35,6 +49,19 @@ type DockerGpuSandboxCreatePatchOptions = {
   timeoutSecs: number;
   backend?: DockerGpuPatchBackend;
   deps: DockerGpuSandboxCreateDeps;
+  /**
+   * Test seams. The production composition uses the canonical
+   * `docker-gpu-patch`/`docker-gpu-patch-finalize` exports; tests substitute
+   * lightweight mocks to drive the deferred-finalize sequence without
+   * standing up the full Docker recreate plumbing.
+   */
+  overrides?: {
+    findContainerIds?: FindContainerIdsFn;
+    recreatePatch?: RecreatePatchFn;
+    waitForSupervisor?: WaitSupervisorFn;
+    finalizeBackup?: FinalizeBackupFn;
+    onPatchFailureExit?: PatchFailureExitFn;
+  };
 };
 
 type DockerGpuSandboxConfig = {
@@ -80,6 +107,15 @@ export function createDockerGpuSandboxCreatePatch(
   let patchError: unknown = null;
   let needsSupervisorWait = false;
 
+  const findContainerIds =
+    options.overrides?.findContainerIds ?? findOpenShellDockerSandboxContainerIds;
+  const recreatePatch = options.overrides?.recreatePatch ?? recreateOpenShellDockerSandboxWithGpu;
+  const waitForSupervisor =
+    options.overrides?.waitForSupervisor ?? waitForOpenShellSupervisorReconnect;
+  const finalizeBackup = options.overrides?.finalizeBackup ?? finalizeDockerGpuPatchBackup;
+  const onPatchFailureExit =
+    options.overrides?.onPatchFailureExit ?? printDockerGpuPatchFailureAndExit;
+
   const applyOptions = {
     sandboxName: options.sandboxName,
     gpuDevice: options.gpuDevice,
@@ -91,13 +127,13 @@ export function createDockerGpuSandboxCreatePatch(
   return {
     maybeApplyDuringCreate() {
       if (!options.enabled || result || patchError) return;
-      const containerIds = findOpenShellDockerSandboxContainerIds(options.sandboxName);
+      const containerIds = findContainerIds(options.sandboxName);
       if (containerIds.length === 0) return;
       console.log(
         "  OpenShell Docker container detected; recreating it with NVIDIA GPU access before readiness wait...",
       );
       try {
-        result = recreateOpenShellDockerSandboxWithGpu(
+        result = recreatePatch(
           { ...applyOptions, waitForSupervisor: false },
           { runCaptureOpenshell: options.deps.runCaptureOpenshell, sleep: options.deps.sleep },
         );
@@ -115,7 +151,7 @@ export function createDockerGpuSandboxCreatePatch(
 
     exitOnPatchError() {
       if (!patchError) return;
-      printDockerGpuPatchFailureAndExit(options.sandboxName, patchError, {
+      onPatchFailureExit(options.sandboxName, patchError, {
         runCaptureOpenshell: options.deps.runCaptureOpenshell,
         dockerCapture: options.deps.dockerCapture,
       });
@@ -134,7 +170,7 @@ export function createDockerGpuSandboxCreatePatch(
       console.log(
         `  Waiting for OpenShell supervisor to reconnect to the GPU-enabled container (up to ${supervisorReconnectTimeoutSecs}s)...`,
       );
-      const supervisorReady = waitForOpenShellSupervisorReconnect(
+      const supervisorReady = waitForSupervisor(
         options.sandboxName,
         supervisorReconnectTimeoutSecs,
         {
@@ -147,10 +183,21 @@ export function createDockerGpuSandboxCreatePatch(
           sleep: options.deps.sleep,
         },
       );
+      const finalizeOutcome = result
+        ? finalizeBackup({ result, supervisorReady }, options.deps)
+        : null;
       if (supervisorReady) return;
-      printDockerGpuPatchFailureAndExit(
+      const failureMessage = (() => {
+        if (!finalizeOutcome) {
+          return "OpenShell supervisor did not reconnect to the GPU-enabled container.";
+        }
+        return finalizeOutcome.rolledBack
+          ? "OpenShell supervisor did not reconnect to the GPU-enabled container; pre-patch sandbox restored."
+          : "OpenShell supervisor did not reconnect to the GPU-enabled container and rollback failed; pre-patch sandbox was NOT restored.";
+      })();
+      onPatchFailureExit(
         options.sandboxName,
-        new Error("OpenShell supervisor did not reconnect to the GPU-enabled container."),
+        new Error(failureMessage),
         {
           runCaptureOpenshell: options.deps.runCaptureOpenshell,
           dockerCapture: options.deps.dockerCapture,
@@ -160,6 +207,7 @@ export function createDockerGpuSandboxCreatePatch(
             newContainerId: result?.newContainerId,
             backupContainerName: result?.backupContainerName,
             selectedMode: result?.mode ?? null,
+            rolledBack: finalizeOutcome?.rolledBack ?? false,
           },
         },
       );
diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
index 62976067f0..34a2564786 100644
--- a/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.test.ts
@@ -98,8 +98,8 @@ describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => {
     expect(runOpenshell).toHaveBeenCalledTimes(6);
   });
 
-  it("defaults the debounce to 5 polls and honors the env override", () => {
-    expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(5);
+  it("defaults the debounce to 15 polls and honors the env override", () => {
+    expect(getDockerGpuSupervisorReconnectErrorDebouncePolls({})).toBe(15);
     expect(
       getDockerGpuSupervisorReconnectErrorDebouncePolls({
         NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_DEBOUNCE: "2",
@@ -151,9 +151,9 @@ describe("docker-gpu-supervisor-reconnect Error-phase debounce", () => {
       });
 
       expect(ok).toBe(false);
-      // Default K=5 from the env-backed helper: 5 polls + 4 sleeps before fast-fail.
-      expect(runOpenshell).toHaveBeenCalledTimes(5);
-      expect(sleep).toHaveBeenCalledTimes(4);
+      // Default K=15 from the env-backed helper: 15 polls + 14 sleeps before fast-fail.
+      expect(runOpenshell).toHaveBeenCalledTimes(15);
+      expect(sleep).toHaveBeenCalledTimes(14);
     }
   });
 });
diff --git a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
index c8906e9501..ec69d4c42f 100644
--- a/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
+++ b/src/lib/onboard/docker-gpu-supervisor-reconnect.ts
@@ -29,10 +29,25 @@ import { envInt } from "./env";
 const DOCKER_GPU_PATCH_TIMEOUT_MS = 30_000;
 const DOCKER_GPU_SUPERVISOR_RECONNECT_MIN_SECS = 900;
 // Default consecutive Error-phase polls required before fast-fail. With a
-// 2-second poll interval this is ~10s of sustained Error, which absorbs the
-// transient Error reported during container recreation while still bailing
-// fast on a patched container that crashed on startup.
-const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 5;
+// 2-second poll interval this is ~30s of sustained Error, leaving headroom
+// for slower hosts (Docker Desktop on WSL2, DGX Spark cached re-onboard)
+// whose sandbox list cache divergence was observed at ~12s in the original
+// repro. Hosts that genuinely crashed on startup hit the rollback path in
+// applyDockerGpuPatch rather than waiting out the full window.
+//
+// Alternative considered: branching on Docker State.Status + Health.Status
+// to keep retrying when the patched container reports Status=running plus
+// Health=starting. Rejected because the patched container's Health depends
+// on the OpenShell supervisor script — the same signal this wait observes
+// via `openshell sandbox list` — so Docker Health is either redundant or
+// lags by several seconds. The debounce-plus-rollback path also guarantees
+// the user keeps the pre-patch CPU sandbox on reconnect failure, which a
+// Health-aware retry alone would not provide. If a future repro shows
+// Status=running + Health=starting genuinely failing reconnect after this
+// 30s window, switch to a Health-aware retry, but extract Docker health
+// probing into a separate observation channel first rather than overloading
+// this one.
+const DOCKER_GPU_SUPERVISOR_RECONNECT_ERROR_PHASE_DEFAULT_DEBOUNCE_POLLS = 15;
 
 export const DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT_ENV =
   "NEMOCLAW_DOCKER_GPU_SUPERVISOR_RECONNECT_TIMEOUT";
diff --git a/src/lib/sandbox/build-context.ts b/src/lib/sandbox/build-context.ts
index 6398f3ce96..16f6f99ba0 100644
--- a/src/lib/sandbox/build-context.ts
+++ b/src/lib/sandbox/build-context.ts
@@ -132,6 +132,10 @@ function stageOptimizedSandboxBuildContext(
     path.join(rootDir, "scripts", "lib", "openclaw_device_approval_policy.py"),
     path.join(stagedScriptsDir, "lib", "openclaw_device_approval_policy.py"),
   );
+  fs.copyFileSync(
+    path.join(rootDir, "scripts", "lib", "clean_runtime_shell_env_shim.py"),
+    path.join(stagedScriptsDir, "lib", "clean_runtime_shell_env_shim.py"),
+  );
   // OpenClaw config generator extracted in #2449 (fixed in #2565)
   fs.copyFileSync(
     path.join(rootDir, "scripts", "generate-openclaw-config.mts"),
diff --git a/test/clean-runtime-shell-env-shim.test.ts b/test/clean-runtime-shell-env-shim.test.ts
new file mode 100644
index 0000000000..7f51f9e8d6
--- /dev/null
+++ b/test/clean-runtime-shell-env-shim.test.ts
@@ -0,0 +1,116 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { spawnSync } from "node:child_process";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+
+const CLEAN_SCRIPT = path.join(
+  import.meta.dirname,
+  "..",
+  "scripts",
+  "lib",
+  "clean_runtime_shell_env_shim.py",
+);
+const SHIM_TEXT = '[ -f /tmp/nemoclaw-proxy-env.sh ] && . /tmp/nemoclaw-proxy-env.sh';
+const CURRENT_UID = process.getuid?.() ?? 0;
+
+function runScript(args: { rcPath: string; shim: string; uid: number }): {
+  status: number | null;
+  stderr: string;
+  stdout: string;
+} {
+  const result = spawnSync(
+    "python3",
+    [CLEAN_SCRIPT, args.rcPath, args.shim, String(args.uid)],
+    { encoding: "utf-8" },
+  );
+  return {
+    status: result.status,
+    stderr: result.stderr ?? "",
+    stdout: result.stdout ?? "",
+  };
+}
+
+describe("clean_runtime_shell_env_shim.py", () => {
+  let tmpDir: string;
+
+  beforeEach(() => {
+    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-rc-test-"));
+  });
+
+  afterEach(() => {
+    fs.rmSync(tmpDir, { recursive: true, force: true });
+  });
+
+  it("removes the legacy two-line stanza when uid owns the rc file", () => {
+    const rcPath = path.join(tmpDir, ".bashrc");
+    const before = `export FOO=1\n# Source runtime proxy config\n${SHIM_TEXT}\nexport BAR=2\n`;
+    fs.writeFileSync(rcPath, before, { mode: 0o644 });
+
+    const result = runScript({ rcPath, shim: SHIM_TEXT, uid: CURRENT_UID });
+
+    expect(result.status).toBe(0);
+    const after = fs.readFileSync(rcPath, "utf-8");
+    expect(after).toBe("export FOO=1\nexport BAR=2\n");
+  });
+
+  it("leaves the rc file untouched and exits 0 when the entrypoint uid does not own it", () => {
+    // When the entrypoint runs as a non-root uid against an rc file owned by
+    // a different uid (e.g. root-owned legacy .bashrc), the pre-fix cleanup
+    // raised EPERM under errexit and killed the container. The ownership
+    // guard now logs and exits 0 instead.
+    const rcPath = path.join(tmpDir, ".bashrc");
+    const before = `# Source runtime proxy config\n${SHIM_TEXT}\nexport REAL_USER_LINE=keep\n`;
+    fs.writeFileSync(rcPath, before, { mode: 0o644 });
+
+    // Mismatched uid: pretend we are running as a foreign uid against a file
+    // owned by the test runner. Real container repro uses uid=1000 vs root.
+    const foreignUid = CURRENT_UID + 99999;
+    const result = runScript({ rcPath, shim: SHIM_TEXT, uid: foreignUid });
+
+    expect(result.status).toBe(0);
+    expect(result.stderr).toContain("[SECURITY] skipping rc cleanup");
+    expect(result.stderr).toContain(`file uid=${CURRENT_UID}`);
+    expect(result.stderr).toContain(`uid=${foreignUid}`);
+
+    const after = fs.readFileSync(rcPath, "utf-8");
+    expect(after).toBe(before);
+  });
+
+  it("exits 0 without rewriting when the rc file is already clean", () => {
+    const rcPath = path.join(tmpDir, ".bashrc");
+    const before = "export FOO=1\nexport BAR=2\n";
+    fs.writeFileSync(rcPath, before, { mode: 0o644 });
+
+    const result = runScript({ rcPath, shim: SHIM_TEXT, uid: CURRENT_UID });
+
+    expect(result.status).toBe(0);
+    expect(fs.readFileSync(rcPath, "utf-8")).toBe(before);
+  });
+
+  it("refuses a symlinked rc file and exits 1", () => {
+    const realPath = path.join(tmpDir, "real");
+    const linkPath = path.join(tmpDir, ".bashrc");
+    fs.writeFileSync(realPath, "export FOO=1\n", { mode: 0o644 });
+    fs.symlinkSync(realPath, linkPath);
+
+    const result = runScript({ rcPath: linkPath, shim: SHIM_TEXT, uid: 0 });
+    expect(result.status).toBe(1);
+    expect(result.stderr).toContain("refusing symlinked rc file");
+  });
+
+  it("strips a bare shim line without the preceding comment", () => {
+    const rcPath = path.join(tmpDir, ".bashrc");
+    const before = `export FOO=1\n${SHIM_TEXT}\nexport BAR=2\n`;
+    fs.writeFileSync(rcPath, before, { mode: 0o644 });
+
+    const result = runScript({ rcPath, shim: SHIM_TEXT, uid: CURRENT_UID });
+
+    expect(result.status).toBe(0);
+    expect(fs.readFileSync(rcPath, "utf-8")).toBe("export FOO=1\nexport BAR=2\n");
+  });
+});
diff --git a/test/nemoclaw-start.test.ts b/test/nemoclaw-start.test.ts
index 52f7d38de8..d00f232406 100644
--- a/test/nemoclaw-start.test.ts
+++ b/test/nemoclaw-start.test.ts
@@ -30,14 +30,6 @@ function runtimeShellEnvBlock(src: string): string {
   return src.slice(start, end);
 }
 
-function runtimeShellEnvShimBlock(src: string): string {
-  const start = src.indexOf("ensure_runtime_shell_env_shim() {");
-  const end = src.indexOf("# ── Legacy layout migration", start);
-  expect(start).toBeGreaterThan(-1);
-  expect(end).toBeGreaterThan(start);
-  return src.slice(start, end);
-}
-
 function nonRootFallbackBlock(src: string): string {
   const start = src.indexOf("# ── Non-root fallback");
   const end = src.indexOf("# ── Root path", start);
diff --git a/test/sandbox-build-context.test.ts b/test/sandbox-build-context.test.ts
index 7299314a96..af8e8b3a0b 100644
--- a/test/sandbox-build-context.test.ts
+++ b/test/sandbox-build-context.test.ts
@@ -74,6 +74,7 @@ describe("sandbox build context staging", () => {
     writeFixture(path.join("scripts", "codex-acp-wrapper.sh"));
     writeFixture(path.join("scripts", "lib", "sandbox-init.sh"));
     writeFixture(path.join("scripts", "lib", "openclaw_device_approval_policy.py"));
+    writeFixture(path.join("scripts", "lib", "clean_runtime_shell_env_shim.py"));
     writeFixture(path.join("scripts", "generate-openclaw-config.mts"));
     writeFixture(path.join("scripts", "openclaw-build-messaging-plugins.py"));
     writeFixture(path.join("scripts", "seed-wechat-accounts.py"));
@@ -255,6 +256,9 @@ describe("sandbox build context staging", () => {
       expect(
         fs.existsSync(path.join(buildCtx, "scripts", "lib", "openclaw_device_approval_policy.py")),
       ).toBe(true);
+      expect(
+        fs.existsSync(path.join(buildCtx, "scripts", "lib", "clean_runtime_shell_env_shim.py")),
+      ).toBe(true);
       expect(fs.existsSync(path.join(buildCtx, "scripts", "patch-openclaw-tool-catalog.js"))).toBe(
         true,
       );
diff --git a/test/sandbox-provisioning.test.ts b/test/sandbox-provisioning.test.ts
index e513d909a7..eeac9ab3ce 100644
--- a/test/sandbox-provisioning.test.ts
+++ b/test/sandbox-provisioning.test.ts
@@ -779,6 +779,7 @@ describe("sandbox provisioning: copied OpenClaw helper permissions (#2861)", ()
       path.join(localBin, "nemoclaw-codex-acp"),
       path.join(localLib, "sandbox-init.sh"),
       path.join(localLib, "openclaw_device_approval_policy.py"),
+      path.join(localLib, "clean_runtime_shell_env_shim.py"),
       path.join(localLib, "generate-openclaw-config.mts"),
       path.join(localLib, "openclaw-build-messaging-plugins.py"),
       path.join(localLib, "seed-wechat-accounts.py"),
diff --git a/test/service-env.test.ts b/test/service-env.test.ts
index 6ac6f4c15b..92a5cf0635 100644
--- a/test/service-env.test.ts
+++ b/test/service-env.test.ts
@@ -20,6 +20,14 @@ import { join } from "node:path";
 import { resolveOpenshell } from "../dist/lib/adapters/openshell/resolve";
 
 const NEMOCLAW_START_SCRIPT = join(import.meta.dirname, "../scripts/nemoclaw-start.sh");
+const RC_CLEAN_SCRIPT = join(
+  import.meta.dirname,
+  "../scripts/lib/clean_runtime_shell_env_shim.py",
+);
+
+function rcShimWrapperHeader(): string {
+  return `export NEMOCLAW_RC_CLEAN_SCRIPT=${JSON.stringify(RC_CLEAN_SCRIPT)}`;
+}
 
 function extractRuntimeShellEnvSnippet() {
   const src = readFileSync(NEMOCLAW_START_SCRIPT, "utf-8");
@@ -573,6 +581,7 @@ describe("service environment", () => {
           `_SANDBOX_HOME=${JSON.stringify(fakeHome)}`,
           `_RUNTIME_SHELL_ENV_FILE=${JSON.stringify(proxyEnvPath)}`,
           '_RUNTIME_SHELL_ENV_SHIM="[ -f ${_RUNTIME_SHELL_ENV_FILE} ] && . ${_RUNTIME_SHELL_ENV_FILE}"',
+          rcShimWrapperHeader(),
           extractRuntimeShellEnvShimSnippet(),
           "ensure_runtime_shell_env_shim",
         ].join("\n");
@@ -627,6 +636,7 @@ describe("service environment", () => {
           '_RUNTIME_SHELL_ENV_SHIM="[ -f ${_RUNTIME_SHELL_ENV_FILE} ] && . ${_RUNTIME_SHELL_ENV_FILE}"',
           'legacy_tmp="${_SANDBOX_HOME}/.bashrc.nemoclaw-clean.$$"',
           `ln -s ${JSON.stringify(sensitivePath)} "$legacy_tmp"`,
+          rcShimWrapperHeader(),
           extractRuntimeShellEnvShimSnippet(),
           "ensure_runtime_shell_env_shim",
         ].join("\n");
@@ -677,6 +687,7 @@ describe("service environment", () => {
           '_RUNTIME_SHELL_ENV_SHIM="[ -f ${_RUNTIME_SHELL_ENV_FILE} ] && . ${_RUNTIME_SHELL_ENV_FILE}"',
           'chown() { echo "unexpected chown $*" >&2; exit 42; }',
           'chmod() { echo "unexpected chmod $*" >&2; exit 43; }',
+          rcShimWrapperHeader(),
           extractRuntimeShellEnvShimSnippet(),
         ].join("\n");
         writeFileSync(tmpFile, wrapper, { mode: 0o700 });
@@ -716,6 +727,7 @@ describe("service environment", () => {
           `_SANDBOX_HOME=${JSON.stringify(fakeHome)}`,
           `_RUNTIME_SHELL_ENV_FILE=${JSON.stringify(proxyEnvPath)}`,
           '_RUNTIME_SHELL_ENV_SHIM="[ -f ${_RUNTIME_SHELL_ENV_FILE} ] && . ${_RUNTIME_SHELL_ENV_FILE}"',
+          rcShimWrapperHeader(),
           extractRuntimeShellEnvShimSnippet(),
           "ensure_runtime_shell_env_shim",
         ].join("\n");
@@ -772,6 +784,7 @@ describe("service environment", () => {
           `_SANDBOX_HOME=${JSON.stringify(fakeHome)}`,
           `_RUNTIME_SHELL_ENV_FILE=${JSON.stringify(proxyEnvPath)}`,
           '_RUNTIME_SHELL_ENV_SHIM="[ -f ${_RUNTIME_SHELL_ENV_FILE} ] && . ${_RUNTIME_SHELL_ENV_FILE}"',
+          rcShimWrapperHeader(),
           extractRuntimeShellEnvShimSnippet(),
           "ensure_runtime_shell_env_shim",
         ].join("\n");
@@ -803,6 +816,83 @@ describe("service environment", () => {
       }
     });
 
+    // Composed startup invariant: write_runtime_shell_env emits the proxy
+    // env file with mode 444, ensure_runtime_shell_env_shim then sees a
+    // foreign-owned .bashrc and must exit 0 (otherwise the entrypoint would
+    // terminate the container with exit code 1). The composed assertion
+    // proves the legacy trust-boundary file remains non-user-writable
+    // across the skip path.
+    it("composed startup leaves the proxy env file at mode 444 when the rc cleanup skips a foreign-owned rc file", () => {
+      const fakeDataDir = mkdtempSync(join(tmpdir(), "nemoclaw-rc-skip-composed-"));
+      const fakeHome = mkdtempSync(join(tmpdir(), "nemoclaw-rc-skip-home-"));
+      const proxyEnvPath = join(fakeDataDir, "proxy-env.sh");
+      const rcPath = join(fakeHome, ".bashrc");
+      const tmpFile = join(tmpdir(), `nemoclaw-rc-skip-composed-${process.pid}.sh`);
+      try {
+        const shimLine = `[ -f ${proxyEnvPath} ] && . ${proxyEnvPath}`;
+        const originalBashrc = [
+          "# user-managed bashrc owned by a foreign uid (e.g. root)",
+          "# Source runtime proxy config",
+          shimLine,
+          "export PATH=/usr/local/bin:$PATH",
+          "",
+        ].join("\n");
+        writeFileSync(rcPath, originalBashrc, { mode: 0o644 });
+
+        const persistBlock = extractRuntimeShellEnvSnippet()
+          .trimEnd()
+          .replaceAll("/tmp/nemoclaw-proxy-env.sh", proxyEnvPath);
+        // Foreign uid that does not match the test-runner's actual file owner.
+        // Overriding `id -u` for the bash function-level shim invocation is
+        // the cheapest way to drive the "uid != owner" branch without root.
+        const foreignUid = (process.getuid?.() ?? 1000) + 99999;
+        const wrapper = [
+          "#!/usr/bin/env bash",
+          "set -euo pipefail",
+          sandboxInitSource,
+          'PROXY_HOST="10.200.0.1"',
+          'PROXY_PORT="3128"',
+          '_PROXY_URL="http://${PROXY_HOST}:${PROXY_PORT}"',
+          '_NO_PROXY_VAL="localhost,127.0.0.1,::1,${PROXY_HOST}"',
+          "_TOOL_REDIRECTS=()",
+          `_AXIOS_FIX_SCRIPT="/nonexistent/axios-proxy-fix.js"`,
+          `_SANDBOX_HOME=${JSON.stringify(fakeHome)}`,
+          `_RUNTIME_SHELL_ENV_FILE=${JSON.stringify(proxyEnvPath)}`,
+          `_RUNTIME_SHELL_ENV_SHIM="[ -f \${_RUNTIME_SHELL_ENV_FILE} ] && . \${_RUNTIME_SHELL_ENV_FILE}"`,
+          rcShimWrapperHeader(),
+          // Override `id -u` BEFORE the entrypoint snippets are sourced so the
+          // function-shadow is in place when both write_runtime_shell_env and
+          // ensure_runtime_shell_env_shim consult `$(id -u)`.
+          `id() { case "\${1:-}" in -u) echo ${foreignUid};; *) command id "$@";; esac; }`,
+          "set +u",
+          persistBlock,
+          extractRuntimeShellEnvShimSnippet(),
+          "validate_tmp_permissions " + JSON.stringify(proxyEnvPath),
+        ].join("\n");
+        writeFileSync(tmpFile, wrapper, { mode: 0o700 });
+        const result = execFileSync("bash", [tmpFile], { encoding: "utf-8" });
+
+        expect(result).not.toContain("[SECURITY] " + proxyEnvPath + " has unsafe permissions");
+
+        const finalMode = (lstatSync(proxyEnvPath).mode & 0o777).toString(8);
+        expect(finalMode).toBe("444");
+
+        const rcAfter = readFileSync(rcPath, "utf-8");
+        expect(rcAfter).toBe(originalBashrc);
+      } finally {
+        try {
+          unlinkSync(tmpFile);
+        } catch {
+          /* ignore */
+        }
+        try {
+          execFileSync("rm", ["-rf", fakeDataDir, fakeHome]);
+        } catch {
+          /* ignore */
+        }
+      }
+    });
+
     it("entrypoint overwrites proxy-env.sh cleanly on repeated invocations", () => {
       const fakeDataDir = join(tmpdir(), `nemoclaw-idempotent-test-${process.pid}`);
       execFileSync("mkdir", ["-p", fakeDataDir]);