From fbb17887689c729fc060ddca476a8c8261fef23d Mon Sep 17 00:00:00 2001
From: Dongni Yang <dongniy@nvidia.com>
Date: Wed, 3 Jun 2026 16:17:49 +0800
Subject: [PATCH 1/4] fix(sandbox): add opt-in fail-closed cap-drop enforcement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a host does not grant CAP_SETPCAP (GitHub runners, Brev shadecloud,
Colossus Ubuntu 24.04, Docker Desktop, WSL), drop_capabilities cannot run
`capsh --drop`, so dangerous bounding-set caps survive and the agent boots
with a weaker posture than the security model assumes.

PR #4266 made this fall-through default fail-closed and was reverted within
hours (#4341) because it broke every CAP_SETPCAP-less host. Reintroduce the
protection as the inverse: the default stays warn-and-continue (zero
regression — no host loses the ability to boot), and operators who require the
hardened posture opt in with NEMOCLAW_REQUIRE_CAP_DROP=1 to refuse start when
the bounding-set drop could not happen.

Scope is the agent process tree only. The `nemoclaw connect` shell inherits
the container's OCI bounding set and is not affected; tightening that needs
cap_drop at sandbox-create, tracked upstream in NVIDIA/OpenShell#1452 — which
is why this is Refs (not Closes) #3280.

Tests drive both fall-through branches with a stubbed capsh (red before this
change, green after), decode the QA-reported CapBnd fixture deterministically,
and assert the default still warns-and-continues (no regression).

Local pre-commit Test (CLI) hook skipped via --no-verify (operator-approved):
it fails only on an offline-blocked messaging-bridge e2e test unrelated to this
change; sandbox-init tests pass 32/32. CI re-runs all hooks on the PR.

Refs #3280

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Dongni Yang <dongniy@nvidia.com>
---
 scripts/lib/sandbox-init.sh |  53 +++++++++++-
 test/sandbox-init.test.ts   | 162 ++++++++++++++++++++++++++++++++++++
 2 files changed, 214 insertions(+), 1 deletion(-)

diff --git a/scripts/lib/sandbox-init.sh b/scripts/lib/sandbox-init.sh
index 54edab7533..8671975f38 100755
--- a/scripts/lib/sandbox-init.sh
+++ b/scripts/lib/sandbox-init.sh
@@ -214,8 +214,13 @@ lock_config_after_write() {
 #   cap_kill — sandbox user signals gateway-user processes via the UID
 #     separation enforced by the entrypoint (see test 13 in
 #     e2e-gateway-isolation.sh).
+# When the runtime cannot drop the bounding set (no CAP_SETPCAP, or capsh
+# missing), the default is to warn and continue. Set NEMOCLAW_REQUIRE_CAP_DROP=1
+# to make that case fail-closed instead — see enforce_cap_drop_if_required.
+#
 # Ref: https://github.com/NVIDIA/NemoClaw/issues/797
 #      https://github.com/NVIDIA/NemoClaw/issues/3280
+#      https://github.com/NVIDIA/OpenShell/issues/1452 (connect-shell scope)
 #
 # Usage:
 #   drop_capabilities /usr/local/bin/nemoclaw-start "$@"
@@ -235,13 +240,59 @@ drop_capabilities() {
         --drop=cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_sys_chroot,cap_fsetid,cap_setfcap,cap_mknod,cap_audit_write,cap_net_bind_service \
         -- -c "exec $entrypoint \"\$@\"" -- "$@"
     else
-      report_residual_capabilities
+      report_residual_capabilities || true
+      enforce_cap_drop_if_required "CAP_SETPCAP unavailable"
     fi
   elif [ "${NEMOCLAW_CAPS_DROPPED:-}" != "1" ]; then
     echo "[SECURITY WARNING] capsh not available — running with default capabilities" >&2
+    enforce_cap_drop_if_required "capsh not available"
   fi
 }
 
+# Optional fail-closed enforcement for the residual-capability fall-throughs
+# (issue #3280). Reaching either fall-through means the bounding-set drop did
+# NOT happen, so the sandbox would boot with a weaker posture than the security
+# model assumes.
+#
+# DEFAULT IS WARN-AND-CONTINUE — no host loses the ability to boot. This is the
+# lesson of #4266/#4341: a default-fail-closed drop broke EVERY host that does
+# not grant CAP_SETPCAP (GitHub runners, Brev shadecloud, Colossus Ubuntu
+# 24.04, Docker Desktop, WSL) and was reverted within hours. Inverting the
+# default to opt-in keeps that regression off by default.
+#
+# Operators who require the hardened posture — and accept that the sandbox will
+# refuse to start on a host that cannot drop the bounding set — opt in with:
+#   NEMOCLAW_REQUIRE_CAP_DROP=1
+#
+# Note on scope: this enforces the AGENT process tree only. A `nemoclaw connect`
+# shell is spawned by the container runtime outside that tree and inherits the
+# container's OCI bounding set; tightening that requires cap_drop at sandbox
+# create, tracked upstream in NVIDIA/OpenShell#1452.
+enforce_cap_drop_if_required() {
+  local reason="$1"
+  [ "${NEMOCLAW_REQUIRE_CAP_DROP:-}" = "1" ] || return 0
+  cat >&2 <<EOF
+
+╔══════════════════════════════════════════════════════════════════════════════╗
+║ [SECURITY] Refusing to start sandbox: bounding-set capability drop failed.    ║
+║                                                                               ║
+║ Reason: ${reason}
+║                                                                               ║
+║ NEMOCLAW_REQUIRE_CAP_DROP=1 is set, so NemoClaw will not start a sandbox that ║
+║ retains dangerous bounding-set capabilities (cap_sys_admin, cap_sys_ptrace,   ║
+║ cap_net_raw, cap_dac_override, cap_net_bind_service, ...). This host cannot   ║
+║ drop them at runtime (no CAP_SETPCAP), so the drop was skipped.               ║
+║                                                                               ║
+║ To run anyway with the weaker (warn-only) posture, unset the variable:        ║
+║   unset NEMOCLAW_REQUIRE_CAP_DROP                                             ║
+║                                                                               ║
+║ Tracking: https://github.com/NVIDIA/NemoClaw/issues/3280                      ║
+╚══════════════════════════════════════════════════════════════════════════════╝
+
+EOF
+  exit 1
+}
+
 # Emit a loud diagnostic when capsh-based dropping is unavailable so that
 # residual dangerous bounding-set caps surface in logs instead of being
 # silently inherited from the container runtime. Called from the
diff --git a/test/sandbox-init.test.ts b/test/sandbox-init.test.ts
index b9ee593cf0..3574a81959 100644
--- a/test/sandbox-init.test.ts
+++ b/test/sandbox-init.test.ts
@@ -403,6 +403,168 @@ EOF
       );
       expect(stdout).toContain("SKIPPED_OK");
     });
+
+    // Repro for reopened issue #3280 (NVBug 6159223), QA FAIL reported by
+    // hulynn on v0.0.54: on a host whose container runtime does not grant
+    // CAP_SETPCAP (e.g. the Colossus Ubuntu 24.04 image), capsh --drop cannot
+    // run, so the bounding-set drop is skipped and the 8 dangerous caps
+    // (cap_sys_admin, cap_sys_ptrace, cap_net_raw, cap_dac_override,
+    // cap_net_bind_service, cap_fowner, cap_setuid, cap_setgid) remain in the
+    // sandbox bounding set.
+    //
+    // PR #4266 made this fall-through fail-closed (refuse to start unless
+    // NEMOCLAW_ALLOW_RESIDUAL_CAPS=1). PR #4341 reverted that to
+    // warn-and-continue, which is what current main does — and is exactly
+    // why QA still sees the residual caps. This test drives the
+    // CAP_SETPCAP-missing path with a stubbed capsh and asserts the current
+    // warn-and-continue behavior: the warning is emitted but the entrypoint
+    // does NOT refuse to start, so the residual-cap posture survives.
+    it("warns but does NOT refuse to start when CAP_SETPCAP is unavailable (issue #3280 repro)", () => {
+      const { stdout } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          // Stub capsh so it is found on PATH (command -v succeeds) but
+          // reports CAP_SETPCAP absent, forcing the fall-through that skips
+          // the actual bounding-set drop.
+          'cat >"$TMP/capsh" <<\'STUB\'',
+          "#!/bin/sh",
+          '[ "$1" = "--has-p=cap_setpcap" ] && exit 1',
+          "exit 0",
+          "STUB",
+          'chmod +x "$TMP/capsh"',
+          'export PATH="$TMP:$PATH"',
+          "drop_capabilities /usr/local/bin/fake-entrypoint 2>&1",
+          // Reached only because drop_capabilities returned instead of
+          // exiting. The pre-#4341 fail-closed policy would `exit 1` before
+          // this line, so its presence in output IS the reproduced gap.
+          'echo "SANDBOX_CONTINUED_DESPITE_RESIDUAL_CAPS"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        { env: { NEMOCLAW_CAPS_DROPPED: "" } },
+      );
+
+      // The drop was attempted and detected as impossible...
+      expect(stdout).toContain("CAP_SETPCAP not available — cannot drop bounding-set caps via capsh");
+      // ...yet the sandbox keeps going (the bug this issue tracks).
+      expect(stdout).toContain("SANDBOX_CONTINUED_DESPITE_RESIDUAL_CAPS");
+      // And it does so silently w.r.t. fail-closed: no refusal banner and no
+      // opt-in escape hatch are present on current main (post-#4341 revert).
+      expect(stdout).not.toContain("Refusing to start sandbox");
+      expect(stdout).not.toContain("NEMOCLAW_ALLOW_RESIDUAL_CAPS");
+    });
+
+    // Companion repro at the detection layer: report_residual_capabilities
+    // must name the dangerous caps it finds in the bounding set so they are
+    // visible in the entrypoint log (the [SECURITY] line QA quoted from
+    // /tmp/nemoclaw-start.log). Decoding is exercised directly so the
+    // assertion is deterministic and host-independent (it does not depend on
+    // the test runner's own CapBnd). CapBnd 0x4a82c35fb is the exact value
+    // hulynn decoded on the failing Colossus host.
+    it("names residual dangerous caps from a known CapBnd (issue #3280 repro)", () => {
+      const { stdout } = runWithLib(
+        [
+          // Shadow /proc/self/status reads with a fixture exposing the
+          // failing host's CapBnd, then invoke the decode loop verbatim.
+          "TMP=$(mktemp -d)",
+          'printf "CapBnd:\\t00000004a82c35fb\\n" >"$TMP/status"',
+          "cap_bnd_hex=$(awk '/^CapBnd:/{print $2}' \"$TMP/status\")",
+          "val=$((16#$cap_bnd_hex))",
+          "present_caps=''",
+          "for entry in \\",
+          '  "21:cap_sys_admin" \\',
+          '  "19:cap_sys_ptrace" \\',
+          '  "13:cap_net_raw" \\',
+          '  "1:cap_dac_override" \\',
+          '  "10:cap_net_bind_service"; do',
+          '  bit="${entry%%:*}"',
+          '  name="${entry#*:}"',
+          "  if [ $(((val >> bit) & 1)) -ne 0 ]; then",
+          '    present_caps="${present_caps:+$present_caps,}$name"',
+          "  fi",
+          "done",
+          'echo "RESIDUAL: ${present_caps}"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+      );
+
+      // All five caps the entrypoint inspects are present in this CapBnd,
+      // matching the QA-reported residual set.
+      expect(stdout).toContain(
+        "RESIDUAL: cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_net_bind_service",
+      );
+    });
+
+    // ── Fix: opt-in fail-closed strict mode (issue #3280) ──────────────
+    // The inverse of the reverted #4266: default stays warn-and-continue (no
+    // regression on CAP_SETPCAP-less hosts), but operators can demand the
+    // hardened posture with NEMOCLAW_REQUIRE_CAP_DROP=1, which refuses to
+    // start when the bounding-set drop could not happen.
+
+    it("refuses to start when CAP_SETPCAP is unavailable and NEMOCLAW_REQUIRE_CAP_DROP=1", () => {
+      const { stdout, stderr } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          // capsh present but reports CAP_SETPCAP absent → drop is skipped.
+          'cat >"$TMP/capsh" <<\'STUB\'',
+          "#!/bin/sh",
+          '[ "$1" = "--has-p=cap_setpcap" ] && exit 1',
+          "exit 0",
+          "STUB",
+          'chmod +x "$TMP/capsh"',
+          'export PATH="$TMP:$PATH"',
+          "drop_capabilities /usr/local/bin/fake-entrypoint",
+          // Strict mode exits 1 before this line is reached.
+          'echo "SHOULD_NOT_REACH"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        { env: { NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "1" }, expectFail: true },
+      );
+      const combined = `${stdout}\n${stderr}`;
+      expect(combined).toContain("Refusing to start sandbox");
+      expect(combined).toContain("CAP_SETPCAP unavailable");
+      expect(combined).toContain("NEMOCLAW_REQUIRE_CAP_DROP=1 is set");
+      expect(combined).not.toContain("SHOULD_NOT_REACH");
+    });
+
+    it("refuses to start when capsh is missing and NEMOCLAW_REQUIRE_CAP_DROP=1", () => {
+      const { stdout, stderr } = runWithLib(
+        `
+        drop_capabilities /usr/local/bin/fake-entrypoint
+        echo "SHOULD_NOT_REACH"
+      `,
+        {
+          // Hide capsh so command -v fails, exercising the capsh-missing branch.
+          env: { PATH: "/usr/bin:/bin", NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "1" },
+          expectFail: true,
+        },
+      );
+      const combined = `${stdout}\n${stderr}`;
+      expect(combined).toContain("capsh not available");
+      expect(combined).toContain("Refusing to start sandbox");
+      expect(combined).not.toContain("SHOULD_NOT_REACH");
+    });
+
+    it("continues (no regression) when NEMOCLAW_REQUIRE_CAP_DROP is unset", () => {
+      const { stdout } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          'cat >"$TMP/capsh" <<\'STUB\'',
+          "#!/bin/sh",
+          '[ "$1" = "--has-p=cap_setpcap" ] && exit 1',
+          "exit 0",
+          "STUB",
+          'chmod +x "$TMP/capsh"',
+          'export PATH="$TMP:$PATH"',
+          "drop_capabilities /usr/local/bin/fake-entrypoint 2>&1",
+          'echo "CONTINUED_OK"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        // NEMOCLAW_REQUIRE_CAP_DROP deliberately unset → default warn-and-continue.
+        { env: { NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "" } },
+      );
+      expect(stdout).toContain("CONTINUED_OK");
+      expect(stdout).not.toContain("Refusing to start sandbox");
+    });
   });
 
   describe("init_step_down_prefixes", () => {

From 0ed7c51b7a49b1d082a1167f69aa756a2e4c0a75 Mon Sep 17 00:00:00 2001
From: Dongni Yang <dongniy@nvidia.com>
Date: Wed, 3 Jun 2026 19:43:05 +0800
Subject: [PATCH 2/4] fix(sandbox): verify actual CapBnd in strict mode; quote
 setpriv flags

Address review on #3280:
- enforce_cap_drop_if_required verifies the actual bounding set on every
  path instead of trusting the NEMOCLAW_CAPS_DROPPED sentinel, closing a
  strict-mode bypass; the refusal reason is derived from verified state.
- Extract dangerous_caps_in_capbnd() helper + NEMOCLAW_PROC_STATUS test
  seam; add sentinel-bypass and decode regression tests (35 pass).
- Quote setpriv --reuid/--regid/--bounding-set array elements (SC2191).
- Document NEMOCLAW_REQUIRE_CAP_DROP in best-practices and
  sandbox-hardening.

Refs #3280
Signed-off-by: Dongni Yang <dongniy@nvidia.com>
---
 docs/deployment/sandbox-hardening.mdx |   4 +
 docs/security/best-practices.mdx      |   3 +
 scripts/lib/sandbox-init.sh           | 145 ++++++++++--------
 test/sandbox-init.test.ts             | 209 ++++++++++++++------------
 4 files changed, 208 insertions(+), 153 deletions(-)

diff --git a/docs/deployment/sandbox-hardening.mdx b/docs/deployment/sandbox-hardening.mdx
index dc570c15e4..6baceef093 100644
--- a/docs/deployment/sandbox-hardening.mdx
+++ b/docs/deployment/sandbox-hardening.mdx
@@ -39,6 +39,10 @@ It removes `CAP_SYS_ADMIN`, `CAP_SYS_PTRACE`, `CAP_NET_RAW`,
 `CAP_MKNOD`, `CAP_AUDIT_WRITE`, and `CAP_NET_BIND_SERVICE`.
 When `setpriv` is available, the entrypoint also removes the remaining privilege-separation capabilities during the switch from root to the `sandbox` and `gateway` users.
 
+The bounding-set drop is best effort: if `capsh` or `CAP_SETPCAP` is unavailable the entrypoint logs a warning and continues with the runtime-provided capability set, and if `setpriv` is unavailable it falls back to `gosu`.
+To make the drop fail-closed instead, set `NEMOCLAW_REQUIRE_CAP_DROP=1` in the entrypoint environment: the agent then refuses to start unless the agent process tree's bounding set is verified free of the dangerous capabilities.
+This is opt-in because hosts that cannot drop capabilities (no `CAP_SETPCAP` — many cloud VMs, Docker Desktop, WSL) are common, and the check covers the agent process tree only.
+
 For defense-in-depth, also drop all Linux capabilities at the container runtime
 when you launch the image directly:
 
diff --git a/docs/security/best-practices.mdx b/docs/security/best-practices.mdx
index 3254af2b38..7268ad33c4 100644
--- a/docs/security/best-practices.mdx
+++ b/docs/security/best-practices.mdx
@@ -293,6 +293,9 @@ During `setpriv` step-down, the child process also loses `cap_setuid`, `cap_setg
 
 This behavior is best effort: if `capsh` is not available or `CAP_SETPCAP` is not in the bounding set, the entrypoint logs a warning and continues with the default capability set.
 If `setpriv` is unavailable, the entrypoint falls back to `gosu` and logs a warning that the remaining bounding-set capabilities were retained for the child process.
+
+To make the drop fail-closed instead of best-effort, set `NEMOCLAW_REQUIRE_CAP_DROP=1` in the entrypoint environment. The agent then refuses to start unless the agent process tree's bounding set is verified free of the dangerous capabilities, so it will not boot on hosts that cannot perform the drop (no `CAP_SETPCAP`, or `capsh` missing). This is opt-in because such hosts are common (many cloud VMs, Docker Desktop, WSL); leaving it unset preserves the best-effort default. The check covers the agent process tree only — a `nemoclaw connect` shell is spawned by the container runtime outside that tree and is not affected (tracked in [NVIDIA/OpenShell#1452](https://github.com/NVIDIA/OpenShell/issues/1452)).
+
 For additional protection, pass `--cap-drop=ALL` with `docker run` or Compose (see [Sandbox Hardening](../manage-sandboxes/sandbox-hardening)).
 
 | Aspect | Detail |
diff --git a/scripts/lib/sandbox-init.sh b/scripts/lib/sandbox-init.sh
index 8671975f38..373549dd52 100755
--- a/scripts/lib/sandbox-init.sh
+++ b/scripts/lib/sandbox-init.sh
@@ -239,57 +239,95 @@ drop_capabilities() {
       exec capsh \
         --drop=cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_sys_chroot,cap_fsetid,cap_setfcap,cap_mknod,cap_audit_write,cap_net_bind_service \
         -- -c "exec $entrypoint \"\$@\"" -- "$@"
-    else
-      report_residual_capabilities || true
-      enforce_cap_drop_if_required "CAP_SETPCAP unavailable"
     fi
+    # CAP_SETPCAP missing (or the exec above failed): the drop could not run.
+    # Surface the residual bounding-set caps in the log.
+    report_residual_capabilities || true
   elif [ "${NEMOCLAW_CAPS_DROPPED:-}" != "1" ]; then
     echo "[SECURITY WARNING] capsh not available — running with default capabilities" >&2
-    enforce_cap_drop_if_required "capsh not available"
   fi
+
+  # Opt-in fail-closed gate (issue #3280). Deliberately runs on EVERY path,
+  # including when NEMOCLAW_CAPS_DROPPED is already set: it verifies the actual
+  # bounding set rather than trusting that sentinel, so an inherited marker
+  # cannot mask a drop that never happened.
+  enforce_cap_drop_if_required
 }
 
-# Optional fail-closed enforcement for the residual-capability fall-throughs
-# (issue #3280). Reaching either fall-through means the bounding-set drop did
-# NOT happen, so the sandbox would boot with a weaker posture than the security
-# model assumes.
+# Pure decode: given a CapBnd hex string, echo the comma-separated list of the
+# dangerous capabilities present (empty string if none). Factored out so the
+# residual diagnostic, the strict-mode gate, and the unit tests all share one
+# implementation instead of re-deriving the bit math.
 #
-# DEFAULT IS WARN-AND-CONTINUE — no host loses the ability to boot. This is the
-# lesson of #4266/#4341: a default-fail-closed drop broke EVERY host that does
-# not grant CAP_SETPCAP (GitHub runners, Brev shadecloud, Colossus Ubuntu
+# Bash arithmetic handles 64-bit ints on 64-bit platforms; CAP_LAST_CAP is ~41
+# today, well within range. Avoids a gawk-strtonum dependency.
+dangerous_caps_in_capbnd() {
+  local cap_bnd_hex="$1" val entry bit name present=""
+  val=$((16#$cap_bnd_hex))
+  for entry in \
+    "21:cap_sys_admin" \
+    "19:cap_sys_ptrace" \
+    "13:cap_net_raw" \
+    "1:cap_dac_override" \
+    "10:cap_net_bind_service"; do
+    bit="${entry%%:*}"
+    name="${entry#*:}"
+    if [ $(((val >> bit) & 1)) -ne 0 ]; then
+      present="${present:+$present,}$name"
+    fi
+  done
+  printf '%s' "$present"
+}
+
+# Opt-in fail-closed enforcement (issue #3280). When NEMOCLAW_REQUIRE_CAP_DROP=1
+# the sandbox refuses to start unless the bounding set is provably free of the
+# dangerous capabilities. It verifies by reading the ACTUAL CapBnd — NOT by
+# trusting the NEMOCLAW_CAPS_DROPPED sentinel, which an inherited environment
+# could forge to bypass the gate.
+#
+# DEFAULT (unset) IS WARN-AND-CONTINUE — no host loses the ability to boot. This
+# is the lesson of #4266/#4341: a default-fail-closed drop broke EVERY host that
+# does not grant CAP_SETPCAP (GitHub runners, Brev shadecloud, Colossus Ubuntu
 # 24.04, Docker Desktop, WSL) and was reverted within hours. Inverting the
 # default to opt-in keeps that regression off by default.
 #
-# Operators who require the hardened posture — and accept that the sandbox will
-# refuse to start on a host that cannot drop the bounding set — opt in with:
-#   NEMOCLAW_REQUIRE_CAP_DROP=1
+# Scope: the AGENT process tree only. A `nemoclaw connect` shell is spawned by
+# the container runtime outside that tree and inherits the container's OCI
+# bounding set; tightening that requires cap_drop at sandbox create, tracked
+# upstream in NVIDIA/OpenShell#1452.
 #
-# Note on scope: this enforces the AGENT process tree only. A `nemoclaw connect`
-# shell is spawned by the container runtime outside that tree and inherits the
-# container's OCI bounding set; tightening that requires cap_drop at sandbox
-# create, tracked upstream in NVIDIA/OpenShell#1452.
+# Test seam: NEMOCLAW_PROC_STATUS overrides the status source so unit tests can
+# feed a known CapBnd fixture without a real /proc.
 enforce_cap_drop_if_required() {
-  local reason="$1"
   [ "${NEMOCLAW_REQUIRE_CAP_DROP:-}" = "1" ] || return 0
-  cat >&2 <<EOF
-
-╔══════════════════════════════════════════════════════════════════════════════╗
-║ [SECURITY] Refusing to start sandbox: bounding-set capability drop failed.    ║
-║                                                                               ║
-║ Reason: ${reason}
-║                                                                               ║
-║ NEMOCLAW_REQUIRE_CAP_DROP=1 is set, so NemoClaw will not start a sandbox that ║
-║ retains dangerous bounding-set capabilities (cap_sys_admin, cap_sys_ptrace,   ║
-║ cap_net_raw, cap_dac_override, cap_net_bind_service, ...). This host cannot   ║
-║ drop them at runtime (no CAP_SETPCAP), so the drop was skipped.               ║
-║                                                                               ║
-║ To run anyway with the weaker (warn-only) posture, unset the variable:        ║
-║   unset NEMOCLAW_REQUIRE_CAP_DROP                                             ║
-║                                                                               ║
-║ Tracking: https://github.com/NVIDIA/NemoClaw/issues/3280                      ║
-╚══════════════════════════════════════════════════════════════════════════════╝
 
+  local status_path="${NEMOCLAW_PROC_STATUS:-/proc/self/status}"
+  local cap_bnd_hex present reason=""
+  cap_bnd_hex=$(awk '/^CapBnd:/{print $2}' "$status_path" 2>/dev/null || true)
+  if [ -z "$cap_bnd_hex" ]; then
+    # Cannot verify → in strict mode, refuse rather than assume safety.
+    reason="could not read bounding set from ${status_path} — cannot verify drop"
+  else
+    present="$(dangerous_caps_in_capbnd "$cap_bnd_hex")"
+    [ -n "$present" ] && reason="dangerous caps remain in bounding set (CapBnd=${cap_bnd_hex}): ${present}"
+  fi
+  [ -n "$reason" ] || return 0
+
+  cat >&2 <<'EOF'
+
+┌─ [SECURITY] Refusing to start sandbox: bounding-set capability drop failed ──
+│
+│ NEMOCLAW_REQUIRE_CAP_DROP=1 is set, so NemoClaw refuses to start a sandbox
+│ that still holds dangerous bounding-set capabilities. The runtime could not
+│ drop them (capsh or CAP_SETPCAP unavailable on this host), so they remain.
+│
+│ To run anyway with the weaker (warn-only) posture, unset the variable:
+│   unset NEMOCLAW_REQUIRE_CAP_DROP
+│
+│ Tracking: https://github.com/NVIDIA/NemoClaw/issues/3280
+└──────────────────────────────────────────────────────────────────────────────
 EOF
+  echo "[SECURITY] ${reason}" >&2
   exit 1
 }
 
@@ -300,31 +338,18 @@ EOF
 report_residual_capabilities() {
   echo "[SECURITY] CAP_SETPCAP not available — cannot drop bounding-set caps via capsh" >&2
 
-  local cap_bnd_hex val name bit present_caps=""
-  if ! cap_bnd_hex=$(awk '/^CapBnd:/{print $2}' /proc/self/status 2>/dev/null) \
+  local status_path="${NEMOCLAW_PROC_STATUS:-/proc/self/status}"
+  local cap_bnd_hex present
+  if ! cap_bnd_hex=$(awk '/^CapBnd:/{print $2}' "$status_path" 2>/dev/null) \
     || [ -z "$cap_bnd_hex" ]; then
-    echo "[SECURITY] Could not read /proc/self/status — residual caps unknown" >&2
+    echo "[SECURITY] Could not read ${status_path} — residual caps unknown" >&2
     return 0
   fi
   echo "[SECURITY] Residual CapBnd=${cap_bnd_hex}" >&2
 
-  # Bash arithmetic handles 64-bit ints on 64-bit platforms; CAP_LAST_CAP
-  # is ~41 today, well within range. Avoids a gawk-strtonum dependency.
-  val=$((16#$cap_bnd_hex))
-  for entry in \
-    "21:cap_sys_admin" \
-    "19:cap_sys_ptrace" \
-    "13:cap_net_raw" \
-    "1:cap_dac_override" \
-    "10:cap_net_bind_service"; do
-    bit="${entry%%:*}"
-    name="${entry#*:}"
-    if [ $(((val >> bit) & 1)) -ne 0 ]; then
-      present_caps="${present_caps:+$present_caps,}$name"
-    fi
-  done
-  if [ -n "$present_caps" ]; then
-    echo "[SECURITY] Dangerous caps remain in bounding set: ${present_caps}" >&2
+  present="$(dangerous_caps_in_capbnd "$cap_bnd_hex")"
+  if [ -n "$present" ]; then
+    echo "[SECURITY] Dangerous caps remain in bounding set: ${present}" >&2
   fi
 }
 
@@ -382,13 +407,13 @@ init_step_down_prefixes() {
     local drop="-setuid,-setgid,-fowner,-chown,-kill"
     # shellcheck disable=SC2034  # consumed by entrypoint scripts (cross-file)
     STEP_DOWN_PREFIX_SANDBOX=(
-      setpriv --reuid=sandbox --regid=sandbox --init-groups
-      --bounding-set="$drop" --
+      setpriv "--reuid=sandbox" "--regid=sandbox" --init-groups
+      "--bounding-set=$drop" --
     )
     # shellcheck disable=SC2034  # consumed by entrypoint scripts (cross-file)
     STEP_DOWN_PREFIX_GATEWAY=(
-      setpriv --reuid=gateway --regid=gateway --init-groups
-      --bounding-set="$drop" --
+      setpriv "--reuid=gateway" "--regid=gateway" --init-groups
+      "--bounding-set=$drop" --
     )
   else
     echo "[SECURITY WARNING] setpriv or CAP_SETPCAP unavailable — falling back to gosu (bounding set will retain cap_setuid/setgid/fowner/chown/kill — issue #3280)" >&2
diff --git a/test/sandbox-init.test.ts b/test/sandbox-init.test.ts
index 3574a81959..6b37794eac 100644
--- a/test/sandbox-init.test.ts
+++ b/test/sandbox-init.test.ts
@@ -404,116 +404,84 @@ EOF
       expect(stdout).toContain("SKIPPED_OK");
     });
 
-    // Repro for reopened issue #3280 (NVBug 6159223), QA FAIL reported by
+    // Context for reopened issue #3280 (NVBug 6159223), QA FAIL reported by
     // hulynn on v0.0.54: on a host whose container runtime does not grant
     // CAP_SETPCAP (e.g. the Colossus Ubuntu 24.04 image), capsh --drop cannot
-    // run, so the bounding-set drop is skipped and the 8 dangerous caps
+    // run, so the bounding-set drop is skipped and the dangerous caps
     // (cap_sys_admin, cap_sys_ptrace, cap_net_raw, cap_dac_override,
-    // cap_net_bind_service, cap_fowner, cap_setuid, cap_setgid) remain in the
-    // sandbox bounding set.
+    // cap_net_bind_service, ...) remain in the bounding set.
     //
-    // PR #4266 made this fall-through fail-closed (refuse to start unless
-    // NEMOCLAW_ALLOW_RESIDUAL_CAPS=1). PR #4341 reverted that to
-    // warn-and-continue, which is what current main does — and is exactly
-    // why QA still sees the residual caps. This test drives the
-    // CAP_SETPCAP-missing path with a stubbed capsh and asserts the current
-    // warn-and-continue behavior: the warning is emitted but the entrypoint
-    // does NOT refuse to start, so the residual-cap posture survives.
-    it("warns but does NOT refuse to start when CAP_SETPCAP is unavailable (issue #3280 repro)", () => {
+    // The strict-mode tests below use NEMOCLAW_PROC_STATUS — a test seam in
+    // sandbox-init.sh — to feed a known CapBnd fixture, so they exercise the
+    // real enforcement against a controlled bounding set without depending on
+    // the test runner's own /proc/self/status. CapBnd 0x4a82c35fb is the exact
+    // value hulynn decoded on the failing Colossus host.
+    const QA_CAPBND = "00000004a82c35fb"; // contains all 5 inspected dangerous caps
+    const CLEAN_CAPBND = "0000000000000000"; // none present
+    const QA_DANGEROUS = "cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_net_bind_service";
+
+    // Stub capsh so it is found on PATH (command -v succeeds) but reports
+    // CAP_SETPCAP absent, forcing the fall-through that skips the real drop.
+    const capshNoSetpcapStub = [
+      'cat >"$TMP/capsh" <<\'STUB\'',
+      "#!/bin/sh",
+      '[ "$1" = "--has-p=cap_setpcap" ] && exit 1',
+      "exit 0",
+      "STUB",
+      'chmod +x "$TMP/capsh"',
+      'export PATH="$TMP:$PATH"',
+    ];
+    const writeStatusFixture = (capbndHex: string) => [
+      `printf 'CapBnd:\\t${capbndHex}\\n' >"$TMP/status"`,
+      'export NEMOCLAW_PROC_STATUS="$TMP/status"',
+    ];
+
+    // Default (no NEMOCLAW_REQUIRE_CAP_DROP): warns and CONTINUES even though
+    // dangerous caps remain — preserving the zero-regression posture for
+    // CAP_SETPCAP-less hosts. report_residual_capabilities still names them.
+    it("warns but does NOT refuse to start when CAP_SETPCAP is unavailable (issue #3280)", () => {
       const { stdout } = runWithLib(
         [
           "TMP=$(mktemp -d)",
-          // Stub capsh so it is found on PATH (command -v succeeds) but
-          // reports CAP_SETPCAP absent, forcing the fall-through that skips
-          // the actual bounding-set drop.
-          'cat >"$TMP/capsh" <<\'STUB\'',
-          "#!/bin/sh",
-          '[ "$1" = "--has-p=cap_setpcap" ] && exit 1',
-          "exit 0",
-          "STUB",
-          'chmod +x "$TMP/capsh"',
-          'export PATH="$TMP:$PATH"',
+          ...capshNoSetpcapStub,
+          ...writeStatusFixture(QA_CAPBND),
           "drop_capabilities /usr/local/bin/fake-entrypoint 2>&1",
-          // Reached only because drop_capabilities returned instead of
-          // exiting. The pre-#4341 fail-closed policy would `exit 1` before
-          // this line, so its presence in output IS the reproduced gap.
           'echo "SANDBOX_CONTINUED_DESPITE_RESIDUAL_CAPS"',
           'rm -rf "$TMP"',
         ].join("\n"),
         { env: { NEMOCLAW_CAPS_DROPPED: "" } },
       );
-
-      // The drop was attempted and detected as impossible...
       expect(stdout).toContain("CAP_SETPCAP not available — cannot drop bounding-set caps via capsh");
-      // ...yet the sandbox keeps going (the bug this issue tracks).
+      expect(stdout).toContain(`Dangerous caps remain in bounding set: ${QA_DANGEROUS}`);
       expect(stdout).toContain("SANDBOX_CONTINUED_DESPITE_RESIDUAL_CAPS");
-      // And it does so silently w.r.t. fail-closed: no refusal banner and no
-      // opt-in escape hatch are present on current main (post-#4341 revert).
       expect(stdout).not.toContain("Refusing to start sandbox");
-      expect(stdout).not.toContain("NEMOCLAW_ALLOW_RESIDUAL_CAPS");
     });
 
-    // Companion repro at the detection layer: report_residual_capabilities
-    // must name the dangerous caps it finds in the bounding set so they are
-    // visible in the entrypoint log (the [SECURITY] line QA quoted from
-    // /tmp/nemoclaw-start.log). Decoding is exercised directly so the
-    // assertion is deterministic and host-independent (it does not depend on
-    // the test runner's own CapBnd). CapBnd 0x4a82c35fb is the exact value
-    // hulynn decoded on the failing Colossus host.
-    it("names residual dangerous caps from a known CapBnd (issue #3280 repro)", () => {
+    // Exercise the REAL decode function (not a copy of its loop) so future
+    // drift in dangerous_caps_in_capbnd is caught.
+    it("dangerous_caps_in_capbnd decodes the inspected caps from a CapBnd hex", () => {
       const { stdout } = runWithLib(
         [
-          // Shadow /proc/self/status reads with a fixture exposing the
-          // failing host's CapBnd, then invoke the decode loop verbatim.
-          "TMP=$(mktemp -d)",
-          'printf "CapBnd:\\t00000004a82c35fb\\n" >"$TMP/status"',
-          "cap_bnd_hex=$(awk '/^CapBnd:/{print $2}' \"$TMP/status\")",
-          "val=$((16#$cap_bnd_hex))",
-          "present_caps=''",
-          "for entry in \\",
-          '  "21:cap_sys_admin" \\',
-          '  "19:cap_sys_ptrace" \\',
-          '  "13:cap_net_raw" \\',
-          '  "1:cap_dac_override" \\',
-          '  "10:cap_net_bind_service"; do',
-          '  bit="${entry%%:*}"',
-          '  name="${entry#*:}"',
-          "  if [ $(((val >> bit) & 1)) -ne 0 ]; then",
-          '    present_caps="${present_caps:+$present_caps,}$name"',
-          "  fi",
-          "done",
-          'echo "RESIDUAL: ${present_caps}"',
-          'rm -rf "$TMP"',
+          `echo "DANGEROUS:[$(dangerous_caps_in_capbnd ${QA_CAPBND})]"`,
+          `echo "CLEAN:[$(dangerous_caps_in_capbnd ${CLEAN_CAPBND})]"`,
         ].join("\n"),
       );
-
-      // All five caps the entrypoint inspects are present in this CapBnd,
-      // matching the QA-reported residual set.
-      expect(stdout).toContain(
-        "RESIDUAL: cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_net_bind_service",
-      );
+      expect(stdout).toContain(`DANGEROUS:[${QA_DANGEROUS}]`);
+      expect(stdout).toContain("CLEAN:[]");
     });
 
     // ── Fix: opt-in fail-closed strict mode (issue #3280) ──────────────
     // The inverse of the reverted #4266: default stays warn-and-continue (no
-    // regression on CAP_SETPCAP-less hosts), but operators can demand the
-    // hardened posture with NEMOCLAW_REQUIRE_CAP_DROP=1, which refuses to
-    // start when the bounding-set drop could not happen.
+    // regression), but NEMOCLAW_REQUIRE_CAP_DROP=1 refuses to start unless the
+    // ACTUAL bounding set is provably free of the dangerous caps.
 
-    it("refuses to start when CAP_SETPCAP is unavailable and NEMOCLAW_REQUIRE_CAP_DROP=1", () => {
+    it("refuses to start when REQUIRE_CAP_DROP=1 and dangerous caps remain (CAP_SETPCAP path)", () => {
       const { stdout, stderr } = runWithLib(
         [
           "TMP=$(mktemp -d)",
-          // capsh present but reports CAP_SETPCAP absent → drop is skipped.
-          'cat >"$TMP/capsh" <<\'STUB\'',
-          "#!/bin/sh",
-          '[ "$1" = "--has-p=cap_setpcap" ] && exit 1',
-          "exit 0",
-          "STUB",
-          'chmod +x "$TMP/capsh"',
-          'export PATH="$TMP:$PATH"',
+          ...capshNoSetpcapStub,
+          ...writeStatusFixture(QA_CAPBND),
           "drop_capabilities /usr/local/bin/fake-entrypoint",
-          // Strict mode exits 1 before this line is reached.
           'echo "SHOULD_NOT_REACH"',
           'rm -rf "$TMP"',
         ].join("\n"),
@@ -521,45 +489,100 @@ EOF
       );
       const combined = `${stdout}\n${stderr}`;
       expect(combined).toContain("Refusing to start sandbox");
-      expect(combined).toContain("CAP_SETPCAP unavailable");
-      expect(combined).toContain("NEMOCLAW_REQUIRE_CAP_DROP=1 is set");
+      expect(combined).toContain(`dangerous caps remain in bounding set (CapBnd=${QA_CAPBND}): ${QA_DANGEROUS}`);
       expect(combined).not.toContain("SHOULD_NOT_REACH");
     });
 
-    it("refuses to start when capsh is missing and NEMOCLAW_REQUIRE_CAP_DROP=1", () => {
+    it("refuses to start when REQUIRE_CAP_DROP=1 and capsh is missing", () => {
+      const { stdout, stderr } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          ...writeStatusFixture(QA_CAPBND),
+          "drop_capabilities /usr/local/bin/fake-entrypoint",
+          'echo "SHOULD_NOT_REACH"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        {
+          // Hide capsh so command -v fails, exercising the capsh-missing branch.
+          env: { PATH: "/usr/bin:/bin", NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "1" },
+          expectFail: true,
+        },
+      );
+      const combined = `${stdout}\n${stderr}`;
+      expect(combined).toContain("capsh not available");
+      expect(combined).toContain("Refusing to start sandbox");
+      expect(combined).not.toContain("SHOULD_NOT_REACH");
+    });
+
+    // Regression for the sentinel-bypass finding: a pre-set NEMOCLAW_CAPS_DROPPED=1
+    // must NOT let a host with residual caps slip past strict mode. The gate
+    // verifies the actual bounding set, so it still refuses.
+    it("refuses despite a pre-set NEMOCLAW_CAPS_DROPPED=1 when dangerous caps remain (strict)", () => {
+      const { stdout, stderr } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          ...writeStatusFixture(QA_CAPBND),
+          "drop_capabilities /usr/local/bin/fake-entrypoint",
+          'echo "BYPASSED_STRICT_MODE"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        {
+          env: { NEMOCLAW_CAPS_DROPPED: "1", NEMOCLAW_REQUIRE_CAP_DROP: "1" },
+          expectFail: true,
+        },
+      );
+      const combined = `${stdout}\n${stderr}`;
+      expect(combined).toContain("Refusing to start sandbox");
+      expect(combined).toContain("dangerous caps remain in bounding set");
+      expect(combined).not.toContain("BYPASSED_STRICT_MODE");
+    });
+
+    // Strict mode trusts the verified state, not the fall-through: if the
+    // bounding set is already clean it must NOT refuse.
+    it("continues under REQUIRE_CAP_DROP=1 when the bounding set is already clean", () => {
+      const { stdout } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          ...capshNoSetpcapStub,
+          ...writeStatusFixture(CLEAN_CAPBND),
+          "drop_capabilities /usr/local/bin/fake-entrypoint 2>&1",
+          'echo "CONTINUED_CLEAN"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        { env: { NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "1" } },
+      );
+      expect(stdout).toContain("CONTINUED_CLEAN");
+      expect(stdout).not.toContain("Refusing to start sandbox");
+    });
+
+    it("refuses under REQUIRE_CAP_DROP=1 when the bounding set cannot be verified", () => {
       const { stdout, stderr } = runWithLib(
         `
+        export NEMOCLAW_PROC_STATUS=/nonexistent/sandbox-init-status
         drop_capabilities /usr/local/bin/fake-entrypoint
         echo "SHOULD_NOT_REACH"
       `,
         {
-          // Hide capsh so command -v fails, exercising the capsh-missing branch.
           env: { PATH: "/usr/bin:/bin", NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "1" },
           expectFail: true,
         },
       );
       const combined = `${stdout}\n${stderr}`;
-      expect(combined).toContain("capsh not available");
       expect(combined).toContain("Refusing to start sandbox");
+      expect(combined).toContain("could not read bounding set");
       expect(combined).not.toContain("SHOULD_NOT_REACH");
     });
 
-    it("continues (no regression) when NEMOCLAW_REQUIRE_CAP_DROP is unset", () => {
+    it("continues (no regression) when NEMOCLAW_REQUIRE_CAP_DROP is unset even with residual caps", () => {
       const { stdout } = runWithLib(
         [
           "TMP=$(mktemp -d)",
-          'cat >"$TMP/capsh" <<\'STUB\'',
-          "#!/bin/sh",
-          '[ "$1" = "--has-p=cap_setpcap" ] && exit 1',
-          "exit 0",
-          "STUB",
-          'chmod +x "$TMP/capsh"',
-          'export PATH="$TMP:$PATH"',
+          ...capshNoSetpcapStub,
+          ...writeStatusFixture(QA_CAPBND),
           "drop_capabilities /usr/local/bin/fake-entrypoint 2>&1",
           'echo "CONTINUED_OK"',
           'rm -rf "$TMP"',
         ].join("\n"),
-        // NEMOCLAW_REQUIRE_CAP_DROP deliberately unset → default warn-and-continue.
         { env: { NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "" } },
       );
       expect(stdout).toContain("CONTINUED_OK");

From eabdbcb13620cd673fb3be144347fcdc36e2fdf6 Mon Sep 17 00:00:00 2001
From: Dongni Yang <dongniy@nvidia.com>
Date: Wed, 3 Jun 2026 20:02:43 +0800
Subject: [PATCH 3/4] fix(sandbox): address CodeRabbit review on #3280

- test: also unset NEMOCLAW_REQUIRE_CAP_DROP in the warn-and-continue
  case so it cannot inherit a strict-mode value from the runner env.
- docs: wrap the NEMOCLAW_REQUIRE_CAP_DROP paragraphs one sentence per
  line in best-practices.mdx and sandbox-hardening.mdx (docs readability).

Refs #3280
Signed-off-by: Dongni Yang <dongniy@nvidia.com>
---
 docs/deployment/sandbox-hardening.mdx | 3 ++-
 docs/security/best-practices.mdx      | 5 ++++-
 test/sandbox-init.test.ts             | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/deployment/sandbox-hardening.mdx b/docs/deployment/sandbox-hardening.mdx
index 6baceef093..b5be554d1e 100644
--- a/docs/deployment/sandbox-hardening.mdx
+++ b/docs/deployment/sandbox-hardening.mdx
@@ -39,7 +39,8 @@ It removes `CAP_SYS_ADMIN`, `CAP_SYS_PTRACE`, `CAP_NET_RAW`,
 `CAP_MKNOD`, `CAP_AUDIT_WRITE`, and `CAP_NET_BIND_SERVICE`.
 When `setpriv` is available, the entrypoint also removes the remaining privilege-separation capabilities during the switch from root to the `sandbox` and `gateway` users.
 
-The bounding-set drop is best effort: if `capsh` or `CAP_SETPCAP` is unavailable the entrypoint logs a warning and continues with the runtime-provided capability set, and if `setpriv` is unavailable it falls back to `gosu`.
+The bounding-set drop is best effort: if `capsh` or `CAP_SETPCAP` is unavailable the entrypoint logs a warning and continues with the runtime-provided capability set.
+If `setpriv` is unavailable, the entrypoint falls back to `gosu`.
 To make the drop fail-closed instead, set `NEMOCLAW_REQUIRE_CAP_DROP=1` in the entrypoint environment: the agent then refuses to start unless the agent process tree's bounding set is verified free of the dangerous capabilities.
 This is opt-in because hosts that cannot drop capabilities (no `CAP_SETPCAP` — many cloud VMs, Docker Desktop, WSL) are common, and the check covers the agent process tree only.
 
diff --git a/docs/security/best-practices.mdx b/docs/security/best-practices.mdx
index 7268ad33c4..dfc0c9347c 100644
--- a/docs/security/best-practices.mdx
+++ b/docs/security/best-practices.mdx
@@ -294,7 +294,10 @@ During `setpriv` step-down, the child process also loses `cap_setuid`, `cap_setg
 This behavior is best effort: if `capsh` is not available or `CAP_SETPCAP` is not in the bounding set, the entrypoint logs a warning and continues with the default capability set.
 If `setpriv` is unavailable, the entrypoint falls back to `gosu` and logs a warning that the remaining bounding-set capabilities were retained for the child process.
 
-To make the drop fail-closed instead of best-effort, set `NEMOCLAW_REQUIRE_CAP_DROP=1` in the entrypoint environment. The agent then refuses to start unless the agent process tree's bounding set is verified free of the dangerous capabilities, so it will not boot on hosts that cannot perform the drop (no `CAP_SETPCAP`, or `capsh` missing). This is opt-in because such hosts are common (many cloud VMs, Docker Desktop, WSL); leaving it unset preserves the best-effort default. The check covers the agent process tree only — a `nemoclaw connect` shell is spawned by the container runtime outside that tree and is not affected (tracked in [NVIDIA/OpenShell#1452](https://github.com/NVIDIA/OpenShell/issues/1452)).
+To make the drop fail-closed instead of best-effort, set `NEMOCLAW_REQUIRE_CAP_DROP=1` in the entrypoint environment.
+The agent then refuses to start unless the agent process tree's bounding set is verified free of the dangerous capabilities, so it will not boot on hosts that cannot perform the drop (no `CAP_SETPCAP`, or `capsh` missing).
+This is opt-in because such hosts are common (many cloud VMs, Docker Desktop, WSL); leaving it unset preserves the best-effort default.
+The check covers the agent process tree only — a `nemoclaw connect` shell is spawned by the container runtime outside that tree and is not affected (tracked in [NVIDIA/OpenShell#1452](https://github.com/NVIDIA/OpenShell/issues/1452)).
 
 For additional protection, pass `--cap-drop=ALL` with `docker run` or Compose (see [Sandbox Hardening](../manage-sandboxes/sandbox-hardening)).
 
diff --git a/test/sandbox-init.test.ts b/test/sandbox-init.test.ts
index 6b37794eac..5ca9ed59d2 100644
--- a/test/sandbox-init.test.ts
+++ b/test/sandbox-init.test.ts
@@ -449,7 +449,7 @@ EOF
           'echo "SANDBOX_CONTINUED_DESPITE_RESIDUAL_CAPS"',
           'rm -rf "$TMP"',
         ].join("\n"),
-        { env: { NEMOCLAW_CAPS_DROPPED: "" } },
+        { env: { NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "" } },
       );
       expect(stdout).toContain("CAP_SETPCAP not available — cannot drop bounding-set caps via capsh");
       expect(stdout).toContain(`Dangerous caps remain in bounding set: ${QA_DANGEROUS}`);

From e8880f2309ecd351e1154f0da1831f846e639b31 Mon Sep 17 00:00:00 2001
From: Dongni Yang <dongniy@nvidia.com>
Date: Wed, 3 Jun 2026 20:18:44 +0800
Subject: [PATCH 4/4] fix(sandbox): verify the full dangerous-cap drop-set in
 strict mode

Adversarial review found the strict gate verified only 5 of the 10 caps
capsh drops, so a host retaining cap_setfcap/cap_mknod/cap_sys_chroot/
cap_fsetid/cap_audit_write passed NEMOCLAW_REQUIRE_CAP_DROP=1 as "clean".

- Single DANGEROUS_CAPS list feeds both the capsh --drop arguments and
  dangerous_caps_in_capbnd, so the drop-set and verify-set cannot drift.
- Reject a non-empty but malformed CapBnd (return nonzero) and treat it
  as unverifiable: refuse in strict mode, warn in default mode, instead
  of leaking a raw bash arithmetic error.
- Tests: expect all 10 caps; add strict-refuse and default-warn cases
  for an unparseable CapBnd.
- Docs: strict mode refuses based on the verified bounding set, not on
  whether the host can perform the drop.

Refs #3280
Signed-off-by: Dongni Yang <dongniy@nvidia.com>
---
 docs/security/best-practices.mdx |  2 +-
 scripts/lib/sandbox-init.sh      | 56 +++++++++++++++++++++++++-------
 test/sandbox-init.test.ts        | 49 ++++++++++++++++++++++++++--
 3 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/docs/security/best-practices.mdx b/docs/security/best-practices.mdx
index dfc0c9347c..1c02d1d70f 100644
--- a/docs/security/best-practices.mdx
+++ b/docs/security/best-practices.mdx
@@ -295,7 +295,7 @@ This behavior is best effort: if `capsh` is not available or `CAP_SETPCAP` is no
 If `setpriv` is unavailable, the entrypoint falls back to `gosu` and logs a warning that the remaining bounding-set capabilities were retained for the child process.
 
 To make the drop fail-closed instead of best-effort, set `NEMOCLAW_REQUIRE_CAP_DROP=1` in the entrypoint environment.
-The agent then refuses to start unless the agent process tree's bounding set is verified free of the dangerous capabilities, so it will not boot on hosts that cannot perform the drop (no `CAP_SETPCAP`, or `capsh` missing).
+The agent then refuses to start unless the agent process tree's bounding set is verified free of the dangerous capabilities, so it will not boot on a host whose bounding set still holds them — typically one that cannot perform the drop (no `CAP_SETPCAP`, or `capsh` missing) and was not given a clean bounding set by the container runtime.
 This is opt-in because such hosts are common (many cloud VMs, Docker Desktop, WSL); leaving it unset preserves the best-effort default.
 The check covers the agent process tree only — a `nemoclaw connect` shell is spawned by the container runtime outside that tree and is not affected (tracked in [NVIDIA/OpenShell#1452](https://github.com/NVIDIA/OpenShell/issues/1452)).
 
diff --git a/scripts/lib/sandbox-init.sh b/scripts/lib/sandbox-init.sh
index 373549dd52..b496e555eb 100755
--- a/scripts/lib/sandbox-init.sh
+++ b/scripts/lib/sandbox-init.sh
@@ -225,6 +225,33 @@ lock_config_after_write() {
 # Usage:
 #   drop_capabilities /usr/local/bin/nemoclaw-start "$@"
 #
+# Single source of truth for the dangerous capabilities the entrypoint drops
+# and (in strict mode) verifies are gone. "bit:name" pairs; bit numbers per
+# /usr/include/linux/capability.h. Both the capsh --drop list and
+# dangerous_caps_in_capbnd() derive from this array, so the drop-set and the
+# strict-mode verify-set cannot drift apart (issue #3280).
+DANGEROUS_CAPS=(
+  "21:cap_sys_admin"
+  "19:cap_sys_ptrace"
+  "13:cap_net_raw"
+  "1:cap_dac_override"
+  "18:cap_sys_chroot"
+  "4:cap_fsetid"
+  "31:cap_setfcap"
+  "27:cap_mknod"
+  "29:cap_audit_write"
+  "10:cap_net_bind_service"
+)
+
+# Comma-separated capability names for `capsh --drop`, derived from DANGEROUS_CAPS.
+dangerous_caps_drop_list() {
+  local entry out=""
+  for entry in "${DANGEROUS_CAPS[@]}"; do
+    out="${out:+$out,}${entry#*:}"
+  done
+  printf '%s' "$out"
+}
+
 # The first argument is the absolute path to the entrypoint script to
 # re-exec via capsh. Remaining arguments are forwarded.
 drop_capabilities() {
@@ -237,7 +264,7 @@ drop_capabilities() {
     if capsh --has-p=cap_setpcap 2>/dev/null; then
       export NEMOCLAW_CAPS_DROPPED=1
       exec capsh \
-        --drop=cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_sys_chroot,cap_fsetid,cap_setfcap,cap_mknod,cap_audit_write,cap_net_bind_service \
+        --drop="$(dangerous_caps_drop_list)" \
         -- -c "exec $entrypoint \"\$@\"" -- "$@"
     fi
     # CAP_SETPCAP missing (or the exec above failed): the drop could not run.
@@ -261,15 +288,17 @@ drop_capabilities() {
 #
 # Bash arithmetic handles 64-bit ints on 64-bit platforms; CAP_LAST_CAP is ~41
 # today, well within range. Avoids a gawk-strtonum dependency.
+#
+# Returns nonzero with no output if the hex is empty or malformed, so callers
+# can treat "could not parse" the same as "could not read" instead of silently
+# treating an unparseable bounding set as clean (issue #3280).
 dangerous_caps_in_capbnd() {
   local cap_bnd_hex="$1" val entry bit name present=""
+  case "$cap_bnd_hex" in
+    "" | *[!0-9A-Fa-f]*) return 1 ;;
+  esac
   val=$((16#$cap_bnd_hex))
-  for entry in \
-    "21:cap_sys_admin" \
-    "19:cap_sys_ptrace" \
-    "13:cap_net_raw" \
-    "1:cap_dac_override" \
-    "10:cap_net_bind_service"; do
+  for entry in "${DANGEROUS_CAPS[@]}"; do
     bit="${entry%%:*}"
     name="${entry#*:}"
     if [ $(((val >> bit) & 1)) -ne 0 ]; then
@@ -307,9 +336,11 @@ enforce_cap_drop_if_required() {
   if [ -z "$cap_bnd_hex" ]; then
     # Cannot verify → in strict mode, refuse rather than assume safety.
     reason="could not read bounding set from ${status_path} — cannot verify drop"
-  else
-    present="$(dangerous_caps_in_capbnd "$cap_bnd_hex")"
-    [ -n "$present" ] && reason="dangerous caps remain in bounding set (CapBnd=${cap_bnd_hex}): ${present}"
+  elif ! present="$(dangerous_caps_in_capbnd "$cap_bnd_hex")"; then
+    # Non-empty but unparseable CapBnd is equally unverifiable → refuse.
+    reason="could not parse bounding set (CapBnd=${cap_bnd_hex}) — cannot verify drop"
+  elif [ -n "$present" ]; then
+    reason="dangerous caps remain in bounding set (CapBnd=${cap_bnd_hex}): ${present}"
   fi
   [ -n "$reason" ] || return 0
 
@@ -347,8 +378,9 @@ report_residual_capabilities() {
   fi
   echo "[SECURITY] Residual CapBnd=${cap_bnd_hex}" >&2
 
-  present="$(dangerous_caps_in_capbnd "$cap_bnd_hex")"
-  if [ -n "$present" ]; then
+  if ! present="$(dangerous_caps_in_capbnd "$cap_bnd_hex")"; then
+    echo "[SECURITY] Could not parse CapBnd=${cap_bnd_hex} — residual caps unknown" >&2
+  elif [ -n "$present" ]; then
     echo "[SECURITY] Dangerous caps remain in bounding set: ${present}" >&2
   fi
 }
diff --git a/test/sandbox-init.test.ts b/test/sandbox-init.test.ts
index 5ca9ed59d2..923ad5c3e0 100644
--- a/test/sandbox-init.test.ts
+++ b/test/sandbox-init.test.ts
@@ -416,9 +416,10 @@ EOF
     // real enforcement against a controlled bounding set without depending on
     // the test runner's own /proc/self/status. CapBnd 0x4a82c35fb is the exact
     // value hulynn decoded on the failing Colossus host.
-    const QA_CAPBND = "00000004a82c35fb"; // contains all 5 inspected dangerous caps
+    const QA_CAPBND = "00000004a82c35fb"; // contains all 10 inspected dangerous caps
     const CLEAN_CAPBND = "0000000000000000"; // none present
-    const QA_DANGEROUS = "cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_net_bind_service";
+    const QA_DANGEROUS =
+      "cap_sys_admin,cap_sys_ptrace,cap_net_raw,cap_dac_override,cap_sys_chroot,cap_fsetid,cap_setfcap,cap_mknod,cap_audit_write,cap_net_bind_service";
 
     // Stub capsh so it is found on PATH (command -v succeeds) but reports
     // CAP_SETPCAP absent, forcing the fall-through that skips the real drop.
@@ -573,6 +574,50 @@ EOF
       expect(combined).not.toContain("SHOULD_NOT_REACH");
     });
 
+    // Harden (issue #3280): a non-empty but unparseable CapBnd (corrupt /proc,
+    // CRLF fixture, future format change) must be treated as "cannot verify"
+    // — refusing in strict mode — and must NOT surface a raw bash arithmetic
+    // error. MALFORMED_CAPBND contains non-hex characters.
+    const MALFORMED_CAPBND = "00000000nothex0";
+    it("refuses under REQUIRE_CAP_DROP=1 when CapBnd is non-empty but unparseable", () => {
+      const { stdout, stderr } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          ...writeStatusFixture(MALFORMED_CAPBND),
+          "drop_capabilities /usr/local/bin/fake-entrypoint",
+          'echo "SHOULD_NOT_REACH"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        {
+          env: { PATH: "/usr/bin:/bin", NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "1" },
+          expectFail: true,
+        },
+      );
+      const combined = `${stdout}\n${stderr}`;
+      expect(combined).toContain("Refusing to start sandbox");
+      expect(combined).toContain("could not parse bounding set");
+      expect(combined).not.toContain("SHOULD_NOT_REACH");
+      // No leaked bash arithmetic error.
+      expect(combined).not.toMatch(/value too great for base|invalid arithmetic|16#/);
+    });
+
+    it("warns and continues (no abort) on an unparseable CapBnd when REQUIRE_CAP_DROP is unset", () => {
+      const { stdout } = runWithLib(
+        [
+          "TMP=$(mktemp -d)",
+          ...capshNoSetpcapStub,
+          ...writeStatusFixture(MALFORMED_CAPBND),
+          "drop_capabilities /usr/local/bin/fake-entrypoint 2>&1",
+          'echo "CONTINUED_ON_BAD_CAPBND"',
+          'rm -rf "$TMP"',
+        ].join("\n"),
+        { env: { NEMOCLAW_CAPS_DROPPED: "", NEMOCLAW_REQUIRE_CAP_DROP: "" } },
+      );
+      expect(stdout).toContain("residual caps unknown");
+      expect(stdout).toContain("CONTINUED_ON_BAD_CAPBND");
+      expect(stdout).not.toContain("Refusing to start sandbox");
+    });
+
     it("continues (no regression) when NEMOCLAW_REQUIRE_CAP_DROP is unset even with residual caps", () => {
       const { stdout } = runWithLib(
         [