Skip to content
5 changes: 4 additions & 1 deletion docs/reference/commands.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ On the Docker-driver gateway path, preflight stays read-only when it detects a s
It prints a `⚠ Gateway will be recreated when sandbox creation starts` notice and defers the actual teardown to step `[2/8] Starting OpenShell gateway`.
This means pressing `Ctrl+C` between preflight and step `[2/8]` leaves the running gateway and existing sandbox containers untouched, so `nemoclaw onboard` is safe to run just to check preflight output.
For Linux Docker-driver gateways, onboarding also checks that a helper container on the OpenShell Docker network can reach `host.openshell.internal:<gateway-port>`.
If a host firewall blocks that sandbox path, onboarding exits with a `sudo ufw allow from <subnet> to any port <gateway-port> proto tcp` command before it reports the gateway healthy.
If a host firewall blocks that sandbox path, onboarding exits with a `sudo ufw allow from <subnet> to <gateway-ip> port <gateway-port> proto tcp` command before it reports the gateway healthy.
Set `NEMOCLAW_AUTO_FIX_FIREWALL=1` to opt in to automatic UFW remediation for this specific failure: NemoClaw uses `sudo -n` only, validates the Docker bridge subnet/gateway/port, applies the narrow UFW rule only after a proven TCP reachability failure, and re-probes before continuing.
If passwordless sudo is unavailable, or UFW is not installed or not active, NemoClaw falls back to the manual guidance path without prompting for a password.
Tune the wait via `NEMOCLAW_REUSE_HEALTH_POLL_COUNT` (default `6`) and `NEMOCLAW_REUSE_HEALTH_POLL_INTERVAL` (default `5` seconds).
The poll count is clamped to a minimum of `1` so the probe always runs at least once, and the interval is clamped to a minimum of `0` (no sleep between attempts).

Expand Down Expand Up @@ -1825,6 +1827,7 @@ These flags toggle optional behaviors during onboarding; set them before running
| `NEMOCLAW_OPENSHELL_GATEWAY_BIN` | path | Advanced override for the `openshell-gateway` binary used by the Linux Docker-driver gateway. Defaults to the binary next to `openshell`, then common install paths. |
| `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. |
| `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. |
| `NEMOCLAW_AUTO_FIX_FIREWALL` | `1` to enable | Opts in to automatic UFW remediation when Linux Docker-driver sandbox containers cannot reach the host gateway after a proven TCP failure. NemoClaw runs `sudo -n` only, validates the narrow Docker bridge subnet → gateway IP:port rule before invoking UFW, re-probes after applying it, and otherwise falls back to the printed manual command. |
| `NEMOCLAW_WECHAT_QUIET` | `1` to enable | Silences the `[wechat]` diagnostic lines printed during the host-side WeChat QR login (poll status, IDC redirects, swallowed gateway errors), which are visible by default while the experimental WeChat path stabilizes; set `1` once the flow is reliable in your environment. |

### Onboard Profiling Traces
Expand Down
211 changes: 210 additions & 1 deletion src/lib/onboard/gateway-sandbox-reachability.test.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";

import {
__test,
formatSandboxBridgeUnreachableMessage,
isSandboxBridgeGatewayReachable,
tryAutoApplyUfwRule,
verifySandboxBridgeGatewayReachableOrExit,
} from "../../../dist/lib/onboard/gateway-sandbox-reachability";

describe("gateway sandbox reachability route modeling", () => {
Expand Down Expand Up @@ -466,3 +468,210 @@ describe("formatSandboxBridgeUnreachableMessage", () => {
expect(msg).not.toContain("ufw allow");
});
});

describe("tryAutoApplyUfwRule (#4265)", () => {
  /** One scripted reply for the fake runner; replies are consumed in invocation order. */
  interface ScriptedReply {
    argv: readonly string[];
    status: number;
    stdout?: string;
    stderr?: string;
  }

  /**
   * Builds a fake command runner for injection via `runImpl`.
   *
   * Every invocation's argv is copied into `recorded`. The n-th invocation is
   * answered with the n-th scripted reply; invocations past the end of the
   * script succeed with empty output. The scripted `argv` field only labels
   * the reply for the reader: it is never compared with the real argv.
   */
  function makeRunner(script: readonly ScriptedReply[]) {
    const recorded: string[][] = [];
    function runImpl(argv: readonly string[]) {
      // The reply index is the number of invocations seen before this one.
      const reply = script[recorded.length];
      recorded.push(Array.from(argv));
      return {
        status: reply?.status ?? 0,
        stdout: reply?.stdout ?? "",
        stderr: reply?.stderr ?? "",
      };
    }
    return { runImpl, recorded };
  }

  // Replies shared by the scenarios that get past the sudo / ufw checks.
  const sudoOk: ScriptedReply = { argv: ["sudo", "-n", "true"], status: 0 };
  const ufwOnPath: ScriptedReply = {
    argv: ["sudo", "-n", "which", "ufw"],
    status: 0,
    stdout: "/usr/sbin/ufw",
  };
  const ufwActive: ScriptedReply = {
    argv: ["sudo", "-n", "ufw", "status"],
    status: 0,
    stdout: "Status: active",
  };

  // Probe result for a bridge-gateway route whose TCP connection failed.
  const blockedBridge = {
    ok: false as const,
    reason: "tcp_failed" as const,
    routeKind: "bridge_gateway" as const,
    networkName: "openshell-docker",
    subnet: "172.18.0.0/16",
    gatewayIp: "172.18.0.1",
  };

  it("skips when the operator has not opted in", async () => {
    const runner = makeRunner([]);
    const outcome = await tryAutoApplyUfwRule(blockedBridge, {
      runImpl: runner.runImpl,
      optedIn: false,
    });
    expect(outcome).toEqual({ applied: false, reason: "not_opted_in" });
    expect(runner.recorded).toHaveLength(0);
  });

  it("skips when gatewayIp is unknown", async () => {
    const runner = makeRunner([]);
    const withoutGateway = { ...blockedBridge, gatewayIp: undefined };
    const outcome = await tryAutoApplyUfwRule(withoutGateway, {
      runImpl: runner.runImpl,
      optedIn: true,
    });
    expect(outcome).toEqual({ applied: false, reason: "no_subnet_or_gateway" });
    expect(runner.recorded).toHaveLength(0);
  });

  it("skips when subnet is unknown", async () => {
    const runner = makeRunner([]);
    const withoutSubnet = { ...blockedBridge, subnet: undefined };
    const outcome = await tryAutoApplyUfwRule(withoutSubnet, {
      runImpl: runner.runImpl,
      optedIn: true,
    });
    expect(outcome).toEqual({ applied: false, reason: "no_subnet_or_gateway" });
    expect(runner.recorded).toHaveLength(0);
  });

  it("rejects malformed or overly broad UFW operands before sudo", async () => {
    const runner = makeRunner([]);
    // Evaluated in order against the same runner: any-source subnet, gateway
    // outside the subnet, and a port above the valid TCP range.
    const rejectedInputs = [
      { input: { ...blockedBridge, subnet: "0.0.0.0/0" }, extra: {} },
      { input: { ...blockedBridge, gatewayIp: "172.19.0.1" }, extra: {} },
      { input: blockedBridge, extra: { port: 70000 } },
    ];
    for (const { input, extra } of rejectedInputs) {
      const outcome = await tryAutoApplyUfwRule(input, {
        runImpl: runner.runImpl,
        optedIn: true,
        ...extra,
      });
      expect(outcome.reason).toBe("invalid_rule_operand");
    }
    expect(runner.recorded).toHaveLength(0);
  });

  it("returns sudo_unavailable when passwordless sudo fails", async () => {
    const runner = makeRunner([{ argv: ["sudo", "-n", "true"], status: 1 }]);
    const outcome = await tryAutoApplyUfwRule(blockedBridge, {
      runImpl: runner.runImpl,
      optedIn: true,
    });
    expect(outcome.reason).toBe("sudo_unavailable");
  });

  it("returns ufw_missing when ufw is not on PATH", async () => {
    const runner = makeRunner([
      sudoOk,
      { argv: ["sudo", "-n", "which", "ufw"], status: 1 },
    ]);
    const outcome = await tryAutoApplyUfwRule(blockedBridge, {
      runImpl: runner.runImpl,
      optedIn: true,
    });
    expect(outcome.reason).toBe("ufw_missing");
  });

  it("returns ufw_inactive when status reports inactive", async () => {
    const runner = makeRunner([
      sudoOk,
      ufwOnPath,
      { argv: ["sudo", "-n", "ufw", "status"], status: 0, stdout: "Status: inactive" },
    ]);
    const outcome = await tryAutoApplyUfwRule(blockedBridge, {
      runImpl: runner.runImpl,
      optedIn: true,
    });
    expect(outcome.reason).toBe("ufw_inactive");
  });

  it("returns ufw_rule_rejected when ufw exits non-zero on apply", async () => {
    const runner = makeRunner([
      sudoOk,
      ufwOnPath,
      ufwActive,
      { argv: [], status: 1, stderr: "ufw: rule rejected" },
    ]);
    const outcome = await tryAutoApplyUfwRule(blockedBridge, {
      runImpl: runner.runImpl,
      optedIn: true,
      port: 8080,
    });
    expect(outcome.reason).toBe("ufw_rule_rejected");
    expect(outcome.detail).toContain("rule rejected");
  });

  it("applies the narrow allow rule on the happy path", async () => {
    const runner = makeRunner([
      sudoOk,
      ufwOnPath,
      ufwActive,
      { argv: [], status: 0, stdout: "Rule added" },
    ]);
    const outcome = await tryAutoApplyUfwRule(blockedBridge, {
      runImpl: runner.runImpl,
      optedIn: true,
      port: 8080,
    });
    expect(outcome).toEqual({ applied: true, reason: "applied", detail: "Rule added" });
    // The fourth invocation is the rule application itself.
    const expectedAllowArgv = [
      "sudo", "-n", "ufw", "allow",
      "from", "172.18.0.0/16",
      "to", "172.18.0.1",
      "port", "8080", "proto", "tcp",
    ];
    expect(runner.recorded[3]).toEqual(expectedAllowArgv);
  });
});

describe("verifySandboxBridgeGatewayReachableOrExit UFW auto-apply (#4265)", () => {
  // Probe result for a bridge-gateway route whose TCP connection failed; this
  // is the only failure shape for which the verifier attempts auto-apply.
  const tcpFailure = {
    ok: false as const,
    reason: "tcp_failed" as const,
    routeKind: "bridge_gateway" as const,
    networkName: "openshell-docker",
    subnet: "172.18.0.0/16",
    gatewayIp: "172.18.0.1",
  };

  // Every test restores its console spies in `finally`, so a failing assertion
  // or an unexpected rejection cannot leave console methods mocked for the
  // tests that run afterwards.

  it("does not auto-apply UFW when the bridge-gateway probe is unavailable", async () => {
    const autoApplyImpl = vi.fn();
    const warn = vi.spyOn(console, "warn").mockImplementation(() => undefined);
    try {
      await verifySandboxBridgeGatewayReachableOrExit(false, {
        autoApplyImpl,
        autoApplyOptedInImpl: () => true,
        reachabilityImpl: () => ({
          ...tcpFailure,
          reason: "probe_unavailable",
          detail: "nc: bad address 'host.openshell.internal'",
        }),
      });
      expect(autoApplyImpl).not.toHaveBeenCalled();
      expect(warn).toHaveBeenCalledWith(expect.stringContaining("Could not verify"));
    } finally {
      warn.mockRestore();
    }
  });

  it("re-probes and returns cleanly after a successful UFW apply", async () => {
    // First probe fails, second probe (after the rule is applied) succeeds.
    const reachabilityImpl = vi
      .fn()
      .mockResolvedValueOnce(tcpFailure)
      .mockResolvedValueOnce({ ...tcpFailure, ok: true, reason: "ok" });
    const autoApplyImpl = vi.fn().mockReturnValue({ applied: true, reason: "applied" });
    const log = vi.spyOn(console, "log").mockImplementation(() => undefined);
    try {
      await verifySandboxBridgeGatewayReachableOrExit(true, {
        autoApplyImpl,
        autoApplyOptedInImpl: () => true,
        reachabilityImpl,
      });
      expect(autoApplyImpl).toHaveBeenCalledWith(tcpFailure);
      expect(reachabilityImpl).toHaveBeenCalledTimes(2);
      expect(log).toHaveBeenCalledWith(expect.stringContaining("Applied UFW rule"));
    } finally {
      log.mockRestore();
    }
  });

  it("falls back to the manual message when apply succeeds but the re-probe still fails", async () => {
    const reachabilityImpl = vi.fn().mockResolvedValue(tcpFailure);
    const autoApplyImpl = vi.fn().mockReturnValue({ applied: true, reason: "applied" });
    const log = vi.spyOn(console, "log").mockImplementation(() => undefined);
    const error = vi.spyOn(console, "error").mockImplementation(() => undefined);
    try {
      await expect(
        verifySandboxBridgeGatewayReachableOrExit(false, {
          autoApplyImpl,
          autoApplyOptedInImpl: () => true,
          reachabilityImpl,
        }),
      ).rejects.toThrow("sandbox-bridge unreachable");
      expect(reachabilityImpl).toHaveBeenCalledTimes(2);
      expect(error).toHaveBeenCalledWith(expect.stringContaining("ufw allow"));
    } finally {
      log.mockRestore();
      error.mockRestore();
    }
  });

  it("does not warn for unsupported UFW environments when auto-apply is opted in", async () => {
    const warn = vi.spyOn(console, "warn").mockImplementation(() => undefined);
    const error = vi.spyOn(console, "error").mockImplementation(() => undefined);
    try {
      await expect(
        verifySandboxBridgeGatewayReachableOrExit(false, {
          autoApplyImpl: () => ({ applied: false, reason: "ufw_inactive" }),
          autoApplyOptedInImpl: () => true,
          reachabilityImpl: () => tcpFailure,
        }),
      ).rejects.toThrow("sandbox-bridge unreachable");
      expect(warn).not.toHaveBeenCalled();
      expect(error).toHaveBeenCalledWith(expect.stringContaining("ufw allow"));
    } finally {
      warn.mockRestore();
      error.mockRestore();
    }
  });
});
56 changes: 53 additions & 3 deletions src/lib/onboard/gateway-sandbox-reachability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ import {
ensureProbeImageCached,
isDockerDaemonUnreachable,
} from "./preflight";
import type { UfwAutoApplyResult } from "./ufw-auto-apply";
import { isUfwAutoApplyOptedIn, tryAutoApplyUfwRule } from "./ufw-auto-apply";

export type { UfwAutoApplyOptions, UfwAutoApplyResult } from "./ufw-auto-apply";
export { tryAutoApplyUfwRule } from "./ufw-auto-apply";

const DEFAULT_PROBE_IMAGE =
"busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662";
Expand Down Expand Up @@ -461,18 +466,63 @@ export function formatSandboxBridgeUnreachableMessage(
].join("\n");
}

/**
 * Options for `verifySandboxBridgeGatewayReachableOrExit`.
 *
 * Every member is optional. The `*Impl` members replace the verifier's
 * default collaborators, which lets callers such as tests substitute them.
 */
interface SandboxBridgeVerifierOptions {
/** When truthy, the verifier logs that the probe is skipped and returns without probing. */
skip?: boolean;
/** Gateway port used for the rule and the failure message; falls back to `GATEWAY_PORT` when omitted. */
port?: number;
/** Reachability probe; falls back to `isSandboxBridgeGatewayReachable`. May be sync or async. */
reachabilityImpl?: () => Promise<SandboxBridgeReachabilityResult> | SandboxBridgeReachabilityResult;
/**
 * Applies the firewall rule for a failed probe result; falls back to
 * `tryAutoApplyUfwRule` called with `optedIn: true` and the resolved port.
 * May be sync or async.
 */
autoApplyImpl?: (
reach: SandboxBridgeReachabilityResult,
) => Promise<UfwAutoApplyResult> | UfwAutoApplyResult;
/** Reports whether the operator opted in to auto-apply; falls back to `isUfwAutoApplyOptedIn`. */
autoApplyOptedInImpl?: () => boolean;
}

/**
 * Auto-apply outcomes that do not trigger the "could not auto-apply UFW rule"
 * warning: for these reasons the verifier proceeds directly to the manual
 * instructions.
 */
const SILENT_UFW_AUTO_APPLY_REASONS = new Set<UfwAutoApplyResult["reason"]>([
"not_opted_in",
"ufw_missing",
"ufw_inactive",
]);

export async function verifySandboxBridgeGatewayReachableOrExit(
exitOnFailure: boolean,
options: { skip?: boolean } = {},
options: SandboxBridgeVerifierOptions = {},
): Promise<void> {
if (options.skip) {
console.log(" Docker-driver GPU host networking active; skipping sandbox bridge gateway reachability probe.");
return;
}
const reach = await isSandboxBridgeGatewayReachable();
const port = options.port ?? GATEWAY_PORT;
const reachability = options.reachabilityImpl ?? isSandboxBridgeGatewayReachable;
const autoApplyOptedIn = options.autoApplyOptedInImpl ?? isUfwAutoApplyOptedIn;
const autoApply =
options.autoApplyImpl ??
((result: SandboxBridgeReachabilityResult) => tryAutoApplyUfwRule(result, { optedIn: true, port }));

let reach = await reachability();
if (reach.ok) return;

const message = formatSandboxBridgeUnreachableMessage(reach);
// #4265: when operator opts in and the probe proved a bridge TCP failure,
// try to auto-apply the firewall rule and re-probe before surfacing the
// manual-fix message. Do not mutate firewall state for probe helper/DNS
// failures, even if route metadata is present.
if (reach.routeKind === "bridge_gateway" && reach.reason === "tcp_failed" && autoApplyOptedIn()) {
const autoApplyResult = await autoApply(reach);
if (autoApplyResult.applied) {
const ruleDescription = reach.subnet && reach.gatewayIp
? `allow from ${reach.subnet} to ${reach.gatewayIp}:${port}/tcp`
: `allow sandbox bridge traffic to port ${port}/tcp`;
console.log(
` ✓ Applied UFW rule (NEMOCLAW_AUTO_FIX_FIREWALL=1): ${ruleDescription}`,
);
reach = await reachability();
if (reach.ok) return;
} else if (!SILENT_UFW_AUTO_APPLY_REASONS.has(autoApplyResult.reason)) {
console.warn(
` ⚠ NEMOCLAW_AUTO_FIX_FIREWALL=1 set but could not auto-apply UFW rule (${autoApplyResult.reason}${autoApplyResult.detail ? `: ${autoApplyResult.detail}` : ""}); falling back to manual instructions.`,
);
}
}

const message = formatSandboxBridgeUnreachableMessage(reach, port);
if (reach.reason === "probe_unavailable") {
console.warn(message);
return;
Expand Down
Loading
Loading