diff --git a/src/lib/actions/sandbox/auto-pair-approval.ts b/src/lib/actions/sandbox/auto-pair-approval.ts index 5904731d99..e08dca2cdb 100644 --- a/src/lib/actions/sandbox/auto-pair-approval.ts +++ b/src/lib/actions/sandbox/auto-pair-approval.ts @@ -41,9 +41,26 @@ import { ROOT } from "../../state/paths"; // Bound the in-sandbox work: 2s list + 1s × MAX_APPROVALS attempts plus // shell/python startup slack fits inside the outer spawnSync cap, so a wedged -// sandbox can never block the caller. +// sandbox can never block the caller. These are the defaults for the doctor +// recovery surface (#4616), which batch-clears any backlog of pending upgrades. export const AUTO_PAIR_MAX_APPROVALS = 8; export const AUTO_PAIR_APPROVAL_TIMEOUT_MS = 12_000; +// Default per-call budgets (seconds) for the in-sandbox openclaw subcommands. +const AUTO_PAIR_LIST_TIMEOUT_S = 2; +const AUTO_PAIR_APPROVE_TIMEOUT_S = 1; + +// Per-surface budget overrides. The connect/probe/finalization surfaces (#4504) +// supply a tighter budget — a single realistic pending CLI/webchat scope +// upgrade (maxApprovals = 1) on the watcher's 10s approve budget with a 15s +// outer cap — via ./connect-autopair-budget. The doctor surface (#4616) uses +// the defaults above to drain a backlog. Callers that omit a field inherit the +// default, so the historical doctor payload stays byte-stable. +export type AutoPairApprovalBudget = { + maxApprovals?: number; + listTimeoutS?: number; + approveTimeoutS?: number; + timeoutMs?: number; +}; const AUTO_PAIR_POLICY_PATH = path.join( ROOT, @@ -102,11 +119,14 @@ export function readAutoPairApprovalPolicyModule(): string | null { */ export function buildAutoPairApprovalScript( approvalPolicyModuleB64: string, - options: { emitSummary?: boolean } = {}, + options: { emitSummary?: boolean; budget?: AutoPairApprovalBudget } = {}, ): string { const summaryLine = options.emitSummary ? "print(f'__NEMOCLAW_AUTO_PAIR_APPROVED__={approved_count}')\n" : ""; + const maxApprovals = options.budget?.maxApprovals ?? AUTO_PAIR_MAX_APPROVALS; + const listTimeoutS = options.budget?.listTimeoutS ?? AUTO_PAIR_LIST_TIMEOUT_S; + const approveTimeoutS = options.budget?.approveTimeoutS ?? AUTO_PAIR_APPROVE_TIMEOUT_S; return ` PROXY_ENV=/tmp/nemoclaw-proxy-env.sh [ -r "$PROXY_ENV" ] && . "$PROXY_ENV" @@ -131,12 +151,12 @@ except Exception: sys.exit(0) OPENCLAW = os.environ.get('OPENCLAW_BIN', 'openclaw') -MAX_APPROVALS = ${AUTO_PAIR_MAX_APPROVALS} +MAX_APPROVALS = ${maxApprovals} try: proc = subprocess.run( [OPENCLAW, 'devices', 'list', '--json'], - capture_output=True, text=True, timeout=2, + capture_output=True, text=True, timeout=${listTimeoutS}, ) except (subprocess.TimeoutExpired, FileNotFoundError, OSError): sys.exit(0) @@ -171,7 +191,7 @@ for device in pending: try: approve_proc = subprocess.run( [OPENCLAW, 'devices', 'approve', request_id, '--json'], - capture_output=True, text=True, timeout=1, env=approve_env, + capture_output=True, text=True, timeout=${approveTimeoutS}, env=approve_env, ) if approve_proc.returncode == 0: approved_count += 1 @@ -194,7 +214,7 @@ exit 0 */ export function runSandboxAutoPairApprovalPass( sandboxName: string, - options: { capture?: boolean } = {}, + options: { capture?: boolean; budget?: AutoPairApprovalBudget } = {}, ): AutoPairApprovalResult { const capture = options.capture === true; const approvalPolicyModule = readAutoPairApprovalPolicyModule(); @@ -202,7 +222,11 @@ export function runSandboxAutoPairApprovalPass( return { attempted: false, reported: false, approved: 0 }; } const approvalPolicyModuleB64 = Buffer.from(approvalPolicyModule, "utf-8").toString("base64"); - const script = buildAutoPairApprovalScript(approvalPolicyModuleB64, { emitSummary: capture }); + const script = buildAutoPairApprovalScript(approvalPolicyModuleB64, { + emitSummary: capture, + budget: options.budget, + }); + const outerTimeoutMs = options.budget?.timeoutMs ?? AUTO_PAIR_APPROVAL_TIMEOUT_MS; // Lazy require: `adapters/openshell/runtime` pulls in `runner`, whose // load-time `require("./platform")` cannot be resolved by the Vitest TS // loader. Importing it here keeps this module unit-testable in-process. @@ -217,7 +241,7 @@ export function runSandboxAutoPairApprovalPass( env: process.env, stdio: capture ? ["ignore", "pipe", "pipe"] : ["ignore", "ignore", "ignore"], encoding: "utf-8", - timeout: AUTO_PAIR_APPROVAL_TIMEOUT_MS, + timeout: outerTimeoutMs, }, ); if (!capture) { diff --git a/src/lib/actions/sandbox/connect-autopair-budget.ts b/src/lib/actions/sandbox/connect-autopair-budget.ts new file mode 100644 index 0000000000..0f4af25364 --- /dev/null +++ b/src/lib/actions/sandbox/connect-autopair-budget.ts @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Budget constants for the connect-time auto-pair scope-approval pass +// (runConnectAutoPairApprovalPass in ./connect). Kept in a dependency-free leaf +// module so tests can import and assert the invariant on the real values +// without pulling in connect.ts's heavy transitive requires (#4504). + +export const CONNECT_AUTO_PAIR_MAX_APPROVALS = 1; +// `openclaw devices list` budget (seconds), interpolated into the in-sandbox +// script so the invariant below is asserted on real values, not source text. +export const CONNECT_AUTO_PAIR_LIST_TIMEOUT_S = 2; +// `openclaw devices approve` budget (seconds); matches the in-sandbox watcher's +// RUN_TIMEOUT_SECS = 10 (nemoclaw-start.sh). +export const CONNECT_AUTO_PAIR_APPROVE_TIMEOUT_S = 10; +// Outer spawnSync cap (ms). Must exceed the internal worst case +// (CONNECT_AUTO_PAIR_LIST_TIMEOUT_S + CONNECT_AUTO_PAIR_APPROVE_TIMEOUT_S × +// CONNECT_AUTO_PAIR_MAX_APPROVALS) PLUS shell/python startup, since the outer +// timer starts at `sh` spawn before the proxy env is sourced and python3 +// launches; the ~3s slack means a legitimate slow approve is never SIGKILLed +// mid-loop, which would strand the allowlisted request. +export const CONNECT_AUTO_PAIR_TIMEOUT_MS = 15_000; diff --git a/src/lib/actions/sandbox/connect.ts b/src/lib/actions/sandbox/connect.ts index 970bf4e165..4982f9fa9e 100644 --- a/src/lib/actions/sandbox/connect.ts +++ b/src/lib/actions/sandbox/connect.ts @@ -45,6 +45,12 @@ import { } from "../../state/sandbox-session"; import { runSetupDnsProxy } from "../dns"; import { runSandboxAutoPairApprovalPass } from "./auto-pair-approval"; +import { + CONNECT_AUTO_PAIR_APPROVE_TIMEOUT_S, + CONNECT_AUTO_PAIR_LIST_TIMEOUT_S, + CONNECT_AUTO_PAIR_MAX_APPROVALS, + CONNECT_AUTO_PAIR_TIMEOUT_MS, +} from "./connect-autopair-budget"; import { preflightVllmModelEnvOrExit } from "./connect-vllm-preflight"; import { isDockerRuntimeDown, @@ -205,6 +211,10 @@ function runSandboxConnectProbe(sandboxName: string): void { } if (processCheck.wasRunning) { ensureSandboxInferenceRoute(sandboxName, { quiet: true }); + // Defense-in-depth scope-upgrade approval on the probe-only / `recover` + // path (#4504): the gateway is up, so deterministically clear any pending + // allowlisted CLI/webchat scope upgrade. Best-effort; never throws. + runConnectAutoPairApprovalPass(sandboxName); if (processCheck.forwardRecovered) { console.log( ` Probe complete: ${agentName} gateway is running in '${sandboxName}'; restored dashboard port forward.`, @@ -216,6 +226,8 @@ function runSandboxConnectProbe(sandboxName: string): void { } if (processCheck.recovered) { ensureSandboxInferenceRoute(sandboxName, { quiet: true }); + // Same defense-in-depth approval after a recovery (#4504); best-effort. + runConnectAutoPairApprovalPass(sandboxName); console.log(` Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`); return; } @@ -751,6 +763,29 @@ function ensureSandboxInferenceRouteOrExit( return result.sandbox; } +// Connect/probe/finalization budget for the shared auto-pair approval pass +// (#4504). The realistic case here is a single pending CLI/webchat scope +// upgrade, so MAX_APPROVALS is 1 and the approve timeout matches the in-sandbox +// watcher's RUN_TIMEOUT_SECS = 10 (nemoclaw-start.sh). The outer spawnSync cap +// (15s) exceeds the internal worst case (2s list + 10s × 1 = 12s) plus +// shell/python startup so a legitimate slow approve is never SIGKILLed mid-loop +// and the allowlisted request is never stranded. Constants live in the +// dependency-free ./connect-autopair-budget leaf so tests can assert the +// invariant on the real values without importing this heavy module. The doctor +// recovery surface (#4616) keeps the wider default budget in ./auto-pair-approval. +const CONNECT_AUTO_PAIR_BUDGET = { + maxApprovals: CONNECT_AUTO_PAIR_MAX_APPROVALS, + listTimeoutS: CONNECT_AUTO_PAIR_LIST_TIMEOUT_S, + approveTimeoutS: CONNECT_AUTO_PAIR_APPROVE_TIMEOUT_S, + timeoutMs: CONNECT_AUTO_PAIR_TIMEOUT_MS, +} as const; + +// Thin wrapper so the connect/probe/finalization surfaces share one budget +// without each caller restating it. Best-effort; never throws (#4263/#4504). +export function runConnectAutoPairApprovalPass(sandboxName: string): void { + runSandboxAutoPairApprovalPass(sandboxName, { budget: CONNECT_AUTO_PAIR_BUDGET }); +} + function maybeEnsureHermesToolGatewayBroker(sb: SandboxEntry | null): void { if ( !sb || @@ -976,8 +1011,8 @@ export async function connectSandbox( // piled up between startup and now (e.g., watcher crashed, watcher // deadline exhausted, multi-sandbox gateway contention). The same pass // is reachable without SSH via `doctor --fix` for dashboard-only users - // (#4616). - runSandboxAutoPairApprovalPass(sandboxName); + // (#4616). Uses the tight connect budget (#4504). + runConnectAutoPairApprovalPass(sandboxName); // Print a one-shot hint before dropping the user into the sandbox // shell so a fresh user knows the first thing to type. Without this, diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index f0812bce92..ce02910b75 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -504,6 +504,7 @@ import { type MessagingChannelConfig, readMessagingChannelConfigFromEnv, } from "./messaging-channel-config"; +import { finalizationHandlerDeps } from "./onboard/finalization-deps"; import { streamGatewayStart } from "./onboard/gateway"; import { mergeRequiredHermesToolGatewayPolicyPresets, @@ -6539,11 +6540,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), removeLegacyCredentialsFile, cleanupStaleHostFiles, - checkAndRecoverSandboxProcesses: (name, options) => { - const processRecovery: typeof import("./actions/sandbox/process-recovery") = - require("./actions/sandbox/process-recovery"); - processRecovery.checkAndRecoverSandboxProcesses(name, options); - }, + ...finalizationHandlerDeps, getChatUiUrl: () => process.env.CHAT_UI_URL || `http://127.0.0.1:${DASHBOARD_PORT}`, buildVerifyChain: (chatUiUrl) => // biome-ignore format: keep src/lib/onboard.ts net-neutral for growth guardrail. diff --git a/src/lib/onboard/finalization-deps.ts b/src/lib/onboard/finalization-deps.ts new file mode 100644 index 0000000000..b9e63a105a --- /dev/null +++ b/src/lib/onboard/finalization-deps.ts @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Lazy-require runtime dependencies for the onboarding finalization handler. +// Kept in a focused module under src/lib/onboard/ so the top-level onboard +// entrypoint stays lean (codebase-growth-guardrails). The lazy `require` calls +// avoid an import cycle: connect.ts and process-recovery.ts both pull in +// onboard helpers, so they must not be statically imported here. +export const finalizationHandlerDeps = { + checkAndRecoverSandboxProcesses(name: string, options: { quiet: boolean }): void { + const processRecovery: typeof import("../actions/sandbox/process-recovery") = + require("../actions/sandbox/process-recovery"); + processRecovery.checkAndRecoverSandboxProcesses(name, options); + }, + // Best-effort device-approval sweep that clears pending allowlisted + // CLI/webchat scope upgrades so onboard hands off without a stuck pairing + // request (#4504). Never throws. + autoPairScopeApproval(name: string): void { + const connect: typeof import("../actions/sandbox/connect") = + require("../actions/sandbox/connect"); + connect.runConnectAutoPairApprovalPass(name); + }, +}; diff --git a/src/lib/onboard/machine/handlers/finalization.test.ts b/src/lib/onboard/machine/handlers/finalization.test.ts index b6cfec8202..438ae3f3b9 100644 --- a/src/lib/onboard/machine/handlers/finalization.test.ts +++ b/src/lib/onboard/machine/handlers/finalization.test.ts @@ -18,6 +18,7 @@ function createDeps(overrides: Partial "http://127.0.0.1:18789"), buildChain: vi.fn(() => ({ port: 18789 })), verify: vi.fn(async () => ({ ok: true })), @@ -37,6 +38,7 @@ function createDeps(overrides: Partial { callsOn.verify.mock.invocationCallOrder[0], ); }); + + // Scenario A (#4504): the auto-pair scope-approval sweep runs against the + // freshly-recovered gateway — strictly after process recovery (which can + // restart the gateway, #3573) and strictly before deployment verification + // (so the gateway state is settled before we probe it). + it("runs the auto-pair scope-approval sweep after process recovery and before verify (#4504)", async () => { + const { deps, calls } = createDeps(); + + await handleFinalizationState(baseOptions(deps)); + + expect(calls.autoPairScopeApproval).toHaveBeenCalledOnce(); + expect(calls.autoPairScopeApproval).toHaveBeenCalledWith("my-assistant"); + // Ordering: recover → autoPairScopeApproval → verify. + expect(calls.autoPairScopeApproval.mock.invocationCallOrder[0]).toBeGreaterThan( + calls.recoverProcesses.mock.invocationCallOrder[0], + ); + expect(calls.autoPairScopeApproval.mock.invocationCallOrder[0]).toBeLessThan( + calls.verify.mock.invocationCallOrder[0], + ); + }); + + // Scenario B (#4504): the sweep is agent-agnostic — the stuck CLI/webchat + // scope upgrade can occur regardless of which agent the sandbox runs. + it("runs the scope-approval sweep regardless of agent type (#4504)", async () => { + const { deps, calls } = createDeps(); + const agent = { name: "hermes" }; + + await handleFinalizationState({ ...baseOptions(deps), agent }); + + expect(calls.autoPairScopeApproval).toHaveBeenCalledWith("my-assistant"); + }); + + // Scenario C (#4504): the dep is documented as best-effort / never-throws and + // the handler wraps no try/catch around it. Per the contract we assert the + // implemented behavior: the sweep is invoked and, because it returns cleanly, + // finalization proceeds to completion (recordSessionComplete still runs). A + // dep that threw would abort finalization here — the regression this guards. + it("treats the scope-approval sweep as best-effort and still completes the session (#4504)", async () => { + const { deps, calls } = createDeps(); + + const result = await handleFinalizationState(baseOptions(deps)); + + expect(calls.autoPairScopeApproval).toHaveBeenCalledOnce(); + // The non-throwing sweep does not abort finalization: it proceeds through + // verification and the dashboard print to a completed result. (#4472 moved + // session completion to the imported completeOnboardMachine, so completion + // is asserted via the downstream dashboard + diagnostics rather than a dep.) + expect(calls.dashboard).toHaveBeenCalledOnce(); + expect(result.verificationDiagnostics).toEqual([" ✓ verified"]); + }); }); diff --git a/src/lib/onboard/machine/handlers/finalization.ts b/src/lib/onboard/machine/handlers/finalization.ts index a89d0920c9..d0ccf0750f 100644 --- a/src/lib/onboard/machine/handlers/finalization.ts +++ b/src/lib/onboard/machine/handlers/finalization.ts @@ -28,6 +28,14 @@ export interface FinalizationStateOptions; @@ -94,6 +102,10 @@ export async function handleFinalizationState { + if (!call.includes("--")) return false; + const inner = decodeWrappedSandboxScript(call[call.length - 1] || ""); + return inner.includes("openclaw") && inner.includes("devices") && inner.includes("approve"); + }); +} + describe("sandbox connect auto-pair approval pass (#4263)", () => { it( "runs a bounded openclaw devices approval pass before opening SSH", @@ -79,13 +96,13 @@ describe("sandbox connect auto-pair approval pass (#4263)", () => { const result = runConnect(tmpDir, sandboxName); expect(result.status).toBe(0); const script = extractApprovalPassScript(stateFile, sandboxName); + // Disallowed/malformed/unknown requests are skipped by the policy before + // an approve is even attempted (they `continue` before the attempt + // counter increments), so they do not consume the MAX_APPROVALS=1 budget + // (#4504). They are ordered first here to prove the rejection path runs; + // the single allowed request (`ok-cli`) is then approved and exhausts the + // one-attempt budget, so the trailing duplicate `ok-cli` is never reached. const run = runApprovalPassScript(script, [ - { - requestId: "ok-cli", - clientId: "openclaw-cli", - clientMode: "cli", - scopes: ["operator.read", "operator.write"], - }, { requestId: "admin-cli", clientId: "openclaw-cli", @@ -105,22 +122,24 @@ describe("sandbox connect auto-pair approval pass (#4263)", () => { scopes: ["operator.read"], }, { - requestId: "dedupe-cli", + requestId: "ok-cli", clientId: "openclaw-cli", clientMode: "cli", - requestedScopes: ["operator.read"], + scopes: ["operator.read", "operator.write"], }, { - requestId: "dedupe-cli", + requestId: "ok-cli", clientId: "openclaw-cli", clientMode: "cli", - requestedScopes: ["operator.read"], + scopes: ["operator.read", "operator.write"], }, ]); expect(run.result.status).toBe(0); - expect(run.approvals).toEqual(["ok-cli", "dedupe-cli"]); - expect(run.approvalEnv).toEqual(["unset:unset:unset", "unset:unset:unset"]); + // Only the first allowed request is approved — MAX_APPROVALS is 1 (#4504), + // the realistic single pending CLI/webchat scope upgrade. + expect(run.approvals).toEqual(["ok-cli"]); + expect(run.approvalEnv).toEqual(["unset:unset:unset"]); }, ); @@ -221,3 +240,204 @@ describe("sandbox connect auto-pair approval pass (#4263)", () => { }, ); }); + +// The #4504 fix also wires the approval pass into the `nemoclaw recover` / +// `connect --probe-only` path (defect A) — the gateway-up branches only, never +// the gateway-down failure exit — and re-uses the shared policy's +// gateway_approval_env so a scope-upgrade approve drops the full gateway env +// triplet (#4462) on the watcher's 10s budget while staying within the outer +// spawnSync cap (defect B). The interactive-connect cases above cover the +// allowlist and best-effort semantics; these add the probe-path wiring, +// gateway-down negative, and the budget invariant on the real constants. +describe("sandbox connect scope-upgrade approval on recover/probe (#4504)", () => { + it( + "runs the approval pass on the --probe-only (recover) path", + testTimeoutOptions(20_000), + () => { + // The probe takes the wasRunning branch (the fake openshell health probe + // reports RUNNING), so the recover path must run the sweep — and must NOT + // open an SSH session. + const { tmpDir, stateFile, sandboxName } = setupFixture( + { + name: "probe-approval-sandbox", + model: "claude-sonnet-4-20250514", + provider: "anthropic-prod", + gpuEnabled: false, + policies: [], + }, + "anthropic-prod", + "claude-sonnet-4-20250514", + ); + + const result = runConnect(tmpDir, sandboxName, {}, ["--probe-only"]); + expect(result.status).toBe(0); + + const state = JSON.parse(fs.readFileSync(stateFile, "utf-8")); + const approvalExec = findApprovalExec(state.sandboxExecCalls as string[][]); + expect(approvalExec).toBeDefined(); + expect(approvalExec).toContain("sandbox"); + expect(approvalExec).toContain("exec"); + expect(approvalExec).toContain("--name"); + expect(approvalExec).toContain(sandboxName); + // probe-only never opens an SSH connect session. + expect(state.sandboxConnectCalls).toEqual([]); + }, + ); + + it( + "does not fail the recover path when the probe approval pass errors", + testTimeoutOptions(20_000), + () => { + // Best-effort: even when the in-sandbox approval exec exits non-zero, the + // probe-only flow must still succeed. + const { tmpDir, stateFile, sandboxName } = setupFixture( + { + name: "probe-approval-tolerant", + model: "claude-sonnet-4-20250514", + provider: "anthropic-prod", + gpuEnabled: false, + policies: [], + }, + "anthropic-prod", + "claude-sonnet-4-20250514", + ); + + const result = runConnect( + tmpDir, + sandboxName, + { NEMOCLAW_TEST_FAIL_APPROVAL_PASS: "1" }, + ["--probe-only"], + ); + expect(result.status).toBe(0); + + const state = JSON.parse(fs.readFileSync(stateFile, "utf-8")); + const approvalExec = findApprovalExec(state.sandboxExecCalls as string[][]); + expect(approvalExec).toBeDefined(); + }, + ); + + it( + "does not run the approval pass when the probe fails (gateway down, recovery fails)", + testTimeoutOptions(20_000), + () => { + // The sweep is wired only into the wasRunning and recovered success + // branches — never the not-running failure exit, where the gateway is + // down. Force the health probe to report STOPPED and let recovery fail so + // the probe lands on the failure branch; the approval pass must NOT run. + const { tmpDir, stateFile, sandboxName } = setupFixture( + { + name: "probe-gateway-down", + model: "claude-sonnet-4-20250514", + provider: "anthropic-prod", + gpuEnabled: false, + policies: [], + }, + "anthropic-prod", + "claude-sonnet-4-20250514", + ); + + const result = runConnect( + tmpDir, + sandboxName, + { NEMOCLAW_TEST_GATEWAY_DOWN: "1" }, + ["--probe-only"], + ); + expect(result.status).toBe(1); + + const state = JSON.parse(fs.readFileSync(stateFile, "utf-8")); + const approvalExec = findApprovalExec(state.sandboxExecCalls as string[][]); + expect(approvalExec).toBeUndefined(); + // And it never opens an SSH session on the failure path. + expect(state.sandboxConnectCalls).toEqual([]); + }, + ); + + it( + "approve child strips the full gateway env triplet on the probe path (#4462)", + testTimeoutOptions(20_000), + () => { + // The probe-path approve must drop OPENCLAW_GATEWAY_URL/_PORT/_TOKEN via + // the shared policy's gateway_approval_env so the local pairing fallback + // cannot re-pin to the gateway and hit the #4462 self-defeat. Render the + // probe-path script, then actually run it and assert the approve child + // saw none of the triplet. + const { tmpDir, stateFile, sandboxName } = setupFixture( + { + name: "probe-env-strip-sandbox", + model: "claude-sonnet-4-20250514", + provider: "anthropic-prod", + gpuEnabled: false, + policies: [], + }, + "anthropic-prod", + "claude-sonnet-4-20250514", + ); + + const result = runConnect(tmpDir, sandboxName, {}, ["--probe-only"]); + expect(result.status).toBe(0); + + const script = extractApprovalPassScript(stateFile, sandboxName); + expect(script).toContain("approve_env = gateway_approval_env(os.environ)"); + expect(script).toContain("env=approve_env"); + + const run = runApprovalPassScript(script, [ + { + requestId: "probe-cli", + clientId: "openclaw-cli", + clientMode: "cli", + scopes: ["operator.read", "operator.write"], + }, + ]); + expect(run.result.status).toBe(0); + expect(run.approvals).toEqual(["probe-cli"]); + // The approve child saw none of the gateway env triplet (#4462). + expect(run.approvalEnv).toEqual(["unset:unset:unset"]); + }, + ); + + it( + "approve timeout matches the watcher (10s), list keeps 2s, and stays within the outer cap", + testTimeoutOptions(20_000), + () => { + const { tmpDir, stateFile, sandboxName } = setupFixture( + { + name: "approve-budget-sandbox", + model: "claude-sonnet-4-20250514", + provider: "anthropic-prod", + gpuEnabled: false, + policies: [], + }, + "anthropic-prod", + "claude-sonnet-4-20250514", + ); + + const result = runConnect(tmpDir, sandboxName, {}, ["--probe-only"]); + expect(result.status).toBe(0); + + const script = extractApprovalPassScript(stateFile, sandboxName); + // The rendered script interpolates the exported budget constants, tying + // runtime behaviour to the values the invariant below asserts on (no + // source-text scraping — numbers come from the imported constants). + expect(script).toContain("[OPENCLAW, 'devices', 'list', '--json']"); + expect(script).toContain(`timeout=${CONNECT_AUTO_PAIR_LIST_TIMEOUT_S},`); + expect(script).toContain(`timeout=${CONNECT_AUTO_PAIR_APPROVE_TIMEOUT_S},`); + expect(script).toContain(`MAX_APPROVALS = ${CONNECT_AUTO_PAIR_MAX_APPROVALS}`); + + // Approve budget matches the in-sandbox watcher RUN_TIMEOUT_SECS = 10; + // list budget is 2s. + expect(CONNECT_AUTO_PAIR_APPROVE_TIMEOUT_S).toBe(10); + expect(CONNECT_AUTO_PAIR_LIST_TIMEOUT_S).toBe(2); + + // Budget invariant: the inner worst case (list + approve × MAX_APPROVALS) + // must stay STRICTLY below the outer spawnSync cap. The outer timer starts + // when `sh` is spawned — before shell startup, sourcing the proxy env, the + // python3 launch, and `devices list` even begin — so the cap must leave + // slack above the inner budget, or a legitimate slow 10s approve is killed + // mid-loop and the allowlisted request is stranded (#4504). + const innerBudgetSeconds = + CONNECT_AUTO_PAIR_LIST_TIMEOUT_S + + CONNECT_AUTO_PAIR_APPROVE_TIMEOUT_S * CONNECT_AUTO_PAIR_MAX_APPROVALS; + expect(innerBudgetSeconds).toBeLessThan(CONNECT_AUTO_PAIR_TIMEOUT_MS / 1000); + }, + ); +}); diff --git a/test/sandbox-connect-inference/helpers.ts b/test/sandbox-connect-inference/helpers.ts index 479baff59d..3a65e258c0 100644 --- a/test/sandbox-connect-inference/helpers.ts +++ b/test/sandbox-connect-inference/helpers.ts @@ -170,6 +170,17 @@ if (args[0] === "sandbox" && args[1] === "exec") { process.stderr.write("simulated sandbox exec failure\\n"); process.exit(7); } + // Test hook (#4504): force the in-sandbox gateway health probe to report + // STOPPED so the probe path takes the not-running branch and (when recovery + // also fails) the probe-failure exit — where the approval sweep must NOT run. + if ( + process.env.NEMOCLAW_TEST_GATEWAY_DOWN === "1" && + command.includes("/health") && + command.includes("HTTP_CODE") + ) { + process.stdout.write("__NEMOCLAW_SANDBOX_EXEC_STARTED__\\nSTOPPED\\n"); + process.exit(0); + } process.stdout.write("__NEMOCLAW_SANDBOX_EXEC_STARTED__\\nRUNNING\\n"); process.exit(0); }