From b072c108f125261359ad253707ff0ea1fc6344ac Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Fri, 22 May 2026 15:01:25 +0000 Subject: [PATCH 01/10] fix(onboard): poll forward list instead of waiting on openshell CLI exit (#4064) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ensureDashboardForward` invoked `openshell forward start --background` via `spawnSync` with a 30-second timeout. On hosts that fall back to the Docker compatibility gateway (host glibc older than the openshell-gateway requirement), the daemonised forward inherits the parent CLI's stdio, so spawnSync waits on those file descriptors long after `--background` returned. The wait routinely exceeded the 30-second timeout, producing `spawnSync ... openshell ETIMEDOUT` and tripping the rollback path even though the dashboard was healthy inside the sandbox (port 18789 listening, `/health` returning 200). Replace the spawnSync-with-timeout path with a detached spawn plus a poll of `openshell forward list`. The CLI's exit code is no longer the success signal — the appearance of a matching `(port, sandboxName)` entry in the live forward list is. EADDRINUSE-style diagnostics from the parent process are still surfaced before the deadline so the existing port-conflict retry path keeps working. Adds focused unit tests under `src/lib/onboard/forward-start.test.ts` for the new helpers and updates the broader onboard tests that exercise `createSandbox` so their `childProcess.spawn` mocks capture the full argv and their `forward list` mocks include a matching entry for the polled port. Fixes #4064 Signed-off-by: Tinson Lai --- src/lib/onboard.ts | 45 ++++-- src/lib/onboard/forward-start.test.ts | 210 +++++++++++++++++++++++++ src/lib/onboard/forward-start.ts | 178 +++++++++++++++++---- test/onboard-custom-dockerfile.test.ts | 3 +- test/onboard-messaging.test.ts | 42 +++-- test/onboard.test.ts | 45 ++++-- test/shellquote-sandbox.test.ts | 2 +- 7 files changed, 451 insertions(+), 74 deletions(-) create mode 100644 src/lib/onboard/forward-start.test.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 4d9374e3b3..746388e2db 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -18,7 +18,7 @@ const { const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); const { bestEffortForwardStop } = require("./onboard/forward-cleanup"); -const { looksLikeForwardPortConflict, runBackgroundForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); +const { looksLikeForwardPortConflict, runDetachedForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); const { ensureOllamaLoopbackSystemdOverride, }: typeof import("./onboard/ollama-systemd") = require("./onboard/ollama-systemd"); @@ -8644,15 +8644,42 @@ function ensureDashboardForward( parsedUrl.port = String(actualPort); const actualTarget = getDashboardForwardTarget(parsedUrl.toString()); bestEffortForwardStop(runOpenshell, actualPort); - const { result: fwdResult, diagnostic: fwdDiagnostic } = runBackgroundForwardStartWithPortReleaseRetries( - (stdio, timeout) => - runOpenshell( - ["forward", "start", "--background", actualTarget, sandboxName], - { ignoreError: true, suppressOutput: true, stdio, timeout }, - ), - () => { sleep(1); bestEffortForwardStop(runOpenshell, actualPort); }, + const forwardStartArgv = openshellArgv([ + "forward", + "start", + "--background", + actualTarget, + sandboxName, + ]); + const fwdOutcome = runDetachedForwardStartWithPortReleaseRetries( + // `openshell forward start --background` daemonises the actual forward + // process. On the Docker compatibility gateway (host glibc older than + // the openshell-gateway requirement) the daemon inherits the parent's + // stdio, so spawnSync on the CLI would block for minutes and trip the + // 30s ETIMEDOUT seen in #4064. Spawn detached, hand the daemon its own + // file descriptors, and confirm via `openshell forward list` instead + // of the CLI's exit code. + ({ stdout, stderr }) => { + try { + const child = spawn(forwardStartArgv[0], forwardStartArgv.slice(1), { + stdio: ["ignore", stdout, stderr], + detached: true, + }); + child.unref(); + return { pid: child.pid }; + } catch (e) { + return { error: e instanceof Error ? e : new Error(String(e)) }; + } + }, + () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }), + { port: actualPort, sandboxName }, + () => { + sleep(1); + bestEffortForwardStop(runOpenshell, actualPort); + }, ); - if (fwdResult && fwdResult.status !== 0) { + const fwdDiagnostic = fwdOutcome.diagnostic; + if (!fwdOutcome.ok) { const looksLikePortConflict = looksLikeForwardPortConflict(fwdDiagnostic); if (rollbackSandboxOnFailure) { // The sandbox was just created, committed to actualPort via its diff --git a/src/lib/onboard/forward-start.test.ts b/src/lib/onboard/forward-start.test.ts new file mode 100644 index 0000000000..b34f89d5e5 --- /dev/null +++ b/src/lib/onboard/forward-start.test.ts @@ -0,0 +1,210 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { + looksLikeForwardPortConflict, + runDetachedForwardStartWithDiagnostics, + runDetachedForwardStartWithPortReleaseRetries, +} from "../../../dist/lib/onboard/forward-start"; + +// Build an `openshell forward list`-shaped output for the given live entries. +// Mirrors the column layout (SANDBOX BIND PORT PID STATUS) that +// `getOccupiedPorts` parses, so the helper recognises the forward as live. +function forwardListWith(entries: Array<{ sandbox: string; port: number; status?: string }>): string { + const header = "SANDBOX BIND PORT PID STATUS"; + const rows = entries.map( + (e) => `${e.sandbox} 127.0.0.1 ${e.port} 1234 ${e.status ?? "running"}`, + ); + return [header, ...rows].join("\n"); +} + +describe("runDetachedForwardStartWithDiagnostics", () => { + it("returns ok as soon as the forward appears in the list", () => { + const fetchList = vi + .fn() + .mockReturnValueOnce(forwardListWith([])) // first poll: nothing yet + .mockReturnValue(forwardListWith([{ sandbox: "my-sandbox", port: 18789 }])); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 10_000, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(true); + expect(result.reason).toBe("ok"); + expect(result.pid).toBe(42); + expect(spawn).toHaveBeenCalledTimes(1); + // First poll missed → one sleep before the second poll observed the entry. + expect(sleep).toHaveBeenCalledTimes(1); + }); + + it("ignores entries that belong to a different sandbox", () => { + const fetchList = vi + .fn() + .mockReturnValue(forwardListWith([{ sandbox: "other-sandbox", port: 18789 }])); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 50, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + }); + + it("reports timeout when the forward never appears", () => { + const fetchList = vi.fn().mockReturnValue(forwardListWith([])); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 30, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + expect(result.diagnostic).toMatch(/forward did not appear in list within 30ms/); + }); + + it("surfaces spawn errors immediately without polling", () => { + const fetchList = vi.fn(); + const spawn = vi.fn().mockReturnValue({ error: new Error("ENOENT: openshell not found") }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 10_000, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(result.reason).toBe("spawn-error"); + expect(result.diagnostic).toMatch(/ENOENT/); + expect(fetchList).not.toHaveBeenCalled(); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("treats fetchForwardList exceptions as transient and keeps polling", () => { + const fetchList = vi + .fn() + .mockImplementationOnce(() => { + throw new Error("gateway not reachable yet"); + }) + .mockReturnValue(forwardListWith([{ sandbox: "my-sandbox", port: 18789 }])); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 10_000, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(true); + expect(fetchList).toHaveBeenCalledTimes(2); + }); +}); + +describe("runDetachedForwardStartWithPortReleaseRetries", () => { + it("retries after a port-conflict diagnostic, then succeeds", () => { + const fetchList = vi + .fn() + .mockReturnValueOnce(forwardListWith([])) // first attempt: never appears + .mockReturnValueOnce(forwardListWith([])) // (timeout settles) + .mockReturnValue(forwardListWith([{ sandbox: "my-sandbox", port: 18789 }])); + const beforeRetry = vi.fn(); + // First spawn surfaces a port-conflict in its diagnostic synthesised via + // an Error message; the second spawn succeeds and the forward appears. + const spawn = vi + .fn() + .mockReturnValueOnce({ error: new Error("EADDRINUSE: address already in use") }) + .mockReturnValueOnce({ pid: 99 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithPortReleaseRetries( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + beforeRetry, + 3, + { overallTimeoutMs: 30, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(true); + expect(beforeRetry).toHaveBeenCalledTimes(1); + expect(spawn).toHaveBeenCalledTimes(2); + }); + + it("does not retry when the failure does not look like a port conflict", () => { + const fetchList = vi.fn().mockReturnValue(forwardListWith([])); + const beforeRetry = vi.fn(); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithPortReleaseRetries( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + beforeRetry, + 3, + { overallTimeoutMs: 20, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + expect(beforeRetry).not.toHaveBeenCalled(); + expect(spawn).toHaveBeenCalledTimes(1); + }); + + it("stops retrying after maxRetries even if conflict diagnostics persist", () => { + const fetchList = vi.fn().mockReturnValue(forwardListWith([])); + const beforeRetry = vi.fn(); + const spawn = vi + .fn() + .mockReturnValue({ error: new Error("EADDRINUSE: address already in use") }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithPortReleaseRetries( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + beforeRetry, + 2, + { overallTimeoutMs: 20, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(beforeRetry).toHaveBeenCalledTimes(2); + expect(spawn).toHaveBeenCalledTimes(3); // initial + 2 retries + }); +}); + +describe("looksLikeForwardPortConflict", () => { + it("matches the common port-in-use signals", () => { + expect(looksLikeForwardPortConflict("listen tcp 0.0.0.0:18789: bind: address already in use")).toBe( + true, + ); + expect(looksLikeForwardPortConflict("EADDRINUSE")).toBe(true); + expect(looksLikeForwardPortConflict("port 18789 in use")).toBe(true); + }); + + it("returns false for unrelated errors", () => { + expect(looksLikeForwardPortConflict("transport: connection refused")).toBe(false); + expect(looksLikeForwardPortConflict("")).toBe(false); + }); +}); diff --git a/src/lib/onboard/forward-start.ts b/src/lib/onboard/forward-start.ts index 05e45e958a..9cc92a751b 100644 --- a/src/lib/onboard/forward-start.ts +++ b/src/lib/onboard/forward-start.ts @@ -3,23 +3,44 @@ import fs from "node:fs"; import path from "node:path"; -import type { StdioOptions } from "node:child_process"; import { compactText } from "../core/url-utils"; import { redact } from "../security/redact"; +import { getOccupiedPorts } from "./dashboard-port"; import { cleanupTempDir, secureTempFile } from "./temp-files"; -export type BackgroundForwardStartResult = { - status: number | null; - stdout?: string | Buffer | null; - stderr?: string | Buffer | null; - error?: Error; -}; +// `openshell forward start --background` daemonises the actual forward +// process, but the parent CLI's stdio is inherited by the daemon child on +// some platforms (notably the Docker compatibility gateway used when the +// host glibc is older than the openshell-gateway requirement). spawnSync +// then waits on those fds until the daemon exits — minutes later — and +// reports ETIMEDOUT even though the forward is established. See #4064. +// +// The detached path below spawns the CLI with `detached: true`, hands it +// independent diagnostic file descriptors, and confirms success by polling +// `openshell forward list` for an entry matching `(port, sandboxName)`. +// The CLI's exit code is no longer the success signal — the appearance of +// the live forward in the list is. -export type BackgroundForwardStartRunner = ( - stdio: StdioOptions, - timeoutMs: number, -) => BackgroundForwardStartResult; +export type ForwardListFetcher = () => string; + +export type DetachedForwardSpawnRunner = (stdio: { + stdout: number; + stderr: number; +}) => { pid?: number; error?: Error }; + +export interface DetachedForwardStartOutcome { + ok: boolean; + diagnostic: string; + pid?: number; + reason: "ok" | "spawn-error" | "timeout" | "spawn-conflict"; +} + +export interface DetachedForwardStartOptions { + overallTimeoutMs?: number; + pollIntervalMs?: number; + sleepMs?: (ms: number) => void; +} function readDiagnosticFile(filePath: string): string { try { @@ -41,26 +62,66 @@ export function looksLikeForwardPortConflict(diagnostic: string): boolean { return /eaddrinuse|address already in use|port .* in use|bind: .*in use/i.test(diagnostic); } -export function runBackgroundForwardStartWithDiagnostics( - runForwardStart: BackgroundForwardStartRunner, - timeoutMs = 30_000, -): { result: BackgroundForwardStartResult; diagnostic: string } { +function blockingSleepMs(ms: number): void { + if (ms <= 0) return; + // Synchronous sleep — onboard's forward-start sits in a sync code path, + // so we cannot await. spawnSync of `node -e setTimeout` is the same + // primitive `sleepMs` in core/wait uses, but we keep the call site + // injectable so tests can stub it without spawning subprocesses. + const { spawnSync } = require("node:child_process"); + spawnSync(process.execPath, ["-e", `setTimeout(() => {}, ${ms}).unref();`], { + stdio: "ignore", + timeout: ms + 5_000, + }); +} + +function isForwardConfirmed( + forwardListOutput: string, + expect: { port: number; sandboxName: string }, +): boolean { + return getOccupiedPorts(forwardListOutput).get(String(expect.port)) === expect.sandboxName; +} + +/** + * Spawn `openshell forward start --background` as a detached child and wait + * for the resulting forward to appear in `openshell forward list`. Returns + * `ok: true` as soon as the live entry is observed, regardless of whether + * the original spawn process has exited yet. Returns `ok: false` with a + * captured diagnostic when: + * - the spawn itself failed (ENOENT, permission denied, …); + * - the parent process wrote an EADDRINUSE-style error to stderr before + * the deadline (port conflict — retry path); + * - the deadline expired without the forward appearing. + * + * The diagnostic file pair is removed before return, so the temp dir does + * not leak across retries. + */ +export function runDetachedForwardStartWithDiagnostics( + runDetachedSpawn: DetachedForwardSpawnRunner, + fetchForwardList: ForwardListFetcher, + expect: { port: number; sandboxName: string }, + options: DetachedForwardStartOptions = {}, +): DetachedForwardStartOutcome { + const overallTimeoutMs = options.overallTimeoutMs ?? 60_000; + const pollIntervalMs = options.pollIntervalMs ?? 500; + const sleepImpl = options.sleepMs ?? blockingSleepMs; + const forwardDiagPath = secureTempFile("nemoclaw-forward-start", ".out"); const forwardDiagDir = path.dirname(forwardDiagPath); const forwardErrPath = path.join(forwardDiagDir, "nemoclaw-forward-start.err"); - let result: BackgroundForwardStartResult | null = null; + // `fs.openSync` with `"w"` truncates / creates the diagnostic files; the + // child inherits the fds via posix_spawn semantics. We close the host's + // copies immediately so only the child's reference keeps them alive, + // which lets the kernel reclaim them when the (detached) child exits. const outFd = fs.openSync(forwardDiagPath, "w", 0o600); const errFd = fs.openSync(forwardErrPath, "w", 0o600); + let pid: number | undefined; + let spawnError: Error | undefined; try { - try { - result = runForwardStart(["ignore", outFd, errFd], timeoutMs); - } catch (error) { - result = { - status: null, - error: error instanceof Error ? error : new Error(String(error)), - }; - } + const spawnResult = runDetachedSpawn({ stdout: outFd, stderr: errFd }); + pid = spawnResult.pid; + spawnError = spawnResult.error; } finally { try { fs.closeSync(outFd); @@ -74,32 +135,81 @@ export function runBackgroundForwardStartWithDiagnostics( } } - try { + const readDiag = (): string => { const stderr = readDiagnosticFile(forwardErrPath); const stdout = readDiagnosticFile(forwardDiagPath); - const message = result?.error instanceof Error ? result.error.message : ""; + const message = spawnError instanceof Error ? spawnError.message : ""; + return compactText(redact(`${stderr} ${stdout} ${message}`)); + }; + + try { + if (spawnError) { + return { ok: false, diagnostic: readDiag(), pid, reason: "spawn-error" }; + } + + const start = Date.now(); + const deadline = start + overallTimeoutMs; + while (Date.now() < deadline) { + let list: string; + try { + list = fetchForwardList() || ""; + } catch { + list = ""; + } + if (isForwardConfirmed(list, expect)) { + return { ok: true, diagnostic: readDiag(), pid, reason: "ok" }; + } + const diagSoFar = readDiag(); + if (looksLikeForwardPortConflict(diagSoFar)) { + return { ok: false, diagnostic: diagSoFar, pid, reason: "spawn-conflict" }; + } + sleepImpl(pollIntervalMs); + } + const finalDiag = readDiag(); return { - result: result ?? { status: null, error: new Error("forward start did not return a result") }, - diagnostic: compactText(redact(`${stderr} ${stdout} ${message}`)), + ok: false, + diagnostic: + finalDiag || `forward did not appear in list within ${overallTimeoutMs}ms`, + pid, + reason: "timeout", }; } finally { cleanupTempDir(forwardDiagPath, "nemoclaw-forward-start"); } } -export function runBackgroundForwardStartWithPortReleaseRetries( - runForwardStart: BackgroundForwardStartRunner, +/** + * Retry the detached forward-start when the diagnostic looks like an + * EADDRINUSE-style port conflict. `beforeRetry` runs between attempts so + * the caller can drop any stale forward bound to the same port before + * trying again. + */ +export function runDetachedForwardStartWithPortReleaseRetries( + runDetachedSpawn: DetachedForwardSpawnRunner, + fetchForwardList: ForwardListFetcher, + expect: { port: number; sandboxName: string }, beforeRetry: () => void, maxRetries = 3, -): { result: BackgroundForwardStartResult; diagnostic: string } { - let attempt = runBackgroundForwardStartWithDiagnostics(runForwardStart); + options: DetachedForwardStartOptions = {}, +): DetachedForwardStartOutcome { + let attempt = runDetachedForwardStartWithDiagnostics( + runDetachedSpawn, + fetchForwardList, + expect, + options, + ); for ( let retries = 0; - attempt.result.status !== 0 && looksLikeForwardPortConflict(attempt.diagnostic) && retries < maxRetries; + !attempt.ok && looksLikeForwardPortConflict(attempt.diagnostic) && retries < maxRetries; retries++ ) { beforeRetry(); - attempt = runBackgroundForwardStartWithDiagnostics(runForwardStart); + attempt = runDetachedForwardStartWithDiagnostics( + runDetachedSpawn, + fetchForwardList, + expect, + options, + ); } return attempt; } diff --git a/test/onboard-custom-dockerfile.test.ts b/test/onboard-custom-dockerfile.test.ts index cad7ac0b8d..458ac455e4 100644 --- a/test/onboard-custom-dockerfile.test.ts +++ b/test/onboard-custom-dockerfile.test.ts @@ -119,7 +119,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - const cmd = _n(args[1][1]); + child.unref = () => {}; + const cmd = _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]); commands.push({ command: cmd, env: args[2]?.env || null }); // Observe the staged build context state while the sandbox create is in // flight — onboard deletes it once streamSandboxCreate resolves. diff --git a/test/onboard-messaging.test.ts b/test/onboard-messaging.test.ts index 36caddfc39..5ef7924b09 100644 --- a/test/onboard-messaging.test.ts +++ b/test/onboard-messaging.test.ts @@ -72,7 +72,7 @@ runner.runCapture = (command) => { if (_n(command).includes("sandbox get my-assistant")) return ""; if (_n(command).includes("sandbox list")) return "my-assistant Ready"; if (_n(command).includes("provider get")) return "Provider: discord-bridge"; - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command, { defaultCurlOutput: "ok", @@ -92,7 +92,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - const command = _n(args[1][1]); + child.unref = () => {}; + const command = _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]); const entry = { command, env: args[2]?.env || null }; const policyMatch = command.match(/--policy ([^ ]+)/); if (policyMatch) { @@ -327,7 +328,7 @@ runner.run = (command, opts = {}) => { runner.runCapture = (command) => { if (_n(command).includes("sandbox get my-assistant")) return ""; if (_n(command).includes("sandbox list")) return "my-assistant Ready"; - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command, { defaultCurlOutput: "ok", @@ -350,7 +351,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - const command = _n(args[1][1]); + child.unref = () => {}; + const command = _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]); const entry = { command, env: args[2]?.env || null }; const policyMatch = command.match(/--policy ([^ ]+)/); if (policyMatch) { @@ -516,7 +518,7 @@ runner.runCapture = (command) => { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command); if (sandboxExecCurl !== null) return sandboxExecCurl; } - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; return ""; }; registry.registerSandbox = (entry) => { @@ -533,7 +535,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - const command = _n(args[1][1]); + child.unref = () => {}; + const command = _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]); const entry = { command, env: args[2]?.env || null }; const dockerfileMatch = command.match(/--from ([^ ]+Dockerfile)/); if (dockerfileMatch) { @@ -681,7 +684,7 @@ runner.runCapture = (command) => { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command); if (sandboxExecCurl !== null) return sandboxExecCurl; } - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; return ""; }; registry.registerSandbox = (entry) => { @@ -698,7 +701,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - const command = _n(args[1][1]); + child.unref = () => {}; + const command = _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]); const entry = { command, env: args[2]?.env || null }; const dockerfileMatch = command.match(/--from ([^ ]+Dockerfile)/); if (dockerfileMatch) { @@ -840,7 +844,7 @@ runner.runCapture = (command) => { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command); if (sandboxExecCurl !== null) return sandboxExecCurl; } - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; return ""; }; registry.registerSandbox = (entry) => { @@ -857,7 +861,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - const command = _n(args[1][1]); + child.unref = () => {}; + const command = _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]); const entry = { command, env: args[2]?.env || null }; const dockerfileMatch = command.match(/--from ([^ ]+Dockerfile)/); if (dockerfileMatch) { @@ -1005,7 +1010,7 @@ runner.runCapture = (command) => { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command); if (sandboxExecCurl !== null) return sandboxExecCurl; } - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; return ""; }; registry.registerSandbox = (entry) => { @@ -1022,7 +1027,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - const command = _n(args[1][1]); + child.unref = () => {}; + const command = _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]); const entry = { command, env: args[2]?.env || null }; const dockerfileMatch = command.match(/--from ([^ ]+Dockerfile)/); if (dockerfileMatch) { @@ -1221,7 +1227,7 @@ runner.runCapture = (command) => { if (_n(command).includes("sandbox list")) return "my-assistant Ready"; // All messaging providers already exist in gateway if (_n(command).includes("provider get")) return "Provider: exists"; - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; return ""; }; registry.getSandbox = () => ({ name: "my-assistant", gpuEnabled: false }); @@ -1335,7 +1341,7 @@ runner.runCapture = (command) => { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command); if (sandboxExecCurl !== null) return sandboxExecCurl; } - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; return ""; }; registry.registerSandbox = () => true; @@ -1349,7 +1355,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -1470,7 +1477,7 @@ runner.runCapture = (command) => { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command); if (sandboxExecCurl !== null) return sandboxExecCurl; } - if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running\nmy-assistant 127.0.0.1 8642 12346 running"; return ""; }; registry.registerSandbox = () => true; @@ -1484,7 +1491,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); diff --git a/test/onboard.test.ts b/test/onboard.test.ts index 9b5111a402..525bbfecff 100644 --- a/test/onboard.test.ts +++ b/test/onboard.test.ts @@ -2061,7 +2061,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -2192,7 +2193,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -2291,7 +2293,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -2478,7 +2481,7 @@ runner.run = (command, opts = {}) => { runner.runCapture = (command) => { if (_n(command).includes("sandbox get my-assistant")) return "my-assistant"; if (_n(command).includes("sandbox list")) return "my-assistant Ready"; - if (_n(command).includes("forward list")) return ""; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command, { defaultCurlOutput: "ok", @@ -2500,7 +2503,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -2590,7 +2594,7 @@ runner.run = (command, opts = {}) => { runner.runCapture = (command) => { if (_n(command).includes("sandbox get my-assistant")) return "my-assistant"; if (_n(command).includes("sandbox list")) return "my-assistant Ready"; - if (_n(command).includes("forward list")) return ""; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command, { defaultCurlOutput: "ok", @@ -2621,7 +2625,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -2741,6 +2746,7 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); + child.unref = () => {}; commands.push({ command: args[1]?.[1] || String(args[0]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); @@ -2860,7 +2866,7 @@ runner.runFile = (file, args = [], opts = {}) => { runner.runCapture = (command) => { if (_n(command).includes("sandbox get my-assistant")) return "my-assistant"; if (_n(command).includes("sandbox list")) return "my-assistant Ready"; - if (_n(command).includes("forward list")) return ""; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command, { defaultCurlOutput: "ok", @@ -2885,7 +2891,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -2990,7 +2997,7 @@ runner.runCapture = (command) => { if (_n(command).includes("sandbox list")) { return sandboxDeleted ? "my-assistant Ready" : "my-assistant NotReady"; } - if (_n(command).includes("forward list")) return ""; + if (_n(command).includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; { const sandboxExecCurl = require(${onboardScriptMocksPath}).mockSandboxExecCurl(command, { defaultCurlOutput: "ok", @@ -3015,7 +3022,8 @@ const fakeSpawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); @@ -3144,6 +3152,7 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); + child.unref = () => {}; child.killCalls = []; child.unrefCalls = 0; child.stdout.destroyCalls = 0; @@ -3162,7 +3171,7 @@ childProcess.spawn = (...args) => { process.nextTick(() => child.emit("close", signal === "SIGTERM" ? 0 : 1)); return true; }; - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null, child }); + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null, child }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); }); @@ -3234,6 +3243,8 @@ const { createSandbox } = require(${onboardPath}); const runner = require(${runnerPath}); const _n = (c) => (Array.isArray(c) ? c.join(" ") : String(c)).replace(/'/g, ""); const registry = require(${registryPath}); +const childProcess = require("node:child_process"); +const { EventEmitter } = require("node:events"); const commands = []; runner.run = (command, opts = {}) => { @@ -3248,6 +3259,16 @@ runner.runCapture = (command) => { }; registry.getSandbox = () => ({ name: "my-assistant", gpuEnabled: false }); +childProcess.spawn = (...args) => { + const child = new EventEmitter(); + child.stdout = new EventEmitter(); + child.stderr = new EventEmitter(); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); + process.nextTick(() => child.emit("close", 0)); + return child; +}; + const { createSandbox } = require(${onboardPath}); (async () => { diff --git a/test/shellquote-sandbox.test.ts b/test/shellquote-sandbox.test.ts index de6eb34722..dcf948dd64 100644 --- a/test/shellquote-sandbox.test.ts +++ b/test/shellquote-sandbox.test.ts @@ -82,7 +82,7 @@ runner.runCapture = (command) => { const text = asText(command); if (text.includes("sandbox get my-assistant")) return ""; if (text.includes("sandbox list")) return "my-assistant Ready"; - if (text.includes("forward list")) return ""; + if (text.includes("forward list")) return "my-assistant 127.0.0.1 18789 12345 running"; if (text.includes("sandbox exec") && text.includes("http://localhost:") && text.includes("/health")) return "200"; if (text === "uname -r") return "6.8.0"; return ""; From 45f56715bc12369b8261c1490989273fc0cb648c Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Fri, 22 May 2026 16:00:14 +0000 Subject: [PATCH 02/10] fix(onboard): shrink forward-start callsite, fix sleep child, capture list errors PR #4072 review fixes. - `forward-start.ts`: export `buildDetachedForwardStartSpawn(argv)` so the detached-spawn wiring lives next to the consumer. `onboard.ts`'s call site shrinks from a 27-line lambda to a single helper call, putting `src/lib/onboard.ts` back inside the entrypoint budget. - `forward-start.ts:blockingSleepMs`: drop `setTimeout(...).unref()`. The unref'd timer let the sleep child's event loop drain immediately so spawnSync returned right away and the caller spun the poll loop instead of pausing between forward-list fetches. - `forward-start.ts:runDetachedForwardStartWithDiagnostics`: record the last `fetchForwardList()` error and append it to the diagnostic on timeout. Previously a persistent gateway/list failure was swallowed and the user only saw a generic "forward did not appear" message. - `test/onboard.test.ts`: align the last remaining spawn mock with the full-argv capture pattern used elsewhere. - `src/lib/onboard/forward-start.test.ts`: new regression test for the diagnostic suffix when `fetchForwardList()` keeps throwing. Signed-off-by: Tinson Lai --- src/lib/onboard.ts | 33 ++++---------------- src/lib/onboard/forward-start.test.ts | 20 ++++++++++++ src/lib/onboard/forward-start.ts | 44 +++++++++++++++++++++++---- test/onboard.test.ts | 2 +- 4 files changed, 65 insertions(+), 34 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 746388e2db..1c8e2aa103 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -18,7 +18,7 @@ const { const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); const { bestEffortForwardStop } = require("./onboard/forward-cleanup"); -const { looksLikeForwardPortConflict, runDetachedForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); +const { buildDetachedForwardStartSpawn, looksLikeForwardPortConflict, runDetachedForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); const { ensureOllamaLoopbackSystemdOverride, }: typeof import("./onboard/ollama-systemd") = require("./onboard/ollama-systemd"); @@ -8644,33 +8644,12 @@ function ensureDashboardForward( parsedUrl.port = String(actualPort); const actualTarget = getDashboardForwardTarget(parsedUrl.toString()); bestEffortForwardStop(runOpenshell, actualPort); - const forwardStartArgv = openshellArgv([ - "forward", - "start", - "--background", - actualTarget, - sandboxName, - ]); + // Detached spawn + forward-list poll (#4064) so the openshell CLI's + // inherited stdio cannot trip spawnSync's timeout on Docker-compat hosts. const fwdOutcome = runDetachedForwardStartWithPortReleaseRetries( - // `openshell forward start --background` daemonises the actual forward - // process. On the Docker compatibility gateway (host glibc older than - // the openshell-gateway requirement) the daemon inherits the parent's - // stdio, so spawnSync on the CLI would block for minutes and trip the - // 30s ETIMEDOUT seen in #4064. Spawn detached, hand the daemon its own - // file descriptors, and confirm via `openshell forward list` instead - // of the CLI's exit code. - ({ stdout, stderr }) => { - try { - const child = spawn(forwardStartArgv[0], forwardStartArgv.slice(1), { - stdio: ["ignore", stdout, stderr], - detached: true, - }); - child.unref(); - return { pid: child.pid }; - } catch (e) { - return { error: e instanceof Error ? e : new Error(String(e)) }; - } - }, + buildDetachedForwardStartSpawn( + openshellArgv(["forward", "start", "--background", actualTarget, sandboxName]), + ), () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }), { port: actualPort, sandboxName }, () => { diff --git a/src/lib/onboard/forward-start.test.ts b/src/lib/onboard/forward-start.test.ts index b34f89d5e5..582241a0d5 100644 --- a/src/lib/onboard/forward-start.test.ts +++ b/src/lib/onboard/forward-start.test.ts @@ -98,6 +98,26 @@ describe("runDetachedForwardStartWithDiagnostics", () => { expect(sleep).not.toHaveBeenCalled(); }); + it("surfaces persistent fetchForwardList failures in the timeout diagnostic", () => { + const fetchList = vi.fn().mockImplementation(() => { + throw new Error("gateway transport: connection refused"); + }); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 30, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + expect(result.diagnostic).toMatch(/openshell forward list failed/); + expect(result.diagnostic).toMatch(/connection refused/); + }); + it("treats fetchForwardList exceptions as transient and keeps polling", () => { const fetchList = vi .fn() diff --git a/src/lib/onboard/forward-start.ts b/src/lib/onboard/forward-start.ts index 9cc92a751b..daa219816d 100644 --- a/src/lib/onboard/forward-start.ts +++ b/src/lib/onboard/forward-start.ts @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { spawn as spawnChild } from "node:child_process"; import fs from "node:fs"; import path from "node:path"; @@ -67,14 +68,43 @@ function blockingSleepMs(ms: number): void { // Synchronous sleep — onboard's forward-start sits in a sync code path, // so we cannot await. spawnSync of `node -e setTimeout` is the same // primitive `sleepMs` in core/wait uses, but we keep the call site - // injectable so tests can stub it without spawning subprocesses. + // injectable so tests can stub it without spawning subprocesses. We + // intentionally do NOT `.unref()` the timer in the child: an unref'd + // timer lets the child's event loop drain immediately, so spawnSync + // returns instantly and the caller spins through the poll loop without + // actually waiting. const { spawnSync } = require("node:child_process"); - spawnSync(process.execPath, ["-e", `setTimeout(() => {}, ${ms}).unref();`], { + spawnSync(process.execPath, ["-e", `setTimeout(() => {}, ${ms});`], { stdio: "ignore", timeout: ms + 5_000, }); } +/** + * Build a `DetachedForwardSpawnRunner` that spawns the given argv as a + * detached child, writing stdio to the file descriptors supplied by + * `runDetachedForwardStartWithDiagnostics`. Kept in this module so the + * onboard call site stays a thin wire-up and the spawn-on-Node detail + * (`detached: true` + `unref()`) lives next to the consumer that relies + * on it. + */ +export function buildDetachedForwardStartSpawn( + argv: readonly string[], +): DetachedForwardSpawnRunner { + return ({ stdout, stderr }) => { + try { + const child = spawnChild(argv[0], argv.slice(1), { + stdio: ["ignore", stdout, stderr], + detached: true, + }); + child.unref(); + return { pid: child.pid }; + } catch (e) { + return { error: e instanceof Error ? e : new Error(String(e)) }; + } + }; +} + function isForwardConfirmed( forwardListOutput: string, expect: { port: number; sandboxName: string }, @@ -135,11 +165,13 @@ export function runDetachedForwardStartWithDiagnostics( } } + let lastFetchError: string | null = null; const readDiag = (): string => { const stderr = readDiagnosticFile(forwardErrPath); const stdout = readDiagnosticFile(forwardDiagPath); const message = spawnError instanceof Error ? spawnError.message : ""; - return compactText(redact(`${stderr} ${stdout} ${message}`)); + const fetchSuffix = lastFetchError ? ` openshell forward list failed: ${lastFetchError}` : ""; + return compactText(redact(`${stderr} ${stdout} ${message}${fetchSuffix}`)); }; try { @@ -150,11 +182,11 @@ export function runDetachedForwardStartWithDiagnostics( const start = Date.now(); const deadline = start + overallTimeoutMs; while (Date.now() < deadline) { - let list: string; + let list = ""; try { list = fetchForwardList() || ""; - } catch { - list = ""; + } catch (err) { + lastFetchError = err instanceof Error ? err.message : String(err); } if (isForwardConfirmed(list, expect)) { return { ok: true, diagnostic: readDiag(), pid, reason: "ok" }; diff --git a/test/onboard.test.ts b/test/onboard.test.ts index 525bbfecff..838341bc6e 100644 --- a/test/onboard.test.ts +++ b/test/onboard.test.ts @@ -2747,7 +2747,7 @@ childProcess.spawn = (...args) => { child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); child.unref = () => {}; - commands.push({ command: args[1]?.[1] || String(args[0]), env: args[2]?.env || null }); + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\n")); child.emit("close", 0); From 2a7d7b97f92e07d8af04d85492d0be373842b368 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 23 May 2026 05:24:57 +0000 Subject: [PATCH 03/10] fix(onboard): trim forward-start callsite, surface async spawn errors, fix sleep child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #4072 review fixes. - `onboard.ts`: collapse `ensureDashboardForward` callsite so net diff against main stays negative (entrypoint budget). `fwdDiagnostic` is destructured directly from the helper outcome and the explicit `{ ignoreError: true }` on `forward list` is dropped so genuine gateway/list failures propagate into the diagnostic instead of being silently coerced to `""`. - `forward-start.ts`: extend `DetachedForwardSpawnRunner` with an optional `onAsyncError` callback. `buildDetachedForwardStartSpawn` registers `child.on("error", …)` so post-spawn ENOENT/EACCES events no longer escape as unhandled errors; the helper surfaces them via the existing `spawn-error` outcome path. - `forward-start.test.ts`: new regression test simulating an async spawn error fired during the helper's poll-loop sleep. - `test/onboard.test.ts`: fix two leftover spawn mocks that came in during the merge with main — they used the legacy `args[1][1]` capture shape and lacked `child.unref`, so the new detached spawn path would crash with `child.unref is not a function` in CI. Signed-off-by: Tinson Lai --- src/lib/onboard.ts | 18 ++++--------- src/lib/onboard/forward-start.test.ts | 30 ++++++++++++++++++++++ src/lib/onboard/forward-start.ts | 37 ++++++++++++++++++++++----- test/onboard.test.ts | 6 +++-- 4 files changed, 69 insertions(+), 22 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index c098e5d0a2..266958181a 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -8638,21 +8638,13 @@ function ensureDashboardForward( parsedUrl.port = String(actualPort); const actualTarget = getDashboardForwardTarget(parsedUrl.toString()); bestEffortForwardStop(runOpenshell, actualPort); - // Detached spawn + forward-list poll (#4064) so the openshell CLI's - // inherited stdio cannot trip spawnSync's timeout on Docker-compat hosts. - const fwdOutcome = runDetachedForwardStartWithPortReleaseRetries( - buildDetachedForwardStartSpawn( - openshellArgv(["forward", "start", "--background", actualTarget, sandboxName]), - ), - () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }), + const { ok: fwdOk, diagnostic: fwdDiagnostic } = runDetachedForwardStartWithPortReleaseRetries( + buildDetachedForwardStartSpawn(openshellArgv(["forward", "start", "--background", actualTarget, sandboxName])), + () => runCaptureOpenshell(["forward", "list"]), { port: actualPort, sandboxName }, - () => { - sleep(1); - bestEffortForwardStop(runOpenshell, actualPort); - }, + () => { sleep(1); bestEffortForwardStop(runOpenshell, actualPort); }, ); - const fwdDiagnostic = fwdOutcome.diagnostic; - if (!fwdOutcome.ok) { + if (!fwdOk) { const looksLikePortConflict = looksLikeForwardPortConflict(fwdDiagnostic); if (rollbackSandboxOnFailure) { // The sandbox was just created, committed to actualPort via its diff --git a/src/lib/onboard/forward-start.test.ts b/src/lib/onboard/forward-start.test.ts index 582241a0d5..85bc669c71 100644 --- a/src/lib/onboard/forward-start.test.ts +++ b/src/lib/onboard/forward-start.test.ts @@ -98,6 +98,36 @@ describe("runDetachedForwardStartWithDiagnostics", () => { expect(sleep).not.toHaveBeenCalled(); }); + it("surfaces async spawn errors fired after the runner returned", () => { + const fetchList = vi.fn().mockReturnValue(""); + let asyncErrorCallback: ((err: Error) => void) | undefined; + const spawn = vi.fn().mockImplementation((_stdio, onAsyncError) => { + asyncErrorCallback = onAsyncError; + return { pid: 4242 }; + }); + // The sleep stub is the only synchronous yield point in the helper's + // poll loop. Simulate Node's `error` event firing during that pause so + // the next iteration observes `asyncSpawnError` and returns spawn-error. + let sleepCalls = 0; + const sleep = vi.fn().mockImplementation(() => { + sleepCalls += 1; + if (sleepCalls === 1 && asyncErrorCallback) { + asyncErrorCallback(new Error("spawn ENOENT")); + } + }); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 1_000, pollIntervalMs: 5, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(result.reason).toBe("spawn-error"); + expect(result.diagnostic).toMatch(/spawn ENOENT/); + }); + it("surfaces persistent fetchForwardList failures in the timeout diagnostic", () => { const fetchList = vi.fn().mockImplementation(() => { throw new Error("gateway transport: connection refused"); diff --git a/src/lib/onboard/forward-start.ts b/src/lib/onboard/forward-start.ts index daa219816d..3d94c69c04 100644 --- a/src/lib/onboard/forward-start.ts +++ b/src/lib/onboard/forward-start.ts @@ -25,10 +25,15 @@ import { cleanupTempDir, secureTempFile } from "./temp-files"; export type ForwardListFetcher = () => string; -export type DetachedForwardSpawnRunner = (stdio: { - stdout: number; - stderr: number; -}) => { pid?: number; error?: Error }; +export type DetachedForwardSpawnRunner = ( + stdio: { stdout: number; stderr: number }, + // Invoked when the underlying child process emits an asynchronous `error` + // event (e.g. ENOENT or EACCES at execve time — these fire after `spawn` + // already returned a child object with `pid === undefined`). The helper + // wires this to a closure variable so the next forward-list poll iteration + // surfaces the failure in the outcome diagnostic. + onAsyncError?: (err: Error) => void, +) => { pid?: number; error?: Error }; export interface DetachedForwardStartOutcome { ok: boolean; @@ -91,12 +96,20 @@ function blockingSleepMs(ms: number): void { export function buildDetachedForwardStartSpawn( argv: readonly string[], ): DetachedForwardSpawnRunner { - return ({ stdout, stderr }) => { + return ({ stdout, stderr }, onAsyncError) => { try { const child = spawnChild(argv[0], argv.slice(1), { stdio: ["ignore", stdout, stderr], detached: true, }); + // Async failures (ENOENT/EACCES at execve, signal during spawn) arrive + // via `error` after `spawn` already returned. Without a listener they + // bubble to the process and crash onboard. + if (onAsyncError) { + child.on("error", onAsyncError); + } else { + child.on("error", () => {}); + } child.unref(); return { pid: child.pid }; } catch (e) { @@ -148,8 +161,14 @@ export function runDetachedForwardStartWithDiagnostics( let pid: number | undefined; let spawnError: Error | undefined; + let asyncSpawnError: Error | undefined; try { - const spawnResult = runDetachedSpawn({ stdout: outFd, stderr: errFd }); + const spawnResult = runDetachedSpawn( + { stdout: outFd, stderr: errFd }, + (err) => { + if (!asyncSpawnError) asyncSpawnError = err; + }, + ); pid = spawnResult.pid; spawnError = spawnResult.error; } finally { @@ -170,8 +189,9 @@ export function runDetachedForwardStartWithDiagnostics( const stderr = readDiagnosticFile(forwardErrPath); const stdout = readDiagnosticFile(forwardDiagPath); const message = spawnError instanceof Error ? spawnError.message : ""; + const asyncMsg = asyncSpawnError instanceof Error ? ` ${asyncSpawnError.message}` : ""; const fetchSuffix = lastFetchError ? ` openshell forward list failed: ${lastFetchError}` : ""; - return compactText(redact(`${stderr} ${stdout} ${message}${fetchSuffix}`)); + return compactText(redact(`${stderr} ${stdout} ${message}${asyncMsg}${fetchSuffix}`)); }; try { @@ -191,6 +211,9 @@ export function runDetachedForwardStartWithDiagnostics( if (isForwardConfirmed(list, expect)) { return { ok: true, diagnostic: readDiag(), pid, reason: "ok" }; } + if (asyncSpawnError) { + return { ok: false, diagnostic: readDiag(), pid, reason: "spawn-error" }; + } const diagSoFar = readDiag(); if (looksLikeForwardPortConflict(diagSoFar)) { return { ok: false, diagnostic: diagSoFar, pid, reason: "spawn-conflict" }; diff --git a/test/onboard.test.ts b/test/onboard.test.ts index 1a54d33421..a86ae640aa 100644 --- a/test/onboard.test.ts +++ b/test/onboard.test.ts @@ -2261,7 +2261,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: hermes-sandbox\\n")); child.emit("close", 0); @@ -2440,7 +2441,8 @@ childProcess.spawn = (...args) => { const child = new EventEmitter(); child.stdout = new EventEmitter(); child.stderr = new EventEmitter(); - commands.push({ command: _n(args[1][1]), env: args[2]?.env || null }); + child.unref = () => {}; + commands.push({ command: _n([args[0], ...(Array.isArray(args[1]) ? args[1] : [])]), env: args[2]?.env || null }); process.nextTick(() => { child.stdout.emit("data", Buffer.from("Created sandbox: my-assistant\\n")); child.emit("close", 0); From 209bcb2c5b9c17b17a1b28985b9f692378bd7fa5 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 23 May 2026 05:50:56 +0000 Subject: [PATCH 04/10] fix(onboard): extend forward-start deadline + surface gateway list state on timeout (#4064) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User reproduced #4064 on the merged branch: spawnSync ETIMEDOUT is gone but the helper now times out at 60s with "forward did not appear in list". The user's openshell-gateway runs inside the Docker compatibility container (host glibc < required) and per-call gRPC latency pushes the forward-registration handshake past 60s. Manual `openshell forward start -d` against an unrelated sandbox completes immediately, so the CLI path is healthy. - `forward-start.ts`: bump default `overallTimeoutMs` from 60_000 to 180_000 so Docker-compat gateways have headroom for the forward-registration handshake. - `forward-start.ts`: add `onProgress` / `progressIntervalMs` options so the helper can emit a periodic "still waiting" line during the longer wait, and append the last `openshell forward list` snapshot to the timeout diagnostic so users (and triage) can see whether the gateway returned an empty list or an entry in an unexpected state. - `forward-start.ts`: export `buildForwardStartProgressLogger(port)` so the onboard call site stays a one-liner — the entrypoint budget for `src/lib/onboard.ts` is still net-zero against main. - `forward-start.test.ts`: new regression test for the progress callback cadence and the `last forward list:` diagnostic suffix. Signed-off-by: Tinson Lai --- src/lib/onboard.ts | 4 ++- src/lib/onboard/forward-start.test.ts | 37 ++++++++++++++++++++++++ src/lib/onboard/forward-start.ts | 41 +++++++++++++++++++++++++-- 3 files changed, 78 insertions(+), 4 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 998ef61841..748a8dd225 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -18,7 +18,7 @@ const { const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); const { bestEffortForwardStop } = require("./onboard/forward-cleanup"); -const { buildDetachedForwardStartSpawn, looksLikeForwardPortConflict, runDetachedForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); +const { buildDetachedForwardStartSpawn, buildForwardStartProgressLogger, looksLikeForwardPortConflict, runDetachedForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); const { ensureManagedOllamaLoopbackSystemdOverride, ensureOllamaLoopbackSystemdOverride }: typeof import("./onboard/ollama-systemd") = require("./onboard/ollama-systemd"); const { CUSTOM_BUILD_CONTEXT_WARN_BYTES, @@ -8622,6 +8622,8 @@ function ensureDashboardForward( () => runCaptureOpenshell(["forward", "list"]), { port: actualPort, sandboxName }, () => { sleep(1); bestEffortForwardStop(runOpenshell, actualPort); }, + 3, + { onProgress: buildForwardStartProgressLogger(actualPort) }, ); if (!fwdOk) { const looksLikePortConflict = looksLikeForwardPortConflict(fwdDiagnostic); diff --git a/src/lib/onboard/forward-start.test.ts b/src/lib/onboard/forward-start.test.ts index 85bc669c71..4bedb7e752 100644 --- a/src/lib/onboard/forward-start.test.ts +++ b/src/lib/onboard/forward-start.test.ts @@ -128,6 +128,43 @@ describe("runDetachedForwardStartWithDiagnostics", () => { expect(result.diagnostic).toMatch(/spawn ENOENT/); }); + it("invokes onProgress while waiting for the forward to appear", () => { + let now = 0; + const realNow = Date.now; + Date.now = () => now; + try { + const fetchList = vi.fn().mockReturnValue(""); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn().mockImplementation((ms) => { + now += ms; + }); + const onProgress = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { + overallTimeoutMs: 120_000, + pollIntervalMs: 1_000, + sleepMs: sleep, + onProgress, + progressIntervalMs: 30_000, + }, + ); + + expect(result.ok).toBe(false); + expect(onProgress).toHaveBeenCalled(); + const calls = onProgress.mock.calls; + expect(calls.length).toBeGreaterThanOrEqual(3); + expect(calls[0][0].elapsedMs).toBeGreaterThanOrEqual(30_000); + expect(result.diagnostic).toMatch(/forward did not appear in list within 120000ms/); + expect(result.diagnostic).toMatch(/last forward list: /); + } finally { + Date.now = realNow; + } + }); + it("surfaces persistent fetchForwardList failures in the timeout diagnostic", () => { const fetchList = vi.fn().mockImplementation(() => { throw new Error("gateway transport: connection refused"); diff --git a/src/lib/onboard/forward-start.ts b/src/lib/onboard/forward-start.ts index 3d94c69c04..4edfc64869 100644 --- a/src/lib/onboard/forward-start.ts +++ b/src/lib/onboard/forward-start.ts @@ -46,6 +46,11 @@ export interface DetachedForwardStartOptions { overallTimeoutMs?: number; pollIntervalMs?: number; sleepMs?: (ms: number) => void; + // Called once per `progressIntervalMs` while the helper is still waiting + // for the forward to appear in `openshell forward list`. The default is a + // no-op so the helper stays terminal-quiet in non-interactive contexts. + onProgress?: (info: { elapsedMs: number; listSnapshot: string }) => void; + progressIntervalMs?: number; } function readDiagnosticFile(filePath: string): string { @@ -125,6 +130,20 @@ function isForwardConfirmed( return getOccupiedPorts(forwardListOutput).get(String(expect.port)) === expect.sandboxName; } +/** + * Default progress logger for the detached forward-start helper. Emits a + * single line to stdout every `progressIntervalMs` while the helper is + * still polling. Kept here so the onboard call site does not need to + * recreate the same closure inline. + */ +export function buildForwardStartProgressLogger(port: number): (info: { elapsedMs: number }) => void { + return ({ elapsedMs }) => { + console.log( + ` Still waiting for forward on port ${port} to register (${Math.round(elapsedMs / 1000)}s elapsed)...`, + ); + }; +} + /** * Spawn `openshell forward start --background` as a detached child and wait * for the resulting forward to appear in `openshell forward list`. Returns @@ -145,9 +164,16 @@ export function runDetachedForwardStartWithDiagnostics( expect: { port: number; sandboxName: string }, options: DetachedForwardStartOptions = {}, ): DetachedForwardStartOutcome { - const overallTimeoutMs = options.overallTimeoutMs ?? 60_000; + // 180s deadline accommodates Docker compatibility gateways (host glibc + // older than openshell-gateway's requirement runs the gateway in an extra + // Docker container, adding per-call gRPC latency that can push the + // forward-registration handshake past a tighter timeout). See #4064. + const overallTimeoutMs = options.overallTimeoutMs ?? 180_000; const pollIntervalMs = options.pollIntervalMs ?? 500; const sleepImpl = options.sleepMs ?? blockingSleepMs; + const onProgress = options.onProgress; + const progressIntervalMs = options.progressIntervalMs ?? 30_000; + let nextProgressAt = Date.now() + progressIntervalMs; const forwardDiagPath = secureTempFile("nemoclaw-forward-start", ".out"); const forwardDiagDir = path.dirname(forwardDiagPath); @@ -201,6 +227,7 @@ export function runDetachedForwardStartWithDiagnostics( const start = Date.now(); const deadline = start + overallTimeoutMs; + let lastListSnapshot = ""; while (Date.now() < deadline) { let list = ""; try { @@ -208,6 +235,7 @@ export function runDetachedForwardStartWithDiagnostics( } catch (err) { lastFetchError = err instanceof Error ? err.message : String(err); } + lastListSnapshot = list; if (isForwardConfirmed(list, expect)) { return { ok: true, diagnostic: readDiag(), pid, reason: "ok" }; } @@ -218,13 +246,20 @@ export function runDetachedForwardStartWithDiagnostics( if (looksLikeForwardPortConflict(diagSoFar)) { return { ok: false, diagnostic: diagSoFar, pid, reason: "spawn-conflict" }; } + if (onProgress && Date.now() >= nextProgressAt) { + onProgress({ elapsedMs: Date.now() - start, listSnapshot: list }); + nextProgressAt = Date.now() + progressIntervalMs; + } sleepImpl(pollIntervalMs); } const finalDiag = readDiag(); + const listTail = lastListSnapshot + ? ` last forward list: ${compactText(redact(lastListSnapshot)).slice(0, 240)}` + : " last forward list: "; + const timeoutSummary = `forward did not appear in list within ${overallTimeoutMs}ms;${listTail}`; return { ok: false, - diagnostic: - finalDiag || `forward did not appear in list within ${overallTimeoutMs}ms`, + diagnostic: finalDiag ? `${timeoutSummary} ${finalDiag}` : timeoutSummary, pid, reason: "timeout", }; From 5a984120cde9f1b0feeb2e33464905d3d2edc264 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 23 May 2026 06:27:22 +0000 Subject: [PATCH 05/10] fix(onboard): owner-gate forward-stop, time-out forward-list, preflight openshell argv[0] PR #4072 review fixes. - `forward-cleanup.ts`: add `bestEffortForwardStopForSandbox` which consults `openshell forward list` before stopping a port. Skips the stop when the port is owned by a different sandbox, so a TOCTOU / port-conflict retry in `ensureDashboardForward` can no longer kill another sandbox's forward. - `onboard.ts`: use `bestEffortForwardStopForSandbox` for both the pre-start cleanup and the `beforeRetry` callback. Pass an explicit `timeout: 5_000` to `runCaptureOpenshell(["forward", "list"])` so a hung gateway probe cannot bypass the helper's overall deadline. - `forward-start.ts`: preflight `argv[0]` with `fs.accessSync(_, X_OK)` before spawning. The detached-spawn path runs inside a synchronous poll loop, so Node's async `error` event cannot reach the helper while it sleeps in `spawnSync`; surfacing ENOENT/EACCES at preflight time turns the would-be 180s timeout into an immediate spawn-error. The async error listener remains as belt-and-braces. - `forward-start.ts`: move `maxRetries` into the options object so the onboard callsite stays a single positional argument and the entrypoint budget remains net-zero against main. - `forward-cleanup.test.ts`: new file covering the four owner outcomes (stopped, owned-other, no-entry, list-failed) plus the non-live status filter. - `forward-start.test.ts`: replace the synthetic async-callback test with a real `buildDetachedForwardStartSpawn(["/missing"])` assertion so we exercise the actual preflight; update existing `runDetachedForwardStartWithPortReleaseRetries` callers to pass `maxRetries` via the options object. Signed-off-by: Tinson Lai --- src/lib/onboard.ts | 9 +-- src/lib/onboard/forward-cleanup.test.ts | 100 ++++++++++++++++++++++++ src/lib/onboard/forward-cleanup.ts | 43 ++++++++++ src/lib/onboard/forward-start.test.ts | 41 ++++------ src/lib/onboard/forward-start.ts | 22 +++++- 5 files changed, 182 insertions(+), 33 deletions(-) create mode 100644 src/lib/onboard/forward-cleanup.test.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 748a8dd225..fcd12feda1 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -17,7 +17,7 @@ const { }: typeof import("./onboard/branding") = require("./onboard/branding"); const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); -const { bestEffortForwardStop } = require("./onboard/forward-cleanup"); +const { bestEffortForwardStop, bestEffortForwardStopForSandbox } = require("./onboard/forward-cleanup"); const { buildDetachedForwardStartSpawn, buildForwardStartProgressLogger, looksLikeForwardPortConflict, runDetachedForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); const { ensureManagedOllamaLoopbackSystemdOverride, ensureOllamaLoopbackSystemdOverride }: typeof import("./onboard/ollama-systemd") = require("./onboard/ollama-systemd"); const { @@ -8616,13 +8616,12 @@ function ensureDashboardForward( const parsedUrl = new URL(chatUiUrl.includes("://") ? chatUiUrl : `http://${chatUiUrl}`); parsedUrl.port = String(actualPort); const actualTarget = getDashboardForwardTarget(parsedUrl.toString()); - bestEffortForwardStop(runOpenshell, actualPort); + bestEffortForwardStopForSandbox(runOpenshell, runCaptureOpenshell, actualPort, sandboxName); const { ok: fwdOk, diagnostic: fwdDiagnostic } = runDetachedForwardStartWithPortReleaseRetries( buildDetachedForwardStartSpawn(openshellArgv(["forward", "start", "--background", actualTarget, sandboxName])), - () => runCaptureOpenshell(["forward", "list"]), + () => runCaptureOpenshell(["forward", "list"], { timeout: 5_000 }), { port: actualPort, sandboxName }, - () => { sleep(1); bestEffortForwardStop(runOpenshell, actualPort); }, - 3, + () => { sleep(1); bestEffortForwardStopForSandbox(runOpenshell, runCaptureOpenshell, actualPort, sandboxName); }, { onProgress: buildForwardStartProgressLogger(actualPort) }, ); if (!fwdOk) { diff --git a/src/lib/onboard/forward-cleanup.test.ts b/src/lib/onboard/forward-cleanup.test.ts new file mode 100644 index 0000000000..72e57c2fd9 --- /dev/null +++ b/src/lib/onboard/forward-cleanup.test.ts @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { + bestEffortForwardStop, + bestEffortForwardStopForSandbox, +} from "../../../dist/lib/onboard/forward-cleanup"; + +function forwardListWith(entries: Array<{ sandbox: string; port: number; status?: string }>): string { + const header = "SANDBOX BIND PORT PID STATUS"; + const rows = entries.map( + (e) => `${e.sandbox} 127.0.0.1 ${e.port} 1234 ${e.status ?? "running"}`, + ); + return [header, ...rows].join("\n"); +} + +describe("bestEffortForwardStop", () => { + it("invokes `forward stop` with the port and silently ignores errors", () => { + const run = vi.fn(); + bestEffortForwardStop(run, 18789); + expect(run).toHaveBeenCalledWith( + ["forward", "stop", "18789"], + { ignoreError: true, suppressOutput: true }, + ); + }); +}); + +describe("bestEffortForwardStopForSandbox", () => { + it("returns owned-other and skips stop when the port belongs to a different sandbox", () => { + const run = vi.fn(); + const fetch = vi + .fn() + .mockReturnValue(forwardListWith([{ sandbox: "other-sandbox", port: 18789 }])); + + const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); + + expect(outcome).toBe("owned-other"); + expect(run).not.toHaveBeenCalled(); + expect(fetch).toHaveBeenCalledWith( + ["forward", "list"], + expect.objectContaining({ ignoreError: true, timeout: 5_000 }), + ); + }); + + it("returns stopped and runs forward stop when the port belongs to the same sandbox", () => { + const run = vi.fn(); + const fetch = vi + .fn() + .mockReturnValue(forwardListWith([{ sandbox: "my-sandbox", port: 18789 }])); + + const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); + + expect(outcome).toBe("stopped"); + expect(run).toHaveBeenCalledWith( + ["forward", "stop", "18789"], + { ignoreError: true, suppressOutput: true }, + ); + }); + + it("returns no-entry and runs stop defensively when no live forward is on that port", () => { + const run = vi.fn(); + const fetch = vi.fn().mockReturnValue(forwardListWith([])); + + const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); + + expect(outcome).toBe("no-entry"); + expect(run).toHaveBeenCalledTimes(1); + }); + + it("falls through to a defensive stop when `forward list` itself throws", () => { + const run = vi.fn(); + const fetch = vi.fn().mockImplementation(() => { + throw new Error("gateway timed out"); + }); + + const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); + + expect(outcome).toBe("list-failed"); + expect(run).toHaveBeenCalledTimes(1); + }); + + it("ignores forwards with non-live status when deciding ownership", () => { + // `getOccupiedPorts` filters by `isLiveForwardStatus`, so a "stopped" + // entry on the requested port should be treated as no-entry (not as a + // foreign owner). + const run = vi.fn(); + const fetch = vi + .fn() + .mockReturnValue( + forwardListWith([{ sandbox: "other-sandbox", port: 18789, status: "stopped" }]), + ); + + const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); + + expect(outcome).toBe("no-entry"); + expect(run).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/lib/onboard/forward-cleanup.ts b/src/lib/onboard/forward-cleanup.ts index 6f1eed8094..44fdee0ee1 100644 --- a/src/lib/onboard/forward-cleanup.ts +++ b/src/lib/onboard/forward-cleanup.ts @@ -1,11 +1,18 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { getOccupiedPorts } from "./dashboard-port"; + export type ForwardStopRunner = ( args: string[], opts: { ignoreError?: boolean; suppressOutput?: boolean }, ) => unknown; +export type ForwardListRunner = ( + args: string[], + opts: { ignoreError?: boolean; timeout?: number }, +) => string; + export function bestEffortForwardStop( runOpenshell: ForwardStopRunner, port: string | number, @@ -15,3 +22,39 @@ export function bestEffortForwardStop( suppressOutput: true, }); } + +/** + * Stop the forward on `port` only when `openshell forward list` reports it + * is owned by `sandboxName` (or unowned). Prevents the dashboard-forward + * recovery / retry path from killing another sandbox's forward when two + * onboard runs race on the same port. + * + * Returns: + * - "stopped" — entry matched sandboxName and the stop ran. + * - "owned-other" — entry exists for a different sandbox; stop skipped. + * - "no-entry" — no live entry for that port; stop ran defensively. + * - "list-failed" — could not enumerate forwards; stop ran defensively. + */ +export function bestEffortForwardStopForSandbox( + runOpenshell: ForwardStopRunner, + runCaptureOpenshell: ForwardListRunner, + port: string | number, + sandboxName: string, +): "stopped" | "owned-other" | "no-entry" | "list-failed" { + let listOutput = ""; + try { + listOutput = runCaptureOpenshell(["forward", "list"], { + ignoreError: true, + timeout: 5_000, + }); + } catch { + bestEffortForwardStop(runOpenshell, port); + return "list-failed"; + } + const owner = getOccupiedPorts(listOutput).get(String(port)) ?? null; + if (owner && owner !== sandboxName) { + return "owned-other"; + } + bestEffortForwardStop(runOpenshell, port); + return owner === sandboxName ? "stopped" : "no-entry"; +} diff --git a/src/lib/onboard/forward-start.test.ts b/src/lib/onboard/forward-start.test.ts index 4bedb7e752..f9e6c58387 100644 --- a/src/lib/onboard/forward-start.test.ts +++ b/src/lib/onboard/forward-start.test.ts @@ -4,6 +4,7 @@ import { describe, expect, it, vi } from "vitest"; import { + buildDetachedForwardStartSpawn, looksLikeForwardPortConflict, runDetachedForwardStartWithDiagnostics, runDetachedForwardStartWithPortReleaseRetries, @@ -98,34 +99,29 @@ describe("runDetachedForwardStartWithDiagnostics", () => { expect(sleep).not.toHaveBeenCalled(); }); - it("surfaces async spawn errors fired after the runner returned", () => { + it("preflights argv[0] and short-circuits on a missing openshell binary", () => { const fetchList = vi.fn().mockReturnValue(""); - let asyncErrorCallback: ((err: Error) => void) | undefined; - const spawn = vi.fn().mockImplementation((_stdio, onAsyncError) => { - asyncErrorCallback = onAsyncError; - return { pid: 4242 }; - }); - // The sleep stub is the only synchronous yield point in the helper's - // poll loop. Simulate Node's `error` event firing during that pause so - // the next iteration observes `asyncSpawnError` and returns spawn-error. - let sleepCalls = 0; - const sleep = vi.fn().mockImplementation(() => { - sleepCalls += 1; - if (sleepCalls === 1 && asyncErrorCallback) { - asyncErrorCallback(new Error("spawn ENOENT")); - } - }); + const sleep = vi.fn(); + // Real `buildDetachedForwardStartSpawn` checks `fs.accessSync(argv[0], + // X_OK)` before spawning, so a missing binary surfaces as a synchronous + // spawn-error instead of relying on Node's async `error` event (which + // cannot fire while the helper is sleeping inside spawnSync). #4072 P3. + const spawn = buildDetachedForwardStartSpawn(["/nonexistent/openshell-binary-for-test"]); const result = runDetachedForwardStartWithDiagnostics( spawn, fetchList, { port: 18789, sandboxName: "my-sandbox" }, - { overallTimeoutMs: 1_000, pollIntervalMs: 5, sleepMs: sleep }, + { overallTimeoutMs: 5_000, pollIntervalMs: 5, sleepMs: sleep }, ); expect(result.ok).toBe(false); expect(result.reason).toBe("spawn-error"); - expect(result.diagnostic).toMatch(/spawn ENOENT/); + expect(result.diagnostic).toMatch(/ENOENT|EACCES|no such file|permission denied/i); + // No polling should have happened; the helper returned at the spawn + // preflight step. + expect(fetchList).not.toHaveBeenCalled(); + expect(sleep).not.toHaveBeenCalled(); }); it("invokes onProgress while waiting for the forward to appear", () => { @@ -228,8 +224,7 @@ describe("runDetachedForwardStartWithPortReleaseRetries", () => { fetchList, { port: 18789, sandboxName: "my-sandbox" }, beforeRetry, - 3, - { overallTimeoutMs: 30, pollIntervalMs: 10, sleepMs: sleep }, + { overallTimeoutMs: 30, pollIntervalMs: 10, sleepMs: sleep, maxRetries: 3 }, ); expect(result.ok).toBe(true); @@ -248,8 +243,7 @@ describe("runDetachedForwardStartWithPortReleaseRetries", () => { fetchList, { port: 18789, sandboxName: "my-sandbox" }, beforeRetry, - 3, - { overallTimeoutMs: 20, pollIntervalMs: 10, sleepMs: sleep }, + { overallTimeoutMs: 20, pollIntervalMs: 10, sleepMs: sleep, maxRetries: 3 }, ); expect(result.ok).toBe(false); @@ -271,8 +265,7 @@ describe("runDetachedForwardStartWithPortReleaseRetries", () => { fetchList, { port: 18789, sandboxName: "my-sandbox" }, beforeRetry, - 2, - { overallTimeoutMs: 20, pollIntervalMs: 10, sleepMs: sleep }, + { overallTimeoutMs: 20, pollIntervalMs: 10, sleepMs: sleep, maxRetries: 2 }, ); expect(result.ok).toBe(false); diff --git a/src/lib/onboard/forward-start.ts b/src/lib/onboard/forward-start.ts index 4edfc64869..b1b6a8cfb5 100644 --- a/src/lib/onboard/forward-start.ts +++ b/src/lib/onboard/forward-start.ts @@ -51,6 +51,9 @@ export interface DetachedForwardStartOptions { // no-op so the helper stays terminal-quiet in non-interactive contexts. onProgress?: (info: { elapsedMs: number; listSnapshot: string }) => void; progressIntervalMs?: number; + // Number of EADDRINUSE-style retries after the initial attempt. Honoured + // only by `runDetachedForwardStartWithPortReleaseRetries`. Defaults to 3. + maxRetries?: number; } function readDiagnosticFile(filePath: string): string { @@ -102,14 +105,25 @@ export function buildDetachedForwardStartSpawn( argv: readonly string[], ): DetachedForwardSpawnRunner { return ({ stdout, stderr }, onAsyncError) => { + // Preflight: the helper polls synchronously, so a Node `error` event + // dispatched after `spawn` returns cannot reach the poll loop while it + // is sleeping on a `spawnSync` child. Catch the obvious ENOENT/EACCES + // cases up front via `fs.accessSync` so the helper returns the real + // failure immediately instead of timing out 180s later. + try { + fs.accessSync(argv[0], fs.constants.X_OK); + } catch (e) { + return { error: e instanceof Error ? e : new Error(String(e)) }; + } try { const child = spawnChild(argv[0], argv.slice(1), { stdio: ["ignore", stdout, stderr], detached: true, }); - // Async failures (ENOENT/EACCES at execve, signal during spawn) arrive - // via `error` after `spawn` already returned. Without a listener they - // bubble to the process and crash onboard. + // Belt-and-braces: register the `error` listener in case a runtime + // execve failure still slips through the preflight (race against + // permission changes, NFS mounts going stale, etc.). The poll loop + // checks `asyncSpawnError` between iterations. if (onAsyncError) { child.on("error", onAsyncError); } else { @@ -279,9 +293,9 @@ export function runDetachedForwardStartWithPortReleaseRetries( fetchForwardList: ForwardListFetcher, expect: { port: number; sandboxName: string }, beforeRetry: () => void, - maxRetries = 3, options: DetachedForwardStartOptions = {}, ): DetachedForwardStartOutcome { + const maxRetries = options.maxRetries ?? 3; let attempt = runDetachedForwardStartWithDiagnostics( runDetachedSpawn, fetchForwardList, From 10f4f329d6e075861ceb43a876444b78bba6c6b7 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 23 May 2026 07:35:02 +0000 Subject: [PATCH 06/10] fix(onboard): close TOCTOU + dead-code paths in forward-start helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #4072 review fixes. - `forward-cleanup.ts`: when `openshell forward list` fails, the owner- scoped stop now skips entirely instead of falling through to a port-only `forward stop` that could kill an unrelated sandbox's forward. Owner-confirmed and no-entry paths use the sandbox-scoped `forward stop ` form so a TOCTOU window between list and stop cannot collateral another sandbox's forward either. Pulled the probe timeout from the shared `OPENSHELL_PROBE_TIMEOUT_MS` constant (15s) instead of an ad-hoc 5_000 — a slow Docker-compat gateway can otherwise time out every list call and flip the helper into false-rollback territory. - `onboard.ts`: import `OPENSHELL_PROBE_TIMEOUT_MS` and use it for the forward-list poll's per-call timeout, matching the other openshell probe sites. - `forward-start.ts`: drop the `onAsyncError` parameter and the `asyncSpawnError` check. The detached-spawn path runs inside a synchronous poll loop, so Node's async `error` event cannot reach the helper while it sleeps inside `spawnSync`. The preflight `fs.accessSync(argv[0], X_OK)` already catches the common ENOENT/EACCES cases; the helper now keeps only a no-op `error` listener to swallow any late event so it cannot crash the process. - `forward-start.ts`: on timeout / port-conflict outcomes, send SIGTERM to the detached `openshell forward start --background` pid so the child cannot still register a forward minutes after onboard rolled the sandbox back (which would race the next onboard attempt for the same port). - `forward-cleanup.test.ts`: cover the new behaviours — list-failed now skips stop entirely, owner-confirmed and no-entry use the sandbox-scoped 4-arg `forward stop` form, and the probe timeout asserts `OPENSHELL_PROBE_TIMEOUT_MS` (15s) rather than the previous 5_000 literal. Signed-off-by: Tinson Lai --- src/lib/onboard.ts | 3 +- src/lib/onboard/forward-cleanup.test.ts | 23 +++++++--- src/lib/onboard/forward-cleanup.ts | 41 ++++++++++++++---- src/lib/onboard/forward-start.ts | 57 +++++++++++++------------ 4 files changed, 80 insertions(+), 44 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 127aa3483d..23cc6db940 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -272,6 +272,7 @@ const { shouldFrontOllamaWithProxy, }: typeof import("./onboard/local-inference-topology") = require("./onboard/local-inference-topology"); const { resolveOpenshell } = require("./adapters/openshell/resolve"); +const { OPENSHELL_PROBE_TIMEOUT_MS } = require("./adapters/openshell/timeouts"); const credentials: typeof import("./credentials/store") = require("./credentials/store"); const { prompt, @@ -8622,7 +8623,7 @@ function ensureDashboardForward( bestEffortForwardStopForSandbox(runOpenshell, runCaptureOpenshell, actualPort, sandboxName); const { ok: fwdOk, diagnostic: fwdDiagnostic } = runDetachedForwardStartWithPortReleaseRetries( buildDetachedForwardStartSpawn(openshellArgv(["forward", "start", "--background", actualTarget, sandboxName])), - () => runCaptureOpenshell(["forward", "list"], { timeout: 5_000 }), + () => runCaptureOpenshell(["forward", "list"], { timeout: OPENSHELL_PROBE_TIMEOUT_MS }), { port: actualPort, sandboxName }, () => { sleep(1); bestEffortForwardStopForSandbox(runOpenshell, runCaptureOpenshell, actualPort, sandboxName); }, { onProgress: buildForwardStartProgressLogger(actualPort) }, diff --git a/src/lib/onboard/forward-cleanup.test.ts b/src/lib/onboard/forward-cleanup.test.ts index 72e57c2fd9..49730d4b34 100644 --- a/src/lib/onboard/forward-cleanup.test.ts +++ b/src/lib/onboard/forward-cleanup.test.ts @@ -40,11 +40,11 @@ describe("bestEffortForwardStopForSandbox", () => { expect(run).not.toHaveBeenCalled(); expect(fetch).toHaveBeenCalledWith( ["forward", "list"], - expect.objectContaining({ ignoreError: true, timeout: 5_000 }), + expect.objectContaining({ ignoreError: true, timeout: 15_000 }), ); }); - it("returns stopped and runs forward stop when the port belongs to the same sandbox", () => { + it("returns stopped and uses the sandbox-scoped forward stop form when ownership matches", () => { const run = vi.fn(); const fetch = vi .fn() @@ -53,23 +53,29 @@ describe("bestEffortForwardStopForSandbox", () => { const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); expect(outcome).toBe("stopped"); + // Sandbox-scoped stop closes the TOCTOU window between list and stop: + // even if another sandbox bound the port in the meantime, openshell + // forward stop with both args will refuse to kill it. expect(run).toHaveBeenCalledWith( - ["forward", "stop", "18789"], + ["forward", "stop", "18789", "my-sandbox"], { ignoreError: true, suppressOutput: true }, ); }); - it("returns no-entry and runs stop defensively when no live forward is on that port", () => { + it("returns no-entry and runs a sandbox-scoped stop when no live forward is on that port", () => { const run = vi.fn(); const fetch = vi.fn().mockReturnValue(forwardListWith([])); const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); expect(outcome).toBe("no-entry"); - expect(run).toHaveBeenCalledTimes(1); + expect(run).toHaveBeenCalledWith( + ["forward", "stop", "18789", "my-sandbox"], + { ignoreError: true, suppressOutput: true }, + ); }); - it("falls through to a defensive stop when `forward list` itself throws", () => { + it("skips the stop entirely when `forward list` itself throws (owner unknown)", () => { const run = vi.fn(); const fetch = vi.fn().mockImplementation(() => { throw new Error("gateway timed out"); @@ -78,7 +84,10 @@ describe("bestEffortForwardStopForSandbox", () => { const outcome = bestEffortForwardStopForSandbox(run, fetch, 18789, "my-sandbox"); expect(outcome).toBe("list-failed"); - expect(run).toHaveBeenCalledTimes(1); + // Without ownership data, a port-only stop could kill another + // sandbox's forward — better to leave the port alone and let the + // helper's retry / next poll observe the real state. + expect(run).not.toHaveBeenCalled(); }); it("ignores forwards with non-live status when deciding ownership", () => { diff --git a/src/lib/onboard/forward-cleanup.ts b/src/lib/onboard/forward-cleanup.ts index 44fdee0ee1..af5a5d1250 100644 --- a/src/lib/onboard/forward-cleanup.ts +++ b/src/lib/onboard/forward-cleanup.ts @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { OPENSHELL_PROBE_TIMEOUT_MS } from "../adapters/openshell/timeouts"; + import { getOccupiedPorts } from "./dashboard-port"; export type ForwardStopRunner = ( @@ -13,6 +15,15 @@ export type ForwardListRunner = ( opts: { ignoreError?: boolean; timeout?: number }, ) => string; +/** + * `openshell forward stop ` — port-scoped, kills whatever forward is + * currently bound to that port. Use only when the caller has no sandbox + * context (or is intentionally targeting any owner — e.g. wholesale + * dashboard-port reclaim during recovery). Sandbox-aware call sites should + * prefer `bestEffortForwardStopForSandbox`, which uses the sandbox-scoped + * `forward stop ` form so the kill cannot collateral + * another sandbox's forward in a TOCTOU window. + */ export function bestEffortForwardStop( runOpenshell: ForwardStopRunner, port: string | number, @@ -25,15 +36,27 @@ export function bestEffortForwardStop( /** * Stop the forward on `port` only when `openshell forward list` reports it - * is owned by `sandboxName` (or unowned). Prevents the dashboard-forward - * recovery / retry path from killing another sandbox's forward when two - * onboard runs race on the same port. + * is owned by `sandboxName` (or unowned). When the list query fails the + * stop is skipped — without ownership data we cannot tell whether port + * belongs to a concurrent onboard / connect on a different sandbox, so the + * safe choice is to leave the port alone and let the helper's retry path + * (or the next forward list poll once the gateway is responsive) sort + * itself out. + * + * The stop itself uses the sandbox-scoped `forward stop ` + * form so a TOCTOU window between list and stop cannot accidentally kill + * another sandbox's forward that bound to the same port in the meantime. * * Returns: * - "stopped" — entry matched sandboxName and the stop ran. * - "owned-other" — entry exists for a different sandbox; stop skipped. - * - "no-entry" — no live entry for that port; stop ran defensively. - * - "list-failed" — could not enumerate forwards; stop ran defensively. + * - "no-entry" — no live entry for that port; stop ran defensively + * (sandbox-scoped, so a concurrent racer's forward + * that may have bound the port between list and stop + * is left alone). + * - "list-failed" — could not enumerate forwards; stop SKIPPED. The + * owner is unknown and the port-only `forward stop` + * could kill an unrelated sandbox's forward. */ export function bestEffortForwardStopForSandbox( runOpenshell: ForwardStopRunner, @@ -45,16 +68,18 @@ export function bestEffortForwardStopForSandbox( try { listOutput = runCaptureOpenshell(["forward", "list"], { ignoreError: true, - timeout: 5_000, + timeout: OPENSHELL_PROBE_TIMEOUT_MS, }); } catch { - bestEffortForwardStop(runOpenshell, port); return "list-failed"; } const owner = getOccupiedPorts(listOutput).get(String(port)) ?? null; if (owner && owner !== sandboxName) { return "owned-other"; } - bestEffortForwardStop(runOpenshell, port); + runOpenshell(["forward", "stop", String(port), sandboxName], { + ignoreError: true, + suppressOutput: true, + }); return owner === sandboxName ? "stopped" : "no-entry"; } diff --git a/src/lib/onboard/forward-start.ts b/src/lib/onboard/forward-start.ts index b1b6a8cfb5..866c8e376a 100644 --- a/src/lib/onboard/forward-start.ts +++ b/src/lib/onboard/forward-start.ts @@ -27,12 +27,6 @@ export type ForwardListFetcher = () => string; export type DetachedForwardSpawnRunner = ( stdio: { stdout: number; stderr: number }, - // Invoked when the underlying child process emits an asynchronous `error` - // event (e.g. ENOENT or EACCES at execve time — these fire after `spawn` - // already returned a child object with `pid === undefined`). The helper - // wires this to a closure variable so the next forward-list poll iteration - // surfaces the failure in the outcome diagnostic. - onAsyncError?: (err: Error) => void, ) => { pid?: number; error?: Error }; export interface DetachedForwardStartOutcome { @@ -104,7 +98,7 @@ function blockingSleepMs(ms: number): void { export function buildDetachedForwardStartSpawn( argv: readonly string[], ): DetachedForwardSpawnRunner { - return ({ stdout, stderr }, onAsyncError) => { + return ({ stdout, stderr }) => { // Preflight: the helper polls synchronously, so a Node `error` event // dispatched after `spawn` returns cannot reach the poll loop while it // is sleeping on a `spawnSync` child. Catch the obvious ENOENT/EACCES @@ -120,15 +114,9 @@ export function buildDetachedForwardStartSpawn( stdio: ["ignore", stdout, stderr], detached: true, }); - // Belt-and-braces: register the `error` listener in case a runtime - // execve failure still slips through the preflight (race against - // permission changes, NFS mounts going stale, etc.). The poll loop - // checks `asyncSpawnError` between iterations. - if (onAsyncError) { - child.on("error", onAsyncError); - } else { - child.on("error", () => {}); - } + // Swallow any belated `error` event so a race between accessSync and + // execve does not crash the process via an unhandled emitter. + child.on("error", () => {}); child.unref(); return { pid: child.pid }; } catch (e) { @@ -144,6 +132,23 @@ function isForwardConfirmed( return getOccupiedPorts(forwardListOutput).get(String(expect.port)) === expect.sandboxName; } +/** + * Best-effort SIGTERM of the detached `openshell forward start --background` + * process when the helper gives up. Without this, a slow gateway handshake + * can still register a forward minutes after onboard already rolled the + * sandbox back, causing the next onboard attempt on the same port to race + * an orphan CLI for the dashboard. `kill` swallows ESRCH so a child that + * already exited is a no-op. + */ +function terminateDetachedForwardChild(pid: number | undefined): void { + if (!pid || pid <= 0) return; + try { + process.kill(pid, "SIGTERM"); + } catch { + /* already exited or out of our reach */ + } +} + /** * Default progress logger for the detached forward-start helper. Emits a * single line to stdout every `progressIntervalMs` while the helper is @@ -201,14 +206,8 @@ export function runDetachedForwardStartWithDiagnostics( let pid: number | undefined; let spawnError: Error | undefined; - let asyncSpawnError: Error | undefined; try { - const spawnResult = runDetachedSpawn( - { stdout: outFd, stderr: errFd }, - (err) => { - if (!asyncSpawnError) asyncSpawnError = err; - }, - ); + const spawnResult = runDetachedSpawn({ stdout: outFd, stderr: errFd }); pid = spawnResult.pid; spawnError = spawnResult.error; } finally { @@ -229,9 +228,8 @@ export function runDetachedForwardStartWithDiagnostics( const stderr = readDiagnosticFile(forwardErrPath); const stdout = readDiagnosticFile(forwardDiagPath); const message = spawnError instanceof Error ? spawnError.message : ""; - const asyncMsg = asyncSpawnError instanceof Error ? ` ${asyncSpawnError.message}` : ""; const fetchSuffix = lastFetchError ? ` openshell forward list failed: ${lastFetchError}` : ""; - return compactText(redact(`${stderr} ${stdout} ${message}${asyncMsg}${fetchSuffix}`)); + return compactText(redact(`${stderr} ${stdout} ${message}${fetchSuffix}`)); }; try { @@ -253,11 +251,9 @@ export function runDetachedForwardStartWithDiagnostics( if (isForwardConfirmed(list, expect)) { return { ok: true, diagnostic: readDiag(), pid, reason: "ok" }; } - if (asyncSpawnError) { - return { ok: false, diagnostic: readDiag(), pid, reason: "spawn-error" }; - } const diagSoFar = readDiag(); if (looksLikeForwardPortConflict(diagSoFar)) { + terminateDetachedForwardChild(pid); return { ok: false, diagnostic: diagSoFar, pid, reason: "spawn-conflict" }; } if (onProgress && Date.now() >= nextProgressAt) { @@ -271,6 +267,11 @@ export function runDetachedForwardStartWithDiagnostics( ? ` last forward list: ${compactText(redact(lastListSnapshot)).slice(0, 240)}` : " last forward list: "; const timeoutSummary = `forward did not appear in list within ${overallTimeoutMs}ms;${listTail}`; + // The detached `openshell forward start --background` process may still + // be running (e.g. blocked on a slow gateway handshake). If the caller + // is about to roll back the sandbox, leaving an orphan CLI that may yet + // succeed would race with the next onboard attempt for the same port. + terminateDetachedForwardChild(pid); return { ok: false, diagnostic: finalDiag ? `${timeoutSummary} ${finalDiag}` : timeoutSummary, From 169d51fb79d90581e15ffe0026a3b082eb644c53 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 23 May 2026 07:57:36 +0000 Subject: [PATCH 07/10] fix(onboard): null-pid spawn-error, clear stale fetch error, SIGTERM tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #4072 review fixes. - `forward-start.ts`: `buildDetachedForwardStartSpawn` now returns a synchronous spawn-error when `spawn` came back with `child.pid == null`. Without this the swallowed `error` listener would let the caller wait the full 180s deadline for a child that never actually ran. `child.unref()` only runs once the pid is known. - `forward-start.ts`: clear `lastFetchError` after a successful `fetchForwardList` so a recovered gateway does not leave a stale "openshell forward list failed: …" tail on the eventual timeout diagnostic. - `forward-start.ts` / `forward-start.test.ts`: drop the issue/PR references from the new comments; keep the generic rationale. - `forward-start.test.ts`: cover the SIGTERM paths — timeout SIGTERMs the detached pid, port-conflict diagnostic SIGTERMs the detached pid (driven by writing a real EADDRINUSE line to the captured stderr fd), and a spawn-error outcome (no pid) leaves `process.kill` untouched. Also pin the lastFetchError clearing behaviour. Signed-off-by: Tinson Lai --- src/lib/onboard/forward-start.test.ts | 96 ++++++++++++++++++++++++++- src/lib/onboard/forward-start.ts | 15 ++++- 2 files changed, 108 insertions(+), 3 deletions(-) diff --git a/src/lib/onboard/forward-start.test.ts b/src/lib/onboard/forward-start.test.ts index f9e6c58387..7822f960e6 100644 --- a/src/lib/onboard/forward-start.test.ts +++ b/src/lib/onboard/forward-start.test.ts @@ -105,7 +105,7 @@ describe("runDetachedForwardStartWithDiagnostics", () => { // Real `buildDetachedForwardStartSpawn` checks `fs.accessSync(argv[0], // X_OK)` before spawning, so a missing binary surfaces as a synchronous // spawn-error instead of relying on Node's async `error` event (which - // cannot fire while the helper is sleeping inside spawnSync). #4072 P3. + // cannot fire while the helper is sleeping inside spawnSync). const spawn = buildDetachedForwardStartSpawn(["/nonexistent/openshell-binary-for-test"]); const result = runDetachedForwardStartWithDiagnostics( @@ -201,6 +201,100 @@ describe("runDetachedForwardStartWithDiagnostics", () => { expect(result.ok).toBe(true); expect(fetchList).toHaveBeenCalledTimes(2); }); + + it("clears a transient fetch error from the diagnostic when a later poll succeeds", () => { + const fetchList = vi + .fn() + .mockImplementationOnce(() => { + throw new Error("transient gateway: connection refused"); + }) + .mockReturnValue(""); + const spawn = vi.fn().mockReturnValue({ pid: 42 }); + const sleep = vi.fn(); + + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 30, pollIntervalMs: 10, sleepMs: sleep }, + ); + + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + expect(result.diagnostic).not.toMatch(/openshell forward list failed/); + }); + + it("SIGTERMs the detached child on timeout", () => { + const fetchList = vi.fn().mockReturnValue(""); + const spawn = vi.fn().mockReturnValue({ pid: 4242 }); + const sleep = vi.fn(); + const realKill = process.kill; + const killSpy = vi.fn(); + // Replace process.kill so the test does not actually try to signal pid 4242. + (process as { kill: typeof process.kill }).kill = killSpy as unknown as typeof process.kill; + try { + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 20, pollIntervalMs: 10, sleepMs: sleep }, + ); + expect(result.ok).toBe(false); + expect(result.reason).toBe("timeout"); + expect(killSpy).toHaveBeenCalledWith(4242, "SIGTERM"); + } finally { + (process as { kill: typeof process.kill }).kill = realKill; + } + }); + + it("SIGTERMs the detached child on a port-conflict diagnostic", () => { + // Spawn writes an EADDRINUSE line to the stderr file descriptor so the + // first poll iteration reads it back and trips the conflict branch. + const realFs = require("node:fs"); + const fetchList = vi.fn().mockReturnValue(""); + const spawn = vi.fn().mockImplementation(({ stderr }: { stderr: number }) => { + realFs.writeSync(stderr, "listen tcp 0.0.0.0:18789: bind: address already in use\n"); + return { pid: 8888 }; + }); + const sleep = vi.fn(); + const realKill = process.kill; + const killSpy = vi.fn(); + (process as { kill: typeof process.kill }).kill = killSpy as unknown as typeof process.kill; + try { + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 1_000, pollIntervalMs: 10, sleepMs: sleep }, + ); + expect(result.ok).toBe(false); + expect(result.reason).toBe("spawn-conflict"); + expect(killSpy).toHaveBeenCalledWith(8888, "SIGTERM"); + } finally { + (process as { kill: typeof process.kill }).kill = realKill; + } + }); + + it("does not SIGTERM when spawn never produced a pid", () => { + const fetchList = vi.fn(); + const spawn = vi.fn().mockReturnValue({ error: new Error("ENOENT") }); + const sleep = vi.fn(); + const realKill = process.kill; + const killSpy = vi.fn(); + (process as { kill: typeof process.kill }).kill = killSpy as unknown as typeof process.kill; + try { + const result = runDetachedForwardStartWithDiagnostics( + spawn, + fetchList, + { port: 18789, sandboxName: "my-sandbox" }, + { overallTimeoutMs: 1_000, pollIntervalMs: 10, sleepMs: sleep }, + ); + expect(result.reason).toBe("spawn-error"); + expect(killSpy).not.toHaveBeenCalled(); + } finally { + (process as { kill: typeof process.kill }).kill = realKill; + } + }); }); describe("runDetachedForwardStartWithPortReleaseRetries", () => { diff --git a/src/lib/onboard/forward-start.ts b/src/lib/onboard/forward-start.ts index 866c8e376a..a9274f1716 100644 --- a/src/lib/onboard/forward-start.ts +++ b/src/lib/onboard/forward-start.ts @@ -15,7 +15,7 @@ import { cleanupTempDir, secureTempFile } from "./temp-files"; // some platforms (notably the Docker compatibility gateway used when the // host glibc is older than the openshell-gateway requirement). spawnSync // then waits on those fds until the daemon exits — minutes later — and -// reports ETIMEDOUT even though the forward is established. See #4064. +// reports ETIMEDOUT even though the forward is established. // // The detached path below spawns the CLI with `detached: true`, hands it // independent diagnostic file descriptors, and confirms success by polling @@ -117,6 +117,13 @@ export function buildDetachedForwardStartSpawn( // Swallow any belated `error` event so a race between accessSync and // execve does not crash the process via an unhandled emitter. child.on("error", () => {}); + // A null/undefined pid means execve failed even though the preflight + // succeeded (race against permission changes, ulimit, etc.). The async + // `error` event would otherwise be swallowed by the listener above and + // the caller would wait the full deadline for a child that never ran. + if (child.pid == null) { + return { error: new Error(`spawn ${argv[0]} returned no pid`) }; + } child.unref(); return { pid: child.pid }; } catch (e) { @@ -186,7 +193,7 @@ export function runDetachedForwardStartWithDiagnostics( // 180s deadline accommodates Docker compatibility gateways (host glibc // older than openshell-gateway's requirement runs the gateway in an extra // Docker container, adding per-call gRPC latency that can push the - // forward-registration handshake past a tighter timeout). See #4064. + // forward-registration handshake past a tighter timeout). const overallTimeoutMs = options.overallTimeoutMs ?? 180_000; const pollIntervalMs = options.pollIntervalMs ?? 500; const sleepImpl = options.sleepMs ?? blockingSleepMs; @@ -244,6 +251,10 @@ export function runDetachedForwardStartWithDiagnostics( let list = ""; try { list = fetchForwardList() || ""; + // Clear the cached transient error so a recovered gateway does not + // leave a stale "openshell forward list failed: …" suffix on the + // eventual timeout diagnostic. + lastFetchError = null; } catch (err) { lastFetchError = err instanceof Error ? err.message : String(err); } From dddbe5d16042aa6ebb5d597a493ed1f001124c36 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 23 May 2026 13:27:59 +0000 Subject: [PATCH 08/10] fix(onboard): inline CONTROL_UI_PORT to keep onboard.ts within budget Signed-off-by: Tinson Lai --- src/lib/onboard.ts | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 54b548fdac..67b359821c 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -21,11 +21,11 @@ const { }: typeof import("./onboard/inference-selection-validation") = require("./onboard/inference-selection-validation"); const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); -const { bestEffortForwardStop } = require("./onboard/forward-cleanup"); const { ensureManagedOllamaLoopbackSystemdOverride, ensureOllamaLoopbackSystemdOverride, }: typeof import("./onboard/ollama-systemd") = require("./onboard/ollama-systemd"); +const { bestEffortForwardStop } = require("./onboard/forward-cleanup"); const { CUSTOM_BUILD_CONTEXT_WARN_BYTES, isInsideIgnoredCustomBuildContextPath, @@ -2974,7 +2974,7 @@ async function createSandbox( } } const preferredPort = - controlUiPort ?? envPort ?? persistedPort ?? (agent ? agent.forwardPort : CONTROL_UI_PORT); + controlUiPort ?? envPort ?? persistedPort ?? (agent ? agent.forwardPort : DASHBOARD_PORT); const earlyForwards = runCaptureOpenshell(["forward", "list"], { ignoreError: true }); const effectivePort = findAvailableDashboardPort(sandboxName, preferredPort, earlyForwards); if (effectivePort !== preferredPort) { @@ -6694,8 +6694,6 @@ async function setupPoliciesWithSelection( // ── Dashboard ──────────────────────────────────────────────────── -const CONTROL_UI_PORT = DASHBOARD_PORT; - const { buildChain, buildControlUiUrls, @@ -6722,7 +6720,6 @@ const { printAgentDashboardUi: agentOnboard.printDashboardUi, }); - const onboardRuntimeBoundary = new OnboardRuntimeBoundary({ toSessionUpdates: (updates: Record) => toSessionUpdates(updates as Parameters[0]), From 707d647a8a7d6fbb0c51a5871b5bc8f148221e82 Mon Sep 17 00:00:00 2001 From: Tinson Lai Date: Sat, 23 May 2026 14:59:37 +0000 Subject: [PATCH 09/10] fix(onboard): list-failed catches forward-list errors; lift fs import in test Signed-off-by: Tinson Lai --- src/lib/onboard/forward-cleanup.test.ts | 8 +++++++- src/lib/onboard/forward-cleanup.ts | 6 +++++- src/lib/onboard/forward-start.test.ts | 5 +++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/lib/onboard/forward-cleanup.test.ts b/src/lib/onboard/forward-cleanup.test.ts index 49730d4b34..f43c2bfbd4 100644 --- a/src/lib/onboard/forward-cleanup.test.ts +++ b/src/lib/onboard/forward-cleanup.test.ts @@ -40,7 +40,13 @@ describe("bestEffortForwardStopForSandbox", () => { expect(run).not.toHaveBeenCalled(); expect(fetch).toHaveBeenCalledWith( ["forward", "list"], - expect.objectContaining({ ignoreError: true, timeout: 15_000 }), + expect.objectContaining({ timeout: 15_000 }), + ); + // Caller must NOT pass ignoreError; failures should throw so the catch + // branch returns "list-failed" instead of running a stop with no owner data. + expect(fetch).not.toHaveBeenCalledWith( + ["forward", "list"], + expect.objectContaining({ ignoreError: true }), ); }); diff --git a/src/lib/onboard/forward-cleanup.ts b/src/lib/onboard/forward-cleanup.ts index af5a5d1250..5ccffe71f5 100644 --- a/src/lib/onboard/forward-cleanup.ts +++ b/src/lib/onboard/forward-cleanup.ts @@ -64,10 +64,14 @@ export function bestEffortForwardStopForSandbox( port: string | number, sandboxName: string, ): "stopped" | "owned-other" | "no-entry" | "list-failed" { + // Let runCaptureOpenshell throw on failure/timeout so the catch branch + // returns "list-failed". With ignoreError: true the runner would swallow + // the error and return "", which getOccupiedPorts parses as an empty map + // and the "no-entry" branch below would still run the stop — exactly the + // collateral-damage case this helper exists to avoid. let listOutput = ""; try { listOutput = runCaptureOpenshell(["forward", "list"], { - ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS, }); } catch { diff --git a/src/lib/onboard/forward-start.test.ts b/src/lib/onboard/forward-start.test.ts index 7822f960e6..7a4cfefc35 100644 --- a/src/lib/onboard/forward-start.test.ts +++ b/src/lib/onboard/forward-start.test.ts @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import fs from "node:fs"; + import { describe, expect, it, vi } from "vitest"; import { @@ -250,10 +252,9 @@ describe("runDetachedForwardStartWithDiagnostics", () => { it("SIGTERMs the detached child on a port-conflict diagnostic", () => { // Spawn writes an EADDRINUSE line to the stderr file descriptor so the // first poll iteration reads it back and trips the conflict branch. - const realFs = require("node:fs"); const fetchList = vi.fn().mockReturnValue(""); const spawn = vi.fn().mockImplementation(({ stderr }: { stderr: number }) => { - realFs.writeSync(stderr, "listen tcp 0.0.0.0:18789: bind: address already in use\n"); + fs.writeSync(stderr, "listen tcp 0.0.0.0:18789: bind: address already in use\n"); return { pid: 8888 }; }); const sleep = vi.fn(); From 6ce79ad41053aab1fba948c338d633628a046458 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Sat, 23 May 2026 08:31:30 -0700 Subject: [PATCH 10/10] fix(onboard): scope dashboard forward cleanup stops Signed-off-by: Aaron Erickson --- src/lib/onboard/dashboard.ts | 25 ++++++++---------- test/onboard-dashboard.test.ts | 48 +++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/lib/onboard/dashboard.ts b/src/lib/onboard/dashboard.ts index abc0ab35e3..2840bf7b40 100644 --- a/src/lib/onboard/dashboard.ts +++ b/src/lib/onboard/dashboard.ts @@ -219,13 +219,20 @@ export function createOnboardDashboardHelpers(deps: OnboardDashboardDeps): Onboa ): number { const { rollbackSandboxOnFailure = false } = options; const preferredPort = Number(getDashboardForwardPort(chatUiUrl)); + const stopForwardForSandbox = (port: string | number) => + bestEffortForwardStopForSandbox( + deps.runOpenshell, + (args, opts) => (deps.runCaptureOpenshell(args, opts) ?? "") as string, + port, + sandboxName, + ); let existingForwards = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true }); const preferredEntry = findForwardEntry(existingForwards, String(preferredPort)); if ( preferredEntry && (preferredEntry.sandboxName === sandboxName || !isLiveForwardStatus(preferredEntry.status)) ) { - bestEffortForwardStop(deps.runOpenshell, preferredPort); + stopForwardForSandbox(preferredPort); existingForwards = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true }); } let actualPort: number; @@ -252,19 +259,14 @@ export function createOnboardDashboardHelpers(deps: OnboardDashboardDeps): Onboa const occupied = getOccupiedPorts(existingForwards); for (const [port, owner] of occupied.entries()) { if (owner === sandboxName && Number(port) !== actualPort) { - bestEffortForwardStop(deps.runOpenshell, port); + stopForwardForSandbox(port); } } const parsedUrl = new URL(chatUiUrl.includes("://") ? chatUiUrl : `http://${chatUiUrl}`); parsedUrl.port = String(actualPort); const actualTarget = getDashboardForwardTarget(parsedUrl.toString()); - bestEffortForwardStopForSandbox( - deps.runOpenshell, - (args, opts) => (deps.runCaptureOpenshell(args, opts) ?? "") as string, - actualPort, - sandboxName, - ); + stopForwardForSandbox(actualPort); const { ok: fwdOk, diagnostic: fwdDiagnostic } = runDetachedForwardStartWithPortReleaseRetries( buildDetachedForwardStartSpawn( deps.openshellArgv(["forward", "start", "--background", actualTarget, sandboxName]), @@ -274,12 +276,7 @@ export function createOnboardDashboardHelpers(deps: OnboardDashboardDeps): Onboa { port: actualPort, sandboxName }, () => { deps.sleep(1); - bestEffortForwardStopForSandbox( - deps.runOpenshell, - (args, opts) => (deps.runCaptureOpenshell(args, opts) ?? "") as string, - actualPort, - sandboxName, - ); + stopForwardForSandbox(actualPort); }, { onProgress: buildForwardStartProgressLogger(actualPort) }, ); diff --git a/test/onboard-dashboard.test.ts b/test/onboard-dashboard.test.ts index ecb9aaaedb..c8408934f0 100644 --- a/test/onboard-dashboard.test.ts +++ b/test/onboard-dashboard.test.ts @@ -1,11 +1,16 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; const { getPortConflictServiceHints } = require("../dist/lib/onboard") as { getPortConflictServiceHints: (platform?: string) => string[]; }; +const { createOnboardDashboardHelpers } = require("../dist/lib/onboard/dashboard") as { + createOnboardDashboardHelpers: (deps: Record) => { + ensureDashboardForward: (sandboxName: string, chatUiUrl?: string) => number; + }; +}; describe("onboard dashboard helpers", () => { it("prints platform-appropriate service hints for port conflicts", () => { @@ -15,4 +20,45 @@ describe("onboard dashboard helpers", () => { /systemctl --user stop openclaw-gateway.service/, ); }); + + it("uses sandbox-scoped forward stops for same-sandbox dashboard cleanup", () => { + const forwardList = + "SANDBOX BIND PORT PID STATUS\n" + + "my-sandbox 127.0.0.1 18789 12345 running\n" + + "my-sandbox 127.0.0.1 19000 12346 running"; + const runOpenshell = vi.fn((_args: string[], _opts?: Record) => ({ + status: 0, + })); + const runCaptureOpenshell = vi.fn((args: string[], _opts?: Record) => + args.join(" ") === "forward list" ? forwardList : "", + ); + const helpers = createOnboardDashboardHelpers({ + runOpenshell, + runCaptureOpenshell, + openshellArgv: (args: string[]) => [process.execPath, "-e", "", ...args], + cliName: () => "nemoclaw", + agentProductName: () => "NemoClaw", + getProviderLabel: (provider: string) => provider, + note: vi.fn(), + isWsl: () => false, + redact: (value: unknown) => String(value), + sleep: vi.fn(), + printAgentDashboardUi: vi.fn(), + }); + + expect(helpers.ensureDashboardForward("my-sandbox", "http://127.0.0.1:18789")).toBe(18789); + + const stopArgs = runOpenshell.mock.calls.map(([args]) => args); + expect(stopArgs).toContainEqual(["forward", "stop", "18789", "my-sandbox"]); + expect(stopArgs).toContainEqual(["forward", "stop", "19000", "my-sandbox"]); + expect( + stopArgs.some( + (args) => + Array.isArray(args) && + args[0] === "forward" && + args[1] === "stop" && + args.length === 3, + ), + ).toBe(false); + }); });