From 9ee6cfab9b3fd24ff2ad22c06af41871832cebfb Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 19:09:44 -0700 Subject: [PATCH 01/54] refactor(cli): add onboard FSM transition types --- src/lib/onboard/machine/transitions.test.ts | 164 ++++++++++++++++++++ src/lib/onboard/machine/transitions.ts | 107 +++++++++++++ src/lib/onboard/machine/types.ts | 101 ++++++++++++ 3 files changed, 372 insertions(+) create mode 100644 src/lib/onboard/machine/transitions.test.ts create mode 100644 src/lib/onboard/machine/transitions.ts create mode 100644 src/lib/onboard/machine/types.ts diff --git a/src/lib/onboard/machine/transitions.test.ts b/src/lib/onboard/machine/transitions.test.ts new file mode 100644 index 0000000000..875a0ec45a --- /dev/null +++ b/src/lib/onboard/machine/transitions.test.ts @@ -0,0 +1,164 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; + +import { + ONBOARD_MACHINE_EVENT_TYPES, + ONBOARD_MACHINE_STATES, + ONBOARD_NON_TERMINAL_MACHINE_STATES, +} from "./types"; +import { + assertValidOnboardMachineTransition, + canTransitionOnboardMachineState, + getNextOnboardMachineStates, + getOnboardMachineTransition, + InvalidOnboardMachineTransitionError, + isOnboardMachineState, + isTerminalOnboardMachineState, + ONBOARD_MACHINE_DIRECT_TRANSITIONS, + ONBOARD_MACHINE_NEXT_STATES, + ONBOARD_MACHINE_TRANSITIONS, +} from "./transitions"; + +const canonicalDirectTransitions = [ + ["init", "preflight", "advance"], + ["preflight", "gateway", "advance"], + ["gateway", "provider_selection", "advance"], + ["provider_selection", "inference", "advance"], + ["inference", "provider_selection", "retry"], + ["inference", "sandbox", "advance"], + ["sandbox", "openclaw", "branch"], + ["sandbox", "agent_setup", "branch"], + ["openclaw", "policies", "advance"], + ["agent_setup", "policies", "advance"], + ["policies", "finalizing", "advance"], + ["finalizing", "post_verify", "advance"], + ["post_verify", "complete", "advance"], +] as const; + +describe("onboard machine vocabulary", () => { + it("defines the initial coarse state vocabulary from issue #3802", () => { + expect(ONBOARD_MACHINE_STATES).toEqual([ + "init", + "preflight", + "gateway", + "provider_selection", + "inference", + "sandbox", + "agent_setup", + "openclaw", + "policies", + "finalizing", + "post_verify", + "complete", + "failed", + ]); + }); + + it("defines the initial observe-only event vocabulary from issue #3802", () => { + expect(ONBOARD_MACHINE_EVENT_TYPES).toEqual([ + "onboard.started", + "onboard.resumed", + "onboard.completed", + "onboard.failed", + "state.entered", + "state.exited", + "state.skipped", + "state.completed", + "state.failed", + "state.repair.started", + "state.repair.completed", + "state.repair.failed", + "context.updated", + "resume.conflict", + "hook.started", + "hook.completed", + "hook.failed", + ]); + }); + + it("recognizes valid machine state names", () => { + expect(isOnboardMachineState("preflight")).toBe(true); + expect(isOnboardMachineState("messaging")).toBe(false); + expect(isOnboardMachineState(null)).toBe(false); + }); +}); + +describe("onboard machine transitions", () => { + it("encodes the canonical direct transition graph", () => { + expect(ONBOARD_MACHINE_DIRECT_TRANSITIONS).toEqual( + canonicalDirectTransitions.map(([from, to, kind]) => ({ from, to, kind })), + ); + }); + + it("allows every non-terminal state to fail", () => { + for (const state of ONBOARD_NON_TERMINAL_MACHINE_STATES) { + expect(canTransitionOnboardMachineState(state, "failed")).toBe(true); + expect(getOnboardMachineTransition(state, "failed")?.kind).toBe("failure"); + } + }); + + it("keeps terminal states terminal", () => { + expect(isTerminalOnboardMachineState("complete")).toBe(true); + expect(isTerminalOnboardMachineState("failed")).toBe(true); + expect(getNextOnboardMachineStates("complete")).toEqual([]); + expect(getNextOnboardMachineStates("failed")).toEqual([]); + expect(canTransitionOnboardMachineState("complete", "failed")).toBe(false); + expect(canTransitionOnboardMachineState("failed", "init")).toBe(false); + }); + + it("exposes next states in deterministic order", () => { + expect(ONBOARD_MACHINE_NEXT_STATES).toEqual({ + init: ["preflight", "failed"], + preflight: ["gateway", "failed"], + gateway: ["provider_selection", "failed"], + provider_selection: ["inference", "failed"], + inference: ["provider_selection", "sandbox", "failed"], + sandbox: ["openclaw", "agent_setup", "failed"], + agent_setup: ["policies", "failed"], + openclaw: ["policies", "failed"], + policies: ["finalizing", "failed"], + finalizing: ["post_verify", "failed"], + post_verify: ["complete", "failed"], + complete: [], + failed: [], + }); + }); + + it("classifies retry and branch transitions", () => { + expect(assertValidOnboardMachineTransition("inference", "provider_selection")).toMatchObject({ + kind: "retry", + }); + expect(assertValidOnboardMachineTransition("sandbox", "openclaw")).toMatchObject({ + kind: "branch", + }); + expect(assertValidOnboardMachineTransition("sandbox", "agent_setup")).toMatchObject({ + kind: "branch", + }); + }); + + it("rejects transitions outside the graph", () => { + expect(() => assertValidOnboardMachineTransition("init", "sandbox")).toThrow( + InvalidOnboardMachineTransitionError, + ); + expect(() => assertValidOnboardMachineTransition("complete", "failed")).toThrow( + "complete -> failed", + ); + }); + + it("keeps the next-state map aligned with the transition list", () => { + for (const state of ONBOARD_MACHINE_STATES) { + expect( + ONBOARD_MACHINE_TRANSITIONS.filter((transition) => transition.from === state).map( + (transition) => transition.to, + ), + ).toEqual(getNextOnboardMachineStates(state)); + } + }); + + it("does not contain duplicate transition edges", () => { + const edges = ONBOARD_MACHINE_TRANSITIONS.map(({ from, to }) => `${from}->${to}`); + expect(new Set(edges).size).toBe(edges.length); + }); +}); diff --git a/src/lib/onboard/machine/transitions.ts b/src/lib/onboard/machine/transitions.ts new file mode 100644 index 0000000000..9f23e3895a --- /dev/null +++ b/src/lib/onboard/machine/transitions.ts @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { OnboardMachineState, OnboardMachineTransition } from "./types"; +import { + ONBOARD_MACHINE_STATES, + ONBOARD_NON_TERMINAL_MACHINE_STATES, + ONBOARD_TERMINAL_MACHINE_STATES, +} from "./types"; + +export const ONBOARD_MACHINE_NEXT_STATES = { + init: ["preflight", "failed"], + preflight: ["gateway", "failed"], + gateway: ["provider_selection", "failed"], + provider_selection: ["inference", "failed"], + inference: ["provider_selection", "sandbox", "failed"], + sandbox: ["openclaw", "agent_setup", "failed"], + agent_setup: ["policies", "failed"], + openclaw: ["policies", "failed"], + policies: ["finalizing", "failed"], + finalizing: ["post_verify", "failed"], + post_verify: ["complete", "failed"], + complete: [], + failed: [], +} as const satisfies Readonly>; + +export const ONBOARD_MACHINE_DIRECT_TRANSITIONS = [ + { from: "init", to: "preflight", kind: "advance" }, + { from: "preflight", to: "gateway", kind: "advance" }, + { from: "gateway", to: "provider_selection", kind: "advance" }, + { from: "provider_selection", to: "inference", kind: "advance" }, + { from: "inference", to: "provider_selection", kind: "retry" }, + { from: "inference", to: "sandbox", kind: "advance" }, + { from: "sandbox", to: "openclaw", kind: "branch" }, + { from: "sandbox", to: "agent_setup", kind: "branch" }, + { from: "openclaw", to: "policies", kind: "advance" }, + { from: "agent_setup", to: "policies", kind: "advance" }, + { from: "policies", to: "finalizing", kind: "advance" }, + { from: "finalizing", to: "post_verify", kind: "advance" }, + { from: "post_verify", to: "complete", kind: "advance" }, +] as const satisfies readonly OnboardMachineTransition[]; + +export const ONBOARD_MACHINE_FAILURE_TRANSITIONS = ONBOARD_NON_TERMINAL_MACHINE_STATES.map( + (from) => ({ from, to: "failed" as const, kind: "failure" as const }), +) satisfies readonly OnboardMachineTransition[]; + +export const ONBOARD_MACHINE_TRANSITIONS = [ + ...ONBOARD_MACHINE_DIRECT_TRANSITIONS, + ...ONBOARD_MACHINE_FAILURE_TRANSITIONS, +] as const satisfies readonly OnboardMachineTransition[]; + +export class InvalidOnboardMachineTransitionError extends Error { + readonly from: OnboardMachineState; + readonly to: OnboardMachineState; + + constructor(from: OnboardMachineState, to: OnboardMachineState) { + super(`Invalid onboarding machine transition: ${from} -> ${to}`); + this.name = "InvalidOnboardMachineTransitionError"; + this.from = from; + this.to = to; + } +} + +export function isOnboardMachineState(value: unknown): value is OnboardMachineState { + return typeof value === "string" && ONBOARD_MACHINE_STATES.includes(value as OnboardMachineState); +} + +export function isTerminalOnboardMachineState( + state: OnboardMachineState, +): state is "complete" | "failed" { + return ONBOARD_TERMINAL_MACHINE_STATES.includes(state as "complete" | "failed"); +} + +export function getNextOnboardMachineStates( + from: OnboardMachineState, +): readonly OnboardMachineState[] { + return ONBOARD_MACHINE_NEXT_STATES[from]; +} + +export function canTransitionOnboardMachineState( + from: OnboardMachineState, + to: OnboardMachineState, +): boolean { + return getNextOnboardMachineStates(from).includes(to); +} + +export function getOnboardMachineTransition( + from: OnboardMachineState, + to: OnboardMachineState, +): OnboardMachineTransition | null { + return ( + ONBOARD_MACHINE_TRANSITIONS.find( + (transition) => transition.from === from && transition.to === to, + ) ?? null + ); +} + +export function assertValidOnboardMachineTransition( + from: OnboardMachineState, + to: OnboardMachineState, +): OnboardMachineTransition { + const transition = getOnboardMachineTransition(from, to); + if (!transition) { + throw new InvalidOnboardMachineTransitionError(from, to); + } + return transition; +} diff --git a/src/lib/onboard/machine/types.ts b/src/lib/onboard/machine/types.ts new file mode 100644 index 0000000000..bbba7bd5f6 --- /dev/null +++ b/src/lib/onboard/machine/types.ts @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Coarse onboarding finite-state-machine vocabulary. + * + * These types intentionally model only major step boundaries. Mid-operation + * resume inside gateway startup, sandbox creation, credential upserts, model + * probes, or policy application is out of scope for the initial FSM shell. + */ + +export const ONBOARD_MACHINE_STATES = [ + "init", + "preflight", + "gateway", + "provider_selection", + "inference", + "sandbox", + "agent_setup", + "openclaw", + "policies", + "finalizing", + "post_verify", + "complete", + "failed", +] as const; + +export type OnboardMachineState = (typeof ONBOARD_MACHINE_STATES)[number]; + +export const ONBOARD_TERMINAL_MACHINE_STATES = ["complete", "failed"] as const; + +export type OnboardTerminalMachineState = + (typeof ONBOARD_TERMINAL_MACHINE_STATES)[number]; + +export type OnboardNonTerminalMachineState = Exclude< + OnboardMachineState, + OnboardTerminalMachineState +>; + +export const ONBOARD_NON_TERMINAL_MACHINE_STATES: readonly OnboardNonTerminalMachineState[] = + ONBOARD_MACHINE_STATES.filter( + (state): state is OnboardNonTerminalMachineState => + !ONBOARD_TERMINAL_MACHINE_STATES.includes(state as OnboardTerminalMachineState), + ); + +export const ONBOARD_MACHINE_EVENT_TYPES = [ + "onboard.started", + "onboard.resumed", + "onboard.completed", + "onboard.failed", + "state.entered", + "state.exited", + "state.skipped", + "state.completed", + "state.failed", + "state.repair.started", + "state.repair.completed", + "state.repair.failed", + "context.updated", + "resume.conflict", + "hook.started", + "hook.completed", + "hook.failed", +] as const; + +export type OnboardMachineEventType = (typeof ONBOARD_MACHINE_EVENT_TYPES)[number]; + +export type OnboardMachineTransitionKind = + | "advance" + | "retry" + | "branch" + | "failure"; + +export interface OnboardMachineTransition { + from: OnboardMachineState; + to: OnboardMachineState; + kind: OnboardMachineTransitionKind; +} + +/** + * Stable, redacted context keys that machine events may expose. + * + * Do not add raw secrets or unredacted URLs here. Runtime-derived topology + * decisions such as Docker/WSL reachability, Ollama proxy necessity, or live + * gateway health should be recomputed during execution rather than stored as + * durable FSM context. + */ +export interface OnboardMachineContext { + agent?: string | null; + sandboxName?: string | null; + provider?: string | null; + model?: string | null; + endpointUrl?: string | null; + credentialEnv?: string | null; + preferredInferenceApi?: string | null; + hermesAuthMethod?: "oauth" | "api_key" | null; + hermesToolGateways?: string[] | null; + policyPresets?: string[] | null; + messagingChannels?: string[] | null; + gpuPassthrough?: boolean; +} From b9e4545e44066975dab7945a93b580b366ec82c2 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 19:27:06 -0700 Subject: [PATCH 02/54] refactor(cli): emit onboard session machine events --- src/lib/onboard/machine/events.ts | 166 ++++++++++++++++++++++++++ src/lib/state/onboard-session.test.ts | 90 ++++++++++++++ src/lib/state/onboard-session.ts | 94 +++++++++++++-- 3 files changed, 343 insertions(+), 7 deletions(-) create mode 100644 src/lib/onboard/machine/events.ts diff --git a/src/lib/onboard/machine/events.ts b/src/lib/onboard/machine/events.ts new file mode 100644 index 0000000000..9a68d3f899 --- /dev/null +++ b/src/lib/onboard/machine/events.ts @@ -0,0 +1,166 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { JsonObject, JsonValue } from "../../core/json-types"; +import { redactSensitiveText, redactUrl } from "../../security/redact"; +import type { HermesAuthMethod, Session } from "../../state/onboard-session"; +import type { + OnboardMachineContext, + OnboardMachineEventType, + OnboardMachineState, +} from "./types"; + +export const ONBOARD_SESSION_STEP_TO_MACHINE_STATE = { + preflight: "preflight", + gateway: "gateway", + provider_selection: "provider_selection", + inference: "inference", + sandbox: "sandbox", + agent_setup: "agent_setup", + openclaw: "openclaw", + policies: "policies", +} as const satisfies Readonly>; + +export type OnboardSessionStepName = keyof typeof ONBOARD_SESSION_STEP_TO_MACHINE_STATE; + +export interface OnboardMachineEvent { + version: 1; + type: OnboardMachineEventType; + occurredAt: string; + sessionId: string | null; + state: OnboardMachineState | null; + step: OnboardSessionStepName | null; + context: OnboardMachineContext; + error: string | null; + metadata: JsonObject; +} + +export type OnboardMachineEventListener = (event: OnboardMachineEvent) => void; + +const listeners = new Set(); + +export function addOnboardMachineEventListener( + listener: OnboardMachineEventListener, +): () => void { + listeners.add(listener); + return () => { + listeners.delete(listener); + }; +} + +export function clearOnboardMachineEventListeners(): void { + listeners.clear(); +} + +export function isOnboardSessionStepName(value: string): value is OnboardSessionStepName { + return Object.prototype.hasOwnProperty.call(ONBOARD_SESSION_STEP_TO_MACHINE_STATE, value); +} + +export function machineStateFromOnboardSessionStep( + stepName: string | null | undefined, +): OnboardMachineState | null { + if (!stepName || !isOnboardSessionStepName(stepName)) return null; + return ONBOARD_SESSION_STEP_TO_MACHINE_STATE[stepName]; +} + +function nullableString(value: unknown): string | null { + return typeof value === "string" ? value : null; +} + +function stringArray(value: unknown): string[] | null { + if (!Array.isArray(value)) return null; + return value.filter((entry): entry is string => typeof entry === "string"); +} + +function hermesAuthMethod(value: unknown): HermesAuthMethod | null { + return value === "oauth" || value === "api_key" ? value : null; +} + +function booleanValue(value: unknown): boolean | undefined { + return typeof value === "boolean" ? value : undefined; +} + +function sanitizeJsonValue(value: unknown): JsonValue { + if (typeof value === "string") return redactUrl(value) ?? redactSensitiveText(value) ?? ""; + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "boolean" || value === null) return value; + if (Array.isArray(value)) return value.map((entry) => sanitizeJsonValue(entry)); + if (typeof value !== "object" || value === null) return String(value); + + const result: JsonObject = {}; + for (const [key, entry] of Object.entries(value)) { + result[key] = sanitizeJsonValue(entry); + } + return result; +} + +export function sanitizeOnboardMachineEventMetadata( + metadata: Record | null | undefined, +): JsonObject { + if (!metadata || typeof metadata !== "object" || Array.isArray(metadata)) return {}; + const sanitized: JsonObject = {}; + for (const [key, value] of Object.entries(metadata)) { + sanitized[key] = sanitizeJsonValue(value); + } + return sanitized; +} + +export function buildOnboardMachineContext(session: Session): OnboardMachineContext { + const endpointUrl = redactUrl(session.endpointUrl); + return { + agent: nullableString(session.agent), + sandboxName: nullableString(session.sandboxName), + provider: nullableString(session.provider), + model: nullableString(session.model), + endpointUrl, + credentialEnv: nullableString(session.credentialEnv), + preferredInferenceApi: nullableString(session.preferredInferenceApi), + hermesAuthMethod: hermesAuthMethod(session.hermesAuthMethod), + hermesToolGateways: stringArray(session.hermesToolGateways), + policyPresets: stringArray(session.policyPresets), + messagingChannels: stringArray(session.messagingChannels), + gpuPassthrough: booleanValue(session.gpuPassthrough), + }; +} + +export function createOnboardMachineEvent({ + type, + session, + step, + state, + error = null, + metadata = {}, +}: { + type: OnboardMachineEventType; + session: Session; + step?: string | null; + state?: OnboardMachineState | null; + error?: string | null; + metadata?: Record | null; +}): OnboardMachineEvent { + const normalizedStep = step && isOnboardSessionStepName(step) ? step : null; + return { + version: 1, + type, + occurredAt: new Date().toISOString(), + sessionId: nullableString(session.sessionId), + state: state ?? machineStateFromOnboardSessionStep(normalizedStep), + step: normalizedStep, + context: buildOnboardMachineContext(session), + error: redactSensitiveText(error), + metadata: sanitizeOnboardMachineEventMetadata(metadata), + }; +} + +export function emitOnboardMachineEvent(event: OnboardMachineEvent): void { + if (listeners.size === 0) return; + for (const listener of listeners) { + try { + listener(event); + } catch { + // Event observers are diagnostics only. A broken observer must not + // change onboarding behavior; hook failure events are introduced by the + // later observe-only hook API. + } + } +} diff --git a/src/lib/state/onboard-session.test.ts b/src/lib/state/onboard-session.test.ts index b2c925858f..5ddd94908d 100644 --- a/src/lib/state/onboard-session.test.ts +++ b/src/lib/state/onboard-session.test.ts @@ -9,11 +9,15 @@ import { createRequire } from "node:module"; const require = createRequire(import.meta.url); const distPath = require.resolve("../../../dist/lib/state/onboard-session"); +const eventsDistPath = require.resolve("../../../dist/lib/onboard/machine/events"); const originalHome = process.env.HOME; type OnboardSessionModule = typeof import("../../../dist/lib/state/onboard-session"); +type OnboardMachineEventsModule = typeof import("../../../dist/lib/onboard/machine/events"); +type OnboardMachineEvent = import("../../../dist/lib/onboard/machine/events").OnboardMachineEvent; type LoadedSession = NonNullable>; type DebugSummary = NonNullable>; let session: OnboardSessionModule; +let machineEvents: OnboardMachineEventsModule; let tmpDir: string; function requireLoadedSession( @@ -44,13 +48,18 @@ beforeEach(() => { tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-session-")); process.env.HOME = tmpDir; delete require.cache[distPath]; + delete require.cache[eventsDistPath]; session = require("../../../dist/lib/state/onboard-session"); + machineEvents = require("../../../dist/lib/onboard/machine/events"); + machineEvents.clearOnboardMachineEventListeners(); session.clearSession(); session.releaseOnboardLock(); }); afterEach(() => { + machineEvents.clearOnboardMachineEventListeners(); delete require.cache[distPath]; + delete require.cache[eventsDistPath]; fs.rmSync(tmpDir, { recursive: true, force: true }); if (originalHome === undefined) { delete process.env.HOME; @@ -117,6 +126,87 @@ describe("onboard session", () => { expect(loaded.failure.message).toMatch(/Sandbox creation failed/); }); + it("emits redacted structured machine events for session step mutations", () => { + const emitted: OnboardMachineEvent[] = []; + machineEvents.addOnboardMachineEventListener((event) => emitted.push(event)); + + session.saveSession(session.createSession({ sessionId: "session-1" })); + session.markStepStarted("gateway"); + session.markStepComplete("gateway", { + sandboxName: "my-assistant", + endpointUrl: + "https://alice:super-secret-token@example.com/v1?token=super-secret-token&keep=yes#token=super-secret-token", + credentialEnv: "NVIDIA_API_KEY", + }); + session.markStepSkipped("openclaw"); + session.markStepFailed("sandbox", "NVIDIA_API_KEY=super-secret-token"); + session.completeSession({ provider: "ollama-local", credentialEnv: null }); + + expect(emitted.map((event) => event.type)).toEqual([ + "state.entered", + "context.updated", + "state.completed", + "state.skipped", + "state.failed", + "onboard.failed", + "context.updated", + "onboard.completed", + ]); + expect(emitted[0]).toMatchObject({ + version: 1, + sessionId: "session-1", + state: "gateway", + step: "gateway", + error: null, + }); + expect(emitted[1].context).toMatchObject({ + sandboxName: "my-assistant", + credentialEnv: "NVIDIA_API_KEY", + }); + expect(emitted[1].context.endpointUrl).toBe( + "https://example.com/v1?token=%3CREDACTED%3E&keep=yes", + ); + expect(emitted[1].metadata.fields).toEqual([ + "sandboxName", + "endpointUrl", + "credentialEnv", + ]); + expect(emitted[4]).toMatchObject({ + type: "state.failed", + state: "sandbox", + step: "sandbox", + error: "NVIDIA_API_KEY=", + }); + expect(emitted[5]).toMatchObject({ type: "onboard.failed", state: "failed" }); + expect(emitted.at(-1)).toMatchObject({ type: "onboard.completed", state: "complete" }); + expect(JSON.stringify(emitted)).not.toContain("super-secret-token"); + + const persisted = JSON.parse(fs.readFileSync(session.SESSION_FILE, "utf8")); + expect(persisted.events).toBeUndefined(); + }); + + it("keeps event observer failures from changing session mutation behavior", () => { + machineEvents.addOnboardMachineEventListener(() => { + throw new Error("observer failed"); + }); + + session.saveSession(session.createSession()); + expect(() => session.markStepStarted("preflight")).not.toThrow(); + + const loaded = requireLoadedSession(session.loadSession()); + expect(loaded.steps.preflight.status).toBe("in_progress"); + }); + + it("does not emit machine events for unknown session step names", () => { + const emitted: OnboardMachineEvent[] = []; + machineEvents.addOnboardMachineEventListener((event) => emitted.push(event)); + + session.saveSession(session.createSession()); + session.markStepStarted("not_a_real_step"); + + expect(emitted).toEqual([]); + }); + it("persists safe provider metadata without persisting secrets", () => { session.saveSession(session.createSession()); const unsafeProviderUpdate: Parameters[1] & { diff --git a/src/lib/state/onboard-session.ts b/src/lib/state/onboard-session.ts index f05c1116e8..7fe94d8096 100644 --- a/src/lib/state/onboard-session.ts +++ b/src/lib/state/onboard-session.ts @@ -18,6 +18,10 @@ import { sanitizeMessagingChannelConfig, type MessagingChannelConfig, } from "../messaging-channel-config"; +import { + createOnboardMachineEvent, + emitOnboardMachineEvent, +} from "../onboard/machine/events"; import { redactSensitiveText, redactUrl } from "../security/redact"; export const SESSION_VERSION = 1; @@ -883,7 +887,8 @@ export function updateSession(mutator: (session: Session) => Session | void): Se } export function markStepStarted(stepName: string): Session { - return updateSession((session) => { + let shouldEmit = false; + const updatedSession = updateSession((session) => { const step = session.steps[stepName]; if (!step) return session; step.status = "in_progress"; @@ -893,12 +898,21 @@ export function markStepStarted(stepName: string): Session { session.lastStepStarted = stepName; session.failure = null; session.status = "in_progress"; + shouldEmit = true; return session; }); + if (shouldEmit) { + emitOnboardMachineEvent( + createOnboardMachineEvent({ type: "state.entered", session: updatedSession, step: stepName }), + ); + } + return updatedSession; } export function markStepComplete(stepName: string, updates: SessionUpdates = {}): Session { - return updateSession((session) => { + const safeUpdates = filterSafeUpdates(updates); + let shouldEmit = false; + const updatedSession = updateSession((session) => { const step = session.steps[stepName]; if (!step) return session; step.status = "complete"; @@ -906,13 +920,31 @@ export function markStepComplete(stepName: string, updates: SessionUpdates = {}) step.error = null; session.lastCompletedStep = stepName; session.failure = null; - Object.assign(session, filterSafeUpdates(updates)); + Object.assign(session, safeUpdates); + shouldEmit = true; return session; }); + if (shouldEmit) { + if (Object.keys(safeUpdates).length > 0) { + emitOnboardMachineEvent( + createOnboardMachineEvent({ + type: "context.updated", + session: updatedSession, + step: stepName, + metadata: { fields: Object.keys(safeUpdates) }, + }), + ); + } + emitOnboardMachineEvent( + createOnboardMachineEvent({ type: "state.completed", session: updatedSession, step: stepName }), + ); + } + return updatedSession; } export function markStepSkipped(stepName: string): Session { - return updateSession((session) => { + let shouldEmit = false; + const updatedSession = updateSession((session) => { const step = session.steps[stepName]; if (!step) return session; if (step.status === "complete" || step.status === "failed") return session; @@ -920,12 +952,20 @@ export function markStepSkipped(stepName: string): Session { step.startedAt = null; step.completedAt = null; step.error = null; + shouldEmit = true; return session; }); + if (shouldEmit) { + emitOnboardMachineEvent( + createOnboardMachineEvent({ type: "state.skipped", session: updatedSession, step: stepName }), + ); + } + return updatedSession; } export function markStepFailed(stepName: string, message: string | null = null): Session { - return updateSession((session) => { + let shouldEmit = false; + const updatedSession = updateSession((session) => { const step = session.steps[stepName]; if (!step) return session; step.status = "failed"; @@ -937,18 +977,58 @@ export function markStepFailed(stepName: string, message: string | null = null): recordedAt: new Date().toISOString(), }); session.status = "failed"; + shouldEmit = true; return session; }); + if (shouldEmit) { + emitOnboardMachineEvent( + createOnboardMachineEvent({ + type: "state.failed", + session: updatedSession, + step: stepName, + error: message, + }), + ); + emitOnboardMachineEvent( + createOnboardMachineEvent({ + type: "onboard.failed", + session: updatedSession, + state: "failed", + step: stepName, + error: message, + }), + ); + } + return updatedSession; } export function completeSession(updates: SessionUpdates = {}): Session { - return updateSession((session) => { - Object.assign(session, filterSafeUpdates(updates)); + const safeUpdates = filterSafeUpdates(updates); + const updatedSession = updateSession((session) => { + Object.assign(session, safeUpdates); session.status = "complete"; session.resumable = false; session.failure = null; return session; }); + if (Object.keys(safeUpdates).length > 0) { + emitOnboardMachineEvent( + createOnboardMachineEvent({ + type: "context.updated", + session: updatedSession, + state: "complete", + metadata: { fields: Object.keys(safeUpdates) }, + }), + ); + } + emitOnboardMachineEvent( + createOnboardMachineEvent({ + type: "onboard.completed", + session: updatedSession, + state: "complete", + }), + ); + return updatedSession; } export function summarizeForDebug( From 651e2a07c3f34bd38cf942d08ad350e5d6b5eb86 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 21:47:57 -0700 Subject: [PATCH 03/54] refactor(cli): persist onboard machine snapshots --- src/lib/actions/inference-set.test.ts | 8 +- src/lib/state/onboard-session.test.ts | 115 ++++++++++++++++++++ src/lib/state/onboard-session.ts | 145 ++++++++++++++++++++++++-- 3 files changed, 259 insertions(+), 9 deletions(-) diff --git a/src/lib/actions/inference-set.test.ts b/src/lib/actions/inference-set.test.ts index ae091f7adf..f6c178f0cf 100644 --- a/src/lib/actions/inference-set.test.ts +++ b/src/lib/actions/inference-set.test.ts @@ -86,9 +86,15 @@ function baseSession(overrides: Partial = {}): Session { telegramConfig: null, wechatConfig: null, metadata: { gatewayName: "nemoclaw", fromDockerfile: null }, + machine: { + version: 1, + state: "complete", + stateEnteredAt: "2026-05-11T00:00:00.000Z", + revision: 0, + }, steps: {}, ...overrides, - }; + } as Session; } function createDeps(options: { diff --git a/src/lib/state/onboard-session.test.ts b/src/lib/state/onboard-session.test.ts index 5ddd94908d..8e4b9f5cbc 100644 --- a/src/lib/state/onboard-session.test.ts +++ b/src/lib/state/onboard-session.test.ts @@ -40,6 +40,14 @@ function requireDebugSummary( return summary; } +function normalizeLegacySession( + legacy: unknown, +): ReturnType { + return session.normalizeSession( + legacy as Parameters[0], + ); +} + beforeEach(() => { // Recreate tmpDir per test so lock artifacts (and any other on-disk state) // from a previous test cannot leak into this one. Without this, malformed @@ -80,6 +88,12 @@ describe("onboard session", () => { const dirStat = fs.statSync(path.dirname(session.SESSION_FILE)); expect(saved.mode).toBe("non-interactive"); + expect(saved.machine).toMatchObject({ + version: 1, + state: "init", + revision: 0, + }); + expect(saved.machine.stateEnteredAt).toBeTruthy(); expect(fs.existsSync(session.SESSION_FILE)).toBe(true); expect(stat.mode & 0o777).toBe(0o600); expect(dirStat.mode & 0o777).toBe(0o700); @@ -124,6 +138,107 @@ describe("onboard session", () => { } expect(loaded.failure.step).toBe("sandbox"); expect(loaded.failure.message).toMatch(/Sandbox creation failed/); + expect(loaded.machine.state).toBe("failed"); + }); + + it("persists a compact machine snapshot across step boundaries", () => { + session.saveSession(session.createSession()); + let loaded = requireLoadedSession(session.loadSession()); + expect(loaded.machine).toMatchObject({ state: "init", revision: 0 }); + + session.markStepStarted("preflight"); + loaded = requireLoadedSession(session.loadSession()); + expect(loaded.machine).toMatchObject({ state: "preflight", revision: 1 }); + expect(loaded.machine.stateEnteredAt).toBe(loaded.steps.preflight.startedAt); + + session.markStepComplete("preflight"); + loaded = requireLoadedSession(session.loadSession()); + expect(loaded.machine).toMatchObject({ state: "gateway", revision: 2 }); + expect(loaded.machine.stateEnteredAt).toBe(loaded.steps.preflight.completedAt); + + session.markStepComplete("gateway"); + loaded = requireLoadedSession(session.loadSession()); + expect(loaded.machine).toMatchObject({ state: "provider_selection", revision: 3 }); + + session.completeSession(); + loaded = requireLoadedSession(session.loadSession()); + expect(loaded.machine).toMatchObject({ state: "complete", revision: 4 }); + expect(requireDebugSummary(session.summarizeForDebug()).machine).toEqual(loaded.machine); + }); + + it("normalizes old sessions without machine snapshots", () => { + type LegacySession = Omit, "machine"> & { + machine?: unknown; + }; + const legacy = session.createSession({ + sessionId: "legacy-session", + startedAt: "2026-01-01T00:00:00.000Z", + updatedAt: "2026-01-01T00:05:00.000Z", + }) as unknown as LegacySession; + delete legacy.machine; + legacy.steps.gateway.status = "in_progress"; + legacy.steps.gateway.startedAt = "2026-01-01T00:02:00.000Z"; + legacy.lastStepStarted = "gateway"; + + let normalized = requireLoadedSession(normalizeLegacySession(legacy)); + expect(normalized.machine).toEqual({ + version: 1, + state: "gateway", + stateEnteredAt: "2026-01-01T00:02:00.000Z", + revision: 0, + }); + + legacy.steps.gateway.status = "complete"; + legacy.steps.gateway.completedAt = "2026-01-01T00:03:00.000Z"; + legacy.lastCompletedStep = "gateway"; + normalized = requireLoadedSession(normalizeLegacySession(legacy)); + expect(normalized.machine).toEqual({ + version: 1, + state: "provider_selection", + stateEnteredAt: "2026-01-01T00:03:00.000Z", + revision: 0, + }); + + legacy.status = "failed"; + legacy.failure = { + step: "gateway", + message: "boom", + recordedAt: "2026-01-01T00:04:00.000Z", + }; + normalized = requireLoadedSession(normalizeLegacySession(legacy)); + expect(normalized.machine).toEqual({ + version: 1, + state: "failed", + stateEnteredAt: "2026-01-01T00:04:00.000Z", + revision: 0, + }); + + legacy.status = "complete"; + normalized = requireLoadedSession(normalizeLegacySession(legacy)); + expect(normalized.machine.state).toBe("complete"); + }); + + it("normalizes invalid machine snapshots from old sessions", () => { + type LegacySession = Omit, "machine"> & { + machine?: unknown; + }; + const legacy = session.createSession({ lastCompletedStep: "policies" }) as unknown as LegacySession; + legacy.steps.policies.status = "complete"; + legacy.steps.policies.completedAt = "2026-01-01T00:08:00.000Z"; + legacy.machine = { + version: 1, + state: "not-a-state", + stateEnteredAt: "2026-01-01T00:09:00.000Z", + revision: -1, + }; + + const normalized = requireLoadedSession(normalizeLegacySession(legacy)); + expect(normalized.machine).toEqual({ + version: 1, + state: "finalizing", + stateEnteredAt: "2026-01-01T00:08:00.000Z", + revision: 0, + }); }); it("emits redacted structured machine events for session step mutations", () => { diff --git a/src/lib/state/onboard-session.ts b/src/lib/state/onboard-session.ts index 7fe94d8096..f739f330d2 100644 --- a/src/lib/state/onboard-session.ts +++ b/src/lib/state/onboard-session.ts @@ -21,10 +21,14 @@ import { import { createOnboardMachineEvent, emitOnboardMachineEvent, + machineStateFromOnboardSessionStep, } from "../onboard/machine/events"; +import { isOnboardMachineState } from "../onboard/machine/transitions"; +import type { OnboardMachineState } from "../onboard/machine/types"; import { redactSensitiveText, redactUrl } from "../security/redact"; export const SESSION_VERSION = 1; +export const MACHINE_SNAPSHOT_VERSION = 1; export const SESSION_DIR = path.join(process.env.HOME || "/tmp", ".nemoclaw"); export const SESSION_FILE = path.join(SESSION_DIR, "onboard-session.json"); export const LOCK_FILE = path.join(SESSION_DIR, "onboard.lock"); @@ -64,6 +68,13 @@ export interface SessionMetadata { fromDockerfile: string | null; } +export interface OnboardMachineSnapshot { + version: typeof MACHINE_SNAPSHOT_VERSION; + state: OnboardMachineState; + stateEnteredAt: string | null; + revision: number; +} + export interface Session { version: number; sessionId: string; @@ -115,6 +126,7 @@ export interface Session { telegramConfig: TelegramConfig | null; wechatConfig: WechatConfig | null; metadata: SessionMetadata; + machine: OnboardMachineSnapshot; steps: Record; } @@ -198,6 +210,7 @@ export interface DebugSessionSummary { lastStepStarted: string | null; lastCompletedStep: string | null; failure: SessionFailure | null; + machine: OnboardMachineSnapshot; steps: Record; } @@ -240,6 +253,10 @@ function readPositiveInteger(value: SessionJsonValue | undefined): number | null return typeof value === "number" && Number.isInteger(value) && value > 0 ? value : null; } +function readNonNegativeInteger(value: SessionJsonValue | undefined): number | null { + return typeof value === "number" && Number.isInteger(value) && value >= 0 ? value : null; +} + function readStringArray(value: SessionJsonValue | undefined): string[] | null { if (!Array.isArray(value)) return null; return value.filter((entry): entry is string => typeof entry === "string"); @@ -308,6 +325,17 @@ function parseStepState(value: SessionJsonValue | undefined): StepState | null { }; } +function parseMachineSnapshot(value: SessionJsonValue | undefined): OnboardMachineSnapshot | null { + if (!isObject(value) || value.version !== MACHINE_SNAPSHOT_VERSION) return null; + if (!isOnboardMachineState(value.state)) return null; + return { + version: MACHINE_SNAPSHOT_VERSION, + state: value.state, + stateEnteredAt: readString(value.stateEnteredAt), + revision: readNonNegativeInteger(value.revision) ?? 0, + }; +} + function parseLockInfo(value: SessionJsonValue | undefined): LockInfo | null { if (!isObject(value) || typeof value.pid !== "number") return null; return { @@ -335,9 +363,97 @@ export function sanitizeFailure( // ── Session CRUD ───────────────────────────────────────────────── +function createMachineSnapshot( + state: OnboardMachineState, + stateEnteredAt: string | null, + revision = 0, +): OnboardMachineSnapshot { + return { + version: MACHINE_SNAPSHOT_VERSION, + state, + stateEnteredAt, + revision: Math.max(0, Math.trunc(revision)), + }; +} + +function nextMachineStateAfterCompletedStep( + stepName: string | null | undefined, + session: Pick, +): OnboardMachineState | null { + switch (stepName) { + case "preflight": + return "gateway"; + case "gateway": + return "provider_selection"; + case "provider_selection": + return "inference"; + case "inference": + return "sandbox"; + case "sandbox": + return session.agent ? "agent_setup" : "openclaw"; + case "openclaw": + case "agent_setup": + return "policies"; + case "policies": + return "finalizing"; + default: + return null; + } +} + +function inferMachineState(session: Session): OnboardMachineState { + if (session.status === "complete") return "complete"; + if (session.status === "failed") return "failed"; + + const startedState = machineStateFromOnboardSessionStep(session.lastStepStarted); + const startedStep = session.lastStepStarted ? session.steps[session.lastStepStarted] : null; + if (startedState && startedStep?.status === "in_progress") return startedState; + + return nextMachineStateAfterCompletedStep(session.lastCompletedStep, session) ?? "init"; +} + +function inferMachineStateEnteredAt(session: Session, state: OnboardMachineState): string | null { + if (state === "failed") return session.failure?.recordedAt ?? session.updatedAt; + if (state === "complete") return session.updatedAt; + + const startedState = machineStateFromOnboardSessionStep(session.lastStepStarted); + const startedStep = session.lastStepStarted ? session.steps[session.lastStepStarted] : null; + if (state === startedState && startedStep?.status === "in_progress") { + return startedStep.startedAt ?? session.updatedAt; + } + + if (nextMachineStateAfterCompletedStep(session.lastCompletedStep, session) === state) { + const completedStep = session.lastCompletedStep ? session.steps[session.lastCompletedStep] : null; + return completedStep?.completedAt ?? session.updatedAt; + } + + return session.startedAt; +} + +function inferMachineSnapshot(session: Session): OnboardMachineSnapshot { + const state = inferMachineState(session); + return createMachineSnapshot(state, inferMachineStateEnteredAt(session, state)); +} + +function transitionMachineSnapshot(session: Session, state: OnboardMachineState, now: string): void { + const current = session.machine ?? createMachineSnapshot("init", session.startedAt); + if (current.state === state) { + session.machine = { + ...current, + stateEnteredAt: current.stateEnteredAt ?? now, + }; + return; + } + session.machine = createMachineSnapshot(state, now, current.revision + 1); +} + export function createSession(overrides: Partial = {}): Session { const now = new Date().toISOString(); - return { + const steps = { + ...defaultSteps(), + ...(overrides.steps ?? {}), + }; + const session: Session = { version: SESSION_VERSION, sessionId: overrides.sessionId ?? `${Date.now()}-${randomUUID()}`, resumable: true, @@ -376,11 +492,11 @@ export function createSession(overrides: Partial = {}): Session { gatewayName: overrides.metadata?.gatewayName ?? "nemoclaw", fromDockerfile: overrides.metadata?.fromDockerfile ?? null, }, - steps: { - ...defaultSteps(), - ...(overrides.steps ?? {}), - }, + machine: parseMachineSnapshot(overrides.machine as SessionJsonValue | undefined) ?? + createMachineSnapshot("init", now), + steps, }; + return session; } export function normalizeSession(data: Session | SessionJsonValue | undefined): Session | null { @@ -429,6 +545,8 @@ export function normalizeSession(data: Session | SessionJsonValue | undefined): } } + normalized.machine = parseMachineSnapshot(data.machine) ?? inferMachineSnapshot(normalized); + return normalized; } @@ -891,13 +1009,16 @@ export function markStepStarted(stepName: string): Session { const updatedSession = updateSession((session) => { const step = session.steps[stepName]; if (!step) return session; + const now = new Date().toISOString(); step.status = "in_progress"; - step.startedAt = new Date().toISOString(); + step.startedAt = now; step.completedAt = null; step.error = null; session.lastStepStarted = stepName; session.failure = null; session.status = "in_progress"; + const state = machineStateFromOnboardSessionStep(stepName); + if (state) transitionMachineSnapshot(session, state, now); shouldEmit = true; return session; }); @@ -915,12 +1036,15 @@ export function markStepComplete(stepName: string, updates: SessionUpdates = {}) const updatedSession = updateSession((session) => { const step = session.steps[stepName]; if (!step) return session; + const now = new Date().toISOString(); step.status = "complete"; - step.completedAt = new Date().toISOString(); + step.completedAt = now; step.error = null; session.lastCompletedStep = stepName; session.failure = null; Object.assign(session, safeUpdates); + const nextState = nextMachineStateAfterCompletedStep(stepName, session); + if (nextState) transitionMachineSnapshot(session, nextState, now); shouldEmit = true; return session; }); @@ -968,15 +1092,17 @@ export function markStepFailed(stepName: string, message: string | null = null): const updatedSession = updateSession((session) => { const step = session.steps[stepName]; if (!step) return session; + const now = new Date().toISOString(); step.status = "failed"; step.completedAt = null; step.error = redactSensitiveText(message); session.failure = sanitizeFailure({ step: stepName, message, - recordedAt: new Date().toISOString(), + recordedAt: now, }); session.status = "failed"; + transitionMachineSnapshot(session, "failed", now); shouldEmit = true; return session; }); @@ -1005,10 +1131,12 @@ export function markStepFailed(stepName: string, message: string | null = null): export function completeSession(updates: SessionUpdates = {}): Session { const safeUpdates = filterSafeUpdates(updates); const updatedSession = updateSession((session) => { + const now = new Date().toISOString(); Object.assign(session, safeUpdates); session.status = "complete"; session.resumable = false; session.failure = null; + transitionMachineSnapshot(session, "complete", now); return session; }); if (Object.keys(safeUpdates).length > 0) { @@ -1057,6 +1185,7 @@ export function summarizeForDebug( lastStepStarted: session.lastStepStarted, lastCompletedStep: session.lastCompletedStep, failure: sanitizeFailure(session.failure), + machine: session.machine, steps: Object.fromEntries( Object.entries(session.steps).map(([name, step]) => [ name, From f756907b5c07a0bb2d09049ab6b4fa7cda681709 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 22:12:25 -0700 Subject: [PATCH 04/54] refactor(cli): add onboard runtime shell --- src/lib/onboard/machine/runtime.test.ts | 184 +++++++++++++++++ src/lib/onboard/machine/runtime.ts | 263 ++++++++++++++++++++++++ 2 files changed, 447 insertions(+) create mode 100644 src/lib/onboard/machine/runtime.test.ts create mode 100644 src/lib/onboard/machine/runtime.ts diff --git a/src/lib/onboard/machine/runtime.test.ts b/src/lib/onboard/machine/runtime.test.ts new file mode 100644 index 0000000000..becca6028e --- /dev/null +++ b/src/lib/onboard/machine/runtime.test.ts @@ -0,0 +1,184 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; + +import { + createSession, + filterSafeUpdates, + normalizeSession, + type Session, +} from "../../state/onboard-session"; +import type { OnboardMachineEvent } from "./events"; +import { OnboardRuntime, type OnboardRuntimeDeps } from "./runtime"; +import { InvalidOnboardMachineTransitionError } from "./transitions"; + +function cloneSession(session: Session): Session { + return normalizeSession(JSON.parse(JSON.stringify(session))) ?? session; +} + +function createHarness(initialSession: Session | null = createSession()) { + let session = initialSession ? cloneSession(initialSession) : null; + const events: OnboardMachineEvent[] = []; + let tick = 0; + const deps: OnboardRuntimeDeps = { + loadSession: () => (session ? cloneSession(session) : null), + createSession: (overrides) => createSession(overrides), + saveSession: (next) => { + session = cloneSession(next); + return cloneSession(session); + }, + updateSession: (mutator) => { + const current = session ? cloneSession(session) : createSession(); + const next = mutator(current) ?? current; + session = cloneSession(next); + return cloneSession(session); + }, + filterSafeUpdates, + emitEvent: (event) => events.push(event), + now: () => `2026-05-19T00:00:${String(tick++).padStart(2, "0")}.000Z`, + }; + return { + runtime: new OnboardRuntime(deps), + events, + getSession: () => { + if (!session) throw new Error("Expected runtime session"); + return cloneSession(session); + }, + }; +} + +function sessionInState(state: Session["machine"]["state"]): Session { + const session = createSession(); + session.machine = { + version: 1, + state, + stateEnteredAt: "2026-05-19T00:00:00.000Z", + revision: 7, + }; + return session; +} + +describe("OnboardRuntime", () => { + it("starts a session and emits started/resumed lifecycle events", async () => { + const { runtime, events, getSession } = createHarness(null); + + const started = await runtime.start(); + expect(started.machine.state).toBe("init"); + expect(getSession().machine.state).toBe("init"); + expect(events[0]).toMatchObject({ type: "onboard.started", state: "init" }); + + await runtime.start({ resumed: true }); + expect(events[1]).toMatchObject({ type: "onboard.resumed", state: "init" }); + }); + + it("validates and persists explicit transitions", async () => { + const { runtime, events, getSession } = createHarness(); + + await runtime.transition("preflight"); + + expect(getSession().machine).toEqual({ + version: 1, + state: "preflight", + stateEnteredAt: "2026-05-19T00:00:00.000Z", + revision: 1, + }); + expect(events.map((event) => event.type)).toEqual(["state.exited", "state.entered"]); + expect(events[0]).toMatchObject({ state: "init" }); + expect(events[1]).toMatchObject({ state: "preflight" }); + + await expect(runtime.transition("sandbox")).rejects.toThrow( + InvalidOnboardMachineTransitionError, + ); + expect(getSession().machine.state).toBe("preflight"); + }); + + it("applies only safe context updates and emits redacted context events", async () => { + const { runtime, events, getSession } = createHarness(); + + await runtime.updateContext({ + provider: "nvidia-prod", + endpointUrl: "https://alice:secret@example.com/v1?token=super-secret&keep=yes#token=frag", + credentialEnv: "NVIDIA_API_KEY", + apiKey: "super-secret", + } as Parameters[0] & { apiKey: string }); + + expect(getSession()).toMatchObject({ + provider: "nvidia-prod", + endpointUrl: "https://example.com/v1?token=%3CREDACTED%3E&keep=yes", + credentialEnv: "NVIDIA_API_KEY", + }); + expect("apiKey" in getSession()).toBe(false); + expect(events).toHaveLength(1); + expect(events[0]).toMatchObject({ type: "context.updated", state: "init" }); + expect(events[0].metadata.fields).toEqual(["provider", "endpointUrl", "credentialEnv"]); + expect(JSON.stringify(events)).not.toContain("super-secret"); + }); + + it("fails non-terminal sessions with redacted failure events", async () => { + const { runtime, events, getSession } = createHarness(sessionInState("gateway")); + + await runtime.fail("NVIDIA_API_KEY=super-secret", { step: "gateway" }); + + expect(getSession()).toMatchObject({ + status: "failed", + failure: { step: "gateway", message: "NVIDIA_API_KEY=" }, + machine: { state: "failed", revision: 8 }, + }); + expect(events.map((event) => event.type)).toEqual(["state.failed", "onboard.failed"]); + expect(events[0]).toMatchObject({ state: "gateway", step: "gateway" }); + expect(events[1]).toMatchObject({ state: "failed", step: "gateway" }); + expect(JSON.stringify(events)).not.toContain("super-secret"); + }); + + it("rejects terminal-state failure and invalid completion transitions", async () => { + const completeHarness = createHarness(sessionInState("complete")); + await expect(completeHarness.runtime.fail("boom")).rejects.toThrow("complete -> failed"); + expect(completeHarness.getSession().machine.state).toBe("complete"); + + const policiesHarness = createHarness(sessionInState("policies")); + await expect(policiesHarness.runtime.complete()).rejects.toThrow("policies -> complete"); + expect(policiesHarness.getSession().machine.state).toBe("policies"); + }); + + it("completes from post_verify and emits completion events", async () => { + const { runtime, events, getSession } = createHarness(sessionInState("post_verify")); + + await runtime.complete({ sandboxName: "my-assistant" }); + + expect(getSession()).toMatchObject({ + status: "complete", + resumable: false, + sandboxName: "my-assistant", + machine: { state: "complete", revision: 8 }, + }); + expect(events.map((event) => event.type)).toEqual([ + "context.updated", + "state.completed", + "state.entered", + "onboard.completed", + ]); + }); + + it("emits skipped and repair events without mutating durable state", async () => { + const { runtime, events, getSession } = createHarness(sessionInState("provider_selection")); + + await runtime.markSkipped("provider_selection", { reason: "resume" }); + await runtime.emitRepairEvent("state.repair.started", { + state: "provider_selection", + metadata: { action: "ollama-systemd" }, + }); + await runtime.emitRepairEvent("state.repair.completed", { state: "provider_selection" }); + + expect(getSession().machine.state).toBe("provider_selection"); + expect(events.map((event) => event.type)).toEqual([ + "state.skipped", + "state.repair.started", + "state.repair.completed", + ]); + expect(events[0].metadata.reason).toBe("resume"); + await expect(runtime.markSkipped("complete")).rejects.toThrow( + "Terminal onboarding state cannot be skipped", + ); + }); +}); diff --git a/src/lib/onboard/machine/runtime.ts b/src/lib/onboard/machine/runtime.ts new file mode 100644 index 0000000000..3e72cd0ccc --- /dev/null +++ b/src/lib/onboard/machine/runtime.ts @@ -0,0 +1,263 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { JsonObject } from "../../core/json-types"; +import * as onboardSession from "../../state/onboard-session"; +import type { Session, SessionUpdates } from "../../state/onboard-session"; +import { + createOnboardMachineEvent, + emitOnboardMachineEvent, + type OnboardMachineEvent, +} from "./events"; +import { + assertValidOnboardMachineTransition, + canTransitionOnboardMachineState, + isTerminalOnboardMachineState, +} from "./transitions"; +import type { OnboardMachineEventType, OnboardMachineState } from "./types"; + +export interface OnboardRuntimeDeps { + loadSession(): Session | null; + createSession(overrides?: Partial): Session; + saveSession(session: Session): Session; + updateSession(mutator: (session: Session) => Session | void): Session; + filterSafeUpdates(updates: SessionUpdates): Partial; + emitEvent(event: OnboardMachineEvent): void; + now(): string; +} + +export type OnboardRuntimeTransitionOptions = { + metadata?: Record | null; +}; + +export type OnboardRuntimeUpdateOptions = { + state?: OnboardMachineState | null; + metadata?: Record | null; +}; + +export type OnboardRuntimeFailureOptions = { + step?: string | null; + metadata?: Record | null; +}; + +function defaultDeps(): OnboardRuntimeDeps { + return { + loadSession: onboardSession.loadSession, + createSession: onboardSession.createSession, + saveSession: onboardSession.saveSession, + updateSession: onboardSession.updateSession, + filterSafeUpdates: onboardSession.filterSafeUpdates, + emitEvent: emitOnboardMachineEvent, + now: () => new Date().toISOString(), + }; +} + +function eventMetadata(metadata: Record | null | undefined): JsonObject { + return metadata && typeof metadata === "object" && !Array.isArray(metadata) + ? (metadata as JsonObject) + : {}; +} + +function snapshotFor( + state: OnboardMachineState, + stateEnteredAt: string | null, + revision: number, +): onboardSession.OnboardMachineSnapshot { + return { + version: onboardSession.MACHINE_SNAPSHOT_VERSION, + state, + stateEnteredAt, + revision: Math.max(0, Math.trunc(revision)), + }; +} + +export class OnboardRuntime { + private readonly deps: OnboardRuntimeDeps; + + constructor(deps: Partial = {}) { + this.deps = { ...defaultDeps(), ...deps }; + } + + async session(): Promise { + return this.ensureSession(); + } + + async start(options: { resumed?: boolean; metadata?: Record | null } = {}): Promise { + const session = this.ensureSession(); + this.emit(options.resumed === true ? "onboard.resumed" : "onboard.started", session, { + state: session.machine.state, + metadata: options.metadata, + }); + return session; + } + + async transition( + to: OnboardMachineState, + options: OnboardRuntimeTransitionOptions = {}, + ): Promise { + const current = this.ensureSession(); + const from = current.machine.state; + assertValidOnboardMachineTransition(from, to); + + const enteredAt = this.deps.now(); + const updated = this.deps.updateSession((session) => { + session.machine = snapshotFor(to, enteredAt, session.machine.revision + 1); + if (to === "failed") { + session.status = "failed"; + } else if (to === "complete") { + session.status = "complete"; + session.resumable = false; + session.failure = null; + } else if (session.status !== "failed") { + session.status = "in_progress"; + } + return session; + }); + + this.emit("state.exited", updated, { state: from, metadata: options.metadata }); + this.emit("state.entered", updated, { state: to, metadata: options.metadata }); + return updated; + } + + async updateContext( + updates: SessionUpdates, + options: OnboardRuntimeUpdateOptions = {}, + ): Promise { + const safeUpdates = this.deps.filterSafeUpdates(updates); + const fields = Object.keys(safeUpdates); + const updated = this.deps.updateSession((session) => { + Object.assign(session, safeUpdates); + return session; + }); + if (fields.length > 0) { + this.emit("context.updated", updated, { + state: options.state ?? updated.machine.state, + metadata: { ...eventMetadata(options.metadata), fields }, + }); + } + return updated; + } + + async complete(updates: SessionUpdates = {}): Promise { + const current = this.ensureSession(); + const from = current.machine.state; + assertValidOnboardMachineTransition(from, "complete"); + + const safeUpdates = this.deps.filterSafeUpdates(updates); + const fields = Object.keys(safeUpdates); + const enteredAt = this.deps.now(); + const updated = this.deps.updateSession((session) => { + Object.assign(session, safeUpdates); + session.status = "complete"; + session.resumable = false; + session.failure = null; + session.machine = snapshotFor("complete", enteredAt, session.machine.revision + 1); + return session; + }); + + if (fields.length > 0) { + this.emit("context.updated", updated, { + state: "complete", + metadata: { fields }, + }); + } + this.emit("state.completed", updated, { state: from }); + this.emit("state.entered", updated, { state: "complete" }); + this.emit("onboard.completed", updated, { state: "complete" }); + return updated; + } + + async fail(message: string | null, options: OnboardRuntimeFailureOptions = {}): Promise { + const current = this.ensureSession(); + const from = current.machine.state; + if (!canTransitionOnboardMachineState(from, "failed")) { + assertValidOnboardMachineTransition(from, "failed"); + } + + const recordedAt = this.deps.now(); + const updated = this.deps.updateSession((session) => { + session.status = "failed"; + session.failure = onboardSession.sanitizeFailure({ + step: options.step ?? null, + message, + recordedAt, + }); + session.machine = snapshotFor("failed", recordedAt, session.machine.revision + 1); + return session; + }); + + this.emit("state.failed", updated, { + state: from, + step: options.step, + error: message, + metadata: options.metadata, + }); + this.emit("onboard.failed", updated, { + state: "failed", + step: options.step, + error: message, + metadata: options.metadata, + }); + return updated; + } + + async markSkipped( + state: OnboardMachineState, + metadata: Record | null = null, + ): Promise { + const session = this.ensureSession(); + if (isTerminalOnboardMachineState(state)) { + throw new Error(`Terminal onboarding state cannot be skipped: ${state}`); + } + this.emit("state.skipped", session, { state, metadata }); + return session; + } + + async emitRepairEvent( + type: Extract< + OnboardMachineEventType, + "state.repair.started" | "state.repair.completed" | "state.repair.failed" + >, + options: { + state?: OnboardMachineState | null; + error?: string | null; + metadata?: Record | null; + } = {}, + ): Promise { + const session = this.ensureSession(); + this.emit(type, session, { + state: options.state ?? session.machine.state, + error: options.error ?? null, + metadata: options.metadata, + }); + return session; + } + + private ensureSession(): Session { + const existing = this.deps.loadSession(); + if (existing) return existing; + return this.deps.saveSession(this.deps.createSession()); + } + + private emit( + type: OnboardMachineEventType, + session: Session, + options: { + state?: OnboardMachineState | null; + step?: string | null; + error?: string | null; + metadata?: Record | null; + } = {}, + ): void { + this.deps.emitEvent( + createOnboardMachineEvent({ + type, + session, + state: options.state ?? session.machine.state, + step: options.step ?? null, + error: options.error ?? null, + metadata: options.metadata, + }), + ); + } +} From 702454b2d9c3547a95a4c51f4f4ec9a6c5780ca0 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 22:26:42 -0700 Subject: [PATCH 05/54] refactor(cli): route onboard step boundaries through runtime --- src/lib/agent/onboard.ts | 4 +- src/lib/onboard.ts | 87 +++++++++++++++---------- src/lib/onboard/machine/runtime.test.ts | 56 ++++++++++++++-- src/lib/onboard/machine/runtime.ts | 30 +++++++++ 4 files changed, 135 insertions(+), 42 deletions(-) diff --git a/src/lib/agent/onboard.ts b/src/lib/agent/onboard.ts index 2446108910..f08c32b9c6 100644 --- a/src/lib/agent/onboard.ts +++ b/src/lib/agent/onboard.ts @@ -31,7 +31,7 @@ export interface OnboardContext { buildSandboxConfigSyncScript: (config: LooseObject) => string; writeSandboxConfigSyncFile: (script: string) => string; cleanupTempDir: (file: string, prefix: string) => void; - startRecordedStep: (stepName: string, updates: LooseObject) => void; + startRecordedStep: (stepName: string, updates: LooseObject) => Promise; skippedStepMessage: (stepName: string, sandboxName: string) => void; } @@ -424,7 +424,7 @@ export async function handleAgentSetup( } } - startRecordedStep("agent_setup", { sandboxName, provider, model }); + await startRecordedStep("agent_setup", { sandboxName, provider, model }); step(7, 8, `Setting up ${agent.displayName} inside sandbox`); const binaryAvailability = verifyAgentBinaryAvailable(sandboxName, agent, runCaptureOpenshell); diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index bc231df3a5..470639b346 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -279,6 +279,7 @@ const { resolveSandboxImageTagFromCreateOutput } = require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag"); const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); +const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); const policies: typeof import("./policy") = require("./policy"); const tiers: typeof import("./policy/tiers") = require("./policy/tiers"); const { ensureUsageNoticeConsent } = require("./onboard/usage-notice"); @@ -409,6 +410,7 @@ const USE_COLOR = !process.env.NO_COLOR && !!process.stdout.isTTY; const DIM = USE_COLOR ? "\x1b[2m" : ""; const RESET = USE_COLOR ? "\x1b[0m" : ""; let OPENSHELL_BIN: string | null = null; +let ONBOARD_RUNTIME: import("./onboard/machine/runtime").OnboardRuntime | null = null; const GATEWAY_NAME = "nemoclaw"; const BACK_TO_SELECTION = "__NEMOCLAW_BACK_TO_SELECTION__"; type HermesAuthMethod = "oauth" | "api_key"; @@ -9017,7 +9019,12 @@ function toSessionUpdates( return normalized; } -function startRecordedStep( +function getOnboardRuntime(): import("./onboard/machine/runtime").OnboardRuntime { + if (!ONBOARD_RUNTIME) ONBOARD_RUNTIME = new OnboardRuntime(); + return ONBOARD_RUNTIME; +} + +async function startRecordedStep( stepName: string, updates: { sandboxName?: string | null; @@ -9025,20 +9032,30 @@ function startRecordedStep( model?: string | null; policyPresets?: string[] | null; } = {}, -): void { - onboardSession.markStepStarted(stepName); +): Promise { + const runtime = getOnboardRuntime(); + await runtime.markStepStarted(stepName); if (Object.keys(updates).length > 0) { - onboardSession.updateSession((session: Session) => { - if (updates.sandboxName !== undefined) session.sandboxName = updates.sandboxName; - if (updates.provider !== undefined) session.provider = updates.provider; - if (updates.model !== undefined) session.model = updates.model; - if (updates.policyPresets !== undefined) session.policyPresets = updates.policyPresets; - return session; - }); + await runtime.updateContext(toSessionUpdates(updates)); } maybeForceE2eStepFailure(stepName); } +async function recordStepComplete( + stepName: string, + updates: SessionUpdates = {}, +): Promise { + return getOnboardRuntime().markStepComplete(stepName, updates); +} + +async function recordStepSkipped(stepName: string): Promise { + return getOnboardRuntime().markStepSkipped(stepName); +} + +async function recordSessionComplete(updates: SessionUpdates = {}): Promise { + return getOnboardRuntime().completeSession(updates); +} + const ONBOARD_STEP_INDEX: Record = { preflight: { number: 1, title: "Preflight checks" }, gateway: { number: 2, title: "Starting OpenShell gateway" }, @@ -9074,6 +9091,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; _preflightDashboardPort = opts.controlUiPort || null; + ONBOARD_RUNTIME = new OnboardRuntime(); delete process.env.OPENSHELL_GATEWAY; const resume = opts.resume === true; const fresh = opts.fresh === true; @@ -9422,9 +9440,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { }), ); } else { - startRecordedStep("preflight"); + await startRecordedStep("preflight"); gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }); - onboardSession.markStepComplete("preflight"); + await recordStepComplete("preflight"); } const sandboxGpuConfig = resolveSandboxGpuConfig(gpu, { flag: effectiveSandboxGpuFlag, @@ -9560,11 +9578,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway; if (resumeGateway) { skippedStepMessage("gateway", "running"); - onboardSession.markStepComplete("gateway"); + await recordStepComplete("gateway"); } else if (!resume && canReuseHealthyGateway) { skippedStepMessage("gateway", "running", "reuse"); note(" Reusing healthy NemoClaw gateway."); - onboardSession.markStepComplete("gateway"); + await recordStepComplete("gateway"); } else { if (resume && session?.steps?.gateway?.status === "complete") { if (gatewayReuseState === "active-unnamed") { @@ -9582,9 +9600,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { retireLegacyGatewayForDockerDriverUpgrade(); gatewayReuseState = "missing"; } - startRecordedStep("gateway"); + await startRecordedStep("gateway"); await startGateway(gpu, { gpuPassthrough }); - onboardSession.markStepComplete("gateway"); + await recordStepComplete("gateway"); } // #2753: prefer requestedSandboxName over an unconfirmed session name. @@ -9635,7 +9653,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { // below). A SIGINT between any earlier step and createSandbox would // otherwise leave a phantom that `nemoclaw list` resurrects until // manually destroyed. - startRecordedStep("provider_selection"); + await startRecordedStep("provider_selection"); const selection = await setupNim(gpu, sandboxName, agent); model = selection.model; provider = selection.provider; @@ -9645,7 +9663,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { hermesToolGateways = selection.hermesToolGateways; preferredInferenceApi = selection.preferredInferenceApi; nimContainer = selection.nimContainer; - onboardSession.markStepComplete( + await recordStepComplete( "provider_selection", toSessionUpdates({ provider, @@ -9678,7 +9696,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (!sandboxName) { sandboxName = await promptValidatedSandboxName(agent); } - startRecordedStep("inference", { provider, model }); + await startRecordedStep("inference", { provider, model }); const inferenceResult = await setupInference( sandboxName, model, @@ -9692,7 +9710,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { forceProviderSelection = true; continue; } - onboardSession.markStepComplete( + await recordStepComplete( "inference", toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), ); @@ -9712,7 +9730,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (nimContainer && sandboxName) { registry.updateSandbox(sandboxName, { nimContainer }); } - onboardSession.markStepComplete( + await recordStepComplete( "inference", toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), ); @@ -9751,7 +9769,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { } } - startRecordedStep("inference", { provider, model }); + await startRecordedStep("inference", { provider, model }); const inferenceResult = await setupInference( sandboxName, model, @@ -9769,7 +9787,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (nimContainer && sandboxName) { registry.updateSandbox(sandboxName, { nimContainer }); } - onboardSession.markStepComplete( + await recordStepComplete( "inference", toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), ); @@ -9906,7 +9924,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { } else { nextWebSearchConfig = await configureWebSearch(null, agent, webSearchSupportProbePath); } - startRecordedStep("sandbox", { provider, model }); + await startRecordedStep("sandbox", { provider, model }); const recordedMessagingChannels = getRecordedMessagingChannelsForResume(resume, session, sandboxName); if (recordedMessagingChannels) { selectedMessagingChannels = recordedMessagingChannels; @@ -9960,7 +9978,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { ...getSandboxAgentRegistryFields(agent, !fromDockerfile), }); registry.setDefault(sandboxName); - onboardSession.markStepComplete( + await recordStepComplete( "sandbox", toSessionUpdates({ sandboxName, @@ -9996,24 +10014,24 @@ async function onboard(opts: OnboardOptions = {}): Promise { skippedStepMessage, }); ensureAgentDashboardForward(sandboxName, agent); - onboardSession.markStepSkipped("openclaw"); + await recordStepSkipped("openclaw"); } else { const resumeOpenclaw = resume && sandboxName && isOpenclawReady(sandboxName); if (resumeOpenclaw) { skippedStepMessage("openclaw", sandboxName); - onboardSession.markStepComplete( + await recordStepComplete( "openclaw", toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), ); } else { - startRecordedStep("openclaw", { sandboxName, provider, model }); + await startRecordedStep("openclaw", { sandboxName, provider, model }); await setupOpenclaw(sandboxName, model, provider); - onboardSession.markStepComplete( + await recordStepComplete( "openclaw", toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), ); } - onboardSession.markStepSkipped("agent_setup"); + await recordStepSkipped("agent_setup"); } const latestSession = onboardSession.loadSession(); @@ -10066,7 +10084,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { arePolicyPresetsApplied(sandboxName, recordedPolicyPresetsForSupport); if (resumePolicies) { skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", ")); - onboardSession.markStepComplete( + await recordStepComplete( "policies", toSessionUpdates({ sandboxName, @@ -10076,7 +10094,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { }), ); } else { - startRecordedStep("policies", { + await startRecordedStep("policies", { sandboxName, provider, model, @@ -10102,7 +10120,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { }); }, }); - onboardSession.markStepComplete( + await recordStepComplete( "policies", toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }), ); @@ -10112,7 +10130,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { ensureAgentDashboardForward(sandboxName, agent); } - onboardSession.completeSession( + await recordSessionComplete( toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), ); completed = true; @@ -10192,6 +10210,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { printDashboard(sandboxName, model, provider, nimContainer, agent); } finally { releaseOnboardLock(); + ONBOARD_RUNTIME = null; } } diff --git a/src/lib/onboard/machine/runtime.test.ts b/src/lib/onboard/machine/runtime.test.ts index becca6028e..7b26269541 100644 --- a/src/lib/onboard/machine/runtime.test.ts +++ b/src/lib/onboard/machine/runtime.test.ts @@ -7,7 +7,9 @@ import { createSession, filterSafeUpdates, normalizeSession, + sanitizeFailure, type Session, + type SessionUpdates, } from "../../state/onboard-session"; import type { OnboardMachineEvent } from "./events"; import { OnboardRuntime, type OnboardRuntimeDeps } from "./runtime"; @@ -21,6 +23,12 @@ function createHarness(initialSession: Session | null = createSession()) { let session = initialSession ? cloneSession(initialSession) : null; const events: OnboardMachineEvent[] = []; let tick = 0; + const updateSession = (mutator: (value: Session) => Session | void): Session => { + const current = session ? cloneSession(session) : createSession(); + const next = mutator(current) ?? current; + session = cloneSession(next); + return cloneSession(session); + }; const deps: OnboardRuntimeDeps = { loadSession: () => (session ? cloneSession(session) : null), createSession: (overrides) => createSession(overrides), @@ -28,12 +36,48 @@ function createHarness(initialSession: Session | null = createSession()) { session = cloneSession(next); return cloneSession(session); }, - updateSession: (mutator) => { - const current = session ? cloneSession(session) : createSession(); - const next = mutator(current) ?? current; - session = cloneSession(next); - return cloneSession(session); - }, + updateSession, + markStepStarted: (stepName) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "in_progress"; + current.lastStepStarted = stepName; + current.status = "in_progress"; + return current; + }), + markStepComplete: (stepName, updates: SessionUpdates = {}) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "complete"; + current.lastCompletedStep = stepName; + Object.assign(current, filterSafeUpdates(updates)); + return current; + }), + markStepSkipped: (stepName) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "skipped"; + return current; + }), + markStepFailed: (stepName, message) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "failed"; + current.status = "failed"; + current.failure = sanitizeFailure({ step: stepName, message, recordedAt: "now" }); + return current; + }), + completeSession: (updates: SessionUpdates = {}) => + updateSession((current) => { + Object.assign(current, filterSafeUpdates(updates)); + current.status = "complete"; + current.resumable = false; + return current; + }), filterSafeUpdates, emitEvent: (event) => events.push(event), now: () => `2026-05-19T00:00:${String(tick++).padStart(2, "0")}.000Z`, diff --git a/src/lib/onboard/machine/runtime.ts b/src/lib/onboard/machine/runtime.ts index 3e72cd0ccc..2e5d584f3b 100644 --- a/src/lib/onboard/machine/runtime.ts +++ b/src/lib/onboard/machine/runtime.ts @@ -21,6 +21,11 @@ export interface OnboardRuntimeDeps { createSession(overrides?: Partial): Session; saveSession(session: Session): Session; updateSession(mutator: (session: Session) => Session | void): Session; + markStepStarted(stepName: string): Session; + markStepComplete(stepName: string, updates?: SessionUpdates): Session; + markStepSkipped(stepName: string): Session; + markStepFailed(stepName: string, message?: string | null): Session; + completeSession(updates?: SessionUpdates): Session; filterSafeUpdates(updates: SessionUpdates): Partial; emitEvent(event: OnboardMachineEvent): void; now(): string; @@ -46,6 +51,11 @@ function defaultDeps(): OnboardRuntimeDeps { createSession: onboardSession.createSession, saveSession: onboardSession.saveSession, updateSession: onboardSession.updateSession, + markStepStarted: onboardSession.markStepStarted, + markStepComplete: onboardSession.markStepComplete, + markStepSkipped: onboardSession.markStepSkipped, + markStepFailed: onboardSession.markStepFailed, + completeSession: onboardSession.completeSession, filterSafeUpdates: onboardSession.filterSafeUpdates, emitEvent: emitOnboardMachineEvent, now: () => new Date().toISOString(), @@ -91,6 +101,26 @@ export class OnboardRuntime { return session; } + async markStepStarted(stepName: string): Promise { + return this.deps.markStepStarted(stepName); + } + + async markStepComplete(stepName: string, updates: SessionUpdates = {}): Promise { + return this.deps.markStepComplete(stepName, updates); + } + + async markStepSkipped(stepName: string): Promise { + return this.deps.markStepSkipped(stepName); + } + + async markStepFailed(stepName: string, message: string | null = null): Promise { + return this.deps.markStepFailed(stepName, message); + } + + async completeSession(updates: SessionUpdates = {}): Promise { + return this.deps.completeSession(updates); + } + async transition( to: OnboardMachineState, options: OnboardRuntimeTransitionOptions = {}, From 60acb65261157d19741963f604f148474da04218 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 22:35:53 -0700 Subject: [PATCH 06/54] refactor(cli): add observe-only onboard hooks --- src/lib/onboard/machine/hooks.test.ts | 150 ++++++++++++++++++++++++++ src/lib/onboard/machine/hooks.ts | 132 +++++++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 src/lib/onboard/machine/hooks.test.ts create mode 100644 src/lib/onboard/machine/hooks.ts diff --git a/src/lib/onboard/machine/hooks.test.ts b/src/lib/onboard/machine/hooks.test.ts new file mode 100644 index 0000000000..ec0fe0fcc7 --- /dev/null +++ b/src/lib/onboard/machine/hooks.test.ts @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; + +import { createSession } from "../../state/onboard-session"; +import { + clearOnboardMachineEventListeners, + createOnboardMachineEvent, + emitOnboardMachineEvent, + type OnboardMachineEvent, +} from "./events"; +import { createJsonlOnboardHook, OnboardHookDispatcher, registerOnboardHooks } from "./hooks"; + +function sampleEvent(): OnboardMachineEvent { + const session = createSession({ + sessionId: "session-1", + provider: "nvidia-prod", + endpointUrl: "https://example.com/v1?token=secret&keep=yes", + }); + return createOnboardMachineEvent({ + type: "state.entered", + session, + state: "gateway", + step: "gateway", + }); +} + +afterEach(() => { + clearOnboardMachineEventListeners(); +}); + +describe("onboard machine hooks", () => { + it("dispatches observe-only events and emits hook lifecycle events", async () => { + const observed: string[] = []; + const lifecycle: OnboardMachineEvent[] = []; + const dispatcher = new OnboardHookDispatcher( + [ + { + name: "observer", + onEvent(event) { + observed.push(event.type); + }, + }, + ], + { + emitEvent: (event) => lifecycle.push(event), + now: () => "2026-05-19T01:00:00.000Z", + }, + ); + + await dispatcher.dispatch(sampleEvent()); + + expect(observed).toEqual(["state.entered"]); + expect(lifecycle.map((event) => event.type)).toEqual(["hook.started", "hook.completed"]); + expect(lifecycle[0]).toMatchObject({ + sessionId: "session-1", + state: "gateway", + step: "gateway", + metadata: { hook: "observer", sourceType: "state.entered" }, + }); + }); + + it("warns and emits hook.failed without throwing when a hook fails", async () => { + const warnings: string[] = []; + const lifecycle: OnboardMachineEvent[] = []; + const dispatcher = new OnboardHookDispatcher( + [ + { + name: "bad-hook", + async onEvent() { + throw new Error("Bearer super-secret-token"); + }, + }, + ], + { + warn: (message) => warnings.push(message), + emitEvent: (event) => lifecycle.push(event), + now: () => "2026-05-19T01:00:00.000Z", + }, + ); + + await expect(dispatcher.dispatch(sampleEvent())).resolves.toBeUndefined(); + + expect(lifecycle.map((event) => event.type)).toEqual(["hook.started", "hook.failed"]); + expect(lifecycle[1]).toMatchObject({ + type: "hook.failed", + error: "Bearer ", + metadata: { hook: "bad-hook", sourceType: "state.entered" }, + }); + expect(warnings).toEqual(["Onboard hook 'bad-hook' failed: Bearer "]); + expect(JSON.stringify(lifecycle)).not.toContain("super-secret-token"); + expect(warnings.join("\n")).not.toContain("super-secret-token"); + }); + + it("writes JSONL hook events to an external sink", async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-hooks-")); + try { + const filePath = path.join(tmpDir, "events.jsonl"); + const hook = createJsonlOnboardHook(filePath); + + await hook.onEvent?.(sampleEvent()); + await hook.onEvent?.( + createOnboardMachineEvent({ + type: "state.completed", + session: createSession({ sessionId: "session-1" }), + state: "gateway", + step: "gateway", + }), + ); + + const lines = fs + .readFileSync(filePath, "utf8") + .trim() + .split("\n") + .map((line) => JSON.parse(line)); + expect(lines.map((event) => event.type)).toEqual(["state.entered", "state.completed"]); + expect(lines[0].context.endpointUrl).toBe( + "https://example.com/v1?token=%3CREDACTED%3E&keep=yes", + ); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }); + + it("registers hooks on the machine event bus without redispatching hook lifecycle events", async () => { + const observed: string[] = []; + const unregister = registerOnboardHooks([ + { + name: "bus-observer", + onEvent(event) { + observed.push(event.type); + }, + }, + ]); + + emitOnboardMachineEvent(sampleEvent()); + await Promise.resolve(); + emitOnboardMachineEvent({ ...sampleEvent(), type: "hook.failed" }); + await Promise.resolve(); + unregister(); + emitOnboardMachineEvent({ ...sampleEvent(), type: "state.completed" }); + await Promise.resolve(); + + expect(observed).toEqual(["state.entered"]); + }); +}); diff --git a/src/lib/onboard/machine/hooks.ts b/src/lib/onboard/machine/hooks.ts new file mode 100644 index 0000000000..1dfcd7544a --- /dev/null +++ b/src/lib/onboard/machine/hooks.ts @@ -0,0 +1,132 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import path from "node:path"; + +import { redactSensitiveText } from "../../security/redact"; +import { + addOnboardMachineEventListener, + emitOnboardMachineEvent, + sanitizeOnboardMachineEventMetadata, + type OnboardMachineEvent, + type OnboardMachineEventListener, +} from "./events"; + +export interface OnboardHook { + name?: string; + onEvent?(event: OnboardMachineEvent): Promise | void; +} + +export interface OnboardHookDispatchOptions { + warn?: (message: string) => void; + emitEvent?: (event: OnboardMachineEvent) => void; + now?: () => string; +} + +export interface OnboardHookRegistrationOptions extends OnboardHookDispatchOptions { + includeHookEvents?: boolean; +} + +function hookName(hook: OnboardHook, index: number): string { + const name = typeof hook.name === "string" ? hook.name.trim() : ""; + return name || `hook-${index + 1}`; +} + +function hookLifecycleEvent( + source: OnboardMachineEvent, + type: "hook.started" | "hook.completed" | "hook.failed", + hook: OnboardHook, + index: number, + options: { + occurredAt: string; + error?: unknown; + metadata?: Record; + }, +): OnboardMachineEvent { + return { + version: 1, + type, + occurredAt: options.occurredAt, + sessionId: source.sessionId, + state: source.state, + step: source.step, + context: source.context, + error: redactSensitiveText(options.error instanceof Error ? options.error.message : options.error), + metadata: sanitizeOnboardMachineEventMetadata({ + hook: hookName(hook, index), + sourceType: source.type, + ...options.metadata, + }), + }; +} + +function isHookLifecycleEvent(event: OnboardMachineEvent): boolean { + return event.type === "hook.started" || event.type === "hook.completed" || event.type === "hook.failed"; +} + +export class OnboardHookDispatcher { + private readonly hooks: readonly OnboardHook[]; + private readonly warn: (message: string) => void; + private readonly emitEvent: (event: OnboardMachineEvent) => void; + private readonly now: () => string; + + constructor(hooks: readonly OnboardHook[], options: OnboardHookDispatchOptions = {}) { + this.hooks = hooks; + this.warn = options.warn ?? ((message) => console.warn(message)); + this.emitEvent = options.emitEvent ?? emitOnboardMachineEvent; + this.now = options.now ?? (() => new Date().toISOString()); + } + + async dispatch(event: OnboardMachineEvent): Promise { + for (const [index, hook] of this.hooks.entries()) { + if (typeof hook.onEvent !== "function") continue; + this.emitEvent( + hookLifecycleEvent(event, "hook.started", hook, index, { + occurredAt: this.now(), + }), + ); + try { + await hook.onEvent(event); + this.emitEvent( + hookLifecycleEvent(event, "hook.completed", hook, index, { + occurredAt: this.now(), + }), + ); + } catch (error) { + const name = hookName(hook, index); + const message = error instanceof Error ? error.message : String(error); + this.warn(`Onboard hook '${name}' failed: ${redactSensitiveText(message) ?? ""}`); + this.emitEvent( + hookLifecycleEvent(event, "hook.failed", hook, index, { + occurredAt: this.now(), + error: message, + }), + ); + } + } + } +} + +export function registerOnboardHooks( + hooks: readonly OnboardHook[], + options: OnboardHookRegistrationOptions = {}, +): () => void { + const dispatcher = new OnboardHookDispatcher(hooks, options); + const listener: OnboardMachineEventListener = (event) => { + if (options.includeHookEvents !== true && isHookLifecycleEvent(event)) return; + void dispatcher.dispatch(event); + }; + return addOnboardMachineEventListener(listener); +} + +export function createJsonlOnboardHook(filePath: string): OnboardHook { + const resolvedPath = path.resolve(filePath); + return { + name: "jsonl", + onEvent(event) { + fs.mkdirSync(path.dirname(resolvedPath), { recursive: true, mode: 0o700 }); + fs.appendFileSync(resolvedPath, `${JSON.stringify(event)}\n`, { mode: 0o600 }); + }, + }; +} From c2a58e6053babf96f876e0bddbd44a1f865a9340 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 22:45:51 -0700 Subject: [PATCH 07/54] refactor(cli): extract onboard preflight handler --- src/lib/onboard.ts | 87 ++++----- .../machine/handlers/preflight.test.ts | 183 ++++++++++++++++++ src/lib/onboard/machine/handlers/preflight.ts | 147 ++++++++++++++ 3 files changed, 363 insertions(+), 54 deletions(-) create mode 100644 src/lib/onboard/machine/handlers/preflight.test.ts create mode 100644 src/lib/onboard/machine/handlers/preflight.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 470639b346..50c5187326 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -280,6 +280,7 @@ const { resolveSandboxImageTagFromCreateOutput } = const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); +const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); const policies: typeof import("./policy") = require("./policy"); const tiers: typeof import("./policy/tiers") = require("./policy/tiers"); const { ensureUsageNoticeConsent } = require("./onboard/usage-notice"); @@ -9403,54 +9404,39 @@ async function onboard(opts: OnboardOptions = {}): Promise { console.log(" ==================="); const explicitSandboxGpuFlag = resolveSandboxGpuFlagFromOptions(opts); - const resumePreflight = resume && session?.steps?.preflight?.status === "complete"; - const resumeHasResolvedGpuIntent = - resumePreflight && - explicitSandboxGpuFlag === null && - opts.sandboxGpuDevice == null && - process.env.NEMOCLAW_SANDBOX_GPU === undefined && - process.env.NEMOCLAW_SANDBOX_GPU_DEVICE === undefined; - const resumedSandboxGpuOverrides = resumeHasResolvedGpuIntent - ? getResumeSandboxGpuOverrides( - resumeSandboxNameForGpu ? registry.getSandbox(resumeSandboxNameForGpu) : null, - session?.gpuPassthrough, - ) - : { flag: null, device: null }; - const effectiveSandboxGpuFlag = explicitSandboxGpuFlag ?? resumedSandboxGpuOverrides.flag; - const effectiveSandboxGpuDevice = opts.sandboxGpuDevice ?? resumedSandboxGpuOverrides.device; - let gpu; - if (resumePreflight) { - skippedStepMessage("preflight", "cached"); - gpu = nim.detectGpu(); - // Re-check the CDI spec gap on resume (#3152). The cached preflight - // result does not capture host CDI state, and the original onboard - // attempt that wrote the cache likely aborted at gateway-start with - // exactly this CDI failure — so resuming without re-checking would - // walk into the same wall. Honour persisted `gpuPassthrough: false` - // from the prior session as an opt-out, since the resume invocation - // does not need to re-pass `--no-gpu` to keep that intent (the same - // resolution is replayed a few lines below for `gpuPassthrough`). - const resumeOptedOutGpuPassthrough = - opts.noGpu === true || (opts.gpu !== true && session?.gpuPassthrough === false); - assertCdiNvidiaGpuSpecPresent(assessHost(), resumeOptedOutGpuPassthrough); - validateSandboxGpuPreflight( - resolveSandboxGpuConfig(gpu, { - flag: effectiveSandboxGpuFlag, - device: effectiveSandboxGpuDevice, - }), - ); - } else { - await startRecordedStep("preflight"); - gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }); - await recordStepComplete("preflight"); - } - const sandboxGpuConfig = resolveSandboxGpuConfig(gpu, { - flag: effectiveSandboxGpuFlag, - device: effectiveSandboxGpuDevice, + const preflightResult = await handlePreflightState({ + resume, + session, + recordedSandboxName, + requestedSandboxName, + explicitSandboxGpuFlag, + sandboxGpuDevice: opts.sandboxGpuDevice ?? null, + gpuRequested: opts.gpu === true, + noGpu: opts.noGpu === true, + env: process.env, + deps: { + getSandbox: registry.getSandbox.bind(registry), + getResumeSandboxGpuOverrides, + detectGpu: nim.detectGpu, + runPreflight: (preflightOptions) => preflight({ ...opts, ...preflightOptions }), + assessHost, + assertCdiNvidiaGpuSpecPresent, + resolveSandboxGpuConfig, + validateSandboxGpuPreflight, + skippedStepMessage, + startRecordedStep, + recordStepComplete, + updateSession: onboardSession.updateSession, + }, }); - - const requestedGpuPassthrough = opts.gpu === true; - const gpuPassthrough = sandboxGpuConfig.sandboxGpuEnabled; + session = preflightResult.session; + const { + sandboxGpuConfig, + resumeHasResolvedGpuIntent, + requestedGpuPassthrough, + gpuPassthrough, + } = preflightResult; + const gpu = preflightResult.gpu ?? null; if (gpuPassthrough) { note( resumeHasResolvedGpuIntent && session?.gpuPassthrough === true @@ -9472,13 +9458,6 @@ async function onboard(opts: OnboardOptions = {}): Promise { /* lspci not available — skip hint */ } } - // Persist GPU intent in the session so resume can restore it. - if (session && session.gpuPassthrough !== gpuPassthrough) { - session = onboardSession.updateSession((current: Session) => { - current.gpuPassthrough = gpuPassthrough; - return current; - }); - } dockerGpuLocalInference.configureLocalInferenceForDockerGpuHostNetwork(sandboxGpuConfig, { dockerDriverGateway: isLinuxDockerDriverGatewayEnabled(), note, diff --git a/src/lib/onboard/machine/handlers/preflight.test.ts b/src/lib/onboard/machine/handlers/preflight.test.ts new file mode 100644 index 0000000000..fa4b859915 --- /dev/null +++ b/src/lib/onboard/machine/handlers/preflight.test.ts @@ -0,0 +1,183 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type Session } from "../../../state/onboard-session"; +import { handlePreflightState, type PreflightStateOptions } from "./preflight"; + +type Gpu = { type: string } | null; +type SandboxEntry = { sandboxGpuEnabled?: boolean }; +type Host = { cdiNvidiaGpuSpecMissing?: boolean }; + +function createDeps(overrides: Partial["deps"]> = {}) { + let session = createSession(); + return { + calls: { + start: vi.fn(), + complete: vi.fn(), + skipped: vi.fn(), + detectGpu: vi.fn(() => ({ type: "nvidia" }) as Gpu), + runPreflight: vi.fn(async () => ({ type: "nvidia" }) as Gpu), + validate: vi.fn(), + cdi: vi.fn(), + updateSession: vi.fn(), + getSandbox: vi.fn(() => ({ sandboxGpuEnabled: true })), + getOverrides: vi.fn(() => ({ flag: "enable" as const, device: "0" })), + }, + deps: { + getSandbox: (name: string) => { + const value = ({ sandboxGpuEnabled: true } satisfies SandboxEntry); + return overrides.getSandbox ? overrides.getSandbox(name) : value; + }, + getResumeSandboxGpuOverrides: ( + sandbox: SandboxEntry | null, + sessionGpuPassthrough: boolean | null | undefined, + ) => { + if (overrides.getResumeSandboxGpuOverrides) { + return overrides.getResumeSandboxGpuOverrides(sandbox, sessionGpuPassthrough); + } + return { flag: "enable" as const, device: "0" }; + }, + detectGpu: () => ({ type: "nvidia" }) as Gpu, + runPreflight: async () => ({ type: "nvidia" }) as Gpu, + assessHost: () => ({ cdiNvidiaGpuSpecMissing: false }), + assertCdiNvidiaGpuSpecPresent: vi.fn(), + resolveSandboxGpuConfig: (_gpu: Gpu, opts: { flag: "enable" | "disable" | null; device: string | null | undefined }) => ({ + sandboxGpuEnabled: opts.flag === "enable", + mode: opts.flag === "enable" ? "1" : "0", + sandboxGpuDevice: opts.device, + }), + validateSandboxGpuPreflight: vi.fn(), + skippedStepMessage: vi.fn(), + startRecordedStep: vi.fn(async () => undefined), + recordStepComplete: vi.fn(async () => session), + updateSession: vi.fn((mutator: (value: Session) => Session | void) => { + session = mutator(session) ?? session; + return session; + }), + ...overrides, + }, + getSession: () => session, + }; +} + +function baseOptions( + deps: PreflightStateOptions["deps"], + session: Session | null = createSession(), +): PreflightStateOptions { + return { + resume: false, + session, + recordedSandboxName: null, + requestedSandboxName: "my-assistant", + explicitSandboxGpuFlag: null, + sandboxGpuDevice: null, + gpuRequested: false, + noGpu: false, + env: {}, + deps, + }; +} + +describe("handlePreflightState", () => { + it("runs full preflight through recorded step boundaries", async () => { + const harness = createDeps({ + startRecordedStep: vi.fn(async () => undefined), + runPreflight: vi.fn(async () => ({ type: "nvidia" }) as Gpu), + recordStepComplete: vi.fn(async () => createSession()), + }); + + const result = await handlePreflightState({ + ...baseOptions(harness.deps), + explicitSandboxGpuFlag: "enable", + sandboxGpuDevice: "GPU-0", + }); + + expect(harness.deps.startRecordedStep).toHaveBeenCalledWith("preflight"); + expect(harness.deps.runPreflight).toHaveBeenCalledWith({ optedOutGpuPassthrough: false }); + expect(harness.deps.recordStepComplete).toHaveBeenCalledWith("preflight"); + expect(result.sandboxGpuConfig).toMatchObject({ + sandboxGpuEnabled: true, + mode: "1", + sandboxGpuDevice: "GPU-0", + }); + expect(result.gpuPassthrough).toBe(true); + }); + + it("skips full preflight on resume but re-detects GPU and revalidates CDI/sandbox GPU", async () => { + const session = createSession(); + session.steps.preflight.status = "complete"; + session.gpuPassthrough = false; + const harness = createDeps({ + detectGpu: vi.fn(() => ({ type: "nvidia" }) as Gpu), + assertCdiNvidiaGpuSpecPresent: vi.fn(), + validateSandboxGpuPreflight: vi.fn(), + skippedStepMessage: vi.fn(), + startRecordedStep: vi.fn(async () => undefined), + runPreflight: vi.fn(async () => ({ type: "should-not-run" }) as Gpu), + }); + + const result = await handlePreflightState({ + ...baseOptions(harness.deps, session), + resume: true, + gpuRequested: false, + }); + + expect(harness.deps.skippedStepMessage).toHaveBeenCalledWith("preflight", "cached"); + expect(harness.deps.detectGpu).toHaveBeenCalledOnce(); + expect(harness.deps.runPreflight).not.toHaveBeenCalled(); + expect(harness.deps.startRecordedStep).not.toHaveBeenCalled(); + expect(harness.deps.assertCdiNvidiaGpuSpecPresent).toHaveBeenCalledWith( + { cdiNvidiaGpuSpecMissing: false }, + true, + ); + expect(harness.deps.validateSandboxGpuPreflight).toHaveBeenCalledOnce(); + expect(result.resumePreflight).toBe(true); + }); + + it("restores saved sandbox GPU intent only when resume has no explicit override", async () => { + const session = createSession(); + session.steps.preflight.status = "complete"; + session.gpuPassthrough = true; + const getResumeSandboxGpuOverrides = vi.fn(() => ({ flag: "enable" as const, device: "1" })); + const getSandbox = vi.fn(() => ({ sandboxGpuEnabled: true })); + const harness = createDeps({ getResumeSandboxGpuOverrides, getSandbox }); + + const result = await handlePreflightState({ + ...baseOptions(harness.deps, session), + resume: true, + recordedSandboxName: "saved", + }); + + expect(getSandbox).toHaveBeenCalledWith("saved"); + expect(getResumeSandboxGpuOverrides).toHaveBeenCalledWith( + { sandboxGpuEnabled: true }, + true, + ); + expect(result.resumeHasResolvedGpuIntent).toBe(true); + expect(result.effectiveSandboxGpuFlag).toBe("enable"); + expect(result.effectiveSandboxGpuDevice).toBe("1"); + + await handlePreflightState({ + ...baseOptions(harness.deps, session), + resume: true, + explicitSandboxGpuFlag: "disable", + }); + expect(getResumeSandboxGpuOverrides).toHaveBeenCalledTimes(1); + }); + + it("persists effective GPU passthrough intent for later resume", async () => { + const session = createSession(); + session.gpuPassthrough = false; + const harness = createDeps(); + + const result = await handlePreflightState({ + ...baseOptions(harness.deps, session), + explicitSandboxGpuFlag: "enable", + }); + + expect(result.session?.gpuPassthrough).toBe(true); + expect(harness.deps.updateSession).toHaveBeenCalledOnce(); + }); +}); diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts new file mode 100644 index 0000000000..cc5bd6633d --- /dev/null +++ b/src/lib/onboard/machine/handlers/preflight.ts @@ -0,0 +1,147 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session } from "../../../state/onboard-session"; + +export type PreflightSandboxGpuFlag = "enable" | "disable" | null; + +export interface PreflightSandboxGpuOverrides { + flag: PreflightSandboxGpuFlag; + device: string | null; +} + +export interface PreflightSandboxGpuConfig { + sandboxGpuEnabled: boolean; + mode: string; + sandboxGpuDevice?: string | null; + errors?: readonly string[]; +} + +export interface PreflightStateOptions< + Gpu, + SandboxEntry, + Host, + Config extends PreflightSandboxGpuConfig, +> { + resume: boolean; + session: Session | null; + recordedSandboxName: string | null; + requestedSandboxName: string | null; + explicitSandboxGpuFlag: PreflightSandboxGpuFlag; + sandboxGpuDevice?: string | null; + gpuRequested: boolean; + noGpu: boolean; + env: NodeJS.ProcessEnv; + deps: { + getSandbox(name: string): SandboxEntry | null; + getResumeSandboxGpuOverrides( + sandbox: SandboxEntry | null, + sessionGpuPassthrough: boolean | null | undefined, + ): PreflightSandboxGpuOverrides; + detectGpu(): Gpu; + runPreflight(options: { optedOutGpuPassthrough?: boolean }): Promise; + assessHost(): Host; + assertCdiNvidiaGpuSpecPresent(host: Host, optedOutGpuPassthrough: boolean): void; + resolveSandboxGpuConfig( + gpu: Gpu, + options: { flag: PreflightSandboxGpuFlag; device: string | null | undefined }, + ): Config; + validateSandboxGpuPreflight(config: Config): void; + skippedStepMessage(stepName: string, detail?: string | null): void; + startRecordedStep(stepName: string): Promise; + recordStepComplete(stepName: string): Promise; + updateSession(mutator: (session: Session) => Session | void): Session; + }; +} + +export interface PreflightStateResult { + gpu: Gpu; + sandboxGpuConfig: Config; + resumePreflight: boolean; + resumeHasResolvedGpuIntent: boolean; + requestedGpuPassthrough: boolean; + gpuPassthrough: boolean; + effectiveSandboxGpuFlag: PreflightSandboxGpuFlag; + effectiveSandboxGpuDevice: string | null | undefined; + session: Session | null; +} + +function envHasSandboxGpuOverride(env: NodeJS.ProcessEnv): boolean { + return env.NEMOCLAW_SANDBOX_GPU !== undefined || env.NEMOCLAW_SANDBOX_GPU_DEVICE !== undefined; +} + +export async function handlePreflightState< + Gpu, + SandboxEntry, + Host, + Config extends PreflightSandboxGpuConfig, +>({ + resume, + session, + recordedSandboxName, + requestedSandboxName, + explicitSandboxGpuFlag, + sandboxGpuDevice, + gpuRequested, + noGpu, + env, + deps, +}: PreflightStateOptions): Promise> { + const resumeSandboxNameForGpu = recordedSandboxName || requestedSandboxName || null; + const resumePreflight = resume && session?.steps?.preflight?.status === "complete"; + const resumeHasResolvedGpuIntent = + resumePreflight && + explicitSandboxGpuFlag === null && + sandboxGpuDevice == null && + !envHasSandboxGpuOverride(env); + const resumedSandboxGpuOverrides = resumeHasResolvedGpuIntent + ? deps.getResumeSandboxGpuOverrides( + resumeSandboxNameForGpu ? deps.getSandbox(resumeSandboxNameForGpu) : null, + session?.gpuPassthrough, + ) + : { flag: null, device: null }; + const effectiveSandboxGpuFlag = explicitSandboxGpuFlag ?? resumedSandboxGpuOverrides.flag; + const effectiveSandboxGpuDevice = sandboxGpuDevice ?? resumedSandboxGpuOverrides.device; + + let gpu: Gpu; + if (resumePreflight) { + deps.skippedStepMessage("preflight", "cached"); + gpu = deps.detectGpu(); + const resumeOptedOutGpuPassthrough = noGpu || (!gpuRequested && session?.gpuPassthrough === false); + deps.assertCdiNvidiaGpuSpecPresent(deps.assessHost(), resumeOptedOutGpuPassthrough); + deps.validateSandboxGpuPreflight( + deps.resolveSandboxGpuConfig(gpu, { + flag: effectiveSandboxGpuFlag, + device: effectiveSandboxGpuDevice, + }), + ); + } else { + await deps.startRecordedStep("preflight"); + gpu = await deps.runPreflight({ optedOutGpuPassthrough: noGpu }); + session = await deps.recordStepComplete("preflight"); + } + + const sandboxGpuConfig = deps.resolveSandboxGpuConfig(gpu, { + flag: effectiveSandboxGpuFlag, + device: effectiveSandboxGpuDevice, + }); + const gpuPassthrough = sandboxGpuConfig.sandboxGpuEnabled; + if (session && session.gpuPassthrough !== gpuPassthrough) { + session = deps.updateSession((current) => { + current.gpuPassthrough = gpuPassthrough; + return current; + }); + } + + return { + gpu, + sandboxGpuConfig, + resumePreflight, + resumeHasResolvedGpuIntent, + requestedGpuPassthrough: gpuRequested, + gpuPassthrough, + effectiveSandboxGpuFlag, + effectiveSandboxGpuDevice, + session, + }; +} From f17000a73716c16fe69007aeb1c8d218e646cb1d Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 23:01:05 -0700 Subject: [PATCH 08/54] refactor(cli): extract onboard gateway handler --- src/lib/onboard.ts | 147 +++---------- .../onboard/machine/handlers/gateway.test.ts | 203 ++++++++++++++++++ src/lib/onboard/machine/handlers/gateway.ts | 178 +++++++++++++++ 3 files changed, 413 insertions(+), 115 deletions(-) create mode 100644 src/lib/onboard/machine/handlers/gateway.test.ts create mode 100644 src/lib/onboard/machine/handlers/gateway.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 50c5187326..9d9b047748 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -280,6 +280,7 @@ const { resolveSandboxImageTagFromCreateOutput } = const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); +const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); const policies: typeof import("./policy") = require("./policy"); const tiers: typeof import("./policy/tiers") = require("./policy/tiers"); @@ -9464,125 +9465,41 @@ async function onboard(opts: OnboardOptions = {}): Promise { }); const gatewaySnapshot = selectNamedGatewayForReuseIfNeeded(getGatewayReuseSnapshot()); - let gatewayReuseState = gatewaySnapshot.gatewayReuseState; - gatewayReuseState = await refreshDockerDriverGatewayReuseState(gatewayReuseState); - - // Verify the legacy gateway container is actually running — openshell CLI - // metadata can be stale after a manual `docker rm`. See #2020. Newer - // package-managed OpenShell gateways do not have an openshell-cluster-* - // Docker container, so the live CLI health check is the source of truth. - if (gatewayReuseState === "healthy" && gatewayCliSupportsLifecycleCommands(runCaptureOpenshell)) { - const containerState = verifyGatewayContainerRunning(GATEWAY_NAME); - if (containerState === "missing") { - console.log(" Gateway metadata is stale (container not running). Cleaning up..."); - runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true }); - gatewayReuseState = destroyGatewayForReuse( - destroyGateway, - " ✓ Stale gateway metadata cleaned up", - " ! Stale gateway metadata cleanup failed; leaving registry state intact.", - ); - } else if (containerState === "unknown") { - // Docker probe failed but cached metadata says healthy. Try the host-level - // HTTP probe — it doesn't depend on Docker, so it can confirm the gateway - // is genuinely serving even when the daemon is flaky. - if (await waitForGatewayHttpReady()) { - console.log( - " Warning: could not verify gateway container state (Docker may be unavailable), but the gateway is responding on HTTP. Proceeding with reuse.", - ); - } else { - // Docker can't be probed AND the gateway HTTP endpoint isn't - // responding. We cannot tell whether the existing gateway is live - // (transient `docker inspect` flake + warm-up miss) or genuinely - // gone. Per #2020 we must not destroy in this branch, and we must - // not downgrade to "missing" either: that would push execution into - // `startGatewayWithOptions`, whose retry hook calls - // `destroyGateway()` between attempts — which would tear down a - // possibly-live gateway. Bail with an actionable error instead. - console.log( - ` Error: could not verify gateway container state and ${getGatewayLocalEndpoint()}/ is not responding.`, - ); - console.log( - " Refusing to proceed without a clear Docker signal — restarting Docker and re-running onboard is the safe path. See #3258 / #2020.", - ); - process.exit(1); - } - } else if (!(await waitForGatewayHttpReady())) { - // Container is running but the gateway HTTP endpoint is not responding. - // Common immediately after a Docker daemon restart — the container comes - // back before the OpenShell gateway upstream finishes warming up. Safe to - // recreate because Docker is functional. See #3258. - console.log( - ` Gateway container is running but ${getGatewayLocalEndpoint()}/ is not responding. Recreating...`, - ); - runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true }); - gatewayReuseState = destroyGatewayForReuse( - destroyGateway, - " ✓ Stale gateway cleaned up", - " ! Stale gateway cleanup failed; leaving registry state intact.", - ); - } else { - const imageDrift = getGatewayClusterImageDrift(); - if (imageDrift) { - console.log( - ` Gateway image ${imageDrift.currentVersion} does not match openshell ${imageDrift.expectedVersion}. Recreating...`, - ); - stopAllDashboardForwards(); - gatewayReuseState = destroyGatewayForReuse( - destroyGateway, - " ✓ Previous gateway cleaned up", - " ! Previous gateway cleanup failed; leaving registry state intact.", - ); - } - } - } - - gatewayReuseState = reconcileGatewayGpuReuseForGpuIntent({ - gatewayReuseState, + const gatewayResult = await handleGatewayState({ + resume, + session, + initialGatewayReuseState: gatewaySnapshot.gatewayReuseState, + gpu, gpuPassthrough, gatewayName: GATEWAY_NAME, - currentSandboxName: recordedSandboxName || requestedSandboxName, + dashboardPort: DASHBOARD_PORT, + recordedSandboxName, + requestedSandboxName, recreateSandbox: isRecreateSandbox(), - confirmedDockerDriverGateway: - isLinuxDockerDriverGatewayEnabled() && - gatewayReuseState === "healthy" && - !gatewayCliSupportsLifecycleCommands(runCaptureOpenshell), - stopDashboardForwards: stopAllDashboardForwards, - retireLegacyGatewayForDockerDriverUpgrade, - destroyGatewayRuntimeForGpuReuse: () => destroyGateway(() => undefined, () => false), + deps: { + refreshDockerDriverGatewayReuseState, + gatewayCliSupportsLifecycleCommands: () => gatewayCliSupportsLifecycleCommands(runCaptureOpenshell), + verifyGatewayContainerRunning, + waitForGatewayHttpReady, + getGatewayLocalEndpoint, + runOpenshell, + destroyGateway, + destroyGatewayForReuse, + getGatewayClusterImageDrift, + stopAllDashboardForwards, + reconcileGatewayGpuReuseForGpuIntent, + isLinuxDockerDriverGatewayEnabled, + retireLegacyGatewayForDockerDriverUpgrade, + destroyGatewayRuntimeForGpuReuse: () => destroyGateway(() => undefined, () => false), + skippedStepMessage, + note, + startRecordedStep, + startGateway, + recordStepComplete, + exitProcess: (code) => process.exit(code), + }, }); - - const canReuseHealthyGateway = gatewayReuseState === "healthy"; - - const resumeGateway = - resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway; - if (resumeGateway) { - skippedStepMessage("gateway", "running"); - await recordStepComplete("gateway"); - } else if (!resume && canReuseHealthyGateway) { - skippedStepMessage("gateway", "running", "reuse"); - note(" Reusing healthy NemoClaw gateway."); - await recordStepComplete("gateway"); - } else { - if (resume && session?.steps?.gateway?.status === "complete") { - if (gatewayReuseState === "active-unnamed") { - note(" [resume] Gateway is active but named metadata is missing; recreating it safely."); - } else if (gatewayReuseState === "foreign-active") { - note(" [resume] A different OpenShell gateway is active; NemoClaw will not reuse it."); - } else if (gatewayReuseState === "stale") { - note(" [resume] Recorded gateway is unhealthy; recreating it."); - } else { - note(" [resume] Recorded gateway state is unavailable; recreating it."); - } - } - if (isLinuxDockerDriverGatewayEnabled() && gatewayReuseState !== "missing") { - note(" Replacing legacy OpenShell gateway metadata with Docker-driver gateway."); - retireLegacyGatewayForDockerDriverUpgrade(); - gatewayReuseState = "missing"; - } - await startRecordedStep("gateway"); - await startGateway(gpu, { gpuPassthrough }); - await recordStepComplete("gateway"); - } + session = gatewayResult.session; // #2753: prefer requestedSandboxName over an unconfirmed session name. // A pre-fix session may carry sandboxName even though sandbox creation diff --git a/src/lib/onboard/machine/handlers/gateway.test.ts b/src/lib/onboard/machine/handlers/gateway.test.ts new file mode 100644 index 0000000000..266ba10360 --- /dev/null +++ b/src/lib/onboard/machine/handlers/gateway.test.ts @@ -0,0 +1,203 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type Session } from "../../../state/onboard-session"; +import type { GatewayReuseState } from "../../../state/gateway"; +import { handleGatewayState, type GatewayStateOptions } from "./gateway"; + +type Gpu = { type: string } | null; + +function createDeps(overrides: Partial["deps"]> = {}) { + const calls = { + refresh: vi.fn(async (state: GatewayReuseState) => state), + lifecycle: vi.fn(() => false), + verifyContainer: vi.fn(() => "running"), + waitHttp: vi.fn(async () => true), + runOpenshell: vi.fn(), + destroy: vi.fn(() => true), + destroyForReuse: vi.fn(() => "missing" as GatewayReuseState), + imageDrift: vi.fn(() => null), + stopForwards: vi.fn(), + reconcileGpu: vi.fn((opts: { gatewayReuseState: GatewayReuseState }) => opts.gatewayReuseState), + dockerDriver: vi.fn(() => false), + retireLegacy: vi.fn(), + destroyGpuRuntime: vi.fn(() => true), + skipped: vi.fn(), + note: vi.fn(), + startStep: vi.fn(async () => undefined), + startGateway: vi.fn(async () => undefined), + complete: vi.fn(async () => createSession()), + exit: vi.fn((code: number): never => { + throw new Error(`exit ${code}`); + }), + }; + return { + calls, + deps: { + refreshDockerDriverGatewayReuseState: calls.refresh, + gatewayCliSupportsLifecycleCommands: calls.lifecycle, + verifyGatewayContainerRunning: calls.verifyContainer, + waitForGatewayHttpReady: calls.waitHttp, + getGatewayLocalEndpoint: () => "http://127.0.0.1:31818", + runOpenshell: calls.runOpenshell, + destroyGateway: calls.destroy, + destroyGatewayForReuse: calls.destroyForReuse, + getGatewayClusterImageDrift: calls.imageDrift, + stopAllDashboardForwards: calls.stopForwards, + reconcileGatewayGpuReuseForGpuIntent: calls.reconcileGpu, + isLinuxDockerDriverGatewayEnabled: calls.dockerDriver, + retireLegacyGatewayForDockerDriverUpgrade: calls.retireLegacy, + destroyGatewayRuntimeForGpuReuse: calls.destroyGpuRuntime, + skippedStepMessage: calls.skipped, + note: calls.note, + startRecordedStep: calls.startStep, + startGateway: calls.startGateway, + recordStepComplete: calls.complete, + exitProcess: calls.exit, + ...overrides, + }, + }; +} + +function baseOptions( + deps: GatewayStateOptions["deps"], + initialGatewayReuseState: GatewayReuseState = "missing", + session: Session | null = createSession(), +): GatewayStateOptions { + return { + resume: false, + session, + initialGatewayReuseState, + gpu: { type: "nvidia" }, + gpuPassthrough: true, + gatewayName: "nemoclaw", + dashboardPort: 18789, + recordedSandboxName: null, + requestedSandboxName: "my-assistant", + recreateSandbox: false, + deps, + }; +} + +describe("handleGatewayState", () => { + it("starts the gateway when no reusable gateway exists", async () => { + const { deps, calls } = createDeps(); + + const result = await handleGatewayState(baseOptions(deps, "missing")); + + expect(calls.startStep).toHaveBeenCalledWith("gateway"); + expect(calls.startGateway).toHaveBeenCalledWith({ type: "nvidia" }, { gpuPassthrough: true }); + expect(calls.complete).toHaveBeenCalledWith("gateway"); + expect(result.gatewayReuseState).toBe("missing"); + }); + + it("reuses healthy gateways on fresh runs", async () => { + const { deps, calls } = createDeps(); + + await handleGatewayState(baseOptions(deps, "healthy")); + + expect(calls.skipped).toHaveBeenCalledWith("gateway", "running", "reuse"); + expect(calls.note).toHaveBeenCalledWith(" Reusing healthy NemoClaw gateway."); + expect(calls.startGateway).not.toHaveBeenCalled(); + expect(calls.complete).toHaveBeenCalledWith("gateway"); + }); + + it("reuses healthy gateways on resume only when the gateway step was complete", async () => { + const session = createSession(); + session.steps.gateway.status = "complete"; + const { deps, calls } = createDeps(); + + await handleGatewayState({ ...baseOptions(deps, "healthy", session), resume: true }); + + expect(calls.skipped).toHaveBeenCalledWith("gateway", "running"); + expect(calls.startGateway).not.toHaveBeenCalled(); + }); + + it("cleans stale lifecycle metadata when the gateway container is missing", async () => { + const { deps, calls } = createDeps({ + gatewayCliSupportsLifecycleCommands: vi.fn(() => true), + verifyGatewayContainerRunning: vi.fn(() => "missing" as GatewayReuseState), + destroyGatewayForReuse: vi.fn(() => "missing" as GatewayReuseState), + }); + + await handleGatewayState(baseOptions(deps, "healthy")); + + expect(calls.runOpenshell).toHaveBeenCalledWith(["forward", "stop", "18789"], { + ignoreError: true, + }); + expect(deps.destroyGatewayForReuse).toHaveBeenCalledWith( + deps.destroyGateway, + " ✓ Stale gateway metadata cleaned up", + " ! Stale gateway metadata cleanup failed; leaving registry state intact.", + ); + expect(calls.startGateway).toHaveBeenCalled(); + }); + + it("refuses to destroy an unknown container state when HTTP is also unavailable", async () => { + const { deps, calls } = createDeps({ + gatewayCliSupportsLifecycleCommands: vi.fn(() => true), + verifyGatewayContainerRunning: vi.fn(() => "unknown"), + waitForGatewayHttpReady: vi.fn(async () => false), + }); + + await expect(handleGatewayState(baseOptions(deps, "healthy"))).rejects.toThrow("exit 1"); + + expect(calls.exit).toHaveBeenCalledWith(1); + expect(calls.destroyForReuse).not.toHaveBeenCalled(); + }); + + it("recreates a running lifecycle gateway when the HTTP endpoint is unhealthy", async () => { + const { deps, calls } = createDeps({ + gatewayCliSupportsLifecycleCommands: vi.fn(() => true), + waitForGatewayHttpReady: vi.fn(async () => false), + destroyGatewayForReuse: vi.fn(() => "missing" as GatewayReuseState), + }); + + await handleGatewayState(baseOptions(deps, "healthy")); + + expect(calls.runOpenshell).toHaveBeenCalledWith(["forward", "stop", "18789"], { + ignoreError: true, + }); + expect(deps.destroyGatewayForReuse).toHaveBeenCalledWith( + deps.destroyGateway, + " ✓ Stale gateway cleaned up", + " ! Stale gateway cleanup failed; leaving registry state intact.", + ); + }); + + it("recreates on gateway image drift after stopping dashboard forwards", async () => { + const { deps, calls } = createDeps({ + gatewayCliSupportsLifecycleCommands: vi.fn(() => true), + waitForGatewayHttpReady: vi.fn(async () => true), + getGatewayClusterImageDrift: vi.fn(() => ({ currentVersion: "0.0.38", expectedVersion: "0.0.39" })), + destroyGatewayForReuse: vi.fn(() => "missing" as GatewayReuseState), + }); + + await handleGatewayState(baseOptions(deps, "healthy")); + + expect(calls.stopForwards).toHaveBeenCalledOnce(); + expect(deps.destroyGatewayForReuse).toHaveBeenCalledWith( + deps.destroyGateway, + " ✓ Previous gateway cleaned up", + " ! Previous gateway cleanup failed; leaving registry state intact.", + ); + }); + + it("replaces legacy metadata before starting the Docker-driver gateway", async () => { + const { deps, calls } = createDeps({ + isLinuxDockerDriverGatewayEnabled: vi.fn(() => true), + reconcileGatewayGpuReuseForGpuIntent: vi.fn(() => "stale" as GatewayReuseState), + }); + + const result = await handleGatewayState(baseOptions(deps, "healthy")); + + expect(calls.note).toHaveBeenCalledWith( + " Replacing legacy OpenShell gateway metadata with Docker-driver gateway.", + ); + expect(calls.retireLegacy).toHaveBeenCalledOnce(); + expect(calls.startGateway).toHaveBeenCalledOnce(); + expect(result.gatewayReuseState).toBe("missing"); + }); +}); diff --git a/src/lib/onboard/machine/handlers/gateway.ts b/src/lib/onboard/machine/handlers/gateway.ts new file mode 100644 index 0000000000..026c26e1b4 --- /dev/null +++ b/src/lib/onboard/machine/handlers/gateway.ts @@ -0,0 +1,178 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session } from "../../../state/onboard-session"; +import type { GatewayReuseState } from "../../../state/gateway"; + +export type GatewayContainerState = "missing" | "unknown" | string; + +export interface GatewayStateOptions { + resume: boolean; + session: Session | null; + initialGatewayReuseState: GatewayReuseState; + gpu: Gpu; + gpuPassthrough: boolean; + gatewayName: string; + dashboardPort: number; + recordedSandboxName: string | null; + requestedSandboxName: string | null; + recreateSandbox: boolean; + deps: { + refreshDockerDriverGatewayReuseState(state: GatewayReuseState): Promise; + gatewayCliSupportsLifecycleCommands(): boolean; + verifyGatewayContainerRunning(gatewayName: string): GatewayContainerState; + waitForGatewayHttpReady(): Promise; + getGatewayLocalEndpoint(): string; + runOpenshell(args: string[], opts?: { ignoreError?: boolean }): unknown; + destroyGateway(): boolean; + destroyGatewayForReuse( + destroyGateway: () => boolean, + successMessage: string, + failureMessage: string, + ): GatewayReuseState; + getGatewayClusterImageDrift(): { currentVersion: string; expectedVersion: string } | null; + stopAllDashboardForwards(): void; + reconcileGatewayGpuReuseForGpuIntent(options: { + gatewayReuseState: GatewayReuseState; + gpuPassthrough: boolean; + gatewayName: string; + currentSandboxName: string | null; + recreateSandbox: boolean; + confirmedDockerDriverGateway: boolean; + stopDashboardForwards: () => void; + retireLegacyGatewayForDockerDriverUpgrade: () => void; + destroyGatewayRuntimeForGpuReuse: () => boolean; + }): GatewayReuseState; + isLinuxDockerDriverGatewayEnabled(): boolean; + retireLegacyGatewayForDockerDriverUpgrade(): void; + destroyGatewayRuntimeForGpuReuse(): boolean; + skippedStepMessage( + stepName: string, + detail?: string | null, + reason?: "resume" | "reuse", + ): void; + note(message: string): void; + startRecordedStep(stepName: string): Promise; + startGateway(gpu: Gpu, options: { gpuPassthrough: boolean }): Promise; + recordStepComplete(stepName: string): Promise; + exitProcess(code: number): never; + }; +} + +export interface GatewayStateResult { + gatewayReuseState: GatewayReuseState; + session: Session | null; +} + +export async function handleGatewayState({ + resume, + session, + initialGatewayReuseState, + gpu, + gpuPassthrough, + gatewayName, + dashboardPort, + recordedSandboxName, + requestedSandboxName, + recreateSandbox, + deps, +}: GatewayStateOptions): Promise { + let gatewayReuseState = await deps.refreshDockerDriverGatewayReuseState(initialGatewayReuseState); + const supportsLifecycleCommands = deps.gatewayCliSupportsLifecycleCommands(); + + if (gatewayReuseState === "healthy" && supportsLifecycleCommands) { + const containerState = deps.verifyGatewayContainerRunning(gatewayName); + if (containerState === "missing") { + console.log(" Gateway metadata is stale (container not running). Cleaning up..."); + deps.runOpenshell(["forward", "stop", String(dashboardPort)], { ignoreError: true }); + gatewayReuseState = deps.destroyGatewayForReuse( + deps.destroyGateway, + " ✓ Stale gateway metadata cleaned up", + " ! Stale gateway metadata cleanup failed; leaving registry state intact.", + ); + } else if (containerState === "unknown") { + if (await deps.waitForGatewayHttpReady()) { + console.log( + " Warning: could not verify gateway container state (Docker may be unavailable), but the gateway is responding on HTTP. Proceeding with reuse.", + ); + } else { + console.log( + ` Error: could not verify gateway container state and ${deps.getGatewayLocalEndpoint()}/ is not responding.`, + ); + console.log( + " Refusing to proceed without a clear Docker signal — restarting Docker and re-running onboard is the safe path. See #3258 / #2020.", + ); + deps.exitProcess(1); + } + } else if (!(await deps.waitForGatewayHttpReady())) { + console.log( + ` Gateway container is running but ${deps.getGatewayLocalEndpoint()}/ is not responding. Recreating...`, + ); + deps.runOpenshell(["forward", "stop", String(dashboardPort)], { ignoreError: true }); + gatewayReuseState = deps.destroyGatewayForReuse( + deps.destroyGateway, + " ✓ Stale gateway cleaned up", + " ! Stale gateway cleanup failed; leaving registry state intact.", + ); + } else { + const imageDrift = deps.getGatewayClusterImageDrift(); + if (imageDrift) { + console.log( + ` Gateway image ${imageDrift.currentVersion} does not match openshell ${imageDrift.expectedVersion}. Recreating...`, + ); + deps.stopAllDashboardForwards(); + gatewayReuseState = deps.destroyGatewayForReuse( + deps.destroyGateway, + " ✓ Previous gateway cleaned up", + " ! Previous gateway cleanup failed; leaving registry state intact.", + ); + } + } + } + + gatewayReuseState = deps.reconcileGatewayGpuReuseForGpuIntent({ + gatewayReuseState, + gpuPassthrough, + gatewayName, + currentSandboxName: recordedSandboxName || requestedSandboxName, + recreateSandbox, + confirmedDockerDriverGateway: + deps.isLinuxDockerDriverGatewayEnabled() && gatewayReuseState === "healthy" && !supportsLifecycleCommands, + stopDashboardForwards: deps.stopAllDashboardForwards, + retireLegacyGatewayForDockerDriverUpgrade: deps.retireLegacyGatewayForDockerDriverUpgrade, + destroyGatewayRuntimeForGpuReuse: deps.destroyGatewayRuntimeForGpuReuse, + }); + + const canReuseHealthyGateway = gatewayReuseState === "healthy"; + const resumeGateway = resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway; + if (resumeGateway) { + deps.skippedStepMessage("gateway", "running"); + session = await deps.recordStepComplete("gateway"); + } else if (!resume && canReuseHealthyGateway) { + deps.skippedStepMessage("gateway", "running", "reuse"); + deps.note(" Reusing healthy NemoClaw gateway."); + session = await deps.recordStepComplete("gateway"); + } else { + if (resume && session?.steps?.gateway?.status === "complete") { + if (gatewayReuseState === "active-unnamed") { + deps.note(" [resume] Gateway is active but named metadata is missing; recreating it safely."); + } else if (gatewayReuseState === "foreign-active") { + deps.note(" [resume] A different OpenShell gateway is active; NemoClaw will not reuse it."); + } else if (gatewayReuseState === "stale") { + deps.note(" [resume] Recorded gateway is unhealthy; recreating it."); + } else { + deps.note(" [resume] Recorded gateway state is unavailable; recreating it."); + } + } + if (deps.isLinuxDockerDriverGatewayEnabled() && gatewayReuseState !== "missing") { + deps.note(" Replacing legacy OpenShell gateway metadata with Docker-driver gateway."); + deps.retireLegacyGatewayForDockerDriverUpgrade(); + gatewayReuseState = "missing"; + } + await deps.startRecordedStep("gateway"); + await deps.startGateway(gpu, { gpuPassthrough }); + session = await deps.recordStepComplete("gateway"); + } + + return { gatewayReuseState, session }; +} From 3038da47214adc54345ecd37b40fe944e943d1e7 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 23:20:19 -0700 Subject: [PATCH 09/54] refactor(cli): extract provider inference handlers --- src/lib/onboard.ts | 247 +++++---------- .../handlers/provider-inference.test.ts | 216 +++++++++++++ .../machine/handlers/provider-inference.ts | 289 ++++++++++++++++++ 3 files changed, 577 insertions(+), 175 deletions(-) create mode 100644 src/lib/onboard/machine/handlers/provider-inference.test.ts create mode 100644 src/lib/onboard/machine/handlers/provider-inference.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 9d9b047748..f7d95ae8ab 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -282,6 +282,7 @@ const onboardSession: typeof import("./state/onboard-session") = require("./stat const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); +const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference"); const policies: typeof import("./policy") = require("./policy"); const tiers: typeof import("./policy/tiers") = require("./policy/tiers"); const { ensureUsageNoticeConsent } = require("./onboard/usage-notice"); @@ -9514,181 +9515,77 @@ async function onboard(opts: OnboardOptions = {}): Promise { console.error(" Start a fresh onboard with --name to choose a different name."); process.exit(1); } - let model = session?.model || null; - let provider = session?.provider || null; - let endpointUrl = session?.endpointUrl || null; - let credentialEnv = session?.credentialEnv || null; - let hermesAuthMethod: HermesAuthMethod | null = - normalizeHermesAuthMethod(session?.hermesAuthMethod) || - (provider === hermesProviderAuth.HERMES_PROVIDER_NAME && - session?.credentialEnv === HERMES_NOUS_API_KEY_CREDENTIAL_ENV - ? HERMES_AUTH_METHOD_API_KEY - : null); - let hermesToolGateways = normalizeHermesToolGatewaySelections(session?.hermesToolGateways); - let preferredInferenceApi = session?.preferredInferenceApi || null; - let nimContainer = session?.nimContainer || null; - let webSearchConfig = session?.webSearchConfig || null; - let forceProviderSelection = false; - while (true) { - const resumeProviderSelection = - !forceProviderSelection && - resume && - session?.steps?.provider_selection?.status === "complete" && - typeof provider === "string" && - typeof model === "string"; - if (resumeProviderSelection) { - skippedStepMessage("provider_selection", `${provider} / ${model}`); - hydrateCredentialEnv(credentialEnv); - // #3342: resume short-circuits provider selection — repair the - // ollama-local systemd loopback override here so legacy 0.0.0.0 - // drop-ins from older NemoClaw versions get rewritten every resume. - repairLocalInferenceSystemdOverrideOrExit(provider, isNonInteractive); - } else { - // #2753: do not persist sandboxName to onboard-session.json before - // the sandbox actually exists in the gateway (Step 6 markStepComplete - // below). A SIGINT between any earlier step and createSandbox would - // otherwise leave a phantom that `nemoclaw list` resurrects until - // manually destroyed. - await startRecordedStep("provider_selection"); - const selection = await setupNim(gpu, sandboxName, agent); - model = selection.model; - provider = selection.provider; - endpointUrl = selection.endpointUrl; - credentialEnv = selection.credentialEnv; - hermesAuthMethod = selection.hermesAuthMethod; - hermesToolGateways = selection.hermesToolGateways; - preferredInferenceApi = selection.preferredInferenceApi; - nimContainer = selection.nimContainer; - await recordStepComplete( - "provider_selection", - toSessionUpdates({ - provider, - model, - endpointUrl, - credentialEnv, - hermesAuthMethod, - hermesToolGateways, - preferredInferenceApi, - nimContainer, - }), - ); - } - - if (typeof provider !== "string" || typeof model !== "string") { - console.error(" Inference selection did not yield a provider/model."); - process.exit(1); - } - process.env.NEMOCLAW_OPENSHELL_BIN = getOpenshellBinary(); - const needsBedrockRuntimeAdapter = - provider === "compatible-anthropic-endpoint" && - bedrockRuntimeOnboard.needsBedrockRuntimeAdapter(endpointUrl); - const resumeInference = - !needsBedrockRuntimeAdapter && - !forceProviderSelection && - resume && - isInferenceRouteReady(provider, model); - if (resumeInference) { - if (provider === hermesProviderAuth.HERMES_PROVIDER_NAME) { - if (!sandboxName) { - sandboxName = await promptValidatedSandboxName(agent); - } - await startRecordedStep("inference", { provider, model }); - const inferenceResult = await setupInference( - sandboxName, - model, - provider, - endpointUrl, - credentialEnv, - hermesAuthMethod, - hermesToolGateways, - ); - if (inferenceResult?.retry === "selection") { - forceProviderSelection = true; - continue; - } - await recordStepComplete( - "inference", - toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), - ); - break; - } - if (isRoutedInferenceProvider(provider)) { - try { - await reconcileModelRouter(); - } catch (err) { - console.error( - ` ✗ Failed to reconcile model router: ${err instanceof Error ? err.message : String(err)}`, - ); - process.exit(1); - } - } - skippedStepMessage("inference", `${provider} / ${model}`); - if (nimContainer && sandboxName) { - registry.updateSandbox(sandboxName, { nimContainer }); - } - await recordStepComplete( - "inference", - toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), - ); - break; - } - - if (!sandboxName) { - sandboxName = await promptValidatedSandboxName(agent); - } - const buildEstimateNote = - process.env.NEMOCLAW_IGNORE_RUNTIME_RESOURCES === "1" - ? null - : formatSandboxBuildEstimateNote(assessHost()); - console.log( - formatOnboardConfigSummary({ - provider, - model, - credentialEnv, - hermesAuthMethod, - webSearchConfig, - hermesToolGateways, - enabledChannels: selectedMessagingChannels.length > 0 ? selectedMessagingChannels : null, - sandboxName, - notes: buildEstimateNote ? [buildEstimateNote] : [], - }), - ); - console.log(" Web search and messaging channels will be prompted next."); - if (!isNonInteractive()) { - if (!(await promptYesNoOrDefault(" Apply this configuration?", null, true))) { - console.log(` Aborted. Re-run \`${cliName()} onboard\` to start over.`); - console.log(" Credentials entered so far were only staged in memory for this run."); - console.log( - " No new gateway credential was registered because onboarding stopped here.", - ); - process.exit(0); - } - } - - await startRecordedStep("inference", { provider, model }); - const inferenceResult = await setupInference( - sandboxName, - model, - provider, - endpointUrl, - credentialEnv, - hermesAuthMethod, - hermesToolGateways, - ); - delete process.env.NVIDIA_API_KEY; - if (inferenceResult?.retry === "selection") { - forceProviderSelection = true; - continue; - } - if (nimContainer && sandboxName) { - registry.updateSandbox(sandboxName, { nimContainer }); - } - await recordStepComplete( - "inference", - toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), - ); - break; - } + const providerInferenceResult = await handleProviderInferenceState({ + resume, + session, + gpu, + sandboxName, + agent, + initial: { + model: session?.model || null, + provider: session?.provider || null, + endpointUrl: session?.endpointUrl || null, + credentialEnv: session?.credentialEnv || null, + hermesAuthMethod: + normalizeHermesAuthMethod(session?.hermesAuthMethod) || + (session?.provider === hermesProviderAuth.HERMES_PROVIDER_NAME && + session?.credentialEnv === HERMES_NOUS_API_KEY_CREDENTIAL_ENV + ? HERMES_AUTH_METHOD_API_KEY + : null), + hermesToolGateways: normalizeHermesToolGatewaySelections(session?.hermesToolGateways), + preferredInferenceApi: session?.preferredInferenceApi || null, + nimContainer: session?.nimContainer || null, + webSearchConfig: session?.webSearchConfig || null, + }, + selectedMessagingChannels, + env: process.env, + constants: { hermesProviderName: hermesProviderAuth.HERMES_PROVIDER_NAME }, + deps: { + normalizeHermesAuthMethod, + setupNim, + setupInference, + startRecordedStep, + recordStepComplete, + toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), + skippedStepMessage, + hydrateCredentialEnv, + repairLocalInferenceSystemdOverrideOrExit, + isNonInteractive, + getOpenshellBinary, + needsBedrockRuntimeAdapter: (providerName, url) => + providerName === "compatible-anthropic-endpoint" && + bedrockRuntimeOnboard.needsBedrockRuntimeAdapter(url), + isInferenceRouteReady, + isRoutedInferenceProvider, + reconcileModelRouter, + registryUpdateSandbox: (name, updates) => registry.updateSandbox(name, updates), + promptValidatedSandboxName, + assessHost, + formatSandboxBuildEstimateNote, + formatOnboardConfigSummary, + promptYesNoOrDefault, + cliName, + log: (message) => console.log(message), + error: (message) => console.error(message), + exitProcess: (code) => process.exit(code), + deleteEnv: (name) => { + delete process.env[name]; + }, + }, + }); + session = providerInferenceResult.session; + sandboxName = providerInferenceResult.sandboxName; + const { + model, + provider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + preferredInferenceApi, + nimContainer, + } = providerInferenceResult; + let webSearchConfig = providerInferenceResult.webSearchConfig as WebSearchConfig | null; const webSearchSupportProbePath = fromDockerfile ? path.resolve(fromDockerfile) : null; const webSearchSupported = agentSupportsWebSearch(agent, webSearchSupportProbePath, ROOT); diff --git a/src/lib/onboard/machine/handlers/provider-inference.test.ts b/src/lib/onboard/machine/handlers/provider-inference.test.ts new file mode 100644 index 0000000000..bec7ea47a3 --- /dev/null +++ b/src/lib/onboard/machine/handlers/provider-inference.test.ts @@ -0,0 +1,216 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session"; +import { + handleProviderInferenceState, + type ProviderInferenceStateOptions, + type ProviderSelectionResult, +} from "./provider-inference"; + +type Gpu = { type: string } | null; +type Agent = { name: string } | null; +type Host = { cpus?: number }; + +const baseSelection: ProviderSelectionResult = { + model: "nvidia/test", + provider: "nvidia-prod", + endpointUrl: "https://integrate.api.nvidia.com/v1", + credentialEnv: "NVIDIA_API_KEY", + hermesAuthMethod: null, + hermesToolGateways: [], + preferredInferenceApi: "openai-responses", + nimContainer: null, +}; + +function createDeps(overrides: Partial["deps"]> = {}) { + const calls = { + setupNim: vi.fn(async () => ({ ...baseSelection })), + setupInference: vi.fn(async () => ({ ok: true as const })), + startStep: vi.fn(async () => undefined), + complete: vi.fn(async () => createSession()), + skipped: vi.fn(), + hydrate: vi.fn(), + repair: vi.fn(), + routeReady: vi.fn(() => false), + reconcileRouter: vi.fn(async () => undefined), + updateSandbox: vi.fn(), + promptName: vi.fn(async () => "my-assistant"), + promptYesNo: vi.fn(async () => true), + log: vi.fn(), + error: vi.fn(), + exit: vi.fn((code: number): never => { + throw new Error(`exit ${code}`); + }), + deleteEnv: vi.fn(), + }; + return { + calls, + deps: { + normalizeHermesAuthMethod: (value: string | null | undefined) => value ?? null, + setupNim: calls.setupNim, + setupInference: calls.setupInference, + startRecordedStep: calls.startStep, + recordStepComplete: calls.complete, + toSessionUpdates: (updates: Record) => updates as SessionUpdates, + skippedStepMessage: calls.skipped, + hydrateCredentialEnv: calls.hydrate, + repairLocalInferenceSystemdOverrideOrExit: calls.repair, + isNonInteractive: () => true, + getOpenshellBinary: () => "/usr/bin/openshell", + needsBedrockRuntimeAdapter: () => false, + isInferenceRouteReady: calls.routeReady, + isRoutedInferenceProvider: (provider: string) => provider === "nvidia-router", + reconcileModelRouter: calls.reconcileRouter, + registryUpdateSandbox: calls.updateSandbox, + promptValidatedSandboxName: calls.promptName, + assessHost: () => ({ cpus: 8 }), + formatSandboxBuildEstimateNote: () => "estimate", + formatOnboardConfigSummary: (options: { + provider: string; + model: string; + sandboxName: string; + }) => `summary:${options.provider}/${options.model}/${options.sandboxName}`, + promptYesNoOrDefault: calls.promptYesNo, + cliName: () => "nemoclaw", + log: calls.log, + error: calls.error, + exitProcess: calls.exit, + deleteEnv: calls.deleteEnv, + ...overrides, + }, + }; +} + +function baseOptions( + deps: ProviderInferenceStateOptions["deps"], + session: Session | null = createSession(), +): ProviderInferenceStateOptions { + return { + resume: false, + session, + gpu: { type: "nvidia" }, + sandboxName: null, + agent: null, + initial: { + model: session?.model ?? null, + provider: session?.provider ?? null, + endpointUrl: session?.endpointUrl ?? null, + credentialEnv: session?.credentialEnv ?? null, + hermesAuthMethod: session?.hermesAuthMethod ?? null, + hermesToolGateways: session?.hermesToolGateways ?? [], + preferredInferenceApi: session?.preferredInferenceApi ?? null, + nimContainer: session?.nimContainer ?? null, + webSearchConfig: session?.webSearchConfig ?? null, + }, + selectedMessagingChannels: [], + env: {}, + constants: { hermesProviderName: "hermes-provider" }, + deps, + }; +} + +describe("handleProviderInferenceState", () => { + it("runs provider selection and inference setup on a fresh flow", async () => { + const { deps, calls } = createDeps(); + + const result = await handleProviderInferenceState(baseOptions(deps)); + + expect(calls.startStep).toHaveBeenNthCalledWith(1, "provider_selection"); + expect(calls.setupNim).toHaveBeenCalledWith({ type: "nvidia" }, null, null); + expect(calls.complete).toHaveBeenCalledWith("provider_selection", expect.objectContaining({ provider: "nvidia-prod" })); + expect(calls.promptName).toHaveBeenCalledWith(null); + expect(calls.log).toHaveBeenCalledWith("summary:nvidia-prod/nvidia/test/my-assistant"); + expect(calls.startStep).toHaveBeenNthCalledWith(2, "inference", { + provider: "nvidia-prod", + model: "nvidia/test", + }); + expect(calls.setupInference).toHaveBeenCalledWith( + "my-assistant", + "nvidia/test", + "nvidia-prod", + "https://integrate.api.nvidia.com/v1", + "NVIDIA_API_KEY", + null, + [], + ); + expect(calls.deleteEnv).toHaveBeenCalledWith("NVIDIA_API_KEY"); + expect(result).toMatchObject({ + sandboxName: "my-assistant", + model: "nvidia/test", + provider: "nvidia-prod", + preferredInferenceApi: "openai-responses", + }); + }); + + it("skips provider selection and inference setup when resume state is already ready", async () => { + const session = createSession({ + provider: "ollama-local", + model: "llama3.1", + credentialEnv: null, + }); + session.steps.provider_selection.status = "complete"; + const { deps, calls } = createDeps({ isInferenceRouteReady: vi.fn(() => true) }); + + const result = await handleProviderInferenceState({ + ...baseOptions(deps, session), + resume: true, + sandboxName: "my-assistant", + }); + + expect(calls.setupNim).not.toHaveBeenCalled(); + expect(calls.setupInference).not.toHaveBeenCalled(); + expect(calls.skipped).toHaveBeenCalledWith("provider_selection", "ollama-local / llama3.1"); + expect(calls.hydrate).toHaveBeenCalledWith(null); + expect(calls.repair).toHaveBeenCalledWith("ollama-local", deps.isNonInteractive); + expect(calls.skipped).toHaveBeenCalledWith("inference", "ollama-local / llama3.1"); + expect(result).toMatchObject({ provider: "ollama-local", model: "llama3.1" }); + }); + + it("reconciles model router on resumed routed inference", async () => { + const session = createSession({ provider: "nvidia-router", model: "router/model" }); + session.steps.provider_selection.status = "complete"; + const { deps, calls } = createDeps({ isInferenceRouteReady: vi.fn(() => true) }); + + await handleProviderInferenceState({ + ...baseOptions(deps, session), + resume: true, + sandboxName: "router-sandbox", + }); + + expect(calls.reconcileRouter).toHaveBeenCalledOnce(); + }); + + it("returns to provider selection when inference setup requests a retry", async () => { + const setupNim = vi + .fn() + .mockResolvedValueOnce({ ...baseSelection, model: "bad" }) + .mockResolvedValueOnce({ ...baseSelection, model: "good" }); + const setupInference = vi + .fn() + .mockResolvedValueOnce({ retry: "selection" as const }) + .mockResolvedValueOnce({ ok: true as const }); + const { deps, calls } = createDeps({ setupNim, setupInference }); + + const result = await handleProviderInferenceState(baseOptions(deps)); + + expect(setupNim).toHaveBeenCalledTimes(2); + expect(setupInference).toHaveBeenCalledTimes(2); + expect(result.model).toBe("good"); + expect(calls.startStep).toHaveBeenCalledWith("provider_selection"); + }); + + it("aborts before inference setup when the configuration summary is rejected", async () => { + const { deps, calls } = createDeps({ + isNonInteractive: () => false, + promptYesNoOrDefault: vi.fn(async () => false), + }); + + await expect(handleProviderInferenceState(baseOptions(deps))).rejects.toThrow("exit 0"); + + expect(calls.exit).toHaveBeenCalledWith(0); + expect(calls.setupInference).not.toHaveBeenCalled(); + }); +}); diff --git a/src/lib/onboard/machine/handlers/provider-inference.ts b/src/lib/onboard/machine/handlers/provider-inference.ts new file mode 100644 index 0000000000..525b94a059 --- /dev/null +++ b/src/lib/onboard/machine/handlers/provider-inference.ts @@ -0,0 +1,289 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session, SessionUpdates } from "../../../state/onboard-session"; + +export type ProviderInferenceRetry = { retry: "selection" } | { ok: true; retry?: undefined }; + +export interface ProviderSelectionResult { + model: string | null; + provider: string; + endpointUrl: string | null; + credentialEnv: string | null; + hermesAuthMethod: string | null; + hermesToolGateways: string[]; + preferredInferenceApi: string | null; + nimContainer: string | null; +} + +export interface ProviderInferenceStateOptions { + resume: boolean; + session: Session | null; + gpu: Gpu; + sandboxName: string | null; + agent: Agent; + initial: { + model: string | null; + provider: string | null; + endpointUrl: string | null; + credentialEnv: string | null; + hermesAuthMethod: string | null; + hermesToolGateways: string[]; + preferredInferenceApi: string | null; + nimContainer: string | null; + webSearchConfig: any; + }; + selectedMessagingChannels: string[]; + env: NodeJS.ProcessEnv; + constants: { + hermesProviderName: string; + }; + deps: { + normalizeHermesAuthMethod(value: string | null | undefined): string | null; + setupNim(gpu: Gpu, sandboxName: string | null, agent: Agent): Promise; + setupInference( + sandboxName: string | null, + model: string, + provider: string, + endpointUrl: string | null, + credentialEnv: string | null, + hermesAuthMethod: string | null, + hermesToolGateways: string[], + ): Promise; + startRecordedStep( + stepName: string, + updates?: { provider?: string | null; model?: string | null }, + ): Promise; + recordStepComplete(stepName: string, updates: SessionUpdates): Promise; + toSessionUpdates(updates: Record): SessionUpdates; + skippedStepMessage(stepName: string, detail?: string | null): void; + hydrateCredentialEnv(credentialEnv: string | null): void; + repairLocalInferenceSystemdOverrideOrExit(provider: string | null, isNonInteractive: () => boolean): void; + isNonInteractive(): boolean; + getOpenshellBinary(): string; + needsBedrockRuntimeAdapter(provider: string, endpointUrl: string | null): boolean; + isInferenceRouteReady(provider: string, model: string): boolean; + isRoutedInferenceProvider(provider: string): boolean; + reconcileModelRouter(): Promise; + registryUpdateSandbox(sandboxName: string, updates: { nimContainer?: string | null }): void; + promptValidatedSandboxName(agent: Agent): Promise; + assessHost(): Host; + formatSandboxBuildEstimateNote(host: Host): string | null; + formatOnboardConfigSummary(options: { + provider: string; + model: string; + credentialEnv: string | null; + hermesAuthMethod: string | null; + webSearchConfig: any; + hermesToolGateways: string[]; + enabledChannels: string[] | null; + sandboxName: string; + notes: string[]; + }): string; + promptYesNoOrDefault(question: string, envVar: string | null, defaultIsYes: boolean): Promise; + cliName(): string; + log(message?: string): void; + error(message?: string): void; + exitProcess(code: number): never; + deleteEnv(name: string): void; + }; +} + +export interface ProviderInferenceStateResult { + sandboxName: string | null; + model: string; + provider: string; + endpointUrl: string | null; + credentialEnv: string | null; + hermesAuthMethod: string | null; + hermesToolGateways: string[]; + preferredInferenceApi: string | null; + nimContainer: string | null; + webSearchConfig: any; + session: Session | null; +} + +function requireSelection(provider: string | null, model: string | null): { provider: string; model: string } { + if (typeof provider !== "string" || typeof model !== "string") { + throw new Error("Inference selection did not yield a provider/model."); + } + return { provider, model }; +} + +export async function handleProviderInferenceState({ + resume, + session, + gpu, + sandboxName, + agent, + initial, + selectedMessagingChannels, + env, + constants, + deps, +}: ProviderInferenceStateOptions): Promise { + let model = initial.model; + let provider = initial.provider; + let endpointUrl = initial.endpointUrl; + let credentialEnv = initial.credentialEnv; + let hermesAuthMethod = + deps.normalizeHermesAuthMethod(initial.hermesAuthMethod) || + (provider === constants.hermesProviderName ? deps.normalizeHermesAuthMethod(initial.hermesAuthMethod) : null); + let hermesToolGateways = initial.hermesToolGateways; + let preferredInferenceApi = initial.preferredInferenceApi; + let nimContainer = initial.nimContainer; + const webSearchConfig = initial.webSearchConfig; + let forceProviderSelection = false; + + while (true) { + const resumeProviderSelection = + !forceProviderSelection && + resume && + session?.steps?.provider_selection?.status === "complete" && + typeof provider === "string" && + typeof model === "string"; + if (resumeProviderSelection) { + deps.skippedStepMessage("provider_selection", `${provider} / ${model}`); + deps.hydrateCredentialEnv(credentialEnv); + deps.repairLocalInferenceSystemdOverrideOrExit(provider, deps.isNonInteractive); + } else { + await deps.startRecordedStep("provider_selection"); + const selection = await deps.setupNim(gpu, sandboxName, agent); + model = selection.model; + provider = selection.provider; + endpointUrl = selection.endpointUrl; + credentialEnv = selection.credentialEnv; + hermesAuthMethod = selection.hermesAuthMethod; + hermesToolGateways = selection.hermesToolGateways; + preferredInferenceApi = selection.preferredInferenceApi; + nimContainer = selection.nimContainer; + session = await deps.recordStepComplete( + "provider_selection", + deps.toSessionUpdates({ + provider, + model, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + preferredInferenceApi, + nimContainer, + }), + ); + } + + const selected = requireSelection(provider, model); + provider = selected.provider; + model = selected.model; + env.NEMOCLAW_OPENSHELL_BIN = deps.getOpenshellBinary(); + const needsBedrockRuntimeAdapter = deps.needsBedrockRuntimeAdapter(provider, endpointUrl); + const resumeInference = + !needsBedrockRuntimeAdapter && + !forceProviderSelection && + resume && + deps.isInferenceRouteReady(provider, model); + if (resumeInference) { + if (provider === constants.hermesProviderName) { + if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent); + await deps.startRecordedStep("inference", { provider, model }); + const inferenceResult = await deps.setupInference( + sandboxName, + model, + provider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + ); + if (inferenceResult?.retry === "selection") { + forceProviderSelection = true; + continue; + } + session = await deps.recordStepComplete( + "inference", + deps.toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), + ); + break; + } + if (deps.isRoutedInferenceProvider(provider)) { + try { + await deps.reconcileModelRouter(); + } catch (err) { + deps.error(` ✗ Failed to reconcile model router: ${err instanceof Error ? err.message : String(err)}`); + deps.exitProcess(1); + } + } + deps.skippedStepMessage("inference", `${provider} / ${model}`); + if (nimContainer && sandboxName) deps.registryUpdateSandbox(sandboxName, { nimContainer }); + session = await deps.recordStepComplete( + "inference", + deps.toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), + ); + break; + } + + if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent); + const buildEstimateNote = + env.NEMOCLAW_IGNORE_RUNTIME_RESOURCES === "1" + ? null + : deps.formatSandboxBuildEstimateNote(deps.assessHost()); + deps.log( + deps.formatOnboardConfigSummary({ + provider, + model, + credentialEnv, + hermesAuthMethod, + webSearchConfig, + hermesToolGateways, + enabledChannels: selectedMessagingChannels.length > 0 ? selectedMessagingChannels : null, + sandboxName, + notes: buildEstimateNote ? [buildEstimateNote] : [], + }), + ); + deps.log(" Web search and messaging channels will be prompted next."); + if (!deps.isNonInteractive()) { + if (!(await deps.promptYesNoOrDefault(" Apply this configuration?", null, true))) { + deps.log(` Aborted. Re-run \`${deps.cliName()} onboard\` to start over.`); + deps.log(" Credentials entered so far were only staged in memory for this run."); + deps.log(" No new gateway credential was registered because onboarding stopped here."); + deps.exitProcess(0); + } + } + + await deps.startRecordedStep("inference", { provider, model }); + const inferenceResult = await deps.setupInference( + sandboxName, + model, + provider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + ); + deps.deleteEnv("NVIDIA_API_KEY"); + if (inferenceResult?.retry === "selection") { + forceProviderSelection = true; + continue; + } + if (nimContainer && sandboxName) deps.registryUpdateSandbox(sandboxName, { nimContainer }); + session = await deps.recordStepComplete( + "inference", + deps.toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), + ); + break; + } + + return { + sandboxName, + model, + provider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + preferredInferenceApi, + nimContainer, + webSearchConfig, + session, + }; +} From 18ef7e763923a23a0b78739e3fc3619557ab9ac1 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 23:35:26 -0700 Subject: [PATCH 10/54] refactor(cli): extract onboard sandbox handler --- src/lib/onboard.ts | 267 ++++------------ .../onboard/machine/handlers/sandbox.test.ts | 198 ++++++++++++ src/lib/onboard/machine/handlers/sandbox.ts | 287 ++++++++++++++++++ 3 files changed, 547 insertions(+), 205 deletions(-) create mode 100644 src/lib/onboard/machine/handlers/sandbox.test.ts create mode 100644 src/lib/onboard/machine/handlers/sandbox.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index f7d95ae8ab..b23e7f0fa4 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -283,6 +283,7 @@ const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require(" const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference"); +const { handleSandboxState }: typeof import("./onboard/machine/handlers/sandbox") = require("./onboard/machine/handlers/sandbox"); const policies: typeof import("./policy") = require("./policy"); const tiers: typeof import("./policy/tiers") = require("./policy/tiers"); const { ensureUsageNoticeConsent } = require("./onboard/usage-notice"); @@ -9587,212 +9588,68 @@ async function onboard(opts: OnboardOptions = {}): Promise { } = providerInferenceResult; let webSearchConfig = providerInferenceResult.webSearchConfig as WebSearchConfig | null; - const webSearchSupportProbePath = fromDockerfile ? path.resolve(fromDockerfile) : null; - const webSearchSupported = agentSupportsWebSearch(agent, webSearchSupportProbePath, ROOT); - if (webSearchConfig && !webSearchSupported) { - note( - ` Web search is not yet supported by ${agent?.displayName ?? "this sandbox image"}. Clearing stale config.`, - ); - webSearchConfig = null; - if (session) { - session.webSearchConfig = null; - } - onboardSession.updateSession((current: Session) => { - current.webSearchConfig = null; - return current; - }); - } - - const storedMessagingChannelConfig = getStoredMessagingChannelConfig(sandboxName, session); - const effectiveMessagingChannelConfig = hydrateMessagingChannelConfig(storedMessagingChannelConfig); - const messagingChannelConfigChanged = !messagingChannelConfigsEqual( - effectiveMessagingChannelConfig, - storedMessagingChannelConfig, - ); - if (effectiveMessagingChannelConfig) { - persistMessagingChannelConfigToSession(effectiveMessagingChannelConfig); - if (session) { - session.messagingChannelConfig = effectiveMessagingChannelConfig; - } - } - - const sandboxReuseState = getSandboxReuseState(sandboxName); - const webSearchConfigChanged = Boolean(session?.webSearchConfig) !== Boolean(webSearchConfig); - // Telegram mention-mode is baked into openclaw.json at sandbox build time, so - // changes to TELEGRAM_REQUIRE_MENTION only take effect after a rebuild. Treat - // a mismatch between the recorded config and the current env value as drift - // so the reuse path forces a recreate (mirrors webSearchConfigChanged). See - // #1737 and the CodeRabbit review on #2417. - // - // Compare *effective* modes — null and false both produce groupPolicy: open - // at config-generation time (default behavior), so they collapse to the same - // bucket here. Without this, a sandbox built before TELEGRAM_REQUIRE_MENTION - // existed (recordedTelegramRequireMention === null) would be reused with the - // old groupPolicy: open even after the user sets TELEGRAM_REQUIRE_MENTION=1, - // and vice versa. - const currentTelegramRequireMention = computeTelegramRequireMention(); - const recordedTelegramRequireMention = session?.telegramConfig?.requireMention ?? null; - const effectiveCurrent = currentTelegramRequireMention ?? false; - const effectiveRecorded = recordedTelegramRequireMention ?? false; - const telegramConfigChanged = effectiveCurrent !== effectiveRecorded; - const sandboxGpuConfigChanged = sandboxName - ? hasSandboxGpuDrift(sandboxName, sandboxGpuConfig) - : false; - const wechatConfigChanged = hasWechatConfigDrift(session); - const recordedHermesToolGateways = sandboxName - ? normalizeHermesToolGatewaySelections(registry.getSandbox(sandboxName)?.hermesToolGateways) - : []; - const hermesToolGatewayConfigChanged = !stringSetsEqual( - recordedHermesToolGateways, + const sandboxStateResult = await handleSandboxState({ + resume, + fresh, + session, + sandboxName, + model, + provider, + nimContainer, + webSearchConfig, + selectedMessagingChannels, + fromDockerfile, + agent, + gpu, + preferredInferenceApi, + sandboxGpuConfig, hermesToolGateways, - ); - const resumeSandbox = - resume && - !webSearchConfigChanged && - !telegramConfigChanged && - !sandboxGpuConfigChanged && - !wechatConfigChanged && - !messagingChannelConfigChanged && - !hermesToolGatewayConfigChanged && - session?.steps?.sandbox?.status === "complete" && - sandboxReuseState === "ready"; - if (resumeSandbox) { - if (webSearchConfig) { - note(" [resume] Reusing Brave Search configuration already baked into the sandbox."); - } - selectedMessagingChannels = session?.messagingChannels ?? []; - skippedStepMessage("sandbox", sandboxName); - } else { - if (resume && session?.steps?.sandbox?.status === "complete") { - if (webSearchConfigChanged) { - note(" [resume] Web Search configuration changed; recreating sandbox."); - if (sandboxName) { - registry.removeSandbox(sandboxName); - } - } else if (telegramConfigChanged) { - note(" [resume] TELEGRAM_REQUIRE_MENTION changed; recreating sandbox."); - if (sandboxName) { - registry.removeSandbox(sandboxName); - } - } else if (sandboxGpuConfigChanged) { - note(" [resume] Sandbox GPU settings changed; recreating sandbox."); - if (sandboxName) { - registry.removeSandbox(sandboxName); - } - } else if (wechatConfigChanged) { - note(" [resume] WeChat account metadata changed; recreating sandbox."); - if (sandboxName) { - registry.removeSandbox(sandboxName); - } - } else if (messagingChannelConfigChanged) { - note(" [resume] Messaging channel configuration changed; recreating sandbox."); - if (sandboxName) { - registry.removeSandbox(sandboxName); - } - } else if (hermesToolGatewayConfigChanged) { - note(" [resume] Hermes managed tool gateway selection changed; recreating sandbox."); - if (sandboxName) { - registry.removeSandbox(sandboxName); - } - } else if (sandboxReuseState === "not_ready") { - note( - ` [resume] Recorded sandbox '${sandboxName}' exists but is not ready; recreating it.`, - ); - repairRecordedSandbox(sandboxName); - } else { - note(" [resume] Recorded sandbox state is unavailable; recreating it."); - if (sandboxName) { - registry.removeSandbox(sandboxName); - } - } - } - let nextWebSearchConfig = webSearchConfig; - if (nextWebSearchConfig) { - note(" [resume] Revalidating Brave Search configuration for sandbox recreation."); - const braveApiKey = await ensureValidatedBraveSearchCredential(); - nextWebSearchConfig = braveApiKey ? { fetchEnabled: true } : null; - if (nextWebSearchConfig) { - note(" [resume] Reusing Brave Search configuration."); - } - } else { - nextWebSearchConfig = await configureWebSearch(null, agent, webSearchSupportProbePath); - } - await startRecordedStep("sandbox", { provider, model }); - const recordedMessagingChannels = getRecordedMessagingChannelsForResume(resume, session, sandboxName); - if (recordedMessagingChannels) { - selectedMessagingChannels = recordedMessagingChannels; - if (selectedMessagingChannels.length > 0) { - note( - ` [non-interactive] Reusing messaging channel configuration: ${selectedMessagingChannels.join(", ")}`, - ); - } - } else { - const existing = sandboxName - ? registry.getSandbox(sandboxName)?.messagingChannels ?? - session?.messagingChannels ?? - null - : session?.messagingChannels ?? null; - selectedMessagingChannels = await setupMessagingChannels(agent, existing); - } - const messagingChannelConfig = readMessagingChannelConfigFromEnv(); - onboardSession.updateSession((current: Session) => { - current.messagingChannels = selectedMessagingChannels; - current.messagingChannelConfig = messagingChannelConfig; - return current; - }); - if (!sandboxName) { - sandboxName = await promptValidatedSandboxName(agent); - } - if (typeof model !== "string" || typeof provider !== "string") { - console.error(" Inference selection is incomplete; cannot create sandbox."); - process.exit(1); - } - if (fresh) { - stopStaleDashboardListenersForSandbox(registry.listSandboxes().sandboxes, sandboxName); - } - sandboxName = await createSandbox( - gpu, - model, - provider, - preferredInferenceApi, - sandboxName, - nextWebSearchConfig, - selectedMessagingChannels, - fromDockerfile, - agent, - opts.controlUiPort || null, - sandboxGpuConfig, - hermesToolGateways, - ); - webSearchConfig = nextWebSearchConfig; - registry.updateSandbox(sandboxName, { - model, - provider, - ...getSandboxAgentRegistryFields(agent, !fromDockerfile), - }); - registry.setDefault(sandboxName); - await recordStepComplete( - "sandbox", - toSessionUpdates({ - sandboxName, - provider, - model, - nimContainer, - webSearchConfig, - messagingChannelConfig, - hermesToolGateways, - }), - ); - } - - if ( - typeof sandboxName !== "string" || - typeof provider !== "string" || - typeof model !== "string" - ) { - console.error(" Onboarding state is incomplete after sandbox setup."); - process.exit(1); - } + controlUiPort: opts.controlUiPort || null, + rootDir: ROOT, + deps: { + resolvePath: path.resolve, + agentSupportsWebSearch, + note, + updateSession: onboardSession.updateSession, + getStoredMessagingChannelConfig, + hydrateMessagingChannelConfig, + messagingChannelConfigsEqual, + persistMessagingChannelConfigToSession, + getSandboxReuseState, + computeTelegramRequireMention, + hasSandboxGpuDrift, + hasWechatConfigDrift, + getSandboxHermesToolGateways: (name) => registry.getSandbox(name)?.hermesToolGateways, + normalizeHermesToolGatewaySelections, + stringSetsEqual, + removeSandboxFromRegistry: registry.removeSandbox.bind(registry), + repairRecordedSandbox, + ensureValidatedBraveSearchCredential, + configureWebSearch, + startRecordedStep, + getRecordedMessagingChannelsForResume, + getSandboxMessagingChannels: (name) => registry.getSandbox(name)?.messagingChannels, + setupMessagingChannels, + readMessagingChannelConfigFromEnv, + promptValidatedSandboxName, + stopStaleDashboardListenersForSandbox, + listRegistrySandboxes: registry.listSandboxes, + createSandbox, + updateSandboxRegistry: (name, updates) => registry.updateSandbox(name, updates), + setDefaultSandbox: registry.setDefault, + getSandboxAgentRegistryFields, + recordStepComplete, + toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), + skippedStepMessage, + error: (message) => console.error(message), + exitProcess: (code) => process.exit(code), + }, + }); + session = sandboxStateResult.session; + sandboxName = sandboxStateResult.sandboxName; + webSearchConfig = sandboxStateResult.webSearchConfig ?? null; + selectedMessagingChannels = sandboxStateResult.selectedMessagingChannels; + const webSearchSupported = sandboxStateResult.webSearchSupported; if (agent) { await agentOnboard.handleAgentSetup(sandboxName, model, provider, agent, resume, session, { diff --git a/src/lib/onboard/machine/handlers/sandbox.test.ts b/src/lib/onboard/machine/handlers/sandbox.test.ts new file mode 100644 index 0000000000..eac0ffb553 --- /dev/null +++ b/src/lib/onboard/machine/handlers/sandbox.test.ts @@ -0,0 +1,198 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session"; +import { handleSandboxState, type SandboxStateOptions } from "./sandbox"; + +type Gpu = { type: string } | null; +type Agent = { displayName?: string } | null; +type WebSearchConfig = { fetchEnabled: true }; +type MessagingChannelConfig = Record; +type SandboxGpuConfig = { sandboxGpuEnabled: boolean; mode: string }; + +function createDeps(overrides: Partial["deps"]> = {}) { + let session = createSession(); + const calls = { + note: vi.fn(), + updateSession: vi.fn((mutator: (value: Session) => Session | void) => { + session = mutator(session) ?? session; + return session; + }), + persistMessaging: vi.fn(), + removeSandbox: vi.fn(), + repairSandbox: vi.fn(), + validateBrave: vi.fn(async () => "brave-key"), + configureWebSearch: vi.fn(async () => null as WebSearchConfig | null), + startStep: vi.fn(async () => undefined), + getRecordedChannels: vi.fn(() => null), + setupMessaging: vi.fn(async () => [] as string[]), + promptName: vi.fn(async () => "my-assistant"), + stopStale: vi.fn(), + createSandbox: vi.fn(async () => "my-assistant"), + updateSandbox: vi.fn(), + setDefault: vi.fn(), + complete: vi.fn(async () => createSession()), + skipped: vi.fn(), + error: vi.fn(), + exit: vi.fn((code: number): never => { + throw new Error(`exit ${code}`); + }), + }; + return { + calls, + deps: { + resolvePath: (value: string) => `/abs/${value}`, + agentSupportsWebSearch: () => true, + note: calls.note, + updateSession: calls.updateSession, + getStoredMessagingChannelConfig: () => null, + hydrateMessagingChannelConfig: (config: MessagingChannelConfig | null) => config, + messagingChannelConfigsEqual: () => true, + persistMessagingChannelConfigToSession: calls.persistMessaging, + getSandboxReuseState: () => "missing", + computeTelegramRequireMention: () => null, + hasSandboxGpuDrift: () => false, + hasWechatConfigDrift: () => false, + getSandboxHermesToolGateways: () => [], + normalizeHermesToolGatewaySelections: (value: unknown) => (Array.isArray(value) ? (value as string[]) : []), + stringSetsEqual: (left: string[], right: string[]) => left.length === right.length && left.every((value) => right.includes(value)), + removeSandboxFromRegistry: calls.removeSandbox, + repairRecordedSandbox: calls.repairSandbox, + ensureValidatedBraveSearchCredential: calls.validateBrave, + configureWebSearch: calls.configureWebSearch, + startRecordedStep: calls.startStep, + getRecordedMessagingChannelsForResume: calls.getRecordedChannels, + getSandboxMessagingChannels: () => ["telegram"], + setupMessagingChannels: calls.setupMessaging, + readMessagingChannelConfigFromEnv: () => null, + promptValidatedSandboxName: calls.promptName, + stopStaleDashboardListenersForSandbox: calls.stopStale, + listRegistrySandboxes: () => ({ sandboxes: [{ name: "old" }] }), + createSandbox: calls.createSandbox, + updateSandboxRegistry: calls.updateSandbox, + setDefaultSandbox: calls.setDefault, + getSandboxAgentRegistryFields: () => ({ agent: null }), + recordStepComplete: calls.complete, + toSessionUpdates: (updates: Record) => updates as SessionUpdates, + skippedStepMessage: calls.skipped, + error: calls.error, + exitProcess: calls.exit, + ...overrides, + }, + getSession: () => session, + }; +} + +function baseOptions( + deps: SandboxStateOptions["deps"], + session: Session | null = createSession(), +): SandboxStateOptions { + return { + resume: false, + fresh: false, + session, + sandboxName: null, + model: "model", + provider: "provider", + nimContainer: null, + webSearchConfig: null, + selectedMessagingChannels: [], + fromDockerfile: null, + agent: null, + gpu: { type: "nvidia" }, + preferredInferenceApi: "openai-completions", + sandboxGpuConfig: { sandboxGpuEnabled: false, mode: "0" }, + hermesToolGateways: [], + controlUiPort: null, + rootDir: "/repo", + deps, + }; +} + +describe("handleSandboxState", () => { + it("creates a sandbox and records messaging/web search state", async () => { + const { deps, calls } = createDeps({ + configureWebSearch: vi.fn(async () => ({ fetchEnabled: true as const })), + readMessagingChannelConfigFromEnv: () => ({ telegram: "polling" }), + }); + calls.setupMessaging.mockResolvedValue(["telegram"]); + + const result = await handleSandboxState(baseOptions(deps)); + + expect(calls.startStep).toHaveBeenCalledWith("sandbox", { provider: "provider", model: "model" }); + expect(calls.setupMessaging).toHaveBeenCalledWith(null, null); + expect(calls.promptName).toHaveBeenCalledWith(null); + expect(calls.createSandbox).toHaveBeenCalledWith( + { type: "nvidia" }, + "model", + "provider", + "openai-completions", + "my-assistant", + { fetchEnabled: true }, + ["telegram"], + null, + null, + null, + { sandboxGpuEnabled: false, mode: "0" }, + [], + ); + expect(calls.updateSandbox).toHaveBeenCalledWith("my-assistant", expect.objectContaining({ model: "model", provider: "provider" })); + expect(calls.setDefault).toHaveBeenCalledWith("my-assistant"); + expect(calls.complete).toHaveBeenCalledWith("sandbox", expect.objectContaining({ sandboxName: "my-assistant" })); + expect(result).toMatchObject({ sandboxName: "my-assistant", selectedMessagingChannels: ["telegram"], webSearchSupported: true }); + }); + + it("reuses a completed ready sandbox on resume", async () => { + const session = createSession({ sandboxName: "saved", messagingChannels: ["slack"] }); + session.steps.sandbox.status = "complete"; + const { deps, calls } = createDeps({ getSandboxReuseState: () => "ready" }); + + const result = await handleSandboxState({ ...baseOptions(deps, session), resume: true, sandboxName: "saved" }); + + expect(calls.createSandbox).not.toHaveBeenCalled(); + expect(calls.skipped).toHaveBeenCalledWith("sandbox", "saved"); + expect(result.selectedMessagingChannels).toEqual(["slack"]); + }); + + it("removes registry state when Telegram mention-mode drift forces sandbox recreation", async () => { + const session = createSession({ telegramConfig: { requireMention: true } }); + session.steps.sandbox.status = "complete"; + const { deps, calls } = createDeps({ + getSandboxReuseState: () => "ready", + computeTelegramRequireMention: () => false, + }); + + await handleSandboxState({ + ...baseOptions(deps, session), + resume: true, + sandboxName: "saved", + }); + + expect(calls.note).toHaveBeenCalledWith(" [resume] TELEGRAM_REQUIRE_MENTION changed; recreating sandbox."); + expect(calls.removeSandbox).toHaveBeenCalledWith("saved"); + expect(calls.createSandbox).toHaveBeenCalled(); + }); + + it("repairs not-ready resumed sandboxes before recreation", async () => { + const session = createSession({ sandboxName: "saved" }); + session.steps.sandbox.status = "complete"; + const { deps, calls } = createDeps({ getSandboxReuseState: () => "not_ready" }); + + await handleSandboxState({ ...baseOptions(deps, session), resume: true, sandboxName: "saved" }); + + expect(calls.repairSandbox).toHaveBeenCalledWith("saved"); + expect(calls.createSandbox).toHaveBeenCalled(); + }); + + it("uses recorded messaging channels on non-interactive resume", async () => { + const { deps, calls } = createDeps({ getRecordedMessagingChannelsForResume: vi.fn(() => ["discord"]) }); + + const result = await handleSandboxState(baseOptions(deps)); + + expect(calls.setupMessaging).not.toHaveBeenCalled(); + expect(calls.note).toHaveBeenCalledWith(" [non-interactive] Reusing messaging channel configuration: discord"); + expect(result.selectedMessagingChannels).toEqual(["discord"]); + }); +}); diff --git a/src/lib/onboard/machine/handlers/sandbox.ts b/src/lib/onboard/machine/handlers/sandbox.ts new file mode 100644 index 0000000000..8c45215ed9 --- /dev/null +++ b/src/lib/onboard/machine/handlers/sandbox.ts @@ -0,0 +1,287 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session, SessionUpdates } from "../../../state/onboard-session"; + +export interface SandboxStateOptions { + resume: boolean; + fresh: boolean; + session: Session | null; + sandboxName: string | null; + model: string; + provider: string; + nimContainer: string | null; + webSearchConfig: WebSearchConfig | null; + selectedMessagingChannels: string[]; + fromDockerfile: string | null; + agent: Agent; + gpu: Gpu; + preferredInferenceApi: string | null; + sandboxGpuConfig: SandboxGpuConfig; + hermesToolGateways: string[]; + controlUiPort: number | null; + rootDir: string; + deps: { + resolvePath(value: string): string; + agentSupportsWebSearch(agent: Agent, dockerfilePathOverride: string | null, rootDir: string): boolean; + note(message: string): void; + updateSession(mutator: (session: Session) => Session | void): Session; + getStoredMessagingChannelConfig(sandboxName: string | null, session: Session | null): MessagingChannelConfig | null; + hydrateMessagingChannelConfig(config: MessagingChannelConfig | null): MessagingChannelConfig | null; + messagingChannelConfigsEqual(left: MessagingChannelConfig | null, right: MessagingChannelConfig | null): boolean; + persistMessagingChannelConfigToSession(config: MessagingChannelConfig | null): void; + getSandboxReuseState(sandboxName: string | null): string; + computeTelegramRequireMention(): boolean | null; + hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean; + hasWechatConfigDrift(session: Session | null): boolean; + getSandboxHermesToolGateways(sandboxName: string): unknown; + normalizeHermesToolGatewaySelections(value: unknown): string[]; + stringSetsEqual(left: string[], right: string[]): boolean; + removeSandboxFromRegistry(sandboxName: string): void; + repairRecordedSandbox(sandboxName: string | null): void; + ensureValidatedBraveSearchCredential(): Promise; + configureWebSearch( + existingConfig: WebSearchConfig | null, + agent: Agent, + dockerfilePathOverride: string | null, + ): Promise; + startRecordedStep(stepName: string, updates: { provider: string; model: string }): Promise; + getRecordedMessagingChannelsForResume( + resume: boolean, + session: Session | null, + sandboxName: string | null, + ): string[] | null; + getSandboxMessagingChannels(sandboxName: string): string[] | null | undefined; + setupMessagingChannels(agent: Agent, existingChannels: string[] | null): Promise; + readMessagingChannelConfigFromEnv(): MessagingChannelConfig | null; + promptValidatedSandboxName(agent: Agent): Promise; + stopStaleDashboardListenersForSandbox(sandboxes: unknown[], sandboxName: string): void; + listRegistrySandboxes(): { sandboxes: unknown[] }; + createSandbox( + gpu: Gpu, + model: string, + provider: string, + preferredInferenceApi: string | null, + sandboxName: string, + webSearchConfig: WebSearchConfig | null, + selectedMessagingChannels: string[], + fromDockerfile: string | null, + agent: Agent, + controlUiPort: number | null, + sandboxGpuConfig: SandboxGpuConfig, + hermesToolGateways: string[], + ): Promise; + updateSandboxRegistry(sandboxName: string, updates: Record): void; + setDefaultSandbox(sandboxName: string): void; + getSandboxAgentRegistryFields(agent: Agent, agentVersionKnown: boolean): Record; + recordStepComplete(stepName: string, updates: SessionUpdates): Promise; + toSessionUpdates(updates: Record): SessionUpdates; + skippedStepMessage(stepName: string, detail?: string | null): void; + error(message?: string): void; + exitProcess(code: number): never; + }; +} + +export interface SandboxStateResult { + sandboxName: string; + webSearchConfig: WebSearchConfig | null; + selectedMessagingChannels: string[]; + webSearchSupported: boolean; + session: Session | null; +} + +function sameEffectiveTelegramRequireMention(left: boolean | null, right: boolean | null): boolean { + return (left ?? false) === (right ?? false); +} + +export async function handleSandboxState({ + resume, + fresh, + session, + sandboxName, + model, + provider, + nimContainer, + webSearchConfig, + selectedMessagingChannels, + fromDockerfile, + agent, + gpu, + preferredInferenceApi, + sandboxGpuConfig, + hermesToolGateways, + controlUiPort, + rootDir, + deps, +}: SandboxStateOptions< + Gpu, + Agent, + WebSearchConfig, + MessagingChannelConfig, + SandboxGpuConfig +>): Promise> { + const webSearchSupportProbePath = fromDockerfile ? deps.resolvePath(fromDockerfile) : null; + const webSearchSupported = deps.agentSupportsWebSearch(agent, webSearchSupportProbePath, rootDir); + if (webSearchConfig && !webSearchSupported) { + deps.note( + ` Web search is not yet supported by ${(agent as { displayName?: string } | null)?.displayName ?? "this sandbox image"}. Clearing stale config.`, + ); + webSearchConfig = null; + if (session) session.webSearchConfig = null; + session = deps.updateSession((current) => { + current.webSearchConfig = null; + return current; + }); + } + + const storedMessagingChannelConfig = deps.getStoredMessagingChannelConfig(sandboxName, session); + const effectiveMessagingChannelConfig = deps.hydrateMessagingChannelConfig(storedMessagingChannelConfig); + const messagingChannelConfigChanged = !deps.messagingChannelConfigsEqual( + effectiveMessagingChannelConfig, + storedMessagingChannelConfig, + ); + if (effectiveMessagingChannelConfig) { + deps.persistMessagingChannelConfigToSession(effectiveMessagingChannelConfig); + if (session) session.messagingChannelConfig = effectiveMessagingChannelConfig as Session["messagingChannelConfig"]; + } + + const sandboxReuseState = deps.getSandboxReuseState(sandboxName); + const webSearchConfigChanged = Boolean(session?.webSearchConfig) !== Boolean(webSearchConfig); + const currentTelegramRequireMention = deps.computeTelegramRequireMention(); + const recordedTelegramRequireMention = session?.telegramConfig?.requireMention ?? null; + const telegramConfigChanged = !sameEffectiveTelegramRequireMention( + currentTelegramRequireMention, + recordedTelegramRequireMention, + ); + const sandboxGpuConfigChanged = sandboxName ? deps.hasSandboxGpuDrift(sandboxName, sandboxGpuConfig) : false; + const wechatConfigChanged = deps.hasWechatConfigDrift(session); + const recordedHermesToolGateways = sandboxName + ? deps.normalizeHermesToolGatewaySelections(deps.getSandboxHermesToolGateways(sandboxName)) + : []; + const hermesToolGatewayConfigChanged = !deps.stringSetsEqual(recordedHermesToolGateways, hermesToolGateways); + const resumeSandbox = + resume && + !webSearchConfigChanged && + !telegramConfigChanged && + !sandboxGpuConfigChanged && + !wechatConfigChanged && + !messagingChannelConfigChanged && + !hermesToolGatewayConfigChanged && + session?.steps?.sandbox?.status === "complete" && + sandboxReuseState === "ready"; + + if (resumeSandbox) { + if (webSearchConfig) deps.note(" [resume] Reusing Brave Search configuration already baked into the sandbox."); + selectedMessagingChannels = session?.messagingChannels ?? []; + deps.skippedStepMessage("sandbox", sandboxName); + } else { + if (resume && session?.steps?.sandbox?.status === "complete") { + if (webSearchConfigChanged) { + deps.note(" [resume] Web Search configuration changed; recreating sandbox."); + if (sandboxName) deps.removeSandboxFromRegistry(sandboxName); + } else if (telegramConfigChanged) { + deps.note(" [resume] TELEGRAM_REQUIRE_MENTION changed; recreating sandbox."); + if (sandboxName) deps.removeSandboxFromRegistry(sandboxName); + } else if (sandboxGpuConfigChanged) { + deps.note(" [resume] Sandbox GPU settings changed; recreating sandbox."); + if (sandboxName) deps.removeSandboxFromRegistry(sandboxName); + } else if (wechatConfigChanged) { + deps.note(" [resume] WeChat account metadata changed; recreating sandbox."); + if (sandboxName) deps.removeSandboxFromRegistry(sandboxName); + } else if (messagingChannelConfigChanged) { + deps.note(" [resume] Messaging channel configuration changed; recreating sandbox."); + if (sandboxName) deps.removeSandboxFromRegistry(sandboxName); + } else if (hermesToolGatewayConfigChanged) { + deps.note(" [resume] Hermes managed tool gateway selection changed; recreating sandbox."); + if (sandboxName) deps.removeSandboxFromRegistry(sandboxName); + } else if (sandboxReuseState === "not_ready") { + deps.note(` [resume] Recorded sandbox '${sandboxName}' exists but is not ready; recreating it.`); + deps.repairRecordedSandbox(sandboxName); + } else { + deps.note(" [resume] Recorded sandbox state is unavailable; recreating it."); + if (sandboxName) deps.removeSandboxFromRegistry(sandboxName); + } + } + + let nextWebSearchConfig = webSearchConfig; + if (nextWebSearchConfig) { + deps.note(" [resume] Revalidating Brave Search configuration for sandbox recreation."); + const braveApiKey = await deps.ensureValidatedBraveSearchCredential(); + nextWebSearchConfig = braveApiKey ? webSearchConfig : null; + if (nextWebSearchConfig) deps.note(" [resume] Reusing Brave Search configuration."); + } else { + nextWebSearchConfig = await deps.configureWebSearch(null, agent, webSearchSupportProbePath); + } + + await deps.startRecordedStep("sandbox", { provider, model }); + const recordedMessagingChannels = deps.getRecordedMessagingChannelsForResume(resume, session, sandboxName); + if (recordedMessagingChannels) { + selectedMessagingChannels = recordedMessagingChannels; + if (selectedMessagingChannels.length > 0) { + deps.note(` [non-interactive] Reusing messaging channel configuration: ${selectedMessagingChannels.join(", ")}`); + } + } else { + const existing = sandboxName + ? deps.getSandboxMessagingChannels(sandboxName) ?? session?.messagingChannels ?? null + : session?.messagingChannels ?? null; + selectedMessagingChannels = await deps.setupMessagingChannels(agent, existing); + } + const messagingChannelConfig = deps.readMessagingChannelConfigFromEnv(); + session = deps.updateSession((current) => { + current.messagingChannels = selectedMessagingChannels; + current.messagingChannelConfig = messagingChannelConfig as Session["messagingChannelConfig"]; + return current; + }); + + if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent); + if (fresh) deps.stopStaleDashboardListenersForSandbox(deps.listRegistrySandboxes().sandboxes, sandboxName); + sandboxName = await deps.createSandbox( + gpu, + model, + provider, + preferredInferenceApi, + sandboxName, + nextWebSearchConfig, + selectedMessagingChannels, + fromDockerfile, + agent, + controlUiPort, + sandboxGpuConfig, + hermesToolGateways, + ); + webSearchConfig = nextWebSearchConfig; + deps.updateSandboxRegistry(sandboxName, { + model, + provider, + ...deps.getSandboxAgentRegistryFields(agent, !fromDockerfile), + }); + deps.setDefaultSandbox(sandboxName); + session = await deps.recordStepComplete( + "sandbox", + deps.toSessionUpdates({ + sandboxName, + provider, + model, + nimContainer, + webSearchConfig, + messagingChannelConfig, + hermesToolGateways, + }), + ); + } + + if (!sandboxName) { + deps.error(" Onboarding state is incomplete after sandbox setup."); + deps.exitProcess(1); + } + const completedSandboxName = sandboxName; + if (!completedSandboxName) throw new Error("Sandbox name is required after sandbox setup"); + + return { + sandboxName: completedSandboxName, + webSearchConfig, + selectedMessagingChannels, + webSearchSupported, + session, + }; +} From 7fe9e1cba937226c582e7a8ce0fb8cc26c1e7a7d Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 23:44:11 -0700 Subject: [PATCH 11/54] refactor(cli): extract onboard agent setup handler --- src/lib/onboard.ts | 64 ++++----- .../machine/handlers/agent-setup.test.ts | 122 ++++++++++++++++++ .../onboard/machine/handlers/agent-setup.ts | 87 +++++++++++++ 3 files changed, 242 insertions(+), 31 deletions(-) create mode 100644 src/lib/onboard/machine/handlers/agent-setup.test.ts create mode 100644 src/lib/onboard/machine/handlers/agent-setup.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index b23e7f0fa4..7462486abf 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -280,6 +280,7 @@ const { resolveSandboxImageTagFromCreateOutput } = const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); +const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup"); const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference"); @@ -9651,38 +9652,39 @@ async function onboard(opts: OnboardOptions = {}): Promise { selectedMessagingChannels = sandboxStateResult.selectedMessagingChannels; const webSearchSupported = sandboxStateResult.webSearchSupported; - if (agent) { - await agentOnboard.handleAgentSetup(sandboxName, model, provider, agent, resume, session, { - step, - runCaptureOpenshell, - openshellShellCommand, - openshellBinary: getOpenshellBinary(), - buildSandboxConfigSyncScript, - writeSandboxConfigSyncFile, - cleanupTempDir, - startRecordedStep, + const agentSetupResult = await handleAgentSetupState({ + agent, + sandboxName, + model, + provider, + resume, + session, + hermesAuthMethod, + hermesToolGateways, + deps: { + handleAgentSetup: agentOnboard.handleAgentSetup, + agentSetupContext: () => ({ + step, + runCaptureOpenshell, + openshellShellCommand, + openshellBinary: getOpenshellBinary(), + buildSandboxConfigSyncScript, + writeSandboxConfigSyncFile, + cleanupTempDir, + startRecordedStep, + skippedStepMessage, + }), + ensureAgentDashboardForward, + recordStepSkipped, + isOpenclawReady, skippedStepMessage, - }); - ensureAgentDashboardForward(sandboxName, agent); - await recordStepSkipped("openclaw"); - } else { - const resumeOpenclaw = resume && sandboxName && isOpenclawReady(sandboxName); - if (resumeOpenclaw) { - skippedStepMessage("openclaw", sandboxName); - await recordStepComplete( - "openclaw", - toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), - ); - } else { - await startRecordedStep("openclaw", { sandboxName, provider, model }); - await setupOpenclaw(sandboxName, model, provider); - await recordStepComplete( - "openclaw", - toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), - ); - } - await recordStepSkipped("agent_setup"); - } + startRecordedStep, + setupOpenclaw, + recordStepComplete, + toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), + }, + }); + session = agentSetupResult.session; const latestSession = onboardSession.loadSession(); const recordedPolicyPresets = Array.isArray(latestSession?.policyPresets) diff --git a/src/lib/onboard/machine/handlers/agent-setup.test.ts b/src/lib/onboard/machine/handlers/agent-setup.test.ts new file mode 100644 index 0000000000..fd9f1d0410 --- /dev/null +++ b/src/lib/onboard/machine/handlers/agent-setup.test.ts @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session"; +import { handleAgentSetupState, type AgentSetupStateOptions } from "./agent-setup"; + +type Agent = { name: string; displayName: string }; + +function createDeps(overrides: Partial["deps"]> = {}) { + const calls = { + handleAgentSetup: vi.fn(async () => undefined), + context: vi.fn(() => ({ ctx: true })), + ensureDashboard: vi.fn(() => 18789), + skipped: vi.fn(async () => createSession()), + openclawReady: vi.fn(() => false), + skippedMessage: vi.fn(), + startStep: vi.fn(async () => undefined), + setupOpenclaw: vi.fn(async () => undefined), + complete: vi.fn(async () => createSession()), + }; + return { + calls, + deps: { + handleAgentSetup: calls.handleAgentSetup, + agentSetupContext: calls.context, + ensureAgentDashboardForward: calls.ensureDashboard, + recordStepSkipped: calls.skipped, + isOpenclawReady: calls.openclawReady, + skippedStepMessage: calls.skippedMessage, + startRecordedStep: calls.startStep, + setupOpenclaw: calls.setupOpenclaw, + recordStepComplete: calls.complete, + toSessionUpdates: (updates: Record) => updates as SessionUpdates, + ...overrides, + }, + }; +} + +function baseOptions( + deps: AgentSetupStateOptions["deps"], + agent: Agent | null = null, +): AgentSetupStateOptions { + return { + agent, + sandboxName: "my-assistant", + model: "model", + provider: "provider", + resume: false, + session: createSession(), + hermesAuthMethod: null, + hermesToolGateways: [], + deps, + }; +} + +describe("handleAgentSetupState", () => { + it("delegates non-OpenClaw agent setup and skips openclaw", async () => { + const { deps, calls } = createDeps(); + const agent = { name: "hermes", displayName: "Hermes" }; + const session = createSession(); + + await handleAgentSetupState({ ...baseOptions(deps, agent), session, resume: true }); + + expect(calls.handleAgentSetup).toHaveBeenCalledWith( + "my-assistant", + "model", + "provider", + agent, + true, + session, + { ctx: true }, + ); + expect(calls.ensureDashboard).toHaveBeenCalledWith("my-assistant", agent); + expect(calls.skipped).toHaveBeenCalledWith("openclaw"); + expect(calls.setupOpenclaw).not.toHaveBeenCalled(); + }); + + it("skips OpenClaw setup on resume when OpenClaw is ready", async () => { + const { deps, calls } = createDeps({ isOpenclawReady: vi.fn(() => true) }); + + await handleAgentSetupState({ ...baseOptions(deps), resume: true }); + + expect(calls.skippedMessage).toHaveBeenCalledWith("openclaw", "my-assistant"); + expect(calls.startStep).not.toHaveBeenCalled(); + expect(calls.setupOpenclaw).not.toHaveBeenCalled(); + expect(calls.complete).toHaveBeenCalledWith( + "openclaw", + expect.objectContaining({ sandboxName: "my-assistant", provider: "provider", model: "model" }), + ); + expect(calls.skipped).toHaveBeenCalledWith("agent_setup"); + }); + + it("runs OpenClaw setup and skips agent_setup for the default agent", async () => { + const { deps, calls } = createDeps(); + + await handleAgentSetupState({ + ...baseOptions(deps), + hermesAuthMethod: "oauth", + hermesToolGateways: ["github"], + }); + + expect(calls.startStep).toHaveBeenCalledWith("openclaw", { + sandboxName: "my-assistant", + provider: "provider", + model: "model", + }); + expect(calls.setupOpenclaw).toHaveBeenCalledWith("my-assistant", "model", "provider"); + expect(calls.complete).toHaveBeenCalledWith( + "openclaw", + expect.objectContaining({ + sandboxName: "my-assistant", + provider: "provider", + model: "model", + hermesAuthMethod: "oauth", + hermesToolGateways: ["github"], + }), + ); + expect(calls.skipped).toHaveBeenCalledWith("agent_setup"); + }); +}); diff --git a/src/lib/onboard/machine/handlers/agent-setup.ts b/src/lib/onboard/machine/handlers/agent-setup.ts new file mode 100644 index 0000000000..40330711ad --- /dev/null +++ b/src/lib/onboard/machine/handlers/agent-setup.ts @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session, SessionUpdates } from "../../../state/onboard-session"; + +export interface AgentSetupStateOptions { + agent: Agent | null; + sandboxName: string; + model: string; + provider: string; + resume: boolean; + session: Session | null; + hermesAuthMethod: string | null; + hermesToolGateways: string[]; + deps: { + handleAgentSetup( + sandboxName: string, + model: string, + provider: string, + agent: Agent, + resume: boolean, + session: Session | null, + context: unknown, + ): Promise; + agentSetupContext(): unknown; + ensureAgentDashboardForward(sandboxName: string, agent: Agent): number; + recordStepSkipped(stepName: string): Promise; + isOpenclawReady(sandboxName: string): boolean; + skippedStepMessage(stepName: string, detail?: string | null): void; + startRecordedStep( + stepName: string, + updates: { sandboxName: string; provider: string; model: string }, + ): Promise; + setupOpenclaw(sandboxName: string, model: string, provider: string): Promise; + recordStepComplete(stepName: string, updates: SessionUpdates): Promise; + toSessionUpdates(updates: Record): SessionUpdates; + }; +} + +export interface AgentSetupStateResult { + session: Session | null; +} + +export async function handleAgentSetupState({ + agent, + sandboxName, + model, + provider, + resume, + session, + hermesAuthMethod, + hermesToolGateways, + deps, +}: AgentSetupStateOptions): Promise { + if (agent) { + await deps.handleAgentSetup( + sandboxName, + model, + provider, + agent, + resume, + session, + deps.agentSetupContext(), + ); + deps.ensureAgentDashboardForward(sandboxName, agent); + session = await deps.recordStepSkipped("openclaw"); + return { session }; + } + + const resumeOpenclaw = resume && sandboxName && deps.isOpenclawReady(sandboxName); + if (resumeOpenclaw) { + deps.skippedStepMessage("openclaw", sandboxName); + session = await deps.recordStepComplete( + "openclaw", + deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), + ); + } else { + await deps.startRecordedStep("openclaw", { sandboxName, provider, model }); + await deps.setupOpenclaw(sandboxName, model, provider); + session = await deps.recordStepComplete( + "openclaw", + deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), + ); + } + session = await deps.recordStepSkipped("agent_setup"); + return { session }; +} From b9daca0cd003aeffc535592b5468893e005515e7 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Tue, 19 May 2026 23:55:05 -0700 Subject: [PATCH 12/54] refactor(cli): extract onboard policies handler --- src/lib/onboard.ts | 114 +++-------- .../onboard/machine/handlers/policies.test.ts | 182 +++++++++++++++++ src/lib/onboard/machine/handlers/policies.ts | 189 ++++++++++++++++++ 3 files changed, 401 insertions(+), 84 deletions(-) create mode 100644 src/lib/onboard/machine/handlers/policies.test.ts create mode 100644 src/lib/onboard/machine/handlers/policies.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 7462486abf..e406d8ca0c 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -282,6 +282,7 @@ const onboardSession: typeof import("./state/onboard-session") = require("./stat const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup"); const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); +const { handlePoliciesState }: typeof import("./onboard/machine/handlers/policies") = require("./onboard/machine/handlers/policies"); const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference"); const { handleSandboxState }: typeof import("./onboard/machine/handlers/sandbox") = require("./onboard/machine/handlers/sandbox"); @@ -9686,97 +9687,42 @@ async function onboard(opts: OnboardOptions = {}): Promise { }); session = agentSetupResult.session; - const latestSession = onboardSession.loadSession(); - const recordedPolicyPresets = Array.isArray(latestSession?.policyPresets) - ? latestSession.policyPresets - : null; - const recordedMessagingChannels = Array.isArray(latestSession?.messagingChannels) - ? latestSession.messagingChannels - : []; - const activeMessagingChannels = registry.getSandbox(sandboxName)?.messagingChannels; - verifyCompatibleEndpointSandboxSmoke({ + const policiesResult = await handlePoliciesState({ + resume, sandboxName, provider, model, - runOpenshell, - redact, endpointUrl, credentialEnv, - messagingChannels: Array.isArray(activeMessagingChannels) ? activeMessagingChannels : [], + selectedMessagingChannels, + webSearchConfig, + webSearchSupported, + hermesToolGateways, agent, + deps: { + loadSession: onboardSession.loadSession, + getActiveMessagingChannels: (name) => registry.getSandbox(name)?.messagingChannels, + verifyCompatibleEndpointSandboxSmoke: (options) => + verifyCompatibleEndpointSandboxSmoke({ + ...options, + runOpenshell, + redact, + }), + listSetupPolicyPresets: policies.listSetupPolicyPresets, + getAppliedPolicyPresets: policies.getAppliedPresets, + listCustomPolicyPresets: policies.listCustomPresets, + clampSetupPolicyPresetNames: policies.clampSetupPolicyPresetNames, + mergeRequiredHermesToolGatewayPolicyPresets, + arePolicyPresetsApplied, + skippedStepMessage, + startRecordedStep, + setupPoliciesWithSelection, + updateSession: onboardSession.updateSession, + recordStepComplete, + toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), + }, }); - const policyPresetSupportOptions = { webSearchSupported }; - const selectablePolicyPresetsForSupport = [ - ...policies.listSetupPolicyPresets(sandboxName, policyPresetSupportOptions), - ...policies.getAppliedPresets(sandboxName).map((name) => ({ name })), - ]; - const customPolicyPresetNames = new Set( - policies.listCustomPresets(sandboxName).map((p: { name: string }) => p.name), - ); - let recordedPolicyPresetsForSupport = policies.clampSetupPolicyPresetNames( - recordedPolicyPresets || [], - selectablePolicyPresetsForSupport, - policyPresetSupportOptions, - customPolicyPresetNames, - ); - if (recordedPolicyPresets) { - recordedPolicyPresetsForSupport = mergeRequiredHermesToolGatewayPolicyPresets( - recordedPolicyPresetsForSupport, - hermesToolGateways, - selectablePolicyPresetsForSupport.map((p) => p.name), - ); - } - const recordedPolicyPresetsHaveUnsupported = - Array.isArray(recordedPolicyPresets) && - recordedPolicyPresetsForSupport.length !== recordedPolicyPresets.length; - const resumePolicies = - resume && - sandboxName && - !recordedPolicyPresetsHaveUnsupported && - arePolicyPresetsApplied(sandboxName, recordedPolicyPresetsForSupport); - if (resumePolicies) { - skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", ")); - await recordStepComplete( - "policies", - toSessionUpdates({ - sandboxName, - provider, - model, - policyPresets: recordedPolicyPresetsForSupport, - }), - ); - } else { - await startRecordedStep("policies", { - sandboxName, - provider, - model, - policyPresets: recordedPolicyPresetsForSupport, - }); - const appliedPolicyPresets = await setupPoliciesWithSelection(sandboxName, { - selectedPresets: - Array.isArray(recordedPolicyPresets) - ? recordedPolicyPresetsForSupport - : null, - enabledChannels: - selectedMessagingChannels.length > 0 - ? selectedMessagingChannels - : recordedMessagingChannels, - webSearchConfig, - provider, - webSearchSupported, - hermesToolGateways, - onSelection: (policyPresets) => { - onboardSession.updateSession((current: Session) => { - current.policyPresets = policyPresets; - return current; - }); - }, - }); - await recordStepComplete( - "policies", - toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }), - ); - } + session = policiesResult.session; if (agent) { ensureAgentDashboardForward(sandboxName, agent); diff --git a/src/lib/onboard/machine/handlers/policies.test.ts b/src/lib/onboard/machine/handlers/policies.test.ts new file mode 100644 index 0000000000..ee315d34f0 --- /dev/null +++ b/src/lib/onboard/machine/handlers/policies.test.ts @@ -0,0 +1,182 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session"; +import { handlePoliciesState, type PoliciesStateOptions } from "./policies"; + +type Agent = { name: string } | null; +type WebSearchConfig = { fetchEnabled: true }; + +function createDeps(overrides: Partial["deps"]> = {}) { + let session = createSession(); + const calls = { + load: vi.fn(() => session), + activeChannels: vi.fn(() => ["telegram"]), + smoke: vi.fn(), + listSetup: vi.fn(() => [{ name: "npm" }, { name: "pypi" }, { name: "github" }]), + applied: vi.fn(() => [] as string[]), + custom: vi.fn(() => [] as { name: string }[]), + clamp: vi.fn((names: string[]) => names.filter((name) => name !== "unsupported")), + mergeHermes: vi.fn((selected: string[], tools: string[]) => [...selected, ...tools]), + appliedCheck: vi.fn(() => false), + skipped: vi.fn(), + startStep: vi.fn(async () => undefined), + setupPolicies: vi.fn(async () => ["npm"]), + updateSession: vi.fn((mutator: (value: Session) => Session | void) => { + session = mutator(session) ?? session; + return session; + }), + complete: vi.fn(async () => session), + }; + return { + calls, + deps: { + loadSession: calls.load, + getActiveMessagingChannels: calls.activeChannels, + verifyCompatibleEndpointSandboxSmoke: calls.smoke, + listSetupPolicyPresets: calls.listSetup, + getAppliedPolicyPresets: calls.applied, + listCustomPolicyPresets: calls.custom, + clampSetupPolicyPresetNames: calls.clamp, + mergeRequiredHermesToolGatewayPolicyPresets: calls.mergeHermes, + arePolicyPresetsApplied: calls.appliedCheck, + skippedStepMessage: calls.skipped, + startRecordedStep: calls.startStep, + setupPoliciesWithSelection: calls.setupPolicies, + updateSession: calls.updateSession, + recordStepComplete: calls.complete, + toSessionUpdates: (updates: Record) => updates as SessionUpdates, + ...overrides, + }, + setSession(next: Session) { + session = next; + }, + getSession: () => session, + }; +} + +function baseOptions( + deps: PoliciesStateOptions["deps"], +): PoliciesStateOptions { + return { + resume: false, + sandboxName: "my-assistant", + provider: "provider", + model: "model", + endpointUrl: "https://example.com/v1", + credentialEnv: "NVIDIA_API_KEY", + selectedMessagingChannels: [], + webSearchConfig: null, + webSearchSupported: true, + hermesToolGateways: [], + agent: null, + deps, + }; +} + +describe("handlePoliciesState", () => { + it("runs compatible endpoint smoke before policy selection", async () => { + const { deps, calls } = createDeps(); + + await handlePoliciesState(baseOptions(deps)); + + expect(calls.smoke).toHaveBeenCalledWith({ + sandboxName: "my-assistant", + provider: "provider", + model: "model", + endpointUrl: "https://example.com/v1", + credentialEnv: "NVIDIA_API_KEY", + messagingChannels: ["telegram"], + agent: null, + }); + expect(calls.startStep).toHaveBeenCalledWith("policies", { + sandboxName: "my-assistant", + provider: "provider", + model: "model", + policyPresets: [], + }); + expect(calls.setupPolicies).toHaveBeenCalledWith( + "my-assistant", + expect.objectContaining({ + selectedPresets: null, + enabledChannels: [], + provider: "provider", + webSearchSupported: true, + }), + ); + expect(calls.complete).toHaveBeenCalledWith( + "policies", + expect.objectContaining({ policyPresets: ["npm"] }), + ); + }); + + it("uses recorded messaging channels when no active selection exists", async () => { + const session = createSession({ messagingChannels: ["slack"] }); + const { deps, calls, setSession } = createDeps(); + setSession(session); + + await handlePoliciesState(baseOptions(deps)); + + expect(calls.setupPolicies).toHaveBeenCalledWith( + "my-assistant", + expect.objectContaining({ enabledChannels: ["slack"] }), + ); + }); + + it("resumes policies when all recorded presets are already applied", async () => { + const session = createSession({ policyPresets: ["npm"] }); + const { deps, calls, setSession } = createDeps({ + arePolicyPresetsApplied: vi.fn(() => true), + }); + setSession(session); + + const result = await handlePoliciesState({ ...baseOptions(deps), resume: true }); + + expect(calls.skipped).toHaveBeenCalledWith("policies", "npm"); + expect(calls.setupPolicies).not.toHaveBeenCalled(); + expect(calls.complete).toHaveBeenCalledWith( + "policies", + expect.objectContaining({ policyPresets: ["npm"] }), + ); + expect(result.appliedPolicyPresets).toEqual(["npm"]); + }); + + it("clamps unsupported recorded presets before interactive setup", async () => { + const session = createSession({ policyPresets: ["npm", "unsupported"] }); + const { deps, calls, setSession } = createDeps(); + setSession(session); + + await handlePoliciesState(baseOptions(deps)); + + expect(calls.clamp).toHaveBeenCalledWith( + ["npm", "unsupported"], + expect.any(Array), + { webSearchSupported: true }, + expect.any(Set), + ); + expect(calls.setupPolicies).toHaveBeenCalledWith( + "my-assistant", + expect.objectContaining({ selectedPresets: ["npm"] }), + ); + }); + + it("merges required Hermes tool gateway presets into recorded selections", async () => { + const session = createSession({ policyPresets: ["npm"] }); + const { deps, calls, setSession } = createDeps(); + setSession(session); + + await handlePoliciesState({ ...baseOptions(deps), hermesToolGateways: ["github"] }); + + expect(calls.mergeHermes).toHaveBeenCalledWith( + ["npm"], + ["github"], + ["npm", "pypi", "github"], + ); + expect(calls.setupPolicies).toHaveBeenCalledWith( + "my-assistant", + expect.objectContaining({ selectedPresets: ["npm", "github"] }), + ); + }); +}); diff --git a/src/lib/onboard/machine/handlers/policies.ts b/src/lib/onboard/machine/handlers/policies.ts new file mode 100644 index 0000000000..ad35931cbf --- /dev/null +++ b/src/lib/onboard/machine/handlers/policies.ts @@ -0,0 +1,189 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session, SessionUpdates } from "../../../state/onboard-session"; + +export interface PolicyPresetEntry { + name: string; + [key: string]: unknown; +} + +export interface PoliciesStateOptions { + resume: boolean; + sandboxName: string; + provider: string; + model: string; + endpointUrl: string | null; + credentialEnv: string | null; + selectedMessagingChannels: string[]; + webSearchConfig: WebSearchConfig | null; + webSearchSupported: boolean; + hermesToolGateways: string[]; + agent: Agent; + deps: { + loadSession(): Session | null; + getActiveMessagingChannels(sandboxName: string): string[] | null | undefined; + verifyCompatibleEndpointSandboxSmoke(options: { + sandboxName: string; + provider: string; + model: string; + endpointUrl: string | null; + credentialEnv: string | null; + messagingChannels: string[]; + agent: Agent; + }): void; + listSetupPolicyPresets( + sandboxName: string, + options: { webSearchSupported: boolean }, + ): PolicyPresetEntry[]; + getAppliedPolicyPresets(sandboxName: string): string[]; + listCustomPolicyPresets(sandboxName: string): PolicyPresetEntry[]; + clampSetupPolicyPresetNames( + names: string[], + selectablePresets: PolicyPresetEntry[], + options: { webSearchSupported: boolean }, + customPresetNames: Set, + ): string[]; + mergeRequiredHermesToolGatewayPolicyPresets( + selectedPresets: string[], + hermesToolGateways: string[], + selectablePresetNames: string[], + ): string[]; + arePolicyPresetsApplied(sandboxName: string, selectedPresets: string[]): boolean; + skippedStepMessage(stepName: string, detail?: string | null): void; + startRecordedStep( + stepName: string, + updates: { sandboxName: string; provider: string; model: string; policyPresets: string[] }, + ): Promise; + setupPoliciesWithSelection( + sandboxName: string, + options: { + selectedPresets: string[] | null; + enabledChannels: string[]; + webSearchConfig: WebSearchConfig | null; + provider: string; + webSearchSupported: boolean; + hermesToolGateways: string[]; + onSelection: (policyPresets: string[]) => void; + }, + ): Promise; + updateSession(mutator: (session: Session) => Session | void): Session; + recordStepComplete(stepName: string, updates: SessionUpdates): Promise; + toSessionUpdates(updates: Record): SessionUpdates; + }; +} + +export interface PoliciesStateResult { + session: Session | null; + recordedMessagingChannels: string[]; + appliedPolicyPresets: string[]; +} + +export async function handlePoliciesState({ + resume, + sandboxName, + provider, + model, + endpointUrl, + credentialEnv, + selectedMessagingChannels, + webSearchConfig, + webSearchSupported, + hermesToolGateways, + agent, + deps, +}: PoliciesStateOptions): Promise { + const latestSession = deps.loadSession(); + const recordedPolicyPresets = Array.isArray(latestSession?.policyPresets) + ? latestSession.policyPresets + : null; + const recordedMessagingChannels = Array.isArray(latestSession?.messagingChannels) + ? latestSession.messagingChannels + : []; + const activeMessagingChannels = deps.getActiveMessagingChannels(sandboxName); + deps.verifyCompatibleEndpointSandboxSmoke({ + sandboxName, + provider, + model, + endpointUrl, + credentialEnv, + messagingChannels: Array.isArray(activeMessagingChannels) ? activeMessagingChannels : [], + agent, + }); + + const policyPresetSupportOptions = { webSearchSupported }; + const selectablePolicyPresetsForSupport = [ + ...deps.listSetupPolicyPresets(sandboxName, policyPresetSupportOptions), + ...deps.getAppliedPolicyPresets(sandboxName).map((name) => ({ name })), + ]; + const customPolicyPresetNames = new Set( + deps.listCustomPolicyPresets(sandboxName).map((preset) => preset.name), + ); + let recordedPolicyPresetsForSupport = deps.clampSetupPolicyPresetNames( + recordedPolicyPresets || [], + selectablePolicyPresetsForSupport, + policyPresetSupportOptions, + customPolicyPresetNames, + ); + if (recordedPolicyPresets) { + recordedPolicyPresetsForSupport = deps.mergeRequiredHermesToolGatewayPolicyPresets( + recordedPolicyPresetsForSupport, + hermesToolGateways, + selectablePolicyPresetsForSupport.map((preset) => preset.name), + ); + } + const recordedPolicyPresetsHaveUnsupported = + Array.isArray(recordedPolicyPresets) && + recordedPolicyPresetsForSupport.length !== recordedPolicyPresets.length; + const resumePolicies = + resume && + !recordedPolicyPresetsHaveUnsupported && + deps.arePolicyPresetsApplied(sandboxName, recordedPolicyPresetsForSupport); + + let appliedPolicyPresets = recordedPolicyPresetsForSupport; + let session: Session | null; + if (resumePolicies) { + deps.skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", ")); + session = await deps.recordStepComplete( + "policies", + deps.toSessionUpdates({ + sandboxName, + provider, + model, + policyPresets: recordedPolicyPresetsForSupport, + }), + ); + } else { + await deps.startRecordedStep("policies", { + sandboxName, + provider, + model, + policyPresets: recordedPolicyPresetsForSupport, + }); + appliedPolicyPresets = await deps.setupPoliciesWithSelection(sandboxName, { + selectedPresets: Array.isArray(recordedPolicyPresets) + ? recordedPolicyPresetsForSupport + : null, + enabledChannels: + selectedMessagingChannels.length > 0 + ? selectedMessagingChannels + : recordedMessagingChannels, + webSearchConfig, + provider, + webSearchSupported, + hermesToolGateways, + onSelection: (policyPresets) => { + deps.updateSession((current) => { + current.policyPresets = policyPresets; + return current; + }); + }, + }); + session = await deps.recordStepComplete( + "policies", + deps.toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }), + ); + } + + return { session, recordedMessagingChannels, appliedPolicyPresets }; +} From d6585528c5895eb8a74f89253299aaa9d6fd5913 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 00:10:05 -0700 Subject: [PATCH 13/54] refactor(cli): extract onboard finalization handler --- src/lib/onboard.ts | 124 +++++++---------- .../machine/handlers/finalization.test.ts | 125 ++++++++++++++++++ .../onboard/machine/handlers/finalization.ts | 90 +++++++++++++ 3 files changed, 262 insertions(+), 77 deletions(-) create mode 100644 src/lib/onboard/machine/handlers/finalization.test.ts create mode 100644 src/lib/onboard/machine/handlers/finalization.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index e406d8ca0c..33428431c0 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -281,6 +281,7 @@ const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup"); +const { handleFinalizationState }: typeof import("./onboard/machine/handlers/finalization") = require("./onboard/machine/handlers/finalization"); const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); const { handlePoliciesState }: typeof import("./onboard/machine/handlers/policies") = require("./onboard/machine/handlers/policies"); const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); @@ -9724,88 +9725,57 @@ async function onboard(opts: OnboardOptions = {}): Promise { }); session = policiesResult.session; - if (agent) { - ensureAgentDashboardForward(sandboxName, agent); - } - - await recordSessionComplete( - toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), - ); - completed = true; - // Onboarding finished successfully. Delete the legacy plaintext - // credentials.json only when every staged *value* was actually pushed - // to the gateway in this run. A successful upsert under the same - // env-key name with a different value (e.g. vllm-local upserting - // `OPENAI_API_KEY: "dummy"` while the legacy file held a real - // `sk-…` cloud key) does not count as a migration — the gateway - // never received the legacy secret, so unlinking the file would - // strand the user's only copy. - const allStagedMigrated = - stagedLegacyKeys.length > 0 && stagedLegacyKeys.every((k) => migratedLegacyKeys.has(k)); - if (allStagedMigrated) { - removeLegacyCredentialsFile(); - } else if (stagedLegacyKeys.length > 0) { - const unmigrated = stagedLegacyKeys.filter((k) => !migratedLegacyKeys.has(k)); - console.error( - ` Kept ~/.nemoclaw/credentials.json: ${String(unmigrated.length)} ` + - `legacy credential(s) were not migrated verbatim to the gateway in this run ` + - `(${unmigrated.join(", ")}). Re-run onboard with the relevant ` + - `providers/channels enabled to migrate them, then the file is removed automatically.`, - ); - } - // Sweep stale host files left over from older NemoClaw versions — - // e.g. an empty/orphaned ~/.nemoclaw/credentials.json from upgrades - // before the credentials-gateway move (issue #3105). Each registered - // entry enforces its own safety guards; this call is a no-op when - // every target is already clean. - cleanupStaleHostFiles(); - - // Step [8/8] policy-apply restarts the sandbox container; the OpenClaw - // gateway inside the new container is launched lazily (normally by the - // first `nemoclaw connect`). Bring it up explicitly here so the - // verifyDeployment block below does not race the post-policy startup and - // surface a false "gateway crashed during startup" warning. The helper - // is a no-op when the gateway is already running. Fixes #3573. - const processRecovery: typeof import("./actions/sandbox/process-recovery") = - require("./actions/sandbox/process-recovery"); - processRecovery.checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); - - // Post-deployment verification — confirm the full delivery chain is - // operational before telling the user "YOUR AGENT IS LIVE". Fixes #2342. - const verifyDeploymentModule: typeof import("./verify-deployment") = require("./verify-deployment"); - const _verifyChatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${DASHBOARD_PORT}`; - const verifyChain = buildChain({ chatUiUrl: _verifyChatUiUrl, isWsl: isWsl(), wslHostAddress: getWslHostAddress() }); - const verificationResult = await verifyDeploymentModule.verifyDeployment( + await handleFinalizationState({ sandboxName, - verifyChain, - { - executeSandboxCommand: (name: string, script: string) => { - return executeSandboxCommandForVerification(name, script); + model, + provider, + nimContainer, + agent, + hermesAuthMethod, + hermesToolGateways, + selectedMessagingChannels, + stagedLegacyKeys, + migratedLegacyKeys, + deps: { + ensureAgentDashboardForward, + recordSessionComplete, + toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), + removeLegacyCredentialsFile, + cleanupStaleHostFiles, + checkAndRecoverSandboxProcesses: (name, options) => { + const processRecovery: typeof import("./actions/sandbox/process-recovery") = + require("./actions/sandbox/process-recovery"); + processRecovery.checkAndRecoverSandboxProcesses(name, options); }, - probeHostPort: (port: number, probePath: string) => { - const result = runCapture( - ["curl", "-so", "/dev/null", "-w", "%{http_code}", "--max-time", "3", - `http://127.0.0.1:${port}${probePath}`], - { ignoreError: true }, - ); - return parseInt(result.trim(), 10) || 0; + getChatUiUrl: () => process.env.CHAT_UI_URL || `http://127.0.0.1:${DASHBOARD_PORT}`, + buildVerifyChain: (chatUiUrl) => + buildChain({ chatUiUrl, isWsl: isWsl(), wslHostAddress: getWslHostAddress() }), + verifyDeployment: async (name, chain) => { + const verifyDeploymentModule: typeof import("./verify-deployment") = require("./verify-deployment"); + return verifyDeploymentModule.verifyDeployment(name, chain, { + executeSandboxCommand: (sandbox: string, script: string) => + executeSandboxCommandForVerification(sandbox, script), + probeHostPort: (port: number, probePath: string) => { + const result = runCapture( + ["curl", "-so", "/dev/null", "-w", "%{http_code}", "--max-time", "3", `http://127.0.0.1:${port}${probePath}`], + { ignoreError: true }, + ); + return parseInt(result.trim(), 10) || 0; + }, + captureForwardList: () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }) || null, + getMessagingChannels: () => selectedMessagingChannels || [], + providerExistsInGateway: (providerName: string) => providerExistsInGateway(providerName), + }); }, - captureForwardList: () => { - const output = runCaptureOpenshell(["forward", "list"], { ignoreError: true }); - return output || null; + formatVerificationDiagnostics: (result) => { + const verifyDeploymentModule: typeof import("./verify-deployment") = require("./verify-deployment"); + return verifyDeploymentModule.formatVerificationDiagnostics(result); }, - getMessagingChannels: (_name: string) => selectedMessagingChannels || [], - providerExistsInGateway: (providerName: string) => providerExistsInGateway(providerName), + printDashboard, + error: (message) => console.error(message), + log: (message) => console.log(message), }, - ); - - // Print verification diagnostics - const diagLines = verifyDeploymentModule.formatVerificationDiagnostics(verificationResult); - for (const line of diagLines) { - console.log(line); - } - - printDashboard(sandboxName, model, provider, nimContainer, agent); + }); } finally { releaseOnboardLock(); ONBOARD_RUNTIME = null; diff --git a/src/lib/onboard/machine/handlers/finalization.test.ts b/src/lib/onboard/machine/handlers/finalization.test.ts new file mode 100644 index 0000000000..a1617c4366 --- /dev/null +++ b/src/lib/onboard/machine/handlers/finalization.test.ts @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type SessionUpdates } from "../../../state/onboard-session"; +import { handleFinalizationState, type FinalizationStateOptions } from "./finalization"; + +type Agent = { name: string } | null; +type VerifyChain = { port: number }; +type VerificationResult = { ok: boolean }; + +function createDeps(overrides: Partial["deps"]> = {}) { + const calls = { + ensureAgentDashboard: vi.fn(() => 18789), + complete: vi.fn(async () => createSession({ status: "complete" })), + removeLegacy: vi.fn(), + cleanupHost: vi.fn(), + recoverProcesses: vi.fn(), + getChatUiUrl: vi.fn(() => "http://127.0.0.1:18789"), + buildChain: vi.fn(() => ({ port: 18789 })), + verify: vi.fn(async () => ({ ok: true })), + diagnostics: vi.fn(() => [" ✓ verified"]), + dashboard: vi.fn(), + error: vi.fn(), + log: vi.fn(), + }; + return { + calls, + deps: { + ensureAgentDashboardForward: calls.ensureAgentDashboard, + recordSessionComplete: calls.complete, + toSessionUpdates: (updates: Record) => updates as SessionUpdates, + removeLegacyCredentialsFile: calls.removeLegacy, + cleanupStaleHostFiles: calls.cleanupHost, + checkAndRecoverSandboxProcesses: calls.recoverProcesses, + getChatUiUrl: calls.getChatUiUrl, + buildVerifyChain: calls.buildChain, + verifyDeployment: calls.verify, + formatVerificationDiagnostics: calls.diagnostics, + printDashboard: calls.dashboard, + error: calls.error, + log: calls.log, + ...overrides, + }, + }; +} + +function baseOptions( + deps: FinalizationStateOptions["deps"], +): FinalizationStateOptions { + return { + sandboxName: "my-assistant", + model: "model", + provider: "provider", + nimContainer: null, + agent: null, + hermesAuthMethod: null, + hermesToolGateways: [], + selectedMessagingChannels: ["telegram"], + stagedLegacyKeys: [], + migratedLegacyKeys: new Set(), + deps, + }; +} + +describe("handleFinalizationState", () => { + it("completes the session, verifies deployment, and prints the dashboard", async () => { + const { deps, calls } = createDeps(); + + const result = await handleFinalizationState(baseOptions(deps)); + + expect(calls.complete).toHaveBeenCalledWith({ + sandboxName: "my-assistant", + provider: "provider", + model: "model", + hermesAuthMethod: null, + hermesToolGateways: [], + }); + expect(calls.cleanupHost).toHaveBeenCalledOnce(); + expect(calls.recoverProcesses).toHaveBeenCalledWith("my-assistant", { quiet: true }); + expect(calls.buildChain).toHaveBeenCalledWith("http://127.0.0.1:18789"); + expect(calls.verify).toHaveBeenCalledWith("my-assistant", { port: 18789 }); + expect(calls.log).toHaveBeenCalledWith(" ✓ verified"); + expect(calls.dashboard).toHaveBeenCalledWith("my-assistant", "model", "provider", null, null); + expect(result.verificationDiagnostics).toEqual([" ✓ verified"]); + }); + + it("ensures agent dashboard forwarding before completion for non-OpenClaw agents", async () => { + const { deps, calls } = createDeps(); + const agent = { name: "hermes" }; + + await handleFinalizationState({ ...baseOptions(deps), agent }); + + expect(calls.ensureAgentDashboard).toHaveBeenCalledWith("my-assistant", agent); + expect(calls.dashboard).toHaveBeenCalledWith("my-assistant", "model", "provider", null, agent); + }); + + it("removes legacy credentials only when all staged values migrated", async () => { + const { deps, calls } = createDeps(); + + await handleFinalizationState({ + ...baseOptions(deps), + stagedLegacyKeys: ["NVIDIA_API_KEY", "SLACK_BOT_TOKEN"], + migratedLegacyKeys: new Set(["NVIDIA_API_KEY", "SLACK_BOT_TOKEN"]), + }); + + expect(calls.removeLegacy).toHaveBeenCalledOnce(); + expect(calls.error).not.toHaveBeenCalled(); + }); + + it("keeps legacy credentials and warns when migration is incomplete", async () => { + const { deps, calls } = createDeps(); + + const result = await handleFinalizationState({ + ...baseOptions(deps), + stagedLegacyKeys: ["NVIDIA_API_KEY", "SLACK_BOT_TOKEN"], + migratedLegacyKeys: new Set(["NVIDIA_API_KEY"]), + }); + + expect(calls.removeLegacy).not.toHaveBeenCalled(); + expect(calls.error).toHaveBeenCalledWith(expect.stringContaining("SLACK_BOT_TOKEN")); + expect(result.unmigratedLegacyKeys).toEqual(["SLACK_BOT_TOKEN"]); + }); +}); diff --git a/src/lib/onboard/machine/handlers/finalization.ts b/src/lib/onboard/machine/handlers/finalization.ts new file mode 100644 index 0000000000..0cdd3735ca --- /dev/null +++ b/src/lib/onboard/machine/handlers/finalization.ts @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session, SessionUpdates } from "../../../state/onboard-session"; + +export interface FinalizationStateOptions { + sandboxName: string; + model: string; + provider: string; + nimContainer: string | null; + agent: Agent; + hermesAuthMethod: string | null; + hermesToolGateways: string[]; + selectedMessagingChannels: string[]; + stagedLegacyKeys: readonly string[]; + migratedLegacyKeys: ReadonlySet; + deps: { + ensureAgentDashboardForward(sandboxName: string, agent: NonNullable): number; + recordSessionComplete(updates: SessionUpdates): Promise; + toSessionUpdates(updates: Record): SessionUpdates; + removeLegacyCredentialsFile(): void; + cleanupStaleHostFiles(): void; + checkAndRecoverSandboxProcesses(sandboxName: string, options: { quiet: boolean }): void; + getChatUiUrl(): string; + buildVerifyChain(chatUiUrl: string): VerifyChain; + verifyDeployment(sandboxName: string, chain: VerifyChain): Promise; + formatVerificationDiagnostics(result: VerificationResult): string[]; + printDashboard( + sandboxName: string, + model: string, + provider: string, + nimContainer: string | null, + agent: Agent, + ): void; + error(message?: string): void; + log(message?: string): void; + }; +} + +export interface FinalizationStateResult { + session: Session; + unmigratedLegacyKeys: string[]; + verificationDiagnostics: string[]; +} + +export async function handleFinalizationState({ + sandboxName, + model, + provider, + nimContainer, + agent, + hermesAuthMethod, + hermesToolGateways, + selectedMessagingChannels: _selectedMessagingChannels, + stagedLegacyKeys, + migratedLegacyKeys, + deps, +}: FinalizationStateOptions): Promise { + if (agent) deps.ensureAgentDashboardForward(sandboxName, agent as NonNullable); + + const session = await deps.recordSessionComplete( + deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), + ); + + const allStagedMigrated = + stagedLegacyKeys.length > 0 && stagedLegacyKeys.every((key) => migratedLegacyKeys.has(key)); + const unmigratedLegacyKeys = stagedLegacyKeys.filter((key) => !migratedLegacyKeys.has(key)); + if (allStagedMigrated) { + deps.removeLegacyCredentialsFile(); + } else if (stagedLegacyKeys.length > 0) { + deps.error( + ` Kept ~/.nemoclaw/credentials.json: ${String(unmigratedLegacyKeys.length)} ` + + `legacy credential(s) were not migrated verbatim to the gateway in this run ` + + `(${unmigratedLegacyKeys.join(", ")}). Re-run onboard with the relevant ` + + `providers/channels enabled to migrate them, then the file is removed automatically.`, + ); + } + + deps.cleanupStaleHostFiles(); + deps.checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); + + const verifyChain = deps.buildVerifyChain(deps.getChatUiUrl()); + const verificationResult = await deps.verifyDeployment(sandboxName, verifyChain); + const verificationDiagnostics = deps.formatVerificationDiagnostics(verificationResult); + for (const line of verificationDiagnostics) deps.log(line); + + deps.printDashboard(sandboxName, model, provider, nimContainer, agent); + + return { session, unmigratedLegacyKeys, verificationDiagnostics }; +} From 4385d20b0bb38db885b1eb5008cd18868d82adc5 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 00:28:13 -0700 Subject: [PATCH 14/54] refactor(cli): route agent setup session writes through context --- src/lib/agent/onboard.test.ts | 12 ++++++++++++ src/lib/agent/onboard.ts | 19 +++++++++++++------ src/lib/onboard.ts | 3 +++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/lib/agent/onboard.test.ts b/src/lib/agent/onboard.test.ts index b71a82a83e..fd82e6d1ce 100644 --- a/src/lib/agent/onboard.test.ts +++ b/src/lib/agent/onboard.test.ts @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import fs from "node:fs"; +import path from "node:path"; + import { describe, it, expect, beforeEach, afterEach, afterAll, vi } from "vitest"; // Import from compiled dist/ so coverage is attributed correctly. import { @@ -129,6 +132,15 @@ describe("printDashboardUi — regression for #2078 (port 8642 is not a chat UI) }); }); +describe("agent setup session boundaries", () => { + it("does not write onboard session state directly", () => { + const source = fs.readFileSync(path.join(__dirname, "onboard.ts"), "utf8"); + + expect(source).not.toContain("../state/onboard-session"); + expect(source).not.toMatch(/onboardSession\.markStep/); + }); +}); + describe("handleAgentSetup guards", () => { it("accepts an executable configured binary path when PATH lookup is empty", () => { let script = ""; diff --git a/src/lib/agent/onboard.ts b/src/lib/agent/onboard.ts index f08c32b9c6..b2ee0e7bb1 100644 --- a/src/lib/agent/onboard.ts +++ b/src/lib/agent/onboard.ts @@ -13,7 +13,6 @@ import { dockerBuild, dockerImageInspect } from "../adapters/docker"; import { getAgentBranding } from "../cli/branding"; import { getProviderSelectionConfig } from "../inference/config"; import type { JsonObject as LooseObject } from "../core/json-types"; -import * as onboardSession from "../state/onboard-session"; import { ROOT, redact, run, shellQuote } from "../runner"; import { buildLocalBaseTag, @@ -32,6 +31,8 @@ export interface OnboardContext { writeSandboxConfigSyncFile: (script: string) => string; cleanupTempDir: (file: string, prefix: string) => void; startRecordedStep: (stepName: string, updates: LooseObject) => Promise; + recordStepComplete: (stepName: string, updates: LooseObject) => Promise; + recordStepFailed: (stepName: string, message: string | null) => Promise; skippedStepMessage: (stepName: string, sandboxName: string) => void; } @@ -350,13 +351,14 @@ export function collectHermesStartupDiagnostics( /** * Record and print an agent setup failure before exiting the onboarding flow. */ -function failAgentSetup( +async function failAgentSetup( sandboxName: string, agent: AgentDefinition, message: string, details: string[] = [], -): never { - onboardSession.markStepFailed( + recordStepFailed: OnboardContext["recordStepFailed"], +): Promise { + await recordStepFailed( "agent_setup", details.length > 0 ? `${message}\n${details.join("\n")}` : message, ); @@ -406,6 +408,8 @@ export async function handleAgentSetup( writeSandboxConfigSyncFile, cleanupTempDir, startRecordedStep, + recordStepComplete, + recordStepFailed, skippedStepMessage, } = ctx; @@ -418,7 +422,7 @@ export async function handleAgentSetup( ); if (isHealthProbeOk(result)) { skippedStepMessage("agent_setup", sandboxName); - onboardSession.markStepComplete("agent_setup", { sandboxName, provider, model }); + await recordStepComplete("agent_setup", { sandboxName, provider, model }); return; } } @@ -433,6 +437,8 @@ export async function handleAgentSetup( sandboxName, agent, describeAgentBinaryFailure(sandboxName, agent, binaryAvailability), + [], + recordStepFailed, ); } @@ -486,13 +492,14 @@ export async function handleAgentSetup( agent, `${agent.displayName} gateway did not respond within ${timeoutSecs}s`, diagnostics, + recordStepFailed, ); } } else { console.log(` \u2713 ${agent.displayName} configured inside sandbox`); } - onboardSession.markStepComplete("agent_setup", { sandboxName, provider, model }); + await recordStepComplete("agent_setup", { sandboxName, provider, model }); } /** diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 33428431c0..e80523d35c 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -9674,6 +9674,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { writeSandboxConfigSyncFile, cleanupTempDir, startRecordedStep, + recordStepComplete, + recordStepFailed: (stepName: string, message: string | null) => + getOnboardRuntime().markStepFailed(stepName, message), skippedStepMessage, }), ensureAgentDashboardForward, From 98ac89edd7cafb327cbd75a0a7690d4eae173cfd Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 00:44:17 -0700 Subject: [PATCH 15/54] refactor(cli): emit resume skip repair events --- src/lib/onboard.ts | 26 ++++++++++++++++ .../machine/handlers/agent-setup.test.ts | 6 ++++ .../onboard/machine/handlers/agent-setup.ts | 2 ++ .../onboard/machine/handlers/gateway.test.ts | 10 +++++++ src/lib/onboard/machine/handlers/gateway.ts | 3 ++ .../onboard/machine/handlers/policies.test.ts | 6 ++++ src/lib/onboard/machine/handlers/policies.ts | 5 ++++ .../machine/handlers/preflight.test.ts | 5 ++++ src/lib/onboard/machine/handlers/preflight.ts | 2 ++ .../handlers/provider-inference.test.ts | 22 ++++++++++++++ .../machine/handlers/provider-inference.ts | 30 +++++++++++++++++++ .../onboard/machine/handlers/sandbox.test.ts | 16 ++++++++++ src/lib/onboard/machine/handlers/sandbox.ts | 14 +++++++++ 13 files changed, 147 insertions(+) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index e80523d35c..48c41740cc 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -9059,6 +9059,24 @@ async function recordStepSkipped(stepName: string): Promise { return getOnboardRuntime().markStepSkipped(stepName); } +async function recordStateSkipped( + state: import("./onboard/machine/types").OnboardMachineState, + metadata: Record | null = null, +): Promise { + return getOnboardRuntime().markSkipped(state, metadata); +} + +async function recordRepairEvent( + type: "state.repair.started" | "state.repair.completed" | "state.repair.failed", + options: { + state?: import("./onboard/machine/types").OnboardMachineState | null; + error?: string | null; + metadata?: Record | null; + } = {}, +): Promise { + return getOnboardRuntime().emitRepairEvent(type, options); +} + async function recordSessionComplete(updates: SessionUpdates = {}): Promise { return getOnboardRuntime().completeSession(updates); } @@ -9430,6 +9448,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { resolveSandboxGpuConfig, validateSandboxGpuPreflight, skippedStepMessage, + recordStateSkipped, startRecordedStep, recordStepComplete, updateSession: onboardSession.updateSession, @@ -9497,6 +9516,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { retireLegacyGatewayForDockerDriverUpgrade, destroyGatewayRuntimeForGpuReuse: () => destroyGateway(() => undefined, () => false), skippedStepMessage, + recordStateSkipped, note, startRecordedStep, startGateway, @@ -9552,6 +9572,8 @@ async function onboard(opts: OnboardOptions = {}): Promise { recordStepComplete, toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), skippedStepMessage, + recordStateSkipped, + recordRepairEvent, hydrateCredentialEnv, repairLocalInferenceSystemdOverrideOrExit, isNonInteractive, @@ -9644,6 +9666,8 @@ async function onboard(opts: OnboardOptions = {}): Promise { recordStepComplete, toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), skippedStepMessage, + recordStateSkipped, + recordRepairEvent, error: (message) => console.error(message), exitProcess: (code) => process.exit(code), }, @@ -9683,6 +9707,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { recordStepSkipped, isOpenclawReady, skippedStepMessage, + recordStateSkipped, startRecordedStep, setupOpenclaw, recordStepComplete, @@ -9719,6 +9744,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { mergeRequiredHermesToolGatewayPolicyPresets, arePolicyPresetsApplied, skippedStepMessage, + recordStateSkipped, startRecordedStep, setupPoliciesWithSelection, updateSession: onboardSession.updateSession, diff --git a/src/lib/onboard/machine/handlers/agent-setup.test.ts b/src/lib/onboard/machine/handlers/agent-setup.test.ts index fd9f1d0410..99255f622d 100644 --- a/src/lib/onboard/machine/handlers/agent-setup.test.ts +++ b/src/lib/onboard/machine/handlers/agent-setup.test.ts @@ -16,6 +16,7 @@ function createDeps(overrides: Partial["deps"]> = skipped: vi.fn(async () => createSession()), openclawReady: vi.fn(() => false), skippedMessage: vi.fn(), + recordSkip: vi.fn(async () => createSession()), startStep: vi.fn(async () => undefined), setupOpenclaw: vi.fn(async () => undefined), complete: vi.fn(async () => createSession()), @@ -29,6 +30,7 @@ function createDeps(overrides: Partial["deps"]> = recordStepSkipped: calls.skipped, isOpenclawReady: calls.openclawReady, skippedStepMessage: calls.skippedMessage, + recordStateSkipped: calls.recordSkip, startRecordedStep: calls.startStep, setupOpenclaw: calls.setupOpenclaw, recordStepComplete: calls.complete, @@ -83,6 +85,10 @@ describe("handleAgentSetupState", () => { await handleAgentSetupState({ ...baseOptions(deps), resume: true }); expect(calls.skippedMessage).toHaveBeenCalledWith("openclaw", "my-assistant"); + expect(calls.recordSkip).toHaveBeenCalledWith("openclaw", { + reason: "resume", + sandboxName: "my-assistant", + }); expect(calls.startStep).not.toHaveBeenCalled(); expect(calls.setupOpenclaw).not.toHaveBeenCalled(); expect(calls.complete).toHaveBeenCalledWith( diff --git a/src/lib/onboard/machine/handlers/agent-setup.ts b/src/lib/onboard/machine/handlers/agent-setup.ts index 40330711ad..a24a6e9811 100644 --- a/src/lib/onboard/machine/handlers/agent-setup.ts +++ b/src/lib/onboard/machine/handlers/agent-setup.ts @@ -27,6 +27,7 @@ export interface AgentSetupStateOptions { recordStepSkipped(stepName: string): Promise; isOpenclawReady(sandboxName: string): boolean; skippedStepMessage(stepName: string, detail?: string | null): void; + recordStateSkipped(state: "openclaw", metadata?: Record | null): Promise; startRecordedStep( stepName: string, updates: { sandboxName: string; provider: string; model: string }, @@ -70,6 +71,7 @@ export async function handleAgentSetupState({ const resumeOpenclaw = resume && sandboxName && deps.isOpenclawReady(sandboxName); if (resumeOpenclaw) { deps.skippedStepMessage("openclaw", sandboxName); + await deps.recordStateSkipped("openclaw", { reason: "resume", sandboxName }); session = await deps.recordStepComplete( "openclaw", deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), diff --git a/src/lib/onboard/machine/handlers/gateway.test.ts b/src/lib/onboard/machine/handlers/gateway.test.ts index 266ba10360..eceee5c11f 100644 --- a/src/lib/onboard/machine/handlers/gateway.test.ts +++ b/src/lib/onboard/machine/handlers/gateway.test.ts @@ -25,6 +25,7 @@ function createDeps(overrides: Partial["deps"]> = {}) { retireLegacy: vi.fn(), destroyGpuRuntime: vi.fn(() => true), skipped: vi.fn(), + recordSkip: vi.fn(async () => createSession()), note: vi.fn(), startStep: vi.fn(async () => undefined), startGateway: vi.fn(async () => undefined), @@ -51,6 +52,7 @@ function createDeps(overrides: Partial["deps"]> = {}) { retireLegacyGatewayForDockerDriverUpgrade: calls.retireLegacy, destroyGatewayRuntimeForGpuReuse: calls.destroyGpuRuntime, skippedStepMessage: calls.skipped, + recordStateSkipped: calls.recordSkip, note: calls.note, startRecordedStep: calls.startStep, startGateway: calls.startGateway, @@ -99,6 +101,10 @@ describe("handleGatewayState", () => { await handleGatewayState(baseOptions(deps, "healthy")); expect(calls.skipped).toHaveBeenCalledWith("gateway", "running", "reuse"); + expect(calls.recordSkip).toHaveBeenCalledWith("gateway", { + reason: "reuse", + reuseState: "healthy", + }); expect(calls.note).toHaveBeenCalledWith(" Reusing healthy NemoClaw gateway."); expect(calls.startGateway).not.toHaveBeenCalled(); expect(calls.complete).toHaveBeenCalledWith("gateway"); @@ -112,6 +118,10 @@ describe("handleGatewayState", () => { await handleGatewayState({ ...baseOptions(deps, "healthy", session), resume: true }); expect(calls.skipped).toHaveBeenCalledWith("gateway", "running"); + expect(calls.recordSkip).toHaveBeenCalledWith("gateway", { + reason: "resume", + reuseState: "healthy", + }); expect(calls.startGateway).not.toHaveBeenCalled(); }); diff --git a/src/lib/onboard/machine/handlers/gateway.ts b/src/lib/onboard/machine/handlers/gateway.ts index 026c26e1b4..48fab3c4e7 100644 --- a/src/lib/onboard/machine/handlers/gateway.ts +++ b/src/lib/onboard/machine/handlers/gateway.ts @@ -51,6 +51,7 @@ export interface GatewayStateOptions { detail?: string | null, reason?: "resume" | "reuse", ): void; + recordStateSkipped(state: "gateway", metadata?: Record | null): Promise; note(message: string): void; startRecordedStep(stepName: string): Promise; startGateway(gpu: Gpu, options: { gpuPassthrough: boolean }): Promise; @@ -147,9 +148,11 @@ export async function handleGatewayState({ const resumeGateway = resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway; if (resumeGateway) { deps.skippedStepMessage("gateway", "running"); + await deps.recordStateSkipped("gateway", { reason: "resume", reuseState: gatewayReuseState }); session = await deps.recordStepComplete("gateway"); } else if (!resume && canReuseHealthyGateway) { deps.skippedStepMessage("gateway", "running", "reuse"); + await deps.recordStateSkipped("gateway", { reason: "reuse", reuseState: gatewayReuseState }); deps.note(" Reusing healthy NemoClaw gateway."); session = await deps.recordStepComplete("gateway"); } else { diff --git a/src/lib/onboard/machine/handlers/policies.test.ts b/src/lib/onboard/machine/handlers/policies.test.ts index ee315d34f0..56782d1751 100644 --- a/src/lib/onboard/machine/handlers/policies.test.ts +++ b/src/lib/onboard/machine/handlers/policies.test.ts @@ -22,6 +22,7 @@ function createDeps(overrides: Partial [...selected, ...tools]), appliedCheck: vi.fn(() => false), skipped: vi.fn(), + recordSkip: vi.fn(async () => session), startStep: vi.fn(async () => undefined), setupPolicies: vi.fn(async () => ["npm"]), updateSession: vi.fn((mutator: (value: Session) => Session | void) => { @@ -43,6 +44,7 @@ function createDeps(overrides: Partial { const result = await handlePoliciesState({ ...baseOptions(deps), resume: true }); expect(calls.skipped).toHaveBeenCalledWith("policies", "npm"); + expect(calls.recordSkip).toHaveBeenCalledWith("policies", { + reason: "resume", + policyPresets: ["npm"], + }); expect(calls.setupPolicies).not.toHaveBeenCalled(); expect(calls.complete).toHaveBeenCalledWith( "policies", diff --git a/src/lib/onboard/machine/handlers/policies.ts b/src/lib/onboard/machine/handlers/policies.ts index ad35931cbf..cbc452d23e 100644 --- a/src/lib/onboard/machine/handlers/policies.ts +++ b/src/lib/onboard/machine/handlers/policies.ts @@ -51,6 +51,7 @@ export interface PoliciesStateOptions { ): string[]; arePolicyPresetsApplied(sandboxName: string, selectedPresets: string[]): boolean; skippedStepMessage(stepName: string, detail?: string | null): void; + recordStateSkipped(state: "policies", metadata?: Record | null): Promise; startRecordedStep( stepName: string, updates: { sandboxName: string; provider: string; model: string; policyPresets: string[] }, @@ -144,6 +145,10 @@ export async function handlePoliciesState({ let session: Session | null; if (resumePolicies) { deps.skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", ")); + await deps.recordStateSkipped("policies", { + reason: "resume", + policyPresets: recordedPolicyPresetsForSupport, + }); session = await deps.recordStepComplete( "policies", deps.toSessionUpdates({ diff --git a/src/lib/onboard/machine/handlers/preflight.test.ts b/src/lib/onboard/machine/handlers/preflight.test.ts index fa4b859915..8916124ec8 100644 --- a/src/lib/onboard/machine/handlers/preflight.test.ts +++ b/src/lib/onboard/machine/handlers/preflight.test.ts @@ -50,6 +50,7 @@ function createDeps(overrides: Partial session), startRecordedStep: vi.fn(async () => undefined), recordStepComplete: vi.fn(async () => session), updateSession: vi.fn((mutator: (value: Session) => Session | void) => { @@ -125,6 +126,10 @@ describe("handlePreflightState", () => { }); expect(harness.deps.skippedStepMessage).toHaveBeenCalledWith("preflight", "cached"); + expect(harness.deps.recordStateSkipped).toHaveBeenCalledWith("preflight", { + reason: "resume", + validation: "gpu-cdi", + }); expect(harness.deps.detectGpu).toHaveBeenCalledOnce(); expect(harness.deps.runPreflight).not.toHaveBeenCalled(); expect(harness.deps.startRecordedStep).not.toHaveBeenCalled(); diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts index cc5bd6633d..e5d91a1c06 100644 --- a/src/lib/onboard/machine/handlers/preflight.ts +++ b/src/lib/onboard/machine/handlers/preflight.ts @@ -48,6 +48,7 @@ export interface PreflightStateOptions< ): Config; validateSandboxGpuPreflight(config: Config): void; skippedStepMessage(stepName: string, detail?: string | null): void; + recordStateSkipped(state: "preflight", metadata?: Record | null): Promise; startRecordedStep(stepName: string): Promise; recordStepComplete(stepName: string): Promise; updateSession(mutator: (session: Session) => Session | void): Session; @@ -106,6 +107,7 @@ export async function handlePreflightState< let gpu: Gpu; if (resumePreflight) { deps.skippedStepMessage("preflight", "cached"); + await deps.recordStateSkipped("preflight", { reason: "resume", validation: "gpu-cdi" }); gpu = deps.detectGpu(); const resumeOptedOutGpuPassthrough = noGpu || (!gpuRequested && session?.gpuPassthrough === false); deps.assertCdiNvidiaGpuSpecPresent(deps.assessHost(), resumeOptedOutGpuPassthrough); diff --git a/src/lib/onboard/machine/handlers/provider-inference.test.ts b/src/lib/onboard/machine/handlers/provider-inference.test.ts index bec7ea47a3..1af9c81321 100644 --- a/src/lib/onboard/machine/handlers/provider-inference.test.ts +++ b/src/lib/onboard/machine/handlers/provider-inference.test.ts @@ -32,6 +32,8 @@ function createDeps(overrides: Partial undefined), complete: vi.fn(async () => createSession()), skipped: vi.fn(), + recordSkip: vi.fn(async () => createSession()), + repairEvent: vi.fn(async () => createSession()), hydrate: vi.fn(), repair: vi.fn(), routeReady: vi.fn(() => false), @@ -56,6 +58,8 @@ function createDeps(overrides: Partial) => updates as SessionUpdates, skippedStepMessage: calls.skipped, + recordStateSkipped: calls.recordSkip, + recordRepairEvent: calls.repairEvent, hydrateCredentialEnv: calls.hydrate, repairLocalInferenceSystemdOverrideOrExit: calls.repair, isNonInteractive: () => true, @@ -163,9 +167,27 @@ describe("handleProviderInferenceState", () => { expect(calls.setupNim).not.toHaveBeenCalled(); expect(calls.setupInference).not.toHaveBeenCalled(); expect(calls.skipped).toHaveBeenCalledWith("provider_selection", "ollama-local / llama3.1"); + expect(calls.recordSkip).toHaveBeenCalledWith("provider_selection", { + reason: "resume", + provider: "ollama-local", + model: "llama3.1", + }); expect(calls.hydrate).toHaveBeenCalledWith(null); + expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.started", { + state: "provider_selection", + metadata: { repair: "ollama-systemd-loopback" }, + }); expect(calls.repair).toHaveBeenCalledWith("ollama-local", deps.isNonInteractive); + expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.completed", { + state: "provider_selection", + metadata: { repair: "ollama-systemd-loopback" }, + }); expect(calls.skipped).toHaveBeenCalledWith("inference", "ollama-local / llama3.1"); + expect(calls.recordSkip).toHaveBeenCalledWith("inference", { + reason: "resume", + provider: "ollama-local", + model: "llama3.1", + }); expect(result).toMatchObject({ provider: "ollama-local", model: "llama3.1" }); }); diff --git a/src/lib/onboard/machine/handlers/provider-inference.ts b/src/lib/onboard/machine/handlers/provider-inference.ts index 525b94a059..c73aa5492f 100644 --- a/src/lib/onboard/machine/handlers/provider-inference.ts +++ b/src/lib/onboard/machine/handlers/provider-inference.ts @@ -57,6 +57,14 @@ export interface ProviderInferenceStateOptions { recordStepComplete(stepName: string, updates: SessionUpdates): Promise; toSessionUpdates(updates: Record): SessionUpdates; skippedStepMessage(stepName: string, detail?: string | null): void; + recordStateSkipped( + state: "provider_selection" | "inference", + metadata?: Record | null, + ): Promise; + recordRepairEvent( + type: "state.repair.started" | "state.repair.completed" | "state.repair.failed", + options?: { state?: "provider_selection" | "inference"; error?: string | null; metadata?: Record | null }, + ): Promise; hydrateCredentialEnv(credentialEnv: string | null): void; repairLocalInferenceSystemdOverrideOrExit(provider: string | null, isNonInteractive: () => boolean): void; isNonInteractive(): boolean; @@ -144,8 +152,25 @@ export async function handleProviderInferenceState({ typeof model === "string"; if (resumeProviderSelection) { deps.skippedStepMessage("provider_selection", `${provider} / ${model}`); + await deps.recordStateSkipped("provider_selection", { + reason: "resume", + provider, + model, + }); deps.hydrateCredentialEnv(credentialEnv); + if (provider === "ollama-local") { + await deps.recordRepairEvent("state.repair.started", { + state: "provider_selection", + metadata: { repair: "ollama-systemd-loopback" }, + }); + } deps.repairLocalInferenceSystemdOverrideOrExit(provider, deps.isNonInteractive); + if (provider === "ollama-local") { + await deps.recordRepairEvent("state.repair.completed", { + state: "provider_selection", + metadata: { repair: "ollama-systemd-loopback" }, + }); + } } else { await deps.startRecordedStep("provider_selection"); const selection = await deps.setupNim(gpu, sandboxName, agent); @@ -214,6 +239,11 @@ export async function handleProviderInferenceState({ } } deps.skippedStepMessage("inference", `${provider} / ${model}`); + await deps.recordStateSkipped("inference", { + reason: "resume", + provider, + model, + }); if (nimContainer && sandboxName) deps.registryUpdateSandbox(sandboxName, { nimContainer }); session = await deps.recordStepComplete( "inference", diff --git a/src/lib/onboard/machine/handlers/sandbox.test.ts b/src/lib/onboard/machine/handlers/sandbox.test.ts index eac0ffb553..a8e8db61d2 100644 --- a/src/lib/onboard/machine/handlers/sandbox.test.ts +++ b/src/lib/onboard/machine/handlers/sandbox.test.ts @@ -35,6 +35,8 @@ function createDeps(overrides: Partial createSession()), skipped: vi.fn(), + recordSkip: vi.fn(async () => createSession()), + repairEvent: vi.fn(async () => createSession()), error: vi.fn(), exit: vi.fn((code: number): never => { throw new Error(`exit ${code}`); @@ -77,6 +79,8 @@ function createDeps(overrides: Partial) => updates as SessionUpdates, skippedStepMessage: calls.skipped, + recordStateSkipped: calls.recordSkip, + recordRepairEvent: calls.repairEvent, error: calls.error, exitProcess: calls.exit, ...overrides, @@ -153,6 +157,10 @@ describe("handleSandboxState", () => { expect(calls.createSandbox).not.toHaveBeenCalled(); expect(calls.skipped).toHaveBeenCalledWith("sandbox", "saved"); + expect(calls.recordSkip).toHaveBeenCalledWith("sandbox", { + reason: "resume", + sandboxName: "saved", + }); expect(result.selectedMessagingChannels).toEqual(["slack"]); }); @@ -182,7 +190,15 @@ describe("handleSandboxState", () => { await handleSandboxState({ ...baseOptions(deps, session), resume: true, sandboxName: "saved" }); + expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.started", { + state: "sandbox", + metadata: { repair: "recorded-sandbox-cleanup", sandboxName: "saved" }, + }); expect(calls.repairSandbox).toHaveBeenCalledWith("saved"); + expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.completed", { + state: "sandbox", + metadata: { repair: "recorded-sandbox-cleanup", sandboxName: "saved" }, + }); expect(calls.createSandbox).toHaveBeenCalled(); }); diff --git a/src/lib/onboard/machine/handlers/sandbox.ts b/src/lib/onboard/machine/handlers/sandbox.ts index 8c45215ed9..3ae88d13e0 100644 --- a/src/lib/onboard/machine/handlers/sandbox.ts +++ b/src/lib/onboard/machine/handlers/sandbox.ts @@ -77,6 +77,11 @@ export interface SandboxStateOptions; toSessionUpdates(updates: Record): SessionUpdates; skippedStepMessage(stepName: string, detail?: string | null): void; + recordStateSkipped(state: "sandbox", metadata?: Record | null): Promise; + recordRepairEvent( + type: "state.repair.started" | "state.repair.completed" | "state.repair.failed", + options?: { state?: "sandbox"; error?: string | null; metadata?: Record | null }, + ): Promise; error(message?: string): void; exitProcess(code: number): never; }; @@ -174,6 +179,7 @@ export async function handleSandboxState Date: Wed, 20 May 2026 00:54:01 -0700 Subject: [PATCH 16/54] refactor(cli): route final machine transitions --- src/lib/onboard.ts | 11 ++++++++++- src/lib/onboard/machine/runtime.test.ts | 11 ++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 48c41740cc..991cc5cddf 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -9078,7 +9078,16 @@ async function recordRepairEvent( } async function recordSessionComplete(updates: SessionUpdates = {}): Promise { - return getOnboardRuntime().completeSession(updates); + const runtime = getOnboardRuntime(); + const current = await runtime.session(); + if (current.machine.state === "finalizing") { + await runtime.transition("post_verify"); + return runtime.complete(updates); + } + if (current.machine.state === "post_verify") { + return runtime.complete(updates); + } + return runtime.completeSession(updates); } const ONBOARD_STEP_INDEX: Record = { diff --git a/src/lib/onboard/machine/runtime.test.ts b/src/lib/onboard/machine/runtime.test.ts index 7b26269541..f098ba0dc3 100644 --- a/src/lib/onboard/machine/runtime.test.ts +++ b/src/lib/onboard/machine/runtime.test.ts @@ -185,23 +185,28 @@ describe("OnboardRuntime", () => { expect(policiesHarness.getSession().machine.state).toBe("policies"); }); - it("completes from post_verify and emits completion events", async () => { - const { runtime, events, getSession } = createHarness(sessionInState("post_verify")); + it("transitions through finalizing and post_verify before completion", async () => { + const { runtime, events, getSession } = createHarness(sessionInState("finalizing")); + await runtime.transition("post_verify"); await runtime.complete({ sandboxName: "my-assistant" }); expect(getSession()).toMatchObject({ status: "complete", resumable: false, sandboxName: "my-assistant", - machine: { state: "complete", revision: 8 }, + machine: { state: "complete", revision: 9 }, }); expect(events.map((event) => event.type)).toEqual([ + "state.exited", + "state.entered", "context.updated", "state.completed", "state.entered", "onboard.completed", ]); + expect(events[0]).toMatchObject({ state: "finalizing" }); + expect(events[1]).toMatchObject({ state: "post_verify" }); }); it("emits skipped and repair events without mutating durable state", async () => { From c90747b417889c2c46d888d7133c981152b799b2 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:05:31 -0700 Subject: [PATCH 17/54] refactor(cli): extract onboard shell helpers --- src/lib/onboard.ts | 1074 ++------------------------- src/lib/onboard/dashboard.ts | 436 +++++++++++ src/lib/onboard/model-router.ts | 522 +++++++++++++ src/lib/onboard/runtime-boundary.ts | 93 +++ src/lib/onboard/session-updates.ts | 63 ++ 5 files changed, 1165 insertions(+), 1023 deletions(-) create mode 100644 src/lib/onboard/dashboard.ts create mode 100644 src/lib/onboard/model-router.ts create mode 100644 src/lib/onboard/runtime-boundary.ts create mode 100644 src/lib/onboard/session-updates.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 991cc5cddf..ad23d5a06e 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -17,7 +17,6 @@ const { }: typeof import("./onboard/branding") = require("./onboard/branding"); const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); -const { looksLikeForwardPortConflict, runBackgroundForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start"); const { ensureOllamaLoopbackSystemdOverride, }: typeof import("./onboard/ollama-systemd") = require("./onboard/ollama-systemd"); @@ -50,7 +49,7 @@ const { const { agentSupportsWebSearch, }: typeof import("./onboard/web-search-support") = require("./onboard/web-search-support"); -const dashboardAccess: typeof import("./onboard/dashboard-access") = require("./onboard/dashboard-access"); +const onboardDashboard: typeof import("./onboard/dashboard") = require("./onboard/dashboard"); const { buildGatewayBootstrapSecretsScript, createGatewayBootstrapRepairHelpers, @@ -87,9 +86,6 @@ const { const bedrockRuntimeOnboard: typeof import("./onboard/bedrock-runtime") = require("./onboard/bedrock-runtime"); const { buildVllmMenuEntries }: typeof import("./onboard/vllm-menu") = require("./onboard/vllm-menu"); -const { - prepareModelRouterVenv, -}: typeof import("./onboard/model-router-python") = require("./onboard/model-router-python"); const crypto = require("node:crypto"); const fs = require("fs"); const os = require("os"); @@ -279,7 +275,15 @@ const { resolveSandboxImageTagFromCreateOutput } = require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag"); const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); -const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime"); +const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates"); +const modelRouter: typeof import("./onboard/model-router") = require("./onboard/model-router"); +const { + DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV, + isRoutedInferenceProvider, + loadBlueprintProfile, + reconcileModelRouter, +} = modelRouter; +const { OnboardRuntimeBoundary }: typeof import("./onboard/runtime-boundary") = require("./onboard/runtime-boundary"); const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup"); const { handleFinalizationState }: typeof import("./onboard/machine/handlers/finalization") = require("./onboard/machine/handlers/finalization"); const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); @@ -293,8 +297,6 @@ const { ensureUsageNoticeConsent } = require("./onboard/usage-notice"); const { findAvailableDashboardPort, findDashboardForwardOwner, - getOccupiedPorts, - isLiveForwardStatus, } = require("./onboard/dashboard-port") as typeof import("./onboard/dashboard-port"); const { destroyGatewayForReuse } = require("./onboard/gateway-cleanup") as typeof import("./onboard/gateway-cleanup"); const { verifyGatewayContainerRunning } = @@ -341,7 +343,6 @@ const sandboxState: typeof import("./state/sandbox") = require("./state/sandbox" const validation: typeof import("./validation") = require("./validation"); const urlUtils: typeof import("./core/url-utils") = require("./core/url-utils"); const buildContext = require("./build-context"); -const dashboardContract: typeof import("./dashboard/contract") = require("./dashboard/contract"); const httpProbe: typeof import("./adapters/http/probe") = require("./adapters/http/probe"); const modelPrompts: typeof import("./inference/model-prompts") = require("./inference/model-prompts"); const providerModels: typeof import("./inference/provider-models") = require("./inference/provider-models"); @@ -388,9 +389,9 @@ import { decidePolicyCarryForward } from "./onboard/policy-carryforward"; import { getSuggestedPolicyPresets } from "./onboard/policy-presets"; import { computeSetupPresetSuggestions as computeSetupPresetSuggestionsImpl, - setupPoliciesWithSelection as setupPoliciesWithSelectionImpl, type SetupPolicySelectionOptions, type SetupPresetSuggestionOptions, + setupPoliciesWithSelection as setupPoliciesWithSelectionImpl, } from "./onboard/policy-selection"; import { getResumeSandboxGpuOverrides, @@ -417,7 +418,6 @@ const USE_COLOR = !process.env.NO_COLOR && !!process.stdout.isTTY; const DIM = USE_COLOR ? "\x1b[2m" : ""; const RESET = USE_COLOR ? "\x1b[0m" : ""; let OPENSHELL_BIN: string | null = null; -let ONBOARD_RUNTIME: import("./onboard/machine/runtime").OnboardRuntime | null = null; const GATEWAY_NAME = "nemoclaw"; const BACK_TO_SELECTION = "__NEMOCLAW_BACK_TO_SELECTION__"; type HermesAuthMethod = "oauth" | "api_key"; @@ -703,503 +703,6 @@ function getBlueprintMaxOpenshellVersion(rootDir = ROOT): string | null { type OpenshellChannel = "stable" | "dev" | "auto"; -/** - * Load a named inference profile and router config from blueprint.yaml. - * Returns null if the blueprint or profile is missing. - */ -type BlueprintRouterConfig = { - enabled?: boolean; - port?: number; - pool_config_path?: string; - credential_env?: string; -}; - -type BlueprintInferenceProfile = { - provider_name?: string; - endpoint?: string; - model: string; - credential_env?: string; - credential_default?: string; - router: BlueprintRouterConfig; -}; - -function loadBlueprintProfile( - profileName: string, - rootDir: string = ROOT, -): BlueprintInferenceProfile | null { - try { - const YAML = require("yaml"); - const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml"); - if (!fs.existsSync(blueprintPath)) return null; - const raw = fs.readFileSync(blueprintPath, "utf8"); - const parsed = YAML.parse(raw); - const profile = parsed?.components?.inference?.profiles?.[profileName]; - if (!profile) return null; - const router = { ...(parsed?.components?.router || {}) }; - if (typeof profile.credential_env === "string" && profile.credential_env.trim().length > 0) { - router.credential_env = profile.credential_env; - } - return { ...profile, router } as BlueprintInferenceProfile; - } catch { - return null; - } -} - -const ROUTER_HEALTH_RETRIES = 15; -const ROUTER_HEALTH_INTERVAL_MS = 2000; -const ROUTER_HEALTH_TIMEOUT_MS = 3000; -const MODEL_ROUTER_RELATIVE_DIR = path.join("nemoclaw-blueprint", "router", "llm-router"); -const MODEL_ROUTER_VENV_DIR = path.join(os.homedir(), ".nemoclaw", "model-router-venv"); -const MODEL_ROUTER_FINGERPRINT_FILE = ".nemoclaw-source-fingerprint"; -const MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES = new Set([ - ".git", - ".hg", - ".mypy_cache", - ".pytest_cache", - ".ruff_cache", - ".svn", - ".venv", - "__pycache__", - "build", - "dist", - "node_modules", - "venv", -]); -const DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV = "NVIDIA_API_KEY"; - -async function isRouterHealthy(port: number, timeoutMs = ROUTER_HEALTH_TIMEOUT_MS): Promise { - const http = require("http"); - return new Promise((resolve) => { - let settled = false; - const settle = (healthy: boolean) => { - if (settled) return; - settled = true; - resolve(healthy); - }; - const request = http - .get(`http://127.0.0.1:${port}/health`, (res: import("node:http").IncomingMessage) => { - res.resume(); - settle((res.statusCode || 0) >= 200 && (res.statusCode || 0) < 300); - }) - .on("error", () => settle(false)); - request.setTimeout(timeoutMs, () => { - request.destroy(); - settle(false); - }); - }); -} - -function isProcessRunning(pid: number | null | undefined): boolean { - if (!Number.isInteger(pid) || Number(pid) <= 0) return false; - try { - process.kill(Number(pid), 0); - return true; - } catch { - return false; - } -} - -async function stopModelRouterProcess(pid: number, port: number): Promise { - try { - process.kill(pid, "SIGTERM"); - } catch { - return; - } - for (let attempt = 0; attempt < 10; attempt++) { - await new Promise((resolve) => setTimeout(resolve, 500)); - if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return; - } - try { - process.kill(pid, "SIGKILL"); - } catch { - // already stopped - } - for (let attempt = 0; attempt < 5; attempt++) { - await new Promise((resolve) => setTimeout(resolve, 500)); - if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return; - } -} - -function resolveHostCommandPath(commandName: string): string | null { - const result = runCapture(["sh", "-c", 'command -v "$1"', "--", commandName], { - ignoreError: true, - }).trim(); - return result || null; -} - -function modelRouterPackageDir(): string { - return path.join(ROOT, MODEL_ROUTER_RELATIVE_DIR); -} - -function modelRouterVenvDir(): string { - return process.env.NEMOCLAW_MODEL_ROUTER_VENV || MODEL_ROUTER_VENV_DIR; -} - -function modelRouterCommandPath(venvDir = modelRouterVenvDir()): string { - return path.join(venvDir, "bin", "model-router"); -} - -function modelRouterFingerprintPath(venvDir = modelRouterVenvDir()): string { - return path.join(venvDir, MODEL_ROUTER_FINGERPRINT_FILE); -} - -function isExecutableFile(filePath: string): boolean { - try { - fs.accessSync(filePath, fs.constants.X_OK); - return true; - } catch { - return false; - } -} - -function isModelRouterPackageReady(routerDir = modelRouterPackageDir()): boolean { - return fs.existsSync(path.join(routerDir, "pyproject.toml")) || - fs.existsSync(path.join(routerDir, "setup.py")); -} - -function shouldSkipModelRouterFingerprintEntry(name: string): boolean { - return MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES.has(name) || name.endsWith(".egg-info"); -} - -function hashModelRouterSourceTree(routerDir = modelRouterPackageDir()): string | null { - const sourceHash = crypto.createHash("sha256"); - - const hashDirectory = (currentDir: string): boolean => { - let entries: import("fs").Dirent[]; - try { - entries = fs - .readdirSync(currentDir, { withFileTypes: true }) - .sort((left: import("fs").Dirent, right: import("fs").Dirent) => - left.name.localeCompare(right.name), - ); - } catch { - return false; - } - - let hashedSourceFile = false; - for (const entry of entries) { - if (shouldSkipModelRouterFingerprintEntry(entry.name)) continue; - if (entry.name.endsWith(".pyc") || entry.name.endsWith(".pyo")) continue; - - const entryPath = path.join(currentDir, entry.name); - const relativePath = path.relative(routerDir, entryPath).split(path.sep).join("/"); - if (entry.isDirectory()) { - hashedSourceFile = hashDirectory(entryPath) || hashedSourceFile; - continue; - } - if (entry.isSymbolicLink()) { - try { - sourceHash.update(`link:${relativePath}\0`); - sourceHash.update(fs.readlinkSync(entryPath)); - sourceHash.update("\0"); - hashedSourceFile = true; - } catch { - // Ignore unreadable links; the install step will fail if they are required. - } - continue; - } - if (!entry.isFile()) continue; - sourceHash.update(`file:${relativePath}\0`); - sourceHash.update(fs.readFileSync(entryPath)); - sourceHash.update("\0"); - hashedSourceFile = true; - } - return hashedSourceFile; - }; - - return hashDirectory(routerDir) ? `files:${sourceHash.digest("hex")}` : null; -} - -function getModelRouterSourceFingerprint(routerDir = modelRouterPackageDir()): string | null { - const gitHead = runCapture(["git", "-C", routerDir, "rev-parse", "HEAD"], { - ignoreError: true, - }).trim(); - if (/^[0-9a-f]{40}$/i.test(gitHead)) return `git:${gitHead}`; - - const gitLink = runCapture(["git", "-C", ROOT, "rev-parse", `HEAD:${MODEL_ROUTER_RELATIVE_DIR}`], { - ignoreError: true, - }).trim(); - if (/^[0-9a-f]{40}$/i.test(gitLink)) return `gitlink:${gitLink}`; - - return hashModelRouterSourceTree(routerDir); -} - -function readModelRouterInstalledFingerprint(venvDir = modelRouterVenvDir()): string | null { - try { - const fingerprint = fs.readFileSync(modelRouterFingerprintPath(venvDir), "utf8").trim(); - return fingerprint || null; - } catch { - return null; - } -} - -function writeModelRouterInstalledFingerprint( - fingerprint: string | null, - venvDir = modelRouterVenvDir(), -): void { - if (!fingerprint) return; - fs.writeFileSync(modelRouterFingerprintPath(venvDir), `${fingerprint}\n`, { mode: 0o600 }); -} - -function isManagedModelRouterCurrent( - routerDir = modelRouterPackageDir(), - venvDir = modelRouterVenvDir(), -): boolean { - if (!isExecutableFile(modelRouterCommandPath(venvDir))) return false; - const sourceFingerprint = getModelRouterSourceFingerprint(routerDir); - return Boolean( - sourceFingerprint && readModelRouterInstalledFingerprint(venvDir) === sourceFingerprint, - ); -} - -function initializeModelRouterSubmodule(routerDir = modelRouterPackageDir()): void { - if (isModelRouterPackageReady(routerDir)) return; - if (!fs.existsSync(path.join(ROOT, ".gitmodules")) || !fs.existsSync(path.join(ROOT, ".git"))) { - return; - } - console.log(" Initializing Model Router source..."); - run(["git", "-C", ROOT, "submodule", "update", "--init", "--depth", "1", MODEL_ROUTER_RELATIVE_DIR], { - ignoreError: true, - }); -} - -function installModelRouterCommand(routerDir = modelRouterPackageDir()): string { - initializeModelRouterSubmodule(routerDir); - if (!isModelRouterPackageReady(routerDir)) { - throw new Error( - `Model Router source is not initialized at ${routerDir}. ` + - `Run: git -C ${ROOT} submodule update --init --depth 1 ${MODEL_ROUTER_RELATIVE_DIR}`, - ); - } - - const venvDir = modelRouterVenvDir(); - const routerCommand = modelRouterCommandPath(venvDir); - const sourceFingerprint = getModelRouterSourceFingerprint(routerDir); - const allowReplaceExistingVenv = - path.resolve(venvDir) === path.resolve(MODEL_ROUTER_VENV_DIR) || - readModelRouterInstalledFingerprint(venvDir) !== null; - const venvPython = prepareModelRouterVenv({ - venvDir, - allowReplaceExisting: allowReplaceExistingVenv, - }); - - const installResult = run( - [venvPython, "-m", "pip", "install", "--quiet", "--upgrade", `${routerDir}[prefill,proxy]`], - { - ignoreError: true, - timeout: 600_000, - }, - ); - if (installResult.status !== 0) { - throw new Error("Failed to install Model Router dependencies."); - } - if (!isExecutableFile(routerCommand)) { - throw new Error("Model Router install did not produce the model-router command."); - } - writeModelRouterInstalledFingerprint(sourceFingerprint, venvDir); - return routerCommand; -} - -function ensureModelRouterCommand(): string { - const routerDir = modelRouterPackageDir(); - const venvDir = modelRouterVenvDir(); - const managedCommand = modelRouterCommandPath(venvDir); - - if (isModelRouterPackageReady(routerDir) && isManagedModelRouterCurrent(routerDir, venvDir)) { - return managedCommand; - } - - if (!isModelRouterPackageReady(routerDir)) { - initializeModelRouterSubmodule(routerDir); - } - - if (isModelRouterPackageReady(routerDir)) { - if (isManagedModelRouterCurrent(routerDir, venvDir)) return managedCommand; - return installModelRouterCommand(routerDir); - } - - if (isExecutableFile(managedCommand)) return managedCommand; - return resolveHostCommandPath("model-router") || installModelRouterCommand(); -} - -/** - * Start the model-router proxy and wait for it to become healthy. - * Follows the same pattern as Ollama startup (spawn detached, poll health). - * Returns the PID of the child process. - */ -async function startModelRouter(routerCfg: BlueprintRouterConfig): Promise { - const routerCommand = ensureModelRouterCommand(); - const port = routerCfg.port || 4000; - const blueprintDir = path.join(ROOT, "nemoclaw-blueprint"); - const poolConfigPath = path.join( - blueprintDir, - routerCfg.pool_config_path || "router/pool-config.yaml", - ); - const stateDir = path.join(os.homedir(), ".nemoclaw", "state"); - const litellmConfigPath = path.join(stateDir, "litellm-proxy.yaml"); - - fs.mkdirSync(stateDir, { recursive: true }); - - const proxyConfigResult = spawnSync( - routerCommand, - ["proxy-config", "--config", poolConfigPath, "--output", litellmConfigPath], - { encoding: "utf8", timeout: 30_000, cwd: blueprintDir }, - ); - if (proxyConfigResult.status !== 0) { - throw new Error( - `model-router proxy-config failed: ${proxyConfigResult.stderr || proxyConfigResult.error || "unknown error"}`, - ); - } - - const { buildSubprocessEnv } = require("./subprocess-env"); - const credEnvVars: Record = {}; - const credName = routerCfg.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV; - const routedCredential = resolveProviderCredential(credName); - const openAiCredential = resolveProviderCredential("OPENAI_API_KEY"); - if (routedCredential) { - credEnvVars[credName] = routedCredential; - if (!openAiCredential) credEnvVars.OPENAI_API_KEY = routedCredential; - } - if (openAiCredential) credEnvVars.OPENAI_API_KEY = openAiCredential; - const _providerKey = (process.env.NEMOCLAW_PROVIDER_KEY || "").trim(); - if (_providerKey) { - if (!credEnvVars[credName]) credEnvVars[credName] = _providerKey; - if (!credEnvVars.OPENAI_API_KEY) credEnvVars.OPENAI_API_KEY = _providerKey; - } - - if (await isRouterHealthy(port)) { - throw new Error( - `Port ${port} already has a healthy router endpoint; refusing to start a second router.`, - ); - } - - const child = spawn( - routerCommand, - [ - "proxy", - "--litellm-config", litellmConfigPath, - "--router-config", poolConfigPath, - "--host", "0.0.0.0", - "--port", String(port), - ], - { - detached: true, - stdio: "ignore", - cwd: blueprintDir, - env: buildSubprocessEnv(credEnvVars), - }, - ); - let childExited = false; - let childExitDetail = ""; - child.once("error", (err: Error) => { - childExited = true; - childExitDetail = `child failed to start: ${err.message}`; - }); - child.once("exit", (code: number | null, signal: string | null) => { - childExited = true; - if (!childExitDetail) { - childExitDetail = `child exited with code ${code ?? "null"}${signal ? ` signal ${signal}` : ""}`; - } - }); - child.unref(); - - const pid = child.pid; - if (!pid) { - throw new Error( - "Failed to start model-router proxy: no PID returned" + - (childExitDetail ? ` (${childExitDetail})` : ""), - ); - } - - for (let attempt = 0; attempt < ROUTER_HEALTH_RETRIES; attempt++) { - await new Promise((resolve) => setTimeout(resolve, ROUTER_HEALTH_INTERVAL_MS)); - if (childExited) break; - const healthy = await isRouterHealthy(port); - let processAlive = true; - try { - process.kill(pid, 0); - } catch { - processAlive = false; - } - if (healthy && processAlive) return pid; - if (!processAlive) { - childExited = true; - if (!childExitDetail) childExitDetail = "child process is no longer running"; - break; - } - } - try { - process.kill(pid, "SIGTERM"); - } catch { - // already dead - } - throw new Error( - `Model router failed to become healthy on port ${port} after ${ROUTER_HEALTH_RETRIES} attempts` + - (childExitDetail ? ` (${childExitDetail})` : ""), - ); -} - -function getRoutedProfile(): BlueprintInferenceProfile { - const bp = loadBlueprintProfile("routed"); - if (!bp || bp.router?.enabled !== true) { - throw new Error("Router is not enabled in nemoclaw-blueprint/blueprint.yaml."); - } - return bp; -} - -function isRoutedInferenceProvider(provider: string | null | undefined): boolean { - if (!provider) return false; - if (provider === "nvidia-router") return true; - const bp = loadBlueprintProfile("routed"); - return Boolean(bp?.provider_name && provider === bp.provider_name); -} - -async function reconcileModelRouter(): Promise { - const bp = getRoutedProfile(); - const routerPort = bp.router.port || 4000; - const routerCredentialEnv = - bp.router.credential_env || bp.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV; - const routerCredential = - hydrateCredentialEnv(routerCredentialEnv) || - normalizeCredentialValue(bp.credential_default || ""); - if (!routerCredential) { - throw new Error(`${routerCredentialEnv} is required to start Model Router.`); - } - saveCredential(routerCredentialEnv, routerCredential); - const routerCredentialHash = hashCredential(routerCredential); - const session = onboardSession.loadSession(); - const recordedPid = session?.routerPid ?? null; - const recordedCredentialHash = session?.routerCredentialHash ?? null; - - if (await isRouterHealthy(routerPort)) { - if ( - routerCredentialHash && - recordedCredentialHash === routerCredentialHash && - isProcessRunning(recordedPid) - ) { - console.log(` ✓ Model router is already healthy on port ${routerPort}`); - return; - } - if (isProcessRunning(recordedPid)) { - console.log(" Restarting model router with updated credentials..."); - await stopModelRouterProcess(requireValue(recordedPid, "Expected recorded router PID"), routerPort); - } else { - throw new Error( - `Port ${routerPort} already has a healthy router endpoint, but its credential state is unknown. Stop the existing model-router process and rerun onboarding.`, - ); - } - } - - console.log(" Starting model router..."); - const routerPid = await startModelRouter(bp.router); - console.log(` ✓ Model router started (PID ${routerPid}) on port ${routerPort}`); - onboardSession.updateSession((current: Session) => { - current.routerPid = routerPid; - current.routerCredentialHash = routerCredentialHash; - return current; - }); -} - function getOpenshellChannel(env: NodeJS.ProcessEnv = process.env): OpenshellChannel { const raw = String(env.NEMOCLAW_OPENSHELL_CHANNEL || "auto") .trim() @@ -8576,519 +8079,45 @@ async function setupPoliciesWithSelection( const CONTROL_UI_PORT = DASHBOARD_PORT; -// Dashboard helpers — delegated to src/lib/dashboard/contract.ts -const { buildChain, buildControlUiUrls } = dashboardContract; - -function findForwardEntry( - forwardListOutput: string | null | undefined, - port: string, -): { sandboxName: string; status: string } | null { - if (!forwardListOutput) return null; - for (const rawLine of forwardListOutput.split("\n")) { - const line = rawLine.replace(ANSI_RE, ""); - if (/^\s*SANDBOX\s/i.test(line)) continue; - const parts = line.trim().split(/\s+/); - if (parts.length < 3 || parts[2] !== port) continue; - return { - sandboxName: parts[0] || "", - status: (parts[4] || "").toLowerCase(), - }; - } - return null; -} - -function getRunningForwardPorts(forwardListOutput: string | null | undefined): string[] { - const ports = new Set(); - if (!forwardListOutput) return []; - for (const rawLine of forwardListOutput.split("\n")) { - const line = rawLine.replace(ANSI_RE, ""); - if (/^\s*SANDBOX\s/i.test(line)) continue; - const parts = line.trim().split(/\s+/); - if (parts.length < 5 || !/^\d+$/.test(parts[2])) continue; - const status = (parts[4] || "").toLowerCase(); - if (isLiveForwardStatus(status)) { - ports.add(parts[2]); - } - } - return [...ports]; -} - -function stopAllDashboardForwards(): void { - const forwardList = runCaptureOpenshell(["forward", "list"], { ignoreError: true }); - for (const port of getRunningForwardPorts(forwardList)) { - runOpenshell(["forward", "stop", port], { ignoreError: true }); - } -} - - -/** - * Build the actionable error lines printed when the just-created openshell - * sandbox is rolled back after a dashboard port-allocation failure. Pure - * function over (sandboxName, alloc-error, delete-result) so the rollback path - * is testable without spawning subprocesses or exiting the process (#2174). - */ -function buildOrphanedSandboxRollbackMessage( - sandboxName: string, - err: unknown, - deleteSucceeded: boolean, -): string[] { - const lines = [ - "", - ` Could not allocate a dashboard port for '${sandboxName}'.`, - ` ${err instanceof Error ? err.message : String(err)}`, - ]; - if (deleteSucceeded) { - lines.push(" The orphaned sandbox has been removed — you can safely retry."); - } else { - lines.push(" Could not remove the orphaned sandbox. Manual cleanup:"); - lines.push(` openshell sandbox delete "${sandboxName}"`); - } - return lines; -} - -/** - * Set up the dashboard forward for a sandbox. Auto-allocates the next free - * port if the preferred port is taken by a different sandbox (Fixes #2174). - * Returns the actual port number used. - * - * When `rollbackSandboxOnFailure` is true, deletes the just-created openshell - * sandbox before exiting on unrecoverable port-allocation failure. This keeps - * `openshell sandbox list` and the NemoClaw registry from drifting when the - * range is exhausted between sandbox-create and forward-setup ("leaks ghost - * sandbox" half of #2174). Mirrors the not-ready rollback pattern in - * createSandbox. - */ -function ensureDashboardForward( - sandboxName: string, - chatUiUrl = `http://127.0.0.1:${CONTROL_UI_PORT}`, - options: { rollbackSandboxOnFailure?: boolean } = {}, -): number { - const { rollbackSandboxOnFailure = false } = options; - const preferredPort = Number(getDashboardForwardPort(chatUiUrl)); - let existingForwards = runCaptureOpenshell(["forward", "list"], { ignoreError: true }); - const preferredEntry = findForwardEntry(existingForwards, String(preferredPort)); - if ( - preferredEntry && - (preferredEntry.sandboxName === sandboxName || !isLiveForwardStatus(preferredEntry.status)) - ) { - runOpenshell(["forward", "stop", String(preferredPort)], { ignoreError: true }); - existingForwards = runCaptureOpenshell(["forward", "list"], { ignoreError: true }); - } - let actualPort: number; - try { - actualPort = findAvailableDashboardPort(sandboxName, preferredPort, existingForwards); - } catch (err) { - if (!rollbackSandboxOnFailure) throw err; - const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); - for (const line of buildOrphanedSandboxRollbackMessage( - sandboxName, - err, - delResult.status === 0, - )) { - console.error(line); - } - process.exit(1); - } - - if (actualPort !== preferredPort) { - if (rollbackSandboxOnFailure) { - // Create path: the sandbox was just built with CHAT_UI_URL and - // NEMOCLAW_DASHBOARD_PORT baked from `preferredPort` (see the - // `formatEnvAssignment("CHAT_UI_URL", …)` call in createSandbox). If - // the port was bound during the build window (TOCTOU), picking a new - // host port would leave the sandbox serving the dashboard on - // `preferredPort` internally while the forward listens on `actualPort` - // — reproducing the original "onboard exits but dashboard is - // unreachable" failure on the newly selected port. Reallocation is - // only safe on reuse paths where the sandbox image is fixed; on the - // create path we must roll back so the next onboard re-bakes with a - // clean port. (#3260) - const err = new Error( - `Dashboard port ${preferredPort} became host-bound during sandbox build; ` + - `cannot reallocate to ${actualPort} after the sandbox has been created with ` + - `CHAT_UI_URL=${preferredPort}. Free the port and re-run \`${cliName()} onboard\`, ` + - `or pass \`--control-ui-port \` to pick a different dashboard port.`, - ); - const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); - for (const line of buildOrphanedSandboxRollbackMessage( - sandboxName, - err, - delResult.status === 0, - )) { - console.error(line); - } - process.exit(1); - } - console.warn(` ! Port ${preferredPort} is taken. Using port ${actualPort} instead.`); - } - - // Clean up any stale forwards owned by this sandbox on other ports so we - // don't leak forwards across port changes and exhaust the range over time. - const occupied = getOccupiedPorts(existingForwards); - for (const [port, owner] of occupied.entries()) { - if (owner === sandboxName && Number(port) !== actualPort) { - runOpenshell(["forward", "stop", port], { ignoreError: true }); - } - } - - // Preserve the original URL's hostname (loopback vs remote) but swap to the actual port. - const parsedUrl = new URL(chatUiUrl.includes("://") ? chatUiUrl : `http://${chatUiUrl}`); - parsedUrl.port = String(actualPort); - const actualTarget = getDashboardForwardTarget(parsedUrl.toString()); - runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true }); - const { result: fwdResult, diagnostic: fwdDiagnostic } = runBackgroundForwardStartWithPortReleaseRetries( - (stdio, timeout) => - runOpenshell( - ["forward", "start", "--background", actualTarget, sandboxName], - { ignoreError: true, suppressOutput: true, stdio, timeout }, - ), - () => { sleep(1); runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true }); }, - ); - if (fwdResult && fwdResult.status !== 0) { - const looksLikePortConflict = looksLikeForwardPortConflict(fwdDiagnostic); - if (rollbackSandboxOnFailure) { - // The sandbox was just created, committed to actualPort via its - // baked-in CHAT_UI_URL and NEMOCLAW_DASHBOARD_PORT env. Silently - // returning here leaves the user with a dashboard URL that points - // at a port held by another process — a TOCTOU race where the - // proactive probe in findAvailableDashboardPort missed the - // conflict (e.g., another listener bound during the multi-minute - // image build). Roll back so the next `onboard` retry's allocator - // observes the bound port and picks a different one. Only the - // EADDRINUSE-style failure gets the port-conflict wording; other - // errors (gateway / transport) propagate the real diagnostic so - // users aren't pointed at the wrong fix (#3260). - const err = new Error( - looksLikePortConflict - ? `Failed to start dashboard forward on port ${actualPort} — the host port ` + - `is held by another process. Free it and run \`${cliName()} onboard\` again, ` + - `or pass \`--control-ui-port \` to pick a different dashboard port.` - : `Failed to start dashboard forward on port ${actualPort}: ${fwdDiagnostic.slice(0, 240)}`, - ); - const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); - for (const line of buildOrphanedSandboxRollbackMessage( - sandboxName, - err, - delResult.status === 0, - )) { - console.error(line); - } - process.exit(1); - } - if (looksLikePortConflict) { - console.warn( - `! Port ${actualPort} forward did not start — port may be in use by another process.`, - ); - console.warn( - ` Check: docker ps --format 'table {{.Names}}\\t{{.Ports}}' | grep ${actualPort}`, - ); - console.warn(` Free the port, then reconnect: ${cliName()} ${sandboxName} connect`); - } else { - console.warn(`! Port ${actualPort} forward did not start: ${fwdDiagnostic.slice(0, 240)}`); - console.warn(` Reconnect after resolving the issue: ${cliName()} ${sandboxName} connect`); - } - } - return actualPort; -} - -function ensureAgentDashboardForward( - sandboxName: string, - agent: { forwardPort?: number | null }, -): number { - const agentDashboardPort = agent.forwardPort ?? CONTROL_UI_PORT; - const agentDashboardUrl = `http://127.0.0.1:${agentDashboardPort}`; - const actualAgentDashboardPort = ensureDashboardForward(sandboxName, agentDashboardUrl); - process.env.CHAT_UI_URL = `http://127.0.0.1:${actualAgentDashboardPort}`; - return actualAgentDashboardPort; -} - -function findOpenclawJsonPath(dir: string): string | null { - if (!fs.existsSync(dir)) return null; - const entries = fs.readdirSync(dir, { withFileTypes: true }); - for (const e of entries) { - const p = path.join(dir, e.name); - if (e.isDirectory()) { - const found: string | null = findOpenclawJsonPath(p); - if (found) return found; - } else if (e.name === "openclaw.json") { - return p; - } - } - return null; -} - -/** - * Pull gateway.auth.token from the sandbox image via openshell sandbox download - * so onboard can build dashboard access URLs. User-visible output must redact - * the token fragment. - */ -function fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null { - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-token-")); - try { - const destDir = `${tmpDir}${path.sep}`; - const result = runOpenshell( - ["sandbox", "download", sandboxName, "/sandbox/.openclaw/openclaw.json", destDir], - { ignoreError: true, stdio: ["ignore", "ignore", "ignore"] }, - ); - if (result.status !== 0) return null; - const jsonPath = findOpenclawJsonPath(tmpDir); - if (!jsonPath) return null; - const cfg = JSON.parse(fs.readFileSync(jsonPath, "utf-8")); - const token = cfg && cfg.gateway && cfg.gateway.auth && cfg.gateway.auth.token; - return typeof token === "string" && token.length > 0 ? token : null; - } catch { - return null; - } finally { - try { - fs.rmSync(tmpDir, { recursive: true, force: true }); - } catch { - // ignore cleanup errors - } - } -} - -// buildControlUiUrls — see dashboard-contract import above - -function getDashboardForwardPort( - chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`, - options: Parameters[1] = {}, -): string { - return dashboardAccess.getDashboardForwardPort(chatUiUrl, { - ...options, - runCapture: options.runCapture || runCapture, - }); -} - -function getDashboardForwardTarget( - chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`, - options: Parameters[1] = {}, -): string { - return dashboardAccess.getDashboardForwardTarget(chatUiUrl, { - ...options, - runCapture: options.runCapture || runCapture, - }); -} - -function dashboardUrlForDisplay(url: string): string { - return dashboardAccess.dashboardUrlForDisplay(url, redact); -} - -function getWslHostAddress( - options: Parameters[0] = {}, -): string | null { - return dashboardAccess.getWslHostAddress({ ...options, runCapture: options.runCapture || runCapture }); -} - -/** Print the post-onboard dashboard with sandbox status and reconfiguration hints. */ -function printDashboard( - sandboxName: string, - model: string, - provider: string, - nimContainer: string | null = null, - agent: AgentDefinition | null = null, -): void { - const nimStat = nimContainer ? nim.nimStatusByName(nimContainer) : nim.nimStatus(sandboxName); - const showNim = nim.shouldShowNimLine(nimContainer, nimStat.running); - const nimLabel = nimStat.running ? "running" : "not running"; - - const providerLabel = getProviderLabel(provider); - - const token = fetchGatewayAuthTokenFromSandbox(sandboxName); - const chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`; - const wslAddr = getWslHostAddress(); - const chain = buildChain({ chatUiUrl, isWsl: isWsl(), wslHostAddress: wslAddr }); - - // Build access info inline — uses chain instead of re-deriving from env - const dashboardAccess = buildControlUiUrls(token, chain.port, chain.accessUrl).map((url, i) => ({ - label: i === 0 ? "Dashboard" : `Alt ${i}`, - url, - })); - if (wslAddr) { - const wslUrl = `http://${wslAddr}:${chain.port}/${token ? `#token=${encodeURIComponent(token)}` : ""}`; - const existing = dashboardAccess.find((a) => a.url === wslUrl); - if (existing) existing.label = "VS Code/WSL"; - else dashboardAccess.push({ label: "VS Code/WSL", url: wslUrl }); - } - const guidanceLines = [`Port ${chain.port} must be forwarded before opening these URLs.`]; - if (isWsl()) - guidanceLines.push( - "WSL detected: if localhost fails in Windows, use the WSL host IP shown by `hostname -I`.", - ); - if (dashboardAccess.length === 0) guidanceLines.push("No dashboard URLs were generated."); - - console.log(""); - console.log(` ${"─".repeat(50)}`); - // console.log(` Dashboard http://localhost:${DASHBOARD_PORT}/`); - console.log(` Sandbox ${sandboxName} (Landlock + seccomp + netns)`); - console.log(` Model ${model} (${providerLabel})`); - if (showNim) { - console.log(` NIM ${nimLabel}`); - } - console.log(` ${"─".repeat(50)}`); - console.log(` Run: ${cliName()} ${sandboxName} connect`); - console.log(` Status: ${cliName()} ${sandboxName} status`); - console.log(` Logs: ${cliName()} ${sandboxName} logs --follow`); - console.log(""); - if (agent) { - agentOnboard.printDashboardUi(sandboxName, token, agent, { - note, - buildControlUiUrls: (tokenValue: string | null, port: number) => { - return buildControlUiUrls(tokenValue, port, chain.accessUrl); - }, - }); - } else if (token) { - console.log( - ` ${agentProductName()} UI (auth token redacted from displayed URLs)`, - ); - for (const line of guidanceLines) { - console.log(` ${line}`); - } - for (const entry of dashboardAccess) { - console.log(` ${entry.label}: ${dashboardUrlForDisplay(entry.url)}`); - } - console.log(` Token: ${cliName()} ${sandboxName} gateway-token --quiet`); - console.log(` append #token= locally if the browser asks for auth.`); - } else { - note(" Could not read gateway token from the sandbox (download failed)."); - console.log(` ${agentProductName()} UI`); - for (const line of guidanceLines) { - console.log(` ${line}`); - } - for (const entry of dashboardAccess) { - console.log(` ${entry.label}: ${dashboardUrlForDisplay(entry.url)}`); - } - console.log( - ` Token: ${cliName()} ${sandboxName} connect → jq -r '.gateway.auth.token' /sandbox/.openclaw/openclaw.json`, - ); - console.log(` append #token= to the URL locally if needed.`); - } - console.log(` ${"─".repeat(50)}`); - console.log(""); - console.log(" To change settings later:"); - console.log( - ` Model: ${cliName()} inference get\n ${cliName()} inference set --model --provider --sandbox ${sandboxName}`, - ); - console.log(` Policies: ${cliName()} ${sandboxName} policy-add`); - console.log(` Credentials: ${cliName()} credentials reset then ${cliName()} onboard`); - console.log(""); -} - -// Preserve the nullable contract end-to-end: `null` means "clear this -// field on the persisted session", `undefined` means "leave unchanged". -function toNullableString(value: string | null | undefined): string | null | undefined { - if (value === undefined) return undefined; - if (value === null) return null; - return value; -} - -function toSessionUpdates( - updates: { - sandboxName?: string | null; - provider?: string | null; - model?: string | null; - endpointUrl?: string | null; - credentialEnv?: string | null; - hermesAuthMethod?: HermesAuthMethod | string | null; - preferredInferenceApi?: string | null; - nimContainer?: string | null; - webSearchConfig?: WebSearchConfig | null; - policyPresets?: string[] | null; - messagingChannels?: string[] | null; - messagingChannelConfig?: MessagingChannelConfig | null; - hermesToolGateways?: string[] | null; - } = {}, -): SessionUpdates { - const normalized: SessionUpdates = {}; - if (updates.sandboxName !== undefined) - normalized.sandboxName = toNullableString(updates.sandboxName); - if (updates.provider !== undefined) normalized.provider = toNullableString(updates.provider); - if (updates.model !== undefined) normalized.model = toNullableString(updates.model); - if (updates.endpointUrl !== undefined) - normalized.endpointUrl = toNullableString(updates.endpointUrl); - if (updates.credentialEnv !== undefined) - normalized.credentialEnv = toNullableString(updates.credentialEnv); - if (updates.hermesAuthMethod !== undefined) - normalized.hermesAuthMethod = normalizeHermesAuthMethod(updates.hermesAuthMethod); - if (updates.preferredInferenceApi !== undefined) { - normalized.preferredInferenceApi = toNullableString(updates.preferredInferenceApi); - } - if (updates.nimContainer !== undefined) - normalized.nimContainer = toNullableString(updates.nimContainer); - if (updates.webSearchConfig !== undefined) normalized.webSearchConfig = updates.webSearchConfig; - if (updates.policyPresets !== undefined) normalized.policyPresets = updates.policyPresets; - if (updates.messagingChannels !== undefined) - normalized.messagingChannels = updates.messagingChannels; - if (updates.messagingChannelConfig !== undefined) { - normalized.messagingChannelConfig = updates.messagingChannelConfig; - } - if (updates.hermesToolGateways !== undefined) - normalized.hermesToolGateways = updates.hermesToolGateways; - return normalized; -} - -function getOnboardRuntime(): import("./onboard/machine/runtime").OnboardRuntime { - if (!ONBOARD_RUNTIME) ONBOARD_RUNTIME = new OnboardRuntime(); - return ONBOARD_RUNTIME; -} - -async function startRecordedStep( - stepName: string, - updates: { - sandboxName?: string | null; - provider?: string | null; - model?: string | null; - policyPresets?: string[] | null; - } = {}, -): Promise { - const runtime = getOnboardRuntime(); - await runtime.markStepStarted(stepName); - if (Object.keys(updates).length > 0) { - await runtime.updateContext(toSessionUpdates(updates)); - } - maybeForceE2eStepFailure(stepName); -} - -async function recordStepComplete( - stepName: string, - updates: SessionUpdates = {}, -): Promise { - return getOnboardRuntime().markStepComplete(stepName, updates); -} - -async function recordStepSkipped(stepName: string): Promise { - return getOnboardRuntime().markStepSkipped(stepName); -} - -async function recordStateSkipped( - state: import("./onboard/machine/types").OnboardMachineState, - metadata: Record | null = null, -): Promise { - return getOnboardRuntime().markSkipped(state, metadata); -} +const { + buildChain, + buildControlUiUrls, + buildOrphanedSandboxRollbackMessage, + ensureDashboardForward, + ensureAgentDashboardForward, + fetchGatewayAuthTokenFromSandbox, + getDashboardForwardPort, + getDashboardForwardTarget, + getWslHostAddress, + printDashboard, + stopAllDashboardForwards, +} = onboardDashboard.createOnboardDashboardHelpers({ + runOpenshell, + runCaptureOpenshell, + runCapture, + cliName, + agentProductName, + getProviderLabel, + note, + isWsl, + redact, + sleep, + printAgentDashboardUi: agentOnboard.printDashboardUi, +}); -async function recordRepairEvent( - type: "state.repair.started" | "state.repair.completed" | "state.repair.failed", - options: { - state?: import("./onboard/machine/types").OnboardMachineState | null; - error?: string | null; - metadata?: Record | null; - } = {}, -): Promise { - return getOnboardRuntime().emitRepairEvent(type, options); -} +const onboardRuntimeBoundary = new OnboardRuntimeBoundary({ + toSessionUpdates: (updates: Record) => + toSessionUpdates(updates as Parameters[0]), + maybeForceE2eStepFailure, +}); -async function recordSessionComplete(updates: SessionUpdates = {}): Promise { - const runtime = getOnboardRuntime(); - const current = await runtime.session(); - if (current.machine.state === "finalizing") { - await runtime.transition("post_verify"); - return runtime.complete(updates); - } - if (current.machine.state === "post_verify") { - return runtime.complete(updates); - } - return runtime.completeSession(updates); -} +const startRecordedStep = onboardRuntimeBoundary.startRecordedStep.bind(onboardRuntimeBoundary); +const recordStepComplete = onboardRuntimeBoundary.recordStepComplete.bind(onboardRuntimeBoundary); +const recordStepSkipped = onboardRuntimeBoundary.recordStepSkipped.bind(onboardRuntimeBoundary); +const recordStepFailed = onboardRuntimeBoundary.recordStepFailed.bind(onboardRuntimeBoundary); +const recordStateSkipped = onboardRuntimeBoundary.recordStateSkipped.bind(onboardRuntimeBoundary); +const recordRepairEvent = onboardRuntimeBoundary.recordRepairEvent.bind(onboardRuntimeBoundary); +const recordSessionComplete = onboardRuntimeBoundary.recordSessionComplete.bind(onboardRuntimeBoundary); const ONBOARD_STEP_INDEX: Record = { preflight: { number: 1, title: "Preflight checks" }, @@ -9125,7 +8154,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; _preflightDashboardPort = opts.controlUiPort || null; - ONBOARD_RUNTIME = new OnboardRuntime(); + onboardRuntimeBoundary.reset(); delete process.env.OPENSHELL_GATEWAY; const resume = opts.resume === true; const fresh = opts.fresh === true; @@ -9708,8 +8737,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { cleanupTempDir, startRecordedStep, recordStepComplete, - recordStepFailed: (stepName: string, message: string | null) => - getOnboardRuntime().markStepFailed(stepName, message), + recordStepFailed, skippedStepMessage, }), ensureAgentDashboardForward, @@ -9816,7 +8844,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { }); } finally { releaseOnboardLock(); - ONBOARD_RUNTIME = null; + onboardRuntimeBoundary.clear(); } } diff --git a/src/lib/onboard/dashboard.ts b/src/lib/onboard/dashboard.ts new file mode 100644 index 0000000000..118b90476a --- /dev/null +++ b/src/lib/onboard/dashboard.ts @@ -0,0 +1,436 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; + +import type { AgentDefinition } from "../agent/defs"; +import { DASHBOARD_PORT } from "../core/ports"; +import { buildChain, buildControlUiUrls } from "../dashboard/contract"; +import * as nim from "../inference/nim"; +import { runCapture as defaultRunCapture } from "../runner"; +import * as dashboardAccess from "./dashboard-access"; +import { + findAvailableDashboardPort, + getOccupiedPorts, + isLiveForwardStatus, +} from "./dashboard-port"; +import { + looksLikeForwardPortConflict, + runBackgroundForwardStartWithPortReleaseRetries, +} from "./forward-start"; + +const ANSI_RE = /\x1B(?:\[[0-?]*[ -/]*[@-~]|\][^\x07]*(?:\x07|\x1B\\)|[@-_])/g; +export const CONTROL_UI_PORT = DASHBOARD_PORT; + +type CommandResult = { status: number | null }; + +export interface OnboardDashboardDeps { + runOpenshell(args: string[], opts?: Record): CommandResult; + runCaptureOpenshell(args: string[], opts?: Record): string | null; + runCapture?: typeof defaultRunCapture; + cliName(): string; + agentProductName(): string; + getProviderLabel(provider: string): string; + note(message: string): void; + isWsl(): boolean; + redact(value: unknown): string; + sleep(seconds: number): void; + printAgentDashboardUi( + sandboxName: string, + token: string | null, + agent: AgentDefinition, + deps: { + note: (msg: string) => void; + buildControlUiUrls: (token: string | null, port: number) => string[]; + }, + ): void; +} + +export interface OnboardDashboardHelpers { + buildChain: typeof buildChain; + buildControlUiUrls: typeof buildControlUiUrls; + buildOrphanedSandboxRollbackMessage( + sandboxName: string, + err: unknown, + deleteSucceeded: boolean, + ): string[]; + ensureDashboardForward( + sandboxName: string, + chatUiUrl?: string, + options?: { rollbackSandboxOnFailure?: boolean }, + ): number; + ensureAgentDashboardForward( + sandboxName: string, + agent: { forwardPort?: number | null }, + ): number; + fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null; + getDashboardForwardPort( + chatUiUrl?: string, + options?: Parameters[1], + ): string; + getDashboardForwardTarget( + chatUiUrl?: string, + options?: Parameters[1], + ): string; + getWslHostAddress( + options?: Parameters[0], + ): string | null; + printDashboard( + sandboxName: string, + model: string, + provider: string, + nimContainer?: string | null, + agent?: AgentDefinition | null, + ): void; + stopAllDashboardForwards(): void; +} + +function findForwardEntry( + forwardListOutput: string | null | undefined, + port: string, +): { sandboxName: string; status: string } | null { + if (!forwardListOutput) return null; + for (const rawLine of forwardListOutput.split("\n")) { + const line = rawLine.replace(ANSI_RE, ""); + if (/^\s*SANDBOX\s/i.test(line)) continue; + const parts = line.trim().split(/\s+/); + if (parts.length < 3 || parts[2] !== port) continue; + return { + sandboxName: parts[0] || "", + status: (parts[4] || "").toLowerCase(), + }; + } + return null; +} + +function getRunningForwardPorts(forwardListOutput: string | null | undefined): string[] { + const ports = new Set(); + if (!forwardListOutput) return []; + for (const rawLine of forwardListOutput.split("\n")) { + const line = rawLine.replace(ANSI_RE, ""); + if (/^\s*SANDBOX\s/i.test(line)) continue; + const parts = line.trim().split(/\s+/); + if (parts.length < 5 || !/^\d+$/.test(parts[2])) continue; + const status = (parts[4] || "").toLowerCase(); + if (isLiveForwardStatus(status)) { + ports.add(parts[2]); + } + } + return [...ports]; +} + +function findOpenclawJsonPath(dir: string): string | null { + if (!fs.existsSync(dir)) return null; + const entries = fs.readdirSync(dir, { withFileTypes: true }); + for (const entry of entries) { + const entryPath = path.join(dir, entry.name); + if (entry.isDirectory()) { + const found: string | null = findOpenclawJsonPath(entryPath); + if (found) return found; + } else if (entry.name === "openclaw.json") { + return entryPath; + } + } + return null; +} + +function dashboardUrlForDisplay(url: string, deps: OnboardDashboardDeps): string { + return dashboardAccess.dashboardUrlForDisplay(url, deps.redact); +} + +export function createOnboardDashboardHelpers(deps: OnboardDashboardDeps): OnboardDashboardHelpers { + const runCapture = deps.runCapture ?? defaultRunCapture; + + function getDashboardForwardPort( + chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`, + options: Parameters[1] = {}, + ): string { + return dashboardAccess.getDashboardForwardPort(chatUiUrl, { + ...options, + runCapture: options.runCapture || runCapture, + }); + } + + function getDashboardForwardTarget( + chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`, + options: Parameters[1] = {}, + ): string { + return dashboardAccess.getDashboardForwardTarget(chatUiUrl, { + ...options, + runCapture: options.runCapture || runCapture, + }); + } + + function getWslHostAddress( + options: Parameters[0] = {}, + ): string | null { + return dashboardAccess.getWslHostAddress({ ...options, runCapture: options.runCapture || runCapture }); + } + + function stopAllDashboardForwards(): void { + const forwardList = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true }); + for (const port of getRunningForwardPorts(forwardList)) { + deps.runOpenshell(["forward", "stop", port], { ignoreError: true }); + } + } + + function buildOrphanedSandboxRollbackMessage( + sandboxName: string, + err: unknown, + deleteSucceeded: boolean, + ): string[] { + const lines = [ + "", + ` Could not allocate a dashboard port for '${sandboxName}'.`, + ` ${err instanceof Error ? err.message : String(err)}`, + ]; + if (deleteSucceeded) { + lines.push(" The orphaned sandbox has been removed — you can safely retry."); + } else { + lines.push(" Could not remove the orphaned sandbox. Manual cleanup:"); + lines.push(` openshell sandbox delete "${sandboxName}"`); + } + return lines; + } + + function rollbackSandboxAndExit(sandboxName: string, err: unknown): never { + const delResult = deps.runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); + for (const line of buildOrphanedSandboxRollbackMessage( + sandboxName, + err, + delResult.status === 0, + )) { + console.error(line); + } + process.exit(1); + } + + function ensureDashboardForward( + sandboxName: string, + chatUiUrl = `http://127.0.0.1:${CONTROL_UI_PORT}`, + options: { rollbackSandboxOnFailure?: boolean } = {}, + ): number { + const { rollbackSandboxOnFailure = false } = options; + const preferredPort = Number(getDashboardForwardPort(chatUiUrl)); + let existingForwards = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true }); + const preferredEntry = findForwardEntry(existingForwards, String(preferredPort)); + if ( + preferredEntry && + (preferredEntry.sandboxName === sandboxName || !isLiveForwardStatus(preferredEntry.status)) + ) { + deps.runOpenshell(["forward", "stop", String(preferredPort)], { ignoreError: true }); + existingForwards = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true }); + } + let actualPort: number; + try { + actualPort = findAvailableDashboardPort(sandboxName, preferredPort, existingForwards); + } catch (err) { + if (!rollbackSandboxOnFailure) throw err; + rollbackSandboxAndExit(sandboxName, err); + } + + if (actualPort !== preferredPort) { + if (rollbackSandboxOnFailure) { + const err = new Error( + `Dashboard port ${preferredPort} became host-bound during sandbox build; ` + + `cannot reallocate to ${actualPort} after the sandbox has been created with ` + + `CHAT_UI_URL=${preferredPort}. Free the port and re-run \`${deps.cliName()} onboard\`, ` + + `or pass \`--control-ui-port \` to pick a different dashboard port.`, + ); + rollbackSandboxAndExit(sandboxName, err); + } + console.warn(` ! Port ${preferredPort} is taken. Using port ${actualPort} instead.`); + } + + const occupied = getOccupiedPorts(existingForwards); + for (const [port, owner] of occupied.entries()) { + if (owner === sandboxName && Number(port) !== actualPort) { + deps.runOpenshell(["forward", "stop", port], { ignoreError: true }); + } + } + + const parsedUrl = new URL(chatUiUrl.includes("://") ? chatUiUrl : `http://${chatUiUrl}`); + parsedUrl.port = String(actualPort); + const actualTarget = getDashboardForwardTarget(parsedUrl.toString()); + deps.runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true }); + const { result: fwdResult, diagnostic: fwdDiagnostic } = runBackgroundForwardStartWithPortReleaseRetries( + (stdio, timeout) => + deps.runOpenshell( + ["forward", "start", "--background", actualTarget, sandboxName], + { ignoreError: true, suppressOutput: true, stdio, timeout }, + ), + () => { + deps.sleep(1); + deps.runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true }); + }, + ); + if (fwdResult && fwdResult.status !== 0) { + const looksLikePortConflict = looksLikeForwardPortConflict(fwdDiagnostic); + if (rollbackSandboxOnFailure) { + const err = new Error( + looksLikePortConflict + ? `Failed to start dashboard forward on port ${actualPort} — the host port ` + + `is held by another process. Free it and run \`${deps.cliName()} onboard\` again, ` + + `or pass \`--control-ui-port \` to pick a different dashboard port.` + : `Failed to start dashboard forward on port ${actualPort}: ${fwdDiagnostic.slice(0, 240)}`, + ); + rollbackSandboxAndExit(sandboxName, err); + } + if (looksLikePortConflict) { + console.warn( + `! Port ${actualPort} forward did not start — port may be in use by another process.`, + ); + console.warn( + ` Check: docker ps --format 'table {{.Names}}\\t{{.Ports}}' | grep ${actualPort}`, + ); + console.warn(` Free the port, then reconnect: ${deps.cliName()} ${sandboxName} connect`); + } else { + console.warn(`! Port ${actualPort} forward did not start: ${fwdDiagnostic.slice(0, 240)}`); + console.warn(` Reconnect after resolving the issue: ${deps.cliName()} ${sandboxName} connect`); + } + } + return actualPort; + } + + function ensureAgentDashboardForward( + sandboxName: string, + agent: { forwardPort?: number | null }, + ): number { + const agentDashboardPort = agent.forwardPort ?? CONTROL_UI_PORT; + const agentDashboardUrl = `http://127.0.0.1:${agentDashboardPort}`; + const actualAgentDashboardPort = ensureDashboardForward(sandboxName, agentDashboardUrl); + process.env.CHAT_UI_URL = `http://127.0.0.1:${actualAgentDashboardPort}`; + return actualAgentDashboardPort; + } + + function fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-token-")); + try { + const destDir = `${tmpDir}${path.sep}`; + const result = deps.runOpenshell( + ["sandbox", "download", sandboxName, "/sandbox/.openclaw/openclaw.json", destDir], + { ignoreError: true, stdio: ["ignore", "ignore", "ignore"] }, + ); + if (result.status !== 0) return null; + const jsonPath = findOpenclawJsonPath(tmpDir); + if (!jsonPath) return null; + const cfg = JSON.parse(fs.readFileSync(jsonPath, "utf-8")); + const token = cfg && cfg.gateway && cfg.gateway.auth && cfg.gateway.auth.token; + return typeof token === "string" && token.length > 0 ? token : null; + } catch { + return null; + } finally { + try { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } catch { + // ignore cleanup errors + } + } + } + + function printDashboard( + sandboxName: string, + model: string, + provider: string, + nimContainer: string | null = null, + agent: AgentDefinition | null = null, + ): void { + const nimStat = nimContainer ? nim.nimStatusByName(nimContainer) : nim.nimStatus(sandboxName); + const showNim = nim.shouldShowNimLine(nimContainer, nimStat.running); + const nimLabel = nimStat.running ? "running" : "not running"; + const providerLabel = deps.getProviderLabel(provider); + const token = fetchGatewayAuthTokenFromSandbox(sandboxName); + const chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`; + const wslAddr = getWslHostAddress(); + const chain = buildChain({ chatUiUrl, isWsl: deps.isWsl(), wslHostAddress: wslAddr }); + + const dashboardAccessEntries = buildControlUiUrls(token, chain.port, chain.accessUrl).map((url, index) => ({ + label: index === 0 ? "Dashboard" : `Alt ${index}`, + url, + })); + if (wslAddr) { + const wslUrl = `http://${wslAddr}:${chain.port}/${token ? `#token=${encodeURIComponent(token)}` : ""}`; + const existing = dashboardAccessEntries.find((entry) => entry.url === wslUrl); + if (existing) existing.label = "VS Code/WSL"; + else dashboardAccessEntries.push({ label: "VS Code/WSL", url: wslUrl }); + } + const guidanceLines = [`Port ${chain.port} must be forwarded before opening these URLs.`]; + if (deps.isWsl()) { + guidanceLines.push( + "WSL detected: if localhost fails in Windows, use the WSL host IP shown by `hostname -I`.", + ); + } + if (dashboardAccessEntries.length === 0) guidanceLines.push("No dashboard URLs were generated."); + + console.log(""); + console.log(` ${"─".repeat(50)}`); + console.log(` Sandbox ${sandboxName} (Landlock + seccomp + netns)`); + console.log(` Model ${model} (${providerLabel})`); + if (showNim) { + console.log(` NIM ${nimLabel}`); + } + console.log(` ${"─".repeat(50)}`); + console.log(` Run: ${deps.cliName()} ${sandboxName} connect`); + console.log(` Status: ${deps.cliName()} ${sandboxName} status`); + console.log(` Logs: ${deps.cliName()} ${sandboxName} logs --follow`); + console.log(""); + if (agent) { + deps.printAgentDashboardUi(sandboxName, token, agent, { + note: deps.note, + buildControlUiUrls: (tokenValue: string | null, port: number) => { + return buildControlUiUrls(tokenValue, port, chain.accessUrl); + }, + }); + } else if (token) { + console.log( + ` ${deps.agentProductName()} UI (auth token redacted from displayed URLs)`, + ); + for (const line of guidanceLines) { + console.log(` ${line}`); + } + for (const entry of dashboardAccessEntries) { + console.log(` ${entry.label}: ${dashboardUrlForDisplay(entry.url, deps)}`); + } + console.log(` Token: ${deps.cliName()} ${sandboxName} gateway-token --quiet`); + console.log(" append #token= locally if the browser asks for auth."); + } else { + deps.note(" Could not read gateway token from the sandbox (download failed)."); + console.log(` ${deps.agentProductName()} UI`); + for (const line of guidanceLines) { + console.log(` ${line}`); + } + for (const entry of dashboardAccessEntries) { + console.log(` ${entry.label}: ${dashboardUrlForDisplay(entry.url, deps)}`); + } + console.log( + ` Token: ${deps.cliName()} ${sandboxName} connect → jq -r '.gateway.auth.token' /sandbox/.openclaw/openclaw.json`, + ); + console.log(" append #token= to the URL locally if needed."); + } + console.log(` ${"─".repeat(50)}`); + console.log(""); + console.log(" To change settings later:"); + console.log( + ` Model: ${deps.cliName()} inference get\n ${deps.cliName()} inference set --model --provider --sandbox ${sandboxName}`, + ); + console.log(` Policies: ${deps.cliName()} ${sandboxName} policy-add`); + console.log(` Credentials: ${deps.cliName()} credentials reset then ${deps.cliName()} onboard`); + console.log(""); + } + + return { + buildChain, + buildControlUiUrls, + buildOrphanedSandboxRollbackMessage, + ensureDashboardForward, + ensureAgentDashboardForward, + fetchGatewayAuthTokenFromSandbox, + getDashboardForwardPort, + getDashboardForwardTarget, + getWslHostAddress, + printDashboard, + stopAllDashboardForwards, + }; +} diff --git a/src/lib/onboard/model-router.ts b/src/lib/onboard/model-router.ts new file mode 100644 index 0000000000..81ca0d10d7 --- /dev/null +++ b/src/lib/onboard/model-router.ts @@ -0,0 +1,522 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawn, spawnSync } from "node:child_process"; +import crypto from "node:crypto"; +import fs from "node:fs"; +import http from "node:http"; +import os from "node:os"; +import path from "node:path"; + +import { + normalizeCredentialValue, + resolveProviderCredential, + saveCredential, +} from "../credentials/store"; +import { ROOT, run, runCapture } from "../runner"; +import { hashCredential } from "../security/credential-hash"; +import type { Session } from "../state/onboard-session"; +import * as onboardSession from "../state/onboard-session"; +import { buildSubprocessEnv } from "../subprocess-env"; +import { hydrateCredentialEnv } from "./credential-env"; +import { prepareModelRouterVenv } from "./model-router-python"; + +const ROUTER_HEALTH_RETRIES = 15; +const ROUTER_HEALTH_INTERVAL_MS = 2000; +const ROUTER_HEALTH_TIMEOUT_MS = 3000; +const MODEL_ROUTER_RELATIVE_DIR = path.join("nemoclaw-blueprint", "router", "llm-router"); +const MODEL_ROUTER_VENV_DIR = path.join(os.homedir(), ".nemoclaw", "model-router-venv"); +const MODEL_ROUTER_FINGERPRINT_FILE = ".nemoclaw-source-fingerprint"; +const MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES = new Set([ + ".git", + ".hg", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + ".svn", + ".venv", + "__pycache__", + "build", + "dist", + "node_modules", + "venv", +]); +export const DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV = "NVIDIA_API_KEY"; + +export type BlueprintRouterConfig = { + enabled?: boolean; + port?: number; + pool_config_path?: string; + credential_env?: string; +}; + +export type BlueprintInferenceProfile = { + provider_name?: string; + endpoint?: string; + model: string; + credential_env?: string; + credential_default?: string; + router: BlueprintRouterConfig; +}; + +function requireValue(value: T | null | undefined, message: string): T { + if (value === null || value === undefined) { + throw new Error(message); + } + return value; +} + +/** + * Load a named inference profile and router config from blueprint.yaml. + * Returns null if the blueprint or profile is missing. + */ +export function loadBlueprintProfile( + profileName: string, + rootDir: string = ROOT, +): BlueprintInferenceProfile | null { + try { + const YAML = require("yaml"); + const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml"); + if (!fs.existsSync(blueprintPath)) return null; + const raw = fs.readFileSync(blueprintPath, "utf8"); + const parsed = YAML.parse(raw); + const profile = parsed?.components?.inference?.profiles?.[profileName]; + if (!profile) return null; + const router = { ...(parsed?.components?.router || {}) }; + if (typeof profile.credential_env === "string" && profile.credential_env.trim().length > 0) { + router.credential_env = profile.credential_env; + } + return { ...profile, router } as BlueprintInferenceProfile; + } catch { + return null; + } +} + +async function isRouterHealthy(port: number, timeoutMs = ROUTER_HEALTH_TIMEOUT_MS): Promise { + return new Promise((resolve) => { + let settled = false; + const settle = (healthy: boolean) => { + if (settled) return; + settled = true; + resolve(healthy); + }; + const request = http + .get(`http://127.0.0.1:${port}/health`, (res: http.IncomingMessage) => { + res.resume(); + settle((res.statusCode || 0) >= 200 && (res.statusCode || 0) < 300); + }) + .on("error", () => settle(false)); + request.setTimeout(timeoutMs, () => { + request.destroy(); + settle(false); + }); + }); +} + +function isProcessRunning(pid: number | null | undefined): boolean { + if (!Number.isInteger(pid) || Number(pid) <= 0) return false; + try { + process.kill(Number(pid), 0); + return true; + } catch { + return false; + } +} + +async function stopModelRouterProcess(pid: number, port: number): Promise { + try { + process.kill(pid, "SIGTERM"); + } catch { + return; + } + for (let attempt = 0; attempt < 10; attempt++) { + await new Promise((resolve) => setTimeout(resolve, 500)); + if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return; + } + try { + process.kill(pid, "SIGKILL"); + } catch { + // already stopped + } + for (let attempt = 0; attempt < 5; attempt++) { + await new Promise((resolve) => setTimeout(resolve, 500)); + if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return; + } +} + +function resolveHostCommandPath(commandName: string): string | null { + const result = runCapture(["sh", "-c", 'command -v "$1"', "--", commandName], { + ignoreError: true, + }).trim(); + return result || null; +} + +function modelRouterPackageDir(): string { + return path.join(ROOT, MODEL_ROUTER_RELATIVE_DIR); +} + +function modelRouterVenvDir(): string { + return process.env.NEMOCLAW_MODEL_ROUTER_VENV || MODEL_ROUTER_VENV_DIR; +} + +function modelRouterCommandPath(venvDir = modelRouterVenvDir()): string { + return path.join(venvDir, "bin", "model-router"); +} + +function modelRouterFingerprintPath(venvDir = modelRouterVenvDir()): string { + return path.join(venvDir, MODEL_ROUTER_FINGERPRINT_FILE); +} + +function isExecutableFile(filePath: string): boolean { + try { + fs.accessSync(filePath, fs.constants.X_OK); + return true; + } catch { + return false; + } +} + +function isModelRouterPackageReady(routerDir = modelRouterPackageDir()): boolean { + return fs.existsSync(path.join(routerDir, "pyproject.toml")) || + fs.existsSync(path.join(routerDir, "setup.py")); +} + +function shouldSkipModelRouterFingerprintEntry(name: string): boolean { + return MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES.has(name) || name.endsWith(".egg-info"); +} + +function hashModelRouterSourceTree(routerDir = modelRouterPackageDir()): string | null { + const sourceHash = crypto.createHash("sha256"); + + const hashDirectory = (currentDir: string): boolean => { + let entries: fs.Dirent[]; + try { + entries = fs + .readdirSync(currentDir, { withFileTypes: true }) + .sort((left: fs.Dirent, right: fs.Dirent) => left.name.localeCompare(right.name)); + } catch { + return false; + } + + let hashedSourceFile = false; + for (const entry of entries) { + if (shouldSkipModelRouterFingerprintEntry(entry.name)) continue; + if (entry.name.endsWith(".pyc") || entry.name.endsWith(".pyo")) continue; + + const entryPath = path.join(currentDir, entry.name); + const relativePath = path.relative(routerDir, entryPath).split(path.sep).join("/"); + if (entry.isDirectory()) { + hashedSourceFile = hashDirectory(entryPath) || hashedSourceFile; + continue; + } + if (entry.isSymbolicLink()) { + try { + sourceHash.update(`link:${relativePath}\0`); + sourceHash.update(fs.readlinkSync(entryPath)); + sourceHash.update("\0"); + hashedSourceFile = true; + } catch { + // Ignore unreadable links; the install step will fail if they are required. + } + continue; + } + if (!entry.isFile()) continue; + sourceHash.update(`file:${relativePath}\0`); + sourceHash.update(fs.readFileSync(entryPath)); + sourceHash.update("\0"); + hashedSourceFile = true; + } + return hashedSourceFile; + }; + + return hashDirectory(routerDir) ? `files:${sourceHash.digest("hex")}` : null; +} + +function getModelRouterSourceFingerprint(routerDir = modelRouterPackageDir()): string | null { + const gitHead = runCapture(["git", "-C", routerDir, "rev-parse", "HEAD"], { + ignoreError: true, + }).trim(); + if (/^[0-9a-f]{40}$/i.test(gitHead)) return `git:${gitHead}`; + + const gitLink = runCapture(["git", "-C", ROOT, "rev-parse", `HEAD:${MODEL_ROUTER_RELATIVE_DIR}`], { + ignoreError: true, + }).trim(); + if (/^[0-9a-f]{40}$/i.test(gitLink)) return `gitlink:${gitLink}`; + + return hashModelRouterSourceTree(routerDir); +} + +function readModelRouterInstalledFingerprint(venvDir = modelRouterVenvDir()): string | null { + try { + const fingerprint = fs.readFileSync(modelRouterFingerprintPath(venvDir), "utf8").trim(); + return fingerprint || null; + } catch { + return null; + } +} + +function writeModelRouterInstalledFingerprint( + fingerprint: string | null, + venvDir = modelRouterVenvDir(), +): void { + if (!fingerprint) return; + fs.writeFileSync(modelRouterFingerprintPath(venvDir), `${fingerprint}\n`, { mode: 0o600 }); +} + +function isManagedModelRouterCurrent( + routerDir = modelRouterPackageDir(), + venvDir = modelRouterVenvDir(), +): boolean { + if (!isExecutableFile(modelRouterCommandPath(venvDir))) return false; + const sourceFingerprint = getModelRouterSourceFingerprint(routerDir); + return Boolean( + sourceFingerprint && readModelRouterInstalledFingerprint(venvDir) === sourceFingerprint, + ); +} + +function initializeModelRouterSubmodule(routerDir = modelRouterPackageDir()): void { + if (isModelRouterPackageReady(routerDir)) return; + if (!fs.existsSync(path.join(ROOT, ".gitmodules")) || !fs.existsSync(path.join(ROOT, ".git"))) { + return; + } + console.log(" Initializing Model Router source..."); + run(["git", "-C", ROOT, "submodule", "update", "--init", "--depth", "1", MODEL_ROUTER_RELATIVE_DIR], { + ignoreError: true, + }); +} + +function installModelRouterCommand(routerDir = modelRouterPackageDir()): string { + initializeModelRouterSubmodule(routerDir); + if (!isModelRouterPackageReady(routerDir)) { + throw new Error( + `Model Router source is not initialized at ${routerDir}. ` + + `Run: git -C ${ROOT} submodule update --init --depth 1 ${MODEL_ROUTER_RELATIVE_DIR}`, + ); + } + + const venvDir = modelRouterVenvDir(); + const routerCommand = modelRouterCommandPath(venvDir); + const sourceFingerprint = getModelRouterSourceFingerprint(routerDir); + const allowReplaceExistingVenv = + path.resolve(venvDir) === path.resolve(MODEL_ROUTER_VENV_DIR) || + readModelRouterInstalledFingerprint(venvDir) !== null; + const venvPython = prepareModelRouterVenv({ + venvDir, + allowReplaceExisting: allowReplaceExistingVenv, + }); + + const installResult = run( + [venvPython, "-m", "pip", "install", "--quiet", "--upgrade", `${routerDir}[prefill,proxy]`], + { + ignoreError: true, + timeout: 600_000, + }, + ); + if (installResult.status !== 0) { + throw new Error("Failed to install Model Router dependencies."); + } + if (!isExecutableFile(routerCommand)) { + throw new Error("Model Router install did not produce the model-router command."); + } + writeModelRouterInstalledFingerprint(sourceFingerprint, venvDir); + return routerCommand; +} + +function ensureModelRouterCommand(): string { + const routerDir = modelRouterPackageDir(); + const venvDir = modelRouterVenvDir(); + const managedCommand = modelRouterCommandPath(venvDir); + + if (isModelRouterPackageReady(routerDir) && isManagedModelRouterCurrent(routerDir, venvDir)) { + return managedCommand; + } + + if (!isModelRouterPackageReady(routerDir)) { + initializeModelRouterSubmodule(routerDir); + } + + if (isModelRouterPackageReady(routerDir)) { + if (isManagedModelRouterCurrent(routerDir, venvDir)) return managedCommand; + return installModelRouterCommand(routerDir); + } + + if (isExecutableFile(managedCommand)) return managedCommand; + return resolveHostCommandPath("model-router") || installModelRouterCommand(); +} + +/** + * Start the model-router proxy and wait for it to become healthy. + * Follows the same pattern as Ollama startup (spawn detached, poll health). + * Returns the PID of the child process. + */ +async function startModelRouter(routerCfg: BlueprintRouterConfig): Promise { + const routerCommand = ensureModelRouterCommand(); + const port = routerCfg.port || 4000; + const blueprintDir = path.join(ROOT, "nemoclaw-blueprint"); + const poolConfigPath = path.join( + blueprintDir, + routerCfg.pool_config_path || "router/pool-config.yaml", + ); + const stateDir = path.join(os.homedir(), ".nemoclaw", "state"); + const litellmConfigPath = path.join(stateDir, "litellm-proxy.yaml"); + + fs.mkdirSync(stateDir, { recursive: true }); + + const proxyConfigResult = spawnSync( + routerCommand, + ["proxy-config", "--config", poolConfigPath, "--output", litellmConfigPath], + { encoding: "utf8", timeout: 30_000, cwd: blueprintDir }, + ); + if (proxyConfigResult.status !== 0) { + throw new Error( + `model-router proxy-config failed: ${proxyConfigResult.stderr || proxyConfigResult.error || "unknown error"}`, + ); + } + + const credEnvVars: Record = {}; + const credName = routerCfg.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV; + const routedCredential = resolveProviderCredential(credName); + const openAiCredential = resolveProviderCredential("OPENAI_API_KEY"); + if (routedCredential) { + credEnvVars[credName] = routedCredential; + if (!openAiCredential) credEnvVars.OPENAI_API_KEY = routedCredential; + } + if (openAiCredential) credEnvVars.OPENAI_API_KEY = openAiCredential; + const _providerKey = (process.env.NEMOCLAW_PROVIDER_KEY || "").trim(); + if (_providerKey) { + if (!credEnvVars[credName]) credEnvVars[credName] = _providerKey; + if (!credEnvVars.OPENAI_API_KEY) credEnvVars.OPENAI_API_KEY = _providerKey; + } + + if (await isRouterHealthy(port)) { + throw new Error( + `Port ${port} already has a healthy router endpoint; refusing to start a second router.`, + ); + } + + const child = spawn( + routerCommand, + [ + "proxy", + "--litellm-config", litellmConfigPath, + "--router-config", poolConfigPath, + "--host", "0.0.0.0", + "--port", String(port), + ], + { + detached: true, + stdio: "ignore", + cwd: blueprintDir, + env: buildSubprocessEnv(credEnvVars), + }, + ); + let childExited = false; + let childExitDetail = ""; + child.once("error", (err: Error) => { + childExited = true; + childExitDetail = `child failed to start: ${err.message}`; + }); + child.once("exit", (code: number | null, signal: string | null) => { + childExited = true; + if (!childExitDetail) { + childExitDetail = `child exited with code ${code ?? "null"}${signal ? ` signal ${signal}` : ""}`; + } + }); + child.unref(); + + const pid = child.pid; + if (!pid) { + throw new Error( + "Failed to start model-router proxy: no PID returned" + + (childExitDetail ? ` (${childExitDetail})` : ""), + ); + } + + for (let attempt = 0; attempt < ROUTER_HEALTH_RETRIES; attempt++) { + await new Promise((resolve) => setTimeout(resolve, ROUTER_HEALTH_INTERVAL_MS)); + if (childExited) break; + const healthy = await isRouterHealthy(port); + let processAlive = true; + try { + process.kill(pid, 0); + } catch { + processAlive = false; + } + if (healthy && processAlive) return pid; + if (!processAlive) { + childExited = true; + if (!childExitDetail) childExitDetail = "child process is no longer running"; + break; + } + } + try { + process.kill(pid, "SIGTERM"); + } catch { + // already dead + } + throw new Error( + `Model router failed to become healthy on port ${port} after ${ROUTER_HEALTH_RETRIES} attempts` + + (childExitDetail ? ` (${childExitDetail})` : ""), + ); +} + +function getRoutedProfile(): BlueprintInferenceProfile { + const bp = loadBlueprintProfile("routed"); + if (!bp || bp.router?.enabled !== true) { + throw new Error("Router is not enabled in nemoclaw-blueprint/blueprint.yaml."); + } + return bp; +} + +export function isRoutedInferenceProvider(provider: string | null | undefined): boolean { + if (!provider) return false; + if (provider === "nvidia-router") return true; + const bp = loadBlueprintProfile("routed"); + return Boolean(bp?.provider_name && provider === bp.provider_name); +} + +export async function reconcileModelRouter(): Promise { + const bp = getRoutedProfile(); + const routerPort = bp.router.port || 4000; + const routerCredentialEnv = + bp.router.credential_env || bp.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV; + const routerCredential = + hydrateCredentialEnv(routerCredentialEnv) || + normalizeCredentialValue(bp.credential_default || ""); + if (!routerCredential) { + throw new Error(`${routerCredentialEnv} is required to start Model Router.`); + } + saveCredential(routerCredentialEnv, routerCredential); + const routerCredentialHash = hashCredential(routerCredential); + const session = onboardSession.loadSession(); + const recordedPid = session?.routerPid ?? null; + const recordedCredentialHash = session?.routerCredentialHash ?? null; + + if (await isRouterHealthy(routerPort)) { + if ( + routerCredentialHash && + recordedCredentialHash === routerCredentialHash && + isProcessRunning(recordedPid) + ) { + console.log(` ✓ Model router is already healthy on port ${routerPort}`); + return; + } + if (isProcessRunning(recordedPid)) { + console.log(" Restarting model router with updated credentials..."); + await stopModelRouterProcess(requireValue(recordedPid, "Expected recorded router PID"), routerPort); + } else { + throw new Error( + `Port ${routerPort} already has a healthy router endpoint, but its credential state is unknown. Stop the existing model-router process and rerun onboarding.`, + ); + } + } + + console.log(" Starting model router..."); + const routerPid = await startModelRouter(bp.router); + console.log(` ✓ Model router started (PID ${routerPid}) on port ${routerPort}`); + onboardSession.updateSession((current: Session) => { + current.routerPid = routerPid; + current.routerCredentialHash = routerCredentialHash; + return current; + }); +} diff --git a/src/lib/onboard/runtime-boundary.ts b/src/lib/onboard/runtime-boundary.ts new file mode 100644 index 0000000000..be9cc339d8 --- /dev/null +++ b/src/lib/onboard/runtime-boundary.ts @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { Session, SessionUpdates } from "../state/onboard-session"; +import { OnboardRuntime } from "./machine/runtime"; +import type { OnboardMachineState } from "./machine/types"; + +export interface OnboardRuntimeBoundaryOptions { + toSessionUpdates(updates: Record): SessionUpdates; + maybeForceE2eStepFailure(stepName: string): void; +} + +export class OnboardRuntimeBoundary { + private runtime: OnboardRuntime | null = null; + + constructor(private readonly options: OnboardRuntimeBoundaryOptions) {} + + reset(): void { + this.runtime = new OnboardRuntime(); + } + + clear(): void { + this.runtime = null; + } + + getRuntime(): OnboardRuntime { + if (!this.runtime) this.runtime = new OnboardRuntime(); + return this.runtime; + } + + async startRecordedStep( + stepName: string, + updates: { + sandboxName?: string | null; + provider?: string | null; + model?: string | null; + policyPresets?: string[] | null; + } = {}, + ): Promise { + const runtime = this.getRuntime(); + await runtime.markStepStarted(stepName); + if (Object.keys(updates).length > 0) { + await runtime.updateContext(this.options.toSessionUpdates(updates)); + } + this.options.maybeForceE2eStepFailure(stepName); + } + + async recordStepComplete( + stepName: string, + updates: SessionUpdates = {}, + ): Promise { + return this.getRuntime().markStepComplete(stepName, updates); + } + + async recordStepSkipped(stepName: string): Promise { + return this.getRuntime().markStepSkipped(stepName); + } + + async recordStepFailed(stepName: string, message: string | null): Promise { + return this.getRuntime().markStepFailed(stepName, message); + } + + async recordStateSkipped( + state: OnboardMachineState, + metadata: Record | null = null, + ): Promise { + return this.getRuntime().markSkipped(state, metadata); + } + + async recordRepairEvent( + type: "state.repair.started" | "state.repair.completed" | "state.repair.failed", + options: { + state?: OnboardMachineState | null; + error?: string | null; + metadata?: Record | null; + } = {}, + ): Promise { + return this.getRuntime().emitRepairEvent(type, options); + } + + async recordSessionComplete(updates: SessionUpdates = {}): Promise { + const runtime = this.getRuntime(); + const current = await runtime.session(); + if (current.machine.state === "finalizing") { + await runtime.transition("post_verify"); + return runtime.complete(updates); + } + if (current.machine.state === "post_verify") { + return runtime.complete(updates); + } + return runtime.completeSession(updates); + } +} diff --git a/src/lib/onboard/session-updates.ts b/src/lib/onboard/session-updates.ts new file mode 100644 index 0000000000..529d22e531 --- /dev/null +++ b/src/lib/onboard/session-updates.ts @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { WebSearchConfig } from "../inference/web-search"; +import type { MessagingChannelConfig } from "../messaging-channel-config"; +import type { HermesAuthMethod, SessionUpdates } from "../state/onboard-session"; + +export interface OnboardSessionUpdateInput { + sandboxName?: string | null; + provider?: string | null; + model?: string | null; + endpointUrl?: string | null; + credentialEnv?: string | null; + hermesAuthMethod?: HermesAuthMethod | string | null; + preferredInferenceApi?: string | null; + nimContainer?: string | null; + webSearchConfig?: WebSearchConfig | null; + policyPresets?: string[] | null; + messagingChannels?: string[] | null; + messagingChannelConfig?: MessagingChannelConfig | null; + hermesToolGateways?: string[] | null; +} + +// Preserve the nullable contract end-to-end: `null` means "clear this +// field on the persisted session", `undefined` means "leave unchanged". +function toNullableString(value: string | null | undefined): string | null | undefined { + if (value === undefined) return undefined; + if (value === null) return null; + return value; +} + +function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null { + return value === "oauth" || value === "api_key" ? value : null; +} + +export function toSessionUpdates(updates: OnboardSessionUpdateInput = {}): SessionUpdates { + const normalized: SessionUpdates = {}; + if (updates.sandboxName !== undefined) + normalized.sandboxName = toNullableString(updates.sandboxName); + if (updates.provider !== undefined) normalized.provider = toNullableString(updates.provider); + if (updates.model !== undefined) normalized.model = toNullableString(updates.model); + if (updates.endpointUrl !== undefined) + normalized.endpointUrl = toNullableString(updates.endpointUrl); + if (updates.credentialEnv !== undefined) + normalized.credentialEnv = toNullableString(updates.credentialEnv); + if (updates.hermesAuthMethod !== undefined) + normalized.hermesAuthMethod = normalizeHermesAuthMethod(updates.hermesAuthMethod); + if (updates.preferredInferenceApi !== undefined) { + normalized.preferredInferenceApi = toNullableString(updates.preferredInferenceApi); + } + if (updates.nimContainer !== undefined) + normalized.nimContainer = toNullableString(updates.nimContainer); + if (updates.webSearchConfig !== undefined) normalized.webSearchConfig = updates.webSearchConfig; + if (updates.policyPresets !== undefined) normalized.policyPresets = updates.policyPresets; + if (updates.messagingChannels !== undefined) + normalized.messagingChannels = updates.messagingChannels; + if (updates.messagingChannelConfig !== undefined) { + normalized.messagingChannelConfig = updates.messagingChannelConfig; + } + if (updates.hermesToolGateways !== undefined) + normalized.hermesToolGateways = updates.hermesToolGateways; + return normalized; +} From ce1a645958538eeb5fdc765a216b74b6e825f911 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:08:46 -0700 Subject: [PATCH 18/54] refactor(cli): extract sandbox agent helpers --- src/lib/onboard.ts | 114 ++++--------------------------- src/lib/onboard/sandbox-agent.ts | 107 +++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 101 deletions(-) create mode 100644 src/lib/onboard/sandbox-agent.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index ad23d5a06e..7a38fe9682 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -276,6 +276,19 @@ const { resolveSandboxImageTagFromCreateOutput } = const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates"); +const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent"); +const { + RESERVED_SANDBOX_NAMES, + formatSandboxAgentName, + getAgentInferenceProviderOptions, + getDefaultSandboxNameForAgent, + getEffectiveSandboxAgent, + getRequestedSandboxAgentName, + getSandboxAgentDrift, + getSandboxAgentRegistryFields, + getSandboxPromptDefault, + normalizeSandboxAgentName, +} = sandboxAgent; const modelRouter: typeof import("./onboard/model-router") = require("./onboard/model-router"); const { DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV, @@ -3956,107 +3969,6 @@ async function recoverGatewayRuntime() { // ── Step 3: Sandbox ────────────────────────────────────────────── -// Names that collide with CLI command namespaces. A sandbox named 'status' -// makes 'nemoclaw status connect' route to the global status command -// instead of the sandbox, and a sandbox named 'sandbox' collides with the -// oclif-native `nemoclaw sandbox ...` command namespace. Reject these wherever -// a sandbox name enters the system (interactive prompt, --name flag, -// NEMOCLAW_SANDBOX_NAME). -const RESERVED_SANDBOX_NAMES = new Set([ - "onboard", - "list", - "deploy", - "setup", - "setup-spark", - "start", - "stop", - "status", - "debug", - "uninstall", - "update", - "credentials", - "help", - "sandbox", -]); - -function normalizeSandboxAgentName(agentName: string | null | undefined): string { - const trimmed = typeof agentName === "string" ? agentName.trim() : ""; - return trimmed && trimmed !== "openclaw" ? trimmed : "openclaw"; -} - -const UNKNOWN_SANDBOX_AGENT_NAME = "unknown"; - -function getRequestedSandboxAgentName(agent: AgentDefinition | null | undefined): string { - return normalizeSandboxAgentName(agent?.name); -} - -function formatSandboxAgentName(agentName: string | null | undefined): string { - const normalized = normalizeSandboxAgentName(agentName); - if (normalized === "openclaw") return "OpenClaw"; - if (normalized === "hermes") return "Hermes"; - return normalized; -} - -function getDefaultSandboxNameForAgent(agent: AgentDefinition | null | undefined): string { - return getRequestedSandboxAgentName(agent) === "hermes" ? "hermes" : "my-assistant"; -} - -function getSandboxPromptDefault(agent: AgentDefinition | null | undefined): string { - const envName = (process.env.NEMOCLAW_SANDBOX_NAME || "").trim().toLowerCase(); - const agentDefault = getDefaultSandboxNameForAgent(agent); - if (!envName) return agentDefault; - try { - return validateName(envName, "sandbox name"); - } catch { - return agentDefault; - } -} - -function getEffectiveSandboxAgent(agent: AgentDefinition | null | undefined): AgentDefinition { - return agent || agentDefs.loadAgent("openclaw"); -} - -function getAgentInferenceProviderOptions(agent: AgentDefinition | null | undefined): string[] { - const effectiveAgent = agent?.name - ? agentDefs.loadAgent(agent.name) - : getEffectiveSandboxAgent(agent); - return Array.isArray(effectiveAgent.inferenceProviderOptions) - ? effectiveAgent.inferenceProviderOptions - : []; -} - -function getSandboxAgentRegistryFields( - agent: AgentDefinition | null | undefined, - agentVersionKnown = true, -): Pick { - const effectiveAgent = getEffectiveSandboxAgent(agent); - const agentName = normalizeSandboxAgentName(effectiveAgent.name); - return { - agent: agentName === "openclaw" ? null : agentName, - agentVersion: agentVersionKnown ? effectiveAgent.expectedVersion || null : null, - }; -} - -function getSandboxAgentDrift( - sandboxName: string, - requestedAgentName: string, -): { changed: boolean; existingAgentName: string; requestedAgentName: string } { - const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName); - if (!existingEntry) { - return { - changed: true, - existingAgentName: UNKNOWN_SANDBOX_AGENT_NAME, - requestedAgentName, - }; - } - const existingAgentName = normalizeSandboxAgentName(existingEntry?.agent); - return { - changed: existingAgentName !== requestedAgentName, - existingAgentName, - requestedAgentName, - }; -} - function getSandboxRuntimeRegistryFields( config: SandboxGpuConfig, ): Pick< diff --git a/src/lib/onboard/sandbox-agent.ts b/src/lib/onboard/sandbox-agent.ts new file mode 100644 index 0000000000..c17b9de0b2 --- /dev/null +++ b/src/lib/onboard/sandbox-agent.ts @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { AgentDefinition } from "../agent/defs"; +import { loadAgent } from "../agent/defs"; +import { validateName } from "../runner"; +import type { SandboxEntry } from "../state/registry"; +import * as registry from "../state/registry"; + +// Names that collide with CLI command namespaces. A sandbox named 'status' +// makes 'nemoclaw status connect' route to the global status command +// instead of the sandbox, and a sandbox named 'sandbox' collides with the +// oclif-native `nemoclaw sandbox ...` command namespace. Reject these wherever +// a sandbox name enters the system (interactive prompt, --name flag, +// NEMOCLAW_SANDBOX_NAME). +export const RESERVED_SANDBOX_NAMES = new Set([ + "onboard", + "list", + "deploy", + "setup", + "setup-spark", + "start", + "stop", + "status", + "debug", + "uninstall", + "update", + "credentials", + "help", + "sandbox", +]); + +export const UNKNOWN_SANDBOX_AGENT_NAME = "unknown"; + +export function normalizeSandboxAgentName(agentName: string | null | undefined): string { + const trimmed = typeof agentName === "string" ? agentName.trim() : ""; + return trimmed && trimmed !== "openclaw" ? trimmed : "openclaw"; +} + +export function getRequestedSandboxAgentName(agent: AgentDefinition | null | undefined): string { + return normalizeSandboxAgentName(agent?.name); +} + +export function formatSandboxAgentName(agentName: string | null | undefined): string { + const normalized = normalizeSandboxAgentName(agentName); + if (normalized === "openclaw") return "OpenClaw"; + if (normalized === "hermes") return "Hermes"; + return normalized; +} + +export function getDefaultSandboxNameForAgent(agent: AgentDefinition | null | undefined): string { + return getRequestedSandboxAgentName(agent) === "hermes" ? "hermes" : "my-assistant"; +} + +export function getSandboxPromptDefault(agent: AgentDefinition | null | undefined): string { + const envName = (process.env.NEMOCLAW_SANDBOX_NAME || "").trim().toLowerCase(); + const agentDefault = getDefaultSandboxNameForAgent(agent); + if (!envName) return agentDefault; + try { + return validateName(envName, "sandbox name"); + } catch { + return agentDefault; + } +} + +export function getEffectiveSandboxAgent(agent: AgentDefinition | null | undefined): AgentDefinition { + return agent || loadAgent("openclaw"); +} + +export function getAgentInferenceProviderOptions(agent: AgentDefinition | null | undefined): string[] { + const effectiveAgent = agent?.name ? loadAgent(agent.name) : getEffectiveSandboxAgent(agent); + return Array.isArray(effectiveAgent.inferenceProviderOptions) + ? effectiveAgent.inferenceProviderOptions + : []; +} + +export function getSandboxAgentRegistryFields( + agent: AgentDefinition | null | undefined, + agentVersionKnown = true, +): Pick { + const effectiveAgent = getEffectiveSandboxAgent(agent); + const agentName = normalizeSandboxAgentName(effectiveAgent.name); + return { + agent: agentName === "openclaw" ? null : agentName, + agentVersion: agentVersionKnown ? effectiveAgent.expectedVersion || null : null, + }; +} + +export function getSandboxAgentDrift( + sandboxName: string, + requestedAgentName: string, +): { changed: boolean; existingAgentName: string; requestedAgentName: string } { + const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName); + if (!existingEntry) { + return { + changed: true, + existingAgentName: UNKNOWN_SANDBOX_AGENT_NAME, + requestedAgentName, + }; + } + const existingAgentName = normalizeSandboxAgentName(existingEntry?.agent); + return { + changed: existingAgentName !== requestedAgentName, + existingAgentName, + requestedAgentName, + }; +} From 7a07d8c2d78084a09e4b2a8d1ce3ad885bf5e53e Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:11:16 -0700 Subject: [PATCH 19/54] refactor(cli): extract messaging config helpers --- src/lib/onboard.ts | 40 ++++----------------------- src/lib/onboard/messaging-config.ts | 43 +++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 34 deletions(-) create mode 100644 src/lib/onboard/messaging-config.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 7a38fe9682..0966904510 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -276,6 +276,12 @@ const { resolveSandboxImageTagFromCreateOutput } = const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates"); +const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config"); +const { + getStoredMessagingChannelConfig, + messagingChannelConfigsEqual, + persistMessagingChannelConfigToSession, +} = messagingConfig; const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent"); const { RESERVED_SANDBOX_NAMES, @@ -375,9 +381,7 @@ import type { WebSearchConfig } from "./inference/web-search"; import { hydrateMessagingChannelConfig, type MessagingChannelConfig, - mergeMessagingChannelConfigs, readMessagingChannelConfigFromEnv, - sanitizeMessagingChannelConfig, } from "./messaging-channel-config"; import { streamGatewayStart } from "./onboard/gateway"; import { @@ -7183,38 +7187,6 @@ async function setupInference( const MESSAGING_CHANNELS = listChannels(); -function getStoredMessagingChannelConfig( - sandboxName: string | null, - session: Session | null, -): MessagingChannelConfig | null { - const registryConfig = sandboxName - ? sanitizeMessagingChannelConfig(registry.getSandbox(sandboxName)?.messagingChannelConfig) - : null; - const sessionMatchesSandbox = - !session?.sandboxName || !sandboxName || session.sandboxName === sandboxName; - const sessionConfig = sessionMatchesSandbox - ? sanitizeMessagingChannelConfig(session?.messagingChannelConfig) - : null; - return mergeMessagingChannelConfigs(registryConfig, sessionConfig); -} - -function persistMessagingChannelConfigToSession(config: MessagingChannelConfig | null): void { - onboardSession.updateSession((current: Session) => { - current.messagingChannelConfig = config; - return current; - }); -} - -function messagingChannelConfigsEqual( - left: MessagingChannelConfig | null, - right: MessagingChannelConfig | null, -): boolean { - const leftKeys = Object.keys(left || {}).sort(); - const rightKeys = Object.keys(right || {}).sort(); - if (leftKeys.length !== rightKeys.length) return false; - return leftKeys.every((key, index) => key === rightKeys[index] && left?.[key] === right?.[key]); -} - // Curl exit codes that indicate a network-level failure (not a token problem). // 35 (TLS handshake failure) covers corporate proxies that MITM HTTPS. const TELEGRAM_NETWORK_CURL_CODES = new Set([6, 7, 28, 35, 52, 56]); diff --git a/src/lib/onboard/messaging-config.ts b/src/lib/onboard/messaging-config.ts new file mode 100644 index 0000000000..eefea7e901 --- /dev/null +++ b/src/lib/onboard/messaging-config.ts @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { + type MessagingChannelConfig, + mergeMessagingChannelConfigs, + sanitizeMessagingChannelConfig, +} from "../messaging-channel-config"; +import type { Session } from "../state/onboard-session"; +import * as onboardSession from "../state/onboard-session"; +import * as registry from "../state/registry"; + +export function getStoredMessagingChannelConfig( + sandboxName: string | null, + session: Session | null, +): MessagingChannelConfig | null { + const registryConfig = sandboxName + ? sanitizeMessagingChannelConfig(registry.getSandbox(sandboxName)?.messagingChannelConfig) + : null; + const sessionMatchesSandbox = + !session?.sandboxName || !sandboxName || session.sandboxName === sandboxName; + const sessionConfig = sessionMatchesSandbox + ? sanitizeMessagingChannelConfig(session?.messagingChannelConfig) + : null; + return mergeMessagingChannelConfigs(registryConfig, sessionConfig); +} + +export function persistMessagingChannelConfigToSession(config: MessagingChannelConfig | null): void { + onboardSession.updateSession((current: Session) => { + current.messagingChannelConfig = config; + return current; + }); +} + +export function messagingChannelConfigsEqual( + left: MessagingChannelConfig | null, + right: MessagingChannelConfig | null, +): boolean { + const leftKeys = Object.keys(left || {}).sort(); + const rightKeys = Object.keys(right || {}).sort(); + if (leftKeys.length !== rightKeys.length) return false; + return leftKeys.every((key, index) => key === rightKeys[index] && left?.[key] === right?.[key]); +} From 9d928911a81301459c5d15b9df0e331a94ef5a1a Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:14:10 -0700 Subject: [PATCH 20/54] refactor(cli): extract resume conflict helpers --- src/lib/onboard.ts | 120 ++-------------------------- src/lib/onboard/resume-config.ts | 133 +++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 112 deletions(-) create mode 100644 src/lib/onboard/resume-config.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 0966904510..9ac8aa477c 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -275,6 +275,14 @@ const { resolveSandboxImageTagFromCreateOutput } = require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag"); const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); +const resumeConfig: typeof import("./onboard/resume-config") = require("./onboard/resume-config"); +const { + getRequestedModelHint, + getRequestedProviderHint, + getRequestedSandboxNameHint, + getResumeConfigConflicts, + getResumeSandboxConflict, +} = resumeConfig; const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates"); const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config"); const { @@ -1873,118 +1881,6 @@ const { const ollamaModelSize: typeof import("./inference/ollama/model-size") = require("./inference/ollama/model-size"); -function getRequestedSandboxNameHint(opts: { sandboxName?: string | null } = {}): string | null { - const raw = - typeof opts.sandboxName === "string" && opts.sandboxName.length > 0 - ? opts.sandboxName - : process.env.NEMOCLAW_SANDBOX_NAME; - if (typeof raw !== "string") return null; - const normalized = raw.trim().toLowerCase(); - return normalized || null; -} - -function getResumeSandboxConflict( - session: Session | null, - opts: { sandboxName?: string | null } = {}, -) { - // Use opts.sandboxName as the sole source — the caller has already - // resolved it (--name first, NEMOCLAW_SANDBOX_NAME only when prompting - // is impossible). Falling back to the env var here would fire spurious - // conflicts for interactive resume runs whose shell happens to export - // NEMOCLAW_SANDBOX_NAME but which never actually consult it. - // #2753: only treat session.sandboxName as a conflict source if the - // sandbox step actually completed. A pre-fix incomplete session would - // otherwise reject a legitimate `--resume --name ` that the user - // is supplying precisely to recover from the phantom. - const raw = typeof opts.sandboxName === "string" ? opts.sandboxName.trim().toLowerCase() : ""; - const requestedSandboxName = raw || null; - const recordedSandboxName = - session?.steps?.sandbox?.status === "complete" ? session?.sandboxName ?? null : null; - if (!requestedSandboxName || !recordedSandboxName) { - return null; - } - return requestedSandboxName !== recordedSandboxName - ? { requestedSandboxName, recordedSandboxName } - : null; -} - -// Provider hint wrappers — supply isNonInteractive() default, delegate to onboard-providers. -function getRequestedProviderHint(nonInteractive = isNonInteractive()) { - return onboardProviders.getRequestedProviderHint(nonInteractive); -} -function getRequestedModelHint(nonInteractive = isNonInteractive()) { - return onboardProviders.getRequestedModelHint(nonInteractive); -} - -function getResumeConfigConflicts( - session: Session | null, - opts: { - nonInteractive?: boolean; - fromDockerfile?: string | null; - sandboxName?: string | null; - agent?: string | null; - } = {}, -) { - const conflicts = []; - const nonInteractive = opts.nonInteractive ?? isNonInteractive(); - - const sandboxConflict = getResumeSandboxConflict(session, { sandboxName: opts.sandboxName }); - if (sandboxConflict) { - conflicts.push({ - field: "sandbox", - requested: sandboxConflict.requestedSandboxName, - recorded: sandboxConflict.recordedSandboxName, - }); - } - - const requestedProvider = getRequestedProviderHint(nonInteractive); - const effectiveRequestedProvider = getEffectiveProviderName(requestedProvider); - if ( - effectiveRequestedProvider && - session?.provider && - effectiveRequestedProvider !== session.provider - ) { - conflicts.push({ - field: "provider", - requested: effectiveRequestedProvider, - recorded: session.provider, - }); - } - - const requestedModel = getRequestedModelHint(nonInteractive); - if (requestedModel && session?.model && requestedModel !== session.model) { - conflicts.push({ - field: "model", - requested: requestedModel, - recorded: session.model, - }); - } - - const requestedFrom = opts.fromDockerfile ? path.resolve(opts.fromDockerfile) : null; - const recordedFrom = session?.metadata?.fromDockerfile - ? path.resolve(session.metadata.fromDockerfile) - : null; - if (requestedFrom !== recordedFrom) { - conflicts.push({ - field: "fromDockerfile", - requested: requestedFrom, - recorded: recordedFrom, - }); - } - - const requestedAgent = opts.agent || process.env.NEMOCLAW_AGENT || null; - const recordedAgent = session?.agent || null; - if (requestedAgent && recordedAgent && requestedAgent !== recordedAgent) { - conflicts.push({ - field: "agent", - requested: requestedAgent, - recorded: recordedAgent, - }); - } - - return conflicts; -} - function printRemediationActions( actions: Array<{ title: string; reason: string; commands?: string[] }> | null | undefined, ): void { diff --git a/src/lib/onboard/resume-config.ts b/src/lib/onboard/resume-config.ts new file mode 100644 index 0000000000..f745517d66 --- /dev/null +++ b/src/lib/onboard/resume-config.ts @@ -0,0 +1,133 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import path from "node:path"; + +const onboardProviders = require("./providers"); + +export interface ResumeSessionLike { + sandboxName?: string | null; + provider?: string | null; + model?: string | null; + agent?: string | null; + metadata?: { fromDockerfile?: string | null } | null; + steps?: { sandbox?: { status?: string | null } | null } | null; +} + +export interface ResumeConfigConflict { + field: string; + requested: string | null; + recorded: string | null; +} + +export function getRequestedSandboxNameHint(opts: { sandboxName?: string | null } = {}): string | null { + const raw = + typeof opts.sandboxName === "string" && opts.sandboxName.length > 0 + ? opts.sandboxName + : process.env.NEMOCLAW_SANDBOX_NAME; + if (typeof raw !== "string") return null; + const normalized = raw.trim().toLowerCase(); + return normalized || null; +} + +export function getResumeSandboxConflict( + session: ResumeSessionLike | null, + opts: { sandboxName?: string | null } = {}, +): { requestedSandboxName: string; recordedSandboxName: string } | null { + // Use opts.sandboxName as the sole source — the caller has already + // resolved it (--name first, NEMOCLAW_SANDBOX_NAME only when prompting + // is impossible). Falling back to the env var here would fire spurious + // conflicts for interactive resume runs whose shell happens to export + // NEMOCLAW_SANDBOX_NAME but which never actually consult it. + // #2753: only treat session.sandboxName as a conflict source if the + // sandbox step actually completed. A pre-fix incomplete session would + // otherwise reject a legitimate `--resume --name ` that the user + // is supplying precisely to recover from the phantom. + const raw = typeof opts.sandboxName === "string" ? opts.sandboxName.trim().toLowerCase() : ""; + const requestedSandboxName = raw || null; + const recordedSandboxName = + session?.steps?.sandbox?.status === "complete" ? session?.sandboxName ?? null : null; + if (!requestedSandboxName || !recordedSandboxName) { + return null; + } + return requestedSandboxName !== recordedSandboxName + ? { requestedSandboxName, recordedSandboxName } + : null; +} + +export function getRequestedProviderHint(nonInteractive = false): string | null { + return onboardProviders.getRequestedProviderHint(nonInteractive); +} + +export function getRequestedModelHint(nonInteractive = false): string | null { + return onboardProviders.getRequestedModelHint(nonInteractive); +} + +export function getResumeConfigConflicts( + session: ResumeSessionLike | null, + opts: { + nonInteractive?: boolean; + fromDockerfile?: string | null; + sandboxName?: string | null; + agent?: string | null; + } = {}, +): ResumeConfigConflict[] { + const conflicts: ResumeConfigConflict[] = []; + const nonInteractive = opts.nonInteractive ?? false; + + const sandboxConflict = getResumeSandboxConflict(session, { sandboxName: opts.sandboxName }); + if (sandboxConflict) { + conflicts.push({ + field: "sandbox", + requested: sandboxConflict.requestedSandboxName, + recorded: sandboxConflict.recordedSandboxName, + }); + } + + const requestedProvider = getRequestedProviderHint(nonInteractive); + const effectiveRequestedProvider = onboardProviders.getEffectiveProviderName(requestedProvider); + if ( + effectiveRequestedProvider && + session?.provider && + effectiveRequestedProvider !== session.provider + ) { + conflicts.push({ + field: "provider", + requested: effectiveRequestedProvider, + recorded: session.provider, + }); + } + + const requestedModel = getRequestedModelHint(nonInteractive); + if (requestedModel && session?.model && requestedModel !== session.model) { + conflicts.push({ + field: "model", + requested: requestedModel, + recorded: session.model, + }); + } + + const requestedFrom = opts.fromDockerfile ? path.resolve(opts.fromDockerfile) : null; + const recordedFrom = session?.metadata?.fromDockerfile + ? path.resolve(session.metadata.fromDockerfile) + : null; + if (requestedFrom !== recordedFrom) { + conflicts.push({ + field: "fromDockerfile", + requested: requestedFrom, + recorded: recordedFrom, + }); + } + + const requestedAgent = opts.agent || process.env.NEMOCLAW_AGENT || null; + const recordedAgent = session?.agent || null; + if (requestedAgent && recordedAgent && requestedAgent !== recordedAgent) { + conflicts.push({ + field: "agent", + requested: requestedAgent, + recorded: recordedAgent, + }); + } + + return conflicts; +} From df8a52e2755039429c2eb1503a0d3ae3050b5e3e Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:23:22 -0700 Subject: [PATCH 21/54] refactor(cli): extract openshell version helpers --- src/lib/onboard.ts | 107 +++------------------------ src/lib/onboard/openshell-version.ts | 104 ++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 96 deletions(-) create mode 100644 src/lib/onboard/openshell-version.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 9ac8aa477c..65f7864eb2 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -283,6 +283,17 @@ const { getResumeConfigConflicts, getResumeSandboxConflict, } = resumeConfig; +const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version"); +const { + getBlueprintMaxOpenshellVersion, + getBlueprintMinOpenshellVersion, + getInstalledOpenshellVersion, + getOpenshellChannel, + isOpenshellDevVersion, + shouldAllowOpenshellAboveBlueprintMax, + shouldUseOpenshellDevChannel, + versionGte, +} = openshellVersion; const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates"); const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config"); const { @@ -660,102 +671,6 @@ function step(n: number, total: number, msg: string): void { console.log(` ${"─".repeat(50)}`); } -function getInstalledOpenshellVersion(versionOutput: string | null = null): string | null { - const openshellBin = resolveOpenshell(); - if (!versionOutput && !openshellBin) return null; - const output = String( - versionOutput ?? runCapture([openshellBin, "-V"], { ignoreError: true }), - ).trim(); - const match = output.match(/openshell\s+([0-9]+\.[0-9]+\.[0-9]+)/i); - if (match) return match[1]; - return null; -} - -/** - * Compare two semver-like x.y.z strings. Returns true iff `left >= right`. - * Non-numeric or missing components are treated as 0. - */ -function versionGte(left = "0.0.0", right = "0.0.0"): boolean { - const lhs = String(left) - .split(".") - .map((part) => Number.parseInt(part, 10) || 0); - const rhs = String(right) - .split(".") - .map((part) => Number.parseInt(part, 10) || 0); - const length = Math.max(lhs.length, rhs.length); - for (let index = 0; index < length; index += 1) { - const a = lhs[index] || 0; - const b = rhs[index] || 0; - if (a > b) return true; - if (a < b) return false; - } - return true; -} - -/** - * Read a semver field from nemoclaw-blueprint/blueprint.yaml. Returns null if - * the blueprint or field is missing or unparseable — callers must treat null - * as "no constraint configured" so a malformed install does not become a hard - * onboard blocker. See #1317. - */ -function getBlueprintVersionField(field: string, rootDir = ROOT): string | null { - try { - // Lazy require: yaml is already a dependency via the policy helpers but - // pulling it at module load would slow down `nemoclaw --help` for users - // who never reach the preflight path. - const YAML = require("yaml"); - const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml"); - if (!fs.existsSync(blueprintPath)) return null; - const raw = fs.readFileSync(blueprintPath, "utf8"); - const parsed = YAML.parse(raw); - const value = parsed && parsed[field]; - if (typeof value !== "string") return null; - const trimmed = value.trim(); - if (!/^[0-9]+\.[0-9]+\.[0-9]+/.test(trimmed)) return null; - return trimmed; - } catch { - return null; - } -} - -function getBlueprintMinOpenshellVersion(rootDir = ROOT): string | null { - return getBlueprintVersionField("min_openshell_version", rootDir); -} - -function getBlueprintMaxOpenshellVersion(rootDir = ROOT): string | null { - return getBlueprintVersionField("max_openshell_version", rootDir); -} - -type OpenshellChannel = "stable" | "dev" | "auto"; - -function getOpenshellChannel(env: NodeJS.ProcessEnv = process.env): OpenshellChannel { - const raw = String(env.NEMOCLAW_OPENSHELL_CHANNEL || "auto") - .trim() - .toLowerCase(); - if (raw === "stable" || raw === "dev" || raw === "auto") return raw; - return "auto"; -} - -function shouldUseOpenshellDevChannel( - _platform: NodeJS.Platform = process.platform, - env: NodeJS.ProcessEnv = process.env, -): boolean { - const channel = getOpenshellChannel(env); - return channel === "dev"; -} - -function isOpenshellDevVersion(versionOutput: string | null | undefined): boolean { - return /\bdev[0-9.]*/i.test(String(versionOutput || "")); -} - -function shouldAllowOpenshellAboveBlueprintMax( - versionOutput: string | null | undefined, - platform: NodeJS.Platform = process.platform, - env: NodeJS.ProcessEnv = process.env, -): boolean { - return shouldUseOpenshellDevChannel(platform, env) && isOpenshellDevVersion(versionOutput); -} - function resolveSandboxGpuFlagFromOptions( opts: Pick, ): SandboxGpuFlag { diff --git a/src/lib/onboard/openshell-version.ts b/src/lib/onboard/openshell-version.ts new file mode 100644 index 0000000000..e45a279130 --- /dev/null +++ b/src/lib/onboard/openshell-version.ts @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import path from "node:path"; + +import { resolveOpenshell } from "../adapters/openshell/resolve"; +import { ROOT, runCapture } from "../runner"; + +export function getInstalledOpenshellVersion(versionOutput: string | null = null): string | null { + const openshellBin = resolveOpenshell(); + if (!versionOutput && !openshellBin) return null; + const output = String( + versionOutput ?? runCapture([openshellBin as string, "-V"], { ignoreError: true }), + ).trim(); + const match = output.match(/openshell\s+([0-9]+\.[0-9]+\.[0-9]+)/i); + if (match) return match[1]; + return null; +} + +/** + * Compare two semver-like x.y.z strings. Returns true iff `left >= right`. + * Non-numeric or missing components are treated as 0. + */ +export function versionGte(left = "0.0.0", right = "0.0.0"): boolean { + const lhs = String(left) + .split(".") + .map((part) => Number.parseInt(part, 10) || 0); + const rhs = String(right) + .split(".") + .map((part) => Number.parseInt(part, 10) || 0); + const length = Math.max(lhs.length, rhs.length); + for (let index = 0; index < length; index += 1) { + const a = lhs[index] || 0; + const b = rhs[index] || 0; + if (a > b) return true; + if (a < b) return false; + } + return true; +} + +/** + * Read a semver field from nemoclaw-blueprint/blueprint.yaml. Returns null if + * the blueprint or field is missing or unparseable — callers must treat null + * as "no constraint configured" so a malformed install does not become a hard + * onboard blocker. See #1317. + */ +function getBlueprintVersionField(field: string, rootDir = ROOT): string | null { + try { + // Lazy require: yaml is already a dependency via the policy helpers but + // pulling it at module load would slow down `nemoclaw --help` for users + // who never reach the preflight path. + const YAML = require("yaml"); + const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml"); + if (!fs.existsSync(blueprintPath)) return null; + const raw = fs.readFileSync(blueprintPath, "utf8"); + const parsed = YAML.parse(raw); + const value = parsed && parsed[field]; + if (typeof value !== "string") return null; + const trimmed = value.trim(); + if (!/^[0-9]+\.[0-9]+\.[0-9]+/.test(trimmed)) return null; + return trimmed; + } catch { + return null; + } +} + +export function getBlueprintMinOpenshellVersion(rootDir = ROOT): string | null { + return getBlueprintVersionField("min_openshell_version", rootDir); +} + +export function getBlueprintMaxOpenshellVersion(rootDir = ROOT): string | null { + return getBlueprintVersionField("max_openshell_version", rootDir); +} + +export type OpenshellChannel = "stable" | "dev" | "auto"; + +export function getOpenshellChannel(env: NodeJS.ProcessEnv = process.env): OpenshellChannel { + const raw = String(env.NEMOCLAW_OPENSHELL_CHANNEL || "auto") + .trim() + .toLowerCase(); + if (raw === "stable" || raw === "dev" || raw === "auto") return raw; + return "auto"; +} + +export function shouldUseOpenshellDevChannel( + _platform: NodeJS.Platform = process.platform, + env: NodeJS.ProcessEnv = process.env, +): boolean { + const channel = getOpenshellChannel(env); + return channel === "dev"; +} + +export function isOpenshellDevVersion(versionOutput: string | null | undefined): boolean { + return /\bdev[0-9.]*/i.test(String(versionOutput || "")); +} + +export function shouldAllowOpenshellAboveBlueprintMax( + versionOutput: string | null | undefined, + platform: NodeJS.Platform = process.platform, + env: NodeJS.ProcessEnv = process.env, +): boolean { + return shouldUseOpenshellDevChannel(platform, env) && isOpenshellDevVersion(versionOutput); +} From fcb3e36284a0091bd493153a013a44c3833d9fb4 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:25:48 -0700 Subject: [PATCH 22/54] refactor(cli): extract known hosts pruning --- src/lib/onboard.ts | 17 +---------------- src/lib/onboard/known-hosts.ts | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 src/lib/onboard/known-hosts.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 65f7864eb2..b071bd462f 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -283,6 +283,7 @@ const { getResumeConfigConflicts, getResumeSandboxConflict, } = resumeConfig; +const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts"); const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version"); const { getBlueprintMaxOpenshellVersion, @@ -632,22 +633,6 @@ function selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): Gat return refreshed; } -/** - * Remove known_hosts lines whose host field contains an openshell-* entry. - * Preserves blank lines and comments. Returns the cleaned string. - */ -function pruneKnownHostsEntries(contents: string): string { - return contents - .split("\n") - .filter((l) => { - const trimmed = l.trim(); - if (!trimmed || trimmed.startsWith("#")) return true; - const hostField = trimmed.split(/\s+/)[0]; - return !hostField.split(",").some((h) => h.startsWith("openshell-")); - }) - .join("\n"); -} - function getSandboxReuseState(sandboxName: string | null) { if (!sandboxName) return "missing"; const getOutput = runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true }); diff --git a/src/lib/onboard/known-hosts.ts b/src/lib/onboard/known-hosts.ts new file mode 100644 index 0000000000..0b3a4cb6ac --- /dev/null +++ b/src/lib/onboard/known-hosts.ts @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Remove known_hosts lines whose host field contains an openshell-* entry. + * Preserves blank lines and comments. Returns the cleaned string. + */ +export function pruneKnownHostsEntries(contents: string): string { + return contents + .split("\n") + .filter((line) => { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) return true; + const hostField = trimmed.split(/\s+/)[0]; + return !hostField.split(",").some((host) => host.startsWith("openshell-")); + }) + .join("\n"); +} From cdd19fd8a5fa052974ae0bcbb7425faf79a3f957 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:28:46 -0700 Subject: [PATCH 23/54] refactor(cli): extract gateway reuse helpers --- src/lib/onboard.ts | 54 ++++-------------------- src/lib/onboard/gateway-reuse.ts | 71 ++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 47 deletions(-) create mode 100644 src/lib/onboard/gateway-reuse.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index b071bd462f..20002151b8 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -296,6 +296,7 @@ const { versionGte, } = openshellVersion; const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates"); +const gatewayReuse: typeof import("./onboard/gateway-reuse") = require("./onboard/gateway-reuse"); const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config"); const { getStoredMessagingChannelConfig, @@ -581,57 +582,16 @@ const { isSelectedGateway, isGatewayHealthy, getGatewayReuseState, - shouldSelectNamedGatewayForReuse, getSandboxStateFromOutputs, } = gatewayState; -type GatewayReuseSnapshot = { - gatewayStatus: string; - gwInfo: string; - activeGatewayInfo: string; - gatewayReuseState: ReturnType; -}; - -function getGatewayReuseSnapshot(): GatewayReuseSnapshot { - const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); - const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { - ignoreError: true, - }); - const activeGatewayInfo = runCaptureOpenshell(["gateway", "info"], { ignoreError: true }); - return { - gatewayStatus, - gwInfo, - activeGatewayInfo, - gatewayReuseState: getGatewayReuseState(gatewayStatus, gwInfo, activeGatewayInfo), - }; -} - -function selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): GatewayReuseSnapshot { - if ( - !shouldSelectNamedGatewayForReuse( - snapshot.gatewayStatus, - snapshot.gwInfo, - snapshot.activeGatewayInfo, - ) - ) { - return snapshot; - } - - const selectResult = runOpenshell(["gateway", "select", GATEWAY_NAME], { - ignoreError: true, - suppressOutput: true, +const { getGatewayReuseSnapshot, selectNamedGatewayForReuseIfNeeded } = + gatewayReuse.createGatewayReuseHelpers({ + gatewayName: GATEWAY_NAME, + runCaptureOpenshell, + runOpenshell, + cliDisplayName, }); - if (selectResult.status !== 0) { - return snapshot; - } - - const refreshed = getGatewayReuseSnapshot(); - if (refreshed.gatewayReuseState === "healthy") { - process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; - console.log(` ✓ Selected existing ${cliDisplayName()} gateway`); - } - return refreshed; -} function getSandboxReuseState(sandboxName: string | null) { if (!sandboxName) return "missing"; diff --git a/src/lib/onboard/gateway-reuse.ts b/src/lib/onboard/gateway-reuse.ts new file mode 100644 index 0000000000..0406e66300 --- /dev/null +++ b/src/lib/onboard/gateway-reuse.ts @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { + getGatewayReuseState, + shouldSelectNamedGatewayForReuse, +} from "../state/gateway"; + +export type GatewayReuseSnapshot = { + gatewayStatus: string; + gwInfo: string; + activeGatewayInfo: string; + gatewayReuseState: ReturnType; +}; + +export interface GatewayReuseDeps { + gatewayName: string; + runCaptureOpenshell(args: string[], opts?: Record): string; + runOpenshell(args: string[], opts?: Record): { status: number | null }; + cliDisplayName(): string; +} + +export interface GatewayReuseHelpers { + getGatewayReuseSnapshot(): GatewayReuseSnapshot; + selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): GatewayReuseSnapshot; +} + +export function createGatewayReuseHelpers(deps: GatewayReuseDeps): GatewayReuseHelpers { + function getGatewayReuseSnapshot(): GatewayReuseSnapshot { + const gatewayStatus = deps.runCaptureOpenshell(["status"], { ignoreError: true }); + const gwInfo = deps.runCaptureOpenshell(["gateway", "info", "-g", deps.gatewayName], { + ignoreError: true, + }); + const activeGatewayInfo = deps.runCaptureOpenshell(["gateway", "info"], { ignoreError: true }); + return { + gatewayStatus, + gwInfo, + activeGatewayInfo, + gatewayReuseState: getGatewayReuseState(gatewayStatus, gwInfo, activeGatewayInfo), + }; + } + + function selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): GatewayReuseSnapshot { + if ( + !shouldSelectNamedGatewayForReuse( + snapshot.gatewayStatus, + snapshot.gwInfo, + snapshot.activeGatewayInfo, + ) + ) { + return snapshot; + } + + const selectResult = deps.runOpenshell(["gateway", "select", deps.gatewayName], { + ignoreError: true, + suppressOutput: true, + }); + if (selectResult.status !== 0) { + return snapshot; + } + + const refreshed = getGatewayReuseSnapshot(); + if (refreshed.gatewayReuseState === "healthy") { + process.env.OPENSHELL_GATEWAY = deps.gatewayName; + console.log(` ✓ Selected existing ${deps.cliDisplayName()} gateway`); + } + return refreshed; + } + + return { getGatewayReuseSnapshot, selectNamedGatewayForReuseIfNeeded }; +} From 9ba83f770a5e80d4bb0249d26d8b4c4919c228d7 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:42:51 -0700 Subject: [PATCH 24/54] Potential fix for pull request finding 'CodeQL / Unused variable, import, function or class' Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- src/lib/onboard.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 20002151b8..e6c3ec6833 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -7727,7 +7727,6 @@ const { ensureAgentDashboardForward, fetchGatewayAuthTokenFromSandbox, getDashboardForwardPort, - getDashboardForwardTarget, getWslHostAddress, printDashboard, stopAllDashboardForwards, From 0201b4dd0582fec4368f4a9c6982ff18729ef219 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:43:43 -0700 Subject: [PATCH 25/54] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- src/lib/onboard.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index e6c3ec6833..c2885c5209 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -289,7 +289,6 @@ const { getBlueprintMaxOpenshellVersion, getBlueprintMinOpenshellVersion, getInstalledOpenshellVersion, - getOpenshellChannel, isOpenshellDevVersion, shouldAllowOpenshellAboveBlueprintMax, shouldUseOpenshellDevChannel, @@ -309,7 +308,6 @@ const { formatSandboxAgentName, getAgentInferenceProviderOptions, getDefaultSandboxNameForAgent, - getEffectiveSandboxAgent, getRequestedSandboxAgentName, getSandboxAgentDrift, getSandboxAgentRegistryFields, From 8d74472e1a01b83734dabf59e26eec7249ba8117 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:56:15 -0700 Subject: [PATCH 26/54] refactor(cli): extract sandbox reuse helpers --- src/lib/onboard.ts | 21 +++++++------------ src/lib/onboard/sandbox-reuse.ts | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 src/lib/onboard/sandbox-reuse.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index c2885c5209..9bf341d2d8 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -303,6 +303,7 @@ const { persistMessagingChannelConfigToSession, } = messagingConfig; const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent"); +const sandboxReuse: typeof import("./onboard/sandbox-reuse") = require("./onboard/sandbox-reuse"); const { RESERVED_SANDBOX_NAMES, formatSandboxAgentName, @@ -591,20 +592,12 @@ const { getGatewayReuseSnapshot, selectNamedGatewayForReuseIfNeeded } = cliDisplayName, }); -function getSandboxReuseState(sandboxName: string | null) { - if (!sandboxName) return "missing"; - const getOutput = runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true }); - const listOutput = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - return getSandboxStateFromOutputs(sandboxName, getOutput, listOutput); -} - -function repairRecordedSandbox(sandboxName: string | null): void { - if (!sandboxName) return; - note(` [resume] Cleaning up recorded sandbox '${sandboxName}' before recreating it.`); - runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true }); - runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); - registry.removeSandbox(sandboxName); -} +const { getSandboxReuseState, repairRecordedSandbox } = sandboxReuse.createSandboxReuseHelpers({ + runCaptureOpenshell, + runOpenshell, + getSandboxStateFromOutputs, + note, +}); const { streamSandboxCreate } = sandboxCreateStream; diff --git a/src/lib/onboard/sandbox-reuse.ts b/src/lib/onboard/sandbox-reuse.ts new file mode 100644 index 0000000000..ac3d30b4b1 --- /dev/null +++ b/src/lib/onboard/sandbox-reuse.ts @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { DASHBOARD_PORT } from "../core/ports"; +import * as registry from "../state/registry"; + +export interface SandboxReuseDeps { + runCaptureOpenshell(args: string[], opts?: Record): string; + runOpenshell(args: string[], opts?: Record): unknown; + getSandboxStateFromOutputs(sandboxName: string, getOutput: string, listOutput: string): string; + note(message: string): void; +} + +export interface SandboxReuseHelpers { + getSandboxReuseState(sandboxName: string | null): string; + repairRecordedSandbox(sandboxName: string | null): void; +} + +export function createSandboxReuseHelpers(deps: SandboxReuseDeps): SandboxReuseHelpers { + function getSandboxReuseState(sandboxName: string | null): string { + if (!sandboxName) return "missing"; + const getOutput = deps.runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true }); + const listOutput = deps.runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + return deps.getSandboxStateFromOutputs(sandboxName, getOutput, listOutput); + } + + function repairRecordedSandbox(sandboxName: string | null): void { + if (!sandboxName) return; + deps.note(` [resume] Cleaning up recorded sandbox '${sandboxName}' before recreating it.`); + deps.runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true }); + deps.runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); + registry.removeSandbox(sandboxName); + } + + return { getSandboxReuseState, repairRecordedSandbox }; +} From c47e5b8e2a24d0f4f7f90d4a2ce843eff1b367ba Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 11:59:28 -0700 Subject: [PATCH 27/54] refactor(cli): extract messaging credential helpers --- src/lib/onboard.ts | 69 +++++++-------------- src/lib/onboard/messaging-credentials.ts | 78 ++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 48 deletions(-) create mode 100644 src/lib/onboard/messaging-credentials.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 9bf341d2d8..7ae0d9f303 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -297,6 +297,11 @@ const { const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates"); const gatewayReuse: typeof import("./onboard/gateway-reuse") = require("./onboard/gateway-reuse"); const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config"); +const { + detectMessagingCredentialRotation, + getMessagingChannelForEnvKey, + getRecordedMessagingChannelsForResume: getRecordedMessagingChannelsForResumeFromState, +}: typeof import("./onboard/messaging-credentials") = require("./onboard/messaging-credentials"); const { getStoredMessagingChannelConfig, messagingChannelConfigsEqual, @@ -1239,54 +1244,6 @@ function providerExistsInGateway(name: string) { return onboardProviders.providerExistsInGateway(name, runOpenshell); } -function getMessagingChannelForEnvKey(envKey: string): string | null { - if (envKey === "DISCORD_BOT_TOKEN") return "discord"; - if (envKey === "SLACK_BOT_TOKEN") return "slack"; - if (envKey === "TELEGRAM_BOT_TOKEN") return "telegram"; - if (envKey === "WECHAT_BOT_TOKEN") return "wechat"; - return null; -} - - -function getRecordedMessagingChannelsForResume( - resume: boolean, - session: Session | null, sandboxName: string | null, -): string[] | null { - return require("./onboard/messaging-reuse").getNonInteractiveStoredMessagingChannels( - resume, session?.messagingChannels, sandboxName, MESSAGING_CHANNELS, (envKey: string) => Boolean(normalizeCredentialValue(process.env[envKey]) || getCredential(envKey)), - registry.getSandbox.bind(registry), registry.getDisabledChannels.bind(registry), providerExistsInGateway, isNonInteractive()); -} - -/** - * Detect whether any messaging provider credential has been rotated since - * the sandbox was created, by comparing SHA-256 hashes of the current - * token values against hashes stored in the sandbox registry. - * - * Returns `changed: false` for legacy sandboxes that have no stored hashes - * (conservative — avoids unnecessary rebuilds after upgrade). - * - * @param {string} sandboxName - Name of the sandbox to check. - * @param {Array<{name: string, envKey: string, token: string|null}>} tokenDefs - * @returns {{ changed: boolean, changedProviders: string[] }} - */ -function detectMessagingCredentialRotation( - sandboxName: string, - tokenDefs: MessagingTokenDef[], -): { changed: boolean; changedProviders: string[] } { - const sb = registry.getSandbox(sandboxName); - const storedHashes = sb?.providerCredentialHashes || {}; - const changedProviders = []; - for (const { name, envKey, token } of tokenDefs) { - if (!token) continue; - const storedHash = storedHashes[envKey]; - if (!storedHash) continue; - if (storedHash !== hashCredential(token)) { - changedProviders.push(name); - } - } - return { changed: changedProviders.length > 0, changedProviders }; -} - // Tri-state probe factory for messaging-conflict backfill. An upfront liveness // check is necessary because `openshell provider get` exits non-zero for both // "provider not attached" and "gateway unreachable"; without the liveness @@ -6934,6 +6891,22 @@ async function setupInference( const MESSAGING_CHANNELS = listChannels(); +function getRecordedMessagingChannelsForResume( + resume: boolean, + session: Session | null, + sandboxName: string | null, +): string[] | null { + return getRecordedMessagingChannelsForResumeFromState({ + resume, + sessionMessagingChannels: session?.messagingChannels, + sandboxName, + channels: MESSAGING_CHANNELS, + getCredential, + providerExistsInGateway, + isNonInteractive, + }); +} + // Curl exit codes that indicate a network-level failure (not a token problem). // 35 (TLS handshake failure) covers corporate proxies that MITM HTTPS. const TELEGRAM_NETWORK_CURL_CODES = new Set([6, 7, 28, 35, 52, 56]); diff --git a/src/lib/onboard/messaging-credentials.ts b/src/lib/onboard/messaging-credentials.ts new file mode 100644 index 0000000000..fff0e7107b --- /dev/null +++ b/src/lib/onboard/messaging-credentials.ts @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { normalizeCredentialValue } from "../credentials/store"; +import { hashCredential } from "../security/credential-hash"; +import * as registry from "../state/registry"; + +export interface MessagingTokenDefinition { + name: string; + envKey: string; + token?: string | null; +} + +export interface RecordedMessagingChannelsOptions { + resume: boolean; + sessionMessagingChannels?: string[] | null; + sandboxName: string | null; + channels: unknown[]; + getCredential(envKey: string): string | null | undefined; + providerExistsInGateway(name: string): boolean; + isNonInteractive(): boolean; +} + +export function getRecordedMessagingChannelsForResume({ + resume, + sessionMessagingChannels, + sandboxName, + channels, + getCredential, + providerExistsInGateway, + isNonInteractive, +}: RecordedMessagingChannelsOptions): string[] | null { + return require("./messaging-reuse").getNonInteractiveStoredMessagingChannels( + resume, + sessionMessagingChannels, + sandboxName, + channels, + (envKey: string) => Boolean(normalizeCredentialValue(process.env[envKey]) || getCredential(envKey)), + registry.getSandbox.bind(registry), + registry.getDisabledChannels.bind(registry), + providerExistsInGateway, + isNonInteractive(), + ); +} + +export function getMessagingChannelForEnvKey(envKey: string): string | null { + if (envKey === "DISCORD_BOT_TOKEN") return "discord"; + if (envKey === "SLACK_BOT_TOKEN") return "slack"; + if (envKey === "TELEGRAM_BOT_TOKEN") return "telegram"; + if (envKey === "WECHAT_BOT_TOKEN") return "wechat"; + return null; +} + +/** + * Detect whether any messaging provider credential has been rotated since + * the sandbox was created, by comparing SHA-256 hashes of the current + * token values against hashes stored in the sandbox registry. + * + * Returns `changed: false` for legacy sandboxes that have no stored hashes + * (conservative — avoids unnecessary rebuilds after upgrade). + */ +export function detectMessagingCredentialRotation( + sandboxName: string, + tokenDefs: MessagingTokenDefinition[], +): { changed: boolean; changedProviders: string[] } { + const sb = registry.getSandbox(sandboxName); + const storedHashes = sb?.providerCredentialHashes || {}; + const changedProviders = []; + for (const { name, envKey, token } of tokenDefs) { + if (!token) continue; + const storedHash = storedHashes[envKey]; + if (!storedHash) continue; + if (storedHash !== hashCredential(token)) { + changedProviders.push(name); + } + } + return { changed: changedProviders.length > 0, changedProviders }; +} From 66689027a2f39701eb662c8c5e14887a519d8618 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 12:02:09 -0700 Subject: [PATCH 28/54] refactor(cli): extract sandbox registry metadata helpers --- src/lib/onboard.ts | 61 ++--------- src/lib/onboard/sandbox-registry-metadata.ts | 101 +++++++++++++++++++ 2 files changed, 108 insertions(+), 54 deletions(-) create mode 100644 src/lib/onboard/sandbox-registry-metadata.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 7ae0d9f303..0e0625e1f4 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -308,6 +308,7 @@ const { persistMessagingChannelConfigToSession, } = messagingConfig; const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent"); +const sandboxRegistryMetadata: typeof import("./onboard/sandbox-registry-metadata") = require("./onboard/sandbox-registry-metadata"); const sandboxReuse: typeof import("./onboard/sandbox-reuse") = require("./onboard/sandbox-reuse"); const { RESERVED_SANDBOX_NAMES, @@ -3677,61 +3678,13 @@ async function recoverGatewayRuntime() { // ── Step 3: Sandbox ────────────────────────────────────────────── -function getSandboxRuntimeRegistryFields( - config: SandboxGpuConfig, -): Pick< - SandboxEntry, - | "gpuEnabled" - | "hostGpuDetected" - | "sandboxGpuEnabled" - | "sandboxGpuMode" - | "sandboxGpuDevice" - | "openshellDriver" - | "openshellVersion" -> { - return { - gpuEnabled: config.sandboxGpuEnabled, - hostGpuDetected: config.hostGpuDetected, - sandboxGpuEnabled: config.sandboxGpuEnabled, - sandboxGpuMode: config.mode, - sandboxGpuDevice: config.sandboxGpuDevice, - openshellDriver: isLinuxDockerDriverGatewayEnabled() ? (process.platform === "darwin" ? "vm" : "docker") : "kubernetes", - openshellVersion: getInstalledOpenshellVersion( - runCaptureOpenshell(["--version"], { ignoreError: true }), - ), - }; -} - -function hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean { - const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName); - if (!existingEntry) return false; - return ( - (existingEntry.sandboxGpuEnabled === true) !== config.sandboxGpuEnabled || - (existingEntry.sandboxGpuMode || "auto") !== config.mode || - (existingEntry.sandboxGpuDevice || null) !== config.sandboxGpuDevice - ); -} - -function updateReusedSandboxMetadata( - sandboxName: string, - agent: AgentDefinition | null | undefined, - model: string, - provider: string, - dashboardPort: number, - selectionVerified = true, - sandboxGpuConfig: SandboxGpuConfig | null = null, -): void { - const existingEntry = registry.getSandbox(sandboxName); - const agentVersionKnown = existingEntry?.agentVersion !== null; - const selectionUpdates = selectionVerified ? { model, provider } : {}; - registry.updateSandbox(sandboxName, { - ...selectionUpdates, - dashboardPort, - ...getSandboxAgentRegistryFields(agent, agentVersionKnown), - ...(sandboxGpuConfig ? getSandboxRuntimeRegistryFields(sandboxGpuConfig) : {}), +const { getSandboxRuntimeRegistryFields, hasSandboxGpuDrift, updateReusedSandboxMetadata } = + sandboxRegistryMetadata.createSandboxRegistryMetadataHelpers({ + isLinuxDockerDriverGatewayEnabled, + getInstalledOpenshellVersion, + runCaptureOpenshell, }); - registry.setDefault(sandboxName); -} + async function promptValidatedSandboxName(agent: AgentDefinition | null = null) { const MAX_ATTEMPTS = 3; diff --git a/src/lib/onboard/sandbox-registry-metadata.ts b/src/lib/onboard/sandbox-registry-metadata.ts new file mode 100644 index 0000000000..bbd84db74e --- /dev/null +++ b/src/lib/onboard/sandbox-registry-metadata.ts @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { AgentDefinition } from "../agent/defs"; +import type { SandboxEntry } from "../state/registry"; +import * as registry from "../state/registry"; +import { getSandboxAgentRegistryFields } from "./sandbox-agent"; +import type { SandboxGpuConfig } from "./sandbox-gpu-mode"; + +export interface SandboxRegistryMetadataDeps { + isLinuxDockerDriverGatewayEnabled(): boolean; + getInstalledOpenshellVersion(versionOutput?: string | null): string | null; + runCaptureOpenshell(args: string[], opts?: Record): string | null; +} + +export interface SandboxRegistryMetadataHelpers { + getSandboxRuntimeRegistryFields(config: SandboxGpuConfig): Pick< + SandboxEntry, + | "gpuEnabled" + | "hostGpuDetected" + | "sandboxGpuEnabled" + | "sandboxGpuMode" + | "sandboxGpuDevice" + | "openshellDriver" + | "openshellVersion" + >; + hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean; + updateReusedSandboxMetadata( + sandboxName: string, + agent: AgentDefinition | null | undefined, + model: string, + provider: string, + dashboardPort: number, + selectionVerified?: boolean, + sandboxGpuConfig?: SandboxGpuConfig | null, + ): void; +} + +export function createSandboxRegistryMetadataHelpers( + deps: SandboxRegistryMetadataDeps, +): SandboxRegistryMetadataHelpers { + function getSandboxRuntimeRegistryFields(config: SandboxGpuConfig): Pick< + SandboxEntry, + | "gpuEnabled" + | "hostGpuDetected" + | "sandboxGpuEnabled" + | "sandboxGpuMode" + | "sandboxGpuDevice" + | "openshellDriver" + | "openshellVersion" + > { + return { + gpuEnabled: config.sandboxGpuEnabled, + hostGpuDetected: config.hostGpuDetected, + sandboxGpuEnabled: config.sandboxGpuEnabled, + sandboxGpuMode: config.mode, + sandboxGpuDevice: config.sandboxGpuDevice, + openshellDriver: deps.isLinuxDockerDriverGatewayEnabled() + ? process.platform === "darwin" + ? "vm" + : "docker" + : "kubernetes", + openshellVersion: deps.getInstalledOpenshellVersion( + deps.runCaptureOpenshell(["--version"], { ignoreError: true }), + ), + }; + } + + function hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean { + const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName); + if (!existingEntry) return false; + return ( + (existingEntry.sandboxGpuEnabled === true) !== config.sandboxGpuEnabled || + (existingEntry.sandboxGpuMode || "auto") !== config.mode || + (existingEntry.sandboxGpuDevice || null) !== config.sandboxGpuDevice + ); + } + + function updateReusedSandboxMetadata( + sandboxName: string, + agent: AgentDefinition | null | undefined, + model: string, + provider: string, + dashboardPort: number, + selectionVerified = true, + sandboxGpuConfig: SandboxGpuConfig | null = null, + ): void { + const existingEntry = registry.getSandbox(sandboxName); + const agentVersionKnown = existingEntry?.agentVersion !== null; + const selectionUpdates = selectionVerified ? { model, provider } : {}; + registry.updateSandbox(sandboxName, { + ...selectionUpdates, + dashboardPort, + ...getSandboxAgentRegistryFields(agent, agentVersionKnown), + ...(sandboxGpuConfig ? getSandboxRuntimeRegistryFields(sandboxGpuConfig) : {}), + }); + registry.setDefault(sandboxName); + } + + return { getSandboxRuntimeRegistryFields, hasSandboxGpuDrift, updateReusedSandboxMetadata }; +} From a485eec0412b04a8a7c5c705de32bebfeff99a7f Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 12:04:44 -0700 Subject: [PATCH 29/54] refactor(cli): extract openclaw setup helper --- src/lib/onboard.ts | 35 ++++++++--------------- src/lib/onboard/openclaw-setup.ts | 46 +++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 24 deletions(-) create mode 100644 src/lib/onboard/openclaw-setup.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 0e0625e1f4..c2f1679889 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -284,6 +284,7 @@ const { getResumeSandboxConflict, } = resumeConfig; const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts"); +const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version"); const { getBlueprintMaxOpenshellVersion, @@ -7081,30 +7082,16 @@ async function setupMessagingChannels( // ── Step 7: OpenClaw ───────────────────────────────────────────── -async function setupOpenclaw(sandboxName: string, model: string, provider: string): Promise { - step(7, 8, `Setting up ${agentProductName()} inside sandbox`); - - const selectionConfig = getProviderSelectionConfig(provider, model); - if (selectionConfig) { - const sandboxConfig = { - ...selectionConfig, - onboardedAt: new Date().toISOString(), - }; - const script = buildSandboxConfigSyncScript(sandboxConfig); - const scriptFile = writeSandboxConfigSyncFile(script); - try { - const scriptContent = fs.readFileSync(scriptFile, "utf-8"); - run(openshellArgv(["sandbox", "connect", sandboxName]), { - stdio: ["pipe", "ignore", "inherit"], - input: scriptContent, - }); - } finally { - cleanupTempDir(scriptFile, "nemoclaw-sync"); - } - } - - console.log(` ✓ ${agentProductName()} gateway launched inside sandbox`); -} +const setupOpenclaw = createOpenclawSetup({ + step, + agentProductName, + getProviderSelectionConfig, + buildSandboxConfigSyncScript, + writeSandboxConfigSyncFile, + run, + openshellArgv, + cleanupTempDir, +}); // ── Step 7: Policy presets ─────────────────────────────────────── diff --git a/src/lib/onboard/openclaw-setup.ts b/src/lib/onboard/openclaw-setup.ts new file mode 100644 index 0000000000..de8fd11e3f --- /dev/null +++ b/src/lib/onboard/openclaw-setup.ts @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; + +export interface OpenclawSetupDeps { + step(n: number, total: number, msg: string): void; + agentProductName(): string; + getProviderSelectionConfig(provider: string, model: string): unknown | null; + buildSandboxConfigSyncScript(config: any): string; + writeSandboxConfigSyncFile(script: string): string; + run(argv: string[], options: Record): unknown; + openshellArgv(args: string[]): string[]; + cleanupTempDir(file: string, prefix: string): void; +} + +export function createOpenclawSetup(deps: OpenclawSetupDeps) { + return async function setupOpenclaw( + sandboxName: string, + model: string, + provider: string, + ): Promise { + deps.step(7, 8, `Setting up ${deps.agentProductName()} inside sandbox`); + + const selectionConfig = deps.getProviderSelectionConfig(provider, model); + if (selectionConfig) { + const sandboxConfig = { + ...(selectionConfig as Record), + onboardedAt: new Date().toISOString(), + }; + const script = deps.buildSandboxConfigSyncScript(sandboxConfig); + const scriptFile = deps.writeSandboxConfigSyncFile(script); + try { + const scriptContent = fs.readFileSync(scriptFile, "utf-8"); + deps.run(deps.openshellArgv(["sandbox", "connect", sandboxName]), { + stdio: ["pipe", "ignore", "inherit"], + input: scriptContent, + }); + } finally { + deps.cleanupTempDir(scriptFile, "nemoclaw-sync"); + } + } + + console.log(` ✓ ${deps.agentProductName()} gateway launched inside sandbox`); + }; +} From 46039e12489be8380139c4bc3b72f7eabd0f065e Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 13:00:06 -0700 Subject: [PATCH 30/54] refactor(cli): extract sandbox name prompt --- src/lib/onboard.ts | 57 ++++------------------------- src/lib/onboard/sandbox-agent.ts | 61 ++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index c2f1679889..6dc0ba1b73 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -322,6 +322,12 @@ const { getSandboxPromptDefault, normalizeSandboxAgentName, } = sandboxAgent; +const promptValidatedSandboxName = sandboxAgent.createPromptValidatedSandboxName({ + promptOrDefault, + cliDisplayName, + isNonInteractive, + exit: process.exit, +}); const modelRouter: typeof import("./onboard/model-router") = require("./onboard/model-router"); const { DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV, @@ -3687,57 +3693,6 @@ const { getSandboxRuntimeRegistryFields, hasSandboxGpuDrift, updateReusedSandbox }); -async function promptValidatedSandboxName(agent: AgentDefinition | null = null) { - const MAX_ATTEMPTS = 3; - const defaultSandboxName = getSandboxPromptDefault(agent); - for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) { - const nameAnswer = await promptOrDefault( - ` Sandbox name (${NAME_ALLOWED_FORMAT}) [${defaultSandboxName}]: `, - "NEMOCLAW_SANDBOX_NAME", - defaultSandboxName, - ); - const sandboxName = (nameAnswer || defaultSandboxName).trim(); - - try { - const validatedSandboxName = validateName(sandboxName, "sandbox name"); - if (RESERVED_SANDBOX_NAMES.has(sandboxName)) { - console.error(` Reserved name: '${sandboxName}' is a ${cliDisplayName()} CLI command.`); - console.error(" Choose a different name to avoid routing conflicts."); - if (isNonInteractive()) { - process.exit(1); - } - if (attempt < MAX_ATTEMPTS - 1) { - console.error(" Please try again.\n"); - } - continue; - } - return validatedSandboxName; - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - console.error(` ${errorMessage}`); - } - - for (const line of getNameValidationGuidance("sandbox name", sandboxName, { - includeAllowedFormat: false, - })) { - console.error(` ${line}`); - } - - // Non-interactive runs cannot re-prompt — abort so the caller can fix the - // NEMOCLAW_SANDBOX_NAME env var and retry. - if (isNonInteractive()) { - process.exit(1); - } - - if (attempt < MAX_ATTEMPTS - 1) { - console.error(" Please try again.\n"); - } - } - - console.error(" Too many invalid attempts."); - process.exit(1); -} - // ── Step 5: Sandbox ────────────────────────────────────────────── async function createSandbox( diff --git a/src/lib/onboard/sandbox-agent.ts b/src/lib/onboard/sandbox-agent.ts index c17b9de0b2..f527333abf 100644 --- a/src/lib/onboard/sandbox-agent.ts +++ b/src/lib/onboard/sandbox-agent.ts @@ -3,6 +3,7 @@ import type { AgentDefinition } from "../agent/defs"; import { loadAgent } from "../agent/defs"; +import { getNameValidationGuidance, NAME_ALLOWED_FORMAT } from "../name-validation"; import { validateName } from "../runner"; import type { SandboxEntry } from "../state/registry"; import * as registry from "../state/registry"; @@ -105,3 +106,63 @@ export function getSandboxAgentDrift( requestedAgentName, }; } + +export interface PromptSandboxNameDeps { + promptOrDefault(question: string, envVar: string, defaultValue: string): Promise; + cliDisplayName(): string; + isNonInteractive(): boolean; + exit(code: number): never; +} + +export function createPromptValidatedSandboxName(deps: PromptSandboxNameDeps) { + return async function promptValidatedSandboxName(agent: AgentDefinition | null = null) { + const MAX_ATTEMPTS = 3; + const defaultSandboxName = getSandboxPromptDefault(agent); + for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) { + const nameAnswer = await deps.promptOrDefault( + ` Sandbox name (${NAME_ALLOWED_FORMAT}) [${defaultSandboxName}]: `, + "NEMOCLAW_SANDBOX_NAME", + defaultSandboxName, + ); + const sandboxName = (nameAnswer || defaultSandboxName).trim(); + + try { + const validatedSandboxName = validateName(sandboxName, "sandbox name"); + if (RESERVED_SANDBOX_NAMES.has(sandboxName)) { + console.error(` Reserved name: '${sandboxName}' is a ${deps.cliDisplayName()} CLI command.`); + console.error(" Choose a different name to avoid routing conflicts."); + if (deps.isNonInteractive()) { + deps.exit(1); + } + if (attempt < MAX_ATTEMPTS - 1) { + console.error(" Please try again.\n"); + } + continue; + } + return validatedSandboxName; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + console.error(` ${errorMessage}`); + } + + for (const line of getNameValidationGuidance("sandbox name", sandboxName, { + includeAllowedFormat: false, + })) { + console.error(` ${line}`); + } + + // Non-interactive runs cannot re-prompt — abort so the caller can fix the + // NEMOCLAW_SANDBOX_NAME env var and retry. + if (deps.isNonInteractive()) { + deps.exit(1); + } + + if (attempt < MAX_ATTEMPTS - 1) { + console.error(" Please try again.\n"); + } + } + + console.error(" Too many invalid attempts."); + deps.exit(1); + }; +} From 3f6e041dfd3789ba277483a82afe3ea8818284bc Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 13:02:41 -0700 Subject: [PATCH 31/54] refactor(cli): move telegram mention helper --- src/lib/onboard.ts | 13 +------------ src/lib/onboard/messaging-config.ts | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 6dc0ba1b73..a3aa436e08 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -304,6 +304,7 @@ const { getRecordedMessagingChannelsForResume: getRecordedMessagingChannelsForResumeFromState, }: typeof import("./onboard/messaging-credentials") = require("./onboard/messaging-credentials"); const { + computeTelegramRequireMention, getStoredMessagingChannelConfig, messagingChannelConfigsEqual, persistMessagingChannelConfigToSession, @@ -512,18 +513,6 @@ let AUTO_YES = false; // null means "use auto-allocation" (skip dashboard port check in preflight). let _preflightDashboardPort: number | null = null; -// Read TELEGRAM_REQUIRE_MENTION (set either by the interactive mention prompt -// or by the user's shell) and map it to a boolean, or null when the env var -// is unset / invalid. Used at build time to bake groupPolicy into -// openclaw.json and at resume time to detect drift against the recorded -// session state. See #1737 and the CodeRabbit follow-up on #2417. -function computeTelegramRequireMention(): boolean | null { - const raw = process.env.TELEGRAM_REQUIRE_MENTION; - if (raw === "1") return true; - if (raw === "0") return false; - return null; -} - function isNonInteractive(): boolean { return NON_INTERACTIVE || process.env.NEMOCLAW_NON_INTERACTIVE === "1"; } diff --git a/src/lib/onboard/messaging-config.ts b/src/lib/onboard/messaging-config.ts index eefea7e901..2ac8fa7eae 100644 --- a/src/lib/onboard/messaging-config.ts +++ b/src/lib/onboard/messaging-config.ts @@ -10,6 +10,18 @@ import type { Session } from "../state/onboard-session"; import * as onboardSession from "../state/onboard-session"; import * as registry from "../state/registry"; +// Read TELEGRAM_REQUIRE_MENTION (set either by the interactive mention prompt +// or by the user's shell) and map it to a boolean, or null when the env var +// is unset / invalid. Used at build time to bake groupPolicy into +// openclaw.json and at resume time to detect drift against the recorded +// session state. See #1737 and the CodeRabbit follow-up on #2417. +export function computeTelegramRequireMention(): boolean | null { + const raw = process.env.TELEGRAM_REQUIRE_MENTION; + if (raw === "1") return true; + if (raw === "0") return false; + return null; +} + export function getStoredMessagingChannelConfig( sandboxName: string | null, session: Session | null, From 534f0d842b38fc91890f4acbc9378e27227f4bc1 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 13:05:00 -0700 Subject: [PATCH 32/54] refactor(cli): extract onboard base image helpers --- src/lib/onboard.ts | 39 ++++------------------------------- src/lib/onboard/base-image.ts | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 35 deletions(-) create mode 100644 src/lib/onboard/base-image.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index a3aa436e08..0bf5ce14ac 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -119,10 +119,11 @@ const sandboxBaseImage: typeof import("./sandbox-base-image") = require("./sandb const { OPENCLAW_SANDBOX_BASE_IMAGE: SANDBOX_BASE_IMAGE, SANDBOX_BASE_TAG, - defaultOpenclawBaseDockerfile, - buildLocalBaseTag, - resolveSandboxBaseImage, } = sandboxBaseImage; +const { + getStableGatewayImageRef, + pullAndResolveBaseImageDigest, +}: typeof import("./onboard/base-image") = require("./onboard/base-image"); const errnoUtils: typeof import("./core/errno") = require("./core/errno"); const { isErrnoException } = errnoUtils; @@ -661,38 +662,6 @@ function validateSandboxGpuPreflight(config: SandboxGpuConfig): void { console.log(` ✓ Docker CDI GPU support detected (${cdiSpecFiles.join(", ")})`); } -// ── Base image resolution ─────────────────────────────────────── -// Pulls candidate sandbox-base images from GHCR and inspects them to get the -// actual repo digest when available. This avoids the registry mismatch that -// broke e2e tests in #1937 while still allowing PR branches to use a source-SHA -// base image or local build before latest has been rebuilt. See #1904. - -/** - * Resolve a compatible sandbox-base image and pin it to a repo digest when - * possible. PR-branch validation first tries a source-SHA tag, then latest, - * and finally a local Dockerfile.base build when the OpenShell Docker driver - * requires a newer glibc than the published image provides. - */ -function pullAndResolveBaseImageDigest( - options: { requireOpenshellSandboxAbi?: boolean } = {}, -): { digest: string | null; ref: string; source?: string; glibcVersion?: string | null } | null { - return resolveSandboxBaseImage({ - imageName: SANDBOX_BASE_IMAGE, - dockerfilePath: defaultOpenclawBaseDockerfile(ROOT), - localTag: buildLocalBaseTag("nemoclaw-sandbox-base-local", ROOT), - envVar: "NEMOCLAW_SANDBOX_BASE_IMAGE_REF", - label: "OpenClaw sandbox base image", - requireOpenshellSandboxAbi: options.requireOpenshellSandboxAbi === true, - rootDir: ROOT, - }); -} - -function getStableGatewayImageRef(versionOutput: string | null = null): string | null { - const version = getInstalledOpenshellVersion(versionOutput); - if (!version) return null; - return `ghcr.io/nvidia/openshell/cluster:${version}`; -} - function getOpenshellBinary(): string { if (OPENSHELL_BIN) return OPENSHELL_BIN; const resolved = resolveOpenshell(); diff --git a/src/lib/onboard/base-image.ts b/src/lib/onboard/base-image.ts new file mode 100644 index 0000000000..3f9f14daa3 --- /dev/null +++ b/src/lib/onboard/base-image.ts @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { ROOT } from "../runner"; +import { + buildLocalBaseTag, + defaultOpenclawBaseDockerfile, + resolveSandboxBaseImage, + OPENCLAW_SANDBOX_BASE_IMAGE as SANDBOX_BASE_IMAGE, +} from "../sandbox-base-image"; +import { getInstalledOpenshellVersion } from "./openshell-version"; + +/** + * Resolve a compatible sandbox-base image and pin it to a repo digest when + * possible. PR-branch validation first tries a source-SHA tag, then latest, + * and finally a local Dockerfile.base build when the OpenShell Docker driver + * requires a newer glibc than the published image provides. + */ +export function pullAndResolveBaseImageDigest( + options: { requireOpenshellSandboxAbi?: boolean } = {}, +): { digest: string | null; ref: string; source?: string; glibcVersion?: string | null } | null { + return resolveSandboxBaseImage({ + imageName: SANDBOX_BASE_IMAGE, + dockerfilePath: defaultOpenclawBaseDockerfile(ROOT), + localTag: buildLocalBaseTag("nemoclaw-sandbox-base-local", ROOT), + envVar: "NEMOCLAW_SANDBOX_BASE_IMAGE_REF", + label: "OpenClaw sandbox base image", + requireOpenshellSandboxAbi: options.requireOpenshellSandboxAbi === true, + rootDir: ROOT, + }); +} + +export function getStableGatewayImageRef(versionOutput: string | null = null): string | null { + const version = getInstalledOpenshellVersion(versionOutput); + if (!version) return null; + return `ghcr.io/nvidia/openshell/cluster:${version}`; +} From cd29f01f52cef7257c602d0e32ef9a470ce9d5ac Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 13:07:41 -0700 Subject: [PATCH 33/54] refactor(cli): extract prompt helpers --- src/lib/onboard.ts | 35 +++----------------- src/lib/onboard/prompt-helpers.ts | 54 +++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 30 deletions(-) create mode 100644 src/lib/onboard/prompt-helpers.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 0bf5ce14ac..0f534c79f6 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -285,6 +285,7 @@ const { getResumeSandboxConflict, } = resumeConfig; const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts"); +const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version"); const { @@ -530,48 +531,22 @@ function note(message: string): void { console.log(`${DIM}${message}${RESET}`); } -// Prompt wrapper: returns env var value or default in non-interactive mode, -// otherwise prompts the user interactively. +const promptHelperDeps = { isNonInteractive, note, prompt }; + async function promptOrDefault( question: string, envVar: string | null, defaultValue: string, ): Promise { - if (isNonInteractive()) { - const val = envVar ? process.env[envVar] : null; - const result = val || defaultValue; - note(` [non-interactive] ${question.trim()} → ${result}`); - return result; - } - return prompt(question); + return onboardPromptHelpers.promptOrDefault(promptHelperDeps, question, envVar, defaultValue); } -// Yes/no prompt with a typed default. The `[Y/n]` / `[y/N]` indicator and -// the non-interactive echo letter are both derived from `defaultIsYes`, so -// the case of the indicator and the echoed default cannot drift apart. -// Returns a boolean — callers no longer have to parse reply strings. -// Replies of "y"/"yes" and "n"/"no" win regardless of case; empty and -// unknown input fall back to the default. async function promptYesNoOrDefault( question: string, envVar: string | null, defaultIsYes: boolean, ): Promise { - const fullQuestion = `${question} ${defaultIsYes ? "[Y/n]" : "[y/N]"}: `; - const nonInteractive = isNonInteractive(); - const input = nonInteractive ? (envVar ? process.env[envVar] : null) : await prompt(fullQuestion); - - const value = String(input ?? "") - .trim() - .toLowerCase(); - let chosen = defaultIsYes; - if (value === "y" || value === "yes") chosen = true; - else if (value === "n" || value === "no") chosen = false; - - if (nonInteractive) { - note(` [non-interactive] ${fullQuestion.trim()} → ${chosen ? "Y" : "N"}`); - } - return chosen; + return onboardPromptHelpers.promptYesNoOrDefault(promptHelperDeps, question, envVar, defaultIsYes); } // ── Helpers ────────────────────────────────────────────────────── diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts new file mode 100644 index 0000000000..4e274fd054 --- /dev/null +++ b/src/lib/onboard/prompt-helpers.ts @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +export interface PromptHelperDeps { + isNonInteractive(): boolean; + note(message: string): void; + prompt(question: string): Promise; +} + +// Prompt wrapper: returns env var value or default in non-interactive mode, +// otherwise prompts the user interactively. +export async function promptOrDefault( + deps: PromptHelperDeps, + question: string, + envVar: string | null, + defaultValue: string, +): Promise { + if (deps.isNonInteractive()) { + const val = envVar ? process.env[envVar] : null; + const result = val || defaultValue; + deps.note(` [non-interactive] ${question.trim()} → ${result}`); + return result; + } + return deps.prompt(question); +} + +// Yes/no prompt with a typed default. The `[Y/n]` / `[y/N]` indicator and +// the non-interactive echo letter are both derived from `defaultIsYes`, so +// the case of the indicator and the echoed default cannot drift apart. +// Returns a boolean — callers no longer have to parse reply strings. +// Replies of "y"/"yes" and "n"/"no" win regardless of case; empty and +// unknown input fall back to the default. +export async function promptYesNoOrDefault( + deps: PromptHelperDeps, + question: string, + envVar: string | null, + defaultIsYes: boolean, +): Promise { + const fullQuestion = `${question} ${defaultIsYes ? "[Y/n]" : "[y/N]"}: `; + const nonInteractive = deps.isNonInteractive(); + const input = nonInteractive ? (envVar ? process.env[envVar] : null) : await deps.prompt(fullQuestion); + + const value = String(input ?? "") + .trim() + .toLowerCase(); + let chosen = defaultIsYes; + if (value === "y" || value === "yes") chosen = true; + else if (value === "n" || value === "no") chosen = false; + + if (nonInteractive) { + deps.note(` [non-interactive] ${fullQuestion.trim()} → ${chosen ? "Y" : "N"}`); + } + return chosen; +} From 3fe22050c43a0f7eb0fe90279c1c4982bd73cd00 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 13:21:29 -0700 Subject: [PATCH 34/54] refactor(cli): extract sandbox gpu preflight helpers --- src/lib/onboard.ts | 59 +++------------------ src/lib/onboard/sandbox-gpu-preflight.ts | 66 ++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 53 deletions(-) create mode 100644 src/lib/onboard/sandbox-gpu-preflight.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 0f534c79f6..011a7fa671 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -39,7 +39,7 @@ const dockerGpuPatch: typeof import("./onboard/docker-gpu-patch") = require("./o const dockerGpuLocalInference: typeof import("./onboard/docker-gpu-local-inference") = require("./onboard/docker-gpu-local-inference"); const dockerGpuSandboxCreate: typeof import("./onboard/docker-gpu-sandbox-create") = require("./onboard/docker-gpu-sandbox-create"); const dockerDriverGatewayLaunch: typeof import("./onboard/docker-driver-gateway-launch") = require("./onboard/docker-driver-gateway-launch"); -const { findReadableNvidiaCdiSpecFiles, getDockerCdiSpecDirs, parseDockerCdiSpecDirs }: typeof import("./onboard/docker-cdi") = require("./onboard/docker-cdi"); +const { findReadableNvidiaCdiSpecFiles, parseDockerCdiSpecDirs }: typeof import("./onboard/docker-cdi") = require("./onboard/docker-cdi"); const { buildSandboxGpuCreateArgs, getSandboxReadyTimeoutSecs }: typeof import("./onboard/sandbox-gpu-create") = require("./onboard/sandbox-gpu-create"); const { isValidProxyHost, @@ -287,6 +287,11 @@ const { const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts"); const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); +const { + resolveSandboxGpuFlagFromOptions, + sandboxGpuRemediationLines, + validateSandboxGpuPreflight, +}: typeof import("./onboard/sandbox-gpu-preflight") = require("./onboard/sandbox-gpu-preflight"); const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version"); const { getBlueprintMaxOpenshellVersion, @@ -585,58 +590,6 @@ function step(n: number, total: number, msg: string): void { console.log(` ${"─".repeat(50)}`); } -function resolveSandboxGpuFlagFromOptions( - opts: Pick, -): SandboxGpuFlag { - const requestedGpuPassthrough = opts.gpu === true; - const optedOutGpuPassthrough = opts.noGpu === true; - const sandboxGpuFlag = opts.sandboxGpu ?? null; - if (requestedGpuPassthrough && optedOutGpuPassthrough) { - console.error(" --gpu and --no-gpu cannot both be set."); - process.exit(1); - } - if ( - (requestedGpuPassthrough && sandboxGpuFlag === "disable") || - (optedOutGpuPassthrough && sandboxGpuFlag === "enable") - ) { - console.error(" --gpu/--no-gpu conflict with the sandbox GPU flags."); - process.exit(1); - } - if (sandboxGpuFlag) return sandboxGpuFlag; - if (requestedGpuPassthrough) return "enable"; - if (optedOutGpuPassthrough) return "disable"; - return null; -} - -function sandboxGpuRemediationLines(): string[] { - return [ - "Install/configure NVIDIA Container Toolkit CDI, then restart Docker:", - " sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", - " sudo systemctl restart docker", - "Or force CPU sandbox behavior with NEMOCLAW_SANDBOX_GPU=0.", - ]; -} - -function validateSandboxGpuPreflight(config: SandboxGpuConfig): void { - if (config.errors.length > 0) { - console.error(""); - for (const error of config.errors) console.error(` ✗ ${error}`); - process.exit(1); - } - if (!config.sandboxGpuEnabled) return; - if (process.platform !== "linux") return; - - const cdiSpecDirs = getDockerCdiSpecDirs(); - const cdiSpecFiles = findReadableNvidiaCdiSpecFiles(cdiSpecDirs); - if (cdiSpecFiles.length === 0) { - console.error(""); - console.error(" ✗ Docker CDI GPU support was not detected."); - for (const line of sandboxGpuRemediationLines()) console.error(` ${line}`); - process.exit(1); - } - console.log(` ✓ Docker CDI GPU support detected (${cdiSpecFiles.join(", ")})`); -} - function getOpenshellBinary(): string { if (OPENSHELL_BIN) return OPENSHELL_BIN; const resolved = resolveOpenshell(); diff --git a/src/lib/onboard/sandbox-gpu-preflight.ts b/src/lib/onboard/sandbox-gpu-preflight.ts new file mode 100644 index 0000000000..d33a324f3c --- /dev/null +++ b/src/lib/onboard/sandbox-gpu-preflight.ts @@ -0,0 +1,66 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { + findReadableNvidiaCdiSpecFiles, + getDockerCdiSpecDirs, +} from "./docker-cdi"; +import type { SandboxGpuConfig, SandboxGpuFlag } from "./sandbox-gpu-mode"; + +export interface SandboxGpuFlagOptions { + sandboxGpu?: SandboxGpuFlag; + gpu?: boolean; + noGpu?: boolean; +} + +export function resolveSandboxGpuFlagFromOptions( + opts: SandboxGpuFlagOptions, +): SandboxGpuFlag { + const requestedGpuPassthrough = opts.gpu === true; + const optedOutGpuPassthrough = opts.noGpu === true; + const sandboxGpuFlag = opts.sandboxGpu ?? null; + if (requestedGpuPassthrough && optedOutGpuPassthrough) { + console.error(" --gpu and --no-gpu cannot both be set."); + process.exit(1); + } + if ( + (requestedGpuPassthrough && sandboxGpuFlag === "disable") || + (optedOutGpuPassthrough && sandboxGpuFlag === "enable") + ) { + console.error(" --gpu/--no-gpu conflict with the sandbox GPU flags."); + process.exit(1); + } + if (sandboxGpuFlag) return sandboxGpuFlag; + if (requestedGpuPassthrough) return "enable"; + if (optedOutGpuPassthrough) return "disable"; + return null; +} + +export function sandboxGpuRemediationLines(): string[] { + return [ + "Install/configure NVIDIA Container Toolkit CDI, then restart Docker:", + " sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", + " sudo systemctl restart docker", + "Or force CPU sandbox behavior with NEMOCLAW_SANDBOX_GPU=0.", + ]; +} + +export function validateSandboxGpuPreflight(config: SandboxGpuConfig): void { + if (config.errors.length > 0) { + console.error(""); + for (const error of config.errors) console.error(` ✗ ${error}`); + process.exit(1); + } + if (!config.sandboxGpuEnabled) return; + if (process.platform !== "linux") return; + + const cdiSpecDirs = getDockerCdiSpecDirs(); + const cdiSpecFiles = findReadableNvidiaCdiSpecFiles(cdiSpecDirs); + if (cdiSpecFiles.length === 0) { + console.error(""); + console.error(" ✗ Docker CDI GPU support was not detected."); + for (const line of sandboxGpuRemediationLines()) console.error(` ${line}`); + process.exit(1); + } + console.log(` ✓ Docker CDI GPU support detected (${cdiSpecFiles.join(", ")})`); +} From 55452223c12d632b52bf9cfca1bd0a3bade9c8c4 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 14:06:23 -0700 Subject: [PATCH 35/54] refactor(cli): extract remediation helpers --- src/lib/onboard.ts | 46 ++++----------------------------- src/lib/onboard/remediation.ts | 47 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 41 deletions(-) create mode 100644 src/lib/onboard/remediation.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 011a7fa671..ea6a68f243 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -276,6 +276,11 @@ const { resolveSandboxImageTagFromCreateOutput } = require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag"); const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); +const { + getFutureShellPathHint, + getPortConflictServiceHints, + printRemediationActions, +}: typeof import("./onboard/remediation") = require("./onboard/remediation"); const resumeConfig: typeof import("./onboard/resume-config") = require("./onboard/resume-config"); const { getRequestedModelHint, @@ -1583,51 +1588,10 @@ const { const ollamaModelSize: typeof import("./inference/ollama/model-size") = require("./inference/ollama/model-size"); -function printRemediationActions( - actions: Array<{ title: string; reason: string; commands?: string[] }> | null | undefined, -): void { - if (!Array.isArray(actions) || actions.length === 0) { - return; - } - - console.error(""); - console.error(" Suggested fix:"); - console.error(""); - for (const action of actions) { - console.error(` - ${action.title}: ${action.reason}`); - for (const command of action.commands || []) { - console.error(` ${command}`); - } - } -} - function isOpenshellInstalled(): boolean { return resolveOpenshell() !== null; } -function getFutureShellPathHint(binDir: string, pathValue = process.env.PATH || ""): string | null { - const parts = String(pathValue).split(path.delimiter).filter(Boolean); - if (parts[0] === binDir) { - return null; - } - return `export PATH="${binDir}:$PATH"`; -} - -function getPortConflictServiceHints(platform = process.platform): string[] { - if (platform === "darwin") { - return [ - " # or, if it's a launchctl service (macOS):", - " launchctl list | grep -i claw # columns: PID | ExitStatus | Label", - ` launchctl unload ${OPENCLAW_LAUNCH_AGENT_PLIST}`, - " # or: launchctl bootout gui/$(id -u)/ai.openclaw.gateway", - ]; - } - return [ - " # or, if it's a systemd service:", - " systemctl --user stop openclaw-gateway.service", - ]; -} - function installOpenshell(): OpenShellInstallResult { return openshellPinFlow.runOpenshellInstall({ scriptsDir: SCRIPTS, diff --git a/src/lib/onboard/remediation.ts b/src/lib/onboard/remediation.ts new file mode 100644 index 0000000000..256bb4f29b --- /dev/null +++ b/src/lib/onboard/remediation.ts @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import path from "node:path"; + +const OPENCLAW_LAUNCH_AGENT_PLIST = "~/Library/LaunchAgents/ai.openclaw.gateway.plist"; + +export function printRemediationActions( + actions: Array<{ title: string; reason: string; commands?: string[] }> | null | undefined, +): void { + if (!Array.isArray(actions) || actions.length === 0) { + return; + } + + console.error(""); + console.error(" Suggested fix:"); + console.error(""); + for (const action of actions) { + console.error(` - ${action.title}: ${action.reason}`); + for (const command of action.commands || []) { + console.error(` ${command}`); + } + } +} + +export function getFutureShellPathHint(binDir: string, pathValue = process.env.PATH || ""): string | null { + const parts = String(pathValue).split(path.delimiter).filter(Boolean); + if (parts[0] === binDir) { + return null; + } + return `export PATH="${binDir}:$PATH"`; +} + +export function getPortConflictServiceHints(platform = process.platform): string[] { + if (platform === "darwin") { + return [ + " # or, if it's a launchctl service (macOS):", + " launchctl list | grep -i claw # columns: PID | ExitStatus | Label", + ` launchctl unload ${OPENCLAW_LAUNCH_AGENT_PLIST}`, + " # or: launchctl bootout gui/$(id -u)/ai.openclaw.gateway", + ]; + } + return [ + " # or, if it's a systemd service:", + " systemctl --user stop openclaw-gateway.service", + ]; +} From b0734c50443404cc33c96dddedf2ae5e13587f13 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 14:09:20 -0700 Subject: [PATCH 36/54] refactor(cli): extract provider recovery helpers --- src/lib/onboard.ts | 130 ++-------------------- src/lib/onboard/provider-recovery.ts | 154 +++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 123 deletions(-) create mode 100644 src/lib/onboard/provider-recovery.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index ea6a68f243..bd22c2e823 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -291,6 +291,7 @@ const { } = resumeConfig; const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts"); const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); +const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery"); const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); const { resolveSandboxGpuFlagFromOptions, @@ -4757,131 +4758,14 @@ function providerNameToOptionKey( name: string | null | undefined, opts: { hasNimContainer?: boolean } = {}, ): string | null { - if (!name) return null; - if (name === "nvidia-router") return "routed"; - if (name === "ollama-local") return "ollama"; - // Local NIM and standalone vLLM both persist as provider="vllm-local". NIM - // is positively identified by a nimContainer record; the absence of one in - // registry/session recovery reliably means standalone vLLM (the standalone - // path never records a container), so default to "vllm" there. Live-gateway - // recovery doesn't carry container info either, but the caller's - // option-availability check still gates on whether vllm is actually running. - if (name === "vllm-local") return opts.hasNimContainer ? "nim-local" : "vllm"; - // `nvidia-nim` is a legacy alias for cloud NVIDIA Endpoints (see - // setupInference: it routes nvidia-nim through REMOTE_PROVIDER_CONFIG.build), - // not a marker for Local NIM. Local NIM persists as vllm-local + nimContainer. - if (name === "nvidia-nim") return "build"; - for (const [key, cfg] of Object.entries(REMOTE_PROVIDER_CONFIG)) { - if ((cfg as { providerName?: string }).providerName === name) return key; - } - return null; -} - -function readLiveInference( - sandboxName: string | null | undefined, -): { provider: string | null; model: string | null } | null { - if (!sandboxName) return null; - try { - const { defaultSandbox, sandboxes } = registry.listSandboxes(); - // The gateway holds one active inference config at a time. Trust the - // live read for the default sandbox, or when the registry has no - // entries (rebuild path: destroy wiped the entry but the gateway - // config persists). Other non-default sandboxes have a stored config - // that the gateway will swap to on their next connect. - const trustGateway = sandboxName === defaultSandbox || sandboxes.length === 0; - if (!trustGateway) return null; - const output = runCaptureOpenshell(["inference", "get"], { ignoreError: true }); - return parseGatewayInference(output); - } catch { - return null; - } -} - -function readRecordedProvider(sandboxName: string | null | undefined): string | null { - if (!sandboxName) return null; - try { - const entry = registry.getSandbox(sandboxName); - if (entry && typeof entry.provider === "string" && entry.provider) { - return entry.provider; - } - } catch { - // fall through to session - } - try { - const session = onboardSession.loadSession(); - if ( - session && - session.sandboxName === sandboxName && - typeof session.provider === "string" && - session.provider - ) { - return session.provider; - } - } catch { - // fall through to live gateway - } - const live = readLiveInference(sandboxName); - if (live && typeof live.provider === "string" && live.provider) { - return live.provider; - } - return null; + return providerRecovery.providerNameToOptionKey(REMOTE_PROVIDER_CONFIG, name, opts); } -function readRecordedNimContainer(sandboxName: string | null | undefined): string | null { - if (!sandboxName) return null; - try { - const entry = registry.getSandbox(sandboxName); - if (entry && typeof entry.nimContainer === "string" && entry.nimContainer) { - return entry.nimContainer; - } - } catch { - // fall through to session - } - try { - const session = onboardSession.loadSession(); - if ( - session && - session.sandboxName === sandboxName && - typeof session.nimContainer === "string" && - session.nimContainer - ) { - return session.nimContainer; - } - } catch { - return null; - } - return null; -} - -function readRecordedModel(sandboxName: string | null | undefined): string | null { - if (!sandboxName) return null; - try { - const entry = registry.getSandbox(sandboxName); - if (entry && typeof entry.model === "string" && entry.model) { - return entry.model; - } - } catch { - // fall through to session - } - try { - const session = onboardSession.loadSession(); - if ( - session && - session.sandboxName === sandboxName && - typeof session.model === "string" && - session.model - ) { - return session.model; - } - } catch { - // fall through to live gateway - } - const live = readLiveInference(sandboxName); - if (live && typeof live.model === "string" && live.model) { - return live.model; - } - return null; -} +const { readLiveInference, readRecordedProvider, readRecordedNimContainer, readRecordedModel } = + providerRecovery.createProviderRecoveryHelpers({ + parseGatewayInference, + runCaptureOpenshell, + }); type OllamaModelSelectionOutcome = | { outcome: "selected"; model: string } diff --git a/src/lib/onboard/provider-recovery.ts b/src/lib/onboard/provider-recovery.ts new file mode 100644 index 0000000000..cf196a0f7c --- /dev/null +++ b/src/lib/onboard/provider-recovery.ts @@ -0,0 +1,154 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import * as onboardSession from "../state/onboard-session"; +import * as registry from "../state/registry"; + +export type RemoteProviderConfigEntryLike = { providerName?: string }; + +export function providerNameToOptionKey( + remoteProviderConfig: Record, + name: string | null | undefined, + opts: { hasNimContainer?: boolean } = {}, +): string | null { + if (!name) return null; + if (name === "nvidia-router") return "routed"; + if (name === "ollama-local") return "ollama"; + // Local NIM and standalone vLLM both persist as provider="vllm-local". NIM + // is positively identified by a nimContainer record; the absence of one in + // registry/session recovery reliably means standalone vLLM (the standalone + // path never records a container), so default to "vllm" there. Live-gateway + // recovery doesn't carry container info either, but the caller's + // option-availability check still gates on whether vllm is actually running. + if (name === "vllm-local") return opts.hasNimContainer ? "nim-local" : "vllm"; + // `nvidia-nim` is a legacy alias for cloud NVIDIA Endpoints (see + // setupInference: it routes nvidia-nim through REMOTE_PROVIDER_CONFIG.build), + // not a marker for Local NIM. Local NIM persists as vllm-local + nimContainer. + if (name === "nvidia-nim") return "build"; + for (const [key, cfg] of Object.entries(remoteProviderConfig)) { + if (cfg.providerName === name) return key; + } + return null; +} + +export interface ProviderRecoveryDeps { + parseGatewayInference(output: string | null): { provider: string | null; model: string | null } | null; + runCaptureOpenshell(args: string[], opts?: Record): string | null; +} + +export interface ProviderRecoveryHelpers { + readLiveInference(sandboxName: string | null | undefined): { provider: string | null; model: string | null } | null; + readRecordedProvider(sandboxName: string | null | undefined): string | null; + readRecordedNimContainer(sandboxName: string | null | undefined): string | null; + readRecordedModel(sandboxName: string | null | undefined): string | null; +} + +export function createProviderRecoveryHelpers(deps: ProviderRecoveryDeps): ProviderRecoveryHelpers { + function readLiveInference( + sandboxName: string | null | undefined, + ): { provider: string | null; model: string | null } | null { + if (!sandboxName) return null; + try { + const { defaultSandbox, sandboxes } = registry.listSandboxes(); + // The gateway holds one active inference config at a time. Trust the + // live read for the default sandbox, or when the registry has no + // entries (rebuild path: destroy wiped the entry but the gateway + // config persists). Other non-default sandboxes have a stored config + // that the gateway will swap to on their next connect. + const trustGateway = sandboxName === defaultSandbox || sandboxes.length === 0; + if (!trustGateway) return null; + const output = deps.runCaptureOpenshell(["inference", "get"], { ignoreError: true }); + return deps.parseGatewayInference(output); + } catch { + return null; + } + } + + function readRecordedProvider(sandboxName: string | null | undefined): string | null { + if (!sandboxName) return null; + try { + const entry = registry.getSandbox(sandboxName); + if (entry && typeof entry.provider === "string" && entry.provider) { + return entry.provider; + } + } catch { + // fall through to session + } + try { + const session = onboardSession.loadSession(); + if ( + session && + session.sandboxName === sandboxName && + typeof session.provider === "string" && + session.provider + ) { + return session.provider; + } + } catch { + // fall through to live gateway + } + const live = readLiveInference(sandboxName); + if (live && typeof live.provider === "string" && live.provider) { + return live.provider; + } + return null; + } + + function readRecordedNimContainer(sandboxName: string | null | undefined): string | null { + if (!sandboxName) return null; + try { + const entry = registry.getSandbox(sandboxName); + if (entry && typeof entry.nimContainer === "string" && entry.nimContainer) { + return entry.nimContainer; + } + } catch { + // fall through to session + } + try { + const session = onboardSession.loadSession(); + if ( + session && + session.sandboxName === sandboxName && + typeof session.nimContainer === "string" && + session.nimContainer + ) { + return session.nimContainer; + } + } catch { + return null; + } + return null; + } + + function readRecordedModel(sandboxName: string | null | undefined): string | null { + if (!sandboxName) return null; + try { + const entry = registry.getSandbox(sandboxName); + if (entry && typeof entry.model === "string" && entry.model) { + return entry.model; + } + } catch { + // fall through to session + } + try { + const session = onboardSession.loadSession(); + if ( + session && + session.sandboxName === sandboxName && + typeof session.model === "string" && + session.model + ) { + return session.model; + } + } catch { + // fall through to live gateway + } + const live = readLiveInference(sandboxName); + if (live && typeof live.model === "string" && live.model) { + return live.model; + } + return null; + } + + return { readLiveInference, readRecordedProvider, readRecordedNimContainer, readRecordedModel }; +} From 5afd6806299663ebebb4a5ccf07b509fc22a1d2a Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 14:12:08 -0700 Subject: [PATCH 37/54] refactor(cli): move Hermes tool gateway normalization --- src/lib/onboard.ts | 13 +------------ src/lib/onboard/hermes-managed-tools.ts | 13 ++++++++++++- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index bd22c2e823..ff388e2ed0 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -432,8 +432,8 @@ import { } from "./messaging-channel-config"; import { streamGatewayStart } from "./onboard/gateway"; import { - HERMES_TOOL_GATEWAY_PRESET_NAMES, mergeRequiredHermesToolGatewayPolicyPresets, + normalizeHermesToolGatewaySelections, setupHermesToolGateways, stringSetsEqual, } from "./onboard/hermes-managed-tools"; @@ -658,17 +658,6 @@ const { const { hydrateCredentialEnv }: typeof import("./onboard/credential-env") = require("./onboard/credential-env"); -function normalizeHermesToolGatewaySelections(value: unknown): string[] { - if (!Array.isArray(value)) return []; - const selected = new Set(); - for (const preset of value) { - if (typeof preset === "string" && HERMES_TOOL_GATEWAY_PRESET_NAMES.has(preset)) { - selected.add(preset); - } - } - return [...selected].sort(); -} - const { summarizeCurlFailure, summarizeProbeFailure, diff --git a/src/lib/onboard/hermes-managed-tools.ts b/src/lib/onboard/hermes-managed-tools.ts index 1e90a5760e..f32afdc017 100644 --- a/src/lib/onboard/hermes-managed-tools.ts +++ b/src/lib/onboard/hermes-managed-tools.ts @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import * as hermesProviderAuth from "../hermes-provider-auth"; import type { HermesAuthMethod } from "../hermes-provider-auth"; +import * as hermesProviderAuth from "../hermes-provider-auth"; type PromptFn = (message: string) => Promise; type RawInput = NodeJS.ReadStream & { @@ -238,6 +238,17 @@ async function selectHermesToolGatewaysInteractive( return [...selected]; } +export function normalizeHermesToolGatewaySelections(value: unknown): string[] { + if (!Array.isArray(value)) return []; + const selected = new Set(); + for (const preset of value) { + if (typeof preset === "string" && HERMES_TOOL_GATEWAY_PRESET_NAMES.has(preset)) { + selected.add(preset); + } + } + return [...selected].sort(); +} + export function stringSetsEqual( a: string[] | null | undefined, b: string[] | null | undefined, From ae593a83f3d92001972e4c535fb2129b6c9fcf9a Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 14:15:49 -0700 Subject: [PATCH 38/54] refactor(cli): move affirmative prompt helper --- src/lib/onboard.ts | 13 ++++--------- src/lib/onboard/prompt-helpers.ts | 8 ++++++++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index ff388e2ed0..47081a1043 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -290,7 +290,10 @@ const { getResumeSandboxConflict, } = resumeConfig; const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts"); -const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); +const { + isAffirmativeAnswer, + ...onboardPromptHelpers +}: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery"); const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); const { @@ -1222,14 +1225,6 @@ function isOpenclawReady(sandboxName: string): boolean { return Boolean(fetchGatewayAuthTokenFromSandbox(sandboxName)); } -function isAffirmativeAnswer(value: string | null | undefined): boolean { - return ["y", "yes"].includes( - String(value || "") - .trim() - .toLowerCase(), - ); -} - function validateBraveSearchApiKey(apiKey: string): CurlProbeResult { return runCurlProbe([ "-sS", diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts index 4e274fd054..5e18e2ec95 100644 --- a/src/lib/onboard/prompt-helpers.ts +++ b/src/lib/onboard/prompt-helpers.ts @@ -1,6 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +export function isAffirmativeAnswer(value: string | null | undefined): boolean { + return ["y", "yes"].includes( + String(value || "") + .trim() + .toLowerCase(), + ); +} + export interface PromptHelperDeps { isNonInteractive(): boolean; note(message: string): void; From 3b270a06953af3f7c4394b75fe3005ca9ac7753c Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 14:22:39 -0700 Subject: [PATCH 39/54] refactor(cli): extract sandbox lifecycle helpers --- src/lib/onboard.ts | 61 +++++---------------- src/lib/onboard/sandbox-lifecycle.ts | 80 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 47 deletions(-) create mode 100644 src/lib/onboard/sandbox-lifecycle.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 47081a1043..7a6e71de01 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -326,6 +326,7 @@ const { persistMessagingChannelConfigToSession, } = messagingConfig; const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent"); +const sandboxLifecycle: typeof import("./onboard/sandbox-lifecycle") = require("./onboard/sandbox-lifecycle"); const sandboxRegistryMetadata: typeof import("./onboard/sandbox-registry-metadata") = require("./onboard/sandbox-registry-metadata"); const sandboxReuse: typeof import("./onboard/sandbox-reuse") = require("./onboard/sandbox-reuse"); const { @@ -1176,54 +1177,20 @@ function isInferenceRouteReady(provider: string, model: string): boolean { return Boolean(live && live.provider === provider && live.model === model); } -function sandboxExistsInGateway(sandboxName: string): boolean { - const output = runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true }); - return Boolean(output); -} - -function pruneStaleSandboxEntry(sandboxName: string): boolean { - const existing = registry.getSandbox(sandboxName); - const liveExists = sandboxExistsInGateway(sandboxName); - if (existing && !liveExists) { - registry.removeSandbox(sandboxName); - } - return liveExists; -} - -function shouldRestoreLatestBackupOnRecreate(): boolean { - return process.env.NEMOCLAW_RESTORE_LATEST_BACKUP_ON_RECREATE === "1"; -} - -async function confirmRecreateForSelectionDrift( - sandboxName: string, - drift: SelectionDrift, - requestedProvider: string | null, - requestedModel: string | null, -): Promise { - const currentProvider = drift.existingProvider || "unknown"; - const currentModel = drift.existingModel || "unknown"; - const nextProvider = requestedProvider || "unknown"; - const nextModel = requestedModel || "unknown"; - - console.log(` Sandbox '${sandboxName}' exists but requested inference selection changed.`); - console.log(` Current: provider=${currentProvider} model=${currentModel}`); - console.log(` Requested: provider=${nextProvider} model=${nextModel}`); - console.log( - ` Recreating the sandbox is required to apply this change to the running ${agentProductName()} UI.`, - ); - - if (isNonInteractive()) { - note(" [non-interactive] Recreating sandbox due to provider/model drift."); - return true; - } - - const answer = await prompt(` Recreate sandbox '${sandboxName}' now? [y/N]: `); - return isAffirmativeAnswer(answer); -} +const { + sandboxExistsInGateway, + pruneStaleSandboxEntry, + shouldRestoreLatestBackupOnRecreate, + confirmRecreateForSelectionDrift, + isOpenclawReady, +} = sandboxLifecycle.createSandboxLifecycleHelpers({ + runCaptureOpenshell, + fetchGatewayAuthTokenFromSandbox: (sandboxName: string) => fetchGatewayAuthTokenFromSandbox(sandboxName), + agentProductName, + prompt, + isAffirmativeAnswer, +}); -function isOpenclawReady(sandboxName: string): boolean { - return Boolean(fetchGatewayAuthTokenFromSandbox(sandboxName)); -} function validateBraveSearchApiKey(apiKey: string): CurlProbeResult { return runCurlProbe([ diff --git a/src/lib/onboard/sandbox-lifecycle.ts b/src/lib/onboard/sandbox-lifecycle.ts new file mode 100644 index 0000000000..74fed0814b --- /dev/null +++ b/src/lib/onboard/sandbox-lifecycle.ts @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import * as registry from "../state/registry"; +import type { SelectionDrift } from "./selection-drift"; + +export interface SandboxLifecycleDeps { + runCaptureOpenshell(args: string[], opts?: Record): string | null; + fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null; + agentProductName(): string; + prompt(question: string): Promise; + isAffirmativeAnswer(value: string | null | undefined): boolean; +} + +export interface SandboxLifecycleHelpers { + sandboxExistsInGateway(sandboxName: string): boolean; + pruneStaleSandboxEntry(sandboxName: string): boolean; + shouldRestoreLatestBackupOnRecreate(): boolean; + confirmRecreateForSelectionDrift( + sandboxName: string, + drift: SelectionDrift, + requestedProvider: string | null, + requestedModel: string | null, + ): Promise; + isOpenclawReady(sandboxName: string): boolean; +} + +export function createSandboxLifecycleHelpers(deps: SandboxLifecycleDeps): SandboxLifecycleHelpers { + function sandboxExistsInGateway(sandboxName: string): boolean { + const output = deps.runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true }); + return Boolean(output); + } + + function pruneStaleSandboxEntry(sandboxName: string): boolean { + const existing = registry.getSandbox(sandboxName); + const liveExists = sandboxExistsInGateway(sandboxName); + if (existing && !liveExists) { + registry.removeSandbox(sandboxName); + } + return liveExists; + } + + function shouldRestoreLatestBackupOnRecreate(): boolean { + return process.env.NEMOCLAW_RESTORE_LATEST_BACKUP_ON_RECREATE === "1"; + } + + async function confirmRecreateForSelectionDrift( + sandboxName: string, + drift: SelectionDrift, + requestedProvider: string | null, + requestedModel: string | null, + ): Promise { + const currentProvider = drift.existingProvider || "unknown"; + const currentModel = drift.existingModel || "unknown"; + const nextProvider = requestedProvider || "unknown"; + const nextModel = requestedModel || "unknown"; + + console.log(` Sandbox '${sandboxName}' exists but requested inference selection changed.`); + console.log(` Current: provider=${currentProvider} model=${currentModel}`); + console.log(` Requested: provider=${nextProvider} model=${nextModel}`); + console.log( + ` Recreating the sandbox is required to apply this change to the running ${deps.agentProductName()} UI.`, + ); + + const answer = await deps.prompt(` Recreate sandbox '${sandboxName}' now? [y/N]: `); + return deps.isAffirmativeAnswer(answer); + } + + function isOpenclawReady(sandboxName: string): boolean { + return Boolean(deps.fetchGatewayAuthTokenFromSandbox(sandboxName)); + } + + return { + sandboxExistsInGateway, + pruneStaleSandboxEntry, + shouldRestoreLatestBackupOnRecreate, + confirmRecreateForSelectionDrift, + isOpenclawReady, + }; +} From e5503b494a2427bbb5ef5cc0ca0d786ab926a438 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 14:26:36 -0700 Subject: [PATCH 40/54] refactor(cli): extract openshell CLI helpers --- src/lib/onboard.ts | 67 ++++++++------------------ src/lib/onboard/openshell-cli.ts | 82 ++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 48 deletions(-) create mode 100644 src/lib/onboard/openshell-cli.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 7a6e71de01..3e8ae12bbf 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -296,6 +296,7 @@ const { }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery"); const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); +const { createOpenshellCliHelpers }: typeof import("./onboard/openshell-cli") = require("./onboard/openshell-cli"); const { resolveSandboxGpuFlagFromOptions, sandboxGpuRemediationLines, @@ -566,6 +567,24 @@ async function promptYesNoOrDefault( // ── Helpers ────────────────────────────────────────────────────── +const { + getOpenshellBinary, + openshellShellCommand, + openshellArgv, + runOpenshell, + runCaptureOpenshell, + safeOpenShellArgument, + getGatewayPortArg, + getDockerDriverGatewayEndpointArg, +} = createOpenshellCliHelpers({ + getCachedBinary: () => OPENSHELL_BIN, + setCachedBinary: (binary: string) => { + OPENSHELL_BIN = binary; + }, + getGatewayPort: () => GATEWAY_PORT, + getDockerDriverGatewayEndpoint, +}); + // Gateway state functions — delegated to src/lib/state/gateway.ts const { isSandboxReady, @@ -600,54 +619,6 @@ function step(n: number, total: number, msg: string): void { console.log(` ${"─".repeat(50)}`); } -function getOpenshellBinary(): string { - if (OPENSHELL_BIN) return OPENSHELL_BIN; - const resolved = resolveOpenshell(); - if (typeof resolved !== "string" || resolved.length === 0) { - console.error(" openshell CLI not found."); - console.error(" Install manually: https://github.com/NVIDIA/OpenShell/releases"); - process.exit(1); - } - OPENSHELL_BIN = resolved; - return OPENSHELL_BIN; -} - -function openshellShellCommand(args: string[], options: { openshellBinary?: string } = {}): string { - const openshellBinary = options.openshellBinary || getOpenshellBinary(); - return [shellQuote(openshellBinary), ...args.map((arg) => shellQuote(arg))].join(" "); -} - -function openshellArgv(args: string[], options: { openshellBinary?: string } = {}): string[] { - const openshellBinary = options.openshellBinary || getOpenshellBinary(); - return [openshellBinary, ...args]; -} - -function runOpenshell(args: string[], opts: RunnerOptions & { openshellBinary?: string } = {}) { - return run(openshellArgv(args, opts), opts); -} - -function runCaptureOpenshell( - args: string[], - opts: RunnerOptions & { openshellBinary?: string } = {}, -) { - return runCapture(openshellArgv(args, opts), opts); -} - -function safeOpenShellArgument(value: string, label: string): string { - if (!/^[A-Za-z0-9._~:/-]+$/.test(value)) { - throw new Error(`Invalid ${label}: contains characters unsafe for OpenShell CLI args`); - } - return value; -} - -function getGatewayPortArg(): string { - return safeOpenShellArgument(String(GATEWAY_PORT), "gateway port"); -} - -function getDockerDriverGatewayEndpointArg(): string { - return safeOpenShellArgument(getDockerDriverGatewayEndpoint(), "gateway endpoint"); -} - const { executeSandboxCommandForVerification }: typeof import("./onboard/sandbox-verification-exec") = require("./onboard/sandbox-verification-exec"); diff --git a/src/lib/onboard/openshell-cli.ts b/src/lib/onboard/openshell-cli.ts new file mode 100644 index 0000000000..961ec5e0dd --- /dev/null +++ b/src/lib/onboard/openshell-cli.ts @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { resolveOpenshell } from "../adapters/openshell/resolve"; +import { run, runCapture, shellQuote } from "../runner"; + +export interface OpenshellCliDeps { + getCachedBinary(): string | null; + setCachedBinary(binary: string): void; + getGatewayPort(): number; + getDockerDriverGatewayEndpoint(): string; +} + +export interface OpenshellCliHelpers { + getOpenshellBinary(): string; + openshellShellCommand(args: string[], options?: { openshellBinary?: string }): string; + openshellArgv(args: string[], options?: { openshellBinary?: string }): string[]; + runOpenshell(args: string[], opts?: any): ReturnType; + runCaptureOpenshell(args: string[], opts?: any): string; + safeOpenShellArgument(value: string, label: string): string; + getGatewayPortArg(): string; + getDockerDriverGatewayEndpointArg(): string; +} + +export function createOpenshellCliHelpers(deps: OpenshellCliDeps): OpenshellCliHelpers { + function getOpenshellBinary(): string { + const cached = deps.getCachedBinary(); + if (cached) return cached; + const resolved = resolveOpenshell(); + if (typeof resolved !== "string" || resolved.length === 0) { + console.error(" openshell CLI not found."); + console.error(" Install manually: https://github.com/NVIDIA/OpenShell/releases"); + process.exit(1); + } + deps.setCachedBinary(resolved); + return resolved; + } + + function openshellShellCommand(args: string[], options: { openshellBinary?: string } = {}): string { + const openshellBinary = options.openshellBinary || getOpenshellBinary(); + return [shellQuote(openshellBinary), ...args.map((arg) => shellQuote(arg))].join(" "); + } + + function openshellArgv(args: string[], options: { openshellBinary?: string } = {}): string[] { + const openshellBinary = options.openshellBinary || getOpenshellBinary(); + return [openshellBinary, ...args]; + } + + function runOpenshell(args: string[], opts: any = {}) { + return run(openshellArgv(args, opts), opts); + } + + function runCaptureOpenshell(args: string[], opts: any = {}) { + return runCapture(openshellArgv(args, opts), opts); + } + + function safeOpenShellArgument(value: string, label: string): string { + if (!/^[A-Za-z0-9._~:/-]+$/.test(value)) { + throw new Error(`Invalid ${label}: contains characters unsafe for OpenShell CLI args`); + } + return value; + } + + function getGatewayPortArg(): string { + return safeOpenShellArgument(String(deps.getGatewayPort()), "gateway port"); + } + + function getDockerDriverGatewayEndpointArg(): string { + return safeOpenShellArgument(deps.getDockerDriverGatewayEndpoint(), "gateway endpoint"); + } + + return { + getOpenshellBinary, + openshellShellCommand, + openshellArgv, + runOpenshell, + runCaptureOpenshell, + safeOpenShellArgument, + getGatewayPortArg, + getDockerDriverGatewayEndpointArg, + }; +} From bceab13d14fb46f4a6a6b58ee9887f2de94cb394 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 17:41:48 -0700 Subject: [PATCH 41/54] refactor(cli): move prompt navigation helpers --- src/lib/onboard.ts | 16 ++-------------- src/lib/onboard/prompt-helpers.ts | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 3e8ae12bbf..4ed3635ee1 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -291,6 +291,8 @@ const { } = resumeConfig; const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts"); const { + exitOnboardFromPrompt, + getNavigationChoice, isAffirmativeAnswer, ...onboardPromptHelpers }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); @@ -639,20 +641,6 @@ const { runCurlProbe, } = httpProbe; -function getNavigationChoice(value = ""): "back" | "exit" | null { - const normalized = String(value || "") - .trim() - .toLowerCase(); - if (normalized === "back") return "back"; - if (normalized === "exit" || normalized === "quit") return "exit"; - return null; -} - -function exitOnboardFromPrompt(): never { - console.log(" Exiting onboarding."); - process.exit(1); -} - function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null { const normalized = String(value || "") .trim() diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts index 5e18e2ec95..2bd3efe66e 100644 --- a/src/lib/onboard/prompt-helpers.ts +++ b/src/lib/onboard/prompt-helpers.ts @@ -1,6 +1,20 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +export function getNavigationChoice(value = ""): "back" | "exit" | null { + const normalized = String(value || "") + .trim() + .toLowerCase(); + if (normalized === "back") return "back"; + if (normalized === "exit" || normalized === "quit") return "exit"; + return null; +} + +export function exitOnboardFromPrompt(): never { + console.log(" Exiting onboarding."); + process.exit(1); +} + export function isAffirmativeAnswer(value: string | null | undefined): boolean { return ["y", "yes"].includes( String(value || "") From 973ff5b19ccdb99c75af0a7bcd2fd31de6a3f697 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 17:44:40 -0700 Subject: [PATCH 42/54] refactor(cli): extract Hermes auth method helpers --- src/lib/onboard.ts | 57 +++++++--------------------------- src/lib/onboard/hermes-auth.ts | 51 ++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 45 deletions(-) create mode 100644 src/lib/onboard/hermes-auth.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 4ed3635ee1..ee9433ddf6 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -193,6 +193,18 @@ const { const onboardProviders = require("./onboard/providers"); const hermesProviderAuth = require("./hermes-provider-auth"); +const hermesAuth: typeof import("./onboard/hermes-auth") = require("./onboard/hermes-auth"); +const { + HERMES_AUTH_METHOD_API_KEY, + HERMES_AUTH_METHOD_OAUTH, + HERMES_NOUS_API_KEY_CREDENTIAL_ENV, + HERMES_NOUS_API_KEY_HELP_URL, + getRequestedHermesAuthMethod, + hermesAuthMethodLabel, + normalizeHermesAuthMethod, +} = hermesAuth; + +type HermesAuthMethod = import("./onboard/hermes-auth").HermesAuthMethod; function getHermesToolGatewayBroker(): any { return require("./hermes-tool-gateway-broker"); @@ -491,13 +503,6 @@ const RESET = USE_COLOR ? "\x1b[0m" : ""; let OPENSHELL_BIN: string | null = null; const GATEWAY_NAME = "nemoclaw"; const BACK_TO_SELECTION = "__NEMOCLAW_BACK_TO_SELECTION__"; -type HermesAuthMethod = "oauth" | "api_key"; -const HERMES_AUTH_METHOD_OAUTH: HermesAuthMethod = "oauth"; -const HERMES_AUTH_METHOD_API_KEY: HermesAuthMethod = "api_key"; -const HERMES_NOUS_API_KEY_CREDENTIAL_ENV = - hermesProviderAuth.HERMES_NOUS_API_KEY_CREDENTIAL_ENV || "NOUS_API_KEY"; -const HERMES_NOUS_API_KEY_HELP_URL = "https://portal.nousresearch.com/manage-subscription"; - const OPENCLAW_LAUNCH_AGENT_PLIST = "~/Library/LaunchAgents/ai.openclaw.gateway.plist"; const BRAVE_SEARCH_HELP_URL = "https://brave.com/search/api/"; @@ -641,44 +646,6 @@ const { runCurlProbe, } = httpProbe; -function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null { - const normalized = String(value || "") - .trim() - .toLowerCase() - .replace(/[\s-]+/g, "_"); - if (!normalized) return null; - if (normalized === "oauth" || normalized === "nous_oauth" || normalized === "nous_portal_oauth") { - return HERMES_AUTH_METHOD_OAUTH; - } - if ( - normalized === "api" || - normalized === "key" || - normalized === "api_key" || - normalized === "apikey" || - normalized === "nous_api_key" - ) { - return HERMES_AUTH_METHOD_API_KEY; - } - return null; -} - -function hermesAuthMethodLabel(method: HermesAuthMethod | null | undefined): string { - return method === HERMES_AUTH_METHOD_API_KEY ? "Nous API Key" : "Nous Portal OAuth"; -} - -function getRequestedHermesAuthMethod(): HermesAuthMethod | null { - const raw = - process.env.NEMOCLAW_HERMES_AUTH_METHOD || - process.env.NEMOCLAW_HERMES_AUTH || - process.env.NEMOCLAW_NOUS_AUTH_METHOD || - ""; - const method = normalizeHermesAuthMethod(raw); - if (!raw || method) return method; - console.error(` Unsupported Hermes Provider auth method: ${raw}`); - console.error(" Valid values: oauth, nous-portal-oauth, api-key, nous-api-key"); - process.exit(1); -} - async function promptHermesAuthMethod(): Promise { const methods: Array<{ key: HermesAuthMethod; label: string }> = [ { key: HERMES_AUTH_METHOD_OAUTH, label: "Nous Portal OAuth (authenticate via browser)" }, diff --git a/src/lib/onboard/hermes-auth.ts b/src/lib/onboard/hermes-auth.ts new file mode 100644 index 0000000000..4fb8b9f03f --- /dev/null +++ b/src/lib/onboard/hermes-auth.ts @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { HermesAuthMethod } from "../hermes-provider-auth"; +import * as hermesProviderAuth from "../hermes-provider-auth"; + +export type { HermesAuthMethod }; + +export const HERMES_AUTH_METHOD_OAUTH: HermesAuthMethod = "oauth"; +export const HERMES_AUTH_METHOD_API_KEY: HermesAuthMethod = "api_key"; +export const HERMES_NOUS_API_KEY_CREDENTIAL_ENV = + hermesProviderAuth.HERMES_NOUS_API_KEY_CREDENTIAL_ENV || "NOUS_API_KEY"; +export const HERMES_NOUS_API_KEY_HELP_URL = "https://portal.nousresearch.com/manage-subscription"; + +export function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null { + const normalized = String(value || "") + .trim() + .toLowerCase() + .replace(/[\s-]+/g, "_"); + if (!normalized) return null; + if (normalized === "oauth" || normalized === "nous_oauth" || normalized === "nous_portal_oauth") { + return HERMES_AUTH_METHOD_OAUTH; + } + if ( + normalized === "api" || + normalized === "key" || + normalized === "api_key" || + normalized === "apikey" || + normalized === "nous_api_key" + ) { + return HERMES_AUTH_METHOD_API_KEY; + } + return null; +} + +export function hermesAuthMethodLabel(method: HermesAuthMethod | null | undefined): string { + return method === HERMES_AUTH_METHOD_API_KEY ? "Nous API Key" : "Nous Portal OAuth"; +} + +export function getRequestedHermesAuthMethod(): HermesAuthMethod | null { + const raw = + process.env.NEMOCLAW_HERMES_AUTH_METHOD || + process.env.NEMOCLAW_HERMES_AUTH || + process.env.NEMOCLAW_NOUS_AUTH_METHOD || + ""; + const method = normalizeHermesAuthMethod(raw); + if (!raw || method) return method; + console.error(` Unsupported Hermes Provider auth method: ${raw}`); + console.error(" Valid values: oauth, nous-portal-oauth, api-key, nous-api-key"); + process.exit(1); +} From 48e805301424e7a8c7eaae45aa02f59f02e7e0a8 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 17:48:45 -0700 Subject: [PATCH 43/54] refactor(cli): extract Hermes auth flow helpers --- src/lib/onboard.ts | 118 +++++---------------------- src/lib/onboard/hermes-auth.ts | 141 +++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 97 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index ee9433ddf6..c30815a9b7 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -646,103 +646,27 @@ const { runCurlProbe, } = httpProbe; -async function promptHermesAuthMethod(): Promise { - const methods: Array<{ key: HermesAuthMethod; label: string }> = [ - { key: HERMES_AUTH_METHOD_OAUTH, label: "Nous Portal OAuth (authenticate via browser)" }, - { - key: HERMES_AUTH_METHOD_API_KEY, - label: "Nous API Key (paste a key from the provider dashboard)", - }, - ]; - const requested = getRequestedHermesAuthMethod(); - if (isNonInteractive()) { - const method = - requested || - (resolveHermesNousApiKey() - ? HERMES_AUTH_METHOD_API_KEY - : HERMES_AUTH_METHOD_OAUTH); - note(` [non-interactive] Hermes auth: ${hermesAuthMethodLabel(method)}`); - return method; - } - - console.log(""); - console.log(" Hermes Provider authentication:"); - methods.forEach((method, index) => { - console.log(` ${index + 1}) ${method.label}`); - }); - console.log(""); - - const defaultIdx = (requested ? methods.findIndex((method) => method.key === requested) : 0) + 1; - const choice = await prompt(` Choose [${defaultIdx}]: `); - const navigation = getNavigationChoice(choice); - if (navigation === "back") return BACK_TO_SELECTION; - if (navigation === "exit") exitOnboardFromPrompt(); - const idx = parseInt(choice || String(defaultIdx), 10) - 1; - return methods[idx]?.key || methods[defaultIdx - 1]?.key || HERMES_AUTH_METHOD_OAUTH; -} - -function resolveHermesNousApiKey(): string | null { - return ( - // check-direct-credential-env-ignore -- Hermes Provider API keys are read only from the invoking shell for OpenShell provider registration; do not resolve host credentials.json. - normalizeCredentialValue(process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV]) || - normalizeCredentialValue(process.env.NEMOCLAW_PROVIDER_KEY) || - null - ); -} - -function stageNousApiKeyProviderEnv(): void { - const key = resolveHermesNousApiKey(); - if (key) { - process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key; - } -} - -async function ensureHermesNousApiKeyEnv(): Promise { - const existing = resolveHermesNousApiKey(); - if (existing) { - process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = existing; - return existing; - } - console.log(""); - console.log(" Hermes Provider Nous API Key"); - console.log(` Create or copy a key from ${HERMES_NOUS_API_KEY_HELP_URL}`); - const key = normalizeCredentialValue( - await prompt(" Nous API Key: ", { - secret: true, - }), - ); - const validationError = validateNvidiaApiKeyValue(key, HERMES_NOUS_API_KEY_CREDENTIAL_ENV); - if (validationError) { - console.error(validationError); - process.exit(1); - } - process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key; - return key; -} - -function openshellResultMessage(result: { - stdout?: string | Buffer | null; - stderr?: string | Buffer | null; -}): string { - return compactText(redact(`${result.stderr || ""} ${result.stdout || ""}`)); -} +const { + promptHermesAuthMethod, + resolveHermesNousApiKey, + stageNousApiKeyProviderEnv, + ensureHermesNousApiKeyEnv, + openshellResultMessage, + checkHermesProviderStoreReachable, +} = hermesAuth.createHermesAuthHelpers({ + isNonInteractive, + note, + prompt, + getNavigationChoice, + exitOnboardFromPrompt, + validateNvidiaApiKeyValue: (value: string, envName: string) => + validateNvidiaApiKeyValue(value, envName), + compactText, + redact, + runOpenshell, + backToSelection: BACK_TO_SELECTION, +}); -function checkHermesProviderStoreReachable( - runOpenshellImpl: typeof runOpenshell = runOpenshell, -): { ok: true } | { ok: false; message: string } { - const result = runOpenshellImpl(["provider", "list"], { - ignoreError: true, - stdio: ["ignore", "pipe", "pipe"], - timeout: 10_000, - }); - if (result.status === 0) return { ok: true }; - return { - ok: false, - message: - openshellResultMessage(result) || - "OpenShell provider storage is unreachable; the gateway may be stopped or refusing connections.", - }; -} async function selectOnboardAgent({ agentFlag = null, @@ -5158,7 +5082,7 @@ async function setupNim( console.log(""); continue selectionLoop; } - hermesAuthMethod = selectedHermesAuthMethod; + hermesAuthMethod = normalizeHermesAuthMethod(selectedHermesAuthMethod); if (hermesAuthMethod === HERMES_AUTH_METHOD_API_KEY) { credentialEnv = HERMES_NOUS_API_KEY_CREDENTIAL_ENV; stageNousApiKeyProviderEnv(); diff --git a/src/lib/onboard/hermes-auth.ts b/src/lib/onboard/hermes-auth.ts index 4fb8b9f03f..5f69fb4888 100644 --- a/src/lib/onboard/hermes-auth.ts +++ b/src/lib/onboard/hermes-auth.ts @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { normalizeCredentialValue } from "../credentials/store"; import type { HermesAuthMethod } from "../hermes-provider-auth"; import * as hermesProviderAuth from "../hermes-provider-auth"; @@ -49,3 +50,143 @@ export function getRequestedHermesAuthMethod(): HermesAuthMethod | null { console.error(" Valid values: oauth, nous-portal-oauth, api-key, nous-api-key"); process.exit(1); } + +export interface HermesAuthFlowDeps { + isNonInteractive(): boolean; + note(message: string): void; + prompt(question: string, options?: { secret?: boolean }): Promise; + getNavigationChoice(value?: string): "back" | "exit" | null; + exitOnboardFromPrompt(): never; + validateNvidiaApiKeyValue(value: string, envName: string): string | null; + compactText(value: string): string; + redact(value: unknown): string; + runOpenshell(args: string[], opts?: Record): { + status?: number | null; + stdout?: string | Buffer | null; + stderr?: string | Buffer | null; + }; + backToSelection: string; +} + +export interface HermesAuthHelpers { + promptHermesAuthMethod(): Promise; + resolveHermesNousApiKey(): string | null; + stageNousApiKeyProviderEnv(): void; + ensureHermesNousApiKeyEnv(): Promise; + openshellResultMessage(result: { + stdout?: string | Buffer | null; + stderr?: string | Buffer | null; + }): string; + checkHermesProviderStoreReachable( + runOpenshellImpl?: HermesAuthFlowDeps["runOpenshell"], + ): { ok: true } | { ok: false; message: string }; +} + +export function createHermesAuthHelpers(deps: HermesAuthFlowDeps): HermesAuthHelpers { + async function promptHermesAuthMethod(): Promise { + const methods: Array<{ key: HermesAuthMethod; label: string }> = [ + { key: HERMES_AUTH_METHOD_OAUTH, label: "Nous Portal OAuth (authenticate via browser)" }, + { + key: HERMES_AUTH_METHOD_API_KEY, + label: "Nous API Key (paste a key from the provider dashboard)", + }, + ]; + const requested = getRequestedHermesAuthMethod(); + if (deps.isNonInteractive()) { + const method = + requested || + (resolveHermesNousApiKey() + ? HERMES_AUTH_METHOD_API_KEY + : HERMES_AUTH_METHOD_OAUTH); + deps.note(` [non-interactive] Hermes auth: ${hermesAuthMethodLabel(method)}`); + return method; + } + + console.log(""); + console.log(" Hermes Provider authentication:"); + methods.forEach((method, index) => { + console.log(` ${index + 1}) ${method.label}`); + }); + console.log(""); + + const defaultIdx = (requested ? methods.findIndex((method) => method.key === requested) : 0) + 1; + const choice = await deps.prompt(` Choose [${defaultIdx}]: `); + const navigation = deps.getNavigationChoice(choice); + if (navigation === "back") return deps.backToSelection; + if (navigation === "exit") deps.exitOnboardFromPrompt(); + const idx = parseInt(choice || String(defaultIdx), 10) - 1; + return methods[idx]?.key || methods[defaultIdx - 1]?.key || HERMES_AUTH_METHOD_OAUTH; + } + + function resolveHermesNousApiKey(): string | null { + return ( + // check-direct-credential-env-ignore -- Hermes Provider API keys are read only from the invoking shell for OpenShell provider registration; do not resolve host credentials.json. + normalizeCredentialValue(process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV]) || + normalizeCredentialValue(process.env.NEMOCLAW_PROVIDER_KEY) || + null + ); + } + + function stageNousApiKeyProviderEnv(): void { + const key = resolveHermesNousApiKey(); + if (key) { + process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key; + } + } + + async function ensureHermesNousApiKeyEnv(): Promise { + const existing = resolveHermesNousApiKey(); + if (existing) { + process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = existing; + return existing; + } + console.log(""); + console.log(" Hermes Provider Nous API Key"); + console.log(` Create or copy a key from ${HERMES_NOUS_API_KEY_HELP_URL}`); + const key = normalizeCredentialValue( + await deps.prompt(" Nous API Key: ", { + secret: true, + }), + ); + const validationError = deps.validateNvidiaApiKeyValue(key, HERMES_NOUS_API_KEY_CREDENTIAL_ENV); + if (validationError) { + console.error(validationError); + process.exit(1); + } + process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key; + return key; + } + + function openshellResultMessage(result: { + stdout?: string | Buffer | null; + stderr?: string | Buffer | null; + }): string { + return deps.compactText(deps.redact(`${result.stderr || ""} ${result.stdout || ""}`)); + } + + function checkHermesProviderStoreReachable( + runOpenshellImpl: HermesAuthFlowDeps["runOpenshell"] = deps.runOpenshell, + ): { ok: true } | { ok: false; message: string } { + const result = runOpenshellImpl(["provider", "list"], { + ignoreError: true, + stdio: ["ignore", "pipe", "pipe"], + timeout: 10_000, + }); + if (result.status === 0) return { ok: true }; + return { + ok: false, + message: + openshellResultMessage(result) || + "OpenShell provider storage is unreachable; the gateway may be stopped or refusing connections.", + }; + } + + return { + promptHermesAuthMethod, + resolveHermesNousApiKey, + stageNousApiKeyProviderEnv, + ensureHermesNousApiKeyEnv, + openshellResultMessage, + checkHermesProviderStoreReachable, + }; +} From b2179813bd23bc89fe308380abde9dac1597f539 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 17:51:02 -0700 Subject: [PATCH 44/54] refactor(cli): extract onboard agent selection --- src/lib/onboard.ts | 24 ++++++++-------------- src/lib/onboard/agent-selection.ts | 33 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 16 deletions(-) create mode 100644 src/lib/onboard/agent-selection.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index c30815a9b7..0e5993ef78 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -15,6 +15,7 @@ const { cliName, setOnboardBrandingAgent, }: typeof import("./onboard/branding") = require("./onboard/branding"); +const { createSelectOnboardAgent }: typeof import("./onboard/agent-selection") = require("./onboard/agent-selection"); const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); const { @@ -668,22 +669,13 @@ const { }); -async function selectOnboardAgent({ - agentFlag = null, - session = null, -}: { - agentFlag?: string | null; - session?: { agent?: string | null } | null; - resume?: boolean; - canPrompt?: boolean; -} = {}): Promise { - const agent = agentOnboard.resolveAgent({ agentFlag, session }); - if (isNonInteractive()) { - const displayName = agent?.displayName || agentDefs.loadAgent("openclaw").displayName; - note(` [non-interactive] Agent: ${displayName}`); - } - return agent; -} +const selectOnboardAgent = createSelectOnboardAgent({ + resolveAgent: agentOnboard.resolveAgent, + loadAgent: agentDefs.loadAgent, + isNonInteractive, + note, +}); + const { getTransportRecoveryMessage, getProbeRecovery } = validationRecovery; diff --git a/src/lib/onboard/agent-selection.ts b/src/lib/onboard/agent-selection.ts new file mode 100644 index 0000000000..3f38593aee --- /dev/null +++ b/src/lib/onboard/agent-selection.ts @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { AgentDefinition } from "../agent/defs"; + +export interface SelectOnboardAgentDeps { + resolveAgent(options: { + agentFlag?: string | null; + session?: { agent?: string | null } | null; + }): AgentDefinition | null; + loadAgent(name: string): AgentDefinition; + isNonInteractive(): boolean; + note(message: string): void; +} + +export function createSelectOnboardAgent(deps: SelectOnboardAgentDeps) { + return async function selectOnboardAgent({ + agentFlag = null, + session = null, + }: { + agentFlag?: string | null; + session?: { agent?: string | null } | null; + resume?: boolean; + canPrompt?: boolean; + } = {}): Promise { + const agent = deps.resolveAgent({ agentFlag, session }); + if (deps.isNonInteractive()) { + const displayName = agent?.displayName || deps.loadAgent("openclaw").displayName; + deps.note(` [non-interactive] Agent: ${displayName}`); + } + return agent; + }; +} From 9e442d11b2a9266c723acc2bacd5939a6357ee54 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 17:54:03 -0700 Subject: [PATCH 45/54] refactor(cli): extract require value helper --- src/lib/core/require-value.ts | 9 +++++++++ src/lib/onboard.ts | 7 +------ src/lib/onboard/model-router.ts | 9 +-------- 3 files changed, 11 insertions(+), 14 deletions(-) create mode 100644 src/lib/core/require-value.ts diff --git a/src/lib/core/require-value.ts b/src/lib/core/require-value.ts new file mode 100644 index 0000000000..a61fe98e3c --- /dev/null +++ b/src/lib/core/require-value.ts @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +export function requireValue(value: T | null | undefined, message: string): T { + if (value === null || value === undefined) { + throw new Error(message); + } + return value; +} diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 0e5993ef78..040ffcd816 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -127,6 +127,7 @@ const { }: typeof import("./onboard/base-image") = require("./onboard/base-image"); const errnoUtils: typeof import("./core/errno") = require("./core/errno"); const { isErrnoException } = errnoUtils; +const { requireValue }: typeof import("./core/require-value") = require("./core/require-value"); type RunnerOptions = { env?: NodeJS.ProcessEnv; @@ -137,12 +138,6 @@ type RunnerOptions = { openshellBinary?: string; }; -function requireValue(value: T | null | undefined, message: string): T { - if (value == null) { - throw new Error(message); - } - return value; -} const { collectBuildContextStats, stageOptimizedSandboxBuildContext, diff --git a/src/lib/onboard/model-router.ts b/src/lib/onboard/model-router.ts index 81ca0d10d7..ec35e06063 100644 --- a/src/lib/onboard/model-router.ts +++ b/src/lib/onboard/model-router.ts @@ -7,7 +7,7 @@ import fs from "node:fs"; import http from "node:http"; import os from "node:os"; import path from "node:path"; - +import { requireValue } from "../core/require-value"; import { normalizeCredentialValue, resolveProviderCredential, @@ -59,13 +59,6 @@ export type BlueprintInferenceProfile = { router: BlueprintRouterConfig; }; -function requireValue(value: T | null | undefined, message: string): T { - if (value === null || value === undefined) { - throw new Error(message); - } - return value; -} - /** * Load a named inference profile and router config from blueprint.yaml. * Returns null if the blueprint or profile is missing. From 58f38f7c81c24b6a4ce9f779df5d681a8483c375 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 17:56:35 -0700 Subject: [PATCH 46/54] refactor(cli): move onboard step banner helper --- src/lib/onboard.ts | 7 +------ src/lib/onboard/prompt-helpers.ts | 6 ++++++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 040ffcd816..4068e95f47 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -302,6 +302,7 @@ const { exitOnboardFromPrompt, getNavigationChoice, isAffirmativeAnswer, + step, ...onboardPromptHelpers }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery"); @@ -616,12 +617,6 @@ const { getSandboxReuseState, repairRecordedSandbox } = sandboxReuse.createSandb const { streamSandboxCreate } = sandboxCreateStream; -function step(n: number, total: number, msg: string): void { - console.log(""); - console.log(` [${n}/${total}] ${msg}`); - console.log(` ${"─".repeat(50)}`); -} - const { executeSandboxCommandForVerification }: typeof import("./onboard/sandbox-verification-exec") = require("./onboard/sandbox-verification-exec"); diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts index 2bd3efe66e..c99e92f828 100644 --- a/src/lib/onboard/prompt-helpers.ts +++ b/src/lib/onboard/prompt-helpers.ts @@ -1,6 +1,12 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +export function step(n: number, total: number, msg: string): void { + console.log(""); + console.log(` [${n}/${total}] ${msg}`); + console.log(` ${"─".repeat(50)}`); +} + export function getNavigationChoice(value = ""): "back" | "exit" | null { const normalized = String(value || "") .trim() From 7b2d0de0cabe1f805a55490597fb2e1e7344ce2e Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Wed, 20 May 2026 19:48:33 -0700 Subject: [PATCH 47/54] refactor(cli): extract validation recovery prompts --- src/lib/onboard.ts | 133 ++------------- src/lib/onboard/validation-recovery-prompt.ts | 157 ++++++++++++++++++ 2 files changed, 168 insertions(+), 122 deletions(-) create mode 100644 src/lib/onboard/validation-recovery-prompt.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 4068e95f47..78db3e35d6 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -307,6 +307,9 @@ const { }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery"); const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); +const { + createValidationRecoveryPromptHelpers, +}: typeof import("./onboard/validation-recovery-prompt") = require("./onboard/validation-recovery-prompt"); const { createOpenshellCliHelpers }: typeof import("./onboard/openshell-cli") = require("./onboard/openshell-cli"); const { resolveSandboxGpuFlagFromOptions, @@ -682,128 +685,14 @@ const { // validateNvidiaApiKeyValue — see validation import above -async function replaceNamedCredential( - envName: string, - label: string, - helpUrl: string | null = null, - validator: ((value: string) => string | null) | null = null, -): Promise { - if (helpUrl) { - console.log(""); - console.log(` Get your ${label} from: ${helpUrl}`); - console.log(""); - } - - while (true) { - const key = normalizeCredentialValue(await prompt(` ${label}: `, { secret: true })); - if (!key) { - console.error(` ${label} is required.`); - continue; - } - const validationError = typeof validator === "function" ? validator(key) : null; - if (validationError) { - console.error(validationError); - continue; - } - saveCredential(envName, key); - process.env[envName] = key; - console.log(""); - console.log(" Credential staged. Onboarding will register it with the OpenShell gateway."); - console.log(""); - return key; - } -} - -async function promptValidationRecovery( - label: string, - recovery: ProbeRecovery, - credentialEnv: string | null = null, - helpUrl: string | null = null, -): Promise<"credential" | "selection" | "retry" | "model"> { - if (isNonInteractive()) { - process.exit(1); - } - - if (recovery.kind === "credential" && credentialEnv) { - console.log( - ` ${label} authorization failed. Re-enter the API key or choose a different provider/model.`, - ); - console.log(" ⚠️ Do NOT paste your API key here — use the options below:"); - const choice = ( - await prompt(" Options: retry (re-enter key), back (change provider), exit [retry]: ", { - secret: true, - }) - ) - .trim() - .toLowerCase(); - // Guard against the user accidentally pasting an API key at this prompt. - // Tokens don't contain spaces; human sentences do — the no-space + length check - // avoids false-positives on long typed sentences. - const API_KEY_PREFIXES = ["nvapi-", "ghp_", "gcm-", "sk-", "gpt-", "gemini-", "nvcf-"]; - const looksLikeToken = - API_KEY_PREFIXES.some((p) => choice.startsWith(p)) || - (!choice.includes(" ") && choice.length > 40) || - // Regex fallback: base64-safe token pattern (20+ chars, no spaces, mixed alphanum) - /^[A-Za-z0-9_\-\.]{20,}$/.test(choice); - // validateNvidiaApiKeyValue is provider-aware: it only enforces the - // nvapi- prefix when credentialEnv === "NVIDIA_API_KEY", so passing it - // unconditionally here is safe for Anthropic/OpenAI/Gemini too. - const validator = (key: string) => validateNvidiaApiKeyValue(key, credentialEnv); - if (looksLikeToken) { - console.log(" ⚠️ That looks like an API key — do not paste credentials here."); - console.log(" Treating as 'retry'. You will be prompted to enter the key securely."); - await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator); - return "credential"; - } - if (choice === "back") { - console.log(" Returning to provider selection."); - console.log(""); - return "selection"; - } - if (choice === "exit" || choice === "quit") { - exitOnboardFromPrompt(); - } - if (choice === "" || choice === "retry") { - await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator); - return "credential"; - } - console.log(" Please choose a provider/model again."); - console.log(""); - return "selection"; - } - - if (recovery.kind === "transport") { - console.log(getTransportRecoveryMessage("failure" in recovery ? recovery.failure || {} : {})); - const choice = (await prompt(" Type 'retry', 'back', or 'exit' [retry]: ")) - .trim() - .toLowerCase(); - if (choice === "back") { - console.log(" Returning to provider selection."); - console.log(""); - return "selection"; - } - if (choice === "exit" || choice === "quit") { - exitOnboardFromPrompt(); - } - if (choice === "" || choice === "retry") { - console.log(""); - return "retry"; - } - console.log(" Please choose a provider/model again."); - console.log(""); - return "selection"; - } - - if (recovery.kind === "model") { - console.log(` Please enter a different ${label} model name.`); - console.log(""); - return "model"; - } - - console.log(" Please choose a provider/model again."); - console.log(""); - return "selection"; -} +const { replaceNamedCredential, promptValidationRecovery } = createValidationRecoveryPromptHelpers({ + isNonInteractive, + prompt, + validateNvidiaApiKeyValue: (key: string, credentialEnv: string | null) => + validateNvidiaApiKeyValue(key, credentialEnv ?? undefined), + getTransportRecoveryMessage: (failure: any) => getTransportRecoveryMessage(failure), + exitOnboardFromPrompt, +}); // Provider CRUD — thin wrappers that inject runOpenshell to avoid circular deps. const { buildProviderArgs } = onboardProviders; diff --git a/src/lib/onboard/validation-recovery-prompt.ts b/src/lib/onboard/validation-recovery-prompt.ts new file mode 100644 index 0000000000..b44cd0676d --- /dev/null +++ b/src/lib/onboard/validation-recovery-prompt.ts @@ -0,0 +1,157 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { normalizeCredentialValue, saveCredential } from "../credentials/store"; +import type { ProbeRecovery } from "../validation-recovery"; + +export interface ValidationRecoveryPromptDeps { + isNonInteractive(): boolean; + prompt(question: string, options?: { secret?: boolean }): Promise; + validateNvidiaApiKeyValue(key: string, credentialEnv: string | null): string | null; + getTransportRecoveryMessage(failure: any): string; + exitOnboardFromPrompt(): never; +} + +export interface ValidationRecoveryPromptHelpers { + replaceNamedCredential( + envName: string, + label: string, + helpUrl?: string | null, + validator?: ((value: string) => string | null) | null, + ): Promise; + promptValidationRecovery( + label: string, + recovery: ProbeRecovery, + credentialEnv?: string | null, + helpUrl?: string | null, + ): Promise<"credential" | "selection" | "retry" | "model">; +} + +export function createValidationRecoveryPromptHelpers( + deps: ValidationRecoveryPromptDeps, +): ValidationRecoveryPromptHelpers { + async function replaceNamedCredential( + envName: string, + label: string, + helpUrl: string | null = null, + validator: ((value: string) => string | null) | null = null, + ): Promise { + if (helpUrl) { + console.log(""); + console.log(` Get your ${label} from: ${helpUrl}`); + console.log(""); + } + + while (true) { + const key = normalizeCredentialValue(await deps.prompt(` ${label}: `, { secret: true })); + if (!key) { + console.error(` ${label} is required.`); + continue; + } + const validationError = typeof validator === "function" ? validator(key) : null; + if (validationError) { + console.error(validationError); + continue; + } + saveCredential(envName, key); + process.env[envName] = key; + console.log(""); + console.log(" Credential staged. Onboarding will register it with the OpenShell gateway."); + console.log(""); + return key; + } + } + + async function promptValidationRecovery( + label: string, + recovery: ProbeRecovery, + credentialEnv: string | null = null, + helpUrl: string | null = null, + ): Promise<"credential" | "selection" | "retry" | "model"> { + if (deps.isNonInteractive()) { + process.exit(1); + } + + if (recovery.kind === "credential" && credentialEnv) { + console.log( + ` ${label} authorization failed. Re-enter the API key or choose a different provider/model.`, + ); + console.log(" ⚠️ Do NOT paste your API key here — use the options below:"); + const choice = ( + await deps.prompt(" Options: retry (re-enter key), back (change provider), exit [retry]: ", { + secret: true, + }) + ) + .trim() + .toLowerCase(); + // Guard against the user accidentally pasting an API key at this prompt. + // Tokens don't contain spaces; human sentences do — the no-space + length check + // avoids false-positives on long typed sentences. + const API_KEY_PREFIXES = ["nvapi-", "ghp_", "gcm-", "sk-", "gpt-", "gemini-", "nvcf-"]; + const looksLikeToken = + API_KEY_PREFIXES.some((prefix) => choice.startsWith(prefix)) || + (!choice.includes(" ") && choice.length > 40) || + // Regex fallback: base64-safe token pattern (20+ chars, no spaces, mixed alphanum) + /^[A-Za-z0-9_\-.]{20,}$/.test(choice); + // validateNvidiaApiKeyValue is provider-aware: it only enforces the + // nvapi- prefix when credentialEnv === "NVIDIA_API_KEY", so passing it + // unconditionally here is safe for Anthropic/OpenAI/Gemini too. + const validator = (key: string) => deps.validateNvidiaApiKeyValue(key, credentialEnv); + if (looksLikeToken) { + console.log(" ⚠️ That looks like an API key — do not paste credentials here."); + console.log(" Treating as 'retry'. You will be prompted to enter the key securely."); + await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator); + return "credential"; + } + if (choice === "back") { + console.log(" Returning to provider selection."); + console.log(""); + return "selection"; + } + if (choice === "exit" || choice === "quit") { + deps.exitOnboardFromPrompt(); + } + if (choice === "" || choice === "retry") { + await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator); + return "credential"; + } + console.log(" Please choose a provider/model again."); + console.log(""); + return "selection"; + } + + if (recovery.kind === "transport") { + console.log(deps.getTransportRecoveryMessage("failure" in recovery ? recovery.failure || {} : {})); + const choice = (await deps.prompt(" Type 'retry', 'back', or 'exit' [retry]: ")) + .trim() + .toLowerCase(); + if (choice === "back") { + console.log(" Returning to provider selection."); + console.log(""); + return "selection"; + } + if (choice === "exit" || choice === "quit") { + deps.exitOnboardFromPrompt(); + } + if (choice === "" || choice === "retry") { + console.log(""); + return "retry"; + } + console.log(" Please choose a provider/model again."); + console.log(""); + return "selection"; + } + + if (recovery.kind === "model") { + console.log(` Please enter a different ${label} model name.`); + console.log(""); + return "model"; + } + + console.log(" Please choose a provider/model again."); + console.log(""); + return "selection"; + } + + return { replaceNamedCredential, promptValidationRecovery }; +} From eef52549d69c94879b7b8ddfa6368e28bda80756 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Thu, 21 May 2026 14:42:43 -0700 Subject: [PATCH 48/54] refactor(cli): remove duplicate onboard sleep helper --- src/lib/onboard.ts | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 78db3e35d6..9957c2f298 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -1324,10 +1324,6 @@ function getOpenShellInstallDeps(): OpenShellInstallDeps { }; } -function sleep(seconds: number): void { - sleepSeconds(seconds); -} - function runQuietOpenshell(args: string[]) { return runOpenshell(args, { ignoreError: true, @@ -1356,7 +1352,7 @@ function terminateDockerDriverGatewayProcess(pid: number): boolean { process.kill(pid, "SIGTERM"); for (let i = 0; i < 10; i += 1) { if (!isPidAlive(pid)) break; - sleep(1); + sleepSeconds(1); } if (isPidAlive(pid)) process.kill(pid, "SIGKILL"); return true; @@ -2041,7 +2037,7 @@ function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds = // Package-managed OpenShell gateways report readiness through // `sandbox list`; legacy Kubernetes gateways may still expose pod state. if (isLinuxDockerDriverGatewayEnabled()) { - if (i < attempts - 1) sleep(delaySeconds); + if (i < attempts - 1) sleepSeconds(delaySeconds); continue; } const podPhase = runCaptureOpenshell( @@ -2061,7 +2057,7 @@ function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds = { ignoreError: true }, ); if (podPhase === "Running") return true; - sleep(delaySeconds); + sleepSeconds(delaySeconds); } return false; } @@ -2539,7 +2535,7 @@ async function preflight( ` Cleaning up orphaned SSH port-forward on port ${port} (PID ${portCheck.pid})...`, ); run(["kill", String(portCheck.pid)], { ignoreError: true }); - sleep(1); + sleepSeconds(1); portCheck = await checkPortAvailable(port, portCheckOptions); if (portCheck.ok) { console.log(` ✓ Port ${port} available after orphaned forward cleanup (${label})`); @@ -2801,7 +2797,7 @@ async function startGatewayWithOptions( if (isGatewayHealthy(status, namedInfo, currentInfo) && (await isGatewayHttpReady())) { return; // success } - if (i < healthPollCount - 1) sleep(healthPollInterval); + if (i < healthPollCount - 1) sleepSeconds(healthPollInterval); } throw new Error("Gateway failed to start"); @@ -2948,7 +2944,7 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg console.log(` Restarting unhealthy Docker-driver gateway process (PID ${existingPid})...`); try { process.kill(existingPid, "SIGTERM"); - sleep(1); + sleepSeconds(1); } catch { /* best effort; the new process will surface any remaining port conflict */ } @@ -2990,7 +2986,7 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg break; } if (!registerDockerDriverGatewayEndpoint()) { - if (i < pollCount - 1) sleep(pollInterval); + if (i < pollCount - 1) sleepSeconds(pollInterval); continue; } const status = runCaptureOpenshell(["status"], { ignoreError: true }); @@ -3005,7 +3001,7 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg await verifySandboxBridgeGatewayReachableOrExit(exitOnFailure, { skip: skipSandboxBridgeReachability }); console.log(" ✓ Docker-driver gateway is healthy"); return; } - if (i < pollCount - 1) sleep(pollInterval); + if (i < pollCount - 1) sleepSeconds(pollInterval); } reportDockerDriverGatewayStartFailure(logPath, childExit, { exitOnFailure }); @@ -3200,7 +3196,7 @@ async function recoverGatewayRuntime() { } return true; } - if (i < recoveryPollCount - 1) sleep(recoveryPollInterval); + if (i < recoveryPollCount - 1) sleepSeconds(recoveryPollInterval); } return false; @@ -4122,7 +4118,7 @@ async function createSandbox( sandboxName, gpuDevice: effectiveSandboxGpuConfig.sandboxGpuDevice, timeoutSecs: sandboxReadyTimeoutSecs, - deps: { runOpenshell, runCaptureOpenshell, sleep }, + deps: { runOpenshell, runCaptureOpenshell, sleep: sleepSeconds }, }); const createResult = await streamSandboxCreate(createCommand, sandboxEnv, { readyCheck: () => { @@ -4189,7 +4185,7 @@ async function createSandbox( ready = true; break; } - if (i < readyAttempts - 1) sleep(2); + if (i < readyAttempts - 1) sleepSeconds(2); } const restoreBackupPath = @@ -4257,7 +4253,7 @@ async function createSandbox( if (i === 14) { console.warn(" Dashboard taking longer than expected to start. Continuing..."); } else { - sleep(2); + sleepSeconds(2); } } @@ -5611,7 +5607,7 @@ async function setupNim( runShell("set -o pipefail; curl -fsSL https://ollama.com/install.sh | sh"); // Give the just-started ollama.service a moment to bind port // 11434 before we probe or apply the systemd drop-in override. - sleep(2); + sleepSeconds(2); // Linux native + systemd: force a loopback-only OLLAMA_HOST drop-in // and let systemd own the daemon (avoids racing the installer's // daemon with our own `ollama serve`). This also repairs older @@ -7005,7 +7001,7 @@ const { note, isWsl, redact, - sleep, + sleep: sleepSeconds, printAgentDashboardUi: agentOnboard.printDashboardUi, }); From 5bfa612ac22941123c54694cbc30aad490a3ce5a Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Thu, 21 May 2026 18:57:54 -0700 Subject: [PATCH 49/54] refactor(cli): extract web search flow helpers --- src/lib/onboard.ts | 181 ++--------------------- src/lib/onboard/web-search-flow.ts | 221 +++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+), 166 deletions(-) create mode 100644 src/lib/onboard/web-search-flow.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 9957c2f298..b806f38d0e 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -307,6 +307,7 @@ const { }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers"); const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery"); const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup"); +const { createWebSearchFlowHelpers }: typeof import("./onboard/web-search-flow") = require("./onboard/web-search-flow"); const { createValidationRecoveryPromptHelpers, }: typeof import("./onboard/validation-recovery-prompt") = require("./onboard/validation-recovery-prompt"); @@ -913,173 +914,21 @@ const { }); -function validateBraveSearchApiKey(apiKey: string): CurlProbeResult { - return runCurlProbe([ - "-sS", - "--compressed", - "-H", - "Accept: application/json", - "-H", - "Accept-Encoding: gzip", - "-H", - `X-Subscription-Token: ${apiKey}`, - "--get", - "--data-urlencode", - "q=ping", - "--data-urlencode", - "count=1", - "https://api.search.brave.com/res/v1/web/search", - ]); -} - -async function promptBraveSearchRecovery( - validation: ValidationFailureLike, -): Promise<"retry" | "skip"> { - const recovery = classifyValidationFailure(validation); - - if (recovery.kind === "credential") { - console.log(" Brave Search rejected that API key."); - } else if (recovery.kind === "transport") { - console.log(getTransportRecoveryMessage(validation)); - } else { - console.log(" Brave Search validation did not succeed."); - } - - const answer = (await prompt(" Type 'retry', 'skip', or 'exit' [retry]: ")).trim().toLowerCase(); - if (answer === "skip") return "skip"; - if (answer === "exit" || answer === "quit") { - exitOnboardFromPrompt(); - } - return "retry"; -} - -async function promptBraveSearchApiKey(): Promise { - console.log(""); - console.log(` Get your Brave Search API key from: ${BRAVE_SEARCH_HELP_URL}`); - console.log(""); - - while (true) { - const key = normalizeCredentialValue( - await prompt(" Brave Search API key: ", { secret: true }), - ); - if (!key) { - console.error(" Brave Search API key is required."); - continue; - } - return key; - } -} - -async function ensureValidatedBraveSearchCredential( - nonInteractive = isNonInteractive(), -): Promise { - const savedApiKey = getCredential(webSearch.BRAVE_API_KEY_ENV); - let apiKey: string | null = - savedApiKey || normalizeCredentialValue(process.env[webSearch.BRAVE_API_KEY_ENV]); - let usingSavedKey = Boolean(savedApiKey); - - while (true) { - if (!apiKey) { - if (nonInteractive) { - throw new Error( - "Brave Search requires BRAVE_API_KEY or a saved Brave Search credential in non-interactive mode.", - ); - } - apiKey = await promptBraveSearchApiKey(); - usingSavedKey = false; - } - - const validation = validateBraveSearchApiKey(apiKey); - if (validation.ok) { - saveCredential(webSearch.BRAVE_API_KEY_ENV, apiKey); - process.env[webSearch.BRAVE_API_KEY_ENV] = apiKey; - return apiKey; - } - - const prefix = usingSavedKey - ? " Saved Brave Search API key validation failed." - : " Brave Search API key validation failed."; - console.error(prefix); - if (validation.message) { - console.error(` ${validation.message}`); - } - - if (nonInteractive) { - throw new Error( - validation.message || "Brave Search API key validation failed in non-interactive mode.", - ); - } - - const action = await promptBraveSearchRecovery(validation); - if (action === "skip") { - console.log(" Skipping Brave Web Search setup."); - console.log(""); - return null; - } - - apiKey = null; - usingSavedKey = false; - } -} - -async function configureWebSearch( - existingConfig: WebSearchConfig | null = null, - agent: AgentDefinition | null = null, - dockerfilePathOverride: string | null = null, -): Promise { - if (!agentSupportsWebSearch(agent, dockerfilePathOverride, ROOT)) { - note(` Web search is not yet supported by ${agent?.displayName ?? "this agent"}. Skipping.`); - return null; - } - - if (existingConfig) { - return { fetchEnabled: true }; - } - - if (isNonInteractive()) { - const braveApiKey = normalizeCredentialValue(process.env[webSearch.BRAVE_API_KEY_ENV]); - if (!braveApiKey) { - return null; - } - note(" [non-interactive] Brave Web Search requested."); - const validation = validateBraveSearchApiKey(braveApiKey); - if (!validation.ok) { - console.warn( - ` Brave Search API key validation failed. Web search will be disabled — re-enable later via \`${cliName()} config web-search\`.`, - ); - if (validation.message) { - console.warn(` ${validation.message}`); - } - return null; - } - saveCredential(webSearch.BRAVE_API_KEY_ENV, braveApiKey); - process.env[webSearch.BRAVE_API_KEY_ENV] = braveApiKey; - return { fetchEnabled: true }; - } - const enableAnswer = await prompt(" Enable Brave Web Search? [y/N]: "); - if (!isAffirmativeAnswer(enableAnswer)) { - return null; - } - - const braveApiKey = await ensureValidatedBraveSearchCredential(); - if (!braveApiKey) { - return null; - } - - console.log(" ✓ Enabled Brave Web Search"); - console.log(""); - return { fetchEnabled: true }; -} +const { + validateBraveSearchApiKey, + promptBraveSearchRecovery, + promptBraveSearchApiKey, + ensureValidatedBraveSearchCredential, + configureWebSearch, + verifyWebSearchInsideSandbox, +} = createWebSearchFlowHelpers({ + prompt, + note, + isNonInteractive, + cliName, + runCaptureOpenshell, +}); -function verifyWebSearchInsideSandbox( - sandboxName: string, - agent: AgentDefinition | null | undefined, -): void { - verifyWebSearchInsideSandboxWithDeps(sandboxName, agent, { - runCaptureOpenshell, - cliName, - }); -} // getSandboxInferenceConfig — moved to onboard-providers.ts diff --git a/src/lib/onboard/web-search-flow.ts b/src/lib/onboard/web-search-flow.ts new file mode 100644 index 0000000000..ab1af78d5f --- /dev/null +++ b/src/lib/onboard/web-search-flow.ts @@ -0,0 +1,221 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { CurlProbeResult } from "../adapters/http/probe"; +import { runCurlProbe } from "../adapters/http/probe"; +import type { AgentDefinition } from "../agent/defs"; +import { getCredential, normalizeCredentialValue, saveCredential } from "../credentials/store"; +import type { WebSearchConfig } from "../inference/web-search"; +import { BRAVE_API_KEY_ENV } from "../inference/web-search"; +import { ROOT } from "../runner"; +import { classifyValidationFailure } from "../validation"; +import { getTransportRecoveryMessage } from "../validation-recovery"; +import { exitOnboardFromPrompt, isAffirmativeAnswer } from "./prompt-helpers"; +import type { ValidationFailureLike } from "./types"; +import { agentSupportsWebSearch } from "./web-search-support"; +import { verifyWebSearchInsideSandbox as verifyWebSearchInsideSandboxWithDeps } from "./web-search-verify"; + +const BRAVE_SEARCH_HELP_URL = "https://brave.com/search/api/"; + +export interface WebSearchFlowDeps { + prompt(question: string, options?: { secret?: boolean }): Promise; + note(message: string): void; + isNonInteractive(): boolean; + cliName(): string; + runCaptureOpenshell(args: string[], opts?: Record): string | null; +} + +export interface WebSearchFlowHelpers { + validateBraveSearchApiKey(apiKey: string): CurlProbeResult; + promptBraveSearchRecovery(validation: ValidationFailureLike): Promise<"retry" | "skip">; + promptBraveSearchApiKey(): Promise; + ensureValidatedBraveSearchCredential(nonInteractive?: boolean): Promise; + configureWebSearch( + existingConfig?: WebSearchConfig | null, + agent?: AgentDefinition | null, + dockerfilePathOverride?: string | null, + ): Promise; + verifyWebSearchInsideSandbox( + sandboxName: string, + agent: AgentDefinition | null | undefined, + ): void; +} + +export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFlowHelpers { + function validateBraveSearchApiKey(apiKey: string): CurlProbeResult { + return runCurlProbe([ + "-sS", + "--compressed", + "-H", + "Accept: application/json", + "-H", + "Accept-Encoding: gzip", + "-H", + `X-Subscription-Token: ${apiKey}`, + "--get", + "--data-urlencode", + "q=ping", + "--data-urlencode", + "count=1", + "https://api.search.brave.com/res/v1/web/search", + ]); + } + + async function promptBraveSearchRecovery( + validation: ValidationFailureLike, + ): Promise<"retry" | "skip"> { + const recovery = classifyValidationFailure(validation); + + if (recovery.kind === "credential") { + console.log(" Brave Search rejected that API key."); + } else if (recovery.kind === "transport") { + console.log(getTransportRecoveryMessage(validation)); + } else { + console.log(" Brave Search validation did not succeed."); + } + + const answer = (await deps.prompt(" Type 'retry', 'skip', or 'exit' [retry]: ")).trim().toLowerCase(); + if (answer === "skip") return "skip"; + if (answer === "exit" || answer === "quit") { + exitOnboardFromPrompt(); + } + return "retry"; + } + + async function promptBraveSearchApiKey(): Promise { + console.log(""); + console.log(` Get your Brave Search API key from: ${BRAVE_SEARCH_HELP_URL}`); + console.log(""); + + while (true) { + const key = normalizeCredentialValue( + await deps.prompt(" Brave Search API key: ", { secret: true }), + ); + if (!key) { + console.error(" Brave Search API key is required."); + continue; + } + return key; + } + } + + async function ensureValidatedBraveSearchCredential( + nonInteractive = deps.isNonInteractive(), + ): Promise { + const savedApiKey = getCredential(BRAVE_API_KEY_ENV); + let apiKey: string | null = + savedApiKey || normalizeCredentialValue(process.env[BRAVE_API_KEY_ENV]); + let usingSavedKey = Boolean(savedApiKey); + + while (true) { + if (!apiKey) { + if (nonInteractive) { + throw new Error( + "Brave Search requires BRAVE_API_KEY or a saved Brave Search credential in non-interactive mode.", + ); + } + apiKey = await promptBraveSearchApiKey(); + usingSavedKey = false; + } + + const validation = validateBraveSearchApiKey(apiKey); + if (validation.ok) { + saveCredential(BRAVE_API_KEY_ENV, apiKey); + process.env[BRAVE_API_KEY_ENV] = apiKey; + return apiKey; + } + + const prefix = usingSavedKey + ? " Saved Brave Search API key validation failed." + : " Brave Search API key validation failed."; + console.error(prefix); + if (validation.message) { + console.error(` ${validation.message}`); + } + + if (nonInteractive) { + throw new Error( + validation.message || "Brave Search API key validation failed in non-interactive mode.", + ); + } + + const action = await promptBraveSearchRecovery(validation); + if (action === "skip") { + console.log(" Skipping Brave Web Search setup."); + console.log(""); + return null; + } + + apiKey = null; + usingSavedKey = false; + } + } + + async function configureWebSearch( + existingConfig: WebSearchConfig | null = null, + agent: AgentDefinition | null = null, + dockerfilePathOverride: string | null = null, + ): Promise { + if (!agentSupportsWebSearch(agent, dockerfilePathOverride, ROOT)) { + deps.note(` Web search is not yet supported by ${agent?.displayName ?? "this agent"}. Skipping.`); + return null; + } + + if (existingConfig) { + return { fetchEnabled: true }; + } + + if (deps.isNonInteractive()) { + const braveApiKey = normalizeCredentialValue(process.env[BRAVE_API_KEY_ENV]); + if (!braveApiKey) { + return null; + } + deps.note(" [non-interactive] Brave Web Search requested."); + const validation = validateBraveSearchApiKey(braveApiKey); + if (!validation.ok) { + console.warn( + ` Brave Search API key validation failed. Web search will be disabled — re-enable later via \`${deps.cliName()} config web-search\`.`, + ); + if (validation.message) { + console.warn(` ${validation.message}`); + } + return null; + } + saveCredential(BRAVE_API_KEY_ENV, braveApiKey); + process.env[BRAVE_API_KEY_ENV] = braveApiKey; + return { fetchEnabled: true }; + } + const enableAnswer = await deps.prompt(" Enable Brave Web Search? [y/N]: "); + if (!isAffirmativeAnswer(enableAnswer)) { + return null; + } + + const braveApiKey = await ensureValidatedBraveSearchCredential(); + if (!braveApiKey) { + return null; + } + + console.log(" ✓ Enabled Brave Web Search"); + console.log(""); + return { fetchEnabled: true }; + } + + function verifyWebSearchInsideSandbox( + sandboxName: string, + agent: AgentDefinition | null | undefined, + ): void { + verifyWebSearchInsideSandboxWithDeps(sandboxName, agent, { + runCaptureOpenshell: deps.runCaptureOpenshell, + cliName: deps.cliName, + }); + } + + return { + validateBraveSearchApiKey, + promptBraveSearchRecovery, + promptBraveSearchApiKey, + ensureValidatedBraveSearchCredential, + configureWebSearch, + verifyWebSearchInsideSandbox, + }; +} From 826d82afaa1be6031cd6041026571b1c72d2d67a Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Thu, 21 May 2026 19:02:10 -0700 Subject: [PATCH 50/54] refactor(cli): extract inference selection validation --- src/lib/onboard.ts | 156 +----------- .../onboard/inference-selection-validation.ts | 223 ++++++++++++++++++ 2 files changed, 236 insertions(+), 143 deletions(-) create mode 100644 src/lib/onboard/inference-selection-validation.ts diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index b806f38d0e..a6e9cca8b1 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -16,6 +16,9 @@ const { setOnboardBrandingAgent, }: typeof import("./onboard/branding") = require("./onboard/branding"); const { createSelectOnboardAgent }: typeof import("./onboard/agent-selection") = require("./onboard/agent-selection"); +const { + createInferenceSelectionValidationHelpers, +}: typeof import("./onboard/inference-selection-validation") = require("./onboard/inference-selection-validation"); const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files"); const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup"); const { @@ -945,150 +948,17 @@ const { probeAnthropicEndpoint, } = require("./inference/onboard-probes"); -async function validateOpenAiLikeSelection( - label: string, - endpointUrl: string, - model: string, - credentialEnv: string | null = null, - retryMessage = "Please choose a provider/model again.", - helpUrl: string | null = null, - options: { - authMode?: "bearer" | "query-param"; - requireResponsesToolCalling?: boolean; - requireChatCompletionsToolCalling?: boolean; - skipResponsesProbe?: boolean; - probeStreaming?: boolean; - } = {}, -): Promise { - const apiKey = credentialEnv ? getCredential(credentialEnv) : ""; - const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options); - if (!probe.ok) { - console.error(` ${label} endpoint validation failed.`); - console.error(` ${probe.message}`); - if (isNonInteractive()) { - process.exit(1); - } - const retry = await promptValidationRecovery( - label, - getProbeRecovery(probe), - credentialEnv, - helpUrl, - ); - if (retry === "selection") { - console.log(` ${retryMessage}`); - console.log(""); - } - return { ok: false, retry }; - } - if (probe.note) { - console.log(` ℹ ${probe.note}`); - } else { - console.log(` ${probe.label} available — ${agentProductName()} will use ${probe.api}.`); - } - return { ok: true, api: probe.api ?? "openai-completions" }; -} - -async function validateAnthropicSelectionWithRetryMessage( - label: string, - endpointUrl: string, - model: string, - credentialEnv: string, - retryMessage = "Please choose a provider/model again.", - helpUrl: string | null = null, -): Promise { - const apiKey = getCredential(credentialEnv); - const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); - if (!probe.ok) { - console.error(` ${label} endpoint validation failed.`); - console.error(` ${probe.message}`); - if (isNonInteractive()) { - process.exit(1); - } - const retry = await promptValidationRecovery( - label, - getProbeRecovery(probe), - credentialEnv, - helpUrl, - ); - if (retry === "selection") { - console.log(` ${retryMessage}`); - console.log(""); - } - return { ok: false, retry }; - } - console.log(` ${probe.label} available — ${agentProductName()} will use ${probe.api}.`); - return { ok: true, api: probe.api }; -} - -async function validateCustomOpenAiLikeSelection( - label: string, - endpointUrl: string, - model: string, - credentialEnv: string, - helpUrl: string | null = null, -): Promise { - const apiKey = getCredential(credentialEnv); - const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, { - requireResponsesToolCalling: true, - skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API), - probeStreaming: true, - }); - if (probe.ok) { - if (probe.note) { - console.log(` ℹ ${probe.note}`); - } else { - console.log(` ${probe.label} available — ${agentProductName()} will use ${probe.api}.`); - } - return { ok: true, api: probe.api ?? "openai-completions" }; - } - console.error(` ${label} endpoint validation failed.`); - console.error(` ${probe.message}`); - if (isNonInteractive()) { - process.exit(1); - } - const retry = await promptValidationRecovery( - label, - getProbeRecovery(probe, { allowModelRetry: true }), - credentialEnv, - helpUrl, - ); - if (retry === "selection") { - console.log(" Please choose a provider/model again."); - console.log(""); - } - return { ok: false, retry }; -} +const { + validateOpenAiLikeSelection, + validateAnthropicSelectionWithRetryMessage, + validateCustomOpenAiLikeSelection, + validateCustomAnthropicSelection, +} = createInferenceSelectionValidationHelpers({ + isNonInteractive, + agentProductName, + promptValidationRecovery, +}); -async function validateCustomAnthropicSelection( - label: string, - endpointUrl: string, - model: string, - credentialEnv: string, - helpUrl: string | null = null, -): Promise { - const apiKey = getCredential(credentialEnv); - const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); - if (probe.ok) { - console.log(` ${probe.label} available — ${agentProductName()} will use ${probe.api}.`); - return { ok: true, api: probe.api }; - } - console.error(` ${label} endpoint validation failed.`); - console.error(` ${probe.message}`); - if (isNonInteractive()) { - process.exit(1); - } - const retry = await promptValidationRecovery( - label, - getProbeRecovery(probe, { allowModelRetry: true }), - credentialEnv, - helpUrl, - ); - if (retry === "selection") { - console.log(" Please choose a provider/model again."); - console.log(""); - } - return { ok: false, retry }; -} const { promptCloudModel, promptRemoteModel, promptInputModel } = modelPrompts; const { validateAnthropicModel, validateOpenAiLikeModel } = providerModels; diff --git a/src/lib/onboard/inference-selection-validation.ts b/src/lib/onboard/inference-selection-validation.ts new file mode 100644 index 0000000000..b5e4ca282c --- /dev/null +++ b/src/lib/onboard/inference-selection-validation.ts @@ -0,0 +1,223 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { getCredential } from "../credentials/store"; + +const { probeAnthropicEndpoint, probeOpenAiLikeEndpoint } = require("../inference/onboard-probes") as { + probeAnthropicEndpoint(endpointUrl: string, model: string, apiKey: string | null | undefined): any; + probeOpenAiLikeEndpoint(endpointUrl: string, model: string, apiKey: string | null | undefined, options?: Record): any; +}; + +import { shouldForceCompletionsApi } from "../validation"; +import { getProbeRecovery } from "../validation-recovery"; + +export type EndpointValidationResult = + | { ok: true; api: string | null; retry?: undefined } + | { ok: false; retry: "credential" | "selection" | "retry" | "model"; api?: undefined }; + +export interface InferenceSelectionValidationDeps { + isNonInteractive(): boolean; + agentProductName(): string; + promptValidationRecovery( + label: string, + recovery: ReturnType, + credentialEnv?: string | null, + helpUrl?: string | null, + ): Promise<"credential" | "selection" | "retry" | "model">; +} + +export interface InferenceSelectionValidationHelpers { + validateOpenAiLikeSelection( + label: string, + endpointUrl: string, + model: string, + credentialEnv?: string | null, + retryMessage?: string, + helpUrl?: string | null, + options?: { + authMode?: "bearer" | "query-param"; + requireResponsesToolCalling?: boolean; + requireChatCompletionsToolCalling?: boolean; + skipResponsesProbe?: boolean; + probeStreaming?: boolean; + }, + ): Promise; + validateAnthropicSelectionWithRetryMessage( + label: string, + endpointUrl: string, + model: string, + credentialEnv: string, + retryMessage?: string, + helpUrl?: string | null, + ): Promise; + validateCustomOpenAiLikeSelection( + label: string, + endpointUrl: string, + model: string, + credentialEnv: string, + helpUrl?: string | null, + ): Promise; + validateCustomAnthropicSelection( + label: string, + endpointUrl: string, + model: string, + credentialEnv: string, + helpUrl?: string | null, + ): Promise; +} + +export function createInferenceSelectionValidationHelpers( + deps: InferenceSelectionValidationDeps, +): InferenceSelectionValidationHelpers { + async function validateOpenAiLikeSelection( + label: string, + endpointUrl: string, + model: string, + credentialEnv: string | null = null, + retryMessage = "Please choose a provider/model again.", + helpUrl: string | null = null, + options: { + authMode?: "bearer" | "query-param"; + requireResponsesToolCalling?: boolean; + requireChatCompletionsToolCalling?: boolean; + skipResponsesProbe?: boolean; + probeStreaming?: boolean; + } = {}, + ): Promise { + const apiKey = credentialEnv ? getCredential(credentialEnv) : ""; + const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options); + if (!probe.ok) { + console.error(` ${label} endpoint validation failed.`); + console.error(` ${probe.message}`); + if (deps.isNonInteractive()) { + process.exit(1); + } + const retry = await deps.promptValidationRecovery( + label, + getProbeRecovery(probe), + credentialEnv, + helpUrl, + ); + if (retry === "selection") { + console.log(` ${retryMessage}`); + console.log(""); + } + return { ok: false, retry }; + } + if (probe.note) { + console.log(` ℹ ${probe.note}`); + } else { + console.log(` ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`); + } + return { ok: true, api: probe.api ?? "openai-completions" }; + } + + async function validateAnthropicSelectionWithRetryMessage( + label: string, + endpointUrl: string, + model: string, + credentialEnv: string, + retryMessage = "Please choose a provider/model again.", + helpUrl: string | null = null, + ): Promise { + const apiKey = getCredential(credentialEnv); + const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); + if (!probe.ok) { + console.error(` ${label} endpoint validation failed.`); + console.error(` ${probe.message}`); + if (deps.isNonInteractive()) { + process.exit(1); + } + const retry = await deps.promptValidationRecovery( + label, + getProbeRecovery(probe), + credentialEnv, + helpUrl, + ); + if (retry === "selection") { + console.log(` ${retryMessage}`); + console.log(""); + } + return { ok: false, retry }; + } + console.log(` ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`); + return { ok: true, api: probe.api }; + } + + async function validateCustomOpenAiLikeSelection( + label: string, + endpointUrl: string, + model: string, + credentialEnv: string, + helpUrl: string | null = null, + ): Promise { + const apiKey = getCredential(credentialEnv); + const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, { + requireResponsesToolCalling: true, + skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API), + probeStreaming: true, + }); + if (probe.ok) { + if (probe.note) { + console.log(` ℹ ${probe.note}`); + } else { + console.log(` ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`); + } + return { ok: true, api: probe.api ?? "openai-completions" }; + } + console.error(` ${label} endpoint validation failed.`); + console.error(` ${probe.message}`); + if (deps.isNonInteractive()) { + process.exit(1); + } + const retry = await deps.promptValidationRecovery( + label, + getProbeRecovery(probe, { allowModelRetry: true }), + credentialEnv, + helpUrl, + ); + if (retry === "selection") { + console.log(" Please choose a provider/model again."); + console.log(""); + } + return { ok: false, retry }; + } + + async function validateCustomAnthropicSelection( + label: string, + endpointUrl: string, + model: string, + credentialEnv: string, + helpUrl: string | null = null, + ): Promise { + const apiKey = getCredential(credentialEnv); + const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); + if (probe.ok) { + console.log(` ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`); + return { ok: true, api: probe.api }; + } + console.error(` ${label} endpoint validation failed.`); + console.error(` ${probe.message}`); + if (deps.isNonInteractive()) { + process.exit(1); + } + const retry = await deps.promptValidationRecovery( + label, + getProbeRecovery(probe, { allowModelRetry: true }), + credentialEnv, + helpUrl, + ); + if (retry === "selection") { + console.log(" Please choose a provider/model again."); + console.log(""); + } + return { ok: false, retry }; + } + + return { + validateOpenAiLikeSelection, + validateAnthropicSelectionWithRetryMessage, + validateCustomOpenAiLikeSelection, + validateCustomAnthropicSelection, + }; +} From a7fe203f8f7f686d9275a4d389affbb69d5fb53a Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Thu, 21 May 2026 19:05:07 -0700 Subject: [PATCH 51/54] refactor(cli): move direct sandbox gpu verifier --- src/lib/onboard.ts | 33 ++++++----------------- src/lib/onboard/sandbox-gpu-preflight.ts | 34 ++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index a6e9cca8b1..21bc0b7411 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -315,11 +315,12 @@ const { createValidationRecoveryPromptHelpers, }: typeof import("./onboard/validation-recovery-prompt") = require("./onboard/validation-recovery-prompt"); const { createOpenshellCliHelpers }: typeof import("./onboard/openshell-cli") = require("./onboard/openshell-cli"); +const sandboxGpuPreflight: typeof import("./onboard/sandbox-gpu-preflight") = require("./onboard/sandbox-gpu-preflight"); const { resolveSandboxGpuFlagFromOptions, sandboxGpuRemediationLines, validateSandboxGpuPreflight, -}: typeof import("./onboard/sandbox-gpu-preflight") = require("./onboard/sandbox-gpu-preflight"); +} = sandboxGpuPreflight; const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version"); const { getBlueprintMaxOpenshellVersion, @@ -807,30 +808,12 @@ type EndpointValidationResult = | { ok: true; api: string | null; retry?: undefined } | { ok: false; retry: "credential" | "selection" | "retry" | "model"; api?: undefined }; -function verifyDirectSandboxGpu(sandboxName: string): void { - console.log(" Verifying direct sandbox GPU access..."); - for (const proof of buildDirectSandboxGpuProofCommands(sandboxName)) { - const result = runOpenshell(proof.args, { - ignoreError: true, - suppressOutput: true, - timeout: 30_000, - }); - if (result.status === 0) { - console.log(` ✓ GPU proof passed: ${proof.label}`); - continue; - } - if (proof.optional === true) return; - const diagnostic = compactText(redact(`${result.stderr || ""} ${result.stdout || ""}`)); - console.error(` ✗ GPU proof failed: ${proof.label}`); - if (diagnostic) console.error(` ${diagnostic.slice(0, 300)}`); - for (const line of sandboxGpuRemediationLines()) { - console.error(` ${line}`); - } - const statusText = String(result.status || 1); - const diagnosticSuffix = diagnostic ? `: ${diagnostic.slice(0, 300)}` : ""; - throw new Error(`GPU proof failed: ${proof.label} (status ${statusText})${diagnosticSuffix}`); - } -} +const verifyDirectSandboxGpu = sandboxGpuPreflight.createDirectSandboxGpuVerifier({ + runOpenshell, + compactText, + redact, +}); + function upsertMessagingProviders(tokenDefs: MessagingTokenDef[]) { const upserted = onboardProviders.upsertMessagingProviders(tokenDefs, runOpenshell); diff --git a/src/lib/onboard/sandbox-gpu-preflight.ts b/src/lib/onboard/sandbox-gpu-preflight.ts index d33a324f3c..ab0ac2e822 100644 --- a/src/lib/onboard/sandbox-gpu-preflight.ts +++ b/src/lib/onboard/sandbox-gpu-preflight.ts @@ -5,6 +5,7 @@ import { findReadableNvidiaCdiSpecFiles, getDockerCdiSpecDirs, } from "./docker-cdi"; +import { buildDirectSandboxGpuProofCommands } from "./initial-policy"; import type { SandboxGpuConfig, SandboxGpuFlag } from "./sandbox-gpu-mode"; export interface SandboxGpuFlagOptions { @@ -45,6 +46,39 @@ export function sandboxGpuRemediationLines(): string[] { ]; } +export interface DirectSandboxGpuVerifierDeps { + runOpenshell(args: string[], opts?: Record): { status?: number | null; stdout?: unknown; stderr?: unknown }; + compactText(value: string): string; + redact(value: unknown): string; +} + +export function createDirectSandboxGpuVerifier(deps: DirectSandboxGpuVerifierDeps) { + return function verifyDirectSandboxGpu(sandboxName: string): void { + console.log(" Verifying direct sandbox GPU access..."); + for (const proof of buildDirectSandboxGpuProofCommands(sandboxName)) { + const result = deps.runOpenshell(proof.args, { + ignoreError: true, + suppressOutput: true, + timeout: 30_000, + }); + if (result.status === 0) { + console.log(` ✓ GPU proof passed: ${proof.label}`); + continue; + } + if (proof.optional === true) return; + const diagnostic = deps.compactText(deps.redact(`${result.stderr || ""} ${result.stdout || ""}`)); + console.error(` ✗ GPU proof failed: ${proof.label}`); + if (diagnostic) console.error(` ${diagnostic.slice(0, 300)}`); + for (const line of sandboxGpuRemediationLines()) { + console.error(` ${line}`); + } + const statusText = String(result.status || 1); + const diagnosticSuffix = diagnostic ? `: ${diagnostic.slice(0, 300)}` : ""; + throw new Error(`GPU proof failed: ${proof.label} (status ${statusText})${diagnosticSuffix}`); + } + }; +} + export function validateSandboxGpuPreflight(config: SandboxGpuConfig): void { if (config.errors.length > 0) { console.error(""); From ab9a8d180fc6ed313939d7330fc49fb81be8b3d4 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Sat, 23 May 2026 00:59:49 -0700 Subject: [PATCH 52/54] fix(ci): address code scanning alerts in onboard refactor --- src/lib/onboard.ts | 17 +++++++++-------- src/lib/security/credential-hash.ts | 3 +++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 045a04d086..f305d0e9a2 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -2428,14 +2428,12 @@ async function startGatewayWithOptions( } // Also purge any known_hosts entries matching the gateway hostname pattern const knownHostsPath = path.join(os.homedir(), ".ssh", "known_hosts"); - if (fs.existsSync(knownHostsPath)) { - try { - const kh = fs.readFileSync(knownHostsPath, "utf8"); - const cleaned = pruneKnownHostsEntries(kh); - if (cleaned !== kh) fs.writeFileSync(knownHostsPath, cleaned); - } catch { - /* best-effort cleanup — ignore read/write errors */ - } + try { + const kh = fs.readFileSync(knownHostsPath, "utf8"); + const cleaned = pruneKnownHostsEntries(kh); + if (cleaned !== kh) fs.writeFileSync(knownHostsPath, cleaned); + } catch { + /* best-effort cleanup — ignore absent/read/write errors */ } const gwArgs = ["--name", GATEWAY_NAME, "--port", getGatewayPortArg()]; @@ -2669,6 +2667,9 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg fs.mkdirSync(stateDir, { recursive: true, mode: 0o700 }); const logPath = path.join(stateDir, "openshell-gateway.log"); + // The gateway state directory is NemoClaw-owned; creating it before opening + // the append-only log is intentional and safe for this local runtime file. + // codeql[js/file-system-race] const outFd = fs.openSync(logPath, "a", 0o600); const errFd = fs.openSync(logPath, "a", 0o600); console.log(" Starting OpenShell Docker-driver gateway..."); diff --git a/src/lib/security/credential-hash.ts b/src/lib/security/credential-hash.ts index 5554591e6f..051dfb6ba3 100644 --- a/src/lib/security/credential-hash.ts +++ b/src/lib/security/credential-hash.ts @@ -6,5 +6,8 @@ import crypto from "node:crypto"; export function hashCredential(value: string | null | undefined): string | null { const normalized = String(value ?? "").trim(); if (!normalized) return null; + // This is a non-secret change detector for credential rotation, not a + // password verifier or credential storage primitive. + // codeql[js/insufficient-password-hash] return crypto.createHash("sha256").update(normalized).digest("hex"); } From bc85d6351c6679369f694a016257d53091ce17cc Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Sat, 23 May 2026 01:05:14 -0700 Subject: [PATCH 53/54] fix(ci): place codeql suppressions on alert lines --- src/lib/onboard.ts | 5 ++--- src/lib/security/credential-hash.ts | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index f305d0e9a2..70e36b13b5 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -2669,9 +2669,8 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg const logPath = path.join(stateDir, "openshell-gateway.log"); // The gateway state directory is NemoClaw-owned; creating it before opening // the append-only log is intentional and safe for this local runtime file. - // codeql[js/file-system-race] - const outFd = fs.openSync(logPath, "a", 0o600); - const errFd = fs.openSync(logPath, "a", 0o600); + const outFd = fs.openSync(logPath, "a", 0o600); // codeql[js/file-system-race] + const errFd = fs.openSync(logPath, "a", 0o600); // codeql[js/file-system-race] console.log(" Starting OpenShell Docker-driver gateway..."); console.log(` Gateway log: ${logPath}`); const launch = gatewayLaunch ?? { diff --git a/src/lib/security/credential-hash.ts b/src/lib/security/credential-hash.ts index 051dfb6ba3..492806eae6 100644 --- a/src/lib/security/credential-hash.ts +++ b/src/lib/security/credential-hash.ts @@ -8,6 +8,5 @@ export function hashCredential(value: string | null | undefined): string | null if (!normalized) return null; // This is a non-secret change detector for credential rotation, not a // password verifier or credential storage primitive. - // codeql[js/insufficient-password-hash] - return crypto.createHash("sha256").update(normalized).digest("hex"); + return crypto.createHash("sha256").update(normalized).digest("hex"); // codeql[js/insufficient-password-hash] } From 80cb1647f657c6e091837018f58343707336e69c Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Sat, 23 May 2026 01:12:18 -0700 Subject: [PATCH 54/54] fix(onboard): preserve Brave credential prompt navigation --- src/lib/onboard/web-search-flow.ts | 32 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/lib/onboard/web-search-flow.ts b/src/lib/onboard/web-search-flow.ts index ab1af78d5f..7700482704 100644 --- a/src/lib/onboard/web-search-flow.ts +++ b/src/lib/onboard/web-search-flow.ts @@ -10,6 +10,7 @@ import { BRAVE_API_KEY_ENV } from "../inference/web-search"; import { ROOT } from "../runner"; import { classifyValidationFailure } from "../validation"; import { getTransportRecoveryMessage } from "../validation-recovery"; +import { BACK_TO_SELECTION, type BackToSelection, isBackToSelection } from "./credential-navigation"; import { exitOnboardFromPrompt, isAffirmativeAnswer } from "./prompt-helpers"; import type { ValidationFailureLike } from "./types"; import { agentSupportsWebSearch } from "./web-search-support"; @@ -28,8 +29,8 @@ export interface WebSearchFlowDeps { export interface WebSearchFlowHelpers { validateBraveSearchApiKey(apiKey: string): CurlProbeResult; promptBraveSearchRecovery(validation: ValidationFailureLike): Promise<"retry" | "skip">; - promptBraveSearchApiKey(): Promise; - ensureValidatedBraveSearchCredential(nonInteractive?: boolean): Promise; + promptBraveSearchApiKey(): Promise; + ensureValidatedBraveSearchCredential(nonInteractive?: boolean): Promise; configureWebSearch( existingConfig?: WebSearchConfig | null, agent?: AgentDefinition | null, @@ -82,15 +83,23 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl return "retry"; } - async function promptBraveSearchApiKey(): Promise { + async function promptBraveSearchApiKey(): Promise { console.log(""); console.log(` Get your Brave Search API key from: ${BRAVE_SEARCH_HELP_URL}`); console.log(""); while (true) { - const key = normalizeCredentialValue( - await deps.prompt(" Brave Search API key: ", { secret: true }), - ); + const value = await deps.prompt(" Brave Search API key: ", { secret: true }); + const intent = normalizeCredentialValue(value).toLowerCase(); + if (intent === "back") return BACK_TO_SELECTION; + if (intent === "exit" || intent === "quit") { + exitOnboardFromPrompt(); + } + if (intent === "?" || intent === "help") { + console.log(" Type back to choose again, or exit to quit."); + continue; + } + const key = normalizeCredentialValue(value); if (!key) { console.error(" Brave Search API key is required."); continue; @@ -101,7 +110,7 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl async function ensureValidatedBraveSearchCredential( nonInteractive = deps.isNonInteractive(), - ): Promise { + ): Promise { const savedApiKey = getCredential(BRAVE_API_KEY_ENV); let apiKey: string | null = savedApiKey || normalizeCredentialValue(process.env[BRAVE_API_KEY_ENV]); @@ -114,7 +123,11 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl "Brave Search requires BRAVE_API_KEY or a saved Brave Search credential in non-interactive mode.", ); } - apiKey = await promptBraveSearchApiKey(); + const promptedApiKey = await promptBraveSearchApiKey(); + if (isBackToSelection(promptedApiKey)) { + return promptedApiKey; + } + apiKey = promptedApiKey; usingSavedKey = false; } @@ -191,6 +204,9 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl } const braveApiKey = await ensureValidatedBraveSearchCredential(); + if (isBackToSelection(braveApiKey)) { + return configureWebSearch(existingConfig, agent, dockerfilePathOverride); + } if (!braveApiKey) { return null; }