From 9ee6cfab9b3fd24ff2ad22c06af41871832cebfb Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 19:09:44 -0700
Subject: [PATCH 01/54] refactor(cli): add onboard FSM transition types

---
 src/lib/onboard/machine/transitions.test.ts | 164 ++++++++++++++++++++
 src/lib/onboard/machine/transitions.ts      | 107 +++++++++++++
 src/lib/onboard/machine/types.ts            | 101 ++++++++++++
 3 files changed, 372 insertions(+)
 create mode 100644 src/lib/onboard/machine/transitions.test.ts
 create mode 100644 src/lib/onboard/machine/transitions.ts
 create mode 100644 src/lib/onboard/machine/types.ts

diff --git a/src/lib/onboard/machine/transitions.test.ts b/src/lib/onboard/machine/transitions.test.ts
new file mode 100644
index 0000000000..875a0ec45a
--- /dev/null
+++ b/src/lib/onboard/machine/transitions.test.ts
@@ -0,0 +1,164 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it } from "vitest";
+
+import {
+  ONBOARD_MACHINE_EVENT_TYPES,
+  ONBOARD_MACHINE_STATES,
+  ONBOARD_NON_TERMINAL_MACHINE_STATES,
+} from "./types";
+import {
+  assertValidOnboardMachineTransition,
+  canTransitionOnboardMachineState,
+  getNextOnboardMachineStates,
+  getOnboardMachineTransition,
+  InvalidOnboardMachineTransitionError,
+  isOnboardMachineState,
+  isTerminalOnboardMachineState,
+  ONBOARD_MACHINE_DIRECT_TRANSITIONS,
+  ONBOARD_MACHINE_NEXT_STATES,
+  ONBOARD_MACHINE_TRANSITIONS,
+} from "./transitions";
+
+const canonicalDirectTransitions = [
+  ["init", "preflight", "advance"],
+  ["preflight", "gateway", "advance"],
+  ["gateway", "provider_selection", "advance"],
+  ["provider_selection", "inference", "advance"],
+  ["inference", "provider_selection", "retry"],
+  ["inference", "sandbox", "advance"],
+  ["sandbox", "openclaw", "branch"],
+  ["sandbox", "agent_setup", "branch"],
+  ["openclaw", "policies", "advance"],
+  ["agent_setup", "policies", "advance"],
+  ["policies", "finalizing", "advance"],
+  ["finalizing", "post_verify", "advance"],
+  ["post_verify", "complete", "advance"],
+] as const;
+
+describe("onboard machine vocabulary", () => {
+  it("defines the initial coarse state vocabulary from issue #3802", () => {
+    expect(ONBOARD_MACHINE_STATES).toEqual([
+      "init",
+      "preflight",
+      "gateway",
+      "provider_selection",
+      "inference",
+      "sandbox",
+      "agent_setup",
+      "openclaw",
+      "policies",
+      "finalizing",
+      "post_verify",
+      "complete",
+      "failed",
+    ]);
+  });
+
+  it("defines the initial observe-only event vocabulary from issue #3802", () => {
+    expect(ONBOARD_MACHINE_EVENT_TYPES).toEqual([
+      "onboard.started",
+      "onboard.resumed",
+      "onboard.completed",
+      "onboard.failed",
+      "state.entered",
+      "state.exited",
+      "state.skipped",
+      "state.completed",
+      "state.failed",
+      "state.repair.started",
+      "state.repair.completed",
+      "state.repair.failed",
+      "context.updated",
+      "resume.conflict",
+      "hook.started",
+      "hook.completed",
+      "hook.failed",
+    ]);
+  });
+
+  it("recognizes valid machine state names", () => {
+    expect(isOnboardMachineState("preflight")).toBe(true);
+    expect(isOnboardMachineState("messaging")).toBe(false);
+    expect(isOnboardMachineState(null)).toBe(false);
+  });
+});
+
+describe("onboard machine transitions", () => {
+  it("encodes the canonical direct transition graph", () => {
+    expect(ONBOARD_MACHINE_DIRECT_TRANSITIONS).toEqual(
+      canonicalDirectTransitions.map(([from, to, kind]) => ({ from, to, kind })),
+    );
+  });
+
+  it("allows every non-terminal state to fail", () => {
+    for (const state of ONBOARD_NON_TERMINAL_MACHINE_STATES) {
+      expect(canTransitionOnboardMachineState(state, "failed")).toBe(true);
+      expect(getOnboardMachineTransition(state, "failed")?.kind).toBe("failure");
+    }
+  });
+
+  it("keeps terminal states terminal", () => {
+    expect(isTerminalOnboardMachineState("complete")).toBe(true);
+    expect(isTerminalOnboardMachineState("failed")).toBe(true);
+    expect(getNextOnboardMachineStates("complete")).toEqual([]);
+    expect(getNextOnboardMachineStates("failed")).toEqual([]);
+    expect(canTransitionOnboardMachineState("complete", "failed")).toBe(false);
+    expect(canTransitionOnboardMachineState("failed", "init")).toBe(false);
+  });
+
+  it("exposes next states in deterministic order", () => {
+    expect(ONBOARD_MACHINE_NEXT_STATES).toEqual({
+      init: ["preflight", "failed"],
+      preflight: ["gateway", "failed"],
+      gateway: ["provider_selection", "failed"],
+      provider_selection: ["inference", "failed"],
+      inference: ["provider_selection", "sandbox", "failed"],
+      sandbox: ["openclaw", "agent_setup", "failed"],
+      agent_setup: ["policies", "failed"],
+      openclaw: ["policies", "failed"],
+      policies: ["finalizing", "failed"],
+      finalizing: ["post_verify", "failed"],
+      post_verify: ["complete", "failed"],
+      complete: [],
+      failed: [],
+    });
+  });
+
+  it("classifies retry and branch transitions", () => {
+    expect(assertValidOnboardMachineTransition("inference", "provider_selection")).toMatchObject({
+      kind: "retry",
+    });
+    expect(assertValidOnboardMachineTransition("sandbox", "openclaw")).toMatchObject({
+      kind: "branch",
+    });
+    expect(assertValidOnboardMachineTransition("sandbox", "agent_setup")).toMatchObject({
+      kind: "branch",
+    });
+  });
+
+  it("rejects transitions outside the graph", () => {
+    expect(() => assertValidOnboardMachineTransition("init", "sandbox")).toThrow(
+      InvalidOnboardMachineTransitionError,
+    );
+    expect(() => assertValidOnboardMachineTransition("complete", "failed")).toThrow(
+      "complete -> failed",
+    );
+  });
+
+  it("keeps the next-state map aligned with the transition list", () => {
+    for (const state of ONBOARD_MACHINE_STATES) {
+      expect(
+        ONBOARD_MACHINE_TRANSITIONS.filter((transition) => transition.from === state).map(
+          (transition) => transition.to,
+        ),
+      ).toEqual(getNextOnboardMachineStates(state));
+    }
+  });
+
+  it("does not contain duplicate transition edges", () => {
+    const edges = ONBOARD_MACHINE_TRANSITIONS.map(({ from, to }) => `${from}->${to}`);
+    expect(new Set(edges).size).toBe(edges.length);
+  });
+});
diff --git a/src/lib/onboard/machine/transitions.ts b/src/lib/onboard/machine/transitions.ts
new file mode 100644
index 0000000000..9f23e3895a
--- /dev/null
+++ b/src/lib/onboard/machine/transitions.ts
@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { OnboardMachineState, OnboardMachineTransition } from "./types";
+import {
+  ONBOARD_MACHINE_STATES,
+  ONBOARD_NON_TERMINAL_MACHINE_STATES,
+  ONBOARD_TERMINAL_MACHINE_STATES,
+} from "./types";
+
+export const ONBOARD_MACHINE_NEXT_STATES = {
+  init: ["preflight", "failed"],
+  preflight: ["gateway", "failed"],
+  gateway: ["provider_selection", "failed"],
+  provider_selection: ["inference", "failed"],
+  inference: ["provider_selection", "sandbox", "failed"],
+  sandbox: ["openclaw", "agent_setup", "failed"],
+  agent_setup: ["policies", "failed"],
+  openclaw: ["policies", "failed"],
+  policies: ["finalizing", "failed"],
+  finalizing: ["post_verify", "failed"],
+  post_verify: ["complete", "failed"],
+  complete: [],
+  failed: [],
+} as const satisfies Readonly<Record<OnboardMachineState, readonly OnboardMachineState[]>>;
+
+export const ONBOARD_MACHINE_DIRECT_TRANSITIONS = [
+  { from: "init", to: "preflight", kind: "advance" },
+  { from: "preflight", to: "gateway", kind: "advance" },
+  { from: "gateway", to: "provider_selection", kind: "advance" },
+  { from: "provider_selection", to: "inference", kind: "advance" },
+  { from: "inference", to: "provider_selection", kind: "retry" },
+  { from: "inference", to: "sandbox", kind: "advance" },
+  { from: "sandbox", to: "openclaw", kind: "branch" },
+  { from: "sandbox", to: "agent_setup", kind: "branch" },
+  { from: "openclaw", to: "policies", kind: "advance" },
+  { from: "agent_setup", to: "policies", kind: "advance" },
+  { from: "policies", to: "finalizing", kind: "advance" },
+  { from: "finalizing", to: "post_verify", kind: "advance" },
+  { from: "post_verify", to: "complete", kind: "advance" },
+] as const satisfies readonly OnboardMachineTransition[];
+
+export const ONBOARD_MACHINE_FAILURE_TRANSITIONS = ONBOARD_NON_TERMINAL_MACHINE_STATES.map(
+  (from) => ({ from, to: "failed" as const, kind: "failure" as const }),
+) satisfies readonly OnboardMachineTransition[];
+
+export const ONBOARD_MACHINE_TRANSITIONS = [
+  ...ONBOARD_MACHINE_DIRECT_TRANSITIONS,
+  ...ONBOARD_MACHINE_FAILURE_TRANSITIONS,
+] as const satisfies readonly OnboardMachineTransition[];
+
+export class InvalidOnboardMachineTransitionError extends Error {
+  readonly from: OnboardMachineState;
+  readonly to: OnboardMachineState;
+
+  constructor(from: OnboardMachineState, to: OnboardMachineState) {
+    super(`Invalid onboarding machine transition: ${from} -> ${to}`);
+    this.name = "InvalidOnboardMachineTransitionError";
+    this.from = from;
+    this.to = to;
+  }
+}
+
+export function isOnboardMachineState(value: unknown): value is OnboardMachineState {
+  return typeof value === "string" && ONBOARD_MACHINE_STATES.includes(value as OnboardMachineState);
+}
+
+export function isTerminalOnboardMachineState(
+  state: OnboardMachineState,
+): state is "complete" | "failed" {
+  return ONBOARD_TERMINAL_MACHINE_STATES.includes(state as "complete" | "failed");
+}
+
+export function getNextOnboardMachineStates(
+  from: OnboardMachineState,
+): readonly OnboardMachineState[] {
+  return ONBOARD_MACHINE_NEXT_STATES[from];
+}
+
+export function canTransitionOnboardMachineState(
+  from: OnboardMachineState,
+  to: OnboardMachineState,
+): boolean {
+  return getNextOnboardMachineStates(from).includes(to);
+}
+
+export function getOnboardMachineTransition(
+  from: OnboardMachineState,
+  to: OnboardMachineState,
+): OnboardMachineTransition | null {
+  return (
+    ONBOARD_MACHINE_TRANSITIONS.find(
+      (transition) => transition.from === from && transition.to === to,
+    ) ?? null
+  );
+}
+
+export function assertValidOnboardMachineTransition(
+  from: OnboardMachineState,
+  to: OnboardMachineState,
+): OnboardMachineTransition {
+  const transition = getOnboardMachineTransition(from, to);
+  if (!transition) {
+    throw new InvalidOnboardMachineTransitionError(from, to);
+  }
+  return transition;
+}
diff --git a/src/lib/onboard/machine/types.ts b/src/lib/onboard/machine/types.ts
new file mode 100644
index 0000000000..bbba7bd5f6
--- /dev/null
+++ b/src/lib/onboard/machine/types.ts
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Coarse onboarding finite-state-machine vocabulary.
+ *
+ * These types intentionally model only major step boundaries. Mid-operation
+ * resume inside gateway startup, sandbox creation, credential upserts, model
+ * probes, or policy application is out of scope for the initial FSM shell.
+ */
+
+export const ONBOARD_MACHINE_STATES = [
+  "init",
+  "preflight",
+  "gateway",
+  "provider_selection",
+  "inference",
+  "sandbox",
+  "agent_setup",
+  "openclaw",
+  "policies",
+  "finalizing",
+  "post_verify",
+  "complete",
+  "failed",
+] as const;
+
+export type OnboardMachineState = (typeof ONBOARD_MACHINE_STATES)[number];
+
+export const ONBOARD_TERMINAL_MACHINE_STATES = ["complete", "failed"] as const;
+
+export type OnboardTerminalMachineState =
+  (typeof ONBOARD_TERMINAL_MACHINE_STATES)[number];
+
+export type OnboardNonTerminalMachineState = Exclude<
+  OnboardMachineState,
+  OnboardTerminalMachineState
+>;
+
+export const ONBOARD_NON_TERMINAL_MACHINE_STATES: readonly OnboardNonTerminalMachineState[] =
+  ONBOARD_MACHINE_STATES.filter(
+    (state): state is OnboardNonTerminalMachineState =>
+      !ONBOARD_TERMINAL_MACHINE_STATES.includes(state as OnboardTerminalMachineState),
+  );
+
+export const ONBOARD_MACHINE_EVENT_TYPES = [
+  "onboard.started",
+  "onboard.resumed",
+  "onboard.completed",
+  "onboard.failed",
+  "state.entered",
+  "state.exited",
+  "state.skipped",
+  "state.completed",
+  "state.failed",
+  "state.repair.started",
+  "state.repair.completed",
+  "state.repair.failed",
+  "context.updated",
+  "resume.conflict",
+  "hook.started",
+  "hook.completed",
+  "hook.failed",
+] as const;
+
+export type OnboardMachineEventType = (typeof ONBOARD_MACHINE_EVENT_TYPES)[number];
+
+export type OnboardMachineTransitionKind =
+  | "advance"
+  | "retry"
+  | "branch"
+  | "failure";
+
+export interface OnboardMachineTransition {
+  from: OnboardMachineState;
+  to: OnboardMachineState;
+  kind: OnboardMachineTransitionKind;
+}
+
+/**
+ * Stable, redacted context keys that machine events may expose.
+ *
+ * Do not add raw secrets or unredacted URLs here. Runtime-derived topology
+ * decisions such as Docker/WSL reachability, Ollama proxy necessity, or live
+ * gateway health should be recomputed during execution rather than stored as
+ * durable FSM context.
+ */
+export interface OnboardMachineContext {
+  agent?: string | null;
+  sandboxName?: string | null;
+  provider?: string | null;
+  model?: string | null;
+  endpointUrl?: string | null;
+  credentialEnv?: string | null;
+  preferredInferenceApi?: string | null;
+  hermesAuthMethod?: "oauth" | "api_key" | null;
+  hermesToolGateways?: string[] | null;
+  policyPresets?: string[] | null;
+  messagingChannels?: string[] | null;
+  gpuPassthrough?: boolean;
+}

From b9e4545e44066975dab7945a93b580b366ec82c2 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 19:27:06 -0700
Subject: [PATCH 02/54] refactor(cli): emit onboard session machine events

---
 src/lib/onboard/machine/events.ts     | 166 ++++++++++++++++++++++++++
 src/lib/state/onboard-session.test.ts |  90 ++++++++++++++
 src/lib/state/onboard-session.ts      |  94 +++++++++++++--
 3 files changed, 343 insertions(+), 7 deletions(-)
 create mode 100644 src/lib/onboard/machine/events.ts

diff --git a/src/lib/onboard/machine/events.ts b/src/lib/onboard/machine/events.ts
new file mode 100644
index 0000000000..9a68d3f899
--- /dev/null
+++ b/src/lib/onboard/machine/events.ts
@@ -0,0 +1,166 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { JsonObject, JsonValue } from "../../core/json-types";
+import { redactSensitiveText, redactUrl } from "../../security/redact";
+import type { HermesAuthMethod, Session } from "../../state/onboard-session";
+import type {
+  OnboardMachineContext,
+  OnboardMachineEventType,
+  OnboardMachineState,
+} from "./types";
+
+export const ONBOARD_SESSION_STEP_TO_MACHINE_STATE = {
+  preflight: "preflight",
+  gateway: "gateway",
+  provider_selection: "provider_selection",
+  inference: "inference",
+  sandbox: "sandbox",
+  agent_setup: "agent_setup",
+  openclaw: "openclaw",
+  policies: "policies",
+} as const satisfies Readonly<Record<string, OnboardMachineState>>;
+
+export type OnboardSessionStepName = keyof typeof ONBOARD_SESSION_STEP_TO_MACHINE_STATE;
+
+export interface OnboardMachineEvent {
+  version: 1;
+  type: OnboardMachineEventType;
+  occurredAt: string;
+  sessionId: string | null;
+  state: OnboardMachineState | null;
+  step: OnboardSessionStepName | null;
+  context: OnboardMachineContext;
+  error: string | null;
+  metadata: JsonObject;
+}
+
+export type OnboardMachineEventListener = (event: OnboardMachineEvent) => void;
+
+const listeners = new Set<OnboardMachineEventListener>();
+
+export function addOnboardMachineEventListener(
+  listener: OnboardMachineEventListener,
+): () => void {
+  listeners.add(listener);
+  return () => {
+    listeners.delete(listener);
+  };
+}
+
+export function clearOnboardMachineEventListeners(): void {
+  listeners.clear();
+}
+
+export function isOnboardSessionStepName(value: string): value is OnboardSessionStepName {
+  return Object.prototype.hasOwnProperty.call(ONBOARD_SESSION_STEP_TO_MACHINE_STATE, value);
+}
+
+export function machineStateFromOnboardSessionStep(
+  stepName: string | null | undefined,
+): OnboardMachineState | null {
+  if (!stepName || !isOnboardSessionStepName(stepName)) return null;
+  return ONBOARD_SESSION_STEP_TO_MACHINE_STATE[stepName];
+}
+
+function nullableString(value: unknown): string | null {
+  return typeof value === "string" ? value : null;
+}
+
+function stringArray(value: unknown): string[] | null {
+  if (!Array.isArray(value)) return null;
+  return value.filter((entry): entry is string => typeof entry === "string");
+}
+
+function hermesAuthMethod(value: unknown): HermesAuthMethod | null {
+  return value === "oauth" || value === "api_key" ? value : null;
+}
+
+function booleanValue(value: unknown): boolean | undefined {
+  return typeof value === "boolean" ? value : undefined;
+}
+
+function sanitizeJsonValue(value: unknown): JsonValue {
+  if (typeof value === "string") return redactUrl(value) ?? redactSensitiveText(value) ?? "";
+  if (typeof value === "number" && Number.isFinite(value)) return value;
+  if (typeof value === "boolean" || value === null) return value;
+  if (Array.isArray(value)) return value.map((entry) => sanitizeJsonValue(entry));
+  if (typeof value !== "object" || value === null) return String(value);
+
+  const result: JsonObject = {};
+  for (const [key, entry] of Object.entries(value)) {
+    result[key] = sanitizeJsonValue(entry);
+  }
+  return result;
+}
+
+export function sanitizeOnboardMachineEventMetadata(
+  metadata: Record<string, unknown> | null | undefined,
+): JsonObject {
+  if (!metadata || typeof metadata !== "object" || Array.isArray(metadata)) return {};
+  const sanitized: JsonObject = {};
+  for (const [key, value] of Object.entries(metadata)) {
+    sanitized[key] = sanitizeJsonValue(value);
+  }
+  return sanitized;
+}
+
+export function buildOnboardMachineContext(session: Session): OnboardMachineContext {
+  const endpointUrl = redactUrl(session.endpointUrl);
+  return {
+    agent: nullableString(session.agent),
+    sandboxName: nullableString(session.sandboxName),
+    provider: nullableString(session.provider),
+    model: nullableString(session.model),
+    endpointUrl,
+    credentialEnv: nullableString(session.credentialEnv),
+    preferredInferenceApi: nullableString(session.preferredInferenceApi),
+    hermesAuthMethod: hermesAuthMethod(session.hermesAuthMethod),
+    hermesToolGateways: stringArray(session.hermesToolGateways),
+    policyPresets: stringArray(session.policyPresets),
+    messagingChannels: stringArray(session.messagingChannels),
+    gpuPassthrough: booleanValue(session.gpuPassthrough),
+  };
+}
+
+export function createOnboardMachineEvent({
+  type,
+  session,
+  step,
+  state,
+  error = null,
+  metadata = {},
+}: {
+  type: OnboardMachineEventType;
+  session: Session;
+  step?: string | null;
+  state?: OnboardMachineState | null;
+  error?: string | null;
+  metadata?: Record<string, unknown> | null;
+}): OnboardMachineEvent {
+  const normalizedStep = step && isOnboardSessionStepName(step) ? step : null;
+  return {
+    version: 1,
+    type,
+    occurredAt: new Date().toISOString(),
+    sessionId: nullableString(session.sessionId),
+    state: state ?? machineStateFromOnboardSessionStep(normalizedStep),
+    step: normalizedStep,
+    context: buildOnboardMachineContext(session),
+    error: redactSensitiveText(error),
+    metadata: sanitizeOnboardMachineEventMetadata(metadata),
+  };
+}
+
+export function emitOnboardMachineEvent(event: OnboardMachineEvent): void {
+  if (listeners.size === 0) return;
+  for (const listener of listeners) {
+    try {
+      listener(event);
+    } catch {
+      // Event observers are diagnostics only. A broken observer must not
+      // change onboarding behavior; hook failure events are introduced by the
+      // later observe-only hook API.
+    }
+  }
+}
diff --git a/src/lib/state/onboard-session.test.ts b/src/lib/state/onboard-session.test.ts
index b2c925858f..5ddd94908d 100644
--- a/src/lib/state/onboard-session.test.ts
+++ b/src/lib/state/onboard-session.test.ts
@@ -9,11 +9,15 @@ import { createRequire } from "node:module";
 
 const require = createRequire(import.meta.url);
 const distPath = require.resolve("../../../dist/lib/state/onboard-session");
+const eventsDistPath = require.resolve("../../../dist/lib/onboard/machine/events");
 const originalHome = process.env.HOME;
 type OnboardSessionModule = typeof import("../../../dist/lib/state/onboard-session");
+type OnboardMachineEventsModule = typeof import("../../../dist/lib/onboard/machine/events");
+type OnboardMachineEvent = import("../../../dist/lib/onboard/machine/events").OnboardMachineEvent;
 type LoadedSession = NonNullable<ReturnType<OnboardSessionModule["loadSession"]>>;
 type DebugSummary = NonNullable<ReturnType<OnboardSessionModule["summarizeForDebug"]>>;
 let session: OnboardSessionModule;
+let machineEvents: OnboardMachineEventsModule;
 let tmpDir: string;
 
 function requireLoadedSession(
@@ -44,13 +48,18 @@ beforeEach(() => {
   tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-session-"));
   process.env.HOME = tmpDir;
   delete require.cache[distPath];
+  delete require.cache[eventsDistPath];
   session = require("../../../dist/lib/state/onboard-session");
+  machineEvents = require("../../../dist/lib/onboard/machine/events");
+  machineEvents.clearOnboardMachineEventListeners();
   session.clearSession();
   session.releaseOnboardLock();
 });
 
 afterEach(() => {
+  machineEvents.clearOnboardMachineEventListeners();
   delete require.cache[distPath];
+  delete require.cache[eventsDistPath];
   fs.rmSync(tmpDir, { recursive: true, force: true });
   if (originalHome === undefined) {
     delete process.env.HOME;
@@ -117,6 +126,87 @@ describe("onboard session", () => {
     expect(loaded.failure.message).toMatch(/Sandbox creation failed/);
   });
 
+  it("emits redacted structured machine events for session step mutations", () => {
+    const emitted: OnboardMachineEvent[] = [];
+    machineEvents.addOnboardMachineEventListener((event) => emitted.push(event));
+
+    session.saveSession(session.createSession({ sessionId: "session-1" }));
+    session.markStepStarted("gateway");
+    session.markStepComplete("gateway", {
+      sandboxName: "my-assistant",
+      endpointUrl:
+        "https://alice:super-secret-token@example.com/v1?token=super-secret-token&keep=yes#token=super-secret-token",
+      credentialEnv: "NVIDIA_API_KEY",
+    });
+    session.markStepSkipped("openclaw");
+    session.markStepFailed("sandbox", "NVIDIA_API_KEY=super-secret-token");
+    session.completeSession({ provider: "ollama-local", credentialEnv: null });
+
+    expect(emitted.map((event) => event.type)).toEqual([
+      "state.entered",
+      "context.updated",
+      "state.completed",
+      "state.skipped",
+      "state.failed",
+      "onboard.failed",
+      "context.updated",
+      "onboard.completed",
+    ]);
+    expect(emitted[0]).toMatchObject({
+      version: 1,
+      sessionId: "session-1",
+      state: "gateway",
+      step: "gateway",
+      error: null,
+    });
+    expect(emitted[1].context).toMatchObject({
+      sandboxName: "my-assistant",
+      credentialEnv: "NVIDIA_API_KEY",
+    });
+    expect(emitted[1].context.endpointUrl).toBe(
+      "https://example.com/v1?token=%3CREDACTED%3E&keep=yes",
+    );
+    expect(emitted[1].metadata.fields).toEqual([
+      "sandboxName",
+      "endpointUrl",
+      "credentialEnv",
+    ]);
+    expect(emitted[4]).toMatchObject({
+      type: "state.failed",
+      state: "sandbox",
+      step: "sandbox",
+      error: "NVIDIA_API_KEY=<REDACTED>",
+    });
+    expect(emitted[5]).toMatchObject({ type: "onboard.failed", state: "failed" });
+    expect(emitted.at(-1)).toMatchObject({ type: "onboard.completed", state: "complete" });
+    expect(JSON.stringify(emitted)).not.toContain("super-secret-token");
+
+    const persisted = JSON.parse(fs.readFileSync(session.SESSION_FILE, "utf8"));
+    expect(persisted.events).toBeUndefined();
+  });
+
+  it("keeps event observer failures from changing session mutation behavior", () => {
+    machineEvents.addOnboardMachineEventListener(() => {
+      throw new Error("observer failed");
+    });
+
+    session.saveSession(session.createSession());
+    expect(() => session.markStepStarted("preflight")).not.toThrow();
+
+    const loaded = requireLoadedSession(session.loadSession());
+    expect(loaded.steps.preflight.status).toBe("in_progress");
+  });
+
+  it("does not emit machine events for unknown session step names", () => {
+    const emitted: OnboardMachineEvent[] = [];
+    machineEvents.addOnboardMachineEventListener((event) => emitted.push(event));
+
+    session.saveSession(session.createSession());
+    session.markStepStarted("not_a_real_step");
+
+    expect(emitted).toEqual([]);
+  });
+
   it("persists safe provider metadata without persisting secrets", () => {
     session.saveSession(session.createSession());
     const unsafeProviderUpdate: Parameters<OnboardSessionModule["markStepComplete"]>[1] & {
diff --git a/src/lib/state/onboard-session.ts b/src/lib/state/onboard-session.ts
index f05c1116e8..7fe94d8096 100644
--- a/src/lib/state/onboard-session.ts
+++ b/src/lib/state/onboard-session.ts
@@ -18,6 +18,10 @@ import {
   sanitizeMessagingChannelConfig,
   type MessagingChannelConfig,
 } from "../messaging-channel-config";
+import {
+  createOnboardMachineEvent,
+  emitOnboardMachineEvent,
+} from "../onboard/machine/events";
 import { redactSensitiveText, redactUrl } from "../security/redact";
 
 export const SESSION_VERSION = 1;
@@ -883,7 +887,8 @@ export function updateSession(mutator: (session: Session) => Session | void): Se
 }
 
 export function markStepStarted(stepName: string): Session {
-  return updateSession((session) => {
+  let shouldEmit = false;
+  const updatedSession = updateSession((session) => {
     const step = session.steps[stepName];
     if (!step) return session;
     step.status = "in_progress";
@@ -893,12 +898,21 @@ export function markStepStarted(stepName: string): Session {
     session.lastStepStarted = stepName;
     session.failure = null;
     session.status = "in_progress";
+    shouldEmit = true;
     return session;
   });
+  if (shouldEmit) {
+    emitOnboardMachineEvent(
+      createOnboardMachineEvent({ type: "state.entered", session: updatedSession, step: stepName }),
+    );
+  }
+  return updatedSession;
 }
 
 export function markStepComplete(stepName: string, updates: SessionUpdates = {}): Session {
-  return updateSession((session) => {
+  const safeUpdates = filterSafeUpdates(updates);
+  let shouldEmit = false;
+  const updatedSession = updateSession((session) => {
     const step = session.steps[stepName];
     if (!step) return session;
     step.status = "complete";
@@ -906,13 +920,31 @@ export function markStepComplete(stepName: string, updates: SessionUpdates = {})
     step.error = null;
     session.lastCompletedStep = stepName;
     session.failure = null;
-    Object.assign(session, filterSafeUpdates(updates));
+    Object.assign(session, safeUpdates);
+    shouldEmit = true;
     return session;
   });
+  if (shouldEmit) {
+    if (Object.keys(safeUpdates).length > 0) {
+      emitOnboardMachineEvent(
+        createOnboardMachineEvent({
+          type: "context.updated",
+          session: updatedSession,
+          step: stepName,
+          metadata: { fields: Object.keys(safeUpdates) },
+        }),
+      );
+    }
+    emitOnboardMachineEvent(
+      createOnboardMachineEvent({ type: "state.completed", session: updatedSession, step: stepName }),
+    );
+  }
+  return updatedSession;
 }
 
 export function markStepSkipped(stepName: string): Session {
-  return updateSession((session) => {
+  let shouldEmit = false;
+  const updatedSession = updateSession((session) => {
     const step = session.steps[stepName];
     if (!step) return session;
     if (step.status === "complete" || step.status === "failed") return session;
@@ -920,12 +952,20 @@ export function markStepSkipped(stepName: string): Session {
     step.startedAt = null;
     step.completedAt = null;
     step.error = null;
+    shouldEmit = true;
     return session;
   });
+  if (shouldEmit) {
+    emitOnboardMachineEvent(
+      createOnboardMachineEvent({ type: "state.skipped", session: updatedSession, step: stepName }),
+    );
+  }
+  return updatedSession;
 }
 
 export function markStepFailed(stepName: string, message: string | null = null): Session {
-  return updateSession((session) => {
+  let shouldEmit = false;
+  const updatedSession = updateSession((session) => {
     const step = session.steps[stepName];
     if (!step) return session;
     step.status = "failed";
@@ -937,18 +977,58 @@ export function markStepFailed(stepName: string, message: string | null = null):
       recordedAt: new Date().toISOString(),
     });
     session.status = "failed";
+    shouldEmit = true;
     return session;
   });
+  if (shouldEmit) {
+    emitOnboardMachineEvent(
+      createOnboardMachineEvent({
+        type: "state.failed",
+        session: updatedSession,
+        step: stepName,
+        error: message,
+      }),
+    );
+    emitOnboardMachineEvent(
+      createOnboardMachineEvent({
+        type: "onboard.failed",
+        session: updatedSession,
+        state: "failed",
+        step: stepName,
+        error: message,
+      }),
+    );
+  }
+  return updatedSession;
 }
 
 export function completeSession(updates: SessionUpdates = {}): Session {
-  return updateSession((session) => {
-    Object.assign(session, filterSafeUpdates(updates));
+  const safeUpdates = filterSafeUpdates(updates);
+  const updatedSession = updateSession((session) => {
+    Object.assign(session, safeUpdates);
     session.status = "complete";
     session.resumable = false;
     session.failure = null;
     return session;
   });
+  if (Object.keys(safeUpdates).length > 0) {
+    emitOnboardMachineEvent(
+      createOnboardMachineEvent({
+        type: "context.updated",
+        session: updatedSession,
+        state: "complete",
+        metadata: { fields: Object.keys(safeUpdates) },
+      }),
+    );
+  }
+  emitOnboardMachineEvent(
+    createOnboardMachineEvent({
+      type: "onboard.completed",
+      session: updatedSession,
+      state: "complete",
+    }),
+  );
+  return updatedSession;
 }
 
 export function summarizeForDebug(

From 651e2a07c3f34bd38cf942d08ad350e5d6b5eb86 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 21:47:57 -0700
Subject: [PATCH 03/54] refactor(cli): persist onboard machine snapshots

---
 src/lib/actions/inference-set.test.ts |   8 +-
 src/lib/state/onboard-session.test.ts | 115 ++++++++++++++++++++
 src/lib/state/onboard-session.ts      | 145 ++++++++++++++++++++++++--
 3 files changed, 259 insertions(+), 9 deletions(-)

diff --git a/src/lib/actions/inference-set.test.ts b/src/lib/actions/inference-set.test.ts
index ae091f7adf..f6c178f0cf 100644
--- a/src/lib/actions/inference-set.test.ts
+++ b/src/lib/actions/inference-set.test.ts
@@ -86,9 +86,15 @@ function baseSession(overrides: Partial<Session> = {}): Session {
     telegramConfig: null,
     wechatConfig: null,
     metadata: { gatewayName: "nemoclaw", fromDockerfile: null },
+    machine: {
+      version: 1,
+      state: "complete",
+      stateEnteredAt: "2026-05-11T00:00:00.000Z",
+      revision: 0,
+    },
     steps: {},
     ...overrides,
-  };
+  } as Session;
 }
 
 function createDeps(options: {
diff --git a/src/lib/state/onboard-session.test.ts b/src/lib/state/onboard-session.test.ts
index 5ddd94908d..8e4b9f5cbc 100644
--- a/src/lib/state/onboard-session.test.ts
+++ b/src/lib/state/onboard-session.test.ts
@@ -40,6 +40,14 @@ function requireDebugSummary(
   return summary;
 }
 
+function normalizeLegacySession(
+  legacy: unknown,
+): ReturnType<OnboardSessionModule["normalizeSession"]> {
+  return session.normalizeSession(
+    legacy as Parameters<OnboardSessionModule["normalizeSession"]>[0],
+  );
+}
+
 beforeEach(() => {
   // Recreate tmpDir per test so lock artifacts (and any other on-disk state)
   // from a previous test cannot leak into this one. Without this, malformed
@@ -80,6 +88,12 @@ describe("onboard session", () => {
     const dirStat = fs.statSync(path.dirname(session.SESSION_FILE));
 
     expect(saved.mode).toBe("non-interactive");
+    expect(saved.machine).toMatchObject({
+      version: 1,
+      state: "init",
+      revision: 0,
+    });
+    expect(saved.machine.stateEnteredAt).toBeTruthy();
     expect(fs.existsSync(session.SESSION_FILE)).toBe(true);
     expect(stat.mode & 0o777).toBe(0o600);
     expect(dirStat.mode & 0o777).toBe(0o700);
@@ -124,6 +138,107 @@ describe("onboard session", () => {
     }
     expect(loaded.failure.step).toBe("sandbox");
     expect(loaded.failure.message).toMatch(/Sandbox creation failed/);
+    expect(loaded.machine.state).toBe("failed");
+  });
+
+  it("persists a compact machine snapshot across step boundaries", () => {
+    session.saveSession(session.createSession());
+    let loaded = requireLoadedSession(session.loadSession());
+    expect(loaded.machine).toMatchObject({ state: "init", revision: 0 });
+
+    session.markStepStarted("preflight");
+    loaded = requireLoadedSession(session.loadSession());
+    expect(loaded.machine).toMatchObject({ state: "preflight", revision: 1 });
+    expect(loaded.machine.stateEnteredAt).toBe(loaded.steps.preflight.startedAt);
+
+    session.markStepComplete("preflight");
+    loaded = requireLoadedSession(session.loadSession());
+    expect(loaded.machine).toMatchObject({ state: "gateway", revision: 2 });
+    expect(loaded.machine.stateEnteredAt).toBe(loaded.steps.preflight.completedAt);
+
+    session.markStepComplete("gateway");
+    loaded = requireLoadedSession(session.loadSession());
+    expect(loaded.machine).toMatchObject({ state: "provider_selection", revision: 3 });
+
+    session.completeSession();
+    loaded = requireLoadedSession(session.loadSession());
+    expect(loaded.machine).toMatchObject({ state: "complete", revision: 4 });
+    expect(requireDebugSummary(session.summarizeForDebug()).machine).toEqual(loaded.machine);
+  });
+
+  it("normalizes old sessions without machine snapshots", () => {
+    type LegacySession = Omit<ReturnType<OnboardSessionModule["createSession"]>, "machine"> & {
+      machine?: unknown;
+    };
+    const legacy = session.createSession({
+      sessionId: "legacy-session",
+      startedAt: "2026-01-01T00:00:00.000Z",
+      updatedAt: "2026-01-01T00:05:00.000Z",
+    }) as unknown as LegacySession;
+    delete legacy.machine;
+    legacy.steps.gateway.status = "in_progress";
+    legacy.steps.gateway.startedAt = "2026-01-01T00:02:00.000Z";
+    legacy.lastStepStarted = "gateway";
+
+    let normalized = requireLoadedSession(normalizeLegacySession(legacy));
+    expect(normalized.machine).toEqual({
+      version: 1,
+      state: "gateway",
+      stateEnteredAt: "2026-01-01T00:02:00.000Z",
+      revision: 0,
+    });
+
+    legacy.steps.gateway.status = "complete";
+    legacy.steps.gateway.completedAt = "2026-01-01T00:03:00.000Z";
+    legacy.lastCompletedStep = "gateway";
+    normalized = requireLoadedSession(normalizeLegacySession(legacy));
+    expect(normalized.machine).toEqual({
+      version: 1,
+      state: "provider_selection",
+      stateEnteredAt: "2026-01-01T00:03:00.000Z",
+      revision: 0,
+    });
+
+    legacy.status = "failed";
+    legacy.failure = {
+      step: "gateway",
+      message: "boom",
+      recordedAt: "2026-01-01T00:04:00.000Z",
+    };
+    normalized = requireLoadedSession(normalizeLegacySession(legacy));
+    expect(normalized.machine).toEqual({
+      version: 1,
+      state: "failed",
+      stateEnteredAt: "2026-01-01T00:04:00.000Z",
+      revision: 0,
+    });
+
+    legacy.status = "complete";
+    normalized = requireLoadedSession(normalizeLegacySession(legacy));
+    expect(normalized.machine.state).toBe("complete");
+  });
+
+  it("normalizes invalid machine snapshots from old sessions", () => {
+    type LegacySession = Omit<ReturnType<OnboardSessionModule["createSession"]>, "machine"> & {
+      machine?: unknown;
+    };
+    const legacy = session.createSession({ lastCompletedStep: "policies" }) as unknown as LegacySession;
+    legacy.steps.policies.status = "complete";
+    legacy.steps.policies.completedAt = "2026-01-01T00:08:00.000Z";
+    legacy.machine = {
+      version: 1,
+      state: "not-a-state",
+      stateEnteredAt: "2026-01-01T00:09:00.000Z",
+      revision: -1,
+    };
+
+    const normalized = requireLoadedSession(normalizeLegacySession(legacy));
+    expect(normalized.machine).toEqual({
+      version: 1,
+      state: "finalizing",
+      stateEnteredAt: "2026-01-01T00:08:00.000Z",
+      revision: 0,
+    });
   });
 
   it("emits redacted structured machine events for session step mutations", () => {
diff --git a/src/lib/state/onboard-session.ts b/src/lib/state/onboard-session.ts
index 7fe94d8096..f739f330d2 100644
--- a/src/lib/state/onboard-session.ts
+++ b/src/lib/state/onboard-session.ts
@@ -21,10 +21,14 @@ import {
 import {
   createOnboardMachineEvent,
   emitOnboardMachineEvent,
+  machineStateFromOnboardSessionStep,
 } from "../onboard/machine/events";
+import { isOnboardMachineState } from "../onboard/machine/transitions";
+import type { OnboardMachineState } from "../onboard/machine/types";
 import { redactSensitiveText, redactUrl } from "../security/redact";
 
 export const SESSION_VERSION = 1;
+export const MACHINE_SNAPSHOT_VERSION = 1;
 export const SESSION_DIR = path.join(process.env.HOME || "/tmp", ".nemoclaw");
 export const SESSION_FILE = path.join(SESSION_DIR, "onboard-session.json");
 export const LOCK_FILE = path.join(SESSION_DIR, "onboard.lock");
@@ -64,6 +68,13 @@ export interface SessionMetadata {
   fromDockerfile: string | null;
 }
 
+export interface OnboardMachineSnapshot {
+  version: typeof MACHINE_SNAPSHOT_VERSION;
+  state: OnboardMachineState;
+  stateEnteredAt: string | null;
+  revision: number;
+}
+
 export interface Session {
   version: number;
   sessionId: string;
@@ -115,6 +126,7 @@ export interface Session {
   telegramConfig: TelegramConfig | null;
   wechatConfig: WechatConfig | null;
   metadata: SessionMetadata;
+  machine: OnboardMachineSnapshot;
   steps: Record<string, StepState>;
 }
 
@@ -198,6 +210,7 @@ export interface DebugSessionSummary {
   lastStepStarted: string | null;
   lastCompletedStep: string | null;
   failure: SessionFailure | null;
+  machine: OnboardMachineSnapshot;
   steps: Record<string, StepState>;
 }
 
@@ -240,6 +253,10 @@ function readPositiveInteger(value: SessionJsonValue | undefined): number | null
   return typeof value === "number" && Number.isInteger(value) && value > 0 ? value : null;
 }
 
+function readNonNegativeInteger(value: SessionJsonValue | undefined): number | null {
+  return typeof value === "number" && Number.isInteger(value) && value >= 0 ? value : null;
+}
+
 function readStringArray(value: SessionJsonValue | undefined): string[] | null {
   if (!Array.isArray(value)) return null;
   return value.filter((entry): entry is string => typeof entry === "string");
@@ -308,6 +325,17 @@ function parseStepState(value: SessionJsonValue | undefined): StepState | null {
   };
 }
 
+function parseMachineSnapshot(value: SessionJsonValue | undefined): OnboardMachineSnapshot | null {
+  if (!isObject(value) || value.version !== MACHINE_SNAPSHOT_VERSION) return null;
+  if (!isOnboardMachineState(value.state)) return null;
+  return {
+    version: MACHINE_SNAPSHOT_VERSION,
+    state: value.state,
+    stateEnteredAt: readString(value.stateEnteredAt),
+    revision: readNonNegativeInteger(value.revision) ?? 0,
+  };
+}
+
 function parseLockInfo(value: SessionJsonValue | undefined): LockInfo | null {
   if (!isObject(value) || typeof value.pid !== "number") return null;
   return {
@@ -335,9 +363,97 @@ export function sanitizeFailure(
 
 // ── Session CRUD ─────────────────────────────────────────────────
 
+function createMachineSnapshot(
+  state: OnboardMachineState,
+  stateEnteredAt: string | null,
+  revision = 0,
+): OnboardMachineSnapshot {
+  return {
+    version: MACHINE_SNAPSHOT_VERSION,
+    state,
+    stateEnteredAt,
+    revision: Math.max(0, Math.trunc(revision)),
+  };
+}
+
+function nextMachineStateAfterCompletedStep(
+  stepName: string | null | undefined,
+  session: Pick<Session, "agent">,
+): OnboardMachineState | null {
+  switch (stepName) {
+    case "preflight":
+      return "gateway";
+    case "gateway":
+      return "provider_selection";
+    case "provider_selection":
+      return "inference";
+    case "inference":
+      return "sandbox";
+    case "sandbox":
+      return session.agent ? "agent_setup" : "openclaw";
+    case "openclaw":
+    case "agent_setup":
+      return "policies";
+    case "policies":
+      return "finalizing";
+    default:
+      return null;
+  }
+}
+
+function inferMachineState(session: Session): OnboardMachineState {
+  if (session.status === "complete") return "complete";
+  if (session.status === "failed") return "failed";
+
+  const startedState = machineStateFromOnboardSessionStep(session.lastStepStarted);
+  const startedStep = session.lastStepStarted ? session.steps[session.lastStepStarted] : null;
+  if (startedState && startedStep?.status === "in_progress") return startedState;
+
+  return nextMachineStateAfterCompletedStep(session.lastCompletedStep, session) ?? "init";
+}
+
+function inferMachineStateEnteredAt(session: Session, state: OnboardMachineState): string | null {
+  if (state === "failed") return session.failure?.recordedAt ?? session.updatedAt;
+  if (state === "complete") return session.updatedAt;
+
+  const startedState = machineStateFromOnboardSessionStep(session.lastStepStarted);
+  const startedStep = session.lastStepStarted ? session.steps[session.lastStepStarted] : null;
+  if (state === startedState && startedStep?.status === "in_progress") {
+    return startedStep.startedAt ?? session.updatedAt;
+  }
+
+  if (nextMachineStateAfterCompletedStep(session.lastCompletedStep, session) === state) {
+    const completedStep = session.lastCompletedStep ? session.steps[session.lastCompletedStep] : null;
+    return completedStep?.completedAt ?? session.updatedAt;
+  }
+
+  return session.startedAt;
+}
+
+function inferMachineSnapshot(session: Session): OnboardMachineSnapshot {
+  const state = inferMachineState(session);
+  return createMachineSnapshot(state, inferMachineStateEnteredAt(session, state));
+}
+
+function transitionMachineSnapshot(session: Session, state: OnboardMachineState, now: string): void {
+  const current = session.machine ?? createMachineSnapshot("init", session.startedAt);
+  if (current.state === state) {
+    session.machine = {
+      ...current,
+      stateEnteredAt: current.stateEnteredAt ?? now,
+    };
+    return;
+  }
+  session.machine = createMachineSnapshot(state, now, current.revision + 1);
+}
+
 export function createSession(overrides: Partial<Session> = {}): Session {
   const now = new Date().toISOString();
-  return {
+  const steps = {
+    ...defaultSteps(),
+    ...(overrides.steps ?? {}),
+  };
+  const session: Session = {
     version: SESSION_VERSION,
     sessionId: overrides.sessionId ?? `${Date.now()}-${randomUUID()}`,
     resumable: true,
@@ -376,11 +492,11 @@ export function createSession(overrides: Partial<Session> = {}): Session {
       gatewayName: overrides.metadata?.gatewayName ?? "nemoclaw",
       fromDockerfile: overrides.metadata?.fromDockerfile ?? null,
     },
-    steps: {
-      ...defaultSteps(),
-      ...(overrides.steps ?? {}),
-    },
+    machine: parseMachineSnapshot(overrides.machine as SessionJsonValue | undefined) ??
+      createMachineSnapshot("init", now),
+    steps,
   };
+  return session;
 }
 
 export function normalizeSession(data: Session | SessionJsonValue | undefined): Session | null {
@@ -429,6 +545,8 @@ export function normalizeSession(data: Session | SessionJsonValue | undefined):
     }
   }
 
+  normalized.machine = parseMachineSnapshot(data.machine) ?? inferMachineSnapshot(normalized);
+
   return normalized;
 }
 
@@ -891,13 +1009,16 @@ export function markStepStarted(stepName: string): Session {
   const updatedSession = updateSession((session) => {
     const step = session.steps[stepName];
     if (!step) return session;
+    const now = new Date().toISOString();
     step.status = "in_progress";
-    step.startedAt = new Date().toISOString();
+    step.startedAt = now;
     step.completedAt = null;
     step.error = null;
     session.lastStepStarted = stepName;
     session.failure = null;
     session.status = "in_progress";
+    const state = machineStateFromOnboardSessionStep(stepName);
+    if (state) transitionMachineSnapshot(session, state, now);
     shouldEmit = true;
     return session;
   });
@@ -915,12 +1036,15 @@ export function markStepComplete(stepName: string, updates: SessionUpdates = {})
   const updatedSession = updateSession((session) => {
     const step = session.steps[stepName];
     if (!step) return session;
+    const now = new Date().toISOString();
     step.status = "complete";
-    step.completedAt = new Date().toISOString();
+    step.completedAt = now;
     step.error = null;
     session.lastCompletedStep = stepName;
     session.failure = null;
     Object.assign(session, safeUpdates);
+    const nextState = nextMachineStateAfterCompletedStep(stepName, session);
+    if (nextState) transitionMachineSnapshot(session, nextState, now);
     shouldEmit = true;
     return session;
   });
@@ -968,15 +1092,17 @@ export function markStepFailed(stepName: string, message: string | null = null):
   const updatedSession = updateSession((session) => {
     const step = session.steps[stepName];
     if (!step) return session;
+    const now = new Date().toISOString();
     step.status = "failed";
     step.completedAt = null;
     step.error = redactSensitiveText(message);
     session.failure = sanitizeFailure({
       step: stepName,
       message,
-      recordedAt: new Date().toISOString(),
+      recordedAt: now,
     });
     session.status = "failed";
+    transitionMachineSnapshot(session, "failed", now);
     shouldEmit = true;
     return session;
   });
@@ -1005,10 +1131,12 @@ export function markStepFailed(stepName: string, message: string | null = null):
 export function completeSession(updates: SessionUpdates = {}): Session {
   const safeUpdates = filterSafeUpdates(updates);
   const updatedSession = updateSession((session) => {
+    const now = new Date().toISOString();
     Object.assign(session, safeUpdates);
     session.status = "complete";
     session.resumable = false;
     session.failure = null;
+    transitionMachineSnapshot(session, "complete", now);
     return session;
   });
   if (Object.keys(safeUpdates).length > 0) {
@@ -1057,6 +1185,7 @@ export function summarizeForDebug(
     lastStepStarted: session.lastStepStarted,
     lastCompletedStep: session.lastCompletedStep,
     failure: sanitizeFailure(session.failure),
+    machine: session.machine,
     steps: Object.fromEntries(
       Object.entries(session.steps).map(([name, step]) => [
         name,

From f756907b5c07a0bb2d09049ab6b4fa7cda681709 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 22:12:25 -0700
Subject: [PATCH 04/54] refactor(cli): add onboard runtime shell

---
 src/lib/onboard/machine/runtime.test.ts | 184 +++++++++++++++++
 src/lib/onboard/machine/runtime.ts      | 263 ++++++++++++++++++++++++
 2 files changed, 447 insertions(+)
 create mode 100644 src/lib/onboard/machine/runtime.test.ts
 create mode 100644 src/lib/onboard/machine/runtime.ts

diff --git a/src/lib/onboard/machine/runtime.test.ts b/src/lib/onboard/machine/runtime.test.ts
new file mode 100644
index 0000000000..becca6028e
--- /dev/null
+++ b/src/lib/onboard/machine/runtime.test.ts
@@ -0,0 +1,184 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it } from "vitest";
+
+import {
+  createSession,
+  filterSafeUpdates,
+  normalizeSession,
+  type Session,
+} from "../../state/onboard-session";
+import type { OnboardMachineEvent } from "./events";
+import { OnboardRuntime, type OnboardRuntimeDeps } from "./runtime";
+import { InvalidOnboardMachineTransitionError } from "./transitions";
+
+function cloneSession(session: Session): Session {
+  return normalizeSession(JSON.parse(JSON.stringify(session))) ?? session;
+}
+
+function createHarness(initialSession: Session | null = createSession()) {
+  let session = initialSession ? cloneSession(initialSession) : null;
+  const events: OnboardMachineEvent[] = [];
+  let tick = 0;
+  const deps: OnboardRuntimeDeps = {
+    loadSession: () => (session ? cloneSession(session) : null),
+    createSession: (overrides) => createSession(overrides),
+    saveSession: (next) => {
+      session = cloneSession(next);
+      return cloneSession(session);
+    },
+    updateSession: (mutator) => {
+      const current = session ? cloneSession(session) : createSession();
+      const next = mutator(current) ?? current;
+      session = cloneSession(next);
+      return cloneSession(session);
+    },
+    filterSafeUpdates,
+    emitEvent: (event) => events.push(event),
+    now: () => `2026-05-19T00:00:${String(tick++).padStart(2, "0")}.000Z`,
+  };
+  return {
+    runtime: new OnboardRuntime(deps),
+    events,
+    getSession: () => {
+      if (!session) throw new Error("Expected runtime session");
+      return cloneSession(session);
+    },
+  };
+}
+
+function sessionInState(state: Session["machine"]["state"]): Session {
+  const session = createSession();
+  session.machine = {
+    version: 1,
+    state,
+    stateEnteredAt: "2026-05-19T00:00:00.000Z",
+    revision: 7,
+  };
+  return session;
+}
+
+describe("OnboardRuntime", () => {
+  it("starts a session and emits started/resumed lifecycle events", async () => {
+    const { runtime, events, getSession } = createHarness(null);
+
+    const started = await runtime.start();
+    expect(started.machine.state).toBe("init");
+    expect(getSession().machine.state).toBe("init");
+    expect(events[0]).toMatchObject({ type: "onboard.started", state: "init" });
+
+    await runtime.start({ resumed: true });
+    expect(events[1]).toMatchObject({ type: "onboard.resumed", state: "init" });
+  });
+
+  it("validates and persists explicit transitions", async () => {
+    const { runtime, events, getSession } = createHarness();
+
+    await runtime.transition("preflight");
+
+    expect(getSession().machine).toEqual({
+      version: 1,
+      state: "preflight",
+      stateEnteredAt: "2026-05-19T00:00:00.000Z",
+      revision: 1,
+    });
+    expect(events.map((event) => event.type)).toEqual(["state.exited", "state.entered"]);
+    expect(events[0]).toMatchObject({ state: "init" });
+    expect(events[1]).toMatchObject({ state: "preflight" });
+
+    await expect(runtime.transition("sandbox")).rejects.toThrow(
+      InvalidOnboardMachineTransitionError,
+    );
+    expect(getSession().machine.state).toBe("preflight");
+  });
+
+  it("applies only safe context updates and emits redacted context events", async () => {
+    const { runtime, events, getSession } = createHarness();
+
+    await runtime.updateContext({
+      provider: "nvidia-prod",
+      endpointUrl: "https://alice:secret@example.com/v1?token=super-secret&keep=yes#token=frag",
+      credentialEnv: "NVIDIA_API_KEY",
+      apiKey: "super-secret",
+    } as Parameters<typeof runtime.updateContext>[0] & { apiKey: string });
+
+    expect(getSession()).toMatchObject({
+      provider: "nvidia-prod",
+      endpointUrl: "https://example.com/v1?token=%3CREDACTED%3E&keep=yes",
+      credentialEnv: "NVIDIA_API_KEY",
+    });
+    expect("apiKey" in getSession()).toBe(false);
+    expect(events).toHaveLength(1);
+    expect(events[0]).toMatchObject({ type: "context.updated", state: "init" });
+    expect(events[0].metadata.fields).toEqual(["provider", "endpointUrl", "credentialEnv"]);
+    expect(JSON.stringify(events)).not.toContain("super-secret");
+  });
+
+  it("fails non-terminal sessions with redacted failure events", async () => {
+    const { runtime, events, getSession } = createHarness(sessionInState("gateway"));
+
+    await runtime.fail("NVIDIA_API_KEY=super-secret", { step: "gateway" });
+
+    expect(getSession()).toMatchObject({
+      status: "failed",
+      failure: { step: "gateway", message: "NVIDIA_API_KEY=<REDACTED>" },
+      machine: { state: "failed", revision: 8 },
+    });
+    expect(events.map((event) => event.type)).toEqual(["state.failed", "onboard.failed"]);
+    expect(events[0]).toMatchObject({ state: "gateway", step: "gateway" });
+    expect(events[1]).toMatchObject({ state: "failed", step: "gateway" });
+    expect(JSON.stringify(events)).not.toContain("super-secret");
+  });
+
+  it("rejects terminal-state failure and invalid completion transitions", async () => {
+    const completeHarness = createHarness(sessionInState("complete"));
+    await expect(completeHarness.runtime.fail("boom")).rejects.toThrow("complete -> failed");
+    expect(completeHarness.getSession().machine.state).toBe("complete");
+
+    const policiesHarness = createHarness(sessionInState("policies"));
+    await expect(policiesHarness.runtime.complete()).rejects.toThrow("policies -> complete");
+    expect(policiesHarness.getSession().machine.state).toBe("policies");
+  });
+
+  it("completes from post_verify and emits completion events", async () => {
+    const { runtime, events, getSession } = createHarness(sessionInState("post_verify"));
+
+    await runtime.complete({ sandboxName: "my-assistant" });
+
+    expect(getSession()).toMatchObject({
+      status: "complete",
+      resumable: false,
+      sandboxName: "my-assistant",
+      machine: { state: "complete", revision: 8 },
+    });
+    expect(events.map((event) => event.type)).toEqual([
+      "context.updated",
+      "state.completed",
+      "state.entered",
+      "onboard.completed",
+    ]);
+  });
+
+  it("emits skipped and repair events without mutating durable state", async () => {
+    const { runtime, events, getSession } = createHarness(sessionInState("provider_selection"));
+
+    await runtime.markSkipped("provider_selection", { reason: "resume" });
+    await runtime.emitRepairEvent("state.repair.started", {
+      state: "provider_selection",
+      metadata: { action: "ollama-systemd" },
+    });
+    await runtime.emitRepairEvent("state.repair.completed", { state: "provider_selection" });
+
+    expect(getSession().machine.state).toBe("provider_selection");
+    expect(events.map((event) => event.type)).toEqual([
+      "state.skipped",
+      "state.repair.started",
+      "state.repair.completed",
+    ]);
+    expect(events[0].metadata.reason).toBe("resume");
+    await expect(runtime.markSkipped("complete")).rejects.toThrow(
+      "Terminal onboarding state cannot be skipped",
+    );
+  });
+});
diff --git a/src/lib/onboard/machine/runtime.ts b/src/lib/onboard/machine/runtime.ts
new file mode 100644
index 0000000000..3e72cd0ccc
--- /dev/null
+++ b/src/lib/onboard/machine/runtime.ts
@@ -0,0 +1,263 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { JsonObject } from "../../core/json-types";
+import * as onboardSession from "../../state/onboard-session";
+import type { Session, SessionUpdates } from "../../state/onboard-session";
+import {
+  createOnboardMachineEvent,
+  emitOnboardMachineEvent,
+  type OnboardMachineEvent,
+} from "./events";
+import {
+  assertValidOnboardMachineTransition,
+  canTransitionOnboardMachineState,
+  isTerminalOnboardMachineState,
+} from "./transitions";
+import type { OnboardMachineEventType, OnboardMachineState } from "./types";
+
+export interface OnboardRuntimeDeps {
+  loadSession(): Session | null;
+  createSession(overrides?: Partial<Session>): Session;
+  saveSession(session: Session): Session;
+  updateSession(mutator: (session: Session) => Session | void): Session;
+  filterSafeUpdates(updates: SessionUpdates): Partial<Session>;
+  emitEvent(event: OnboardMachineEvent): void;
+  now(): string;
+}
+
+export type OnboardRuntimeTransitionOptions = {
+  metadata?: Record<string, unknown> | null;
+};
+
+export type OnboardRuntimeUpdateOptions = {
+  state?: OnboardMachineState | null;
+  metadata?: Record<string, unknown> | null;
+};
+
+export type OnboardRuntimeFailureOptions = {
+  step?: string | null;
+  metadata?: Record<string, unknown> | null;
+};
+
+function defaultDeps(): OnboardRuntimeDeps {
+  return {
+    loadSession: onboardSession.loadSession,
+    createSession: onboardSession.createSession,
+    saveSession: onboardSession.saveSession,
+    updateSession: onboardSession.updateSession,
+    filterSafeUpdates: onboardSession.filterSafeUpdates,
+    emitEvent: emitOnboardMachineEvent,
+    now: () => new Date().toISOString(),
+  };
+}
+
+function eventMetadata(metadata: Record<string, unknown> | null | undefined): JsonObject {
+  return metadata && typeof metadata === "object" && !Array.isArray(metadata)
+    ? (metadata as JsonObject)
+    : {};
+}
+
+function snapshotFor(
+  state: OnboardMachineState,
+  stateEnteredAt: string | null,
+  revision: number,
+): onboardSession.OnboardMachineSnapshot {
+  return {
+    version: onboardSession.MACHINE_SNAPSHOT_VERSION,
+    state,
+    stateEnteredAt,
+    revision: Math.max(0, Math.trunc(revision)),
+  };
+}
+
+export class OnboardRuntime {
+  private readonly deps: OnboardRuntimeDeps;
+
+  constructor(deps: Partial<OnboardRuntimeDeps> = {}) {
+    this.deps = { ...defaultDeps(), ...deps };
+  }
+
+  async session(): Promise<Session> {
+    return this.ensureSession();
+  }
+
+  async start(options: { resumed?: boolean; metadata?: Record<string, unknown> | null } = {}): Promise<Session> {
+    const session = this.ensureSession();
+    this.emit(options.resumed === true ? "onboard.resumed" : "onboard.started", session, {
+      state: session.machine.state,
+      metadata: options.metadata,
+    });
+    return session;
+  }
+
+  async transition(
+    to: OnboardMachineState,
+    options: OnboardRuntimeTransitionOptions = {},
+  ): Promise<Session> {
+    const current = this.ensureSession();
+    const from = current.machine.state;
+    assertValidOnboardMachineTransition(from, to);
+
+    const enteredAt = this.deps.now();
+    const updated = this.deps.updateSession((session) => {
+      session.machine = snapshotFor(to, enteredAt, session.machine.revision + 1);
+      if (to === "failed") {
+        session.status = "failed";
+      } else if (to === "complete") {
+        session.status = "complete";
+        session.resumable = false;
+        session.failure = null;
+      } else if (session.status !== "failed") {
+        session.status = "in_progress";
+      }
+      return session;
+    });
+
+    this.emit("state.exited", updated, { state: from, metadata: options.metadata });
+    this.emit("state.entered", updated, { state: to, metadata: options.metadata });
+    return updated;
+  }
+
+  async updateContext(
+    updates: SessionUpdates,
+    options: OnboardRuntimeUpdateOptions = {},
+  ): Promise<Session> {
+    const safeUpdates = this.deps.filterSafeUpdates(updates);
+    const fields = Object.keys(safeUpdates);
+    const updated = this.deps.updateSession((session) => {
+      Object.assign(session, safeUpdates);
+      return session;
+    });
+    if (fields.length > 0) {
+      this.emit("context.updated", updated, {
+        state: options.state ?? updated.machine.state,
+        metadata: { ...eventMetadata(options.metadata), fields },
+      });
+    }
+    return updated;
+  }
+
+  async complete(updates: SessionUpdates = {}): Promise<Session> {
+    const current = this.ensureSession();
+    const from = current.machine.state;
+    assertValidOnboardMachineTransition(from, "complete");
+
+    const safeUpdates = this.deps.filterSafeUpdates(updates);
+    const fields = Object.keys(safeUpdates);
+    const enteredAt = this.deps.now();
+    const updated = this.deps.updateSession((session) => {
+      Object.assign(session, safeUpdates);
+      session.status = "complete";
+      session.resumable = false;
+      session.failure = null;
+      session.machine = snapshotFor("complete", enteredAt, session.machine.revision + 1);
+      return session;
+    });
+
+    if (fields.length > 0) {
+      this.emit("context.updated", updated, {
+        state: "complete",
+        metadata: { fields },
+      });
+    }
+    this.emit("state.completed", updated, { state: from });
+    this.emit("state.entered", updated, { state: "complete" });
+    this.emit("onboard.completed", updated, { state: "complete" });
+    return updated;
+  }
+
+  async fail(message: string | null, options: OnboardRuntimeFailureOptions = {}): Promise<Session> {
+    const current = this.ensureSession();
+    const from = current.machine.state;
+    if (!canTransitionOnboardMachineState(from, "failed")) {
+      assertValidOnboardMachineTransition(from, "failed");
+    }
+
+    const recordedAt = this.deps.now();
+    const updated = this.deps.updateSession((session) => {
+      session.status = "failed";
+      session.failure = onboardSession.sanitizeFailure({
+        step: options.step ?? null,
+        message,
+        recordedAt,
+      });
+      session.machine = snapshotFor("failed", recordedAt, session.machine.revision + 1);
+      return session;
+    });
+
+    this.emit("state.failed", updated, {
+      state: from,
+      step: options.step,
+      error: message,
+      metadata: options.metadata,
+    });
+    this.emit("onboard.failed", updated, {
+      state: "failed",
+      step: options.step,
+      error: message,
+      metadata: options.metadata,
+    });
+    return updated;
+  }
+
+  async markSkipped(
+    state: OnboardMachineState,
+    metadata: Record<string, unknown> | null = null,
+  ): Promise<Session> {
+    const session = this.ensureSession();
+    if (isTerminalOnboardMachineState(state)) {
+      throw new Error(`Terminal onboarding state cannot be skipped: ${state}`);
+    }
+    this.emit("state.skipped", session, { state, metadata });
+    return session;
+  }
+
+  async emitRepairEvent(
+    type: Extract<
+      OnboardMachineEventType,
+      "state.repair.started" | "state.repair.completed" | "state.repair.failed"
+    >,
+    options: {
+      state?: OnboardMachineState | null;
+      error?: string | null;
+      metadata?: Record<string, unknown> | null;
+    } = {},
+  ): Promise<Session> {
+    const session = this.ensureSession();
+    this.emit(type, session, {
+      state: options.state ?? session.machine.state,
+      error: options.error ?? null,
+      metadata: options.metadata,
+    });
+    return session;
+  }
+
+  private ensureSession(): Session {
+    const existing = this.deps.loadSession();
+    if (existing) return existing;
+    return this.deps.saveSession(this.deps.createSession());
+  }
+
+  private emit(
+    type: OnboardMachineEventType,
+    session: Session,
+    options: {
+      state?: OnboardMachineState | null;
+      step?: string | null;
+      error?: string | null;
+      metadata?: Record<string, unknown> | null;
+    } = {},
+  ): void {
+    this.deps.emitEvent(
+      createOnboardMachineEvent({
+        type,
+        session,
+        state: options.state ?? session.machine.state,
+        step: options.step ?? null,
+        error: options.error ?? null,
+        metadata: options.metadata,
+      }),
+    );
+  }
+}

From 702454b2d9c3547a95a4c51f4f4ec9a6c5780ca0 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 22:26:42 -0700
Subject: [PATCH 05/54] refactor(cli): route onboard step boundaries through
 runtime

---
 src/lib/agent/onboard.ts                |  4 +-
 src/lib/onboard.ts                      | 87 +++++++++++++++----------
 src/lib/onboard/machine/runtime.test.ts | 56 ++++++++++++++--
 src/lib/onboard/machine/runtime.ts      | 30 +++++++++
 4 files changed, 135 insertions(+), 42 deletions(-)

diff --git a/src/lib/agent/onboard.ts b/src/lib/agent/onboard.ts
index 2446108910..f08c32b9c6 100644
--- a/src/lib/agent/onboard.ts
+++ b/src/lib/agent/onboard.ts
@@ -31,7 +31,7 @@ export interface OnboardContext {
   buildSandboxConfigSyncScript: (config: LooseObject) => string;
   writeSandboxConfigSyncFile: (script: string) => string;
   cleanupTempDir: (file: string, prefix: string) => void;
-  startRecordedStep: (stepName: string, updates: LooseObject) => void;
+  startRecordedStep: (stepName: string, updates: LooseObject) => Promise<void>;
   skippedStepMessage: (stepName: string, sandboxName: string) => void;
 }
 
@@ -424,7 +424,7 @@ export async function handleAgentSetup(
     }
   }
 
-  startRecordedStep("agent_setup", { sandboxName, provider, model });
+  await startRecordedStep("agent_setup", { sandboxName, provider, model });
   step(7, 8, `Setting up ${agent.displayName} inside sandbox`);
 
   const binaryAvailability = verifyAgentBinaryAvailable(sandboxName, agent, runCaptureOpenshell);
diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index bc231df3a5..470639b346 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -279,6 +279,7 @@ const { resolveSandboxImageTagFromCreateOutput } =
   require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag");
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
+const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
 const policies: typeof import("./policy") = require("./policy");
 const tiers: typeof import("./policy/tiers") = require("./policy/tiers");
 const { ensureUsageNoticeConsent } = require("./onboard/usage-notice");
@@ -409,6 +410,7 @@ const USE_COLOR = !process.env.NO_COLOR && !!process.stdout.isTTY;
 const DIM = USE_COLOR ? "\x1b[2m" : "";
 const RESET = USE_COLOR ? "\x1b[0m" : "";
 let OPENSHELL_BIN: string | null = null;
+let ONBOARD_RUNTIME: import("./onboard/machine/runtime").OnboardRuntime | null = null;
 const GATEWAY_NAME = "nemoclaw";
 const BACK_TO_SELECTION = "__NEMOCLAW_BACK_TO_SELECTION__";
 type HermesAuthMethod = "oauth" | "api_key";
@@ -9017,7 +9019,12 @@ function toSessionUpdates(
   return normalized;
 }
 
-function startRecordedStep(
+function getOnboardRuntime(): import("./onboard/machine/runtime").OnboardRuntime {
+  if (!ONBOARD_RUNTIME) ONBOARD_RUNTIME = new OnboardRuntime();
+  return ONBOARD_RUNTIME;
+}
+
+async function startRecordedStep(
   stepName: string,
   updates: {
     sandboxName?: string | null;
@@ -9025,20 +9032,30 @@ function startRecordedStep(
     model?: string | null;
     policyPresets?: string[] | null;
   } = {},
-): void {
-  onboardSession.markStepStarted(stepName);
+): Promise<void> {
+  const runtime = getOnboardRuntime();
+  await runtime.markStepStarted(stepName);
   if (Object.keys(updates).length > 0) {
-    onboardSession.updateSession((session: Session) => {
-      if (updates.sandboxName !== undefined) session.sandboxName = updates.sandboxName;
-      if (updates.provider !== undefined) session.provider = updates.provider;
-      if (updates.model !== undefined) session.model = updates.model;
-      if (updates.policyPresets !== undefined) session.policyPresets = updates.policyPresets;
-      return session;
-    });
+    await runtime.updateContext(toSessionUpdates(updates));
   }
   maybeForceE2eStepFailure(stepName);
 }
 
+async function recordStepComplete(
+  stepName: string,
+  updates: SessionUpdates = {},
+): Promise<Session> {
+  return getOnboardRuntime().markStepComplete(stepName, updates);
+}
+
+async function recordStepSkipped(stepName: string): Promise<Session> {
+  return getOnboardRuntime().markStepSkipped(stepName);
+}
+
+async function recordSessionComplete(updates: SessionUpdates = {}): Promise<Session> {
+  return getOnboardRuntime().completeSession(updates);
+}
+
 const ONBOARD_STEP_INDEX: Record<string, { number: number; title: string }> = {
   preflight: { number: 1, title: "Preflight checks" },
   gateway: { number: 2, title: "Starting OpenShell gateway" },
@@ -9074,6 +9091,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
   RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1";
   AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1";
   _preflightDashboardPort = opts.controlUiPort || null;
+  ONBOARD_RUNTIME = new OnboardRuntime();
   delete process.env.OPENSHELL_GATEWAY;
   const resume = opts.resume === true;
   const fresh = opts.fresh === true;
@@ -9422,9 +9440,9 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         }),
       );
     } else {
-      startRecordedStep("preflight");
+      await startRecordedStep("preflight");
       gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true });
-      onboardSession.markStepComplete("preflight");
+      await recordStepComplete("preflight");
     }
     const sandboxGpuConfig = resolveSandboxGpuConfig(gpu, {
       flag: effectiveSandboxGpuFlag,
@@ -9560,11 +9578,11 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
       resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway;
     if (resumeGateway) {
       skippedStepMessage("gateway", "running");
-      onboardSession.markStepComplete("gateway");
+      await recordStepComplete("gateway");
     } else if (!resume && canReuseHealthyGateway) {
       skippedStepMessage("gateway", "running", "reuse");
       note("  Reusing healthy NemoClaw gateway.");
-      onboardSession.markStepComplete("gateway");
+      await recordStepComplete("gateway");
     } else {
       if (resume && session?.steps?.gateway?.status === "complete") {
         if (gatewayReuseState === "active-unnamed") {
@@ -9582,9 +9600,9 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         retireLegacyGatewayForDockerDriverUpgrade();
         gatewayReuseState = "missing";
       }
-      startRecordedStep("gateway");
+      await startRecordedStep("gateway");
       await startGateway(gpu, { gpuPassthrough });
-      onboardSession.markStepComplete("gateway");
+      await recordStepComplete("gateway");
     }
 
     // #2753: prefer requestedSandboxName over an unconfirmed session name.
@@ -9635,7 +9653,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         // below). A SIGINT between any earlier step and createSandbox would
         // otherwise leave a phantom that `nemoclaw list` resurrects until
         // manually destroyed.
-        startRecordedStep("provider_selection");
+        await startRecordedStep("provider_selection");
         const selection = await setupNim(gpu, sandboxName, agent);
         model = selection.model;
         provider = selection.provider;
@@ -9645,7 +9663,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         hermesToolGateways = selection.hermesToolGateways;
         preferredInferenceApi = selection.preferredInferenceApi;
         nimContainer = selection.nimContainer;
-        onboardSession.markStepComplete(
+        await recordStepComplete(
           "provider_selection",
           toSessionUpdates({
             provider,
@@ -9678,7 +9696,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
           if (!sandboxName) {
             sandboxName = await promptValidatedSandboxName(agent);
           }
-          startRecordedStep("inference", { provider, model });
+          await startRecordedStep("inference", { provider, model });
           const inferenceResult = await setupInference(
             sandboxName,
             model,
@@ -9692,7 +9710,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
             forceProviderSelection = true;
             continue;
           }
-          onboardSession.markStepComplete(
+          await recordStepComplete(
             "inference",
             toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
           );
@@ -9712,7 +9730,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         if (nimContainer && sandboxName) {
           registry.updateSandbox(sandboxName, { nimContainer });
         }
-        onboardSession.markStepComplete(
+        await recordStepComplete(
           "inference",
           toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
         );
@@ -9751,7 +9769,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         }
       }
 
-      startRecordedStep("inference", { provider, model });
+      await startRecordedStep("inference", { provider, model });
       const inferenceResult = await setupInference(
         sandboxName,
         model,
@@ -9769,7 +9787,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
       if (nimContainer && sandboxName) {
         registry.updateSandbox(sandboxName, { nimContainer });
       }
-      onboardSession.markStepComplete(
+      await recordStepComplete(
         "inference",
         toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
       );
@@ -9906,7 +9924,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
       } else {
         nextWebSearchConfig = await configureWebSearch(null, agent, webSearchSupportProbePath);
       }
-      startRecordedStep("sandbox", { provider, model });
+      await startRecordedStep("sandbox", { provider, model });
       const recordedMessagingChannels = getRecordedMessagingChannelsForResume(resume, session, sandboxName);
       if (recordedMessagingChannels) {
         selectedMessagingChannels = recordedMessagingChannels;
@@ -9960,7 +9978,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         ...getSandboxAgentRegistryFields(agent, !fromDockerfile),
       });
       registry.setDefault(sandboxName);
-      onboardSession.markStepComplete(
+      await recordStepComplete(
         "sandbox",
         toSessionUpdates({
           sandboxName,
@@ -9996,24 +10014,24 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         skippedStepMessage,
       });
       ensureAgentDashboardForward(sandboxName, agent);
-      onboardSession.markStepSkipped("openclaw");
+      await recordStepSkipped("openclaw");
     } else {
       const resumeOpenclaw = resume && sandboxName && isOpenclawReady(sandboxName);
       if (resumeOpenclaw) {
         skippedStepMessage("openclaw", sandboxName);
-        onboardSession.markStepComplete(
+        await recordStepComplete(
           "openclaw",
           toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
         );
       } else {
-        startRecordedStep("openclaw", { sandboxName, provider, model });
+        await startRecordedStep("openclaw", { sandboxName, provider, model });
         await setupOpenclaw(sandboxName, model, provider);
-        onboardSession.markStepComplete(
+        await recordStepComplete(
           "openclaw",
           toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
         );
       }
-      onboardSession.markStepSkipped("agent_setup");
+      await recordStepSkipped("agent_setup");
     }
 
     const latestSession = onboardSession.loadSession();
@@ -10066,7 +10084,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
       arePolicyPresetsApplied(sandboxName, recordedPolicyPresetsForSupport);
     if (resumePolicies) {
       skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", "));
-      onboardSession.markStepComplete(
+      await recordStepComplete(
         "policies",
         toSessionUpdates({
           sandboxName,
@@ -10076,7 +10094,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         }),
       );
     } else {
-      startRecordedStep("policies", {
+      await startRecordedStep("policies", {
         sandboxName,
         provider,
         model,
@@ -10102,7 +10120,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
           });
         },
       });
-      onboardSession.markStepComplete(
+      await recordStepComplete(
         "policies",
         toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }),
       );
@@ -10112,7 +10130,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
       ensureAgentDashboardForward(sandboxName, agent);
     }
 
-    onboardSession.completeSession(
+    await recordSessionComplete(
       toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
     );
     completed = true;
@@ -10192,6 +10210,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     printDashboard(sandboxName, model, provider, nimContainer, agent);
   } finally {
     releaseOnboardLock();
+    ONBOARD_RUNTIME = null;
   }
 }
 
diff --git a/src/lib/onboard/machine/runtime.test.ts b/src/lib/onboard/machine/runtime.test.ts
index becca6028e..7b26269541 100644
--- a/src/lib/onboard/machine/runtime.test.ts
+++ b/src/lib/onboard/machine/runtime.test.ts
@@ -7,7 +7,9 @@ import {
   createSession,
   filterSafeUpdates,
   normalizeSession,
+  sanitizeFailure,
   type Session,
+  type SessionUpdates,
 } from "../../state/onboard-session";
 import type { OnboardMachineEvent } from "./events";
 import { OnboardRuntime, type OnboardRuntimeDeps } from "./runtime";
@@ -21,6 +23,12 @@ function createHarness(initialSession: Session | null = createSession()) {
   let session = initialSession ? cloneSession(initialSession) : null;
   const events: OnboardMachineEvent[] = [];
   let tick = 0;
+  const updateSession = (mutator: (value: Session) => Session | void): Session => {
+    const current = session ? cloneSession(session) : createSession();
+    const next = mutator(current) ?? current;
+    session = cloneSession(next);
+    return cloneSession(session);
+  };
   const deps: OnboardRuntimeDeps = {
     loadSession: () => (session ? cloneSession(session) : null),
     createSession: (overrides) => createSession(overrides),
@@ -28,12 +36,48 @@ function createHarness(initialSession: Session | null = createSession()) {
       session = cloneSession(next);
       return cloneSession(session);
     },
-    updateSession: (mutator) => {
-      const current = session ? cloneSession(session) : createSession();
-      const next = mutator(current) ?? current;
-      session = cloneSession(next);
-      return cloneSession(session);
-    },
+    updateSession,
+    markStepStarted: (stepName) =>
+      updateSession((current) => {
+        const step = current.steps[stepName];
+        if (!step) return current;
+        step.status = "in_progress";
+        current.lastStepStarted = stepName;
+        current.status = "in_progress";
+        return current;
+      }),
+    markStepComplete: (stepName, updates: SessionUpdates = {}) =>
+      updateSession((current) => {
+        const step = current.steps[stepName];
+        if (!step) return current;
+        step.status = "complete";
+        current.lastCompletedStep = stepName;
+        Object.assign(current, filterSafeUpdates(updates));
+        return current;
+      }),
+    markStepSkipped: (stepName) =>
+      updateSession((current) => {
+        const step = current.steps[stepName];
+        if (!step) return current;
+        step.status = "skipped";
+        return current;
+      }),
+    markStepFailed: (stepName, message) =>
+      updateSession((current) => {
+        const step = current.steps[stepName];
+        if (!step) return current;
+        step.status = "failed";
+        current.status = "failed";
+        current.failure = sanitizeFailure({ step: stepName, message, recordedAt: "now" });
+        return current;
+      }),
+    completeSession: (updates: SessionUpdates = {}) =>
+      updateSession((current) => {
+        Object.assign(current, filterSafeUpdates(updates));
+        current.status = "complete";
+        current.resumable = false;
+        return current;
+      }),
     filterSafeUpdates,
     emitEvent: (event) => events.push(event),
     now: () => `2026-05-19T00:00:${String(tick++).padStart(2, "0")}.000Z`,
diff --git a/src/lib/onboard/machine/runtime.ts b/src/lib/onboard/machine/runtime.ts
index 3e72cd0ccc..2e5d584f3b 100644
--- a/src/lib/onboard/machine/runtime.ts
+++ b/src/lib/onboard/machine/runtime.ts
@@ -21,6 +21,11 @@ export interface OnboardRuntimeDeps {
   createSession(overrides?: Partial<Session>): Session;
   saveSession(session: Session): Session;
   updateSession(mutator: (session: Session) => Session | void): Session;
+  markStepStarted(stepName: string): Session;
+  markStepComplete(stepName: string, updates?: SessionUpdates): Session;
+  markStepSkipped(stepName: string): Session;
+  markStepFailed(stepName: string, message?: string | null): Session;
+  completeSession(updates?: SessionUpdates): Session;
   filterSafeUpdates(updates: SessionUpdates): Partial<Session>;
   emitEvent(event: OnboardMachineEvent): void;
   now(): string;
@@ -46,6 +51,11 @@ function defaultDeps(): OnboardRuntimeDeps {
     createSession: onboardSession.createSession,
     saveSession: onboardSession.saveSession,
     updateSession: onboardSession.updateSession,
+    markStepStarted: onboardSession.markStepStarted,
+    markStepComplete: onboardSession.markStepComplete,
+    markStepSkipped: onboardSession.markStepSkipped,
+    markStepFailed: onboardSession.markStepFailed,
+    completeSession: onboardSession.completeSession,
     filterSafeUpdates: onboardSession.filterSafeUpdates,
     emitEvent: emitOnboardMachineEvent,
     now: () => new Date().toISOString(),
@@ -91,6 +101,26 @@ export class OnboardRuntime {
     return session;
   }
 
+  async markStepStarted(stepName: string): Promise<Session> {
+    return this.deps.markStepStarted(stepName);
+  }
+
+  async markStepComplete(stepName: string, updates: SessionUpdates = {}): Promise<Session> {
+    return this.deps.markStepComplete(stepName, updates);
+  }
+
+  async markStepSkipped(stepName: string): Promise<Session> {
+    return this.deps.markStepSkipped(stepName);
+  }
+
+  async markStepFailed(stepName: string, message: string | null = null): Promise<Session> {
+    return this.deps.markStepFailed(stepName, message);
+  }
+
+  async completeSession(updates: SessionUpdates = {}): Promise<Session> {
+    return this.deps.completeSession(updates);
+  }
+
   async transition(
     to: OnboardMachineState,
     options: OnboardRuntimeTransitionOptions = {},

From 60acb65261157d19741963f604f148474da04218 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 22:35:53 -0700
Subject: [PATCH 06/54] refactor(cli): add observe-only onboard hooks

---
 src/lib/onboard/machine/hooks.test.ts | 150 ++++++++++++++++++++++++++
 src/lib/onboard/machine/hooks.ts      | 132 +++++++++++++++++++++++
 2 files changed, 282 insertions(+)
 create mode 100644 src/lib/onboard/machine/hooks.test.ts
 create mode 100644 src/lib/onboard/machine/hooks.ts

diff --git a/src/lib/onboard/machine/hooks.test.ts b/src/lib/onboard/machine/hooks.test.ts
new file mode 100644
index 0000000000..ec0fe0fcc7
--- /dev/null
+++ b/src/lib/onboard/machine/hooks.test.ts
@@ -0,0 +1,150 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { afterEach, describe, expect, it } from "vitest";
+
+import { createSession } from "../../state/onboard-session";
+import {
+  clearOnboardMachineEventListeners,
+  createOnboardMachineEvent,
+  emitOnboardMachineEvent,
+  type OnboardMachineEvent,
+} from "./events";
+import { createJsonlOnboardHook, OnboardHookDispatcher, registerOnboardHooks } from "./hooks";
+
+function sampleEvent(): OnboardMachineEvent {
+  const session = createSession({
+    sessionId: "session-1",
+    provider: "nvidia-prod",
+    endpointUrl: "https://example.com/v1?token=secret&keep=yes",
+  });
+  return createOnboardMachineEvent({
+    type: "state.entered",
+    session,
+    state: "gateway",
+    step: "gateway",
+  });
+}
+
+afterEach(() => {
+  clearOnboardMachineEventListeners();
+});
+
+describe("onboard machine hooks", () => {
+  it("dispatches observe-only events and emits hook lifecycle events", async () => {
+    const observed: string[] = [];
+    const lifecycle: OnboardMachineEvent[] = [];
+    const dispatcher = new OnboardHookDispatcher(
+      [
+        {
+          name: "observer",
+          onEvent(event) {
+            observed.push(event.type);
+          },
+        },
+      ],
+      {
+        emitEvent: (event) => lifecycle.push(event),
+        now: () => "2026-05-19T01:00:00.000Z",
+      },
+    );
+
+    await dispatcher.dispatch(sampleEvent());
+
+    expect(observed).toEqual(["state.entered"]);
+    expect(lifecycle.map((event) => event.type)).toEqual(["hook.started", "hook.completed"]);
+    expect(lifecycle[0]).toMatchObject({
+      sessionId: "session-1",
+      state: "gateway",
+      step: "gateway",
+      metadata: { hook: "observer", sourceType: "state.entered" },
+    });
+  });
+
+  it("warns and emits hook.failed without throwing when a hook fails", async () => {
+    const warnings: string[] = [];
+    const lifecycle: OnboardMachineEvent[] = [];
+    const dispatcher = new OnboardHookDispatcher(
+      [
+        {
+          name: "bad-hook",
+          async onEvent() {
+            throw new Error("Bearer super-secret-token");
+          },
+        },
+      ],
+      {
+        warn: (message) => warnings.push(message),
+        emitEvent: (event) => lifecycle.push(event),
+        now: () => "2026-05-19T01:00:00.000Z",
+      },
+    );
+
+    await expect(dispatcher.dispatch(sampleEvent())).resolves.toBeUndefined();
+
+    expect(lifecycle.map((event) => event.type)).toEqual(["hook.started", "hook.failed"]);
+    expect(lifecycle[1]).toMatchObject({
+      type: "hook.failed",
+      error: "Bearer <REDACTED>",
+      metadata: { hook: "bad-hook", sourceType: "state.entered" },
+    });
+    expect(warnings).toEqual(["Onboard hook 'bad-hook' failed: Bearer <REDACTED>"]);
+    expect(JSON.stringify(lifecycle)).not.toContain("super-secret-token");
+    expect(warnings.join("\n")).not.toContain("super-secret-token");
+  });
+
+  it("writes JSONL hook events to an external sink", async () => {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-hooks-"));
+    try {
+      const filePath = path.join(tmpDir, "events.jsonl");
+      const hook = createJsonlOnboardHook(filePath);
+
+      await hook.onEvent?.(sampleEvent());
+      await hook.onEvent?.(
+        createOnboardMachineEvent({
+          type: "state.completed",
+          session: createSession({ sessionId: "session-1" }),
+          state: "gateway",
+          step: "gateway",
+        }),
+      );
+
+      const lines = fs
+        .readFileSync(filePath, "utf8")
+        .trim()
+        .split("\n")
+        .map((line) => JSON.parse(line));
+      expect(lines.map((event) => event.type)).toEqual(["state.entered", "state.completed"]);
+      expect(lines[0].context.endpointUrl).toBe(
+        "https://example.com/v1?token=%3CREDACTED%3E&keep=yes",
+      );
+    } finally {
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    }
+  });
+
+  it("registers hooks on the machine event bus without redispatching hook lifecycle events", async () => {
+    const observed: string[] = [];
+    const unregister = registerOnboardHooks([
+      {
+        name: "bus-observer",
+        onEvent(event) {
+          observed.push(event.type);
+        },
+      },
+    ]);
+
+    emitOnboardMachineEvent(sampleEvent());
+    await Promise.resolve();
+    emitOnboardMachineEvent({ ...sampleEvent(), type: "hook.failed" });
+    await Promise.resolve();
+    unregister();
+    emitOnboardMachineEvent({ ...sampleEvent(), type: "state.completed" });
+    await Promise.resolve();
+
+    expect(observed).toEqual(["state.entered"]);
+  });
+});
diff --git a/src/lib/onboard/machine/hooks.ts b/src/lib/onboard/machine/hooks.ts
new file mode 100644
index 0000000000..1dfcd7544a
--- /dev/null
+++ b/src/lib/onboard/machine/hooks.ts
@@ -0,0 +1,132 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import fs from "node:fs";
+import path from "node:path";
+
+import { redactSensitiveText } from "../../security/redact";
+import {
+  addOnboardMachineEventListener,
+  emitOnboardMachineEvent,
+  sanitizeOnboardMachineEventMetadata,
+  type OnboardMachineEvent,
+  type OnboardMachineEventListener,
+} from "./events";
+
+export interface OnboardHook {
+  name?: string;
+  onEvent?(event: OnboardMachineEvent): Promise<void> | void;
+}
+
+export interface OnboardHookDispatchOptions {
+  warn?: (message: string) => void;
+  emitEvent?: (event: OnboardMachineEvent) => void;
+  now?: () => string;
+}
+
+export interface OnboardHookRegistrationOptions extends OnboardHookDispatchOptions {
+  includeHookEvents?: boolean;
+}
+
+function hookName(hook: OnboardHook, index: number): string {
+  const name = typeof hook.name === "string" ? hook.name.trim() : "";
+  return name || `hook-${index + 1}`;
+}
+
+function hookLifecycleEvent(
+  source: OnboardMachineEvent,
+  type: "hook.started" | "hook.completed" | "hook.failed",
+  hook: OnboardHook,
+  index: number,
+  options: {
+    occurredAt: string;
+    error?: unknown;
+    metadata?: Record<string, unknown>;
+  },
+): OnboardMachineEvent {
+  return {
+    version: 1,
+    type,
+    occurredAt: options.occurredAt,
+    sessionId: source.sessionId,
+    state: source.state,
+    step: source.step,
+    context: source.context,
+    error: redactSensitiveText(options.error instanceof Error ? options.error.message : options.error),
+    metadata: sanitizeOnboardMachineEventMetadata({
+      hook: hookName(hook, index),
+      sourceType: source.type,
+      ...options.metadata,
+    }),
+  };
+}
+
+function isHookLifecycleEvent(event: OnboardMachineEvent): boolean {
+  return event.type === "hook.started" || event.type === "hook.completed" || event.type === "hook.failed";
+}
+
+export class OnboardHookDispatcher {
+  private readonly hooks: readonly OnboardHook[];
+  private readonly warn: (message: string) => void;
+  private readonly emitEvent: (event: OnboardMachineEvent) => void;
+  private readonly now: () => string;
+
+  constructor(hooks: readonly OnboardHook[], options: OnboardHookDispatchOptions = {}) {
+    this.hooks = hooks;
+    this.warn = options.warn ?? ((message) => console.warn(message));
+    this.emitEvent = options.emitEvent ?? emitOnboardMachineEvent;
+    this.now = options.now ?? (() => new Date().toISOString());
+  }
+
+  async dispatch(event: OnboardMachineEvent): Promise<void> {
+    for (const [index, hook] of this.hooks.entries()) {
+      if (typeof hook.onEvent !== "function") continue;
+      this.emitEvent(
+        hookLifecycleEvent(event, "hook.started", hook, index, {
+          occurredAt: this.now(),
+        }),
+      );
+      try {
+        await hook.onEvent(event);
+        this.emitEvent(
+          hookLifecycleEvent(event, "hook.completed", hook, index, {
+            occurredAt: this.now(),
+          }),
+        );
+      } catch (error) {
+        const name = hookName(hook, index);
+        const message = error instanceof Error ? error.message : String(error);
+        this.warn(`Onboard hook '${name}' failed: ${redactSensitiveText(message) ?? "<redacted>"}`);
+        this.emitEvent(
+          hookLifecycleEvent(event, "hook.failed", hook, index, {
+            occurredAt: this.now(),
+            error: message,
+          }),
+        );
+      }
+    }
+  }
+}
+
+export function registerOnboardHooks(
+  hooks: readonly OnboardHook[],
+  options: OnboardHookRegistrationOptions = {},
+): () => void {
+  const dispatcher = new OnboardHookDispatcher(hooks, options);
+  const listener: OnboardMachineEventListener = (event) => {
+    if (options.includeHookEvents !== true && isHookLifecycleEvent(event)) return;
+    void dispatcher.dispatch(event);
+  };
+  return addOnboardMachineEventListener(listener);
+}
+
+export function createJsonlOnboardHook(filePath: string): OnboardHook {
+  const resolvedPath = path.resolve(filePath);
+  return {
+    name: "jsonl",
+    onEvent(event) {
+      fs.mkdirSync(path.dirname(resolvedPath), { recursive: true, mode: 0o700 });
+      fs.appendFileSync(resolvedPath, `${JSON.stringify(event)}\n`, { mode: 0o600 });
+    },
+  };
+}

From c2a58e6053babf96f876e0bddbd44a1f865a9340 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 22:45:51 -0700
Subject: [PATCH 07/54] refactor(cli): extract onboard preflight handler

---
 src/lib/onboard.ts                            |  87 ++++-----
 .../machine/handlers/preflight.test.ts        | 183 ++++++++++++++++++
 src/lib/onboard/machine/handlers/preflight.ts | 147 ++++++++++++++
 3 files changed, 363 insertions(+), 54 deletions(-)
 create mode 100644 src/lib/onboard/machine/handlers/preflight.test.ts
 create mode 100644 src/lib/onboard/machine/handlers/preflight.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 470639b346..50c5187326 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -280,6 +280,7 @@ const { resolveSandboxImageTagFromCreateOutput } =
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
 const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
+const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
 const policies: typeof import("./policy") = require("./policy");
 const tiers: typeof import("./policy/tiers") = require("./policy/tiers");
 const { ensureUsageNoticeConsent } = require("./onboard/usage-notice");
@@ -9403,54 +9404,39 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     console.log("  ===================");
 
     const explicitSandboxGpuFlag = resolveSandboxGpuFlagFromOptions(opts);
-    const resumePreflight = resume && session?.steps?.preflight?.status === "complete";
-    const resumeHasResolvedGpuIntent =
-      resumePreflight &&
-      explicitSandboxGpuFlag === null &&
-      opts.sandboxGpuDevice == null &&
-      process.env.NEMOCLAW_SANDBOX_GPU === undefined &&
-      process.env.NEMOCLAW_SANDBOX_GPU_DEVICE === undefined;
-    const resumedSandboxGpuOverrides = resumeHasResolvedGpuIntent
-      ? getResumeSandboxGpuOverrides(
-          resumeSandboxNameForGpu ? registry.getSandbox(resumeSandboxNameForGpu) : null,
-          session?.gpuPassthrough,
-        )
-      : { flag: null, device: null };
-    const effectiveSandboxGpuFlag = explicitSandboxGpuFlag ?? resumedSandboxGpuOverrides.flag;
-    const effectiveSandboxGpuDevice = opts.sandboxGpuDevice ?? resumedSandboxGpuOverrides.device;
-    let gpu;
-    if (resumePreflight) {
-      skippedStepMessage("preflight", "cached");
-      gpu = nim.detectGpu();
-      // Re-check the CDI spec gap on resume (#3152). The cached preflight
-      // result does not capture host CDI state, and the original onboard
-      // attempt that wrote the cache likely aborted at gateway-start with
-      // exactly this CDI failure — so resuming without re-checking would
-      // walk into the same wall. Honour persisted `gpuPassthrough: false`
-      // from the prior session as an opt-out, since the resume invocation
-      // does not need to re-pass `--no-gpu` to keep that intent (the same
-      // resolution is replayed a few lines below for `gpuPassthrough`).
-      const resumeOptedOutGpuPassthrough =
-        opts.noGpu === true || (opts.gpu !== true && session?.gpuPassthrough === false);
-      assertCdiNvidiaGpuSpecPresent(assessHost(), resumeOptedOutGpuPassthrough);
-      validateSandboxGpuPreflight(
-        resolveSandboxGpuConfig(gpu, {
-          flag: effectiveSandboxGpuFlag,
-          device: effectiveSandboxGpuDevice,
-        }),
-      );
-    } else {
-      await startRecordedStep("preflight");
-      gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true });
-      await recordStepComplete("preflight");
-    }
-    const sandboxGpuConfig = resolveSandboxGpuConfig(gpu, {
-      flag: effectiveSandboxGpuFlag,
-      device: effectiveSandboxGpuDevice,
+    const preflightResult = await handlePreflightState({
+      resume,
+      session,
+      recordedSandboxName,
+      requestedSandboxName,
+      explicitSandboxGpuFlag,
+      sandboxGpuDevice: opts.sandboxGpuDevice ?? null,
+      gpuRequested: opts.gpu === true,
+      noGpu: opts.noGpu === true,
+      env: process.env,
+      deps: {
+        getSandbox: registry.getSandbox.bind(registry),
+        getResumeSandboxGpuOverrides,
+        detectGpu: nim.detectGpu,
+        runPreflight: (preflightOptions) => preflight({ ...opts, ...preflightOptions }),
+        assessHost,
+        assertCdiNvidiaGpuSpecPresent,
+        resolveSandboxGpuConfig,
+        validateSandboxGpuPreflight,
+        skippedStepMessage,
+        startRecordedStep,
+        recordStepComplete,
+        updateSession: onboardSession.updateSession,
+      },
     });
-
-    const requestedGpuPassthrough = opts.gpu === true;
-    const gpuPassthrough = sandboxGpuConfig.sandboxGpuEnabled;
+    session = preflightResult.session;
+    const {
+      sandboxGpuConfig,
+      resumeHasResolvedGpuIntent,
+      requestedGpuPassthrough,
+      gpuPassthrough,
+    } = preflightResult;
+    const gpu = preflightResult.gpu ?? null;
     if (gpuPassthrough) {
       note(
         resumeHasResolvedGpuIntent && session?.gpuPassthrough === true
@@ -9472,13 +9458,6 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         /* lspci not available — skip hint */
       }
     }
-    // Persist GPU intent in the session so resume can restore it.
-    if (session && session.gpuPassthrough !== gpuPassthrough) {
-      session = onboardSession.updateSession((current: Session) => {
-        current.gpuPassthrough = gpuPassthrough;
-        return current;
-      });
-    }
     dockerGpuLocalInference.configureLocalInferenceForDockerGpuHostNetwork(sandboxGpuConfig, {
       dockerDriverGateway: isLinuxDockerDriverGatewayEnabled(),
       note,
diff --git a/src/lib/onboard/machine/handlers/preflight.test.ts b/src/lib/onboard/machine/handlers/preflight.test.ts
new file mode 100644
index 0000000000..fa4b859915
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/preflight.test.ts
@@ -0,0 +1,183 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import { createSession, type Session } from "../../../state/onboard-session";
+import { handlePreflightState, type PreflightStateOptions } from "./preflight";
+
+type Gpu = { type: string } | null;
+type SandboxEntry = { sandboxGpuEnabled?: boolean };
+type Host = { cdiNvidiaGpuSpecMissing?: boolean };
+
+function createDeps(overrides: Partial<PreflightStateOptions<Gpu, SandboxEntry, Host, { sandboxGpuEnabled: boolean; mode: string; sandboxGpuDevice?: string | null }>["deps"]> = {}) {
+  let session = createSession();
+  return {
+    calls: {
+      start: vi.fn(),
+      complete: vi.fn(),
+      skipped: vi.fn(),
+      detectGpu: vi.fn(() => ({ type: "nvidia" }) as Gpu),
+      runPreflight: vi.fn(async () => ({ type: "nvidia" }) as Gpu),
+      validate: vi.fn(),
+      cdi: vi.fn(),
+      updateSession: vi.fn(),
+      getSandbox: vi.fn(() => ({ sandboxGpuEnabled: true })),
+      getOverrides: vi.fn(() => ({ flag: "enable" as const, device: "0" })),
+    },
+    deps: {
+      getSandbox: (name: string) => {
+        const value = ({ sandboxGpuEnabled: true } satisfies SandboxEntry);
+        return overrides.getSandbox ? overrides.getSandbox(name) : value;
+      },
+      getResumeSandboxGpuOverrides: (
+        sandbox: SandboxEntry | null,
+        sessionGpuPassthrough: boolean | null | undefined,
+      ) => {
+        if (overrides.getResumeSandboxGpuOverrides) {
+          return overrides.getResumeSandboxGpuOverrides(sandbox, sessionGpuPassthrough);
+        }
+        return { flag: "enable" as const, device: "0" };
+      },
+      detectGpu: () => ({ type: "nvidia" }) as Gpu,
+      runPreflight: async () => ({ type: "nvidia" }) as Gpu,
+      assessHost: () => ({ cdiNvidiaGpuSpecMissing: false }),
+      assertCdiNvidiaGpuSpecPresent: vi.fn(),
+      resolveSandboxGpuConfig: (_gpu: Gpu, opts: { flag: "enable" | "disable" | null; device: string | null | undefined }) => ({
+        sandboxGpuEnabled: opts.flag === "enable",
+        mode: opts.flag === "enable" ? "1" : "0",
+        sandboxGpuDevice: opts.device,
+      }),
+      validateSandboxGpuPreflight: vi.fn(),
+      skippedStepMessage: vi.fn(),
+      startRecordedStep: vi.fn(async () => undefined),
+      recordStepComplete: vi.fn(async () => session),
+      updateSession: vi.fn((mutator: (value: Session) => Session | void) => {
+        session = mutator(session) ?? session;
+        return session;
+      }),
+      ...overrides,
+    },
+    getSession: () => session,
+  };
+}
+
+function baseOptions(
+  deps: PreflightStateOptions<Gpu, SandboxEntry, Host, { sandboxGpuEnabled: boolean; mode: string; sandboxGpuDevice?: string | null }>["deps"],
+  session: Session | null = createSession(),
+): PreflightStateOptions<Gpu, SandboxEntry, Host, { sandboxGpuEnabled: boolean; mode: string; sandboxGpuDevice?: string | null }> {
+  return {
+    resume: false,
+    session,
+    recordedSandboxName: null,
+    requestedSandboxName: "my-assistant",
+    explicitSandboxGpuFlag: null,
+    sandboxGpuDevice: null,
+    gpuRequested: false,
+    noGpu: false,
+    env: {},
+    deps,
+  };
+}
+
+describe("handlePreflightState", () => {
+  it("runs full preflight through recorded step boundaries", async () => {
+    const harness = createDeps({
+      startRecordedStep: vi.fn(async () => undefined),
+      runPreflight: vi.fn(async () => ({ type: "nvidia" }) as Gpu),
+      recordStepComplete: vi.fn(async () => createSession()),
+    });
+
+    const result = await handlePreflightState({
+      ...baseOptions(harness.deps),
+      explicitSandboxGpuFlag: "enable",
+      sandboxGpuDevice: "GPU-0",
+    });
+
+    expect(harness.deps.startRecordedStep).toHaveBeenCalledWith("preflight");
+    expect(harness.deps.runPreflight).toHaveBeenCalledWith({ optedOutGpuPassthrough: false });
+    expect(harness.deps.recordStepComplete).toHaveBeenCalledWith("preflight");
+    expect(result.sandboxGpuConfig).toMatchObject({
+      sandboxGpuEnabled: true,
+      mode: "1",
+      sandboxGpuDevice: "GPU-0",
+    });
+    expect(result.gpuPassthrough).toBe(true);
+  });
+
+  it("skips full preflight on resume but re-detects GPU and revalidates CDI/sandbox GPU", async () => {
+    const session = createSession();
+    session.steps.preflight.status = "complete";
+    session.gpuPassthrough = false;
+    const harness = createDeps({
+      detectGpu: vi.fn(() => ({ type: "nvidia" }) as Gpu),
+      assertCdiNvidiaGpuSpecPresent: vi.fn(),
+      validateSandboxGpuPreflight: vi.fn(),
+      skippedStepMessage: vi.fn(),
+      startRecordedStep: vi.fn(async () => undefined),
+      runPreflight: vi.fn(async () => ({ type: "should-not-run" }) as Gpu),
+    });
+
+    const result = await handlePreflightState({
+      ...baseOptions(harness.deps, session),
+      resume: true,
+      gpuRequested: false,
+    });
+
+    expect(harness.deps.skippedStepMessage).toHaveBeenCalledWith("preflight", "cached");
+    expect(harness.deps.detectGpu).toHaveBeenCalledOnce();
+    expect(harness.deps.runPreflight).not.toHaveBeenCalled();
+    expect(harness.deps.startRecordedStep).not.toHaveBeenCalled();
+    expect(harness.deps.assertCdiNvidiaGpuSpecPresent).toHaveBeenCalledWith(
+      { cdiNvidiaGpuSpecMissing: false },
+      true,
+    );
+    expect(harness.deps.validateSandboxGpuPreflight).toHaveBeenCalledOnce();
+    expect(result.resumePreflight).toBe(true);
+  });
+
+  it("restores saved sandbox GPU intent only when resume has no explicit override", async () => {
+    const session = createSession();
+    session.steps.preflight.status = "complete";
+    session.gpuPassthrough = true;
+    const getResumeSandboxGpuOverrides = vi.fn(() => ({ flag: "enable" as const, device: "1" }));
+    const getSandbox = vi.fn(() => ({ sandboxGpuEnabled: true }));
+    const harness = createDeps({ getResumeSandboxGpuOverrides, getSandbox });
+
+    const result = await handlePreflightState({
+      ...baseOptions(harness.deps, session),
+      resume: true,
+      recordedSandboxName: "saved",
+    });
+
+    expect(getSandbox).toHaveBeenCalledWith("saved");
+    expect(getResumeSandboxGpuOverrides).toHaveBeenCalledWith(
+      { sandboxGpuEnabled: true },
+      true,
+    );
+    expect(result.resumeHasResolvedGpuIntent).toBe(true);
+    expect(result.effectiveSandboxGpuFlag).toBe("enable");
+    expect(result.effectiveSandboxGpuDevice).toBe("1");
+
+    await handlePreflightState({
+      ...baseOptions(harness.deps, session),
+      resume: true,
+      explicitSandboxGpuFlag: "disable",
+    });
+    expect(getResumeSandboxGpuOverrides).toHaveBeenCalledTimes(1);
+  });
+
+  it("persists effective GPU passthrough intent for later resume", async () => {
+    const session = createSession();
+    session.gpuPassthrough = false;
+    const harness = createDeps();
+
+    const result = await handlePreflightState({
+      ...baseOptions(harness.deps, session),
+      explicitSandboxGpuFlag: "enable",
+    });
+
+    expect(result.session?.gpuPassthrough).toBe(true);
+    expect(harness.deps.updateSession).toHaveBeenCalledOnce();
+  });
+});
diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts
new file mode 100644
index 0000000000..cc5bd6633d
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/preflight.ts
@@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session } from "../../../state/onboard-session";
+
+export type PreflightSandboxGpuFlag = "enable" | "disable" | null;
+
+export interface PreflightSandboxGpuOverrides {
+  flag: PreflightSandboxGpuFlag;
+  device: string | null;
+}
+
+export interface PreflightSandboxGpuConfig {
+  sandboxGpuEnabled: boolean;
+  mode: string;
+  sandboxGpuDevice?: string | null;
+  errors?: readonly string[];
+}
+
+export interface PreflightStateOptions<
+  Gpu,
+  SandboxEntry,
+  Host,
+  Config extends PreflightSandboxGpuConfig,
+> {
+  resume: boolean;
+  session: Session | null;
+  recordedSandboxName: string | null;
+  requestedSandboxName: string | null;
+  explicitSandboxGpuFlag: PreflightSandboxGpuFlag;
+  sandboxGpuDevice?: string | null;
+  gpuRequested: boolean;
+  noGpu: boolean;
+  env: NodeJS.ProcessEnv;
+  deps: {
+    getSandbox(name: string): SandboxEntry | null;
+    getResumeSandboxGpuOverrides(
+      sandbox: SandboxEntry | null,
+      sessionGpuPassthrough: boolean | null | undefined,
+    ): PreflightSandboxGpuOverrides;
+    detectGpu(): Gpu;
+    runPreflight(options: { optedOutGpuPassthrough?: boolean }): Promise<Gpu>;
+    assessHost(): Host;
+    assertCdiNvidiaGpuSpecPresent(host: Host, optedOutGpuPassthrough: boolean): void;
+    resolveSandboxGpuConfig(
+      gpu: Gpu,
+      options: { flag: PreflightSandboxGpuFlag; device: string | null | undefined },
+    ): Config;
+    validateSandboxGpuPreflight(config: Config): void;
+    skippedStepMessage(stepName: string, detail?: string | null): void;
+    startRecordedStep(stepName: string): Promise<void>;
+    recordStepComplete(stepName: string): Promise<Session>;
+    updateSession(mutator: (session: Session) => Session | void): Session;
+  };
+}
+
+export interface PreflightStateResult<Gpu, Config extends PreflightSandboxGpuConfig> {
+  gpu: Gpu;
+  sandboxGpuConfig: Config;
+  resumePreflight: boolean;
+  resumeHasResolvedGpuIntent: boolean;
+  requestedGpuPassthrough: boolean;
+  gpuPassthrough: boolean;
+  effectiveSandboxGpuFlag: PreflightSandboxGpuFlag;
+  effectiveSandboxGpuDevice: string | null | undefined;
+  session: Session | null;
+}
+
+function envHasSandboxGpuOverride(env: NodeJS.ProcessEnv): boolean {
+  return env.NEMOCLAW_SANDBOX_GPU !== undefined || env.NEMOCLAW_SANDBOX_GPU_DEVICE !== undefined;
+}
+
+export async function handlePreflightState<
+  Gpu,
+  SandboxEntry,
+  Host,
+  Config extends PreflightSandboxGpuConfig,
+>({
+  resume,
+  session,
+  recordedSandboxName,
+  requestedSandboxName,
+  explicitSandboxGpuFlag,
+  sandboxGpuDevice,
+  gpuRequested,
+  noGpu,
+  env,
+  deps,
+}: PreflightStateOptions<Gpu, SandboxEntry, Host, Config>): Promise<PreflightStateResult<Gpu, Config>> {
+  const resumeSandboxNameForGpu = recordedSandboxName || requestedSandboxName || null;
+  const resumePreflight = resume && session?.steps?.preflight?.status === "complete";
+  const resumeHasResolvedGpuIntent =
+    resumePreflight &&
+    explicitSandboxGpuFlag === null &&
+    sandboxGpuDevice == null &&
+    !envHasSandboxGpuOverride(env);
+  const resumedSandboxGpuOverrides = resumeHasResolvedGpuIntent
+    ? deps.getResumeSandboxGpuOverrides(
+        resumeSandboxNameForGpu ? deps.getSandbox(resumeSandboxNameForGpu) : null,
+        session?.gpuPassthrough,
+      )
+    : { flag: null, device: null };
+  const effectiveSandboxGpuFlag = explicitSandboxGpuFlag ?? resumedSandboxGpuOverrides.flag;
+  const effectiveSandboxGpuDevice = sandboxGpuDevice ?? resumedSandboxGpuOverrides.device;
+
+  let gpu: Gpu;
+  if (resumePreflight) {
+    deps.skippedStepMessage("preflight", "cached");
+    gpu = deps.detectGpu();
+    const resumeOptedOutGpuPassthrough = noGpu || (!gpuRequested && session?.gpuPassthrough === false);
+    deps.assertCdiNvidiaGpuSpecPresent(deps.assessHost(), resumeOptedOutGpuPassthrough);
+    deps.validateSandboxGpuPreflight(
+      deps.resolveSandboxGpuConfig(gpu, {
+        flag: effectiveSandboxGpuFlag,
+        device: effectiveSandboxGpuDevice,
+      }),
+    );
+  } else {
+    await deps.startRecordedStep("preflight");
+    gpu = await deps.runPreflight({ optedOutGpuPassthrough: noGpu });
+    session = await deps.recordStepComplete("preflight");
+  }
+
+  const sandboxGpuConfig = deps.resolveSandboxGpuConfig(gpu, {
+    flag: effectiveSandboxGpuFlag,
+    device: effectiveSandboxGpuDevice,
+  });
+  const gpuPassthrough = sandboxGpuConfig.sandboxGpuEnabled;
+  if (session && session.gpuPassthrough !== gpuPassthrough) {
+    session = deps.updateSession((current) => {
+      current.gpuPassthrough = gpuPassthrough;
+      return current;
+    });
+  }
+
+  return {
+    gpu,
+    sandboxGpuConfig,
+    resumePreflight,
+    resumeHasResolvedGpuIntent,
+    requestedGpuPassthrough: gpuRequested,
+    gpuPassthrough,
+    effectiveSandboxGpuFlag,
+    effectiveSandboxGpuDevice,
+    session,
+  };
+}

From f17000a73716c16fe69007aeb1c8d218e646cb1d Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 23:01:05 -0700
Subject: [PATCH 08/54] refactor(cli): extract onboard gateway handler

---
 src/lib/onboard.ts                            | 147 +++----------
 .../onboard/machine/handlers/gateway.test.ts  | 203 ++++++++++++++++++
 src/lib/onboard/machine/handlers/gateway.ts   | 178 +++++++++++++++
 3 files changed, 413 insertions(+), 115 deletions(-)
 create mode 100644 src/lib/onboard/machine/handlers/gateway.test.ts
 create mode 100644 src/lib/onboard/machine/handlers/gateway.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 50c5187326..9d9b047748 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -280,6 +280,7 @@ const { resolveSandboxImageTagFromCreateOutput } =
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
 const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
+const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway");
 const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
 const policies: typeof import("./policy") = require("./policy");
 const tiers: typeof import("./policy/tiers") = require("./policy/tiers");
@@ -9464,125 +9465,41 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     });
 
     const gatewaySnapshot = selectNamedGatewayForReuseIfNeeded(getGatewayReuseSnapshot());
-    let gatewayReuseState = gatewaySnapshot.gatewayReuseState;
-    gatewayReuseState = await refreshDockerDriverGatewayReuseState(gatewayReuseState);
-
-    // Verify the legacy gateway container is actually running — openshell CLI
-    // metadata can be stale after a manual `docker rm`. See #2020. Newer
-    // package-managed OpenShell gateways do not have an openshell-cluster-*
-    // Docker container, so the live CLI health check is the source of truth.
-    if (gatewayReuseState === "healthy" && gatewayCliSupportsLifecycleCommands(runCaptureOpenshell)) {
-      const containerState = verifyGatewayContainerRunning(GATEWAY_NAME);
-      if (containerState === "missing") {
-        console.log("  Gateway metadata is stale (container not running). Cleaning up...");
-        runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true });
-        gatewayReuseState = destroyGatewayForReuse(
-          destroyGateway,
-          "  ✓ Stale gateway metadata cleaned up",
-          "  ! Stale gateway metadata cleanup failed; leaving registry state intact.",
-        );
-      } else if (containerState === "unknown") {
-        // Docker probe failed but cached metadata says healthy. Try the host-level
-        // HTTP probe — it doesn't depend on Docker, so it can confirm the gateway
-        // is genuinely serving even when the daemon is flaky.
-        if (await waitForGatewayHttpReady()) {
-          console.log(
-            "  Warning: could not verify gateway container state (Docker may be unavailable), but the gateway is responding on HTTP. Proceeding with reuse.",
-          );
-        } else {
-          // Docker can't be probed AND the gateway HTTP endpoint isn't
-          // responding. We cannot tell whether the existing gateway is live
-          // (transient `docker inspect` flake + warm-up miss) or genuinely
-          // gone. Per #2020 we must not destroy in this branch, and we must
-          // not downgrade to "missing" either: that would push execution into
-          // `startGatewayWithOptions`, whose retry hook calls
-          // `destroyGateway()` between attempts — which would tear down a
-          // possibly-live gateway. Bail with an actionable error instead.
-          console.log(
-            `  Error: could not verify gateway container state and ${getGatewayLocalEndpoint()}/ is not responding.`,
-          );
-          console.log(
-            "  Refusing to proceed without a clear Docker signal — restarting Docker and re-running onboard is the safe path. See #3258 / #2020.",
-          );
-          process.exit(1);
-        }
-      } else if (!(await waitForGatewayHttpReady())) {
-        // Container is running but the gateway HTTP endpoint is not responding.
-        // Common immediately after a Docker daemon restart — the container comes
-        // back before the OpenShell gateway upstream finishes warming up. Safe to
-        // recreate because Docker is functional. See #3258.
-        console.log(
-          `  Gateway container is running but ${getGatewayLocalEndpoint()}/ is not responding. Recreating...`,
-        );
-        runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true });
-        gatewayReuseState = destroyGatewayForReuse(
-          destroyGateway,
-          "  ✓ Stale gateway cleaned up",
-          "  ! Stale gateway cleanup failed; leaving registry state intact.",
-        );
-      } else {
-        const imageDrift = getGatewayClusterImageDrift();
-        if (imageDrift) {
-          console.log(
-            `  Gateway image ${imageDrift.currentVersion} does not match openshell ${imageDrift.expectedVersion}. Recreating...`,
-          );
-          stopAllDashboardForwards();
-          gatewayReuseState = destroyGatewayForReuse(
-            destroyGateway,
-            "  ✓ Previous gateway cleaned up",
-            "  ! Previous gateway cleanup failed; leaving registry state intact.",
-          );
-        }
-      }
-    }
-
-    gatewayReuseState = reconcileGatewayGpuReuseForGpuIntent({
-      gatewayReuseState,
+    const gatewayResult = await handleGatewayState({
+      resume,
+      session,
+      initialGatewayReuseState: gatewaySnapshot.gatewayReuseState,
+      gpu,
       gpuPassthrough,
       gatewayName: GATEWAY_NAME,
-      currentSandboxName: recordedSandboxName || requestedSandboxName,
+      dashboardPort: DASHBOARD_PORT,
+      recordedSandboxName,
+      requestedSandboxName,
       recreateSandbox: isRecreateSandbox(),
-      confirmedDockerDriverGateway:
-        isLinuxDockerDriverGatewayEnabled() &&
-        gatewayReuseState === "healthy" &&
-        !gatewayCliSupportsLifecycleCommands(runCaptureOpenshell),
-      stopDashboardForwards: stopAllDashboardForwards,
-      retireLegacyGatewayForDockerDriverUpgrade,
-      destroyGatewayRuntimeForGpuReuse: () => destroyGateway(() => undefined, () => false),
+      deps: {
+        refreshDockerDriverGatewayReuseState,
+        gatewayCliSupportsLifecycleCommands: () => gatewayCliSupportsLifecycleCommands(runCaptureOpenshell),
+        verifyGatewayContainerRunning,
+        waitForGatewayHttpReady,
+        getGatewayLocalEndpoint,
+        runOpenshell,
+        destroyGateway,
+        destroyGatewayForReuse,
+        getGatewayClusterImageDrift,
+        stopAllDashboardForwards,
+        reconcileGatewayGpuReuseForGpuIntent,
+        isLinuxDockerDriverGatewayEnabled,
+        retireLegacyGatewayForDockerDriverUpgrade,
+        destroyGatewayRuntimeForGpuReuse: () => destroyGateway(() => undefined, () => false),
+        skippedStepMessage,
+        note,
+        startRecordedStep,
+        startGateway,
+        recordStepComplete,
+        exitProcess: (code) => process.exit(code),
+      },
     });
-
-    const canReuseHealthyGateway = gatewayReuseState === "healthy";
-
-    const resumeGateway =
-      resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway;
-    if (resumeGateway) {
-      skippedStepMessage("gateway", "running");
-      await recordStepComplete("gateway");
-    } else if (!resume && canReuseHealthyGateway) {
-      skippedStepMessage("gateway", "running", "reuse");
-      note("  Reusing healthy NemoClaw gateway.");
-      await recordStepComplete("gateway");
-    } else {
-      if (resume && session?.steps?.gateway?.status === "complete") {
-        if (gatewayReuseState === "active-unnamed") {
-          note("  [resume] Gateway is active but named metadata is missing; recreating it safely.");
-        } else if (gatewayReuseState === "foreign-active") {
-          note("  [resume] A different OpenShell gateway is active; NemoClaw will not reuse it.");
-        } else if (gatewayReuseState === "stale") {
-          note("  [resume] Recorded gateway is unhealthy; recreating it.");
-        } else {
-          note("  [resume] Recorded gateway state is unavailable; recreating it.");
-        }
-      }
-      if (isLinuxDockerDriverGatewayEnabled() && gatewayReuseState !== "missing") {
-        note("  Replacing legacy OpenShell gateway metadata with Docker-driver gateway.");
-        retireLegacyGatewayForDockerDriverUpgrade();
-        gatewayReuseState = "missing";
-      }
-      await startRecordedStep("gateway");
-      await startGateway(gpu, { gpuPassthrough });
-      await recordStepComplete("gateway");
-    }
+    session = gatewayResult.session;
 
     // #2753: prefer requestedSandboxName over an unconfirmed session name.
     // A pre-fix session may carry sandboxName even though sandbox creation
diff --git a/src/lib/onboard/machine/handlers/gateway.test.ts b/src/lib/onboard/machine/handlers/gateway.test.ts
new file mode 100644
index 0000000000..266ba10360
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/gateway.test.ts
@@ -0,0 +1,203 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import { createSession, type Session } from "../../../state/onboard-session";
+import type { GatewayReuseState } from "../../../state/gateway";
+import { handleGatewayState, type GatewayStateOptions } from "./gateway";
+
+type Gpu = { type: string } | null;
+
+function createDeps(overrides: Partial<GatewayStateOptions<Gpu>["deps"]> = {}) {
+  const calls = {
+    refresh: vi.fn(async (state: GatewayReuseState) => state),
+    lifecycle: vi.fn(() => false),
+    verifyContainer: vi.fn(() => "running"),
+    waitHttp: vi.fn(async () => true),
+    runOpenshell: vi.fn(),
+    destroy: vi.fn(() => true),
+    destroyForReuse: vi.fn(() => "missing" as GatewayReuseState),
+    imageDrift: vi.fn(() => null),
+    stopForwards: vi.fn(),
+    reconcileGpu: vi.fn((opts: { gatewayReuseState: GatewayReuseState }) => opts.gatewayReuseState),
+    dockerDriver: vi.fn(() => false),
+    retireLegacy: vi.fn(),
+    destroyGpuRuntime: vi.fn(() => true),
+    skipped: vi.fn(),
+    note: vi.fn(),
+    startStep: vi.fn(async () => undefined),
+    startGateway: vi.fn(async () => undefined),
+    complete: vi.fn(async () => createSession()),
+    exit: vi.fn((code: number): never => {
+      throw new Error(`exit ${code}`);
+    }),
+  };
+  return {
+    calls,
+    deps: {
+      refreshDockerDriverGatewayReuseState: calls.refresh,
+      gatewayCliSupportsLifecycleCommands: calls.lifecycle,
+      verifyGatewayContainerRunning: calls.verifyContainer,
+      waitForGatewayHttpReady: calls.waitHttp,
+      getGatewayLocalEndpoint: () => "http://127.0.0.1:31818",
+      runOpenshell: calls.runOpenshell,
+      destroyGateway: calls.destroy,
+      destroyGatewayForReuse: calls.destroyForReuse,
+      getGatewayClusterImageDrift: calls.imageDrift,
+      stopAllDashboardForwards: calls.stopForwards,
+      reconcileGatewayGpuReuseForGpuIntent: calls.reconcileGpu,
+      isLinuxDockerDriverGatewayEnabled: calls.dockerDriver,
+      retireLegacyGatewayForDockerDriverUpgrade: calls.retireLegacy,
+      destroyGatewayRuntimeForGpuReuse: calls.destroyGpuRuntime,
+      skippedStepMessage: calls.skipped,
+      note: calls.note,
+      startRecordedStep: calls.startStep,
+      startGateway: calls.startGateway,
+      recordStepComplete: calls.complete,
+      exitProcess: calls.exit,
+      ...overrides,
+    },
+  };
+}
+
+function baseOptions(
+  deps: GatewayStateOptions<Gpu>["deps"],
+  initialGatewayReuseState: GatewayReuseState = "missing",
+  session: Session | null = createSession(),
+): GatewayStateOptions<Gpu> {
+  return {
+    resume: false,
+    session,
+    initialGatewayReuseState,
+    gpu: { type: "nvidia" },
+    gpuPassthrough: true,
+    gatewayName: "nemoclaw",
+    dashboardPort: 18789,
+    recordedSandboxName: null,
+    requestedSandboxName: "my-assistant",
+    recreateSandbox: false,
+    deps,
+  };
+}
+
+describe("handleGatewayState", () => {
+  it("starts the gateway when no reusable gateway exists", async () => {
+    const { deps, calls } = createDeps();
+
+    const result = await handleGatewayState(baseOptions(deps, "missing"));
+
+    expect(calls.startStep).toHaveBeenCalledWith("gateway");
+    expect(calls.startGateway).toHaveBeenCalledWith({ type: "nvidia" }, { gpuPassthrough: true });
+    expect(calls.complete).toHaveBeenCalledWith("gateway");
+    expect(result.gatewayReuseState).toBe("missing");
+  });
+
+  it("reuses healthy gateways on fresh runs", async () => {
+    const { deps, calls } = createDeps();
+
+    await handleGatewayState(baseOptions(deps, "healthy"));
+
+    expect(calls.skipped).toHaveBeenCalledWith("gateway", "running", "reuse");
+    expect(calls.note).toHaveBeenCalledWith("  Reusing healthy NemoClaw gateway.");
+    expect(calls.startGateway).not.toHaveBeenCalled();
+    expect(calls.complete).toHaveBeenCalledWith("gateway");
+  });
+
+  it("reuses healthy gateways on resume only when the gateway step was complete", async () => {
+    const session = createSession();
+    session.steps.gateway.status = "complete";
+    const { deps, calls } = createDeps();
+
+    await handleGatewayState({ ...baseOptions(deps, "healthy", session), resume: true });
+
+    expect(calls.skipped).toHaveBeenCalledWith("gateway", "running");
+    expect(calls.startGateway).not.toHaveBeenCalled();
+  });
+
+  it("cleans stale lifecycle metadata when the gateway container is missing", async () => {
+    const { deps, calls } = createDeps({
+      gatewayCliSupportsLifecycleCommands: vi.fn(() => true),
+      verifyGatewayContainerRunning: vi.fn(() => "missing" as GatewayReuseState),
+      destroyGatewayForReuse: vi.fn(() => "missing" as GatewayReuseState),
+    });
+
+    await handleGatewayState(baseOptions(deps, "healthy"));
+
+    expect(calls.runOpenshell).toHaveBeenCalledWith(["forward", "stop", "18789"], {
+      ignoreError: true,
+    });
+    expect(deps.destroyGatewayForReuse).toHaveBeenCalledWith(
+      deps.destroyGateway,
+      "  ✓ Stale gateway metadata cleaned up",
+      "  ! Stale gateway metadata cleanup failed; leaving registry state intact.",
+    );
+    expect(calls.startGateway).toHaveBeenCalled();
+  });
+
+  it("refuses to destroy an unknown container state when HTTP is also unavailable", async () => {
+    const { deps, calls } = createDeps({
+      gatewayCliSupportsLifecycleCommands: vi.fn(() => true),
+      verifyGatewayContainerRunning: vi.fn(() => "unknown"),
+      waitForGatewayHttpReady: vi.fn(async () => false),
+    });
+
+    await expect(handleGatewayState(baseOptions(deps, "healthy"))).rejects.toThrow("exit 1");
+
+    expect(calls.exit).toHaveBeenCalledWith(1);
+    expect(calls.destroyForReuse).not.toHaveBeenCalled();
+  });
+
+  it("recreates a running lifecycle gateway when the HTTP endpoint is unhealthy", async () => {
+    const { deps, calls } = createDeps({
+      gatewayCliSupportsLifecycleCommands: vi.fn(() => true),
+      waitForGatewayHttpReady: vi.fn(async () => false),
+      destroyGatewayForReuse: vi.fn(() => "missing" as GatewayReuseState),
+    });
+
+    await handleGatewayState(baseOptions(deps, "healthy"));
+
+    expect(calls.runOpenshell).toHaveBeenCalledWith(["forward", "stop", "18789"], {
+      ignoreError: true,
+    });
+    expect(deps.destroyGatewayForReuse).toHaveBeenCalledWith(
+      deps.destroyGateway,
+      "  ✓ Stale gateway cleaned up",
+      "  ! Stale gateway cleanup failed; leaving registry state intact.",
+    );
+  });
+
+  it("recreates on gateway image drift after stopping dashboard forwards", async () => {
+    const { deps, calls } = createDeps({
+      gatewayCliSupportsLifecycleCommands: vi.fn(() => true),
+      waitForGatewayHttpReady: vi.fn(async () => true),
+      getGatewayClusterImageDrift: vi.fn(() => ({ currentVersion: "0.0.38", expectedVersion: "0.0.39" })),
+      destroyGatewayForReuse: vi.fn(() => "missing" as GatewayReuseState),
+    });
+
+    await handleGatewayState(baseOptions(deps, "healthy"));
+
+    expect(calls.stopForwards).toHaveBeenCalledOnce();
+    expect(deps.destroyGatewayForReuse).toHaveBeenCalledWith(
+      deps.destroyGateway,
+      "  ✓ Previous gateway cleaned up",
+      "  ! Previous gateway cleanup failed; leaving registry state intact.",
+    );
+  });
+
+  it("replaces legacy metadata before starting the Docker-driver gateway", async () => {
+    const { deps, calls } = createDeps({
+      isLinuxDockerDriverGatewayEnabled: vi.fn(() => true),
+      reconcileGatewayGpuReuseForGpuIntent: vi.fn(() => "stale" as GatewayReuseState),
+    });
+
+    const result = await handleGatewayState(baseOptions(deps, "healthy"));
+
+    expect(calls.note).toHaveBeenCalledWith(
+      "  Replacing legacy OpenShell gateway metadata with Docker-driver gateway.",
+    );
+    expect(calls.retireLegacy).toHaveBeenCalledOnce();
+    expect(calls.startGateway).toHaveBeenCalledOnce();
+    expect(result.gatewayReuseState).toBe("missing");
+  });
+});
diff --git a/src/lib/onboard/machine/handlers/gateway.ts b/src/lib/onboard/machine/handlers/gateway.ts
new file mode 100644
index 0000000000..026c26e1b4
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/gateway.ts
@@ -0,0 +1,178 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session } from "../../../state/onboard-session";
+import type { GatewayReuseState } from "../../../state/gateway";
+
+export type GatewayContainerState = "missing" | "unknown" | string;
+
+export interface GatewayStateOptions<Gpu> {
+  resume: boolean;
+  session: Session | null;
+  initialGatewayReuseState: GatewayReuseState;
+  gpu: Gpu;
+  gpuPassthrough: boolean;
+  gatewayName: string;
+  dashboardPort: number;
+  recordedSandboxName: string | null;
+  requestedSandboxName: string | null;
+  recreateSandbox: boolean;
+  deps: {
+    refreshDockerDriverGatewayReuseState(state: GatewayReuseState): Promise<GatewayReuseState>;
+    gatewayCliSupportsLifecycleCommands(): boolean;
+    verifyGatewayContainerRunning(gatewayName: string): GatewayContainerState;
+    waitForGatewayHttpReady(): Promise<boolean>;
+    getGatewayLocalEndpoint(): string;
+    runOpenshell(args: string[], opts?: { ignoreError?: boolean }): unknown;
+    destroyGateway(): boolean;
+    destroyGatewayForReuse(
+      destroyGateway: () => boolean,
+      successMessage: string,
+      failureMessage: string,
+    ): GatewayReuseState;
+    getGatewayClusterImageDrift(): { currentVersion: string; expectedVersion: string } | null;
+    stopAllDashboardForwards(): void;
+    reconcileGatewayGpuReuseForGpuIntent(options: {
+      gatewayReuseState: GatewayReuseState;
+      gpuPassthrough: boolean;
+      gatewayName: string;
+      currentSandboxName: string | null;
+      recreateSandbox: boolean;
+      confirmedDockerDriverGateway: boolean;
+      stopDashboardForwards: () => void;
+      retireLegacyGatewayForDockerDriverUpgrade: () => void;
+      destroyGatewayRuntimeForGpuReuse: () => boolean;
+    }): GatewayReuseState;
+    isLinuxDockerDriverGatewayEnabled(): boolean;
+    retireLegacyGatewayForDockerDriverUpgrade(): void;
+    destroyGatewayRuntimeForGpuReuse(): boolean;
+    skippedStepMessage(
+      stepName: string,
+      detail?: string | null,
+      reason?: "resume" | "reuse",
+    ): void;
+    note(message: string): void;
+    startRecordedStep(stepName: string): Promise<void>;
+    startGateway(gpu: Gpu, options: { gpuPassthrough: boolean }): Promise<void>;
+    recordStepComplete(stepName: string): Promise<Session>;
+    exitProcess(code: number): never;
+  };
+}
+
+export interface GatewayStateResult {
+  gatewayReuseState: GatewayReuseState;
+  session: Session | null;
+}
+
+export async function handleGatewayState<Gpu>({
+  resume,
+  session,
+  initialGatewayReuseState,
+  gpu,
+  gpuPassthrough,
+  gatewayName,
+  dashboardPort,
+  recordedSandboxName,
+  requestedSandboxName,
+  recreateSandbox,
+  deps,
+}: GatewayStateOptions<Gpu>): Promise<GatewayStateResult> {
+  let gatewayReuseState = await deps.refreshDockerDriverGatewayReuseState(initialGatewayReuseState);
+  const supportsLifecycleCommands = deps.gatewayCliSupportsLifecycleCommands();
+
+  if (gatewayReuseState === "healthy" && supportsLifecycleCommands) {
+    const containerState = deps.verifyGatewayContainerRunning(gatewayName);
+    if (containerState === "missing") {
+      console.log("  Gateway metadata is stale (container not running). Cleaning up...");
+      deps.runOpenshell(["forward", "stop", String(dashboardPort)], { ignoreError: true });
+      gatewayReuseState = deps.destroyGatewayForReuse(
+        deps.destroyGateway,
+        "  ✓ Stale gateway metadata cleaned up",
+        "  ! Stale gateway metadata cleanup failed; leaving registry state intact.",
+      );
+    } else if (containerState === "unknown") {
+      if (await deps.waitForGatewayHttpReady()) {
+        console.log(
+          "  Warning: could not verify gateway container state (Docker may be unavailable), but the gateway is responding on HTTP. Proceeding with reuse.",
+        );
+      } else {
+        console.log(
+          `  Error: could not verify gateway container state and ${deps.getGatewayLocalEndpoint()}/ is not responding.`,
+        );
+        console.log(
+          "  Refusing to proceed without a clear Docker signal — restarting Docker and re-running onboard is the safe path. See #3258 / #2020.",
+        );
+        deps.exitProcess(1);
+      }
+    } else if (!(await deps.waitForGatewayHttpReady())) {
+      console.log(
+        `  Gateway container is running but ${deps.getGatewayLocalEndpoint()}/ is not responding. Recreating...`,
+      );
+      deps.runOpenshell(["forward", "stop", String(dashboardPort)], { ignoreError: true });
+      gatewayReuseState = deps.destroyGatewayForReuse(
+        deps.destroyGateway,
+        "  ✓ Stale gateway cleaned up",
+        "  ! Stale gateway cleanup failed; leaving registry state intact.",
+      );
+    } else {
+      const imageDrift = deps.getGatewayClusterImageDrift();
+      if (imageDrift) {
+        console.log(
+          `  Gateway image ${imageDrift.currentVersion} does not match openshell ${imageDrift.expectedVersion}. Recreating...`,
+        );
+        deps.stopAllDashboardForwards();
+        gatewayReuseState = deps.destroyGatewayForReuse(
+          deps.destroyGateway,
+          "  ✓ Previous gateway cleaned up",
+          "  ! Previous gateway cleanup failed; leaving registry state intact.",
+        );
+      }
+    }
+  }
+
+  gatewayReuseState = deps.reconcileGatewayGpuReuseForGpuIntent({
+    gatewayReuseState,
+    gpuPassthrough,
+    gatewayName,
+    currentSandboxName: recordedSandboxName || requestedSandboxName,
+    recreateSandbox,
+    confirmedDockerDriverGateway:
+      deps.isLinuxDockerDriverGatewayEnabled() && gatewayReuseState === "healthy" && !supportsLifecycleCommands,
+    stopDashboardForwards: deps.stopAllDashboardForwards,
+    retireLegacyGatewayForDockerDriverUpgrade: deps.retireLegacyGatewayForDockerDriverUpgrade,
+    destroyGatewayRuntimeForGpuReuse: deps.destroyGatewayRuntimeForGpuReuse,
+  });
+
+  const canReuseHealthyGateway = gatewayReuseState === "healthy";
+  const resumeGateway = resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway;
+  if (resumeGateway) {
+    deps.skippedStepMessage("gateway", "running");
+    session = await deps.recordStepComplete("gateway");
+  } else if (!resume && canReuseHealthyGateway) {
+    deps.skippedStepMessage("gateway", "running", "reuse");
+    deps.note("  Reusing healthy NemoClaw gateway.");
+    session = await deps.recordStepComplete("gateway");
+  } else {
+    if (resume && session?.steps?.gateway?.status === "complete") {
+      if (gatewayReuseState === "active-unnamed") {
+        deps.note("  [resume] Gateway is active but named metadata is missing; recreating it safely.");
+      } else if (gatewayReuseState === "foreign-active") {
+        deps.note("  [resume] A different OpenShell gateway is active; NemoClaw will not reuse it.");
+      } else if (gatewayReuseState === "stale") {
+        deps.note("  [resume] Recorded gateway is unhealthy; recreating it.");
+      } else {
+        deps.note("  [resume] Recorded gateway state is unavailable; recreating it.");
+      }
+    }
+    if (deps.isLinuxDockerDriverGatewayEnabled() && gatewayReuseState !== "missing") {
+      deps.note("  Replacing legacy OpenShell gateway metadata with Docker-driver gateway.");
+      deps.retireLegacyGatewayForDockerDriverUpgrade();
+      gatewayReuseState = "missing";
+    }
+    await deps.startRecordedStep("gateway");
+    await deps.startGateway(gpu, { gpuPassthrough });
+    session = await deps.recordStepComplete("gateway");
+  }
+
+  return { gatewayReuseState, session };
+}

From 3038da47214adc54345ecd37b40fe944e943d1e7 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 23:20:19 -0700
Subject: [PATCH 09/54] refactor(cli): extract provider inference handlers

---
 src/lib/onboard.ts                            | 247 +++++----------
 .../handlers/provider-inference.test.ts       | 216 +++++++++++++
 .../machine/handlers/provider-inference.ts    | 289 ++++++++++++++++++
 3 files changed, 577 insertions(+), 175 deletions(-)
 create mode 100644 src/lib/onboard/machine/handlers/provider-inference.test.ts
 create mode 100644 src/lib/onboard/machine/handlers/provider-inference.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 9d9b047748..f7d95ae8ab 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -282,6 +282,7 @@ const onboardSession: typeof import("./state/onboard-session") = require("./stat
 const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
 const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway");
 const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
+const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference");
 const policies: typeof import("./policy") = require("./policy");
 const tiers: typeof import("./policy/tiers") = require("./policy/tiers");
 const { ensureUsageNoticeConsent } = require("./onboard/usage-notice");
@@ -9514,181 +9515,77 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
       console.error("  Start a fresh onboard with --name <sandbox> to choose a different name.");
       process.exit(1);
     }
-    let model = session?.model || null;
-    let provider = session?.provider || null;
-    let endpointUrl = session?.endpointUrl || null;
-    let credentialEnv = session?.credentialEnv || null;
-    let hermesAuthMethod: HermesAuthMethod | null =
-      normalizeHermesAuthMethod(session?.hermesAuthMethod) ||
-      (provider === hermesProviderAuth.HERMES_PROVIDER_NAME &&
-      session?.credentialEnv === HERMES_NOUS_API_KEY_CREDENTIAL_ENV
-        ? HERMES_AUTH_METHOD_API_KEY
-        : null);
-    let hermesToolGateways = normalizeHermesToolGatewaySelections(session?.hermesToolGateways);
-    let preferredInferenceApi = session?.preferredInferenceApi || null;
-    let nimContainer = session?.nimContainer || null;
-    let webSearchConfig = session?.webSearchConfig || null;
-    let forceProviderSelection = false;
-    while (true) {
-      const resumeProviderSelection =
-        !forceProviderSelection &&
-        resume &&
-        session?.steps?.provider_selection?.status === "complete" &&
-        typeof provider === "string" &&
-        typeof model === "string";
-      if (resumeProviderSelection) {
-        skippedStepMessage("provider_selection", `${provider} / ${model}`);
-        hydrateCredentialEnv(credentialEnv);
-        // #3342: resume short-circuits provider selection — repair the
-        // ollama-local systemd loopback override here so legacy 0.0.0.0
-        // drop-ins from older NemoClaw versions get rewritten every resume.
-        repairLocalInferenceSystemdOverrideOrExit(provider, isNonInteractive);
-      } else {
-        // #2753: do not persist sandboxName to onboard-session.json before
-        // the sandbox actually exists in the gateway (Step 6 markStepComplete
-        // below). A SIGINT between any earlier step and createSandbox would
-        // otherwise leave a phantom that `nemoclaw list` resurrects until
-        // manually destroyed.
-        await startRecordedStep("provider_selection");
-        const selection = await setupNim(gpu, sandboxName, agent);
-        model = selection.model;
-        provider = selection.provider;
-        endpointUrl = selection.endpointUrl;
-        credentialEnv = selection.credentialEnv;
-        hermesAuthMethod = selection.hermesAuthMethod;
-        hermesToolGateways = selection.hermesToolGateways;
-        preferredInferenceApi = selection.preferredInferenceApi;
-        nimContainer = selection.nimContainer;
-        await recordStepComplete(
-          "provider_selection",
-          toSessionUpdates({
-            provider,
-            model,
-            endpointUrl,
-            credentialEnv,
-            hermesAuthMethod,
-            hermesToolGateways,
-            preferredInferenceApi,
-            nimContainer,
-          }),
-        );
-      }
-
-      if (typeof provider !== "string" || typeof model !== "string") {
-        console.error("  Inference selection did not yield a provider/model.");
-        process.exit(1);
-      }
-      process.env.NEMOCLAW_OPENSHELL_BIN = getOpenshellBinary();
-      const needsBedrockRuntimeAdapter =
-        provider === "compatible-anthropic-endpoint" &&
-        bedrockRuntimeOnboard.needsBedrockRuntimeAdapter(endpointUrl);
-      const resumeInference =
-        !needsBedrockRuntimeAdapter &&
-        !forceProviderSelection &&
-        resume &&
-        isInferenceRouteReady(provider, model);
-      if (resumeInference) {
-        if (provider === hermesProviderAuth.HERMES_PROVIDER_NAME) {
-          if (!sandboxName) {
-            sandboxName = await promptValidatedSandboxName(agent);
-          }
-          await startRecordedStep("inference", { provider, model });
-          const inferenceResult = await setupInference(
-            sandboxName,
-            model,
-            provider,
-            endpointUrl,
-            credentialEnv,
-            hermesAuthMethod,
-            hermesToolGateways,
-          );
-          if (inferenceResult?.retry === "selection") {
-            forceProviderSelection = true;
-            continue;
-          }
-          await recordStepComplete(
-            "inference",
-            toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
-          );
-          break;
-        }
-        if (isRoutedInferenceProvider(provider)) {
-          try {
-            await reconcileModelRouter();
-          } catch (err) {
-            console.error(
-              `  ✗ Failed to reconcile model router: ${err instanceof Error ? err.message : String(err)}`,
-            );
-            process.exit(1);
-          }
-        }
-        skippedStepMessage("inference", `${provider} / ${model}`);
-        if (nimContainer && sandboxName) {
-          registry.updateSandbox(sandboxName, { nimContainer });
-        }
-        await recordStepComplete(
-          "inference",
-          toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
-        );
-        break;
-      }
-
-      if (!sandboxName) {
-        sandboxName = await promptValidatedSandboxName(agent);
-      }
-      const buildEstimateNote =
-        process.env.NEMOCLAW_IGNORE_RUNTIME_RESOURCES === "1"
-          ? null
-          : formatSandboxBuildEstimateNote(assessHost());
-      console.log(
-        formatOnboardConfigSummary({
-          provider,
-          model,
-          credentialEnv,
-          hermesAuthMethod,
-          webSearchConfig,
-          hermesToolGateways,
-          enabledChannels: selectedMessagingChannels.length > 0 ? selectedMessagingChannels : null,
-          sandboxName,
-          notes: buildEstimateNote ? [buildEstimateNote] : [],
-        }),
-      );
-      console.log("  Web search and messaging channels will be prompted next.");
-      if (!isNonInteractive()) {
-        if (!(await promptYesNoOrDefault("  Apply this configuration?", null, true))) {
-          console.log(`  Aborted. Re-run \`${cliName()} onboard\` to start over.`);
-          console.log("  Credentials entered so far were only staged in memory for this run.");
-          console.log(
-            "  No new gateway credential was registered because onboarding stopped here.",
-          );
-          process.exit(0);
-        }
-      }
-
-      await startRecordedStep("inference", { provider, model });
-      const inferenceResult = await setupInference(
-        sandboxName,
-        model,
-        provider,
-        endpointUrl,
-        credentialEnv,
-        hermesAuthMethod,
-        hermesToolGateways,
-      );
-      delete process.env.NVIDIA_API_KEY;
-      if (inferenceResult?.retry === "selection") {
-        forceProviderSelection = true;
-        continue;
-      }
-      if (nimContainer && sandboxName) {
-        registry.updateSandbox(sandboxName, { nimContainer });
-      }
-      await recordStepComplete(
-        "inference",
-        toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
-      );
-      break;
-    }
+    const providerInferenceResult = await handleProviderInferenceState({
+      resume,
+      session,
+      gpu,
+      sandboxName,
+      agent,
+      initial: {
+        model: session?.model || null,
+        provider: session?.provider || null,
+        endpointUrl: session?.endpointUrl || null,
+        credentialEnv: session?.credentialEnv || null,
+        hermesAuthMethod:
+          normalizeHermesAuthMethod(session?.hermesAuthMethod) ||
+          (session?.provider === hermesProviderAuth.HERMES_PROVIDER_NAME &&
+          session?.credentialEnv === HERMES_NOUS_API_KEY_CREDENTIAL_ENV
+            ? HERMES_AUTH_METHOD_API_KEY
+            : null),
+        hermesToolGateways: normalizeHermesToolGatewaySelections(session?.hermesToolGateways),
+        preferredInferenceApi: session?.preferredInferenceApi || null,
+        nimContainer: session?.nimContainer || null,
+        webSearchConfig: session?.webSearchConfig || null,
+      },
+      selectedMessagingChannels,
+      env: process.env,
+      constants: { hermesProviderName: hermesProviderAuth.HERMES_PROVIDER_NAME },
+      deps: {
+        normalizeHermesAuthMethod,
+        setupNim,
+        setupInference,
+        startRecordedStep,
+        recordStepComplete,
+        toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
+        skippedStepMessage,
+        hydrateCredentialEnv,
+        repairLocalInferenceSystemdOverrideOrExit,
+        isNonInteractive,
+        getOpenshellBinary,
+        needsBedrockRuntimeAdapter: (providerName, url) =>
+          providerName === "compatible-anthropic-endpoint" &&
+          bedrockRuntimeOnboard.needsBedrockRuntimeAdapter(url),
+        isInferenceRouteReady,
+        isRoutedInferenceProvider,
+        reconcileModelRouter,
+        registryUpdateSandbox: (name, updates) => registry.updateSandbox(name, updates),
+        promptValidatedSandboxName,
+        assessHost,
+        formatSandboxBuildEstimateNote,
+        formatOnboardConfigSummary,
+        promptYesNoOrDefault,
+        cliName,
+        log: (message) => console.log(message),
+        error: (message) => console.error(message),
+        exitProcess: (code) => process.exit(code),
+        deleteEnv: (name) => {
+          delete process.env[name];
+        },
+      },
+    });
+    session = providerInferenceResult.session;
+    sandboxName = providerInferenceResult.sandboxName;
+    const {
+      model,
+      provider,
+      endpointUrl,
+      credentialEnv,
+      hermesAuthMethod,
+      hermesToolGateways,
+      preferredInferenceApi,
+      nimContainer,
+    } = providerInferenceResult;
+    let webSearchConfig = providerInferenceResult.webSearchConfig as WebSearchConfig | null;
 
     const webSearchSupportProbePath = fromDockerfile ? path.resolve(fromDockerfile) : null;
     const webSearchSupported = agentSupportsWebSearch(agent, webSearchSupportProbePath, ROOT);
diff --git a/src/lib/onboard/machine/handlers/provider-inference.test.ts b/src/lib/onboard/machine/handlers/provider-inference.test.ts
new file mode 100644
index 0000000000..bec7ea47a3
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/provider-inference.test.ts
@@ -0,0 +1,216 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session";
+import {
+  handleProviderInferenceState,
+  type ProviderInferenceStateOptions,
+  type ProviderSelectionResult,
+} from "./provider-inference";
+
+type Gpu = { type: string } | null;
+type Agent = { name: string } | null;
+type Host = { cpus?: number };
+
+const baseSelection: ProviderSelectionResult = {
+  model: "nvidia/test",
+  provider: "nvidia-prod",
+  endpointUrl: "https://integrate.api.nvidia.com/v1",
+  credentialEnv: "NVIDIA_API_KEY",
+  hermesAuthMethod: null,
+  hermesToolGateways: [],
+  preferredInferenceApi: "openai-responses",
+  nimContainer: null,
+};
+
+function createDeps(overrides: Partial<ProviderInferenceStateOptions<Gpu, Agent, Host>["deps"]> = {}) {
+  const calls = {
+    setupNim: vi.fn(async () => ({ ...baseSelection })),
+    setupInference: vi.fn(async () => ({ ok: true as const })),
+    startStep: vi.fn(async () => undefined),
+    complete: vi.fn(async () => createSession()),
+    skipped: vi.fn(),
+    hydrate: vi.fn(),
+    repair: vi.fn(),
+    routeReady: vi.fn(() => false),
+    reconcileRouter: vi.fn(async () => undefined),
+    updateSandbox: vi.fn(),
+    promptName: vi.fn(async () => "my-assistant"),
+    promptYesNo: vi.fn(async () => true),
+    log: vi.fn(),
+    error: vi.fn(),
+    exit: vi.fn((code: number): never => {
+      throw new Error(`exit ${code}`);
+    }),
+    deleteEnv: vi.fn(),
+  };
+  return {
+    calls,
+    deps: {
+      normalizeHermesAuthMethod: (value: string | null | undefined) => value ?? null,
+      setupNim: calls.setupNim,
+      setupInference: calls.setupInference,
+      startRecordedStep: calls.startStep,
+      recordStepComplete: calls.complete,
+      toSessionUpdates: (updates: Record<string, unknown>) => updates as SessionUpdates,
+      skippedStepMessage: calls.skipped,
+      hydrateCredentialEnv: calls.hydrate,
+      repairLocalInferenceSystemdOverrideOrExit: calls.repair,
+      isNonInteractive: () => true,
+      getOpenshellBinary: () => "/usr/bin/openshell",
+      needsBedrockRuntimeAdapter: () => false,
+      isInferenceRouteReady: calls.routeReady,
+      isRoutedInferenceProvider: (provider: string) => provider === "nvidia-router",
+      reconcileModelRouter: calls.reconcileRouter,
+      registryUpdateSandbox: calls.updateSandbox,
+      promptValidatedSandboxName: calls.promptName,
+      assessHost: () => ({ cpus: 8 }),
+      formatSandboxBuildEstimateNote: () => "estimate",
+      formatOnboardConfigSummary: (options: {
+        provider: string;
+        model: string;
+        sandboxName: string;
+      }) => `summary:${options.provider}/${options.model}/${options.sandboxName}`,
+      promptYesNoOrDefault: calls.promptYesNo,
+      cliName: () => "nemoclaw",
+      log: calls.log,
+      error: calls.error,
+      exitProcess: calls.exit,
+      deleteEnv: calls.deleteEnv,
+      ...overrides,
+    },
+  };
+}
+
+function baseOptions(
+  deps: ProviderInferenceStateOptions<Gpu, Agent, Host>["deps"],
+  session: Session | null = createSession(),
+): ProviderInferenceStateOptions<Gpu, Agent, Host> {
+  return {
+    resume: false,
+    session,
+    gpu: { type: "nvidia" },
+    sandboxName: null,
+    agent: null,
+    initial: {
+      model: session?.model ?? null,
+      provider: session?.provider ?? null,
+      endpointUrl: session?.endpointUrl ?? null,
+      credentialEnv: session?.credentialEnv ?? null,
+      hermesAuthMethod: session?.hermesAuthMethod ?? null,
+      hermesToolGateways: session?.hermesToolGateways ?? [],
+      preferredInferenceApi: session?.preferredInferenceApi ?? null,
+      nimContainer: session?.nimContainer ?? null,
+      webSearchConfig: session?.webSearchConfig ?? null,
+    },
+    selectedMessagingChannels: [],
+    env: {},
+    constants: { hermesProviderName: "hermes-provider" },
+    deps,
+  };
+}
+
+describe("handleProviderInferenceState", () => {
+  it("runs provider selection and inference setup on a fresh flow", async () => {
+    const { deps, calls } = createDeps();
+
+    const result = await handleProviderInferenceState(baseOptions(deps));
+
+    expect(calls.startStep).toHaveBeenNthCalledWith(1, "provider_selection");
+    expect(calls.setupNim).toHaveBeenCalledWith({ type: "nvidia" }, null, null);
+    expect(calls.complete).toHaveBeenCalledWith("provider_selection", expect.objectContaining({ provider: "nvidia-prod" }));
+    expect(calls.promptName).toHaveBeenCalledWith(null);
+    expect(calls.log).toHaveBeenCalledWith("summary:nvidia-prod/nvidia/test/my-assistant");
+    expect(calls.startStep).toHaveBeenNthCalledWith(2, "inference", {
+      provider: "nvidia-prod",
+      model: "nvidia/test",
+    });
+    expect(calls.setupInference).toHaveBeenCalledWith(
+      "my-assistant",
+      "nvidia/test",
+      "nvidia-prod",
+      "https://integrate.api.nvidia.com/v1",
+      "NVIDIA_API_KEY",
+      null,
+      [],
+    );
+    expect(calls.deleteEnv).toHaveBeenCalledWith("NVIDIA_API_KEY");
+    expect(result).toMatchObject({
+      sandboxName: "my-assistant",
+      model: "nvidia/test",
+      provider: "nvidia-prod",
+      preferredInferenceApi: "openai-responses",
+    });
+  });
+
+  it("skips provider selection and inference setup when resume state is already ready", async () => {
+    const session = createSession({
+      provider: "ollama-local",
+      model: "llama3.1",
+      credentialEnv: null,
+    });
+    session.steps.provider_selection.status = "complete";
+    const { deps, calls } = createDeps({ isInferenceRouteReady: vi.fn(() => true) });
+
+    const result = await handleProviderInferenceState({
+      ...baseOptions(deps, session),
+      resume: true,
+      sandboxName: "my-assistant",
+    });
+
+    expect(calls.setupNim).not.toHaveBeenCalled();
+    expect(calls.setupInference).not.toHaveBeenCalled();
+    expect(calls.skipped).toHaveBeenCalledWith("provider_selection", "ollama-local / llama3.1");
+    expect(calls.hydrate).toHaveBeenCalledWith(null);
+    expect(calls.repair).toHaveBeenCalledWith("ollama-local", deps.isNonInteractive);
+    expect(calls.skipped).toHaveBeenCalledWith("inference", "ollama-local / llama3.1");
+    expect(result).toMatchObject({ provider: "ollama-local", model: "llama3.1" });
+  });
+
+  it("reconciles model router on resumed routed inference", async () => {
+    const session = createSession({ provider: "nvidia-router", model: "router/model" });
+    session.steps.provider_selection.status = "complete";
+    const { deps, calls } = createDeps({ isInferenceRouteReady: vi.fn(() => true) });
+
+    await handleProviderInferenceState({
+      ...baseOptions(deps, session),
+      resume: true,
+      sandboxName: "router-sandbox",
+    });
+
+    expect(calls.reconcileRouter).toHaveBeenCalledOnce();
+  });
+
+  it("returns to provider selection when inference setup requests a retry", async () => {
+    const setupNim = vi
+      .fn()
+      .mockResolvedValueOnce({ ...baseSelection, model: "bad" })
+      .mockResolvedValueOnce({ ...baseSelection, model: "good" });
+    const setupInference = vi
+      .fn()
+      .mockResolvedValueOnce({ retry: "selection" as const })
+      .mockResolvedValueOnce({ ok: true as const });
+    const { deps, calls } = createDeps({ setupNim, setupInference });
+
+    const result = await handleProviderInferenceState(baseOptions(deps));
+
+    expect(setupNim).toHaveBeenCalledTimes(2);
+    expect(setupInference).toHaveBeenCalledTimes(2);
+    expect(result.model).toBe("good");
+    expect(calls.startStep).toHaveBeenCalledWith("provider_selection");
+  });
+
+  it("aborts before inference setup when the configuration summary is rejected", async () => {
+    const { deps, calls } = createDeps({
+      isNonInteractive: () => false,
+      promptYesNoOrDefault: vi.fn(async () => false),
+    });
+
+    await expect(handleProviderInferenceState(baseOptions(deps))).rejects.toThrow("exit 0");
+
+    expect(calls.exit).toHaveBeenCalledWith(0);
+    expect(calls.setupInference).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/lib/onboard/machine/handlers/provider-inference.ts b/src/lib/onboard/machine/handlers/provider-inference.ts
new file mode 100644
index 0000000000..525b94a059
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/provider-inference.ts
@@ -0,0 +1,289 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session, SessionUpdates } from "../../../state/onboard-session";
+
+export type ProviderInferenceRetry = { retry: "selection" } | { ok: true; retry?: undefined };
+
+export interface ProviderSelectionResult {
+  model: string | null;
+  provider: string;
+  endpointUrl: string | null;
+  credentialEnv: string | null;
+  hermesAuthMethod: string | null;
+  hermesToolGateways: string[];
+  preferredInferenceApi: string | null;
+  nimContainer: string | null;
+}
+
+export interface ProviderInferenceStateOptions<Gpu, Agent, Host> {
+  resume: boolean;
+  session: Session | null;
+  gpu: Gpu;
+  sandboxName: string | null;
+  agent: Agent;
+  initial: {
+    model: string | null;
+    provider: string | null;
+    endpointUrl: string | null;
+    credentialEnv: string | null;
+    hermesAuthMethod: string | null;
+    hermesToolGateways: string[];
+    preferredInferenceApi: string | null;
+    nimContainer: string | null;
+    webSearchConfig: any;
+  };
+  selectedMessagingChannels: string[];
+  env: NodeJS.ProcessEnv;
+  constants: {
+    hermesProviderName: string;
+  };
+  deps: {
+    normalizeHermesAuthMethod(value: string | null | undefined): string | null;
+    setupNim(gpu: Gpu, sandboxName: string | null, agent: Agent): Promise<ProviderSelectionResult>;
+    setupInference(
+      sandboxName: string | null,
+      model: string,
+      provider: string,
+      endpointUrl: string | null,
+      credentialEnv: string | null,
+      hermesAuthMethod: string | null,
+      hermesToolGateways: string[],
+    ): Promise<ProviderInferenceRetry>;
+    startRecordedStep(
+      stepName: string,
+      updates?: { provider?: string | null; model?: string | null },
+    ): Promise<void>;
+    recordStepComplete(stepName: string, updates: SessionUpdates): Promise<Session>;
+    toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
+    skippedStepMessage(stepName: string, detail?: string | null): void;
+    hydrateCredentialEnv(credentialEnv: string | null): void;
+    repairLocalInferenceSystemdOverrideOrExit(provider: string | null, isNonInteractive: () => boolean): void;
+    isNonInteractive(): boolean;
+    getOpenshellBinary(): string;
+    needsBedrockRuntimeAdapter(provider: string, endpointUrl: string | null): boolean;
+    isInferenceRouteReady(provider: string, model: string): boolean;
+    isRoutedInferenceProvider(provider: string): boolean;
+    reconcileModelRouter(): Promise<void>;
+    registryUpdateSandbox(sandboxName: string, updates: { nimContainer?: string | null }): void;
+    promptValidatedSandboxName(agent: Agent): Promise<string>;
+    assessHost(): Host;
+    formatSandboxBuildEstimateNote(host: Host): string | null;
+    formatOnboardConfigSummary(options: {
+      provider: string;
+      model: string;
+      credentialEnv: string | null;
+      hermesAuthMethod: string | null;
+      webSearchConfig: any;
+      hermesToolGateways: string[];
+      enabledChannels: string[] | null;
+      sandboxName: string;
+      notes: string[];
+    }): string;
+    promptYesNoOrDefault(question: string, envVar: string | null, defaultIsYes: boolean): Promise<boolean>;
+    cliName(): string;
+    log(message?: string): void;
+    error(message?: string): void;
+    exitProcess(code: number): never;
+    deleteEnv(name: string): void;
+  };
+}
+
+export interface ProviderInferenceStateResult {
+  sandboxName: string | null;
+  model: string;
+  provider: string;
+  endpointUrl: string | null;
+  credentialEnv: string | null;
+  hermesAuthMethod: string | null;
+  hermesToolGateways: string[];
+  preferredInferenceApi: string | null;
+  nimContainer: string | null;
+  webSearchConfig: any;
+  session: Session | null;
+}
+
+function requireSelection(provider: string | null, model: string | null): { provider: string; model: string } {
+  if (typeof provider !== "string" || typeof model !== "string") {
+    throw new Error("Inference selection did not yield a provider/model.");
+  }
+  return { provider, model };
+}
+
+export async function handleProviderInferenceState<Gpu, Agent, Host>({
+  resume,
+  session,
+  gpu,
+  sandboxName,
+  agent,
+  initial,
+  selectedMessagingChannels,
+  env,
+  constants,
+  deps,
+}: ProviderInferenceStateOptions<Gpu, Agent, Host>): Promise<ProviderInferenceStateResult> {
+  let model = initial.model;
+  let provider = initial.provider;
+  let endpointUrl = initial.endpointUrl;
+  let credentialEnv = initial.credentialEnv;
+  let hermesAuthMethod =
+    deps.normalizeHermesAuthMethod(initial.hermesAuthMethod) ||
+    (provider === constants.hermesProviderName ? deps.normalizeHermesAuthMethod(initial.hermesAuthMethod) : null);
+  let hermesToolGateways = initial.hermesToolGateways;
+  let preferredInferenceApi = initial.preferredInferenceApi;
+  let nimContainer = initial.nimContainer;
+  const webSearchConfig = initial.webSearchConfig;
+  let forceProviderSelection = false;
+
+  while (true) {
+    const resumeProviderSelection =
+      !forceProviderSelection &&
+      resume &&
+      session?.steps?.provider_selection?.status === "complete" &&
+      typeof provider === "string" &&
+      typeof model === "string";
+    if (resumeProviderSelection) {
+      deps.skippedStepMessage("provider_selection", `${provider} / ${model}`);
+      deps.hydrateCredentialEnv(credentialEnv);
+      deps.repairLocalInferenceSystemdOverrideOrExit(provider, deps.isNonInteractive);
+    } else {
+      await deps.startRecordedStep("provider_selection");
+      const selection = await deps.setupNim(gpu, sandboxName, agent);
+      model = selection.model;
+      provider = selection.provider;
+      endpointUrl = selection.endpointUrl;
+      credentialEnv = selection.credentialEnv;
+      hermesAuthMethod = selection.hermesAuthMethod;
+      hermesToolGateways = selection.hermesToolGateways;
+      preferredInferenceApi = selection.preferredInferenceApi;
+      nimContainer = selection.nimContainer;
+      session = await deps.recordStepComplete(
+        "provider_selection",
+        deps.toSessionUpdates({
+          provider,
+          model,
+          endpointUrl,
+          credentialEnv,
+          hermesAuthMethod,
+          hermesToolGateways,
+          preferredInferenceApi,
+          nimContainer,
+        }),
+      );
+    }
+
+    const selected = requireSelection(provider, model);
+    provider = selected.provider;
+    model = selected.model;
+    env.NEMOCLAW_OPENSHELL_BIN = deps.getOpenshellBinary();
+    const needsBedrockRuntimeAdapter = deps.needsBedrockRuntimeAdapter(provider, endpointUrl);
+    const resumeInference =
+      !needsBedrockRuntimeAdapter &&
+      !forceProviderSelection &&
+      resume &&
+      deps.isInferenceRouteReady(provider, model);
+    if (resumeInference) {
+      if (provider === constants.hermesProviderName) {
+        if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent);
+        await deps.startRecordedStep("inference", { provider, model });
+        const inferenceResult = await deps.setupInference(
+          sandboxName,
+          model,
+          provider,
+          endpointUrl,
+          credentialEnv,
+          hermesAuthMethod,
+          hermesToolGateways,
+        );
+        if (inferenceResult?.retry === "selection") {
+          forceProviderSelection = true;
+          continue;
+        }
+        session = await deps.recordStepComplete(
+          "inference",
+          deps.toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
+        );
+        break;
+      }
+      if (deps.isRoutedInferenceProvider(provider)) {
+        try {
+          await deps.reconcileModelRouter();
+        } catch (err) {
+          deps.error(`  ✗ Failed to reconcile model router: ${err instanceof Error ? err.message : String(err)}`);
+          deps.exitProcess(1);
+        }
+      }
+      deps.skippedStepMessage("inference", `${provider} / ${model}`);
+      if (nimContainer && sandboxName) deps.registryUpdateSandbox(sandboxName, { nimContainer });
+      session = await deps.recordStepComplete(
+        "inference",
+        deps.toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
+      );
+      break;
+    }
+
+    if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent);
+    const buildEstimateNote =
+      env.NEMOCLAW_IGNORE_RUNTIME_RESOURCES === "1"
+        ? null
+        : deps.formatSandboxBuildEstimateNote(deps.assessHost());
+    deps.log(
+      deps.formatOnboardConfigSummary({
+        provider,
+        model,
+        credentialEnv,
+        hermesAuthMethod,
+        webSearchConfig,
+        hermesToolGateways,
+        enabledChannels: selectedMessagingChannels.length > 0 ? selectedMessagingChannels : null,
+        sandboxName,
+        notes: buildEstimateNote ? [buildEstimateNote] : [],
+      }),
+    );
+    deps.log("  Web search and messaging channels will be prompted next.");
+    if (!deps.isNonInteractive()) {
+      if (!(await deps.promptYesNoOrDefault("  Apply this configuration?", null, true))) {
+        deps.log(`  Aborted. Re-run \`${deps.cliName()} onboard\` to start over.`);
+        deps.log("  Credentials entered so far were only staged in memory for this run.");
+        deps.log("  No new gateway credential was registered because onboarding stopped here.");
+        deps.exitProcess(0);
+      }
+    }
+
+    await deps.startRecordedStep("inference", { provider, model });
+    const inferenceResult = await deps.setupInference(
+      sandboxName,
+      model,
+      provider,
+      endpointUrl,
+      credentialEnv,
+      hermesAuthMethod,
+      hermesToolGateways,
+    );
+    deps.deleteEnv("NVIDIA_API_KEY");
+    if (inferenceResult?.retry === "selection") {
+      forceProviderSelection = true;
+      continue;
+    }
+    if (nimContainer && sandboxName) deps.registryUpdateSandbox(sandboxName, { nimContainer });
+    session = await deps.recordStepComplete(
+      "inference",
+      deps.toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }),
+    );
+    break;
+  }
+
+  return {
+    sandboxName,
+    model,
+    provider,
+    endpointUrl,
+    credentialEnv,
+    hermesAuthMethod,
+    hermesToolGateways,
+    preferredInferenceApi,
+    nimContainer,
+    webSearchConfig,
+    session,
+  };
+}

From 18ef7e763923a23a0b78739e3fc3619557ab9ac1 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 23:35:26 -0700
Subject: [PATCH 10/54] refactor(cli): extract onboard sandbox handler

---
 src/lib/onboard.ts                            | 267 ++++------------
 .../onboard/machine/handlers/sandbox.test.ts  | 198 ++++++++++++
 src/lib/onboard/machine/handlers/sandbox.ts   | 287 ++++++++++++++++++
 3 files changed, 547 insertions(+), 205 deletions(-)
 create mode 100644 src/lib/onboard/machine/handlers/sandbox.test.ts
 create mode 100644 src/lib/onboard/machine/handlers/sandbox.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index f7d95ae8ab..b23e7f0fa4 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -283,6 +283,7 @@ const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("
 const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway");
 const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
 const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference");
+const { handleSandboxState }: typeof import("./onboard/machine/handlers/sandbox") = require("./onboard/machine/handlers/sandbox");
 const policies: typeof import("./policy") = require("./policy");
 const tiers: typeof import("./policy/tiers") = require("./policy/tiers");
 const { ensureUsageNoticeConsent } = require("./onboard/usage-notice");
@@ -9587,212 +9588,68 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     } = providerInferenceResult;
     let webSearchConfig = providerInferenceResult.webSearchConfig as WebSearchConfig | null;
 
-    const webSearchSupportProbePath = fromDockerfile ? path.resolve(fromDockerfile) : null;
-    const webSearchSupported = agentSupportsWebSearch(agent, webSearchSupportProbePath, ROOT);
-    if (webSearchConfig && !webSearchSupported) {
-      note(
-        `  Web search is not yet supported by ${agent?.displayName ?? "this sandbox image"}. Clearing stale config.`,
-      );
-      webSearchConfig = null;
-      if (session) {
-        session.webSearchConfig = null;
-      }
-      onboardSession.updateSession((current: Session) => {
-        current.webSearchConfig = null;
-        return current;
-      });
-    }
-
-    const storedMessagingChannelConfig = getStoredMessagingChannelConfig(sandboxName, session);
-    const effectiveMessagingChannelConfig = hydrateMessagingChannelConfig(storedMessagingChannelConfig);
-    const messagingChannelConfigChanged = !messagingChannelConfigsEqual(
-      effectiveMessagingChannelConfig,
-      storedMessagingChannelConfig,
-    );
-    if (effectiveMessagingChannelConfig) {
-      persistMessagingChannelConfigToSession(effectiveMessagingChannelConfig);
-      if (session) {
-        session.messagingChannelConfig = effectiveMessagingChannelConfig;
-      }
-    }
-
-    const sandboxReuseState = getSandboxReuseState(sandboxName);
-    const webSearchConfigChanged = Boolean(session?.webSearchConfig) !== Boolean(webSearchConfig);
-    // Telegram mention-mode is baked into openclaw.json at sandbox build time, so
-    // changes to TELEGRAM_REQUIRE_MENTION only take effect after a rebuild. Treat
-    // a mismatch between the recorded config and the current env value as drift
-    // so the reuse path forces a recreate (mirrors webSearchConfigChanged). See
-    // #1737 and the CodeRabbit review on #2417.
-    //
-    // Compare *effective* modes — null and false both produce groupPolicy: open
-    // at config-generation time (default behavior), so they collapse to the same
-    // bucket here. Without this, a sandbox built before TELEGRAM_REQUIRE_MENTION
-    // existed (recordedTelegramRequireMention === null) would be reused with the
-    // old groupPolicy: open even after the user sets TELEGRAM_REQUIRE_MENTION=1,
-    // and vice versa.
-    const currentTelegramRequireMention = computeTelegramRequireMention();
-    const recordedTelegramRequireMention = session?.telegramConfig?.requireMention ?? null;
-    const effectiveCurrent = currentTelegramRequireMention ?? false;
-    const effectiveRecorded = recordedTelegramRequireMention ?? false;
-    const telegramConfigChanged = effectiveCurrent !== effectiveRecorded;
-    const sandboxGpuConfigChanged = sandboxName
-      ? hasSandboxGpuDrift(sandboxName, sandboxGpuConfig)
-      : false;
-    const wechatConfigChanged = hasWechatConfigDrift(session);
-    const recordedHermesToolGateways = sandboxName
-      ? normalizeHermesToolGatewaySelections(registry.getSandbox(sandboxName)?.hermesToolGateways)
-      : [];
-    const hermesToolGatewayConfigChanged = !stringSetsEqual(
-      recordedHermesToolGateways,
+    const sandboxStateResult = await handleSandboxState({
+      resume,
+      fresh,
+      session,
+      sandboxName,
+      model,
+      provider,
+      nimContainer,
+      webSearchConfig,
+      selectedMessagingChannels,
+      fromDockerfile,
+      agent,
+      gpu,
+      preferredInferenceApi,
+      sandboxGpuConfig,
       hermesToolGateways,
-    );
-    const resumeSandbox =
-      resume &&
-      !webSearchConfigChanged &&
-      !telegramConfigChanged &&
-      !sandboxGpuConfigChanged &&
-      !wechatConfigChanged &&
-      !messagingChannelConfigChanged &&
-      !hermesToolGatewayConfigChanged &&
-      session?.steps?.sandbox?.status === "complete" &&
-      sandboxReuseState === "ready";
-    if (resumeSandbox) {
-      if (webSearchConfig) {
-        note("  [resume] Reusing Brave Search configuration already baked into the sandbox.");
-      }
-      selectedMessagingChannels = session?.messagingChannels ?? [];
-      skippedStepMessage("sandbox", sandboxName);
-    } else {
-      if (resume && session?.steps?.sandbox?.status === "complete") {
-        if (webSearchConfigChanged) {
-          note("  [resume] Web Search configuration changed; recreating sandbox.");
-          if (sandboxName) {
-            registry.removeSandbox(sandboxName);
-          }
-        } else if (telegramConfigChanged) {
-          note("  [resume] TELEGRAM_REQUIRE_MENTION changed; recreating sandbox.");
-          if (sandboxName) {
-            registry.removeSandbox(sandboxName);
-          }
-        } else if (sandboxGpuConfigChanged) {
-          note("  [resume] Sandbox GPU settings changed; recreating sandbox.");
-          if (sandboxName) {
-            registry.removeSandbox(sandboxName);
-          }
-        } else if (wechatConfigChanged) {
-          note("  [resume] WeChat account metadata changed; recreating sandbox.");
-          if (sandboxName) {
-            registry.removeSandbox(sandboxName);
-          }
-        } else if (messagingChannelConfigChanged) {
-          note("  [resume] Messaging channel configuration changed; recreating sandbox.");
-          if (sandboxName) {
-            registry.removeSandbox(sandboxName);
-          }
-        } else if (hermesToolGatewayConfigChanged) {
-          note("  [resume] Hermes managed tool gateway selection changed; recreating sandbox.");
-          if (sandboxName) {
-            registry.removeSandbox(sandboxName);
-          }
-        } else if (sandboxReuseState === "not_ready") {
-          note(
-            `  [resume] Recorded sandbox '${sandboxName}' exists but is not ready; recreating it.`,
-          );
-          repairRecordedSandbox(sandboxName);
-        } else {
-          note("  [resume] Recorded sandbox state is unavailable; recreating it.");
-          if (sandboxName) {
-            registry.removeSandbox(sandboxName);
-          }
-        }
-      }
-      let nextWebSearchConfig = webSearchConfig;
-      if (nextWebSearchConfig) {
-        note("  [resume] Revalidating Brave Search configuration for sandbox recreation.");
-        const braveApiKey = await ensureValidatedBraveSearchCredential();
-        nextWebSearchConfig = braveApiKey ? { fetchEnabled: true } : null;
-        if (nextWebSearchConfig) {
-          note("  [resume] Reusing Brave Search configuration.");
-        }
-      } else {
-        nextWebSearchConfig = await configureWebSearch(null, agent, webSearchSupportProbePath);
-      }
-      await startRecordedStep("sandbox", { provider, model });
-      const recordedMessagingChannels = getRecordedMessagingChannelsForResume(resume, session, sandboxName);
-      if (recordedMessagingChannels) {
-        selectedMessagingChannels = recordedMessagingChannels;
-        if (selectedMessagingChannels.length > 0) {
-          note(
-            `  [non-interactive] Reusing messaging channel configuration: ${selectedMessagingChannels.join(", ")}`,
-          );
-        }
-      } else {
-        const existing = sandboxName
-          ? registry.getSandbox(sandboxName)?.messagingChannels ??
-            session?.messagingChannels ??
-            null
-          : session?.messagingChannels ?? null;
-        selectedMessagingChannels = await setupMessagingChannels(agent, existing);
-      }
-      const messagingChannelConfig = readMessagingChannelConfigFromEnv();
-      onboardSession.updateSession((current: Session) => {
-        current.messagingChannels = selectedMessagingChannels;
-        current.messagingChannelConfig = messagingChannelConfig;
-        return current;
-      });
-      if (!sandboxName) {
-        sandboxName = await promptValidatedSandboxName(agent);
-      }
-      if (typeof model !== "string" || typeof provider !== "string") {
-        console.error("  Inference selection is incomplete; cannot create sandbox.");
-        process.exit(1);
-      }
-      if (fresh) {
-        stopStaleDashboardListenersForSandbox(registry.listSandboxes().sandboxes, sandboxName);
-      }
-      sandboxName = await createSandbox(
-        gpu,
-        model,
-        provider,
-        preferredInferenceApi,
-        sandboxName,
-        nextWebSearchConfig,
-        selectedMessagingChannels,
-        fromDockerfile,
-        agent,
-        opts.controlUiPort || null,
-        sandboxGpuConfig,
-        hermesToolGateways,
-      );
-      webSearchConfig = nextWebSearchConfig;
-      registry.updateSandbox(sandboxName, {
-        model,
-        provider,
-        ...getSandboxAgentRegistryFields(agent, !fromDockerfile),
-      });
-      registry.setDefault(sandboxName);
-      await recordStepComplete(
-        "sandbox",
-        toSessionUpdates({
-          sandboxName,
-          provider,
-          model,
-          nimContainer,
-          webSearchConfig,
-          messagingChannelConfig,
-          hermesToolGateways,
-        }),
-      );
-    }
-
-    if (
-      typeof sandboxName !== "string" ||
-      typeof provider !== "string" ||
-      typeof model !== "string"
-    ) {
-      console.error("  Onboarding state is incomplete after sandbox setup.");
-      process.exit(1);
-    }
+      controlUiPort: opts.controlUiPort || null,
+      rootDir: ROOT,
+      deps: {
+        resolvePath: path.resolve,
+        agentSupportsWebSearch,
+        note,
+        updateSession: onboardSession.updateSession,
+        getStoredMessagingChannelConfig,
+        hydrateMessagingChannelConfig,
+        messagingChannelConfigsEqual,
+        persistMessagingChannelConfigToSession,
+        getSandboxReuseState,
+        computeTelegramRequireMention,
+        hasSandboxGpuDrift,
+        hasWechatConfigDrift,
+        getSandboxHermesToolGateways: (name) => registry.getSandbox(name)?.hermesToolGateways,
+        normalizeHermesToolGatewaySelections,
+        stringSetsEqual,
+        removeSandboxFromRegistry: registry.removeSandbox.bind(registry),
+        repairRecordedSandbox,
+        ensureValidatedBraveSearchCredential,
+        configureWebSearch,
+        startRecordedStep,
+        getRecordedMessagingChannelsForResume,
+        getSandboxMessagingChannels: (name) => registry.getSandbox(name)?.messagingChannels,
+        setupMessagingChannels,
+        readMessagingChannelConfigFromEnv,
+        promptValidatedSandboxName,
+        stopStaleDashboardListenersForSandbox,
+        listRegistrySandboxes: registry.listSandboxes,
+        createSandbox,
+        updateSandboxRegistry: (name, updates) => registry.updateSandbox(name, updates),
+        setDefaultSandbox: registry.setDefault,
+        getSandboxAgentRegistryFields,
+        recordStepComplete,
+        toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
+        skippedStepMessage,
+        error: (message) => console.error(message),
+        exitProcess: (code) => process.exit(code),
+      },
+    });
+    session = sandboxStateResult.session;
+    sandboxName = sandboxStateResult.sandboxName;
+    webSearchConfig = sandboxStateResult.webSearchConfig ?? null;
+    selectedMessagingChannels = sandboxStateResult.selectedMessagingChannels;
+    const webSearchSupported = sandboxStateResult.webSearchSupported;
 
     if (agent) {
       await agentOnboard.handleAgentSetup(sandboxName, model, provider, agent, resume, session, {
diff --git a/src/lib/onboard/machine/handlers/sandbox.test.ts b/src/lib/onboard/machine/handlers/sandbox.test.ts
new file mode 100644
index 0000000000..eac0ffb553
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/sandbox.test.ts
@@ -0,0 +1,198 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session";
+import { handleSandboxState, type SandboxStateOptions } from "./sandbox";
+
+type Gpu = { type: string } | null;
+type Agent = { displayName?: string } | null;
+type WebSearchConfig = { fetchEnabled: true };
+type MessagingChannelConfig = Record<string, string>;
+type SandboxGpuConfig = { sandboxGpuEnabled: boolean; mode: string };
+
+function createDeps(overrides: Partial<SandboxStateOptions<Gpu, Agent, WebSearchConfig, MessagingChannelConfig, SandboxGpuConfig>["deps"]> = {}) {
+  let session = createSession();
+  const calls = {
+    note: vi.fn(),
+    updateSession: vi.fn((mutator: (value: Session) => Session | void) => {
+      session = mutator(session) ?? session;
+      return session;
+    }),
+    persistMessaging: vi.fn(),
+    removeSandbox: vi.fn(),
+    repairSandbox: vi.fn(),
+    validateBrave: vi.fn(async () => "brave-key"),
+    configureWebSearch: vi.fn(async () => null as WebSearchConfig | null),
+    startStep: vi.fn(async () => undefined),
+    getRecordedChannels: vi.fn(() => null),
+    setupMessaging: vi.fn(async () => [] as string[]),
+    promptName: vi.fn(async () => "my-assistant"),
+    stopStale: vi.fn(),
+    createSandbox: vi.fn(async () => "my-assistant"),
+    updateSandbox: vi.fn(),
+    setDefault: vi.fn(),
+    complete: vi.fn(async () => createSession()),
+    skipped: vi.fn(),
+    error: vi.fn(),
+    exit: vi.fn((code: number): never => {
+      throw new Error(`exit ${code}`);
+    }),
+  };
+  return {
+    calls,
+    deps: {
+      resolvePath: (value: string) => `/abs/${value}`,
+      agentSupportsWebSearch: () => true,
+      note: calls.note,
+      updateSession: calls.updateSession,
+      getStoredMessagingChannelConfig: () => null,
+      hydrateMessagingChannelConfig: (config: MessagingChannelConfig | null) => config,
+      messagingChannelConfigsEqual: () => true,
+      persistMessagingChannelConfigToSession: calls.persistMessaging,
+      getSandboxReuseState: () => "missing",
+      computeTelegramRequireMention: () => null,
+      hasSandboxGpuDrift: () => false,
+      hasWechatConfigDrift: () => false,
+      getSandboxHermesToolGateways: () => [],
+      normalizeHermesToolGatewaySelections: (value: unknown) => (Array.isArray(value) ? (value as string[]) : []),
+      stringSetsEqual: (left: string[], right: string[]) => left.length === right.length && left.every((value) => right.includes(value)),
+      removeSandboxFromRegistry: calls.removeSandbox,
+      repairRecordedSandbox: calls.repairSandbox,
+      ensureValidatedBraveSearchCredential: calls.validateBrave,
+      configureWebSearch: calls.configureWebSearch,
+      startRecordedStep: calls.startStep,
+      getRecordedMessagingChannelsForResume: calls.getRecordedChannels,
+      getSandboxMessagingChannels: () => ["telegram"],
+      setupMessagingChannels: calls.setupMessaging,
+      readMessagingChannelConfigFromEnv: () => null,
+      promptValidatedSandboxName: calls.promptName,
+      stopStaleDashboardListenersForSandbox: calls.stopStale,
+      listRegistrySandboxes: () => ({ sandboxes: [{ name: "old" }] }),
+      createSandbox: calls.createSandbox,
+      updateSandboxRegistry: calls.updateSandbox,
+      setDefaultSandbox: calls.setDefault,
+      getSandboxAgentRegistryFields: () => ({ agent: null }),
+      recordStepComplete: calls.complete,
+      toSessionUpdates: (updates: Record<string, unknown>) => updates as SessionUpdates,
+      skippedStepMessage: calls.skipped,
+      error: calls.error,
+      exitProcess: calls.exit,
+      ...overrides,
+    },
+    getSession: () => session,
+  };
+}
+
+function baseOptions(
+  deps: SandboxStateOptions<Gpu, Agent, WebSearchConfig, MessagingChannelConfig, SandboxGpuConfig>["deps"],
+  session: Session | null = createSession(),
+): SandboxStateOptions<Gpu, Agent, WebSearchConfig, MessagingChannelConfig, SandboxGpuConfig> {
+  return {
+    resume: false,
+    fresh: false,
+    session,
+    sandboxName: null,
+    model: "model",
+    provider: "provider",
+    nimContainer: null,
+    webSearchConfig: null,
+    selectedMessagingChannels: [],
+    fromDockerfile: null,
+    agent: null,
+    gpu: { type: "nvidia" },
+    preferredInferenceApi: "openai-completions",
+    sandboxGpuConfig: { sandboxGpuEnabled: false, mode: "0" },
+    hermesToolGateways: [],
+    controlUiPort: null,
+    rootDir: "/repo",
+    deps,
+  };
+}
+
+describe("handleSandboxState", () => {
+  it("creates a sandbox and records messaging/web search state", async () => {
+    const { deps, calls } = createDeps({
+      configureWebSearch: vi.fn(async () => ({ fetchEnabled: true as const })),
+      readMessagingChannelConfigFromEnv: () => ({ telegram: "polling" }),
+    });
+    calls.setupMessaging.mockResolvedValue(["telegram"]);
+
+    const result = await handleSandboxState(baseOptions(deps));
+
+    expect(calls.startStep).toHaveBeenCalledWith("sandbox", { provider: "provider", model: "model" });
+    expect(calls.setupMessaging).toHaveBeenCalledWith(null, null);
+    expect(calls.promptName).toHaveBeenCalledWith(null);
+    expect(calls.createSandbox).toHaveBeenCalledWith(
+      { type: "nvidia" },
+      "model",
+      "provider",
+      "openai-completions",
+      "my-assistant",
+      { fetchEnabled: true },
+      ["telegram"],
+      null,
+      null,
+      null,
+      { sandboxGpuEnabled: false, mode: "0" },
+      [],
+    );
+    expect(calls.updateSandbox).toHaveBeenCalledWith("my-assistant", expect.objectContaining({ model: "model", provider: "provider" }));
+    expect(calls.setDefault).toHaveBeenCalledWith("my-assistant");
+    expect(calls.complete).toHaveBeenCalledWith("sandbox", expect.objectContaining({ sandboxName: "my-assistant" }));
+    expect(result).toMatchObject({ sandboxName: "my-assistant", selectedMessagingChannels: ["telegram"], webSearchSupported: true });
+  });
+
+  it("reuses a completed ready sandbox on resume", async () => {
+    const session = createSession({ sandboxName: "saved", messagingChannels: ["slack"] });
+    session.steps.sandbox.status = "complete";
+    const { deps, calls } = createDeps({ getSandboxReuseState: () => "ready" });
+
+    const result = await handleSandboxState({ ...baseOptions(deps, session), resume: true, sandboxName: "saved" });
+
+    expect(calls.createSandbox).not.toHaveBeenCalled();
+    expect(calls.skipped).toHaveBeenCalledWith("sandbox", "saved");
+    expect(result.selectedMessagingChannels).toEqual(["slack"]);
+  });
+
+  it("removes registry state when Telegram mention-mode drift forces sandbox recreation", async () => {
+    const session = createSession({ telegramConfig: { requireMention: true } });
+    session.steps.sandbox.status = "complete";
+    const { deps, calls } = createDeps({
+      getSandboxReuseState: () => "ready",
+      computeTelegramRequireMention: () => false,
+    });
+
+    await handleSandboxState({
+      ...baseOptions(deps, session),
+      resume: true,
+      sandboxName: "saved",
+    });
+
+    expect(calls.note).toHaveBeenCalledWith("  [resume] TELEGRAM_REQUIRE_MENTION changed; recreating sandbox.");
+    expect(calls.removeSandbox).toHaveBeenCalledWith("saved");
+    expect(calls.createSandbox).toHaveBeenCalled();
+  });
+
+  it("repairs not-ready resumed sandboxes before recreation", async () => {
+    const session = createSession({ sandboxName: "saved" });
+    session.steps.sandbox.status = "complete";
+    const { deps, calls } = createDeps({ getSandboxReuseState: () => "not_ready" });
+
+    await handleSandboxState({ ...baseOptions(deps, session), resume: true, sandboxName: "saved" });
+
+    expect(calls.repairSandbox).toHaveBeenCalledWith("saved");
+    expect(calls.createSandbox).toHaveBeenCalled();
+  });
+
+  it("uses recorded messaging channels on non-interactive resume", async () => {
+    const { deps, calls } = createDeps({ getRecordedMessagingChannelsForResume: vi.fn(() => ["discord"]) });
+
+    const result = await handleSandboxState(baseOptions(deps));
+
+    expect(calls.setupMessaging).not.toHaveBeenCalled();
+    expect(calls.note).toHaveBeenCalledWith("  [non-interactive] Reusing messaging channel configuration: discord");
+    expect(result.selectedMessagingChannels).toEqual(["discord"]);
+  });
+});
diff --git a/src/lib/onboard/machine/handlers/sandbox.ts b/src/lib/onboard/machine/handlers/sandbox.ts
new file mode 100644
index 0000000000..8c45215ed9
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/sandbox.ts
@@ -0,0 +1,287 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session, SessionUpdates } from "../../../state/onboard-session";
+
+export interface SandboxStateOptions<Gpu, Agent, WebSearchConfig, MessagingChannelConfig, SandboxGpuConfig> {
+  resume: boolean;
+  fresh: boolean;
+  session: Session | null;
+  sandboxName: string | null;
+  model: string;
+  provider: string;
+  nimContainer: string | null;
+  webSearchConfig: WebSearchConfig | null;
+  selectedMessagingChannels: string[];
+  fromDockerfile: string | null;
+  agent: Agent;
+  gpu: Gpu;
+  preferredInferenceApi: string | null;
+  sandboxGpuConfig: SandboxGpuConfig;
+  hermesToolGateways: string[];
+  controlUiPort: number | null;
+  rootDir: string;
+  deps: {
+    resolvePath(value: string): string;
+    agentSupportsWebSearch(agent: Agent, dockerfilePathOverride: string | null, rootDir: string): boolean;
+    note(message: string): void;
+    updateSession(mutator: (session: Session) => Session | void): Session;
+    getStoredMessagingChannelConfig(sandboxName: string | null, session: Session | null): MessagingChannelConfig | null;
+    hydrateMessagingChannelConfig(config: MessagingChannelConfig | null): MessagingChannelConfig | null;
+    messagingChannelConfigsEqual(left: MessagingChannelConfig | null, right: MessagingChannelConfig | null): boolean;
+    persistMessagingChannelConfigToSession(config: MessagingChannelConfig | null): void;
+    getSandboxReuseState(sandboxName: string | null): string;
+    computeTelegramRequireMention(): boolean | null;
+    hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean;
+    hasWechatConfigDrift(session: Session | null): boolean;
+    getSandboxHermesToolGateways(sandboxName: string): unknown;
+    normalizeHermesToolGatewaySelections(value: unknown): string[];
+    stringSetsEqual(left: string[], right: string[]): boolean;
+    removeSandboxFromRegistry(sandboxName: string): void;
+    repairRecordedSandbox(sandboxName: string | null): void;
+    ensureValidatedBraveSearchCredential(): Promise<string | null>;
+    configureWebSearch(
+      existingConfig: WebSearchConfig | null,
+      agent: Agent,
+      dockerfilePathOverride: string | null,
+    ): Promise<WebSearchConfig | null>;
+    startRecordedStep(stepName: string, updates: { provider: string; model: string }): Promise<void>;
+    getRecordedMessagingChannelsForResume(
+      resume: boolean,
+      session: Session | null,
+      sandboxName: string | null,
+    ): string[] | null;
+    getSandboxMessagingChannels(sandboxName: string): string[] | null | undefined;
+    setupMessagingChannels(agent: Agent, existingChannels: string[] | null): Promise<string[]>;
+    readMessagingChannelConfigFromEnv(): MessagingChannelConfig | null;
+    promptValidatedSandboxName(agent: Agent): Promise<string>;
+    stopStaleDashboardListenersForSandbox(sandboxes: unknown[], sandboxName: string): void;
+    listRegistrySandboxes(): { sandboxes: unknown[] };
+    createSandbox(
+      gpu: Gpu,
+      model: string,
+      provider: string,
+      preferredInferenceApi: string | null,
+      sandboxName: string,
+      webSearchConfig: WebSearchConfig | null,
+      selectedMessagingChannels: string[],
+      fromDockerfile: string | null,
+      agent: Agent,
+      controlUiPort: number | null,
+      sandboxGpuConfig: SandboxGpuConfig,
+      hermesToolGateways: string[],
+    ): Promise<string>;
+    updateSandboxRegistry(sandboxName: string, updates: Record<string, unknown>): void;
+    setDefaultSandbox(sandboxName: string): void;
+    getSandboxAgentRegistryFields(agent: Agent, agentVersionKnown: boolean): Record<string, unknown>;
+    recordStepComplete(stepName: string, updates: SessionUpdates): Promise<Session>;
+    toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
+    skippedStepMessage(stepName: string, detail?: string | null): void;
+    error(message?: string): void;
+    exitProcess(code: number): never;
+  };
+}
+
+export interface SandboxStateResult<WebSearchConfig> {
+  sandboxName: string;
+  webSearchConfig: WebSearchConfig | null;
+  selectedMessagingChannels: string[];
+  webSearchSupported: boolean;
+  session: Session | null;
+}
+
+function sameEffectiveTelegramRequireMention(left: boolean | null, right: boolean | null): boolean {
+  return (left ?? false) === (right ?? false);
+}
+
+export async function handleSandboxState<Gpu, Agent, WebSearchConfig, MessagingChannelConfig, SandboxGpuConfig>({
+  resume,
+  fresh,
+  session,
+  sandboxName,
+  model,
+  provider,
+  nimContainer,
+  webSearchConfig,
+  selectedMessagingChannels,
+  fromDockerfile,
+  agent,
+  gpu,
+  preferredInferenceApi,
+  sandboxGpuConfig,
+  hermesToolGateways,
+  controlUiPort,
+  rootDir,
+  deps,
+}: SandboxStateOptions<
+  Gpu,
+  Agent,
+  WebSearchConfig,
+  MessagingChannelConfig,
+  SandboxGpuConfig
+>): Promise<SandboxStateResult<WebSearchConfig>> {
+  const webSearchSupportProbePath = fromDockerfile ? deps.resolvePath(fromDockerfile) : null;
+  const webSearchSupported = deps.agentSupportsWebSearch(agent, webSearchSupportProbePath, rootDir);
+  if (webSearchConfig && !webSearchSupported) {
+    deps.note(
+      `  Web search is not yet supported by ${(agent as { displayName?: string } | null)?.displayName ?? "this sandbox image"}. Clearing stale config.`,
+    );
+    webSearchConfig = null;
+    if (session) session.webSearchConfig = null;
+    session = deps.updateSession((current) => {
+      current.webSearchConfig = null;
+      return current;
+    });
+  }
+
+  const storedMessagingChannelConfig = deps.getStoredMessagingChannelConfig(sandboxName, session);
+  const effectiveMessagingChannelConfig = deps.hydrateMessagingChannelConfig(storedMessagingChannelConfig);
+  const messagingChannelConfigChanged = !deps.messagingChannelConfigsEqual(
+    effectiveMessagingChannelConfig,
+    storedMessagingChannelConfig,
+  );
+  if (effectiveMessagingChannelConfig) {
+    deps.persistMessagingChannelConfigToSession(effectiveMessagingChannelConfig);
+    if (session) session.messagingChannelConfig = effectiveMessagingChannelConfig as Session["messagingChannelConfig"];
+  }
+
+  const sandboxReuseState = deps.getSandboxReuseState(sandboxName);
+  const webSearchConfigChanged = Boolean(session?.webSearchConfig) !== Boolean(webSearchConfig);
+  const currentTelegramRequireMention = deps.computeTelegramRequireMention();
+  const recordedTelegramRequireMention = session?.telegramConfig?.requireMention ?? null;
+  const telegramConfigChanged = !sameEffectiveTelegramRequireMention(
+    currentTelegramRequireMention,
+    recordedTelegramRequireMention,
+  );
+  const sandboxGpuConfigChanged = sandboxName ? deps.hasSandboxGpuDrift(sandboxName, sandboxGpuConfig) : false;
+  const wechatConfigChanged = deps.hasWechatConfigDrift(session);
+  const recordedHermesToolGateways = sandboxName
+    ? deps.normalizeHermesToolGatewaySelections(deps.getSandboxHermesToolGateways(sandboxName))
+    : [];
+  const hermesToolGatewayConfigChanged = !deps.stringSetsEqual(recordedHermesToolGateways, hermesToolGateways);
+  const resumeSandbox =
+    resume &&
+    !webSearchConfigChanged &&
+    !telegramConfigChanged &&
+    !sandboxGpuConfigChanged &&
+    !wechatConfigChanged &&
+    !messagingChannelConfigChanged &&
+    !hermesToolGatewayConfigChanged &&
+    session?.steps?.sandbox?.status === "complete" &&
+    sandboxReuseState === "ready";
+
+  if (resumeSandbox) {
+    if (webSearchConfig) deps.note("  [resume] Reusing Brave Search configuration already baked into the sandbox.");
+    selectedMessagingChannels = session?.messagingChannels ?? [];
+    deps.skippedStepMessage("sandbox", sandboxName);
+  } else {
+    if (resume && session?.steps?.sandbox?.status === "complete") {
+      if (webSearchConfigChanged) {
+        deps.note("  [resume] Web Search configuration changed; recreating sandbox.");
+        if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
+      } else if (telegramConfigChanged) {
+        deps.note("  [resume] TELEGRAM_REQUIRE_MENTION changed; recreating sandbox.");
+        if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
+      } else if (sandboxGpuConfigChanged) {
+        deps.note("  [resume] Sandbox GPU settings changed; recreating sandbox.");
+        if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
+      } else if (wechatConfigChanged) {
+        deps.note("  [resume] WeChat account metadata changed; recreating sandbox.");
+        if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
+      } else if (messagingChannelConfigChanged) {
+        deps.note("  [resume] Messaging channel configuration changed; recreating sandbox.");
+        if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
+      } else if (hermesToolGatewayConfigChanged) {
+        deps.note("  [resume] Hermes managed tool gateway selection changed; recreating sandbox.");
+        if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
+      } else if (sandboxReuseState === "not_ready") {
+        deps.note(`  [resume] Recorded sandbox '${sandboxName}' exists but is not ready; recreating it.`);
+        deps.repairRecordedSandbox(sandboxName);
+      } else {
+        deps.note("  [resume] Recorded sandbox state is unavailable; recreating it.");
+        if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
+      }
+    }
+
+    let nextWebSearchConfig = webSearchConfig;
+    if (nextWebSearchConfig) {
+      deps.note("  [resume] Revalidating Brave Search configuration for sandbox recreation.");
+      const braveApiKey = await deps.ensureValidatedBraveSearchCredential();
+      nextWebSearchConfig = braveApiKey ? webSearchConfig : null;
+      if (nextWebSearchConfig) deps.note("  [resume] Reusing Brave Search configuration.");
+    } else {
+      nextWebSearchConfig = await deps.configureWebSearch(null, agent, webSearchSupportProbePath);
+    }
+
+    await deps.startRecordedStep("sandbox", { provider, model });
+    const recordedMessagingChannels = deps.getRecordedMessagingChannelsForResume(resume, session, sandboxName);
+    if (recordedMessagingChannels) {
+      selectedMessagingChannels = recordedMessagingChannels;
+      if (selectedMessagingChannels.length > 0) {
+        deps.note(`  [non-interactive] Reusing messaging channel configuration: ${selectedMessagingChannels.join(", ")}`);
+      }
+    } else {
+      const existing = sandboxName
+        ? deps.getSandboxMessagingChannels(sandboxName) ?? session?.messagingChannels ?? null
+        : session?.messagingChannels ?? null;
+      selectedMessagingChannels = await deps.setupMessagingChannels(agent, existing);
+    }
+    const messagingChannelConfig = deps.readMessagingChannelConfigFromEnv();
+    session = deps.updateSession((current) => {
+      current.messagingChannels = selectedMessagingChannels;
+      current.messagingChannelConfig = messagingChannelConfig as Session["messagingChannelConfig"];
+      return current;
+    });
+
+    if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent);
+    if (fresh) deps.stopStaleDashboardListenersForSandbox(deps.listRegistrySandboxes().sandboxes, sandboxName);
+    sandboxName = await deps.createSandbox(
+      gpu,
+      model,
+      provider,
+      preferredInferenceApi,
+      sandboxName,
+      nextWebSearchConfig,
+      selectedMessagingChannels,
+      fromDockerfile,
+      agent,
+      controlUiPort,
+      sandboxGpuConfig,
+      hermesToolGateways,
+    );
+    webSearchConfig = nextWebSearchConfig;
+    deps.updateSandboxRegistry(sandboxName, {
+      model,
+      provider,
+      ...deps.getSandboxAgentRegistryFields(agent, !fromDockerfile),
+    });
+    deps.setDefaultSandbox(sandboxName);
+    session = await deps.recordStepComplete(
+      "sandbox",
+      deps.toSessionUpdates({
+        sandboxName,
+        provider,
+        model,
+        nimContainer,
+        webSearchConfig,
+        messagingChannelConfig,
+        hermesToolGateways,
+      }),
+    );
+  }
+
+  if (!sandboxName) {
+    deps.error("  Onboarding state is incomplete after sandbox setup.");
+    deps.exitProcess(1);
+  }
+  const completedSandboxName = sandboxName;
+  if (!completedSandboxName) throw new Error("Sandbox name is required after sandbox setup");
+
+  return {
+    sandboxName: completedSandboxName,
+    webSearchConfig,
+    selectedMessagingChannels,
+    webSearchSupported,
+    session,
+  };
+}

From 7fe9e1cba937226c582e7a8ce0fb8cc26c1e7a7d Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 23:44:11 -0700
Subject: [PATCH 11/54] refactor(cli): extract onboard agent setup handler

---
 src/lib/onboard.ts                            |  64 ++++-----
 .../machine/handlers/agent-setup.test.ts      | 122 ++++++++++++++++++
 .../onboard/machine/handlers/agent-setup.ts   |  87 +++++++++++++
 3 files changed, 242 insertions(+), 31 deletions(-)
 create mode 100644 src/lib/onboard/machine/handlers/agent-setup.test.ts
 create mode 100644 src/lib/onboard/machine/handlers/agent-setup.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index b23e7f0fa4..7462486abf 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -280,6 +280,7 @@ const { resolveSandboxImageTagFromCreateOutput } =
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
 const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
+const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup");
 const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway");
 const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
 const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference");
@@ -9651,38 +9652,39 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     selectedMessagingChannels = sandboxStateResult.selectedMessagingChannels;
     const webSearchSupported = sandboxStateResult.webSearchSupported;
 
-    if (agent) {
-      await agentOnboard.handleAgentSetup(sandboxName, model, provider, agent, resume, session, {
-        step,
-        runCaptureOpenshell,
-        openshellShellCommand,
-        openshellBinary: getOpenshellBinary(),
-        buildSandboxConfigSyncScript,
-        writeSandboxConfigSyncFile,
-        cleanupTempDir,
-        startRecordedStep,
+    const agentSetupResult = await handleAgentSetupState({
+      agent,
+      sandboxName,
+      model,
+      provider,
+      resume,
+      session,
+      hermesAuthMethod,
+      hermesToolGateways,
+      deps: {
+        handleAgentSetup: agentOnboard.handleAgentSetup,
+        agentSetupContext: () => ({
+          step,
+          runCaptureOpenshell,
+          openshellShellCommand,
+          openshellBinary: getOpenshellBinary(),
+          buildSandboxConfigSyncScript,
+          writeSandboxConfigSyncFile,
+          cleanupTempDir,
+          startRecordedStep,
+          skippedStepMessage,
+        }),
+        ensureAgentDashboardForward,
+        recordStepSkipped,
+        isOpenclawReady,
         skippedStepMessage,
-      });
-      ensureAgentDashboardForward(sandboxName, agent);
-      await recordStepSkipped("openclaw");
-    } else {
-      const resumeOpenclaw = resume && sandboxName && isOpenclawReady(sandboxName);
-      if (resumeOpenclaw) {
-        skippedStepMessage("openclaw", sandboxName);
-        await recordStepComplete(
-          "openclaw",
-          toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
-        );
-      } else {
-        await startRecordedStep("openclaw", { sandboxName, provider, model });
-        await setupOpenclaw(sandboxName, model, provider);
-        await recordStepComplete(
-          "openclaw",
-          toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
-        );
-      }
-      await recordStepSkipped("agent_setup");
-    }
+        startRecordedStep,
+        setupOpenclaw,
+        recordStepComplete,
+        toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
+      },
+    });
+    session = agentSetupResult.session;
 
     const latestSession = onboardSession.loadSession();
     const recordedPolicyPresets = Array.isArray(latestSession?.policyPresets)
diff --git a/src/lib/onboard/machine/handlers/agent-setup.test.ts b/src/lib/onboard/machine/handlers/agent-setup.test.ts
new file mode 100644
index 0000000000..fd9f1d0410
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/agent-setup.test.ts
@@ -0,0 +1,122 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session";
+import { handleAgentSetupState, type AgentSetupStateOptions } from "./agent-setup";
+
+type Agent = { name: string; displayName: string };
+
+function createDeps(overrides: Partial<AgentSetupStateOptions<Agent>["deps"]> = {}) {
+  const calls = {
+    handleAgentSetup: vi.fn(async () => undefined),
+    context: vi.fn(() => ({ ctx: true })),
+    ensureDashboard: vi.fn(() => 18789),
+    skipped: vi.fn(async () => createSession()),
+    openclawReady: vi.fn(() => false),
+    skippedMessage: vi.fn(),
+    startStep: vi.fn(async () => undefined),
+    setupOpenclaw: vi.fn(async () => undefined),
+    complete: vi.fn(async () => createSession()),
+  };
+  return {
+    calls,
+    deps: {
+      handleAgentSetup: calls.handleAgentSetup,
+      agentSetupContext: calls.context,
+      ensureAgentDashboardForward: calls.ensureDashboard,
+      recordStepSkipped: calls.skipped,
+      isOpenclawReady: calls.openclawReady,
+      skippedStepMessage: calls.skippedMessage,
+      startRecordedStep: calls.startStep,
+      setupOpenclaw: calls.setupOpenclaw,
+      recordStepComplete: calls.complete,
+      toSessionUpdates: (updates: Record<string, unknown>) => updates as SessionUpdates,
+      ...overrides,
+    },
+  };
+}
+
+function baseOptions(
+  deps: AgentSetupStateOptions<Agent>["deps"],
+  agent: Agent | null = null,
+): AgentSetupStateOptions<Agent> {
+  return {
+    agent,
+    sandboxName: "my-assistant",
+    model: "model",
+    provider: "provider",
+    resume: false,
+    session: createSession(),
+    hermesAuthMethod: null,
+    hermesToolGateways: [],
+    deps,
+  };
+}
+
+describe("handleAgentSetupState", () => {
+  it("delegates non-OpenClaw agent setup and skips openclaw", async () => {
+    const { deps, calls } = createDeps();
+    const agent = { name: "hermes", displayName: "Hermes" };
+    const session = createSession();
+
+    await handleAgentSetupState({ ...baseOptions(deps, agent), session, resume: true });
+
+    expect(calls.handleAgentSetup).toHaveBeenCalledWith(
+      "my-assistant",
+      "model",
+      "provider",
+      agent,
+      true,
+      session,
+      { ctx: true },
+    );
+    expect(calls.ensureDashboard).toHaveBeenCalledWith("my-assistant", agent);
+    expect(calls.skipped).toHaveBeenCalledWith("openclaw");
+    expect(calls.setupOpenclaw).not.toHaveBeenCalled();
+  });
+
+  it("skips OpenClaw setup on resume when OpenClaw is ready", async () => {
+    const { deps, calls } = createDeps({ isOpenclawReady: vi.fn(() => true) });
+
+    await handleAgentSetupState({ ...baseOptions(deps), resume: true });
+
+    expect(calls.skippedMessage).toHaveBeenCalledWith("openclaw", "my-assistant");
+    expect(calls.startStep).not.toHaveBeenCalled();
+    expect(calls.setupOpenclaw).not.toHaveBeenCalled();
+    expect(calls.complete).toHaveBeenCalledWith(
+      "openclaw",
+      expect.objectContaining({ sandboxName: "my-assistant", provider: "provider", model: "model" }),
+    );
+    expect(calls.skipped).toHaveBeenCalledWith("agent_setup");
+  });
+
+  it("runs OpenClaw setup and skips agent_setup for the default agent", async () => {
+    const { deps, calls } = createDeps();
+
+    await handleAgentSetupState({
+      ...baseOptions(deps),
+      hermesAuthMethod: "oauth",
+      hermesToolGateways: ["github"],
+    });
+
+    expect(calls.startStep).toHaveBeenCalledWith("openclaw", {
+      sandboxName: "my-assistant",
+      provider: "provider",
+      model: "model",
+    });
+    expect(calls.setupOpenclaw).toHaveBeenCalledWith("my-assistant", "model", "provider");
+    expect(calls.complete).toHaveBeenCalledWith(
+      "openclaw",
+      expect.objectContaining({
+        sandboxName: "my-assistant",
+        provider: "provider",
+        model: "model",
+        hermesAuthMethod: "oauth",
+        hermesToolGateways: ["github"],
+      }),
+    );
+    expect(calls.skipped).toHaveBeenCalledWith("agent_setup");
+  });
+});
diff --git a/src/lib/onboard/machine/handlers/agent-setup.ts b/src/lib/onboard/machine/handlers/agent-setup.ts
new file mode 100644
index 0000000000..40330711ad
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/agent-setup.ts
@@ -0,0 +1,87 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session, SessionUpdates } from "../../../state/onboard-session";
+
+export interface AgentSetupStateOptions<Agent> {
+  agent: Agent | null;
+  sandboxName: string;
+  model: string;
+  provider: string;
+  resume: boolean;
+  session: Session | null;
+  hermesAuthMethod: string | null;
+  hermesToolGateways: string[];
+  deps: {
+    handleAgentSetup(
+      sandboxName: string,
+      model: string,
+      provider: string,
+      agent: Agent,
+      resume: boolean,
+      session: Session | null,
+      context: unknown,
+    ): Promise<void>;
+    agentSetupContext(): unknown;
+    ensureAgentDashboardForward(sandboxName: string, agent: Agent): number;
+    recordStepSkipped(stepName: string): Promise<Session>;
+    isOpenclawReady(sandboxName: string): boolean;
+    skippedStepMessage(stepName: string, detail?: string | null): void;
+    startRecordedStep(
+      stepName: string,
+      updates: { sandboxName: string; provider: string; model: string },
+    ): Promise<void>;
+    setupOpenclaw(sandboxName: string, model: string, provider: string): Promise<void>;
+    recordStepComplete(stepName: string, updates: SessionUpdates): Promise<Session>;
+    toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
+  };
+}
+
+export interface AgentSetupStateResult {
+  session: Session | null;
+}
+
+export async function handleAgentSetupState<Agent>({
+  agent,
+  sandboxName,
+  model,
+  provider,
+  resume,
+  session,
+  hermesAuthMethod,
+  hermesToolGateways,
+  deps,
+}: AgentSetupStateOptions<Agent>): Promise<AgentSetupStateResult> {
+  if (agent) {
+    await deps.handleAgentSetup(
+      sandboxName,
+      model,
+      provider,
+      agent,
+      resume,
+      session,
+      deps.agentSetupContext(),
+    );
+    deps.ensureAgentDashboardForward(sandboxName, agent);
+    session = await deps.recordStepSkipped("openclaw");
+    return { session };
+  }
+
+  const resumeOpenclaw = resume && sandboxName && deps.isOpenclawReady(sandboxName);
+  if (resumeOpenclaw) {
+    deps.skippedStepMessage("openclaw", sandboxName);
+    session = await deps.recordStepComplete(
+      "openclaw",
+      deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
+    );
+  } else {
+    await deps.startRecordedStep("openclaw", { sandboxName, provider, model });
+    await deps.setupOpenclaw(sandboxName, model, provider);
+    session = await deps.recordStepComplete(
+      "openclaw",
+      deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
+    );
+  }
+  session = await deps.recordStepSkipped("agent_setup");
+  return { session };
+}

From b9daca0cd003aeffc535592b5468893e005515e7 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 19 May 2026 23:55:05 -0700
Subject: [PATCH 12/54] refactor(cli): extract onboard policies handler

---
 src/lib/onboard.ts                            | 114 +++--------
 .../onboard/machine/handlers/policies.test.ts | 182 +++++++++++++++++
 src/lib/onboard/machine/handlers/policies.ts  | 189 ++++++++++++++++++
 3 files changed, 401 insertions(+), 84 deletions(-)
 create mode 100644 src/lib/onboard/machine/handlers/policies.test.ts
 create mode 100644 src/lib/onboard/machine/handlers/policies.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 7462486abf..e406d8ca0c 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -282,6 +282,7 @@ const onboardSession: typeof import("./state/onboard-session") = require("./stat
 const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
 const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup");
 const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway");
+const { handlePoliciesState }: typeof import("./onboard/machine/handlers/policies") = require("./onboard/machine/handlers/policies");
 const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
 const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference");
 const { handleSandboxState }: typeof import("./onboard/machine/handlers/sandbox") = require("./onboard/machine/handlers/sandbox");
@@ -9686,97 +9687,42 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     });
     session = agentSetupResult.session;
 
-    const latestSession = onboardSession.loadSession();
-    const recordedPolicyPresets = Array.isArray(latestSession?.policyPresets)
-      ? latestSession.policyPresets
-      : null;
-    const recordedMessagingChannels = Array.isArray(latestSession?.messagingChannels)
-      ? latestSession.messagingChannels
-      : [];
-    const activeMessagingChannels = registry.getSandbox(sandboxName)?.messagingChannels;
-    verifyCompatibleEndpointSandboxSmoke({
+    const policiesResult = await handlePoliciesState({
+      resume,
       sandboxName,
       provider,
       model,
-      runOpenshell,
-      redact,
       endpointUrl,
       credentialEnv,
-      messagingChannels: Array.isArray(activeMessagingChannels) ? activeMessagingChannels : [],
+      selectedMessagingChannels,
+      webSearchConfig,
+      webSearchSupported,
+      hermesToolGateways,
       agent,
+      deps: {
+        loadSession: onboardSession.loadSession,
+        getActiveMessagingChannels: (name) => registry.getSandbox(name)?.messagingChannels,
+        verifyCompatibleEndpointSandboxSmoke: (options) =>
+          verifyCompatibleEndpointSandboxSmoke({
+            ...options,
+            runOpenshell,
+            redact,
+          }),
+        listSetupPolicyPresets: policies.listSetupPolicyPresets,
+        getAppliedPolicyPresets: policies.getAppliedPresets,
+        listCustomPolicyPresets: policies.listCustomPresets,
+        clampSetupPolicyPresetNames: policies.clampSetupPolicyPresetNames,
+        mergeRequiredHermesToolGatewayPolicyPresets,
+        arePolicyPresetsApplied,
+        skippedStepMessage,
+        startRecordedStep,
+        setupPoliciesWithSelection,
+        updateSession: onboardSession.updateSession,
+        recordStepComplete,
+        toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
+      },
     });
-    const policyPresetSupportOptions = { webSearchSupported };
-    const selectablePolicyPresetsForSupport = [
-      ...policies.listSetupPolicyPresets(sandboxName, policyPresetSupportOptions),
-      ...policies.getAppliedPresets(sandboxName).map((name) => ({ name })),
-    ];
-    const customPolicyPresetNames = new Set(
-      policies.listCustomPresets(sandboxName).map((p: { name: string }) => p.name),
-    );
-    let recordedPolicyPresetsForSupport = policies.clampSetupPolicyPresetNames(
-      recordedPolicyPresets || [],
-      selectablePolicyPresetsForSupport,
-      policyPresetSupportOptions,
-      customPolicyPresetNames,
-    );
-    if (recordedPolicyPresets) {
-      recordedPolicyPresetsForSupport = mergeRequiredHermesToolGatewayPolicyPresets(
-        recordedPolicyPresetsForSupport,
-        hermesToolGateways,
-        selectablePolicyPresetsForSupport.map((p) => p.name),
-      );
-    }
-    const recordedPolicyPresetsHaveUnsupported =
-      Array.isArray(recordedPolicyPresets) &&
-      recordedPolicyPresetsForSupport.length !== recordedPolicyPresets.length;
-    const resumePolicies =
-      resume &&
-      sandboxName &&
-      !recordedPolicyPresetsHaveUnsupported &&
-      arePolicyPresetsApplied(sandboxName, recordedPolicyPresetsForSupport);
-    if (resumePolicies) {
-      skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", "));
-      await recordStepComplete(
-        "policies",
-        toSessionUpdates({
-          sandboxName,
-          provider,
-          model,
-          policyPresets: recordedPolicyPresetsForSupport,
-        }),
-      );
-    } else {
-      await startRecordedStep("policies", {
-        sandboxName,
-        provider,
-        model,
-        policyPresets: recordedPolicyPresetsForSupport,
-      });
-      const appliedPolicyPresets = await setupPoliciesWithSelection(sandboxName, {
-        selectedPresets:
-          Array.isArray(recordedPolicyPresets)
-            ? recordedPolicyPresetsForSupport
-            : null,
-        enabledChannels:
-          selectedMessagingChannels.length > 0
-            ? selectedMessagingChannels
-            : recordedMessagingChannels,
-        webSearchConfig,
-        provider,
-        webSearchSupported,
-        hermesToolGateways,
-        onSelection: (policyPresets) => {
-          onboardSession.updateSession((current: Session) => {
-            current.policyPresets = policyPresets;
-            return current;
-          });
-        },
-      });
-      await recordStepComplete(
-        "policies",
-        toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }),
-      );
-    }
+    session = policiesResult.session;
 
     if (agent) {
       ensureAgentDashboardForward(sandboxName, agent);
diff --git a/src/lib/onboard/machine/handlers/policies.test.ts b/src/lib/onboard/machine/handlers/policies.test.ts
new file mode 100644
index 0000000000..ee315d34f0
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/policies.test.ts
@@ -0,0 +1,182 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import { createSession, type Session, type SessionUpdates } from "../../../state/onboard-session";
+import { handlePoliciesState, type PoliciesStateOptions } from "./policies";
+
+type Agent = { name: string } | null;
+type WebSearchConfig = { fetchEnabled: true };
+
+function createDeps(overrides: Partial<PoliciesStateOptions<Agent, WebSearchConfig>["deps"]> = {}) {
+  let session = createSession();
+  const calls = {
+    load: vi.fn(() => session),
+    activeChannels: vi.fn(() => ["telegram"]),
+    smoke: vi.fn(),
+    listSetup: vi.fn(() => [{ name: "npm" }, { name: "pypi" }, { name: "github" }]),
+    applied: vi.fn(() => [] as string[]),
+    custom: vi.fn(() => [] as { name: string }[]),
+    clamp: vi.fn((names: string[]) => names.filter((name) => name !== "unsupported")),
+    mergeHermes: vi.fn((selected: string[], tools: string[]) => [...selected, ...tools]),
+    appliedCheck: vi.fn(() => false),
+    skipped: vi.fn(),
+    startStep: vi.fn(async () => undefined),
+    setupPolicies: vi.fn(async () => ["npm"]),
+    updateSession: vi.fn((mutator: (value: Session) => Session | void) => {
+      session = mutator(session) ?? session;
+      return session;
+    }),
+    complete: vi.fn(async () => session),
+  };
+  return {
+    calls,
+    deps: {
+      loadSession: calls.load,
+      getActiveMessagingChannels: calls.activeChannels,
+      verifyCompatibleEndpointSandboxSmoke: calls.smoke,
+      listSetupPolicyPresets: calls.listSetup,
+      getAppliedPolicyPresets: calls.applied,
+      listCustomPolicyPresets: calls.custom,
+      clampSetupPolicyPresetNames: calls.clamp,
+      mergeRequiredHermesToolGatewayPolicyPresets: calls.mergeHermes,
+      arePolicyPresetsApplied: calls.appliedCheck,
+      skippedStepMessage: calls.skipped,
+      startRecordedStep: calls.startStep,
+      setupPoliciesWithSelection: calls.setupPolicies,
+      updateSession: calls.updateSession,
+      recordStepComplete: calls.complete,
+      toSessionUpdates: (updates: Record<string, unknown>) => updates as SessionUpdates,
+      ...overrides,
+    },
+    setSession(next: Session) {
+      session = next;
+    },
+    getSession: () => session,
+  };
+}
+
+function baseOptions(
+  deps: PoliciesStateOptions<Agent, WebSearchConfig>["deps"],
+): PoliciesStateOptions<Agent, WebSearchConfig> {
+  return {
+    resume: false,
+    sandboxName: "my-assistant",
+    provider: "provider",
+    model: "model",
+    endpointUrl: "https://example.com/v1",
+    credentialEnv: "NVIDIA_API_KEY",
+    selectedMessagingChannels: [],
+    webSearchConfig: null,
+    webSearchSupported: true,
+    hermesToolGateways: [],
+    agent: null,
+    deps,
+  };
+}
+
+describe("handlePoliciesState", () => {
+  it("runs compatible endpoint smoke before policy selection", async () => {
+    const { deps, calls } = createDeps();
+
+    await handlePoliciesState(baseOptions(deps));
+
+    expect(calls.smoke).toHaveBeenCalledWith({
+      sandboxName: "my-assistant",
+      provider: "provider",
+      model: "model",
+      endpointUrl: "https://example.com/v1",
+      credentialEnv: "NVIDIA_API_KEY",
+      messagingChannels: ["telegram"],
+      agent: null,
+    });
+    expect(calls.startStep).toHaveBeenCalledWith("policies", {
+      sandboxName: "my-assistant",
+      provider: "provider",
+      model: "model",
+      policyPresets: [],
+    });
+    expect(calls.setupPolicies).toHaveBeenCalledWith(
+      "my-assistant",
+      expect.objectContaining({
+        selectedPresets: null,
+        enabledChannels: [],
+        provider: "provider",
+        webSearchSupported: true,
+      }),
+    );
+    expect(calls.complete).toHaveBeenCalledWith(
+      "policies",
+      expect.objectContaining({ policyPresets: ["npm"] }),
+    );
+  });
+
+  it("uses recorded messaging channels when no active selection exists", async () => {
+    const session = createSession({ messagingChannels: ["slack"] });
+    const { deps, calls, setSession } = createDeps();
+    setSession(session);
+
+    await handlePoliciesState(baseOptions(deps));
+
+    expect(calls.setupPolicies).toHaveBeenCalledWith(
+      "my-assistant",
+      expect.objectContaining({ enabledChannels: ["slack"] }),
+    );
+  });
+
+  it("resumes policies when all recorded presets are already applied", async () => {
+    const session = createSession({ policyPresets: ["npm"] });
+    const { deps, calls, setSession } = createDeps({
+      arePolicyPresetsApplied: vi.fn(() => true),
+    });
+    setSession(session);
+
+    const result = await handlePoliciesState({ ...baseOptions(deps), resume: true });
+
+    expect(calls.skipped).toHaveBeenCalledWith("policies", "npm");
+    expect(calls.setupPolicies).not.toHaveBeenCalled();
+    expect(calls.complete).toHaveBeenCalledWith(
+      "policies",
+      expect.objectContaining({ policyPresets: ["npm"] }),
+    );
+    expect(result.appliedPolicyPresets).toEqual(["npm"]);
+  });
+
+  it("clamps unsupported recorded presets before interactive setup", async () => {
+    const session = createSession({ policyPresets: ["npm", "unsupported"] });
+    const { deps, calls, setSession } = createDeps();
+    setSession(session);
+
+    await handlePoliciesState(baseOptions(deps));
+
+    expect(calls.clamp).toHaveBeenCalledWith(
+      ["npm", "unsupported"],
+      expect.any(Array),
+      { webSearchSupported: true },
+      expect.any(Set),
+    );
+    expect(calls.setupPolicies).toHaveBeenCalledWith(
+      "my-assistant",
+      expect.objectContaining({ selectedPresets: ["npm"] }),
+    );
+  });
+
+  it("merges required Hermes tool gateway presets into recorded selections", async () => {
+    const session = createSession({ policyPresets: ["npm"] });
+    const { deps, calls, setSession } = createDeps();
+    setSession(session);
+
+    await handlePoliciesState({ ...baseOptions(deps), hermesToolGateways: ["github"] });
+
+    expect(calls.mergeHermes).toHaveBeenCalledWith(
+      ["npm"],
+      ["github"],
+      ["npm", "pypi", "github"],
+    );
+    expect(calls.setupPolicies).toHaveBeenCalledWith(
+      "my-assistant",
+      expect.objectContaining({ selectedPresets: ["npm", "github"] }),
+    );
+  });
+});
diff --git a/src/lib/onboard/machine/handlers/policies.ts b/src/lib/onboard/machine/handlers/policies.ts
new file mode 100644
index 0000000000..ad35931cbf
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/policies.ts
@@ -0,0 +1,189 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session, SessionUpdates } from "../../../state/onboard-session";
+
+export interface PolicyPresetEntry {
+  name: string;
+  [key: string]: unknown;
+}
+
+export interface PoliciesStateOptions<Agent, WebSearchConfig> {
+  resume: boolean;
+  sandboxName: string;
+  provider: string;
+  model: string;
+  endpointUrl: string | null;
+  credentialEnv: string | null;
+  selectedMessagingChannels: string[];
+  webSearchConfig: WebSearchConfig | null;
+  webSearchSupported: boolean;
+  hermesToolGateways: string[];
+  agent: Agent;
+  deps: {
+    loadSession(): Session | null;
+    getActiveMessagingChannels(sandboxName: string): string[] | null | undefined;
+    verifyCompatibleEndpointSandboxSmoke(options: {
+      sandboxName: string;
+      provider: string;
+      model: string;
+      endpointUrl: string | null;
+      credentialEnv: string | null;
+      messagingChannels: string[];
+      agent: Agent;
+    }): void;
+    listSetupPolicyPresets(
+      sandboxName: string,
+      options: { webSearchSupported: boolean },
+    ): PolicyPresetEntry[];
+    getAppliedPolicyPresets(sandboxName: string): string[];
+    listCustomPolicyPresets(sandboxName: string): PolicyPresetEntry[];
+    clampSetupPolicyPresetNames(
+      names: string[],
+      selectablePresets: PolicyPresetEntry[],
+      options: { webSearchSupported: boolean },
+      customPresetNames: Set<string>,
+    ): string[];
+    mergeRequiredHermesToolGatewayPolicyPresets(
+      selectedPresets: string[],
+      hermesToolGateways: string[],
+      selectablePresetNames: string[],
+    ): string[];
+    arePolicyPresetsApplied(sandboxName: string, selectedPresets: string[]): boolean;
+    skippedStepMessage(stepName: string, detail?: string | null): void;
+    startRecordedStep(
+      stepName: string,
+      updates: { sandboxName: string; provider: string; model: string; policyPresets: string[] },
+    ): Promise<void>;
+    setupPoliciesWithSelection(
+      sandboxName: string,
+      options: {
+        selectedPresets: string[] | null;
+        enabledChannels: string[];
+        webSearchConfig: WebSearchConfig | null;
+        provider: string;
+        webSearchSupported: boolean;
+        hermesToolGateways: string[];
+        onSelection: (policyPresets: string[]) => void;
+      },
+    ): Promise<string[]>;
+    updateSession(mutator: (session: Session) => Session | void): Session;
+    recordStepComplete(stepName: string, updates: SessionUpdates): Promise<Session>;
+    toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
+  };
+}
+
+export interface PoliciesStateResult {
+  session: Session | null;
+  recordedMessagingChannels: string[];
+  appliedPolicyPresets: string[];
+}
+
+export async function handlePoliciesState<Agent, WebSearchConfig>({
+  resume,
+  sandboxName,
+  provider,
+  model,
+  endpointUrl,
+  credentialEnv,
+  selectedMessagingChannels,
+  webSearchConfig,
+  webSearchSupported,
+  hermesToolGateways,
+  agent,
+  deps,
+}: PoliciesStateOptions<Agent, WebSearchConfig>): Promise<PoliciesStateResult> {
+  const latestSession = deps.loadSession();
+  const recordedPolicyPresets = Array.isArray(latestSession?.policyPresets)
+    ? latestSession.policyPresets
+    : null;
+  const recordedMessagingChannels = Array.isArray(latestSession?.messagingChannels)
+    ? latestSession.messagingChannels
+    : [];
+  const activeMessagingChannels = deps.getActiveMessagingChannels(sandboxName);
+  deps.verifyCompatibleEndpointSandboxSmoke({
+    sandboxName,
+    provider,
+    model,
+    endpointUrl,
+    credentialEnv,
+    messagingChannels: Array.isArray(activeMessagingChannels) ? activeMessagingChannels : [],
+    agent,
+  });
+
+  const policyPresetSupportOptions = { webSearchSupported };
+  const selectablePolicyPresetsForSupport = [
+    ...deps.listSetupPolicyPresets(sandboxName, policyPresetSupportOptions),
+    ...deps.getAppliedPolicyPresets(sandboxName).map((name) => ({ name })),
+  ];
+  const customPolicyPresetNames = new Set(
+    deps.listCustomPolicyPresets(sandboxName).map((preset) => preset.name),
+  );
+  let recordedPolicyPresetsForSupport = deps.clampSetupPolicyPresetNames(
+    recordedPolicyPresets || [],
+    selectablePolicyPresetsForSupport,
+    policyPresetSupportOptions,
+    customPolicyPresetNames,
+  );
+  if (recordedPolicyPresets) {
+    recordedPolicyPresetsForSupport = deps.mergeRequiredHermesToolGatewayPolicyPresets(
+      recordedPolicyPresetsForSupport,
+      hermesToolGateways,
+      selectablePolicyPresetsForSupport.map((preset) => preset.name),
+    );
+  }
+  const recordedPolicyPresetsHaveUnsupported =
+    Array.isArray(recordedPolicyPresets) &&
+    recordedPolicyPresetsForSupport.length !== recordedPolicyPresets.length;
+  const resumePolicies =
+    resume &&
+    !recordedPolicyPresetsHaveUnsupported &&
+    deps.arePolicyPresetsApplied(sandboxName, recordedPolicyPresetsForSupport);
+
+  let appliedPolicyPresets = recordedPolicyPresetsForSupport;
+  let session: Session | null;
+  if (resumePolicies) {
+    deps.skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", "));
+    session = await deps.recordStepComplete(
+      "policies",
+      deps.toSessionUpdates({
+        sandboxName,
+        provider,
+        model,
+        policyPresets: recordedPolicyPresetsForSupport,
+      }),
+    );
+  } else {
+    await deps.startRecordedStep("policies", {
+      sandboxName,
+      provider,
+      model,
+      policyPresets: recordedPolicyPresetsForSupport,
+    });
+    appliedPolicyPresets = await deps.setupPoliciesWithSelection(sandboxName, {
+      selectedPresets: Array.isArray(recordedPolicyPresets)
+        ? recordedPolicyPresetsForSupport
+        : null,
+      enabledChannels:
+        selectedMessagingChannels.length > 0
+          ? selectedMessagingChannels
+          : recordedMessagingChannels,
+      webSearchConfig,
+      provider,
+      webSearchSupported,
+      hermesToolGateways,
+      onSelection: (policyPresets) => {
+        deps.updateSession((current) => {
+          current.policyPresets = policyPresets;
+          return current;
+        });
+      },
+    });
+    session = await deps.recordStepComplete(
+      "policies",
+      deps.toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }),
+    );
+  }
+
+  return { session, recordedMessagingChannels, appliedPolicyPresets };
+}

From d6585528c5895eb8a74f89253299aaa9d6fd5913 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 00:10:05 -0700
Subject: [PATCH 13/54] refactor(cli): extract onboard finalization handler

---
 src/lib/onboard.ts                            | 124 +++++++----------
 .../machine/handlers/finalization.test.ts     | 125 ++++++++++++++++++
 .../onboard/machine/handlers/finalization.ts  |  90 +++++++++++++
 3 files changed, 262 insertions(+), 77 deletions(-)
 create mode 100644 src/lib/onboard/machine/handlers/finalization.test.ts
 create mode 100644 src/lib/onboard/machine/handlers/finalization.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index e406d8ca0c..33428431c0 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -281,6 +281,7 @@ const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
 const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
 const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup");
+const { handleFinalizationState }: typeof import("./onboard/machine/handlers/finalization") = require("./onboard/machine/handlers/finalization");
 const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway");
 const { handlePoliciesState }: typeof import("./onboard/machine/handlers/policies") = require("./onboard/machine/handlers/policies");
 const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
@@ -9724,88 +9725,57 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     });
     session = policiesResult.session;
 
-    if (agent) {
-      ensureAgentDashboardForward(sandboxName, agent);
-    }
-
-    await recordSessionComplete(
-      toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
-    );
-    completed = true;
-    // Onboarding finished successfully. Delete the legacy plaintext
-    // credentials.json only when every staged *value* was actually pushed
-    // to the gateway in this run. A successful upsert under the same
-    // env-key name with a different value (e.g. vllm-local upserting
-    // `OPENAI_API_KEY: "dummy"` while the legacy file held a real
-    // `sk-…` cloud key) does not count as a migration — the gateway
-    // never received the legacy secret, so unlinking the file would
-    // strand the user's only copy.
-    const allStagedMigrated =
-      stagedLegacyKeys.length > 0 && stagedLegacyKeys.every((k) => migratedLegacyKeys.has(k));
-    if (allStagedMigrated) {
-      removeLegacyCredentialsFile();
-    } else if (stagedLegacyKeys.length > 0) {
-      const unmigrated = stagedLegacyKeys.filter((k) => !migratedLegacyKeys.has(k));
-      console.error(
-        `  Kept ~/.nemoclaw/credentials.json: ${String(unmigrated.length)} ` +
-          `legacy credential(s) were not migrated verbatim to the gateway in this run ` +
-          `(${unmigrated.join(", ")}). Re-run onboard with the relevant ` +
-          `providers/channels enabled to migrate them, then the file is removed automatically.`,
-      );
-    }
-    // Sweep stale host files left over from older NemoClaw versions —
-    // e.g. an empty/orphaned ~/.nemoclaw/credentials.json from upgrades
-    // before the credentials-gateway move (issue #3105). Each registered
-    // entry enforces its own safety guards; this call is a no-op when
-    // every target is already clean.
-    cleanupStaleHostFiles();
-
-    // Step [8/8] policy-apply restarts the sandbox container; the OpenClaw
-    // gateway inside the new container is launched lazily (normally by the
-    // first `nemoclaw <name> connect`). Bring it up explicitly here so the
-    // verifyDeployment block below does not race the post-policy startup and
-    // surface a false "gateway crashed during startup" warning. The helper
-    // is a no-op when the gateway is already running. Fixes #3573.
-    const processRecovery: typeof import("./actions/sandbox/process-recovery") =
-      require("./actions/sandbox/process-recovery");
-    processRecovery.checkAndRecoverSandboxProcesses(sandboxName, { quiet: true });
-
-    // Post-deployment verification — confirm the full delivery chain is
-    // operational before telling the user "YOUR AGENT IS LIVE". Fixes #2342.
-    const verifyDeploymentModule: typeof import("./verify-deployment") = require("./verify-deployment");
-    const _verifyChatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${DASHBOARD_PORT}`;
-    const verifyChain = buildChain({ chatUiUrl: _verifyChatUiUrl, isWsl: isWsl(), wslHostAddress: getWslHostAddress() });
-    const verificationResult = await verifyDeploymentModule.verifyDeployment(
+    await handleFinalizationState({
       sandboxName,
-      verifyChain,
-      {
-        executeSandboxCommand: (name: string, script: string) => {
-          return executeSandboxCommandForVerification(name, script);
+      model,
+      provider,
+      nimContainer,
+      agent,
+      hermesAuthMethod,
+      hermesToolGateways,
+      selectedMessagingChannels,
+      stagedLegacyKeys,
+      migratedLegacyKeys,
+      deps: {
+        ensureAgentDashboardForward,
+        recordSessionComplete,
+        toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
+        removeLegacyCredentialsFile,
+        cleanupStaleHostFiles,
+        checkAndRecoverSandboxProcesses: (name, options) => {
+          const processRecovery: typeof import("./actions/sandbox/process-recovery") =
+            require("./actions/sandbox/process-recovery");
+          processRecovery.checkAndRecoverSandboxProcesses(name, options);
         },
-        probeHostPort: (port: number, probePath: string) => {
-          const result = runCapture(
-            ["curl", "-so", "/dev/null", "-w", "%{http_code}", "--max-time", "3",
-              `http://127.0.0.1:${port}${probePath}`],
-            { ignoreError: true },
-          );
-          return parseInt(result.trim(), 10) || 0;
+        getChatUiUrl: () => process.env.CHAT_UI_URL || `http://127.0.0.1:${DASHBOARD_PORT}`,
+        buildVerifyChain: (chatUiUrl) =>
+          buildChain({ chatUiUrl, isWsl: isWsl(), wslHostAddress: getWslHostAddress() }),
+        verifyDeployment: async (name, chain) => {
+          const verifyDeploymentModule: typeof import("./verify-deployment") = require("./verify-deployment");
+          return verifyDeploymentModule.verifyDeployment(name, chain, {
+            executeSandboxCommand: (sandbox: string, script: string) =>
+              executeSandboxCommandForVerification(sandbox, script),
+            probeHostPort: (port: number, probePath: string) => {
+              const result = runCapture(
+                ["curl", "-so", "/dev/null", "-w", "%{http_code}", "--max-time", "3", `http://127.0.0.1:${port}${probePath}`],
+                { ignoreError: true },
+              );
+              return parseInt(result.trim(), 10) || 0;
+            },
+            captureForwardList: () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }) || null,
+            getMessagingChannels: () => selectedMessagingChannels || [],
+            providerExistsInGateway: (providerName: string) => providerExistsInGateway(providerName),
+          });
         },
-        captureForwardList: () => {
-          const output = runCaptureOpenshell(["forward", "list"], { ignoreError: true });
-          return output || null;
+        formatVerificationDiagnostics: (result) => {
+          const verifyDeploymentModule: typeof import("./verify-deployment") = require("./verify-deployment");
+          return verifyDeploymentModule.formatVerificationDiagnostics(result);
         },
-        getMessagingChannels: (_name: string) => selectedMessagingChannels || [],
-        providerExistsInGateway: (providerName: string) => providerExistsInGateway(providerName),
+        printDashboard,
+        error: (message) => console.error(message),
+        log: (message) => console.log(message),
       },
-    );
-
-    // Print verification diagnostics
-    const diagLines = verifyDeploymentModule.formatVerificationDiagnostics(verificationResult);
-    for (const line of diagLines) {
-      console.log(line);
-    }
-
-    printDashboard(sandboxName, model, provider, nimContainer, agent);
+    });
   } finally {
     releaseOnboardLock();
     ONBOARD_RUNTIME = null;
diff --git a/src/lib/onboard/machine/handlers/finalization.test.ts b/src/lib/onboard/machine/handlers/finalization.test.ts
new file mode 100644
index 0000000000..a1617c4366
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/finalization.test.ts
@@ -0,0 +1,125 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it, vi } from "vitest";
+
+import { createSession, type SessionUpdates } from "../../../state/onboard-session";
+import { handleFinalizationState, type FinalizationStateOptions } from "./finalization";
+
+type Agent = { name: string } | null;
+type VerifyChain = { port: number };
+type VerificationResult = { ok: boolean };
+
+function createDeps(overrides: Partial<FinalizationStateOptions<Agent, VerifyChain, VerificationResult>["deps"]> = {}) {
+  const calls = {
+    ensureAgentDashboard: vi.fn(() => 18789),
+    complete: vi.fn(async () => createSession({ status: "complete" })),
+    removeLegacy: vi.fn(),
+    cleanupHost: vi.fn(),
+    recoverProcesses: vi.fn(),
+    getChatUiUrl: vi.fn(() => "http://127.0.0.1:18789"),
+    buildChain: vi.fn(() => ({ port: 18789 })),
+    verify: vi.fn(async () => ({ ok: true })),
+    diagnostics: vi.fn(() => ["  ✓ verified"]),
+    dashboard: vi.fn(),
+    error: vi.fn(),
+    log: vi.fn(),
+  };
+  return {
+    calls,
+    deps: {
+      ensureAgentDashboardForward: calls.ensureAgentDashboard,
+      recordSessionComplete: calls.complete,
+      toSessionUpdates: (updates: Record<string, unknown>) => updates as SessionUpdates,
+      removeLegacyCredentialsFile: calls.removeLegacy,
+      cleanupStaleHostFiles: calls.cleanupHost,
+      checkAndRecoverSandboxProcesses: calls.recoverProcesses,
+      getChatUiUrl: calls.getChatUiUrl,
+      buildVerifyChain: calls.buildChain,
+      verifyDeployment: calls.verify,
+      formatVerificationDiagnostics: calls.diagnostics,
+      printDashboard: calls.dashboard,
+      error: calls.error,
+      log: calls.log,
+      ...overrides,
+    },
+  };
+}
+
+function baseOptions(
+  deps: FinalizationStateOptions<Agent, VerifyChain, VerificationResult>["deps"],
+): FinalizationStateOptions<Agent, VerifyChain, VerificationResult> {
+  return {
+    sandboxName: "my-assistant",
+    model: "model",
+    provider: "provider",
+    nimContainer: null,
+    agent: null,
+    hermesAuthMethod: null,
+    hermesToolGateways: [],
+    selectedMessagingChannels: ["telegram"],
+    stagedLegacyKeys: [],
+    migratedLegacyKeys: new Set(),
+    deps,
+  };
+}
+
+describe("handleFinalizationState", () => {
+  it("completes the session, verifies deployment, and prints the dashboard", async () => {
+    const { deps, calls } = createDeps();
+
+    const result = await handleFinalizationState(baseOptions(deps));
+
+    expect(calls.complete).toHaveBeenCalledWith({
+      sandboxName: "my-assistant",
+      provider: "provider",
+      model: "model",
+      hermesAuthMethod: null,
+      hermesToolGateways: [],
+    });
+    expect(calls.cleanupHost).toHaveBeenCalledOnce();
+    expect(calls.recoverProcesses).toHaveBeenCalledWith("my-assistant", { quiet: true });
+    expect(calls.buildChain).toHaveBeenCalledWith("http://127.0.0.1:18789");
+    expect(calls.verify).toHaveBeenCalledWith("my-assistant", { port: 18789 });
+    expect(calls.log).toHaveBeenCalledWith("  ✓ verified");
+    expect(calls.dashboard).toHaveBeenCalledWith("my-assistant", "model", "provider", null, null);
+    expect(result.verificationDiagnostics).toEqual(["  ✓ verified"]);
+  });
+
+  it("ensures agent dashboard forwarding before completion for non-OpenClaw agents", async () => {
+    const { deps, calls } = createDeps();
+    const agent = { name: "hermes" };
+
+    await handleFinalizationState({ ...baseOptions(deps), agent });
+
+    expect(calls.ensureAgentDashboard).toHaveBeenCalledWith("my-assistant", agent);
+    expect(calls.dashboard).toHaveBeenCalledWith("my-assistant", "model", "provider", null, agent);
+  });
+
+  it("removes legacy credentials only when all staged values migrated", async () => {
+    const { deps, calls } = createDeps();
+
+    await handleFinalizationState({
+      ...baseOptions(deps),
+      stagedLegacyKeys: ["NVIDIA_API_KEY", "SLACK_BOT_TOKEN"],
+      migratedLegacyKeys: new Set(["NVIDIA_API_KEY", "SLACK_BOT_TOKEN"]),
+    });
+
+    expect(calls.removeLegacy).toHaveBeenCalledOnce();
+    expect(calls.error).not.toHaveBeenCalled();
+  });
+
+  it("keeps legacy credentials and warns when migration is incomplete", async () => {
+    const { deps, calls } = createDeps();
+
+    const result = await handleFinalizationState({
+      ...baseOptions(deps),
+      stagedLegacyKeys: ["NVIDIA_API_KEY", "SLACK_BOT_TOKEN"],
+      migratedLegacyKeys: new Set(["NVIDIA_API_KEY"]),
+    });
+
+    expect(calls.removeLegacy).not.toHaveBeenCalled();
+    expect(calls.error).toHaveBeenCalledWith(expect.stringContaining("SLACK_BOT_TOKEN"));
+    expect(result.unmigratedLegacyKeys).toEqual(["SLACK_BOT_TOKEN"]);
+  });
+});
diff --git a/src/lib/onboard/machine/handlers/finalization.ts b/src/lib/onboard/machine/handlers/finalization.ts
new file mode 100644
index 0000000000..0cdd3735ca
--- /dev/null
+++ b/src/lib/onboard/machine/handlers/finalization.ts
@@ -0,0 +1,90 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session, SessionUpdates } from "../../../state/onboard-session";
+
+export interface FinalizationStateOptions<Agent, VerifyChain, VerificationResult> {
+  sandboxName: string;
+  model: string;
+  provider: string;
+  nimContainer: string | null;
+  agent: Agent;
+  hermesAuthMethod: string | null;
+  hermesToolGateways: string[];
+  selectedMessagingChannels: string[];
+  stagedLegacyKeys: readonly string[];
+  migratedLegacyKeys: ReadonlySet<string>;
+  deps: {
+    ensureAgentDashboardForward(sandboxName: string, agent: NonNullable<Agent>): number;
+    recordSessionComplete(updates: SessionUpdates): Promise<Session>;
+    toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
+    removeLegacyCredentialsFile(): void;
+    cleanupStaleHostFiles(): void;
+    checkAndRecoverSandboxProcesses(sandboxName: string, options: { quiet: boolean }): void;
+    getChatUiUrl(): string;
+    buildVerifyChain(chatUiUrl: string): VerifyChain;
+    verifyDeployment(sandboxName: string, chain: VerifyChain): Promise<VerificationResult>;
+    formatVerificationDiagnostics(result: VerificationResult): string[];
+    printDashboard(
+      sandboxName: string,
+      model: string,
+      provider: string,
+      nimContainer: string | null,
+      agent: Agent,
+    ): void;
+    error(message?: string): void;
+    log(message?: string): void;
+  };
+}
+
+export interface FinalizationStateResult {
+  session: Session;
+  unmigratedLegacyKeys: string[];
+  verificationDiagnostics: string[];
+}
+
+export async function handleFinalizationState<Agent, VerifyChain, VerificationResult>({
+  sandboxName,
+  model,
+  provider,
+  nimContainer,
+  agent,
+  hermesAuthMethod,
+  hermesToolGateways,
+  selectedMessagingChannels: _selectedMessagingChannels,
+  stagedLegacyKeys,
+  migratedLegacyKeys,
+  deps,
+}: FinalizationStateOptions<Agent, VerifyChain, VerificationResult>): Promise<FinalizationStateResult> {
+  if (agent) deps.ensureAgentDashboardForward(sandboxName, agent as NonNullable<Agent>);
+
+  const session = await deps.recordSessionComplete(
+    deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
+  );
+
+  const allStagedMigrated =
+    stagedLegacyKeys.length > 0 && stagedLegacyKeys.every((key) => migratedLegacyKeys.has(key));
+  const unmigratedLegacyKeys = stagedLegacyKeys.filter((key) => !migratedLegacyKeys.has(key));
+  if (allStagedMigrated) {
+    deps.removeLegacyCredentialsFile();
+  } else if (stagedLegacyKeys.length > 0) {
+    deps.error(
+      `  Kept ~/.nemoclaw/credentials.json: ${String(unmigratedLegacyKeys.length)} ` +
+        `legacy credential(s) were not migrated verbatim to the gateway in this run ` +
+        `(${unmigratedLegacyKeys.join(", ")}). Re-run onboard with the relevant ` +
+        `providers/channels enabled to migrate them, then the file is removed automatically.`,
+    );
+  }
+
+  deps.cleanupStaleHostFiles();
+  deps.checkAndRecoverSandboxProcesses(sandboxName, { quiet: true });
+
+  const verifyChain = deps.buildVerifyChain(deps.getChatUiUrl());
+  const verificationResult = await deps.verifyDeployment(sandboxName, verifyChain);
+  const verificationDiagnostics = deps.formatVerificationDiagnostics(verificationResult);
+  for (const line of verificationDiagnostics) deps.log(line);
+
+  deps.printDashboard(sandboxName, model, provider, nimContainer, agent);
+
+  return { session, unmigratedLegacyKeys, verificationDiagnostics };
+}

From 4385d20b0bb38db885b1eb5008cd18868d82adc5 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 00:28:13 -0700
Subject: [PATCH 14/54] refactor(cli): route agent setup session writes through
 context

---
 src/lib/agent/onboard.test.ts | 12 ++++++++++++
 src/lib/agent/onboard.ts      | 19 +++++++++++++------
 src/lib/onboard.ts            |  3 +++
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/src/lib/agent/onboard.test.ts b/src/lib/agent/onboard.test.ts
index b71a82a83e..fd82e6d1ce 100644
--- a/src/lib/agent/onboard.test.ts
+++ b/src/lib/agent/onboard.test.ts
@@ -1,6 +1,9 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+import fs from "node:fs";
+import path from "node:path";
+
 import { describe, it, expect, beforeEach, afterEach, afterAll, vi } from "vitest";
 // Import from compiled dist/ so coverage is attributed correctly.
 import {
@@ -129,6 +132,15 @@ describe("printDashboardUi — regression for #2078 (port 8642 is not a chat UI)
   });
 });
 
+describe("agent setup session boundaries", () => {
+  it("does not write onboard session state directly", () => {
+    const source = fs.readFileSync(path.join(__dirname, "onboard.ts"), "utf8");
+
+    expect(source).not.toContain("../state/onboard-session");
+    expect(source).not.toMatch(/onboardSession\.markStep/);
+  });
+});
+
 describe("handleAgentSetup guards", () => {
   it("accepts an executable configured binary path when PATH lookup is empty", () => {
     let script = "";
diff --git a/src/lib/agent/onboard.ts b/src/lib/agent/onboard.ts
index f08c32b9c6..b2ee0e7bb1 100644
--- a/src/lib/agent/onboard.ts
+++ b/src/lib/agent/onboard.ts
@@ -13,7 +13,6 @@ import { dockerBuild, dockerImageInspect } from "../adapters/docker";
 import { getAgentBranding } from "../cli/branding";
 import { getProviderSelectionConfig } from "../inference/config";
 import type { JsonObject as LooseObject } from "../core/json-types";
-import * as onboardSession from "../state/onboard-session";
 import { ROOT, redact, run, shellQuote } from "../runner";
 import {
   buildLocalBaseTag,
@@ -32,6 +31,8 @@ export interface OnboardContext {
   writeSandboxConfigSyncFile: (script: string) => string;
   cleanupTempDir: (file: string, prefix: string) => void;
   startRecordedStep: (stepName: string, updates: LooseObject) => Promise<void>;
+  recordStepComplete: (stepName: string, updates: LooseObject) => Promise<unknown>;
+  recordStepFailed: (stepName: string, message: string | null) => Promise<unknown>;
   skippedStepMessage: (stepName: string, sandboxName: string) => void;
 }
 
@@ -350,13 +351,14 @@ export function collectHermesStartupDiagnostics(
 /**
  * Record and print an agent setup failure before exiting the onboarding flow.
  */
-function failAgentSetup(
+async function failAgentSetup(
   sandboxName: string,
   agent: AgentDefinition,
   message: string,
   details: string[] = [],
-): never {
-  onboardSession.markStepFailed(
+  recordStepFailed: OnboardContext["recordStepFailed"],
+): Promise<never> {
+  await recordStepFailed(
     "agent_setup",
     details.length > 0 ? `${message}\n${details.join("\n")}` : message,
   );
@@ -406,6 +408,8 @@ export async function handleAgentSetup(
     writeSandboxConfigSyncFile,
     cleanupTempDir,
     startRecordedStep,
+    recordStepComplete,
+    recordStepFailed,
     skippedStepMessage,
   } = ctx;
 
@@ -418,7 +422,7 @@ export async function handleAgentSetup(
       );
       if (isHealthProbeOk(result)) {
         skippedStepMessage("agent_setup", sandboxName);
-        onboardSession.markStepComplete("agent_setup", { sandboxName, provider, model });
+        await recordStepComplete("agent_setup", { sandboxName, provider, model });
         return;
       }
     }
@@ -433,6 +437,8 @@ export async function handleAgentSetup(
       sandboxName,
       agent,
       describeAgentBinaryFailure(sandboxName, agent, binaryAvailability),
+      [],
+      recordStepFailed,
     );
   }
 
@@ -486,13 +492,14 @@ export async function handleAgentSetup(
         agent,
         `${agent.displayName} gateway did not respond within ${timeoutSecs}s`,
         diagnostics,
+        recordStepFailed,
       );
     }
   } else {
     console.log(`  \u2713 ${agent.displayName} configured inside sandbox`);
   }
 
-  onboardSession.markStepComplete("agent_setup", { sandboxName, provider, model });
+  await recordStepComplete("agent_setup", { sandboxName, provider, model });
 }
 
 /**
diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 33428431c0..e80523d35c 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -9674,6 +9674,9 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
           writeSandboxConfigSyncFile,
           cleanupTempDir,
           startRecordedStep,
+          recordStepComplete,
+          recordStepFailed: (stepName: string, message: string | null) =>
+            getOnboardRuntime().markStepFailed(stepName, message),
           skippedStepMessage,
         }),
         ensureAgentDashboardForward,

From 98ac89edd7cafb327cbd75a0a7690d4eae173cfd Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 00:44:17 -0700
Subject: [PATCH 15/54] refactor(cli): emit resume skip repair events

---
 src/lib/onboard.ts                            | 26 ++++++++++++++++
 .../machine/handlers/agent-setup.test.ts      |  6 ++++
 .../onboard/machine/handlers/agent-setup.ts   |  2 ++
 .../onboard/machine/handlers/gateway.test.ts  | 10 +++++++
 src/lib/onboard/machine/handlers/gateway.ts   |  3 ++
 .../onboard/machine/handlers/policies.test.ts |  6 ++++
 src/lib/onboard/machine/handlers/policies.ts  |  5 ++++
 .../machine/handlers/preflight.test.ts        |  5 ++++
 src/lib/onboard/machine/handlers/preflight.ts |  2 ++
 .../handlers/provider-inference.test.ts       | 22 ++++++++++++++
 .../machine/handlers/provider-inference.ts    | 30 +++++++++++++++++++
 .../onboard/machine/handlers/sandbox.test.ts  | 16 ++++++++++
 src/lib/onboard/machine/handlers/sandbox.ts   | 14 +++++++++
 13 files changed, 147 insertions(+)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index e80523d35c..48c41740cc 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -9059,6 +9059,24 @@ async function recordStepSkipped(stepName: string): Promise<Session> {
   return getOnboardRuntime().markStepSkipped(stepName);
 }
 
+async function recordStateSkipped(
+  state: import("./onboard/machine/types").OnboardMachineState,
+  metadata: Record<string, unknown> | null = null,
+): Promise<Session> {
+  return getOnboardRuntime().markSkipped(state, metadata);
+}
+
+async function recordRepairEvent(
+  type: "state.repair.started" | "state.repair.completed" | "state.repair.failed",
+  options: {
+    state?: import("./onboard/machine/types").OnboardMachineState | null;
+    error?: string | null;
+    metadata?: Record<string, unknown> | null;
+  } = {},
+): Promise<Session> {
+  return getOnboardRuntime().emitRepairEvent(type, options);
+}
+
 async function recordSessionComplete(updates: SessionUpdates = {}): Promise<Session> {
   return getOnboardRuntime().completeSession(updates);
 }
@@ -9430,6 +9448,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         resolveSandboxGpuConfig,
         validateSandboxGpuPreflight,
         skippedStepMessage,
+        recordStateSkipped,
         startRecordedStep,
         recordStepComplete,
         updateSession: onboardSession.updateSession,
@@ -9497,6 +9516,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         retireLegacyGatewayForDockerDriverUpgrade,
         destroyGatewayRuntimeForGpuReuse: () => destroyGateway(() => undefined, () => false),
         skippedStepMessage,
+        recordStateSkipped,
         note,
         startRecordedStep,
         startGateway,
@@ -9552,6 +9572,8 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         recordStepComplete,
         toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
         skippedStepMessage,
+        recordStateSkipped,
+        recordRepairEvent,
         hydrateCredentialEnv,
         repairLocalInferenceSystemdOverrideOrExit,
         isNonInteractive,
@@ -9644,6 +9666,8 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         recordStepComplete,
         toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
         skippedStepMessage,
+        recordStateSkipped,
+        recordRepairEvent,
         error: (message) => console.error(message),
         exitProcess: (code) => process.exit(code),
       },
@@ -9683,6 +9707,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         recordStepSkipped,
         isOpenclawReady,
         skippedStepMessage,
+        recordStateSkipped,
         startRecordedStep,
         setupOpenclaw,
         recordStepComplete,
@@ -9719,6 +9744,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
         mergeRequiredHermesToolGatewayPolicyPresets,
         arePolicyPresetsApplied,
         skippedStepMessage,
+        recordStateSkipped,
         startRecordedStep,
         setupPoliciesWithSelection,
         updateSession: onboardSession.updateSession,
diff --git a/src/lib/onboard/machine/handlers/agent-setup.test.ts b/src/lib/onboard/machine/handlers/agent-setup.test.ts
index fd9f1d0410..99255f622d 100644
--- a/src/lib/onboard/machine/handlers/agent-setup.test.ts
+++ b/src/lib/onboard/machine/handlers/agent-setup.test.ts
@@ -16,6 +16,7 @@ function createDeps(overrides: Partial<AgentSetupStateOptions<Agent>["deps"]> =
     skipped: vi.fn(async () => createSession()),
     openclawReady: vi.fn(() => false),
     skippedMessage: vi.fn(),
+    recordSkip: vi.fn(async () => createSession()),
     startStep: vi.fn(async () => undefined),
     setupOpenclaw: vi.fn(async () => undefined),
     complete: vi.fn(async () => createSession()),
@@ -29,6 +30,7 @@ function createDeps(overrides: Partial<AgentSetupStateOptions<Agent>["deps"]> =
       recordStepSkipped: calls.skipped,
       isOpenclawReady: calls.openclawReady,
       skippedStepMessage: calls.skippedMessage,
+      recordStateSkipped: calls.recordSkip,
       startRecordedStep: calls.startStep,
       setupOpenclaw: calls.setupOpenclaw,
       recordStepComplete: calls.complete,
@@ -83,6 +85,10 @@ describe("handleAgentSetupState", () => {
     await handleAgentSetupState({ ...baseOptions(deps), resume: true });
 
     expect(calls.skippedMessage).toHaveBeenCalledWith("openclaw", "my-assistant");
+    expect(calls.recordSkip).toHaveBeenCalledWith("openclaw", {
+      reason: "resume",
+      sandboxName: "my-assistant",
+    });
     expect(calls.startStep).not.toHaveBeenCalled();
     expect(calls.setupOpenclaw).not.toHaveBeenCalled();
     expect(calls.complete).toHaveBeenCalledWith(
diff --git a/src/lib/onboard/machine/handlers/agent-setup.ts b/src/lib/onboard/machine/handlers/agent-setup.ts
index 40330711ad..a24a6e9811 100644
--- a/src/lib/onboard/machine/handlers/agent-setup.ts
+++ b/src/lib/onboard/machine/handlers/agent-setup.ts
@@ -27,6 +27,7 @@ export interface AgentSetupStateOptions<Agent> {
     recordStepSkipped(stepName: string): Promise<Session>;
     isOpenclawReady(sandboxName: string): boolean;
     skippedStepMessage(stepName: string, detail?: string | null): void;
+    recordStateSkipped(state: "openclaw", metadata?: Record<string, unknown> | null): Promise<Session>;
     startRecordedStep(
       stepName: string,
       updates: { sandboxName: string; provider: string; model: string },
@@ -70,6 +71,7 @@ export async function handleAgentSetupState<Agent>({
   const resumeOpenclaw = resume && sandboxName && deps.isOpenclawReady(sandboxName);
   if (resumeOpenclaw) {
     deps.skippedStepMessage("openclaw", sandboxName);
+    await deps.recordStateSkipped("openclaw", { reason: "resume", sandboxName });
     session = await deps.recordStepComplete(
       "openclaw",
       deps.toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }),
diff --git a/src/lib/onboard/machine/handlers/gateway.test.ts b/src/lib/onboard/machine/handlers/gateway.test.ts
index 266ba10360..eceee5c11f 100644
--- a/src/lib/onboard/machine/handlers/gateway.test.ts
+++ b/src/lib/onboard/machine/handlers/gateway.test.ts
@@ -25,6 +25,7 @@ function createDeps(overrides: Partial<GatewayStateOptions<Gpu>["deps"]> = {}) {
     retireLegacy: vi.fn(),
     destroyGpuRuntime: vi.fn(() => true),
     skipped: vi.fn(),
+    recordSkip: vi.fn(async () => createSession()),
     note: vi.fn(),
     startStep: vi.fn(async () => undefined),
     startGateway: vi.fn(async () => undefined),
@@ -51,6 +52,7 @@ function createDeps(overrides: Partial<GatewayStateOptions<Gpu>["deps"]> = {}) {
       retireLegacyGatewayForDockerDriverUpgrade: calls.retireLegacy,
       destroyGatewayRuntimeForGpuReuse: calls.destroyGpuRuntime,
       skippedStepMessage: calls.skipped,
+      recordStateSkipped: calls.recordSkip,
       note: calls.note,
       startRecordedStep: calls.startStep,
       startGateway: calls.startGateway,
@@ -99,6 +101,10 @@ describe("handleGatewayState", () => {
     await handleGatewayState(baseOptions(deps, "healthy"));
 
     expect(calls.skipped).toHaveBeenCalledWith("gateway", "running", "reuse");
+    expect(calls.recordSkip).toHaveBeenCalledWith("gateway", {
+      reason: "reuse",
+      reuseState: "healthy",
+    });
     expect(calls.note).toHaveBeenCalledWith("  Reusing healthy NemoClaw gateway.");
     expect(calls.startGateway).not.toHaveBeenCalled();
     expect(calls.complete).toHaveBeenCalledWith("gateway");
@@ -112,6 +118,10 @@ describe("handleGatewayState", () => {
     await handleGatewayState({ ...baseOptions(deps, "healthy", session), resume: true });
 
     expect(calls.skipped).toHaveBeenCalledWith("gateway", "running");
+    expect(calls.recordSkip).toHaveBeenCalledWith("gateway", {
+      reason: "resume",
+      reuseState: "healthy",
+    });
     expect(calls.startGateway).not.toHaveBeenCalled();
   });
 
diff --git a/src/lib/onboard/machine/handlers/gateway.ts b/src/lib/onboard/machine/handlers/gateway.ts
index 026c26e1b4..48fab3c4e7 100644
--- a/src/lib/onboard/machine/handlers/gateway.ts
+++ b/src/lib/onboard/machine/handlers/gateway.ts
@@ -51,6 +51,7 @@ export interface GatewayStateOptions<Gpu> {
       detail?: string | null,
       reason?: "resume" | "reuse",
     ): void;
+    recordStateSkipped(state: "gateway", metadata?: Record<string, unknown> | null): Promise<Session>;
     note(message: string): void;
     startRecordedStep(stepName: string): Promise<void>;
     startGateway(gpu: Gpu, options: { gpuPassthrough: boolean }): Promise<void>;
@@ -147,9 +148,11 @@ export async function handleGatewayState<Gpu>({
   const resumeGateway = resume && session?.steps?.gateway?.status === "complete" && canReuseHealthyGateway;
   if (resumeGateway) {
     deps.skippedStepMessage("gateway", "running");
+    await deps.recordStateSkipped("gateway", { reason: "resume", reuseState: gatewayReuseState });
     session = await deps.recordStepComplete("gateway");
   } else if (!resume && canReuseHealthyGateway) {
     deps.skippedStepMessage("gateway", "running", "reuse");
+    await deps.recordStateSkipped("gateway", { reason: "reuse", reuseState: gatewayReuseState });
     deps.note("  Reusing healthy NemoClaw gateway.");
     session = await deps.recordStepComplete("gateway");
   } else {
diff --git a/src/lib/onboard/machine/handlers/policies.test.ts b/src/lib/onboard/machine/handlers/policies.test.ts
index ee315d34f0..56782d1751 100644
--- a/src/lib/onboard/machine/handlers/policies.test.ts
+++ b/src/lib/onboard/machine/handlers/policies.test.ts
@@ -22,6 +22,7 @@ function createDeps(overrides: Partial<PoliciesStateOptions<Agent, WebSearchConf
     mergeHermes: vi.fn((selected: string[], tools: string[]) => [...selected, ...tools]),
     appliedCheck: vi.fn(() => false),
     skipped: vi.fn(),
+    recordSkip: vi.fn(async () => session),
     startStep: vi.fn(async () => undefined),
     setupPolicies: vi.fn(async () => ["npm"]),
     updateSession: vi.fn((mutator: (value: Session) => Session | void) => {
@@ -43,6 +44,7 @@ function createDeps(overrides: Partial<PoliciesStateOptions<Agent, WebSearchConf
       mergeRequiredHermesToolGatewayPolicyPresets: calls.mergeHermes,
       arePolicyPresetsApplied: calls.appliedCheck,
       skippedStepMessage: calls.skipped,
+      recordStateSkipped: calls.recordSkip,
       startRecordedStep: calls.startStep,
       setupPoliciesWithSelection: calls.setupPolicies,
       updateSession: calls.updateSession,
@@ -135,6 +137,10 @@ describe("handlePoliciesState", () => {
     const result = await handlePoliciesState({ ...baseOptions(deps), resume: true });
 
     expect(calls.skipped).toHaveBeenCalledWith("policies", "npm");
+    expect(calls.recordSkip).toHaveBeenCalledWith("policies", {
+      reason: "resume",
+      policyPresets: ["npm"],
+    });
     expect(calls.setupPolicies).not.toHaveBeenCalled();
     expect(calls.complete).toHaveBeenCalledWith(
       "policies",
diff --git a/src/lib/onboard/machine/handlers/policies.ts b/src/lib/onboard/machine/handlers/policies.ts
index ad35931cbf..cbc452d23e 100644
--- a/src/lib/onboard/machine/handlers/policies.ts
+++ b/src/lib/onboard/machine/handlers/policies.ts
@@ -51,6 +51,7 @@ export interface PoliciesStateOptions<Agent, WebSearchConfig> {
     ): string[];
     arePolicyPresetsApplied(sandboxName: string, selectedPresets: string[]): boolean;
     skippedStepMessage(stepName: string, detail?: string | null): void;
+    recordStateSkipped(state: "policies", metadata?: Record<string, unknown> | null): Promise<Session>;
     startRecordedStep(
       stepName: string,
       updates: { sandboxName: string; provider: string; model: string; policyPresets: string[] },
@@ -144,6 +145,10 @@ export async function handlePoliciesState<Agent, WebSearchConfig>({
   let session: Session | null;
   if (resumePolicies) {
     deps.skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", "));
+    await deps.recordStateSkipped("policies", {
+      reason: "resume",
+      policyPresets: recordedPolicyPresetsForSupport,
+    });
     session = await deps.recordStepComplete(
       "policies",
       deps.toSessionUpdates({
diff --git a/src/lib/onboard/machine/handlers/preflight.test.ts b/src/lib/onboard/machine/handlers/preflight.test.ts
index fa4b859915..8916124ec8 100644
--- a/src/lib/onboard/machine/handlers/preflight.test.ts
+++ b/src/lib/onboard/machine/handlers/preflight.test.ts
@@ -50,6 +50,7 @@ function createDeps(overrides: Partial<PreflightStateOptions<Gpu, SandboxEntry,
       }),
       validateSandboxGpuPreflight: vi.fn(),
       skippedStepMessage: vi.fn(),
+      recordStateSkipped: vi.fn(async () => session),
       startRecordedStep: vi.fn(async () => undefined),
       recordStepComplete: vi.fn(async () => session),
       updateSession: vi.fn((mutator: (value: Session) => Session | void) => {
@@ -125,6 +126,10 @@ describe("handlePreflightState", () => {
     });
 
     expect(harness.deps.skippedStepMessage).toHaveBeenCalledWith("preflight", "cached");
+    expect(harness.deps.recordStateSkipped).toHaveBeenCalledWith("preflight", {
+      reason: "resume",
+      validation: "gpu-cdi",
+    });
     expect(harness.deps.detectGpu).toHaveBeenCalledOnce();
     expect(harness.deps.runPreflight).not.toHaveBeenCalled();
     expect(harness.deps.startRecordedStep).not.toHaveBeenCalled();
diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts
index cc5bd6633d..e5d91a1c06 100644
--- a/src/lib/onboard/machine/handlers/preflight.ts
+++ b/src/lib/onboard/machine/handlers/preflight.ts
@@ -48,6 +48,7 @@ export interface PreflightStateOptions<
     ): Config;
     validateSandboxGpuPreflight(config: Config): void;
     skippedStepMessage(stepName: string, detail?: string | null): void;
+    recordStateSkipped(state: "preflight", metadata?: Record<string, unknown> | null): Promise<Session>;
     startRecordedStep(stepName: string): Promise<void>;
     recordStepComplete(stepName: string): Promise<Session>;
     updateSession(mutator: (session: Session) => Session | void): Session;
@@ -106,6 +107,7 @@ export async function handlePreflightState<
   let gpu: Gpu;
   if (resumePreflight) {
     deps.skippedStepMessage("preflight", "cached");
+    await deps.recordStateSkipped("preflight", { reason: "resume", validation: "gpu-cdi" });
     gpu = deps.detectGpu();
     const resumeOptedOutGpuPassthrough = noGpu || (!gpuRequested && session?.gpuPassthrough === false);
     deps.assertCdiNvidiaGpuSpecPresent(deps.assessHost(), resumeOptedOutGpuPassthrough);
diff --git a/src/lib/onboard/machine/handlers/provider-inference.test.ts b/src/lib/onboard/machine/handlers/provider-inference.test.ts
index bec7ea47a3..1af9c81321 100644
--- a/src/lib/onboard/machine/handlers/provider-inference.test.ts
+++ b/src/lib/onboard/machine/handlers/provider-inference.test.ts
@@ -32,6 +32,8 @@ function createDeps(overrides: Partial<ProviderInferenceStateOptions<Gpu, Agent,
     startStep: vi.fn(async () => undefined),
     complete: vi.fn(async () => createSession()),
     skipped: vi.fn(),
+    recordSkip: vi.fn(async () => createSession()),
+    repairEvent: vi.fn(async () => createSession()),
     hydrate: vi.fn(),
     repair: vi.fn(),
     routeReady: vi.fn(() => false),
@@ -56,6 +58,8 @@ function createDeps(overrides: Partial<ProviderInferenceStateOptions<Gpu, Agent,
       recordStepComplete: calls.complete,
       toSessionUpdates: (updates: Record<string, unknown>) => updates as SessionUpdates,
       skippedStepMessage: calls.skipped,
+      recordStateSkipped: calls.recordSkip,
+      recordRepairEvent: calls.repairEvent,
       hydrateCredentialEnv: calls.hydrate,
       repairLocalInferenceSystemdOverrideOrExit: calls.repair,
       isNonInteractive: () => true,
@@ -163,9 +167,27 @@ describe("handleProviderInferenceState", () => {
     expect(calls.setupNim).not.toHaveBeenCalled();
     expect(calls.setupInference).not.toHaveBeenCalled();
     expect(calls.skipped).toHaveBeenCalledWith("provider_selection", "ollama-local / llama3.1");
+    expect(calls.recordSkip).toHaveBeenCalledWith("provider_selection", {
+      reason: "resume",
+      provider: "ollama-local",
+      model: "llama3.1",
+    });
     expect(calls.hydrate).toHaveBeenCalledWith(null);
+    expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.started", {
+      state: "provider_selection",
+      metadata: { repair: "ollama-systemd-loopback" },
+    });
     expect(calls.repair).toHaveBeenCalledWith("ollama-local", deps.isNonInteractive);
+    expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.completed", {
+      state: "provider_selection",
+      metadata: { repair: "ollama-systemd-loopback" },
+    });
     expect(calls.skipped).toHaveBeenCalledWith("inference", "ollama-local / llama3.1");
+    expect(calls.recordSkip).toHaveBeenCalledWith("inference", {
+      reason: "resume",
+      provider: "ollama-local",
+      model: "llama3.1",
+    });
     expect(result).toMatchObject({ provider: "ollama-local", model: "llama3.1" });
   });
 
diff --git a/src/lib/onboard/machine/handlers/provider-inference.ts b/src/lib/onboard/machine/handlers/provider-inference.ts
index 525b94a059..c73aa5492f 100644
--- a/src/lib/onboard/machine/handlers/provider-inference.ts
+++ b/src/lib/onboard/machine/handlers/provider-inference.ts
@@ -57,6 +57,14 @@ export interface ProviderInferenceStateOptions<Gpu, Agent, Host> {
     recordStepComplete(stepName: string, updates: SessionUpdates): Promise<Session>;
     toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
     skippedStepMessage(stepName: string, detail?: string | null): void;
+    recordStateSkipped(
+      state: "provider_selection" | "inference",
+      metadata?: Record<string, unknown> | null,
+    ): Promise<Session>;
+    recordRepairEvent(
+      type: "state.repair.started" | "state.repair.completed" | "state.repair.failed",
+      options?: { state?: "provider_selection" | "inference"; error?: string | null; metadata?: Record<string, unknown> | null },
+    ): Promise<Session>;
     hydrateCredentialEnv(credentialEnv: string | null): void;
     repairLocalInferenceSystemdOverrideOrExit(provider: string | null, isNonInteractive: () => boolean): void;
     isNonInteractive(): boolean;
@@ -144,8 +152,25 @@ export async function handleProviderInferenceState<Gpu, Agent, Host>({
       typeof model === "string";
     if (resumeProviderSelection) {
       deps.skippedStepMessage("provider_selection", `${provider} / ${model}`);
+      await deps.recordStateSkipped("provider_selection", {
+        reason: "resume",
+        provider,
+        model,
+      });
       deps.hydrateCredentialEnv(credentialEnv);
+      if (provider === "ollama-local") {
+        await deps.recordRepairEvent("state.repair.started", {
+          state: "provider_selection",
+          metadata: { repair: "ollama-systemd-loopback" },
+        });
+      }
       deps.repairLocalInferenceSystemdOverrideOrExit(provider, deps.isNonInteractive);
+      if (provider === "ollama-local") {
+        await deps.recordRepairEvent("state.repair.completed", {
+          state: "provider_selection",
+          metadata: { repair: "ollama-systemd-loopback" },
+        });
+      }
     } else {
       await deps.startRecordedStep("provider_selection");
       const selection = await deps.setupNim(gpu, sandboxName, agent);
@@ -214,6 +239,11 @@ export async function handleProviderInferenceState<Gpu, Agent, Host>({
         }
       }
       deps.skippedStepMessage("inference", `${provider} / ${model}`);
+      await deps.recordStateSkipped("inference", {
+        reason: "resume",
+        provider,
+        model,
+      });
       if (nimContainer && sandboxName) deps.registryUpdateSandbox(sandboxName, { nimContainer });
       session = await deps.recordStepComplete(
         "inference",
diff --git a/src/lib/onboard/machine/handlers/sandbox.test.ts b/src/lib/onboard/machine/handlers/sandbox.test.ts
index eac0ffb553..a8e8db61d2 100644
--- a/src/lib/onboard/machine/handlers/sandbox.test.ts
+++ b/src/lib/onboard/machine/handlers/sandbox.test.ts
@@ -35,6 +35,8 @@ function createDeps(overrides: Partial<SandboxStateOptions<Gpu, Agent, WebSearch
     setDefault: vi.fn(),
     complete: vi.fn(async () => createSession()),
     skipped: vi.fn(),
+    recordSkip: vi.fn(async () => createSession()),
+    repairEvent: vi.fn(async () => createSession()),
     error: vi.fn(),
     exit: vi.fn((code: number): never => {
       throw new Error(`exit ${code}`);
@@ -77,6 +79,8 @@ function createDeps(overrides: Partial<SandboxStateOptions<Gpu, Agent, WebSearch
       recordStepComplete: calls.complete,
       toSessionUpdates: (updates: Record<string, unknown>) => updates as SessionUpdates,
       skippedStepMessage: calls.skipped,
+      recordStateSkipped: calls.recordSkip,
+      recordRepairEvent: calls.repairEvent,
       error: calls.error,
       exitProcess: calls.exit,
       ...overrides,
@@ -153,6 +157,10 @@ describe("handleSandboxState", () => {
 
     expect(calls.createSandbox).not.toHaveBeenCalled();
     expect(calls.skipped).toHaveBeenCalledWith("sandbox", "saved");
+    expect(calls.recordSkip).toHaveBeenCalledWith("sandbox", {
+      reason: "resume",
+      sandboxName: "saved",
+    });
     expect(result.selectedMessagingChannels).toEqual(["slack"]);
   });
 
@@ -182,7 +190,15 @@ describe("handleSandboxState", () => {
 
     await handleSandboxState({ ...baseOptions(deps, session), resume: true, sandboxName: "saved" });
 
+    expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.started", {
+      state: "sandbox",
+      metadata: { repair: "recorded-sandbox-cleanup", sandboxName: "saved" },
+    });
     expect(calls.repairSandbox).toHaveBeenCalledWith("saved");
+    expect(calls.repairEvent).toHaveBeenCalledWith("state.repair.completed", {
+      state: "sandbox",
+      metadata: { repair: "recorded-sandbox-cleanup", sandboxName: "saved" },
+    });
     expect(calls.createSandbox).toHaveBeenCalled();
   });
 
diff --git a/src/lib/onboard/machine/handlers/sandbox.ts b/src/lib/onboard/machine/handlers/sandbox.ts
index 8c45215ed9..3ae88d13e0 100644
--- a/src/lib/onboard/machine/handlers/sandbox.ts
+++ b/src/lib/onboard/machine/handlers/sandbox.ts
@@ -77,6 +77,11 @@ export interface SandboxStateOptions<Gpu, Agent, WebSearchConfig, MessagingChann
     recordStepComplete(stepName: string, updates: SessionUpdates): Promise<Session>;
     toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
     skippedStepMessage(stepName: string, detail?: string | null): void;
+    recordStateSkipped(state: "sandbox", metadata?: Record<string, unknown> | null): Promise<Session>;
+    recordRepairEvent(
+      type: "state.repair.started" | "state.repair.completed" | "state.repair.failed",
+      options?: { state?: "sandbox"; error?: string | null; metadata?: Record<string, unknown> | null },
+    ): Promise<Session>;
     error(message?: string): void;
     exitProcess(code: number): never;
   };
@@ -174,6 +179,7 @@ export async function handleSandboxState<Gpu, Agent, WebSearchConfig, MessagingC
     if (webSearchConfig) deps.note("  [resume] Reusing Brave Search configuration already baked into the sandbox.");
     selectedMessagingChannels = session?.messagingChannels ?? [];
     deps.skippedStepMessage("sandbox", sandboxName);
+    await deps.recordStateSkipped("sandbox", { reason: "resume", sandboxName });
   } else {
     if (resume && session?.steps?.sandbox?.status === "complete") {
       if (webSearchConfigChanged) {
@@ -196,7 +202,15 @@ export async function handleSandboxState<Gpu, Agent, WebSearchConfig, MessagingC
         if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);
       } else if (sandboxReuseState === "not_ready") {
         deps.note(`  [resume] Recorded sandbox '${sandboxName}' exists but is not ready; recreating it.`);
+        await deps.recordRepairEvent("state.repair.started", {
+          state: "sandbox",
+          metadata: { repair: "recorded-sandbox-cleanup", sandboxName },
+        });
         deps.repairRecordedSandbox(sandboxName);
+        await deps.recordRepairEvent("state.repair.completed", {
+          state: "sandbox",
+          metadata: { repair: "recorded-sandbox-cleanup", sandboxName },
+        });
       } else {
         deps.note("  [resume] Recorded sandbox state is unavailable; recreating it.");
         if (sandboxName) deps.removeSandboxFromRegistry(sandboxName);

From c025e4a1f21561cbe6c9bd22ecd75cfdbb197108 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 00:54:01 -0700
Subject: [PATCH 16/54] refactor(cli): route final machine transitions

---
 src/lib/onboard.ts                      | 11 ++++++++++-
 src/lib/onboard/machine/runtime.test.ts | 11 ++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 48c41740cc..991cc5cddf 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -9078,7 +9078,16 @@ async function recordRepairEvent(
 }
 
 async function recordSessionComplete(updates: SessionUpdates = {}): Promise<Session> {
-  return getOnboardRuntime().completeSession(updates);
+  const runtime = getOnboardRuntime();
+  const current = await runtime.session();
+  if (current.machine.state === "finalizing") {
+    await runtime.transition("post_verify");
+    return runtime.complete(updates);
+  }
+  if (current.machine.state === "post_verify") {
+    return runtime.complete(updates);
+  }
+  return runtime.completeSession(updates);
 }
 
 const ONBOARD_STEP_INDEX: Record<string, { number: number; title: string }> = {
diff --git a/src/lib/onboard/machine/runtime.test.ts b/src/lib/onboard/machine/runtime.test.ts
index 7b26269541..f098ba0dc3 100644
--- a/src/lib/onboard/machine/runtime.test.ts
+++ b/src/lib/onboard/machine/runtime.test.ts
@@ -185,23 +185,28 @@ describe("OnboardRuntime", () => {
     expect(policiesHarness.getSession().machine.state).toBe("policies");
   });
 
-  it("completes from post_verify and emits completion events", async () => {
-    const { runtime, events, getSession } = createHarness(sessionInState("post_verify"));
+  it("transitions through finalizing and post_verify before completion", async () => {
+    const { runtime, events, getSession } = createHarness(sessionInState("finalizing"));
 
+    await runtime.transition("post_verify");
     await runtime.complete({ sandboxName: "my-assistant" });
 
     expect(getSession()).toMatchObject({
       status: "complete",
       resumable: false,
       sandboxName: "my-assistant",
-      machine: { state: "complete", revision: 8 },
+      machine: { state: "complete", revision: 9 },
     });
     expect(events.map((event) => event.type)).toEqual([
+      "state.exited",
+      "state.entered",
       "context.updated",
       "state.completed",
       "state.entered",
       "onboard.completed",
     ]);
+    expect(events[0]).toMatchObject({ state: "finalizing" });
+    expect(events[1]).toMatchObject({ state: "post_verify" });
   });
 
   it("emits skipped and repair events without mutating durable state", async () => {

From c90747b417889c2c46d888d7133c981152b799b2 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:05:31 -0700
Subject: [PATCH 17/54] refactor(cli): extract onboard shell helpers

---
 src/lib/onboard.ts                  | 1074 ++-------------------------
 src/lib/onboard/dashboard.ts        |  436 +++++++++++
 src/lib/onboard/model-router.ts     |  522 +++++++++++++
 src/lib/onboard/runtime-boundary.ts |   93 +++
 src/lib/onboard/session-updates.ts  |   63 ++
 5 files changed, 1165 insertions(+), 1023 deletions(-)
 create mode 100644 src/lib/onboard/dashboard.ts
 create mode 100644 src/lib/onboard/model-router.ts
 create mode 100644 src/lib/onboard/runtime-boundary.ts
 create mode 100644 src/lib/onboard/session-updates.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 991cc5cddf..ad23d5a06e 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -17,7 +17,6 @@ const {
 }: typeof import("./onboard/branding") = require("./onboard/branding");
 const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files");
 const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup");
-const { looksLikeForwardPortConflict, runBackgroundForwardStartWithPortReleaseRetries }: typeof import("./onboard/forward-start") = require("./onboard/forward-start");
 const {
   ensureOllamaLoopbackSystemdOverride,
 }: typeof import("./onboard/ollama-systemd") = require("./onboard/ollama-systemd");
@@ -50,7 +49,7 @@ const {
 const {
   agentSupportsWebSearch,
 }: typeof import("./onboard/web-search-support") = require("./onboard/web-search-support");
-const dashboardAccess: typeof import("./onboard/dashboard-access") = require("./onboard/dashboard-access");
+const onboardDashboard: typeof import("./onboard/dashboard") = require("./onboard/dashboard");
 const {
   buildGatewayBootstrapSecretsScript,
   createGatewayBootstrapRepairHelpers,
@@ -87,9 +86,6 @@ const {
 const bedrockRuntimeOnboard: typeof import("./onboard/bedrock-runtime") =
   require("./onboard/bedrock-runtime");
 const { buildVllmMenuEntries }: typeof import("./onboard/vllm-menu") = require("./onboard/vllm-menu");
-const {
-  prepareModelRouterVenv,
-}: typeof import("./onboard/model-router-python") = require("./onboard/model-router-python");
 const crypto = require("node:crypto");
 const fs = require("fs");
 const os = require("os");
@@ -279,7 +275,15 @@ const { resolveSandboxImageTagFromCreateOutput } =
   require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag");
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
-const { OnboardRuntime }: typeof import("./onboard/machine/runtime") = require("./onboard/machine/runtime");
+const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates");
+const modelRouter: typeof import("./onboard/model-router") = require("./onboard/model-router");
+const {
+  DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV,
+  isRoutedInferenceProvider,
+  loadBlueprintProfile,
+  reconcileModelRouter,
+} = modelRouter;
+const { OnboardRuntimeBoundary }: typeof import("./onboard/runtime-boundary") = require("./onboard/runtime-boundary");
 const { handleAgentSetupState }: typeof import("./onboard/machine/handlers/agent-setup") = require("./onboard/machine/handlers/agent-setup");
 const { handleFinalizationState }: typeof import("./onboard/machine/handlers/finalization") = require("./onboard/machine/handlers/finalization");
 const { handleGatewayState }: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway");
@@ -293,8 +297,6 @@ const { ensureUsageNoticeConsent } = require("./onboard/usage-notice");
 const {
   findAvailableDashboardPort,
   findDashboardForwardOwner,
-  getOccupiedPorts,
-  isLiveForwardStatus,
 } = require("./onboard/dashboard-port") as typeof import("./onboard/dashboard-port");
 const { destroyGatewayForReuse } = require("./onboard/gateway-cleanup") as typeof import("./onboard/gateway-cleanup");
 const { verifyGatewayContainerRunning } =
@@ -341,7 +343,6 @@ const sandboxState: typeof import("./state/sandbox") = require("./state/sandbox"
 const validation: typeof import("./validation") = require("./validation");
 const urlUtils: typeof import("./core/url-utils") = require("./core/url-utils");
 const buildContext = require("./build-context");
-const dashboardContract: typeof import("./dashboard/contract") = require("./dashboard/contract");
 const httpProbe: typeof import("./adapters/http/probe") = require("./adapters/http/probe");
 const modelPrompts: typeof import("./inference/model-prompts") = require("./inference/model-prompts");
 const providerModels: typeof import("./inference/provider-models") = require("./inference/provider-models");
@@ -388,9 +389,9 @@ import { decidePolicyCarryForward } from "./onboard/policy-carryforward";
 import { getSuggestedPolicyPresets } from "./onboard/policy-presets";
 import {
   computeSetupPresetSuggestions as computeSetupPresetSuggestionsImpl,
-  setupPoliciesWithSelection as setupPoliciesWithSelectionImpl,
   type SetupPolicySelectionOptions,
   type SetupPresetSuggestionOptions,
+  setupPoliciesWithSelection as setupPoliciesWithSelectionImpl,
 } from "./onboard/policy-selection";
 import {
   getResumeSandboxGpuOverrides,
@@ -417,7 +418,6 @@ const USE_COLOR = !process.env.NO_COLOR && !!process.stdout.isTTY;
 const DIM = USE_COLOR ? "\x1b[2m" : "";
 const RESET = USE_COLOR ? "\x1b[0m" : "";
 let OPENSHELL_BIN: string | null = null;
-let ONBOARD_RUNTIME: import("./onboard/machine/runtime").OnboardRuntime | null = null;
 const GATEWAY_NAME = "nemoclaw";
 const BACK_TO_SELECTION = "__NEMOCLAW_BACK_TO_SELECTION__";
 type HermesAuthMethod = "oauth" | "api_key";
@@ -703,503 +703,6 @@ function getBlueprintMaxOpenshellVersion(rootDir = ROOT): string | null {
 
 type OpenshellChannel = "stable" | "dev" | "auto";
 
-/**
- * Load a named inference profile and router config from blueprint.yaml.
- * Returns null if the blueprint or profile is missing.
- */
-type BlueprintRouterConfig = {
-  enabled?: boolean;
-  port?: number;
-  pool_config_path?: string;
-  credential_env?: string;
-};
-
-type BlueprintInferenceProfile = {
-  provider_name?: string;
-  endpoint?: string;
-  model: string;
-  credential_env?: string;
-  credential_default?: string;
-  router: BlueprintRouterConfig;
-};
-
-function loadBlueprintProfile(
-  profileName: string,
-  rootDir: string = ROOT,
-): BlueprintInferenceProfile | null {
-  try {
-    const YAML = require("yaml");
-    const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml");
-    if (!fs.existsSync(blueprintPath)) return null;
-    const raw = fs.readFileSync(blueprintPath, "utf8");
-    const parsed = YAML.parse(raw);
-    const profile = parsed?.components?.inference?.profiles?.[profileName];
-    if (!profile) return null;
-    const router = { ...(parsed?.components?.router || {}) };
-    if (typeof profile.credential_env === "string" && profile.credential_env.trim().length > 0) {
-      router.credential_env = profile.credential_env;
-    }
-    return { ...profile, router } as BlueprintInferenceProfile;
-  } catch {
-    return null;
-  }
-}
-
-const ROUTER_HEALTH_RETRIES = 15;
-const ROUTER_HEALTH_INTERVAL_MS = 2000;
-const ROUTER_HEALTH_TIMEOUT_MS = 3000;
-const MODEL_ROUTER_RELATIVE_DIR = path.join("nemoclaw-blueprint", "router", "llm-router");
-const MODEL_ROUTER_VENV_DIR = path.join(os.homedir(), ".nemoclaw", "model-router-venv");
-const MODEL_ROUTER_FINGERPRINT_FILE = ".nemoclaw-source-fingerprint";
-const MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES = new Set([
-  ".git",
-  ".hg",
-  ".mypy_cache",
-  ".pytest_cache",
-  ".ruff_cache",
-  ".svn",
-  ".venv",
-  "__pycache__",
-  "build",
-  "dist",
-  "node_modules",
-  "venv",
-]);
-const DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV = "NVIDIA_API_KEY";
-
-async function isRouterHealthy(port: number, timeoutMs = ROUTER_HEALTH_TIMEOUT_MS): Promise<boolean> {
-  const http = require("http");
-  return new Promise<boolean>((resolve) => {
-    let settled = false;
-    const settle = (healthy: boolean) => {
-      if (settled) return;
-      settled = true;
-      resolve(healthy);
-    };
-    const request = http
-      .get(`http://127.0.0.1:${port}/health`, (res: import("node:http").IncomingMessage) => {
-        res.resume();
-        settle((res.statusCode || 0) >= 200 && (res.statusCode || 0) < 300);
-      })
-      .on("error", () => settle(false));
-    request.setTimeout(timeoutMs, () => {
-      request.destroy();
-      settle(false);
-    });
-  });
-}
-
-function isProcessRunning(pid: number | null | undefined): boolean {
-  if (!Number.isInteger(pid) || Number(pid) <= 0) return false;
-  try {
-    process.kill(Number(pid), 0);
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-async function stopModelRouterProcess(pid: number, port: number): Promise<void> {
-  try {
-    process.kill(pid, "SIGTERM");
-  } catch {
-    return;
-  }
-  for (let attempt = 0; attempt < 10; attempt++) {
-    await new Promise((resolve) => setTimeout(resolve, 500));
-    if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return;
-  }
-  try {
-    process.kill(pid, "SIGKILL");
-  } catch {
-    // already stopped
-  }
-  for (let attempt = 0; attempt < 5; attempt++) {
-    await new Promise((resolve) => setTimeout(resolve, 500));
-    if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return;
-  }
-}
-
-function resolveHostCommandPath(commandName: string): string | null {
-  const result = runCapture(["sh", "-c", 'command -v "$1"', "--", commandName], {
-    ignoreError: true,
-  }).trim();
-  return result || null;
-}
-
-function modelRouterPackageDir(): string {
-  return path.join(ROOT, MODEL_ROUTER_RELATIVE_DIR);
-}
-
-function modelRouterVenvDir(): string {
-  return process.env.NEMOCLAW_MODEL_ROUTER_VENV || MODEL_ROUTER_VENV_DIR;
-}
-
-function modelRouterCommandPath(venvDir = modelRouterVenvDir()): string {
-  return path.join(venvDir, "bin", "model-router");
-}
-
-function modelRouterFingerprintPath(venvDir = modelRouterVenvDir()): string {
-  return path.join(venvDir, MODEL_ROUTER_FINGERPRINT_FILE);
-}
-
-function isExecutableFile(filePath: string): boolean {
-  try {
-    fs.accessSync(filePath, fs.constants.X_OK);
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-function isModelRouterPackageReady(routerDir = modelRouterPackageDir()): boolean {
-  return fs.existsSync(path.join(routerDir, "pyproject.toml")) ||
-    fs.existsSync(path.join(routerDir, "setup.py"));
-}
-
-function shouldSkipModelRouterFingerprintEntry(name: string): boolean {
-  return MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES.has(name) || name.endsWith(".egg-info");
-}
-
-function hashModelRouterSourceTree(routerDir = modelRouterPackageDir()): string | null {
-  const sourceHash = crypto.createHash("sha256");
-
-  const hashDirectory = (currentDir: string): boolean => {
-    let entries: import("fs").Dirent[];
-    try {
-      entries = fs
-        .readdirSync(currentDir, { withFileTypes: true })
-        .sort((left: import("fs").Dirent, right: import("fs").Dirent) =>
-          left.name.localeCompare(right.name),
-        );
-    } catch {
-      return false;
-    }
-
-    let hashedSourceFile = false;
-    for (const entry of entries) {
-      if (shouldSkipModelRouterFingerprintEntry(entry.name)) continue;
-      if (entry.name.endsWith(".pyc") || entry.name.endsWith(".pyo")) continue;
-
-      const entryPath = path.join(currentDir, entry.name);
-      const relativePath = path.relative(routerDir, entryPath).split(path.sep).join("/");
-      if (entry.isDirectory()) {
-        hashedSourceFile = hashDirectory(entryPath) || hashedSourceFile;
-        continue;
-      }
-      if (entry.isSymbolicLink()) {
-        try {
-          sourceHash.update(`link:${relativePath}\0`);
-          sourceHash.update(fs.readlinkSync(entryPath));
-          sourceHash.update("\0");
-          hashedSourceFile = true;
-        } catch {
-          // Ignore unreadable links; the install step will fail if they are required.
-        }
-        continue;
-      }
-      if (!entry.isFile()) continue;
-      sourceHash.update(`file:${relativePath}\0`);
-      sourceHash.update(fs.readFileSync(entryPath));
-      sourceHash.update("\0");
-      hashedSourceFile = true;
-    }
-    return hashedSourceFile;
-  };
-
-  return hashDirectory(routerDir) ? `files:${sourceHash.digest("hex")}` : null;
-}
-
-function getModelRouterSourceFingerprint(routerDir = modelRouterPackageDir()): string | null {
-  const gitHead = runCapture(["git", "-C", routerDir, "rev-parse", "HEAD"], {
-    ignoreError: true,
-  }).trim();
-  if (/^[0-9a-f]{40}$/i.test(gitHead)) return `git:${gitHead}`;
-
-  const gitLink = runCapture(["git", "-C", ROOT, "rev-parse", `HEAD:${MODEL_ROUTER_RELATIVE_DIR}`], {
-    ignoreError: true,
-  }).trim();
-  if (/^[0-9a-f]{40}$/i.test(gitLink)) return `gitlink:${gitLink}`;
-
-  return hashModelRouterSourceTree(routerDir);
-}
-
-function readModelRouterInstalledFingerprint(venvDir = modelRouterVenvDir()): string | null {
-  try {
-    const fingerprint = fs.readFileSync(modelRouterFingerprintPath(venvDir), "utf8").trim();
-    return fingerprint || null;
-  } catch {
-    return null;
-  }
-}
-
-function writeModelRouterInstalledFingerprint(
-  fingerprint: string | null,
-  venvDir = modelRouterVenvDir(),
-): void {
-  if (!fingerprint) return;
-  fs.writeFileSync(modelRouterFingerprintPath(venvDir), `${fingerprint}\n`, { mode: 0o600 });
-}
-
-function isManagedModelRouterCurrent(
-  routerDir = modelRouterPackageDir(),
-  venvDir = modelRouterVenvDir(),
-): boolean {
-  if (!isExecutableFile(modelRouterCommandPath(venvDir))) return false;
-  const sourceFingerprint = getModelRouterSourceFingerprint(routerDir);
-  return Boolean(
-    sourceFingerprint && readModelRouterInstalledFingerprint(venvDir) === sourceFingerprint,
-  );
-}
-
-function initializeModelRouterSubmodule(routerDir = modelRouterPackageDir()): void {
-  if (isModelRouterPackageReady(routerDir)) return;
-  if (!fs.existsSync(path.join(ROOT, ".gitmodules")) || !fs.existsSync(path.join(ROOT, ".git"))) {
-    return;
-  }
-  console.log("  Initializing Model Router source...");
-  run(["git", "-C", ROOT, "submodule", "update", "--init", "--depth", "1", MODEL_ROUTER_RELATIVE_DIR], {
-    ignoreError: true,
-  });
-}
-
-function installModelRouterCommand(routerDir = modelRouterPackageDir()): string {
-  initializeModelRouterSubmodule(routerDir);
-  if (!isModelRouterPackageReady(routerDir)) {
-    throw new Error(
-      `Model Router source is not initialized at ${routerDir}. ` +
-        `Run: git -C ${ROOT} submodule update --init --depth 1 ${MODEL_ROUTER_RELATIVE_DIR}`,
-    );
-  }
-
-  const venvDir = modelRouterVenvDir();
-  const routerCommand = modelRouterCommandPath(venvDir);
-  const sourceFingerprint = getModelRouterSourceFingerprint(routerDir);
-  const allowReplaceExistingVenv =
-    path.resolve(venvDir) === path.resolve(MODEL_ROUTER_VENV_DIR) ||
-    readModelRouterInstalledFingerprint(venvDir) !== null;
-  const venvPython = prepareModelRouterVenv({
-    venvDir,
-    allowReplaceExisting: allowReplaceExistingVenv,
-  });
-
-  const installResult = run(
-    [venvPython, "-m", "pip", "install", "--quiet", "--upgrade", `${routerDir}[prefill,proxy]`],
-    {
-      ignoreError: true,
-      timeout: 600_000,
-    },
-  );
-  if (installResult.status !== 0) {
-    throw new Error("Failed to install Model Router dependencies.");
-  }
-  if (!isExecutableFile(routerCommand)) {
-    throw new Error("Model Router install did not produce the model-router command.");
-  }
-  writeModelRouterInstalledFingerprint(sourceFingerprint, venvDir);
-  return routerCommand;
-}
-
-function ensureModelRouterCommand(): string {
-  const routerDir = modelRouterPackageDir();
-  const venvDir = modelRouterVenvDir();
-  const managedCommand = modelRouterCommandPath(venvDir);
-
-  if (isModelRouterPackageReady(routerDir) && isManagedModelRouterCurrent(routerDir, venvDir)) {
-    return managedCommand;
-  }
-
-  if (!isModelRouterPackageReady(routerDir)) {
-    initializeModelRouterSubmodule(routerDir);
-  }
-
-  if (isModelRouterPackageReady(routerDir)) {
-    if (isManagedModelRouterCurrent(routerDir, venvDir)) return managedCommand;
-    return installModelRouterCommand(routerDir);
-  }
-
-  if (isExecutableFile(managedCommand)) return managedCommand;
-  return resolveHostCommandPath("model-router") || installModelRouterCommand();
-}
-
-/**
- * Start the model-router proxy and wait for it to become healthy.
- * Follows the same pattern as Ollama startup (spawn detached, poll health).
- * Returns the PID of the child process.
- */
-async function startModelRouter(routerCfg: BlueprintRouterConfig): Promise<number> {
-  const routerCommand = ensureModelRouterCommand();
-  const port = routerCfg.port || 4000;
-  const blueprintDir = path.join(ROOT, "nemoclaw-blueprint");
-  const poolConfigPath = path.join(
-    blueprintDir,
-    routerCfg.pool_config_path || "router/pool-config.yaml",
-  );
-  const stateDir = path.join(os.homedir(), ".nemoclaw", "state");
-  const litellmConfigPath = path.join(stateDir, "litellm-proxy.yaml");
-
-  fs.mkdirSync(stateDir, { recursive: true });
-
-  const proxyConfigResult = spawnSync(
-    routerCommand,
-    ["proxy-config", "--config", poolConfigPath, "--output", litellmConfigPath],
-    { encoding: "utf8", timeout: 30_000, cwd: blueprintDir },
-  );
-  if (proxyConfigResult.status !== 0) {
-    throw new Error(
-      `model-router proxy-config failed: ${proxyConfigResult.stderr || proxyConfigResult.error || "unknown error"}`,
-    );
-  }
-
-  const { buildSubprocessEnv } = require("./subprocess-env");
-  const credEnvVars: Record<string, string> = {};
-  const credName = routerCfg.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV;
-  const routedCredential = resolveProviderCredential(credName);
-  const openAiCredential = resolveProviderCredential("OPENAI_API_KEY");
-  if (routedCredential) {
-    credEnvVars[credName] = routedCredential;
-    if (!openAiCredential) credEnvVars.OPENAI_API_KEY = routedCredential;
-  }
-  if (openAiCredential) credEnvVars.OPENAI_API_KEY = openAiCredential;
-  const _providerKey = (process.env.NEMOCLAW_PROVIDER_KEY || "").trim();
-  if (_providerKey) {
-    if (!credEnvVars[credName]) credEnvVars[credName] = _providerKey;
-    if (!credEnvVars.OPENAI_API_KEY) credEnvVars.OPENAI_API_KEY = _providerKey;
-  }
-
-  if (await isRouterHealthy(port)) {
-    throw new Error(
-      `Port ${port} already has a healthy router endpoint; refusing to start a second router.`,
-    );
-  }
-
-  const child = spawn(
-    routerCommand,
-    [
-      "proxy",
-      "--litellm-config", litellmConfigPath,
-      "--router-config", poolConfigPath,
-      "--host", "0.0.0.0",
-      "--port", String(port),
-    ],
-    {
-      detached: true,
-      stdio: "ignore",
-      cwd: blueprintDir,
-      env: buildSubprocessEnv(credEnvVars),
-    },
-  );
-  let childExited = false;
-  let childExitDetail = "";
-  child.once("error", (err: Error) => {
-    childExited = true;
-    childExitDetail = `child failed to start: ${err.message}`;
-  });
-  child.once("exit", (code: number | null, signal: string | null) => {
-    childExited = true;
-    if (!childExitDetail) {
-      childExitDetail = `child exited with code ${code ?? "null"}${signal ? ` signal ${signal}` : ""}`;
-    }
-  });
-  child.unref();
-
-  const pid = child.pid;
-  if (!pid) {
-    throw new Error(
-      "Failed to start model-router proxy: no PID returned" +
-        (childExitDetail ? ` (${childExitDetail})` : ""),
-    );
-  }
-
-  for (let attempt = 0; attempt < ROUTER_HEALTH_RETRIES; attempt++) {
-    await new Promise((resolve) => setTimeout(resolve, ROUTER_HEALTH_INTERVAL_MS));
-    if (childExited) break;
-    const healthy = await isRouterHealthy(port);
-    let processAlive = true;
-    try {
-      process.kill(pid, 0);
-    } catch {
-      processAlive = false;
-    }
-    if (healthy && processAlive) return pid;
-    if (!processAlive) {
-      childExited = true;
-      if (!childExitDetail) childExitDetail = "child process is no longer running";
-      break;
-    }
-  }
-  try {
-    process.kill(pid, "SIGTERM");
-  } catch {
-    // already dead
-  }
-  throw new Error(
-    `Model router failed to become healthy on port ${port} after ${ROUTER_HEALTH_RETRIES} attempts` +
-      (childExitDetail ? ` (${childExitDetail})` : ""),
-  );
-}
-
-function getRoutedProfile(): BlueprintInferenceProfile {
-  const bp = loadBlueprintProfile("routed");
-  if (!bp || bp.router?.enabled !== true) {
-    throw new Error("Router is not enabled in nemoclaw-blueprint/blueprint.yaml.");
-  }
-  return bp;
-}
-
-function isRoutedInferenceProvider(provider: string | null | undefined): boolean {
-  if (!provider) return false;
-  if (provider === "nvidia-router") return true;
-  const bp = loadBlueprintProfile("routed");
-  return Boolean(bp?.provider_name && provider === bp.provider_name);
-}
-
-async function reconcileModelRouter(): Promise<void> {
-  const bp = getRoutedProfile();
-  const routerPort = bp.router.port || 4000;
-  const routerCredentialEnv =
-    bp.router.credential_env || bp.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV;
-  const routerCredential =
-    hydrateCredentialEnv(routerCredentialEnv) ||
-    normalizeCredentialValue(bp.credential_default || "");
-  if (!routerCredential) {
-    throw new Error(`${routerCredentialEnv} is required to start Model Router.`);
-  }
-  saveCredential(routerCredentialEnv, routerCredential);
-  const routerCredentialHash = hashCredential(routerCredential);
-  const session = onboardSession.loadSession();
-  const recordedPid = session?.routerPid ?? null;
-  const recordedCredentialHash = session?.routerCredentialHash ?? null;
-
-  if (await isRouterHealthy(routerPort)) {
-    if (
-      routerCredentialHash &&
-      recordedCredentialHash === routerCredentialHash &&
-      isProcessRunning(recordedPid)
-    ) {
-      console.log(`  ✓ Model router is already healthy on port ${routerPort}`);
-      return;
-    }
-    if (isProcessRunning(recordedPid)) {
-      console.log("  Restarting model router with updated credentials...");
-      await stopModelRouterProcess(requireValue(recordedPid, "Expected recorded router PID"), routerPort);
-    } else {
-      throw new Error(
-        `Port ${routerPort} already has a healthy router endpoint, but its credential state is unknown. Stop the existing model-router process and rerun onboarding.`,
-      );
-    }
-  }
-
-  console.log("  Starting model router...");
-  const routerPid = await startModelRouter(bp.router);
-  console.log(`  ✓ Model router started (PID ${routerPid}) on port ${routerPort}`);
-  onboardSession.updateSession((current: Session) => {
-    current.routerPid = routerPid;
-    current.routerCredentialHash = routerCredentialHash;
-    return current;
-  });
-}
-
 function getOpenshellChannel(env: NodeJS.ProcessEnv = process.env): OpenshellChannel {
   const raw = String(env.NEMOCLAW_OPENSHELL_CHANNEL || "auto")
     .trim()
@@ -8576,519 +8079,45 @@ async function setupPoliciesWithSelection(
 
 const CONTROL_UI_PORT = DASHBOARD_PORT;
 
-// Dashboard helpers — delegated to src/lib/dashboard/contract.ts
-const { buildChain, buildControlUiUrls } = dashboardContract;
-
-function findForwardEntry(
-  forwardListOutput: string | null | undefined,
-  port: string,
-): { sandboxName: string; status: string } | null {
-  if (!forwardListOutput) return null;
-  for (const rawLine of forwardListOutput.split("\n")) {
-    const line = rawLine.replace(ANSI_RE, "");
-    if (/^\s*SANDBOX\s/i.test(line)) continue;
-    const parts = line.trim().split(/\s+/);
-    if (parts.length < 3 || parts[2] !== port) continue;
-    return {
-      sandboxName: parts[0] || "",
-      status: (parts[4] || "").toLowerCase(),
-    };
-  }
-  return null;
-}
-
-function getRunningForwardPorts(forwardListOutput: string | null | undefined): string[] {
-  const ports = new Set<string>();
-  if (!forwardListOutput) return [];
-  for (const rawLine of forwardListOutput.split("\n")) {
-    const line = rawLine.replace(ANSI_RE, "");
-    if (/^\s*SANDBOX\s/i.test(line)) continue;
-    const parts = line.trim().split(/\s+/);
-    if (parts.length < 5 || !/^\d+$/.test(parts[2])) continue;
-    const status = (parts[4] || "").toLowerCase();
-    if (isLiveForwardStatus(status)) {
-      ports.add(parts[2]);
-    }
-  }
-  return [...ports];
-}
-
-function stopAllDashboardForwards(): void {
-  const forwardList = runCaptureOpenshell(["forward", "list"], { ignoreError: true });
-  for (const port of getRunningForwardPorts(forwardList)) {
-    runOpenshell(["forward", "stop", port], { ignoreError: true });
-  }
-}
-
-
-/**
- * Build the actionable error lines printed when the just-created openshell
- * sandbox is rolled back after a dashboard port-allocation failure. Pure
- * function over (sandboxName, alloc-error, delete-result) so the rollback path
- * is testable without spawning subprocesses or exiting the process (#2174).
- */
-function buildOrphanedSandboxRollbackMessage(
-  sandboxName: string,
-  err: unknown,
-  deleteSucceeded: boolean,
-): string[] {
-  const lines = [
-    "",
-    `  Could not allocate a dashboard port for '${sandboxName}'.`,
-    `  ${err instanceof Error ? err.message : String(err)}`,
-  ];
-  if (deleteSucceeded) {
-    lines.push("  The orphaned sandbox has been removed — you can safely retry.");
-  } else {
-    lines.push("  Could not remove the orphaned sandbox. Manual cleanup:");
-    lines.push(`    openshell sandbox delete "${sandboxName}"`);
-  }
-  return lines;
-}
-
-/**
- * Set up the dashboard forward for a sandbox. Auto-allocates the next free
- * port if the preferred port is taken by a different sandbox (Fixes #2174).
- * Returns the actual port number used.
- *
- * When `rollbackSandboxOnFailure` is true, deletes the just-created openshell
- * sandbox before exiting on unrecoverable port-allocation failure. This keeps
- * `openshell sandbox list` and the NemoClaw registry from drifting when the
- * range is exhausted between sandbox-create and forward-setup ("leaks ghost
- * sandbox" half of #2174). Mirrors the not-ready rollback pattern in
- * createSandbox.
- */
-function ensureDashboardForward(
-  sandboxName: string,
-  chatUiUrl = `http://127.0.0.1:${CONTROL_UI_PORT}`,
-  options: { rollbackSandboxOnFailure?: boolean } = {},
-): number {
-  const { rollbackSandboxOnFailure = false } = options;
-  const preferredPort = Number(getDashboardForwardPort(chatUiUrl));
-  let existingForwards = runCaptureOpenshell(["forward", "list"], { ignoreError: true });
-  const preferredEntry = findForwardEntry(existingForwards, String(preferredPort));
-  if (
-    preferredEntry &&
-    (preferredEntry.sandboxName === sandboxName || !isLiveForwardStatus(preferredEntry.status))
-  ) {
-    runOpenshell(["forward", "stop", String(preferredPort)], { ignoreError: true });
-    existingForwards = runCaptureOpenshell(["forward", "list"], { ignoreError: true });
-  }
-  let actualPort: number;
-  try {
-    actualPort = findAvailableDashboardPort(sandboxName, preferredPort, existingForwards);
-  } catch (err) {
-    if (!rollbackSandboxOnFailure) throw err;
-    const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
-    for (const line of buildOrphanedSandboxRollbackMessage(
-      sandboxName,
-      err,
-      delResult.status === 0,
-    )) {
-      console.error(line);
-    }
-    process.exit(1);
-  }
-
-  if (actualPort !== preferredPort) {
-    if (rollbackSandboxOnFailure) {
-      // Create path: the sandbox was just built with CHAT_UI_URL and
-      // NEMOCLAW_DASHBOARD_PORT baked from `preferredPort` (see the
-      // `formatEnvAssignment("CHAT_UI_URL", …)` call in createSandbox). If
-      // the port was bound during the build window (TOCTOU), picking a new
-      // host port would leave the sandbox serving the dashboard on
-      // `preferredPort` internally while the forward listens on `actualPort`
-      // — reproducing the original "onboard exits but dashboard is
-      // unreachable" failure on the newly selected port. Reallocation is
-      // only safe on reuse paths where the sandbox image is fixed; on the
-      // create path we must roll back so the next onboard re-bakes with a
-      // clean port. (#3260)
-      const err = new Error(
-        `Dashboard port ${preferredPort} became host-bound during sandbox build; ` +
-          `cannot reallocate to ${actualPort} after the sandbox has been created with ` +
-          `CHAT_UI_URL=${preferredPort}. Free the port and re-run \`${cliName()} onboard\`, ` +
-          `or pass \`--control-ui-port <N>\` to pick a different dashboard port.`,
-      );
-      const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
-      for (const line of buildOrphanedSandboxRollbackMessage(
-        sandboxName,
-        err,
-        delResult.status === 0,
-      )) {
-        console.error(line);
-      }
-      process.exit(1);
-    }
-    console.warn(`  ! Port ${preferredPort} is taken. Using port ${actualPort} instead.`);
-  }
-
-  // Clean up any stale forwards owned by this sandbox on other ports so we
-  // don't leak forwards across port changes and exhaust the range over time.
-  const occupied = getOccupiedPorts(existingForwards);
-  for (const [port, owner] of occupied.entries()) {
-    if (owner === sandboxName && Number(port) !== actualPort) {
-      runOpenshell(["forward", "stop", port], { ignoreError: true });
-    }
-  }
-
-  // Preserve the original URL's hostname (loopback vs remote) but swap to the actual port.
-  const parsedUrl = new URL(chatUiUrl.includes("://") ? chatUiUrl : `http://${chatUiUrl}`);
-  parsedUrl.port = String(actualPort);
-  const actualTarget = getDashboardForwardTarget(parsedUrl.toString());
-  runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true });
-  const { result: fwdResult, diagnostic: fwdDiagnostic } = runBackgroundForwardStartWithPortReleaseRetries(
-    (stdio, timeout) =>
-      runOpenshell(
-        ["forward", "start", "--background", actualTarget, sandboxName],
-        { ignoreError: true, suppressOutput: true, stdio, timeout },
-      ),
-    () => { sleep(1); runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true }); },
-  );
-  if (fwdResult && fwdResult.status !== 0) {
-    const looksLikePortConflict = looksLikeForwardPortConflict(fwdDiagnostic);
-    if (rollbackSandboxOnFailure) {
-      // The sandbox was just created, committed to actualPort via its
-      // baked-in CHAT_UI_URL and NEMOCLAW_DASHBOARD_PORT env. Silently
-      // returning here leaves the user with a dashboard URL that points
-      // at a port held by another process — a TOCTOU race where the
-      // proactive probe in findAvailableDashboardPort missed the
-      // conflict (e.g., another listener bound during the multi-minute
-      // image build). Roll back so the next `onboard` retry's allocator
-      // observes the bound port and picks a different one. Only the
-      // EADDRINUSE-style failure gets the port-conflict wording; other
-      // errors (gateway / transport) propagate the real diagnostic so
-      // users aren't pointed at the wrong fix (#3260).
-      const err = new Error(
-        looksLikePortConflict
-          ? `Failed to start dashboard forward on port ${actualPort} — the host port ` +
-              `is held by another process. Free it and run \`${cliName()} onboard\` again, ` +
-              `or pass \`--control-ui-port <N>\` to pick a different dashboard port.`
-          : `Failed to start dashboard forward on port ${actualPort}: ${fwdDiagnostic.slice(0, 240)}`,
-      );
-      const delResult = runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
-      for (const line of buildOrphanedSandboxRollbackMessage(
-        sandboxName,
-        err,
-        delResult.status === 0,
-      )) {
-        console.error(line);
-      }
-      process.exit(1);
-    }
-    if (looksLikePortConflict) {
-      console.warn(
-        `! Port ${actualPort} forward did not start — port may be in use by another process.`,
-      );
-      console.warn(
-        `  Check: docker ps --format 'table {{.Names}}\\t{{.Ports}}' | grep ${actualPort}`,
-      );
-      console.warn(`  Free the port, then reconnect: ${cliName()} ${sandboxName} connect`);
-    } else {
-      console.warn(`! Port ${actualPort} forward did not start: ${fwdDiagnostic.slice(0, 240)}`);
-      console.warn(`  Reconnect after resolving the issue: ${cliName()} ${sandboxName} connect`);
-    }
-  }
-  return actualPort;
-}
-
-function ensureAgentDashboardForward(
-  sandboxName: string,
-  agent: { forwardPort?: number | null },
-): number {
-  const agentDashboardPort = agent.forwardPort ?? CONTROL_UI_PORT;
-  const agentDashboardUrl = `http://127.0.0.1:${agentDashboardPort}`;
-  const actualAgentDashboardPort = ensureDashboardForward(sandboxName, agentDashboardUrl);
-  process.env.CHAT_UI_URL = `http://127.0.0.1:${actualAgentDashboardPort}`;
-  return actualAgentDashboardPort;
-}
-
-function findOpenclawJsonPath(dir: string): string | null {
-  if (!fs.existsSync(dir)) return null;
-  const entries = fs.readdirSync(dir, { withFileTypes: true });
-  for (const e of entries) {
-    const p = path.join(dir, e.name);
-    if (e.isDirectory()) {
-      const found: string | null = findOpenclawJsonPath(p);
-      if (found) return found;
-    } else if (e.name === "openclaw.json") {
-      return p;
-    }
-  }
-  return null;
-}
-
-/**
- * Pull gateway.auth.token from the sandbox image via openshell sandbox download
- * so onboard can build dashboard access URLs. User-visible output must redact
- * the token fragment.
- */
-function fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null {
-  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-token-"));
-  try {
-    const destDir = `${tmpDir}${path.sep}`;
-    const result = runOpenshell(
-      ["sandbox", "download", sandboxName, "/sandbox/.openclaw/openclaw.json", destDir],
-      { ignoreError: true, stdio: ["ignore", "ignore", "ignore"] },
-    );
-    if (result.status !== 0) return null;
-    const jsonPath = findOpenclawJsonPath(tmpDir);
-    if (!jsonPath) return null;
-    const cfg = JSON.parse(fs.readFileSync(jsonPath, "utf-8"));
-    const token = cfg && cfg.gateway && cfg.gateway.auth && cfg.gateway.auth.token;
-    return typeof token === "string" && token.length > 0 ? token : null;
-  } catch {
-    return null;
-  } finally {
-    try {
-      fs.rmSync(tmpDir, { recursive: true, force: true });
-    } catch {
-      // ignore cleanup errors
-    }
-  }
-}
-
-// buildControlUiUrls — see dashboard-contract import above
-
-function getDashboardForwardPort(
-  chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`,
-  options: Parameters<typeof dashboardAccess.getDashboardForwardPort>[1] = {},
-): string {
-  return dashboardAccess.getDashboardForwardPort(chatUiUrl, {
-    ...options,
-    runCapture: options.runCapture || runCapture,
-  });
-}
-
-function getDashboardForwardTarget(
-  chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`,
-  options: Parameters<typeof dashboardAccess.getDashboardForwardTarget>[1] = {},
-): string {
-  return dashboardAccess.getDashboardForwardTarget(chatUiUrl, {
-    ...options,
-    runCapture: options.runCapture || runCapture,
-  });
-}
-
-function dashboardUrlForDisplay(url: string): string {
-  return dashboardAccess.dashboardUrlForDisplay(url, redact);
-}
-
-function getWslHostAddress(
-  options: Parameters<typeof dashboardAccess.getWslHostAddress>[0] = {},
-): string | null {
-  return dashboardAccess.getWslHostAddress({ ...options, runCapture: options.runCapture || runCapture });
-}
-
-/** Print the post-onboard dashboard with sandbox status and reconfiguration hints. */
-function printDashboard(
-  sandboxName: string,
-  model: string,
-  provider: string,
-  nimContainer: string | null = null,
-  agent: AgentDefinition | null = null,
-): void {
-  const nimStat = nimContainer ? nim.nimStatusByName(nimContainer) : nim.nimStatus(sandboxName);
-  const showNim = nim.shouldShowNimLine(nimContainer, nimStat.running);
-  const nimLabel = nimStat.running ? "running" : "not running";
-
-  const providerLabel = getProviderLabel(provider);
-
-  const token = fetchGatewayAuthTokenFromSandbox(sandboxName);
-  const chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`;
-  const wslAddr = getWslHostAddress();
-  const chain = buildChain({ chatUiUrl, isWsl: isWsl(), wslHostAddress: wslAddr });
-
-  // Build access info inline — uses chain instead of re-deriving from env
-  const dashboardAccess = buildControlUiUrls(token, chain.port, chain.accessUrl).map((url, i) => ({
-    label: i === 0 ? "Dashboard" : `Alt ${i}`,
-    url,
-  }));
-  if (wslAddr) {
-    const wslUrl = `http://${wslAddr}:${chain.port}/${token ? `#token=${encodeURIComponent(token)}` : ""}`;
-    const existing = dashboardAccess.find((a) => a.url === wslUrl);
-    if (existing) existing.label = "VS Code/WSL";
-    else dashboardAccess.push({ label: "VS Code/WSL", url: wslUrl });
-  }
-  const guidanceLines = [`Port ${chain.port} must be forwarded before opening these URLs.`];
-  if (isWsl())
-    guidanceLines.push(
-      "WSL detected: if localhost fails in Windows, use the WSL host IP shown by `hostname -I`.",
-    );
-  if (dashboardAccess.length === 0) guidanceLines.push("No dashboard URLs were generated.");
-
-  console.log("");
-  console.log(`  ${"─".repeat(50)}`);
-  // console.log(`  Dashboard    http://localhost:${DASHBOARD_PORT}/`);
-  console.log(`  Sandbox      ${sandboxName} (Landlock + seccomp + netns)`);
-  console.log(`  Model        ${model} (${providerLabel})`);
-  if (showNim) {
-    console.log(`  NIM          ${nimLabel}`);
-  }
-  console.log(`  ${"─".repeat(50)}`);
-  console.log(`  Run:         ${cliName()} ${sandboxName} connect`);
-  console.log(`  Status:      ${cliName()} ${sandboxName} status`);
-  console.log(`  Logs:        ${cliName()} ${sandboxName} logs --follow`);
-  console.log("");
-  if (agent) {
-    agentOnboard.printDashboardUi(sandboxName, token, agent, {
-      note,
-      buildControlUiUrls: (tokenValue: string | null, port: number) => {
-        return buildControlUiUrls(tokenValue, port, chain.accessUrl);
-      },
-    });
-  } else if (token) {
-    console.log(
-      `  ${agentProductName()} UI (auth token redacted from displayed URLs)`,
-    );
-    for (const line of guidanceLines) {
-      console.log(`  ${line}`);
-    }
-    for (const entry of dashboardAccess) {
-      console.log(`  ${entry.label}: ${dashboardUrlForDisplay(entry.url)}`);
-    }
-    console.log(`  Token:       ${cliName()} ${sandboxName} gateway-token --quiet`);
-    console.log(`               append  #token=<token> locally if the browser asks for auth.`);
-  } else {
-    note("  Could not read gateway token from the sandbox (download failed).");
-    console.log(`  ${agentProductName()} UI`);
-    for (const line of guidanceLines) {
-      console.log(`  ${line}`);
-    }
-    for (const entry of dashboardAccess) {
-      console.log(`  ${entry.label}: ${dashboardUrlForDisplay(entry.url)}`);
-    }
-    console.log(
-      `  Token:       ${cliName()} ${sandboxName} connect  →  jq -r '.gateway.auth.token' /sandbox/.openclaw/openclaw.json`,
-    );
-    console.log(`               append  #token=<token>  to the URL locally if needed.`);
-  }
-  console.log(`  ${"─".repeat(50)}`);
-  console.log("");
-  console.log("  To change settings later:");
-  console.log(
-    `    Model:       ${cliName()} inference get\n                 ${cliName()} inference set --model <model> --provider <provider> --sandbox ${sandboxName}`,
-  );
-  console.log(`    Policies:    ${cliName()} ${sandboxName} policy-add`);
-  console.log(`    Credentials: ${cliName()} credentials reset <KEY>  then  ${cliName()} onboard`);
-  console.log("");
-}
-
-// Preserve the nullable contract end-to-end: `null` means "clear this
-// field on the persisted session", `undefined` means "leave unchanged".
-function toNullableString(value: string | null | undefined): string | null | undefined {
-  if (value === undefined) return undefined;
-  if (value === null) return null;
-  return value;
-}
-
-function toSessionUpdates(
-  updates: {
-    sandboxName?: string | null;
-    provider?: string | null;
-    model?: string | null;
-    endpointUrl?: string | null;
-    credentialEnv?: string | null;
-    hermesAuthMethod?: HermesAuthMethod | string | null;
-    preferredInferenceApi?: string | null;
-    nimContainer?: string | null;
-    webSearchConfig?: WebSearchConfig | null;
-    policyPresets?: string[] | null;
-    messagingChannels?: string[] | null;
-    messagingChannelConfig?: MessagingChannelConfig | null;
-    hermesToolGateways?: string[] | null;
-  } = {},
-): SessionUpdates {
-  const normalized: SessionUpdates = {};
-  if (updates.sandboxName !== undefined)
-    normalized.sandboxName = toNullableString(updates.sandboxName);
-  if (updates.provider !== undefined) normalized.provider = toNullableString(updates.provider);
-  if (updates.model !== undefined) normalized.model = toNullableString(updates.model);
-  if (updates.endpointUrl !== undefined)
-    normalized.endpointUrl = toNullableString(updates.endpointUrl);
-  if (updates.credentialEnv !== undefined)
-    normalized.credentialEnv = toNullableString(updates.credentialEnv);
-  if (updates.hermesAuthMethod !== undefined)
-    normalized.hermesAuthMethod = normalizeHermesAuthMethod(updates.hermesAuthMethod);
-  if (updates.preferredInferenceApi !== undefined) {
-    normalized.preferredInferenceApi = toNullableString(updates.preferredInferenceApi);
-  }
-  if (updates.nimContainer !== undefined)
-    normalized.nimContainer = toNullableString(updates.nimContainer);
-  if (updates.webSearchConfig !== undefined) normalized.webSearchConfig = updates.webSearchConfig;
-  if (updates.policyPresets !== undefined) normalized.policyPresets = updates.policyPresets;
-  if (updates.messagingChannels !== undefined)
-    normalized.messagingChannels = updates.messagingChannels;
-  if (updates.messagingChannelConfig !== undefined) {
-    normalized.messagingChannelConfig = updates.messagingChannelConfig;
-  }
-  if (updates.hermesToolGateways !== undefined)
-    normalized.hermesToolGateways = updates.hermesToolGateways;
-  return normalized;
-}
-
-function getOnboardRuntime(): import("./onboard/machine/runtime").OnboardRuntime {
-  if (!ONBOARD_RUNTIME) ONBOARD_RUNTIME = new OnboardRuntime();
-  return ONBOARD_RUNTIME;
-}
-
-async function startRecordedStep(
-  stepName: string,
-  updates: {
-    sandboxName?: string | null;
-    provider?: string | null;
-    model?: string | null;
-    policyPresets?: string[] | null;
-  } = {},
-): Promise<void> {
-  const runtime = getOnboardRuntime();
-  await runtime.markStepStarted(stepName);
-  if (Object.keys(updates).length > 0) {
-    await runtime.updateContext(toSessionUpdates(updates));
-  }
-  maybeForceE2eStepFailure(stepName);
-}
-
-async function recordStepComplete(
-  stepName: string,
-  updates: SessionUpdates = {},
-): Promise<Session> {
-  return getOnboardRuntime().markStepComplete(stepName, updates);
-}
-
-async function recordStepSkipped(stepName: string): Promise<Session> {
-  return getOnboardRuntime().markStepSkipped(stepName);
-}
-
-async function recordStateSkipped(
-  state: import("./onboard/machine/types").OnboardMachineState,
-  metadata: Record<string, unknown> | null = null,
-): Promise<Session> {
-  return getOnboardRuntime().markSkipped(state, metadata);
-}
+const {
+  buildChain,
+  buildControlUiUrls,
+  buildOrphanedSandboxRollbackMessage,
+  ensureDashboardForward,
+  ensureAgentDashboardForward,
+  fetchGatewayAuthTokenFromSandbox,
+  getDashboardForwardPort,
+  getDashboardForwardTarget,
+  getWslHostAddress,
+  printDashboard,
+  stopAllDashboardForwards,
+} = onboardDashboard.createOnboardDashboardHelpers({
+  runOpenshell,
+  runCaptureOpenshell,
+  runCapture,
+  cliName,
+  agentProductName,
+  getProviderLabel,
+  note,
+  isWsl,
+  redact,
+  sleep,
+  printAgentDashboardUi: agentOnboard.printDashboardUi,
+});
 
-async function recordRepairEvent(
-  type: "state.repair.started" | "state.repair.completed" | "state.repair.failed",
-  options: {
-    state?: import("./onboard/machine/types").OnboardMachineState | null;
-    error?: string | null;
-    metadata?: Record<string, unknown> | null;
-  } = {},
-): Promise<Session> {
-  return getOnboardRuntime().emitRepairEvent(type, options);
-}
+const onboardRuntimeBoundary = new OnboardRuntimeBoundary({
+  toSessionUpdates: (updates: Record<string, unknown>) =>
+    toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
+  maybeForceE2eStepFailure,
+});
 
-async function recordSessionComplete(updates: SessionUpdates = {}): Promise<Session> {
-  const runtime = getOnboardRuntime();
-  const current = await runtime.session();
-  if (current.machine.state === "finalizing") {
-    await runtime.transition("post_verify");
-    return runtime.complete(updates);
-  }
-  if (current.machine.state === "post_verify") {
-    return runtime.complete(updates);
-  }
-  return runtime.completeSession(updates);
-}
+const startRecordedStep = onboardRuntimeBoundary.startRecordedStep.bind(onboardRuntimeBoundary);
+const recordStepComplete = onboardRuntimeBoundary.recordStepComplete.bind(onboardRuntimeBoundary);
+const recordStepSkipped = onboardRuntimeBoundary.recordStepSkipped.bind(onboardRuntimeBoundary);
+const recordStepFailed = onboardRuntimeBoundary.recordStepFailed.bind(onboardRuntimeBoundary);
+const recordStateSkipped = onboardRuntimeBoundary.recordStateSkipped.bind(onboardRuntimeBoundary);
+const recordRepairEvent = onboardRuntimeBoundary.recordRepairEvent.bind(onboardRuntimeBoundary);
+const recordSessionComplete = onboardRuntimeBoundary.recordSessionComplete.bind(onboardRuntimeBoundary);
 
 const ONBOARD_STEP_INDEX: Record<string, { number: number; title: string }> = {
   preflight: { number: 1, title: "Preflight checks" },
@@ -9125,7 +8154,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
   RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1";
   AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1";
   _preflightDashboardPort = opts.controlUiPort || null;
-  ONBOARD_RUNTIME = new OnboardRuntime();
+  onboardRuntimeBoundary.reset();
   delete process.env.OPENSHELL_GATEWAY;
   const resume = opts.resume === true;
   const fresh = opts.fresh === true;
@@ -9708,8 +8737,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
           cleanupTempDir,
           startRecordedStep,
           recordStepComplete,
-          recordStepFailed: (stepName: string, message: string | null) =>
-            getOnboardRuntime().markStepFailed(stepName, message),
+          recordStepFailed,
           skippedStepMessage,
         }),
         ensureAgentDashboardForward,
@@ -9816,7 +8844,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
     });
   } finally {
     releaseOnboardLock();
-    ONBOARD_RUNTIME = null;
+    onboardRuntimeBoundary.clear();
   }
 }
 
diff --git a/src/lib/onboard/dashboard.ts b/src/lib/onboard/dashboard.ts
new file mode 100644
index 0000000000..118b90476a
--- /dev/null
+++ b/src/lib/onboard/dashboard.ts
@@ -0,0 +1,436 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+
+import type { AgentDefinition } from "../agent/defs";
+import { DASHBOARD_PORT } from "../core/ports";
+import { buildChain, buildControlUiUrls } from "../dashboard/contract";
+import * as nim from "../inference/nim";
+import { runCapture as defaultRunCapture } from "../runner";
+import * as dashboardAccess from "./dashboard-access";
+import {
+  findAvailableDashboardPort,
+  getOccupiedPorts,
+  isLiveForwardStatus,
+} from "./dashboard-port";
+import {
+  looksLikeForwardPortConflict,
+  runBackgroundForwardStartWithPortReleaseRetries,
+} from "./forward-start";
+
+const ANSI_RE = /\x1B(?:\[[0-?]*[ -/]*[@-~]|\][^\x07]*(?:\x07|\x1B\\)|[@-_])/g;
+export const CONTROL_UI_PORT = DASHBOARD_PORT;
+
+type CommandResult = { status: number | null };
+
+export interface OnboardDashboardDeps {
+  runOpenshell(args: string[], opts?: Record<string, unknown>): CommandResult;
+  runCaptureOpenshell(args: string[], opts?: Record<string, unknown>): string | null;
+  runCapture?: typeof defaultRunCapture;
+  cliName(): string;
+  agentProductName(): string;
+  getProviderLabel(provider: string): string;
+  note(message: string): void;
+  isWsl(): boolean;
+  redact(value: unknown): string;
+  sleep(seconds: number): void;
+  printAgentDashboardUi(
+    sandboxName: string,
+    token: string | null,
+    agent: AgentDefinition,
+    deps: {
+      note: (msg: string) => void;
+      buildControlUiUrls: (token: string | null, port: number) => string[];
+    },
+  ): void;
+}
+
+export interface OnboardDashboardHelpers {
+  buildChain: typeof buildChain;
+  buildControlUiUrls: typeof buildControlUiUrls;
+  buildOrphanedSandboxRollbackMessage(
+    sandboxName: string,
+    err: unknown,
+    deleteSucceeded: boolean,
+  ): string[];
+  ensureDashboardForward(
+    sandboxName: string,
+    chatUiUrl?: string,
+    options?: { rollbackSandboxOnFailure?: boolean },
+  ): number;
+  ensureAgentDashboardForward(
+    sandboxName: string,
+    agent: { forwardPort?: number | null },
+  ): number;
+  fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null;
+  getDashboardForwardPort(
+    chatUiUrl?: string,
+    options?: Parameters<typeof dashboardAccess.getDashboardForwardPort>[1],
+  ): string;
+  getDashboardForwardTarget(
+    chatUiUrl?: string,
+    options?: Parameters<typeof dashboardAccess.getDashboardForwardTarget>[1],
+  ): string;
+  getWslHostAddress(
+    options?: Parameters<typeof dashboardAccess.getWslHostAddress>[0],
+  ): string | null;
+  printDashboard(
+    sandboxName: string,
+    model: string,
+    provider: string,
+    nimContainer?: string | null,
+    agent?: AgentDefinition | null,
+  ): void;
+  stopAllDashboardForwards(): void;
+}
+
+function findForwardEntry(
+  forwardListOutput: string | null | undefined,
+  port: string,
+): { sandboxName: string; status: string } | null {
+  if (!forwardListOutput) return null;
+  for (const rawLine of forwardListOutput.split("\n")) {
+    const line = rawLine.replace(ANSI_RE, "");
+    if (/^\s*SANDBOX\s/i.test(line)) continue;
+    const parts = line.trim().split(/\s+/);
+    if (parts.length < 3 || parts[2] !== port) continue;
+    return {
+      sandboxName: parts[0] || "",
+      status: (parts[4] || "").toLowerCase(),
+    };
+  }
+  return null;
+}
+
+function getRunningForwardPorts(forwardListOutput: string | null | undefined): string[] {
+  const ports = new Set<string>();
+  if (!forwardListOutput) return [];
+  for (const rawLine of forwardListOutput.split("\n")) {
+    const line = rawLine.replace(ANSI_RE, "");
+    if (/^\s*SANDBOX\s/i.test(line)) continue;
+    const parts = line.trim().split(/\s+/);
+    if (parts.length < 5 || !/^\d+$/.test(parts[2])) continue;
+    const status = (parts[4] || "").toLowerCase();
+    if (isLiveForwardStatus(status)) {
+      ports.add(parts[2]);
+    }
+  }
+  return [...ports];
+}
+
+function findOpenclawJsonPath(dir: string): string | null {
+  if (!fs.existsSync(dir)) return null;
+  const entries = fs.readdirSync(dir, { withFileTypes: true });
+  for (const entry of entries) {
+    const entryPath = path.join(dir, entry.name);
+    if (entry.isDirectory()) {
+      const found: string | null = findOpenclawJsonPath(entryPath);
+      if (found) return found;
+    } else if (entry.name === "openclaw.json") {
+      return entryPath;
+    }
+  }
+  return null;
+}
+
+function dashboardUrlForDisplay(url: string, deps: OnboardDashboardDeps): string {
+  return dashboardAccess.dashboardUrlForDisplay(url, deps.redact);
+}
+
+export function createOnboardDashboardHelpers(deps: OnboardDashboardDeps): OnboardDashboardHelpers {
+  const runCapture = deps.runCapture ?? defaultRunCapture;
+
+  function getDashboardForwardPort(
+    chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`,
+    options: Parameters<typeof dashboardAccess.getDashboardForwardPort>[1] = {},
+  ): string {
+    return dashboardAccess.getDashboardForwardPort(chatUiUrl, {
+      ...options,
+      runCapture: options.runCapture || runCapture,
+    });
+  }
+
+  function getDashboardForwardTarget(
+    chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`,
+    options: Parameters<typeof dashboardAccess.getDashboardForwardTarget>[1] = {},
+  ): string {
+    return dashboardAccess.getDashboardForwardTarget(chatUiUrl, {
+      ...options,
+      runCapture: options.runCapture || runCapture,
+    });
+  }
+
+  function getWslHostAddress(
+    options: Parameters<typeof dashboardAccess.getWslHostAddress>[0] = {},
+  ): string | null {
+    return dashboardAccess.getWslHostAddress({ ...options, runCapture: options.runCapture || runCapture });
+  }
+
+  function stopAllDashboardForwards(): void {
+    const forwardList = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true });
+    for (const port of getRunningForwardPorts(forwardList)) {
+      deps.runOpenshell(["forward", "stop", port], { ignoreError: true });
+    }
+  }
+
+  function buildOrphanedSandboxRollbackMessage(
+    sandboxName: string,
+    err: unknown,
+    deleteSucceeded: boolean,
+  ): string[] {
+    const lines = [
+      "",
+      `  Could not allocate a dashboard port for '${sandboxName}'.`,
+      `  ${err instanceof Error ? err.message : String(err)}`,
+    ];
+    if (deleteSucceeded) {
+      lines.push("  The orphaned sandbox has been removed — you can safely retry.");
+    } else {
+      lines.push("  Could not remove the orphaned sandbox. Manual cleanup:");
+      lines.push(`    openshell sandbox delete "${sandboxName}"`);
+    }
+    return lines;
+  }
+
+  function rollbackSandboxAndExit(sandboxName: string, err: unknown): never {
+    const delResult = deps.runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
+    for (const line of buildOrphanedSandboxRollbackMessage(
+      sandboxName,
+      err,
+      delResult.status === 0,
+    )) {
+      console.error(line);
+    }
+    process.exit(1);
+  }
+
+  function ensureDashboardForward(
+    sandboxName: string,
+    chatUiUrl = `http://127.0.0.1:${CONTROL_UI_PORT}`,
+    options: { rollbackSandboxOnFailure?: boolean } = {},
+  ): number {
+    const { rollbackSandboxOnFailure = false } = options;
+    const preferredPort = Number(getDashboardForwardPort(chatUiUrl));
+    let existingForwards = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true });
+    const preferredEntry = findForwardEntry(existingForwards, String(preferredPort));
+    if (
+      preferredEntry &&
+      (preferredEntry.sandboxName === sandboxName || !isLiveForwardStatus(preferredEntry.status))
+    ) {
+      deps.runOpenshell(["forward", "stop", String(preferredPort)], { ignoreError: true });
+      existingForwards = deps.runCaptureOpenshell(["forward", "list"], { ignoreError: true });
+    }
+    let actualPort: number;
+    try {
+      actualPort = findAvailableDashboardPort(sandboxName, preferredPort, existingForwards);
+    } catch (err) {
+      if (!rollbackSandboxOnFailure) throw err;
+      rollbackSandboxAndExit(sandboxName, err);
+    }
+
+    if (actualPort !== preferredPort) {
+      if (rollbackSandboxOnFailure) {
+        const err = new Error(
+          `Dashboard port ${preferredPort} became host-bound during sandbox build; ` +
+            `cannot reallocate to ${actualPort} after the sandbox has been created with ` +
+            `CHAT_UI_URL=${preferredPort}. Free the port and re-run \`${deps.cliName()} onboard\`, ` +
+            `or pass \`--control-ui-port <N>\` to pick a different dashboard port.`,
+        );
+        rollbackSandboxAndExit(sandboxName, err);
+      }
+      console.warn(`  ! Port ${preferredPort} is taken. Using port ${actualPort} instead.`);
+    }
+
+    const occupied = getOccupiedPorts(existingForwards);
+    for (const [port, owner] of occupied.entries()) {
+      if (owner === sandboxName && Number(port) !== actualPort) {
+        deps.runOpenshell(["forward", "stop", port], { ignoreError: true });
+      }
+    }
+
+    const parsedUrl = new URL(chatUiUrl.includes("://") ? chatUiUrl : `http://${chatUiUrl}`);
+    parsedUrl.port = String(actualPort);
+    const actualTarget = getDashboardForwardTarget(parsedUrl.toString());
+    deps.runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true });
+    const { result: fwdResult, diagnostic: fwdDiagnostic } = runBackgroundForwardStartWithPortReleaseRetries(
+      (stdio, timeout) =>
+        deps.runOpenshell(
+          ["forward", "start", "--background", actualTarget, sandboxName],
+          { ignoreError: true, suppressOutput: true, stdio, timeout },
+        ),
+      () => {
+        deps.sleep(1);
+        deps.runOpenshell(["forward", "stop", String(actualPort)], { ignoreError: true });
+      },
+    );
+    if (fwdResult && fwdResult.status !== 0) {
+      const looksLikePortConflict = looksLikeForwardPortConflict(fwdDiagnostic);
+      if (rollbackSandboxOnFailure) {
+        const err = new Error(
+          looksLikePortConflict
+            ? `Failed to start dashboard forward on port ${actualPort} — the host port ` +
+                `is held by another process. Free it and run \`${deps.cliName()} onboard\` again, ` +
+                `or pass \`--control-ui-port <N>\` to pick a different dashboard port.`
+            : `Failed to start dashboard forward on port ${actualPort}: ${fwdDiagnostic.slice(0, 240)}`,
+        );
+        rollbackSandboxAndExit(sandboxName, err);
+      }
+      if (looksLikePortConflict) {
+        console.warn(
+          `! Port ${actualPort} forward did not start — port may be in use by another process.`,
+        );
+        console.warn(
+          `  Check: docker ps --format 'table {{.Names}}\\t{{.Ports}}' | grep ${actualPort}`,
+        );
+        console.warn(`  Free the port, then reconnect: ${deps.cliName()} ${sandboxName} connect`);
+      } else {
+        console.warn(`! Port ${actualPort} forward did not start: ${fwdDiagnostic.slice(0, 240)}`);
+        console.warn(`  Reconnect after resolving the issue: ${deps.cliName()} ${sandboxName} connect`);
+      }
+    }
+    return actualPort;
+  }
+
+  function ensureAgentDashboardForward(
+    sandboxName: string,
+    agent: { forwardPort?: number | null },
+  ): number {
+    const agentDashboardPort = agent.forwardPort ?? CONTROL_UI_PORT;
+    const agentDashboardUrl = `http://127.0.0.1:${agentDashboardPort}`;
+    const actualAgentDashboardPort = ensureDashboardForward(sandboxName, agentDashboardUrl);
+    process.env.CHAT_UI_URL = `http://127.0.0.1:${actualAgentDashboardPort}`;
+    return actualAgentDashboardPort;
+  }
+
+  function fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-token-"));
+    try {
+      const destDir = `${tmpDir}${path.sep}`;
+      const result = deps.runOpenshell(
+        ["sandbox", "download", sandboxName, "/sandbox/.openclaw/openclaw.json", destDir],
+        { ignoreError: true, stdio: ["ignore", "ignore", "ignore"] },
+      );
+      if (result.status !== 0) return null;
+      const jsonPath = findOpenclawJsonPath(tmpDir);
+      if (!jsonPath) return null;
+      const cfg = JSON.parse(fs.readFileSync(jsonPath, "utf-8"));
+      const token = cfg && cfg.gateway && cfg.gateway.auth && cfg.gateway.auth.token;
+      return typeof token === "string" && token.length > 0 ? token : null;
+    } catch {
+      return null;
+    } finally {
+      try {
+        fs.rmSync(tmpDir, { recursive: true, force: true });
+      } catch {
+        // ignore cleanup errors
+      }
+    }
+  }
+
+  function printDashboard(
+    sandboxName: string,
+    model: string,
+    provider: string,
+    nimContainer: string | null = null,
+    agent: AgentDefinition | null = null,
+  ): void {
+    const nimStat = nimContainer ? nim.nimStatusByName(nimContainer) : nim.nimStatus(sandboxName);
+    const showNim = nim.shouldShowNimLine(nimContainer, nimStat.running);
+    const nimLabel = nimStat.running ? "running" : "not running";
+    const providerLabel = deps.getProviderLabel(provider);
+    const token = fetchGatewayAuthTokenFromSandbox(sandboxName);
+    const chatUiUrl = process.env.CHAT_UI_URL || `http://127.0.0.1:${CONTROL_UI_PORT}`;
+    const wslAddr = getWslHostAddress();
+    const chain = buildChain({ chatUiUrl, isWsl: deps.isWsl(), wslHostAddress: wslAddr });
+
+    const dashboardAccessEntries = buildControlUiUrls(token, chain.port, chain.accessUrl).map((url, index) => ({
+      label: index === 0 ? "Dashboard" : `Alt ${index}`,
+      url,
+    }));
+    if (wslAddr) {
+      const wslUrl = `http://${wslAddr}:${chain.port}/${token ? `#token=${encodeURIComponent(token)}` : ""}`;
+      const existing = dashboardAccessEntries.find((entry) => entry.url === wslUrl);
+      if (existing) existing.label = "VS Code/WSL";
+      else dashboardAccessEntries.push({ label: "VS Code/WSL", url: wslUrl });
+    }
+    const guidanceLines = [`Port ${chain.port} must be forwarded before opening these URLs.`];
+    if (deps.isWsl()) {
+      guidanceLines.push(
+        "WSL detected: if localhost fails in Windows, use the WSL host IP shown by `hostname -I`.",
+      );
+    }
+    if (dashboardAccessEntries.length === 0) guidanceLines.push("No dashboard URLs were generated.");
+
+    console.log("");
+    console.log(`  ${"─".repeat(50)}`);
+    console.log(`  Sandbox      ${sandboxName} (Landlock + seccomp + netns)`);
+    console.log(`  Model        ${model} (${providerLabel})`);
+    if (showNim) {
+      console.log(`  NIM          ${nimLabel}`);
+    }
+    console.log(`  ${"─".repeat(50)}`);
+    console.log(`  Run:         ${deps.cliName()} ${sandboxName} connect`);
+    console.log(`  Status:      ${deps.cliName()} ${sandboxName} status`);
+    console.log(`  Logs:        ${deps.cliName()} ${sandboxName} logs --follow`);
+    console.log("");
+    if (agent) {
+      deps.printAgentDashboardUi(sandboxName, token, agent, {
+        note: deps.note,
+        buildControlUiUrls: (tokenValue: string | null, port: number) => {
+          return buildControlUiUrls(tokenValue, port, chain.accessUrl);
+        },
+      });
+    } else if (token) {
+      console.log(
+        `  ${deps.agentProductName()} UI (auth token redacted from displayed URLs)`,
+      );
+      for (const line of guidanceLines) {
+        console.log(`  ${line}`);
+      }
+      for (const entry of dashboardAccessEntries) {
+        console.log(`  ${entry.label}: ${dashboardUrlForDisplay(entry.url, deps)}`);
+      }
+      console.log(`  Token:       ${deps.cliName()} ${sandboxName} gateway-token --quiet`);
+      console.log("               append  #token=<token> locally if the browser asks for auth.");
+    } else {
+      deps.note("  Could not read gateway token from the sandbox (download failed).");
+      console.log(`  ${deps.agentProductName()} UI`);
+      for (const line of guidanceLines) {
+        console.log(`  ${line}`);
+      }
+      for (const entry of dashboardAccessEntries) {
+        console.log(`  ${entry.label}: ${dashboardUrlForDisplay(entry.url, deps)}`);
+      }
+      console.log(
+        `  Token:       ${deps.cliName()} ${sandboxName} connect  →  jq -r '.gateway.auth.token' /sandbox/.openclaw/openclaw.json`,
+      );
+      console.log("               append  #token=<token>  to the URL locally if needed.");
+    }
+    console.log(`  ${"─".repeat(50)}`);
+    console.log("");
+    console.log("  To change settings later:");
+    console.log(
+      `    Model:       ${deps.cliName()} inference get\n                 ${deps.cliName()} inference set --model <model> --provider <provider> --sandbox ${sandboxName}`,
+    );
+    console.log(`    Policies:    ${deps.cliName()} ${sandboxName} policy-add`);
+    console.log(`    Credentials: ${deps.cliName()} credentials reset <KEY>  then  ${deps.cliName()} onboard`);
+    console.log("");
+  }
+
+  return {
+    buildChain,
+    buildControlUiUrls,
+    buildOrphanedSandboxRollbackMessage,
+    ensureDashboardForward,
+    ensureAgentDashboardForward,
+    fetchGatewayAuthTokenFromSandbox,
+    getDashboardForwardPort,
+    getDashboardForwardTarget,
+    getWslHostAddress,
+    printDashboard,
+    stopAllDashboardForwards,
+  };
+}
diff --git a/src/lib/onboard/model-router.ts b/src/lib/onboard/model-router.ts
new file mode 100644
index 0000000000..81ca0d10d7
--- /dev/null
+++ b/src/lib/onboard/model-router.ts
@@ -0,0 +1,522 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { spawn, spawnSync } from "node:child_process";
+import crypto from "node:crypto";
+import fs from "node:fs";
+import http from "node:http";
+import os from "node:os";
+import path from "node:path";
+
+import {
+  normalizeCredentialValue,
+  resolveProviderCredential,
+  saveCredential,
+} from "../credentials/store";
+import { ROOT, run, runCapture } from "../runner";
+import { hashCredential } from "../security/credential-hash";
+import type { Session } from "../state/onboard-session";
+import * as onboardSession from "../state/onboard-session";
+import { buildSubprocessEnv } from "../subprocess-env";
+import { hydrateCredentialEnv } from "./credential-env";
+import { prepareModelRouterVenv } from "./model-router-python";
+
+const ROUTER_HEALTH_RETRIES = 15;
+const ROUTER_HEALTH_INTERVAL_MS = 2000;
+const ROUTER_HEALTH_TIMEOUT_MS = 3000;
+const MODEL_ROUTER_RELATIVE_DIR = path.join("nemoclaw-blueprint", "router", "llm-router");
+const MODEL_ROUTER_VENV_DIR = path.join(os.homedir(), ".nemoclaw", "model-router-venv");
+const MODEL_ROUTER_FINGERPRINT_FILE = ".nemoclaw-source-fingerprint";
+const MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES = new Set([
+  ".git",
+  ".hg",
+  ".mypy_cache",
+  ".pytest_cache",
+  ".ruff_cache",
+  ".svn",
+  ".venv",
+  "__pycache__",
+  "build",
+  "dist",
+  "node_modules",
+  "venv",
+]);
+export const DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV = "NVIDIA_API_KEY";
+
+export type BlueprintRouterConfig = {
+  enabled?: boolean;
+  port?: number;
+  pool_config_path?: string;
+  credential_env?: string;
+};
+
+export type BlueprintInferenceProfile = {
+  provider_name?: string;
+  endpoint?: string;
+  model: string;
+  credential_env?: string;
+  credential_default?: string;
+  router: BlueprintRouterConfig;
+};
+
+function requireValue<T>(value: T | null | undefined, message: string): T {
+  if (value === null || value === undefined) {
+    throw new Error(message);
+  }
+  return value;
+}
+
+/**
+ * Load a named inference profile and router config from blueprint.yaml.
+ * Returns null if the blueprint or profile is missing.
+ */
+export function loadBlueprintProfile(
+  profileName: string,
+  rootDir: string = ROOT,
+): BlueprintInferenceProfile | null {
+  try {
+    const YAML = require("yaml");
+    const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml");
+    if (!fs.existsSync(blueprintPath)) return null;
+    const raw = fs.readFileSync(blueprintPath, "utf8");
+    const parsed = YAML.parse(raw);
+    const profile = parsed?.components?.inference?.profiles?.[profileName];
+    if (!profile) return null;
+    const router = { ...(parsed?.components?.router || {}) };
+    if (typeof profile.credential_env === "string" && profile.credential_env.trim().length > 0) {
+      router.credential_env = profile.credential_env;
+    }
+    return { ...profile, router } as BlueprintInferenceProfile;
+  } catch {
+    return null;
+  }
+}
+
+async function isRouterHealthy(port: number, timeoutMs = ROUTER_HEALTH_TIMEOUT_MS): Promise<boolean> {
+  return new Promise<boolean>((resolve) => {
+    let settled = false;
+    const settle = (healthy: boolean) => {
+      if (settled) return;
+      settled = true;
+      resolve(healthy);
+    };
+    const request = http
+      .get(`http://127.0.0.1:${port}/health`, (res: http.IncomingMessage) => {
+        res.resume();
+        settle((res.statusCode || 0) >= 200 && (res.statusCode || 0) < 300);
+      })
+      .on("error", () => settle(false));
+    request.setTimeout(timeoutMs, () => {
+      request.destroy();
+      settle(false);
+    });
+  });
+}
+
+function isProcessRunning(pid: number | null | undefined): boolean {
+  if (!Number.isInteger(pid) || Number(pid) <= 0) return false;
+  try {
+    process.kill(Number(pid), 0);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+async function stopModelRouterProcess(pid: number, port: number): Promise<void> {
+  try {
+    process.kill(pid, "SIGTERM");
+  } catch {
+    return;
+  }
+  for (let attempt = 0; attempt < 10; attempt++) {
+    await new Promise((resolve) => setTimeout(resolve, 500));
+    if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return;
+  }
+  try {
+    process.kill(pid, "SIGKILL");
+  } catch {
+    // already stopped
+  }
+  for (let attempt = 0; attempt < 5; attempt++) {
+    await new Promise((resolve) => setTimeout(resolve, 500));
+    if (!isProcessRunning(pid) && !(await isRouterHealthy(port, 1000))) return;
+  }
+}
+
+function resolveHostCommandPath(commandName: string): string | null {
+  const result = runCapture(["sh", "-c", 'command -v "$1"', "--", commandName], {
+    ignoreError: true,
+  }).trim();
+  return result || null;
+}
+
+function modelRouterPackageDir(): string {
+  return path.join(ROOT, MODEL_ROUTER_RELATIVE_DIR);
+}
+
+function modelRouterVenvDir(): string {
+  return process.env.NEMOCLAW_MODEL_ROUTER_VENV || MODEL_ROUTER_VENV_DIR;
+}
+
+function modelRouterCommandPath(venvDir = modelRouterVenvDir()): string {
+  return path.join(venvDir, "bin", "model-router");
+}
+
+function modelRouterFingerprintPath(venvDir = modelRouterVenvDir()): string {
+  return path.join(venvDir, MODEL_ROUTER_FINGERPRINT_FILE);
+}
+
+function isExecutableFile(filePath: string): boolean {
+  try {
+    fs.accessSync(filePath, fs.constants.X_OK);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+function isModelRouterPackageReady(routerDir = modelRouterPackageDir()): boolean {
+  return fs.existsSync(path.join(routerDir, "pyproject.toml")) ||
+    fs.existsSync(path.join(routerDir, "setup.py"));
+}
+
+function shouldSkipModelRouterFingerprintEntry(name: string): boolean {
+  return MODEL_ROUTER_FINGERPRINT_IGNORED_NAMES.has(name) || name.endsWith(".egg-info");
+}
+
+function hashModelRouterSourceTree(routerDir = modelRouterPackageDir()): string | null {
+  const sourceHash = crypto.createHash("sha256");
+
+  const hashDirectory = (currentDir: string): boolean => {
+    let entries: fs.Dirent[];
+    try {
+      entries = fs
+        .readdirSync(currentDir, { withFileTypes: true })
+        .sort((left: fs.Dirent, right: fs.Dirent) => left.name.localeCompare(right.name));
+    } catch {
+      return false;
+    }
+
+    let hashedSourceFile = false;
+    for (const entry of entries) {
+      if (shouldSkipModelRouterFingerprintEntry(entry.name)) continue;
+      if (entry.name.endsWith(".pyc") || entry.name.endsWith(".pyo")) continue;
+
+      const entryPath = path.join(currentDir, entry.name);
+      const relativePath = path.relative(routerDir, entryPath).split(path.sep).join("/");
+      if (entry.isDirectory()) {
+        hashedSourceFile = hashDirectory(entryPath) || hashedSourceFile;
+        continue;
+      }
+      if (entry.isSymbolicLink()) {
+        try {
+          sourceHash.update(`link:${relativePath}\0`);
+          sourceHash.update(fs.readlinkSync(entryPath));
+          sourceHash.update("\0");
+          hashedSourceFile = true;
+        } catch {
+          // Ignore unreadable links; the install step will fail if they are required.
+        }
+        continue;
+      }
+      if (!entry.isFile()) continue;
+      sourceHash.update(`file:${relativePath}\0`);
+      sourceHash.update(fs.readFileSync(entryPath));
+      sourceHash.update("\0");
+      hashedSourceFile = true;
+    }
+    return hashedSourceFile;
+  };
+
+  return hashDirectory(routerDir) ? `files:${sourceHash.digest("hex")}` : null;
+}
+
+function getModelRouterSourceFingerprint(routerDir = modelRouterPackageDir()): string | null {
+  const gitHead = runCapture(["git", "-C", routerDir, "rev-parse", "HEAD"], {
+    ignoreError: true,
+  }).trim();
+  if (/^[0-9a-f]{40}$/i.test(gitHead)) return `git:${gitHead}`;
+
+  const gitLink = runCapture(["git", "-C", ROOT, "rev-parse", `HEAD:${MODEL_ROUTER_RELATIVE_DIR}`], {
+    ignoreError: true,
+  }).trim();
+  if (/^[0-9a-f]{40}$/i.test(gitLink)) return `gitlink:${gitLink}`;
+
+  return hashModelRouterSourceTree(routerDir);
+}
+
+function readModelRouterInstalledFingerprint(venvDir = modelRouterVenvDir()): string | null {
+  try {
+    const fingerprint = fs.readFileSync(modelRouterFingerprintPath(venvDir), "utf8").trim();
+    return fingerprint || null;
+  } catch {
+    return null;
+  }
+}
+
+function writeModelRouterInstalledFingerprint(
+  fingerprint: string | null,
+  venvDir = modelRouterVenvDir(),
+): void {
+  if (!fingerprint) return;
+  fs.writeFileSync(modelRouterFingerprintPath(venvDir), `${fingerprint}\n`, { mode: 0o600 });
+}
+
+function isManagedModelRouterCurrent(
+  routerDir = modelRouterPackageDir(),
+  venvDir = modelRouterVenvDir(),
+): boolean {
+  if (!isExecutableFile(modelRouterCommandPath(venvDir))) return false;
+  const sourceFingerprint = getModelRouterSourceFingerprint(routerDir);
+  return Boolean(
+    sourceFingerprint && readModelRouterInstalledFingerprint(venvDir) === sourceFingerprint,
+  );
+}
+
+function initializeModelRouterSubmodule(routerDir = modelRouterPackageDir()): void {
+  if (isModelRouterPackageReady(routerDir)) return;
+  if (!fs.existsSync(path.join(ROOT, ".gitmodules")) || !fs.existsSync(path.join(ROOT, ".git"))) {
+    return;
+  }
+  console.log("  Initializing Model Router source...");
+  run(["git", "-C", ROOT, "submodule", "update", "--init", "--depth", "1", MODEL_ROUTER_RELATIVE_DIR], {
+    ignoreError: true,
+  });
+}
+
+function installModelRouterCommand(routerDir = modelRouterPackageDir()): string {
+  initializeModelRouterSubmodule(routerDir);
+  if (!isModelRouterPackageReady(routerDir)) {
+    throw new Error(
+      `Model Router source is not initialized at ${routerDir}. ` +
+        `Run: git -C ${ROOT} submodule update --init --depth 1 ${MODEL_ROUTER_RELATIVE_DIR}`,
+    );
+  }
+
+  const venvDir = modelRouterVenvDir();
+  const routerCommand = modelRouterCommandPath(venvDir);
+  const sourceFingerprint = getModelRouterSourceFingerprint(routerDir);
+  const allowReplaceExistingVenv =
+    path.resolve(venvDir) === path.resolve(MODEL_ROUTER_VENV_DIR) ||
+    readModelRouterInstalledFingerprint(venvDir) !== null;
+  const venvPython = prepareModelRouterVenv({
+    venvDir,
+    allowReplaceExisting: allowReplaceExistingVenv,
+  });
+
+  const installResult = run(
+    [venvPython, "-m", "pip", "install", "--quiet", "--upgrade", `${routerDir}[prefill,proxy]`],
+    {
+      ignoreError: true,
+      timeout: 600_000,
+    },
+  );
+  if (installResult.status !== 0) {
+    throw new Error("Failed to install Model Router dependencies.");
+  }
+  if (!isExecutableFile(routerCommand)) {
+    throw new Error("Model Router install did not produce the model-router command.");
+  }
+  writeModelRouterInstalledFingerprint(sourceFingerprint, venvDir);
+  return routerCommand;
+}
+
+function ensureModelRouterCommand(): string {
+  const routerDir = modelRouterPackageDir();
+  const venvDir = modelRouterVenvDir();
+  const managedCommand = modelRouterCommandPath(venvDir);
+
+  if (isModelRouterPackageReady(routerDir) && isManagedModelRouterCurrent(routerDir, venvDir)) {
+    return managedCommand;
+  }
+
+  if (!isModelRouterPackageReady(routerDir)) {
+    initializeModelRouterSubmodule(routerDir);
+  }
+
+  if (isModelRouterPackageReady(routerDir)) {
+    if (isManagedModelRouterCurrent(routerDir, venvDir)) return managedCommand;
+    return installModelRouterCommand(routerDir);
+  }
+
+  if (isExecutableFile(managedCommand)) return managedCommand;
+  return resolveHostCommandPath("model-router") || installModelRouterCommand();
+}
+
+/**
+ * Start the model-router proxy and wait for it to become healthy.
+ * Follows the same pattern as Ollama startup (spawn detached, poll health).
+ * Returns the PID of the child process.
+ */
+async function startModelRouter(routerCfg: BlueprintRouterConfig): Promise<number> {
+  const routerCommand = ensureModelRouterCommand();
+  const port = routerCfg.port || 4000;
+  const blueprintDir = path.join(ROOT, "nemoclaw-blueprint");
+  const poolConfigPath = path.join(
+    blueprintDir,
+    routerCfg.pool_config_path || "router/pool-config.yaml",
+  );
+  const stateDir = path.join(os.homedir(), ".nemoclaw", "state");
+  const litellmConfigPath = path.join(stateDir, "litellm-proxy.yaml");
+
+  fs.mkdirSync(stateDir, { recursive: true });
+
+  const proxyConfigResult = spawnSync(
+    routerCommand,
+    ["proxy-config", "--config", poolConfigPath, "--output", litellmConfigPath],
+    { encoding: "utf8", timeout: 30_000, cwd: blueprintDir },
+  );
+  if (proxyConfigResult.status !== 0) {
+    throw new Error(
+      `model-router proxy-config failed: ${proxyConfigResult.stderr || proxyConfigResult.error || "unknown error"}`,
+    );
+  }
+
+  const credEnvVars: Record<string, string> = {};
+  const credName = routerCfg.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV;
+  const routedCredential = resolveProviderCredential(credName);
+  const openAiCredential = resolveProviderCredential("OPENAI_API_KEY");
+  if (routedCredential) {
+    credEnvVars[credName] = routedCredential;
+    if (!openAiCredential) credEnvVars.OPENAI_API_KEY = routedCredential;
+  }
+  if (openAiCredential) credEnvVars.OPENAI_API_KEY = openAiCredential;
+  const _providerKey = (process.env.NEMOCLAW_PROVIDER_KEY || "").trim();
+  if (_providerKey) {
+    if (!credEnvVars[credName]) credEnvVars[credName] = _providerKey;
+    if (!credEnvVars.OPENAI_API_KEY) credEnvVars.OPENAI_API_KEY = _providerKey;
+  }
+
+  if (await isRouterHealthy(port)) {
+    throw new Error(
+      `Port ${port} already has a healthy router endpoint; refusing to start a second router.`,
+    );
+  }
+
+  const child = spawn(
+    routerCommand,
+    [
+      "proxy",
+      "--litellm-config", litellmConfigPath,
+      "--router-config", poolConfigPath,
+      "--host", "0.0.0.0",
+      "--port", String(port),
+    ],
+    {
+      detached: true,
+      stdio: "ignore",
+      cwd: blueprintDir,
+      env: buildSubprocessEnv(credEnvVars),
+    },
+  );
+  let childExited = false;
+  let childExitDetail = "";
+  child.once("error", (err: Error) => {
+    childExited = true;
+    childExitDetail = `child failed to start: ${err.message}`;
+  });
+  child.once("exit", (code: number | null, signal: string | null) => {
+    childExited = true;
+    if (!childExitDetail) {
+      childExitDetail = `child exited with code ${code ?? "null"}${signal ? ` signal ${signal}` : ""}`;
+    }
+  });
+  child.unref();
+
+  const pid = child.pid;
+  if (!pid) {
+    throw new Error(
+      "Failed to start model-router proxy: no PID returned" +
+        (childExitDetail ? ` (${childExitDetail})` : ""),
+    );
+  }
+
+  for (let attempt = 0; attempt < ROUTER_HEALTH_RETRIES; attempt++) {
+    await new Promise((resolve) => setTimeout(resolve, ROUTER_HEALTH_INTERVAL_MS));
+    if (childExited) break;
+    const healthy = await isRouterHealthy(port);
+    let processAlive = true;
+    try {
+      process.kill(pid, 0);
+    } catch {
+      processAlive = false;
+    }
+    if (healthy && processAlive) return pid;
+    if (!processAlive) {
+      childExited = true;
+      if (!childExitDetail) childExitDetail = "child process is no longer running";
+      break;
+    }
+  }
+  try {
+    process.kill(pid, "SIGTERM");
+  } catch {
+    // already dead
+  }
+  throw new Error(
+    `Model router failed to become healthy on port ${port} after ${ROUTER_HEALTH_RETRIES} attempts` +
+      (childExitDetail ? ` (${childExitDetail})` : ""),
+  );
+}
+
+function getRoutedProfile(): BlueprintInferenceProfile {
+  const bp = loadBlueprintProfile("routed");
+  if (!bp || bp.router?.enabled !== true) {
+    throw new Error("Router is not enabled in nemoclaw-blueprint/blueprint.yaml.");
+  }
+  return bp;
+}
+
+export function isRoutedInferenceProvider(provider: string | null | undefined): boolean {
+  if (!provider) return false;
+  if (provider === "nvidia-router") return true;
+  const bp = loadBlueprintProfile("routed");
+  return Boolean(bp?.provider_name && provider === bp.provider_name);
+}
+
+export async function reconcileModelRouter(): Promise<void> {
+  const bp = getRoutedProfile();
+  const routerPort = bp.router.port || 4000;
+  const routerCredentialEnv =
+    bp.router.credential_env || bp.credential_env || DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV;
+  const routerCredential =
+    hydrateCredentialEnv(routerCredentialEnv) ||
+    normalizeCredentialValue(bp.credential_default || "");
+  if (!routerCredential) {
+    throw new Error(`${routerCredentialEnv} is required to start Model Router.`);
+  }
+  saveCredential(routerCredentialEnv, routerCredential);
+  const routerCredentialHash = hashCredential(routerCredential);
+  const session = onboardSession.loadSession();
+  const recordedPid = session?.routerPid ?? null;
+  const recordedCredentialHash = session?.routerCredentialHash ?? null;
+
+  if (await isRouterHealthy(routerPort)) {
+    if (
+      routerCredentialHash &&
+      recordedCredentialHash === routerCredentialHash &&
+      isProcessRunning(recordedPid)
+    ) {
+      console.log(`  ✓ Model router is already healthy on port ${routerPort}`);
+      return;
+    }
+    if (isProcessRunning(recordedPid)) {
+      console.log("  Restarting model router with updated credentials...");
+      await stopModelRouterProcess(requireValue(recordedPid, "Expected recorded router PID"), routerPort);
+    } else {
+      throw new Error(
+        `Port ${routerPort} already has a healthy router endpoint, but its credential state is unknown. Stop the existing model-router process and rerun onboarding.`,
+      );
+    }
+  }
+
+  console.log("  Starting model router...");
+  const routerPid = await startModelRouter(bp.router);
+  console.log(`  ✓ Model router started (PID ${routerPid}) on port ${routerPort}`);
+  onboardSession.updateSession((current: Session) => {
+    current.routerPid = routerPid;
+    current.routerCredentialHash = routerCredentialHash;
+    return current;
+  });
+}
diff --git a/src/lib/onboard/runtime-boundary.ts b/src/lib/onboard/runtime-boundary.ts
new file mode 100644
index 0000000000..be9cc339d8
--- /dev/null
+++ b/src/lib/onboard/runtime-boundary.ts
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session, SessionUpdates } from "../state/onboard-session";
+import { OnboardRuntime } from "./machine/runtime";
+import type { OnboardMachineState } from "./machine/types";
+
+export interface OnboardRuntimeBoundaryOptions {
+  toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
+  maybeForceE2eStepFailure(stepName: string): void;
+}
+
+export class OnboardRuntimeBoundary {
+  private runtime: OnboardRuntime | null = null;
+
+  constructor(private readonly options: OnboardRuntimeBoundaryOptions) {}
+
+  reset(): void {
+    this.runtime = new OnboardRuntime();
+  }
+
+  clear(): void {
+    this.runtime = null;
+  }
+
+  getRuntime(): OnboardRuntime {
+    if (!this.runtime) this.runtime = new OnboardRuntime();
+    return this.runtime;
+  }
+
+  async startRecordedStep(
+    stepName: string,
+    updates: {
+      sandboxName?: string | null;
+      provider?: string | null;
+      model?: string | null;
+      policyPresets?: string[] | null;
+    } = {},
+  ): Promise<void> {
+    const runtime = this.getRuntime();
+    await runtime.markStepStarted(stepName);
+    if (Object.keys(updates).length > 0) {
+      await runtime.updateContext(this.options.toSessionUpdates(updates));
+    }
+    this.options.maybeForceE2eStepFailure(stepName);
+  }
+
+  async recordStepComplete(
+    stepName: string,
+    updates: SessionUpdates = {},
+  ): Promise<Session> {
+    return this.getRuntime().markStepComplete(stepName, updates);
+  }
+
+  async recordStepSkipped(stepName: string): Promise<Session> {
+    return this.getRuntime().markStepSkipped(stepName);
+  }
+
+  async recordStepFailed(stepName: string, message: string | null): Promise<Session> {
+    return this.getRuntime().markStepFailed(stepName, message);
+  }
+
+  async recordStateSkipped(
+    state: OnboardMachineState,
+    metadata: Record<string, unknown> | null = null,
+  ): Promise<Session> {
+    return this.getRuntime().markSkipped(state, metadata);
+  }
+
+  async recordRepairEvent(
+    type: "state.repair.started" | "state.repair.completed" | "state.repair.failed",
+    options: {
+      state?: OnboardMachineState | null;
+      error?: string | null;
+      metadata?: Record<string, unknown> | null;
+    } = {},
+  ): Promise<Session> {
+    return this.getRuntime().emitRepairEvent(type, options);
+  }
+
+  async recordSessionComplete(updates: SessionUpdates = {}): Promise<Session> {
+    const runtime = this.getRuntime();
+    const current = await runtime.session();
+    if (current.machine.state === "finalizing") {
+      await runtime.transition("post_verify");
+      return runtime.complete(updates);
+    }
+    if (current.machine.state === "post_verify") {
+      return runtime.complete(updates);
+    }
+    return runtime.completeSession(updates);
+  }
+}
diff --git a/src/lib/onboard/session-updates.ts b/src/lib/onboard/session-updates.ts
new file mode 100644
index 0000000000..529d22e531
--- /dev/null
+++ b/src/lib/onboard/session-updates.ts
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { WebSearchConfig } from "../inference/web-search";
+import type { MessagingChannelConfig } from "../messaging-channel-config";
+import type { HermesAuthMethod, SessionUpdates } from "../state/onboard-session";
+
+export interface OnboardSessionUpdateInput {
+  sandboxName?: string | null;
+  provider?: string | null;
+  model?: string | null;
+  endpointUrl?: string | null;
+  credentialEnv?: string | null;
+  hermesAuthMethod?: HermesAuthMethod | string | null;
+  preferredInferenceApi?: string | null;
+  nimContainer?: string | null;
+  webSearchConfig?: WebSearchConfig | null;
+  policyPresets?: string[] | null;
+  messagingChannels?: string[] | null;
+  messagingChannelConfig?: MessagingChannelConfig | null;
+  hermesToolGateways?: string[] | null;
+}
+
+// Preserve the nullable contract end-to-end: `null` means "clear this
+// field on the persisted session", `undefined` means "leave unchanged".
+function toNullableString(value: string | null | undefined): string | null | undefined {
+  if (value === undefined) return undefined;
+  if (value === null) return null;
+  return value;
+}
+
+function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null {
+  return value === "oauth" || value === "api_key" ? value : null;
+}
+
+export function toSessionUpdates(updates: OnboardSessionUpdateInput = {}): SessionUpdates {
+  const normalized: SessionUpdates = {};
+  if (updates.sandboxName !== undefined)
+    normalized.sandboxName = toNullableString(updates.sandboxName);
+  if (updates.provider !== undefined) normalized.provider = toNullableString(updates.provider);
+  if (updates.model !== undefined) normalized.model = toNullableString(updates.model);
+  if (updates.endpointUrl !== undefined)
+    normalized.endpointUrl = toNullableString(updates.endpointUrl);
+  if (updates.credentialEnv !== undefined)
+    normalized.credentialEnv = toNullableString(updates.credentialEnv);
+  if (updates.hermesAuthMethod !== undefined)
+    normalized.hermesAuthMethod = normalizeHermesAuthMethod(updates.hermesAuthMethod);
+  if (updates.preferredInferenceApi !== undefined) {
+    normalized.preferredInferenceApi = toNullableString(updates.preferredInferenceApi);
+  }
+  if (updates.nimContainer !== undefined)
+    normalized.nimContainer = toNullableString(updates.nimContainer);
+  if (updates.webSearchConfig !== undefined) normalized.webSearchConfig = updates.webSearchConfig;
+  if (updates.policyPresets !== undefined) normalized.policyPresets = updates.policyPresets;
+  if (updates.messagingChannels !== undefined)
+    normalized.messagingChannels = updates.messagingChannels;
+  if (updates.messagingChannelConfig !== undefined) {
+    normalized.messagingChannelConfig = updates.messagingChannelConfig;
+  }
+  if (updates.hermesToolGateways !== undefined)
+    normalized.hermesToolGateways = updates.hermesToolGateways;
+  return normalized;
+}

From ce1a645958538eeb5fdc765a216b74b6e825f911 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:08:46 -0700
Subject: [PATCH 18/54] refactor(cli): extract sandbox agent helpers

---
 src/lib/onboard.ts               | 114 ++++---------------------------
 src/lib/onboard/sandbox-agent.ts | 107 +++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 101 deletions(-)
 create mode 100644 src/lib/onboard/sandbox-agent.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index ad23d5a06e..7a38fe9682 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -276,6 +276,19 @@ const { resolveSandboxImageTagFromCreateOutput } =
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
 const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates");
+const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent");
+const {
+  RESERVED_SANDBOX_NAMES,
+  formatSandboxAgentName,
+  getAgentInferenceProviderOptions,
+  getDefaultSandboxNameForAgent,
+  getEffectiveSandboxAgent,
+  getRequestedSandboxAgentName,
+  getSandboxAgentDrift,
+  getSandboxAgentRegistryFields,
+  getSandboxPromptDefault,
+  normalizeSandboxAgentName,
+} = sandboxAgent;
 const modelRouter: typeof import("./onboard/model-router") = require("./onboard/model-router");
 const {
   DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV,
@@ -3956,107 +3969,6 @@ async function recoverGatewayRuntime() {
 
 // ── Step 3: Sandbox ──────────────────────────────────────────────
 
-// Names that collide with CLI command namespaces. A sandbox named 'status'
-// makes 'nemoclaw status connect' route to the global status command
-// instead of the sandbox, and a sandbox named 'sandbox' collides with the
-// oclif-native `nemoclaw sandbox ...` command namespace. Reject these wherever
-// a sandbox name enters the system (interactive prompt, --name flag,
-// NEMOCLAW_SANDBOX_NAME).
-const RESERVED_SANDBOX_NAMES = new Set([
-  "onboard",
-  "list",
-  "deploy",
-  "setup",
-  "setup-spark",
-  "start",
-  "stop",
-  "status",
-  "debug",
-  "uninstall",
-  "update",
-  "credentials",
-  "help",
-  "sandbox",
-]);
-
-function normalizeSandboxAgentName(agentName: string | null | undefined): string {
-  const trimmed = typeof agentName === "string" ? agentName.trim() : "";
-  return trimmed && trimmed !== "openclaw" ? trimmed : "openclaw";
-}
-
-const UNKNOWN_SANDBOX_AGENT_NAME = "unknown";
-
-function getRequestedSandboxAgentName(agent: AgentDefinition | null | undefined): string {
-  return normalizeSandboxAgentName(agent?.name);
-}
-
-function formatSandboxAgentName(agentName: string | null | undefined): string {
-  const normalized = normalizeSandboxAgentName(agentName);
-  if (normalized === "openclaw") return "OpenClaw";
-  if (normalized === "hermes") return "Hermes";
-  return normalized;
-}
-
-function getDefaultSandboxNameForAgent(agent: AgentDefinition | null | undefined): string {
-  return getRequestedSandboxAgentName(agent) === "hermes" ? "hermes" : "my-assistant";
-}
-
-function getSandboxPromptDefault(agent: AgentDefinition | null | undefined): string {
-  const envName = (process.env.NEMOCLAW_SANDBOX_NAME || "").trim().toLowerCase();
-  const agentDefault = getDefaultSandboxNameForAgent(agent);
-  if (!envName) return agentDefault;
-  try {
-    return validateName(envName, "sandbox name");
-  } catch {
-    return agentDefault;
-  }
-}
-
-function getEffectiveSandboxAgent(agent: AgentDefinition | null | undefined): AgentDefinition {
-  return agent || agentDefs.loadAgent("openclaw");
-}
-
-function getAgentInferenceProviderOptions(agent: AgentDefinition | null | undefined): string[] {
-  const effectiveAgent = agent?.name
-    ? agentDefs.loadAgent(agent.name)
-    : getEffectiveSandboxAgent(agent);
-  return Array.isArray(effectiveAgent.inferenceProviderOptions)
-    ? effectiveAgent.inferenceProviderOptions
-    : [];
-}
-
-function getSandboxAgentRegistryFields(
-  agent: AgentDefinition | null | undefined,
-  agentVersionKnown = true,
-): Pick<SandboxEntry, "agent" | "agentVersion"> {
-  const effectiveAgent = getEffectiveSandboxAgent(agent);
-  const agentName = normalizeSandboxAgentName(effectiveAgent.name);
-  return {
-    agent: agentName === "openclaw" ? null : agentName,
-    agentVersion: agentVersionKnown ? effectiveAgent.expectedVersion || null : null,
-  };
-}
-
-function getSandboxAgentDrift(
-  sandboxName: string,
-  requestedAgentName: string,
-): { changed: boolean; existingAgentName: string; requestedAgentName: string } {
-  const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName);
-  if (!existingEntry) {
-    return {
-      changed: true,
-      existingAgentName: UNKNOWN_SANDBOX_AGENT_NAME,
-      requestedAgentName,
-    };
-  }
-  const existingAgentName = normalizeSandboxAgentName(existingEntry?.agent);
-  return {
-    changed: existingAgentName !== requestedAgentName,
-    existingAgentName,
-    requestedAgentName,
-  };
-}
-
 function getSandboxRuntimeRegistryFields(
   config: SandboxGpuConfig,
 ): Pick<
diff --git a/src/lib/onboard/sandbox-agent.ts b/src/lib/onboard/sandbox-agent.ts
new file mode 100644
index 0000000000..c17b9de0b2
--- /dev/null
+++ b/src/lib/onboard/sandbox-agent.ts
@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { AgentDefinition } from "../agent/defs";
+import { loadAgent } from "../agent/defs";
+import { validateName } from "../runner";
+import type { SandboxEntry } from "../state/registry";
+import * as registry from "../state/registry";
+
+// Names that collide with CLI command namespaces. A sandbox named 'status'
+// makes 'nemoclaw status connect' route to the global status command
+// instead of the sandbox, and a sandbox named 'sandbox' collides with the
+// oclif-native `nemoclaw sandbox ...` command namespace. Reject these wherever
+// a sandbox name enters the system (interactive prompt, --name flag,
+// NEMOCLAW_SANDBOX_NAME).
+export const RESERVED_SANDBOX_NAMES = new Set([
+  "onboard",
+  "list",
+  "deploy",
+  "setup",
+  "setup-spark",
+  "start",
+  "stop",
+  "status",
+  "debug",
+  "uninstall",
+  "update",
+  "credentials",
+  "help",
+  "sandbox",
+]);
+
+export const UNKNOWN_SANDBOX_AGENT_NAME = "unknown";
+
+export function normalizeSandboxAgentName(agentName: string | null | undefined): string {
+  const trimmed = typeof agentName === "string" ? agentName.trim() : "";
+  return trimmed && trimmed !== "openclaw" ? trimmed : "openclaw";
+}
+
+export function getRequestedSandboxAgentName(agent: AgentDefinition | null | undefined): string {
+  return normalizeSandboxAgentName(agent?.name);
+}
+
+export function formatSandboxAgentName(agentName: string | null | undefined): string {
+  const normalized = normalizeSandboxAgentName(agentName);
+  if (normalized === "openclaw") return "OpenClaw";
+  if (normalized === "hermes") return "Hermes";
+  return normalized;
+}
+
+export function getDefaultSandboxNameForAgent(agent: AgentDefinition | null | undefined): string {
+  return getRequestedSandboxAgentName(agent) === "hermes" ? "hermes" : "my-assistant";
+}
+
+export function getSandboxPromptDefault(agent: AgentDefinition | null | undefined): string {
+  const envName = (process.env.NEMOCLAW_SANDBOX_NAME || "").trim().toLowerCase();
+  const agentDefault = getDefaultSandboxNameForAgent(agent);
+  if (!envName) return agentDefault;
+  try {
+    return validateName(envName, "sandbox name");
+  } catch {
+    return agentDefault;
+  }
+}
+
+export function getEffectiveSandboxAgent(agent: AgentDefinition | null | undefined): AgentDefinition {
+  return agent || loadAgent("openclaw");
+}
+
+export function getAgentInferenceProviderOptions(agent: AgentDefinition | null | undefined): string[] {
+  const effectiveAgent = agent?.name ? loadAgent(agent.name) : getEffectiveSandboxAgent(agent);
+  return Array.isArray(effectiveAgent.inferenceProviderOptions)
+    ? effectiveAgent.inferenceProviderOptions
+    : [];
+}
+
+export function getSandboxAgentRegistryFields(
+  agent: AgentDefinition | null | undefined,
+  agentVersionKnown = true,
+): Pick<SandboxEntry, "agent" | "agentVersion"> {
+  const effectiveAgent = getEffectiveSandboxAgent(agent);
+  const agentName = normalizeSandboxAgentName(effectiveAgent.name);
+  return {
+    agent: agentName === "openclaw" ? null : agentName,
+    agentVersion: agentVersionKnown ? effectiveAgent.expectedVersion || null : null,
+  };
+}
+
+export function getSandboxAgentDrift(
+  sandboxName: string,
+  requestedAgentName: string,
+): { changed: boolean; existingAgentName: string; requestedAgentName: string } {
+  const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName);
+  if (!existingEntry) {
+    return {
+      changed: true,
+      existingAgentName: UNKNOWN_SANDBOX_AGENT_NAME,
+      requestedAgentName,
+    };
+  }
+  const existingAgentName = normalizeSandboxAgentName(existingEntry?.agent);
+  return {
+    changed: existingAgentName !== requestedAgentName,
+    existingAgentName,
+    requestedAgentName,
+  };
+}

From 7a07d8c2d78084a09e4b2a8d1ce3ad885bf5e53e Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:11:16 -0700
Subject: [PATCH 19/54] refactor(cli): extract messaging config helpers

---
 src/lib/onboard.ts                  | 40 ++++-----------------------
 src/lib/onboard/messaging-config.ts | 43 +++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 34 deletions(-)
 create mode 100644 src/lib/onboard/messaging-config.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 7a38fe9682..0966904510 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -276,6 +276,12 @@ const { resolveSandboxImageTagFromCreateOutput } =
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
 const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates");
+const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config");
+const {
+  getStoredMessagingChannelConfig,
+  messagingChannelConfigsEqual,
+  persistMessagingChannelConfigToSession,
+} = messagingConfig;
 const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent");
 const {
   RESERVED_SANDBOX_NAMES,
@@ -375,9 +381,7 @@ import type { WebSearchConfig } from "./inference/web-search";
 import {
   hydrateMessagingChannelConfig,
   type MessagingChannelConfig,
-  mergeMessagingChannelConfigs,
   readMessagingChannelConfigFromEnv,
-  sanitizeMessagingChannelConfig,
 } from "./messaging-channel-config";
 import { streamGatewayStart } from "./onboard/gateway";
 import {
@@ -7183,38 +7187,6 @@ async function setupInference(
 
 const MESSAGING_CHANNELS = listChannels();
 
-function getStoredMessagingChannelConfig(
-  sandboxName: string | null,
-  session: Session | null,
-): MessagingChannelConfig | null {
-  const registryConfig = sandboxName
-    ? sanitizeMessagingChannelConfig(registry.getSandbox(sandboxName)?.messagingChannelConfig)
-    : null;
-  const sessionMatchesSandbox =
-    !session?.sandboxName || !sandboxName || session.sandboxName === sandboxName;
-  const sessionConfig = sessionMatchesSandbox
-    ? sanitizeMessagingChannelConfig(session?.messagingChannelConfig)
-    : null;
-  return mergeMessagingChannelConfigs(registryConfig, sessionConfig);
-}
-
-function persistMessagingChannelConfigToSession(config: MessagingChannelConfig | null): void {
-  onboardSession.updateSession((current: Session) => {
-    current.messagingChannelConfig = config;
-    return current;
-  });
-}
-
-function messagingChannelConfigsEqual(
-  left: MessagingChannelConfig | null,
-  right: MessagingChannelConfig | null,
-): boolean {
-  const leftKeys = Object.keys(left || {}).sort();
-  const rightKeys = Object.keys(right || {}).sort();
-  if (leftKeys.length !== rightKeys.length) return false;
-  return leftKeys.every((key, index) => key === rightKeys[index] && left?.[key] === right?.[key]);
-}
-
 // Curl exit codes that indicate a network-level failure (not a token problem).
 // 35 (TLS handshake failure) covers corporate proxies that MITM HTTPS.
 const TELEGRAM_NETWORK_CURL_CODES = new Set([6, 7, 28, 35, 52, 56]);
diff --git a/src/lib/onboard/messaging-config.ts b/src/lib/onboard/messaging-config.ts
new file mode 100644
index 0000000000..eefea7e901
--- /dev/null
+++ b/src/lib/onboard/messaging-config.ts
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import {
+  type MessagingChannelConfig,
+  mergeMessagingChannelConfigs,
+  sanitizeMessagingChannelConfig,
+} from "../messaging-channel-config";
+import type { Session } from "../state/onboard-session";
+import * as onboardSession from "../state/onboard-session";
+import * as registry from "../state/registry";
+
+export function getStoredMessagingChannelConfig(
+  sandboxName: string | null,
+  session: Session | null,
+): MessagingChannelConfig | null {
+  const registryConfig = sandboxName
+    ? sanitizeMessagingChannelConfig(registry.getSandbox(sandboxName)?.messagingChannelConfig)
+    : null;
+  const sessionMatchesSandbox =
+    !session?.sandboxName || !sandboxName || session.sandboxName === sandboxName;
+  const sessionConfig = sessionMatchesSandbox
+    ? sanitizeMessagingChannelConfig(session?.messagingChannelConfig)
+    : null;
+  return mergeMessagingChannelConfigs(registryConfig, sessionConfig);
+}
+
+export function persistMessagingChannelConfigToSession(config: MessagingChannelConfig | null): void {
+  onboardSession.updateSession((current: Session) => {
+    current.messagingChannelConfig = config;
+    return current;
+  });
+}
+
+export function messagingChannelConfigsEqual(
+  left: MessagingChannelConfig | null,
+  right: MessagingChannelConfig | null,
+): boolean {
+  const leftKeys = Object.keys(left || {}).sort();
+  const rightKeys = Object.keys(right || {}).sort();
+  if (leftKeys.length !== rightKeys.length) return false;
+  return leftKeys.every((key, index) => key === rightKeys[index] && left?.[key] === right?.[key]);
+}

From 9d928911a81301459c5d15b9df0e331a94ef5a1a Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:14:10 -0700
Subject: [PATCH 20/54] refactor(cli): extract resume conflict helpers

---
 src/lib/onboard.ts               | 120 ++--------------------------
 src/lib/onboard/resume-config.ts | 133 +++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+), 112 deletions(-)
 create mode 100644 src/lib/onboard/resume-config.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 0966904510..9ac8aa477c 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -275,6 +275,14 @@ const { resolveSandboxImageTagFromCreateOutput } =
   require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag");
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
+const resumeConfig: typeof import("./onboard/resume-config") = require("./onboard/resume-config");
+const {
+  getRequestedModelHint,
+  getRequestedProviderHint,
+  getRequestedSandboxNameHint,
+  getResumeConfigConflicts,
+  getResumeSandboxConflict,
+} = resumeConfig;
 const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates");
 const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config");
 const {
@@ -1873,118 +1881,6 @@ const {
 
 const ollamaModelSize: typeof import("./inference/ollama/model-size") = require("./inference/ollama/model-size");
 
-function getRequestedSandboxNameHint(opts: { sandboxName?: string | null } = {}): string | null {
-  const raw =
-    typeof opts.sandboxName === "string" && opts.sandboxName.length > 0
-      ? opts.sandboxName
-      : process.env.NEMOCLAW_SANDBOX_NAME;
-  if (typeof raw !== "string") return null;
-  const normalized = raw.trim().toLowerCase();
-  return normalized || null;
-}
-
-function getResumeSandboxConflict(
-  session: Session | null,
-  opts: { sandboxName?: string | null } = {},
-) {
-  // Use opts.sandboxName as the sole source — the caller has already
-  // resolved it (--name first, NEMOCLAW_SANDBOX_NAME only when prompting
-  // is impossible). Falling back to the env var here would fire spurious
-  // conflicts for interactive resume runs whose shell happens to export
-  // NEMOCLAW_SANDBOX_NAME but which never actually consult it.
-  // #2753: only treat session.sandboxName as a conflict source if the
-  // sandbox step actually completed. A pre-fix incomplete session would
-  // otherwise reject a legitimate `--resume --name <new>` that the user
-  // is supplying precisely to recover from the phantom.
-  const raw = typeof opts.sandboxName === "string" ? opts.sandboxName.trim().toLowerCase() : "";
-  const requestedSandboxName = raw || null;
-  const recordedSandboxName =
-    session?.steps?.sandbox?.status === "complete" ? session?.sandboxName ?? null : null;
-  if (!requestedSandboxName || !recordedSandboxName) {
-    return null;
-  }
-  return requestedSandboxName !== recordedSandboxName
-    ? { requestedSandboxName, recordedSandboxName }
-    : null;
-}
-
-// Provider hint wrappers — supply isNonInteractive() default, delegate to onboard-providers.
-function getRequestedProviderHint(nonInteractive = isNonInteractive()) {
-  return onboardProviders.getRequestedProviderHint(nonInteractive);
-}
-function getRequestedModelHint(nonInteractive = isNonInteractive()) {
-  return onboardProviders.getRequestedModelHint(nonInteractive);
-}
-
-function getResumeConfigConflicts(
-  session: Session | null,
-  opts: {
-    nonInteractive?: boolean;
-    fromDockerfile?: string | null;
-    sandboxName?: string | null;
-    agent?: string | null;
-  } = {},
-) {
-  const conflicts = [];
-  const nonInteractive = opts.nonInteractive ?? isNonInteractive();
-
-  const sandboxConflict = getResumeSandboxConflict(session, { sandboxName: opts.sandboxName });
-  if (sandboxConflict) {
-    conflicts.push({
-      field: "sandbox",
-      requested: sandboxConflict.requestedSandboxName,
-      recorded: sandboxConflict.recordedSandboxName,
-    });
-  }
-
-  const requestedProvider = getRequestedProviderHint(nonInteractive);
-  const effectiveRequestedProvider = getEffectiveProviderName(requestedProvider);
-  if (
-    effectiveRequestedProvider &&
-    session?.provider &&
-    effectiveRequestedProvider !== session.provider
-  ) {
-    conflicts.push({
-      field: "provider",
-      requested: effectiveRequestedProvider,
-      recorded: session.provider,
-    });
-  }
-
-  const requestedModel = getRequestedModelHint(nonInteractive);
-  if (requestedModel && session?.model && requestedModel !== session.model) {
-    conflicts.push({
-      field: "model",
-      requested: requestedModel,
-      recorded: session.model,
-    });
-  }
-
-  const requestedFrom = opts.fromDockerfile ? path.resolve(opts.fromDockerfile) : null;
-  const recordedFrom = session?.metadata?.fromDockerfile
-    ? path.resolve(session.metadata.fromDockerfile)
-    : null;
-  if (requestedFrom !== recordedFrom) {
-    conflicts.push({
-      field: "fromDockerfile",
-      requested: requestedFrom,
-      recorded: recordedFrom,
-    });
-  }
-
-  const requestedAgent = opts.agent || process.env.NEMOCLAW_AGENT || null;
-  const recordedAgent = session?.agent || null;
-  if (requestedAgent && recordedAgent && requestedAgent !== recordedAgent) {
-    conflicts.push({
-      field: "agent",
-      requested: requestedAgent,
-      recorded: recordedAgent,
-    });
-  }
-
-  return conflicts;
-}
-
 function printRemediationActions(
   actions: Array<{ title: string; reason: string; commands?: string[] }> | null | undefined,
 ): void {
diff --git a/src/lib/onboard/resume-config.ts b/src/lib/onboard/resume-config.ts
new file mode 100644
index 0000000000..f745517d66
--- /dev/null
+++ b/src/lib/onboard/resume-config.ts
@@ -0,0 +1,133 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import path from "node:path";
+
+const onboardProviders = require("./providers");
+
+export interface ResumeSessionLike {
+  sandboxName?: string | null;
+  provider?: string | null;
+  model?: string | null;
+  agent?: string | null;
+  metadata?: { fromDockerfile?: string | null } | null;
+  steps?: { sandbox?: { status?: string | null } | null } | null;
+}
+
+export interface ResumeConfigConflict {
+  field: string;
+  requested: string | null;
+  recorded: string | null;
+}
+
+export function getRequestedSandboxNameHint(opts: { sandboxName?: string | null } = {}): string | null {
+  const raw =
+    typeof opts.sandboxName === "string" && opts.sandboxName.length > 0
+      ? opts.sandboxName
+      : process.env.NEMOCLAW_SANDBOX_NAME;
+  if (typeof raw !== "string") return null;
+  const normalized = raw.trim().toLowerCase();
+  return normalized || null;
+}
+
+export function getResumeSandboxConflict(
+  session: ResumeSessionLike | null,
+  opts: { sandboxName?: string | null } = {},
+): { requestedSandboxName: string; recordedSandboxName: string } | null {
+  // Use opts.sandboxName as the sole source — the caller has already
+  // resolved it (--name first, NEMOCLAW_SANDBOX_NAME only when prompting
+  // is impossible). Falling back to the env var here would fire spurious
+  // conflicts for interactive resume runs whose shell happens to export
+  // NEMOCLAW_SANDBOX_NAME but which never actually consult it.
+  // #2753: only treat session.sandboxName as a conflict source if the
+  // sandbox step actually completed. A pre-fix incomplete session would
+  // otherwise reject a legitimate `--resume --name <new>` that the user
+  // is supplying precisely to recover from the phantom.
+  const raw = typeof opts.sandboxName === "string" ? opts.sandboxName.trim().toLowerCase() : "";
+  const requestedSandboxName = raw || null;
+  const recordedSandboxName =
+    session?.steps?.sandbox?.status === "complete" ? session?.sandboxName ?? null : null;
+  if (!requestedSandboxName || !recordedSandboxName) {
+    return null;
+  }
+  return requestedSandboxName !== recordedSandboxName
+    ? { requestedSandboxName, recordedSandboxName }
+    : null;
+}
+
+export function getRequestedProviderHint(nonInteractive = false): string | null {
+  return onboardProviders.getRequestedProviderHint(nonInteractive);
+}
+
+export function getRequestedModelHint(nonInteractive = false): string | null {
+  return onboardProviders.getRequestedModelHint(nonInteractive);
+}
+
+export function getResumeConfigConflicts(
+  session: ResumeSessionLike | null,
+  opts: {
+    nonInteractive?: boolean;
+    fromDockerfile?: string | null;
+    sandboxName?: string | null;
+    agent?: string | null;
+  } = {},
+): ResumeConfigConflict[] {
+  const conflicts: ResumeConfigConflict[] = [];
+  const nonInteractive = opts.nonInteractive ?? false;
+
+  const sandboxConflict = getResumeSandboxConflict(session, { sandboxName: opts.sandboxName });
+  if (sandboxConflict) {
+    conflicts.push({
+      field: "sandbox",
+      requested: sandboxConflict.requestedSandboxName,
+      recorded: sandboxConflict.recordedSandboxName,
+    });
+  }
+
+  const requestedProvider = getRequestedProviderHint(nonInteractive);
+  const effectiveRequestedProvider = onboardProviders.getEffectiveProviderName(requestedProvider);
+  if (
+    effectiveRequestedProvider &&
+    session?.provider &&
+    effectiveRequestedProvider !== session.provider
+  ) {
+    conflicts.push({
+      field: "provider",
+      requested: effectiveRequestedProvider,
+      recorded: session.provider,
+    });
+  }
+
+  const requestedModel = getRequestedModelHint(nonInteractive);
+  if (requestedModel && session?.model && requestedModel !== session.model) {
+    conflicts.push({
+      field: "model",
+      requested: requestedModel,
+      recorded: session.model,
+    });
+  }
+
+  const requestedFrom = opts.fromDockerfile ? path.resolve(opts.fromDockerfile) : null;
+  const recordedFrom = session?.metadata?.fromDockerfile
+    ? path.resolve(session.metadata.fromDockerfile)
+    : null;
+  if (requestedFrom !== recordedFrom) {
+    conflicts.push({
+      field: "fromDockerfile",
+      requested: requestedFrom,
+      recorded: recordedFrom,
+    });
+  }
+
+  const requestedAgent = opts.agent || process.env.NEMOCLAW_AGENT || null;
+  const recordedAgent = session?.agent || null;
+  if (requestedAgent && recordedAgent && requestedAgent !== recordedAgent) {
+    conflicts.push({
+      field: "agent",
+      requested: requestedAgent,
+      recorded: recordedAgent,
+    });
+  }
+
+  return conflicts;
+}

From df8a52e2755039429c2eb1503a0d3ae3050b5e3e Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:23:22 -0700
Subject: [PATCH 21/54] refactor(cli): extract openshell version helpers

---
 src/lib/onboard.ts                   | 107 +++------------------------
 src/lib/onboard/openshell-version.ts | 104 ++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 96 deletions(-)
 create mode 100644 src/lib/onboard/openshell-version.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 9ac8aa477c..65f7864eb2 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -283,6 +283,17 @@ const {
   getResumeConfigConflicts,
   getResumeSandboxConflict,
 } = resumeConfig;
+const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version");
+const {
+  getBlueprintMaxOpenshellVersion,
+  getBlueprintMinOpenshellVersion,
+  getInstalledOpenshellVersion,
+  getOpenshellChannel,
+  isOpenshellDevVersion,
+  shouldAllowOpenshellAboveBlueprintMax,
+  shouldUseOpenshellDevChannel,
+  versionGte,
+} = openshellVersion;
 const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates");
 const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config");
 const {
@@ -660,102 +671,6 @@ function step(n: number, total: number, msg: string): void {
   console.log(`  ${"─".repeat(50)}`);
 }
 
-function getInstalledOpenshellVersion(versionOutput: string | null = null): string | null {
-  const openshellBin = resolveOpenshell();
-  if (!versionOutput && !openshellBin) return null;
-  const output = String(
-    versionOutput ?? runCapture([openshellBin, "-V"], { ignoreError: true }),
-  ).trim();
-  const match = output.match(/openshell\s+([0-9]+\.[0-9]+\.[0-9]+)/i);
-  if (match) return match[1];
-  return null;
-}
-
-/**
- * Compare two semver-like x.y.z strings. Returns true iff `left >= right`.
- * Non-numeric or missing components are treated as 0.
- */
-function versionGte(left = "0.0.0", right = "0.0.0"): boolean {
-  const lhs = String(left)
-    .split(".")
-    .map((part) => Number.parseInt(part, 10) || 0);
-  const rhs = String(right)
-    .split(".")
-    .map((part) => Number.parseInt(part, 10) || 0);
-  const length = Math.max(lhs.length, rhs.length);
-  for (let index = 0; index < length; index += 1) {
-    const a = lhs[index] || 0;
-    const b = rhs[index] || 0;
-    if (a > b) return true;
-    if (a < b) return false;
-  }
-  return true;
-}
-
-/**
- * Read a semver field from nemoclaw-blueprint/blueprint.yaml. Returns null if
- * the blueprint or field is missing or unparseable — callers must treat null
- * as "no constraint configured" so a malformed install does not become a hard
- * onboard blocker. See #1317.
- */
-function getBlueprintVersionField(field: string, rootDir = ROOT): string | null {
-  try {
-    // Lazy require: yaml is already a dependency via the policy helpers but
-    // pulling it at module load would slow down `nemoclaw --help` for users
-    // who never reach the preflight path.
-    const YAML = require("yaml");
-    const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml");
-    if (!fs.existsSync(blueprintPath)) return null;
-    const raw = fs.readFileSync(blueprintPath, "utf8");
-    const parsed = YAML.parse(raw);
-    const value = parsed && parsed[field];
-    if (typeof value !== "string") return null;
-    const trimmed = value.trim();
-    if (!/^[0-9]+\.[0-9]+\.[0-9]+/.test(trimmed)) return null;
-    return trimmed;
-  } catch {
-    return null;
-  }
-}
-
-function getBlueprintMinOpenshellVersion(rootDir = ROOT): string | null {
-  return getBlueprintVersionField("min_openshell_version", rootDir);
-}
-
-function getBlueprintMaxOpenshellVersion(rootDir = ROOT): string | null {
-  return getBlueprintVersionField("max_openshell_version", rootDir);
-}
-
-type OpenshellChannel = "stable" | "dev" | "auto";
-
-function getOpenshellChannel(env: NodeJS.ProcessEnv = process.env): OpenshellChannel {
-  const raw = String(env.NEMOCLAW_OPENSHELL_CHANNEL || "auto")
-    .trim()
-    .toLowerCase();
-  if (raw === "stable" || raw === "dev" || raw === "auto") return raw;
-  return "auto";
-}
-
-function shouldUseOpenshellDevChannel(
-  _platform: NodeJS.Platform = process.platform,
-  env: NodeJS.ProcessEnv = process.env,
-): boolean {
-  const channel = getOpenshellChannel(env);
-  return channel === "dev";
-}
-
-function isOpenshellDevVersion(versionOutput: string | null | undefined): boolean {
-  return /\bdev[0-9.]*/i.test(String(versionOutput || ""));
-}
-
-function shouldAllowOpenshellAboveBlueprintMax(
-  versionOutput: string | null | undefined,
-  platform: NodeJS.Platform = process.platform,
-  env: NodeJS.ProcessEnv = process.env,
-): boolean {
-  return shouldUseOpenshellDevChannel(platform, env) && isOpenshellDevVersion(versionOutput);
-}
-
 function resolveSandboxGpuFlagFromOptions(
   opts: Pick<OnboardOptions, "sandboxGpu" | "gpu" | "noGpu">,
 ): SandboxGpuFlag {
diff --git a/src/lib/onboard/openshell-version.ts b/src/lib/onboard/openshell-version.ts
new file mode 100644
index 0000000000..e45a279130
--- /dev/null
+++ b/src/lib/onboard/openshell-version.ts
@@ -0,0 +1,104 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import fs from "node:fs";
+import path from "node:path";
+
+import { resolveOpenshell } from "../adapters/openshell/resolve";
+import { ROOT, runCapture } from "../runner";
+
+export function getInstalledOpenshellVersion(versionOutput: string | null = null): string | null {
+  const openshellBin = resolveOpenshell();
+  if (!versionOutput && !openshellBin) return null;
+  const output = String(
+    versionOutput ?? runCapture([openshellBin as string, "-V"], { ignoreError: true }),
+  ).trim();
+  const match = output.match(/openshell\s+([0-9]+\.[0-9]+\.[0-9]+)/i);
+  if (match) return match[1];
+  return null;
+}
+
+/**
+ * Compare two semver-like x.y.z strings. Returns true iff `left >= right`.
+ * Non-numeric or missing components are treated as 0.
+ */
+export function versionGte(left = "0.0.0", right = "0.0.0"): boolean {
+  const lhs = String(left)
+    .split(".")
+    .map((part) => Number.parseInt(part, 10) || 0);
+  const rhs = String(right)
+    .split(".")
+    .map((part) => Number.parseInt(part, 10) || 0);
+  const length = Math.max(lhs.length, rhs.length);
+  for (let index = 0; index < length; index += 1) {
+    const a = lhs[index] || 0;
+    const b = rhs[index] || 0;
+    if (a > b) return true;
+    if (a < b) return false;
+  }
+  return true;
+}
+
+/**
+ * Read a semver field from nemoclaw-blueprint/blueprint.yaml. Returns null if
+ * the blueprint or field is missing or unparseable — callers must treat null
+ * as "no constraint configured" so a malformed install does not become a hard
+ * onboard blocker. See #1317.
+ */
+function getBlueprintVersionField(field: string, rootDir = ROOT): string | null {
+  try {
+    // Lazy require: yaml is already a dependency via the policy helpers but
+    // pulling it at module load would slow down `nemoclaw --help` for users
+    // who never reach the preflight path.
+    const YAML = require("yaml");
+    const blueprintPath = path.join(rootDir, "nemoclaw-blueprint", "blueprint.yaml");
+    if (!fs.existsSync(blueprintPath)) return null;
+    const raw = fs.readFileSync(blueprintPath, "utf8");
+    const parsed = YAML.parse(raw);
+    const value = parsed && parsed[field];
+    if (typeof value !== "string") return null;
+    const trimmed = value.trim();
+    if (!/^[0-9]+\.[0-9]+\.[0-9]+/.test(trimmed)) return null;
+    return trimmed;
+  } catch {
+    return null;
+  }
+}
+
+export function getBlueprintMinOpenshellVersion(rootDir = ROOT): string | null {
+  return getBlueprintVersionField("min_openshell_version", rootDir);
+}
+
+export function getBlueprintMaxOpenshellVersion(rootDir = ROOT): string | null {
+  return getBlueprintVersionField("max_openshell_version", rootDir);
+}
+
+export type OpenshellChannel = "stable" | "dev" | "auto";
+
+export function getOpenshellChannel(env: NodeJS.ProcessEnv = process.env): OpenshellChannel {
+  const raw = String(env.NEMOCLAW_OPENSHELL_CHANNEL || "auto")
+    .trim()
+    .toLowerCase();
+  if (raw === "stable" || raw === "dev" || raw === "auto") return raw;
+  return "auto";
+}
+
+export function shouldUseOpenshellDevChannel(
+  _platform: NodeJS.Platform = process.platform,
+  env: NodeJS.ProcessEnv = process.env,
+): boolean {
+  const channel = getOpenshellChannel(env);
+  return channel === "dev";
+}
+
+export function isOpenshellDevVersion(versionOutput: string | null | undefined): boolean {
+  return /\bdev[0-9.]*/i.test(String(versionOutput || ""));
+}
+
+export function shouldAllowOpenshellAboveBlueprintMax(
+  versionOutput: string | null | undefined,
+  platform: NodeJS.Platform = process.platform,
+  env: NodeJS.ProcessEnv = process.env,
+): boolean {
+  return shouldUseOpenshellDevChannel(platform, env) && isOpenshellDevVersion(versionOutput);
+}

From fcb3e36284a0091bd493153a013a44c3833d9fb4 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:25:48 -0700
Subject: [PATCH 22/54] refactor(cli): extract known hosts pruning

---
 src/lib/onboard.ts             | 17 +----------------
 src/lib/onboard/known-hosts.ts | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 16 deletions(-)
 create mode 100644 src/lib/onboard/known-hosts.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 65f7864eb2..b071bd462f 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -283,6 +283,7 @@ const {
   getResumeConfigConflicts,
   getResumeSandboxConflict,
 } = resumeConfig;
+const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts");
 const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version");
 const {
   getBlueprintMaxOpenshellVersion,
@@ -632,22 +633,6 @@ function selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): Gat
   return refreshed;
 }
 
-/**
- * Remove known_hosts lines whose host field contains an openshell-* entry.
- * Preserves blank lines and comments. Returns the cleaned string.
- */
-function pruneKnownHostsEntries(contents: string): string {
-  return contents
-    .split("\n")
-    .filter((l) => {
-      const trimmed = l.trim();
-      if (!trimmed || trimmed.startsWith("#")) return true;
-      const hostField = trimmed.split(/\s+/)[0];
-      return !hostField.split(",").some((h) => h.startsWith("openshell-"));
-    })
-    .join("\n");
-}
-
 function getSandboxReuseState(sandboxName: string | null) {
   if (!sandboxName) return "missing";
   const getOutput = runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true });
diff --git a/src/lib/onboard/known-hosts.ts b/src/lib/onboard/known-hosts.ts
new file mode 100644
index 0000000000..0b3a4cb6ac
--- /dev/null
+++ b/src/lib/onboard/known-hosts.ts
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Remove known_hosts lines whose host field contains an openshell-* entry.
+ * Preserves blank lines and comments. Returns the cleaned string.
+ */
+export function pruneKnownHostsEntries(contents: string): string {
+  return contents
+    .split("\n")
+    .filter((line) => {
+      const trimmed = line.trim();
+      if (!trimmed || trimmed.startsWith("#")) return true;
+      const hostField = trimmed.split(/\s+/)[0];
+      return !hostField.split(",").some((host) => host.startsWith("openshell-"));
+    })
+    .join("\n");
+}

From cdd19fd8a5fa052974ae0bcbb7425faf79a3f957 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:28:46 -0700
Subject: [PATCH 23/54] refactor(cli): extract gateway reuse helpers

---
 src/lib/onboard.ts               | 54 ++++--------------------
 src/lib/onboard/gateway-reuse.ts | 71 ++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 47 deletions(-)
 create mode 100644 src/lib/onboard/gateway-reuse.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index b071bd462f..20002151b8 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -296,6 +296,7 @@ const {
   versionGte,
 } = openshellVersion;
 const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates");
+const gatewayReuse: typeof import("./onboard/gateway-reuse") = require("./onboard/gateway-reuse");
 const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config");
 const {
   getStoredMessagingChannelConfig,
@@ -581,57 +582,16 @@ const {
   isSelectedGateway,
   isGatewayHealthy,
   getGatewayReuseState,
-  shouldSelectNamedGatewayForReuse,
   getSandboxStateFromOutputs,
 } = gatewayState;
 
-type GatewayReuseSnapshot = {
-  gatewayStatus: string;
-  gwInfo: string;
-  activeGatewayInfo: string;
-  gatewayReuseState: ReturnType<typeof getGatewayReuseState>;
-};
-
-function getGatewayReuseSnapshot(): GatewayReuseSnapshot {
-  const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true });
-  const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], {
-    ignoreError: true,
-  });
-  const activeGatewayInfo = runCaptureOpenshell(["gateway", "info"], { ignoreError: true });
-  return {
-    gatewayStatus,
-    gwInfo,
-    activeGatewayInfo,
-    gatewayReuseState: getGatewayReuseState(gatewayStatus, gwInfo, activeGatewayInfo),
-  };
-}
-
-function selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): GatewayReuseSnapshot {
-  if (
-    !shouldSelectNamedGatewayForReuse(
-      snapshot.gatewayStatus,
-      snapshot.gwInfo,
-      snapshot.activeGatewayInfo,
-    )
-  ) {
-    return snapshot;
-  }
-
-  const selectResult = runOpenshell(["gateway", "select", GATEWAY_NAME], {
-    ignoreError: true,
-    suppressOutput: true,
+const { getGatewayReuseSnapshot, selectNamedGatewayForReuseIfNeeded } =
+  gatewayReuse.createGatewayReuseHelpers({
+    gatewayName: GATEWAY_NAME,
+    runCaptureOpenshell,
+    runOpenshell,
+    cliDisplayName,
   });
-  if (selectResult.status !== 0) {
-    return snapshot;
-  }
-
-  const refreshed = getGatewayReuseSnapshot();
-  if (refreshed.gatewayReuseState === "healthy") {
-    process.env.OPENSHELL_GATEWAY = GATEWAY_NAME;
-    console.log(`  ✓ Selected existing ${cliDisplayName()} gateway`);
-  }
-  return refreshed;
-}
 
 function getSandboxReuseState(sandboxName: string | null) {
   if (!sandboxName) return "missing";
diff --git a/src/lib/onboard/gateway-reuse.ts b/src/lib/onboard/gateway-reuse.ts
new file mode 100644
index 0000000000..0406e66300
--- /dev/null
+++ b/src/lib/onboard/gateway-reuse.ts
@@ -0,0 +1,71 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import {
+  getGatewayReuseState,
+  shouldSelectNamedGatewayForReuse,
+} from "../state/gateway";
+
+export type GatewayReuseSnapshot = {
+  gatewayStatus: string;
+  gwInfo: string;
+  activeGatewayInfo: string;
+  gatewayReuseState: ReturnType<typeof getGatewayReuseState>;
+};
+
+export interface GatewayReuseDeps {
+  gatewayName: string;
+  runCaptureOpenshell(args: string[], opts?: Record<string, unknown>): string;
+  runOpenshell(args: string[], opts?: Record<string, unknown>): { status: number | null };
+  cliDisplayName(): string;
+}
+
+export interface GatewayReuseHelpers {
+  getGatewayReuseSnapshot(): GatewayReuseSnapshot;
+  selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): GatewayReuseSnapshot;
+}
+
+export function createGatewayReuseHelpers(deps: GatewayReuseDeps): GatewayReuseHelpers {
+  function getGatewayReuseSnapshot(): GatewayReuseSnapshot {
+    const gatewayStatus = deps.runCaptureOpenshell(["status"], { ignoreError: true });
+    const gwInfo = deps.runCaptureOpenshell(["gateway", "info", "-g", deps.gatewayName], {
+      ignoreError: true,
+    });
+    const activeGatewayInfo = deps.runCaptureOpenshell(["gateway", "info"], { ignoreError: true });
+    return {
+      gatewayStatus,
+      gwInfo,
+      activeGatewayInfo,
+      gatewayReuseState: getGatewayReuseState(gatewayStatus, gwInfo, activeGatewayInfo),
+    };
+  }
+
+  function selectNamedGatewayForReuseIfNeeded(snapshot: GatewayReuseSnapshot): GatewayReuseSnapshot {
+    if (
+      !shouldSelectNamedGatewayForReuse(
+        snapshot.gatewayStatus,
+        snapshot.gwInfo,
+        snapshot.activeGatewayInfo,
+      )
+    ) {
+      return snapshot;
+    }
+
+    const selectResult = deps.runOpenshell(["gateway", "select", deps.gatewayName], {
+      ignoreError: true,
+      suppressOutput: true,
+    });
+    if (selectResult.status !== 0) {
+      return snapshot;
+    }
+
+    const refreshed = getGatewayReuseSnapshot();
+    if (refreshed.gatewayReuseState === "healthy") {
+      process.env.OPENSHELL_GATEWAY = deps.gatewayName;
+      console.log(`  ✓ Selected existing ${deps.cliDisplayName()} gateway`);
+    }
+    return refreshed;
+  }
+
+  return { getGatewayReuseSnapshot, selectNamedGatewayForReuseIfNeeded };
+}

From 9ba83f770a5e80d4bb0249d26d8b4c4919c228d7 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:42:51 -0700
Subject: [PATCH 24/54] Potential fix for pull request finding 'CodeQL / Unused
 variable, import, function or class'

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
---
 src/lib/onboard.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 20002151b8..e6c3ec6833 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -7727,7 +7727,6 @@ const {
   ensureAgentDashboardForward,
   fetchGatewayAuthTokenFromSandbox,
   getDashboardForwardPort,
-  getDashboardForwardTarget,
   getWslHostAddress,
   printDashboard,
   stopAllDashboardForwards,

From 0201b4dd0582fec4368f4a9c6982ff18729ef219 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:43:43 -0700
Subject: [PATCH 25/54] Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
---
 src/lib/onboard.ts | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index e6c3ec6833..c2885c5209 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -289,7 +289,6 @@ const {
   getBlueprintMaxOpenshellVersion,
   getBlueprintMinOpenshellVersion,
   getInstalledOpenshellVersion,
-  getOpenshellChannel,
   isOpenshellDevVersion,
   shouldAllowOpenshellAboveBlueprintMax,
   shouldUseOpenshellDevChannel,
@@ -309,7 +308,6 @@ const {
   formatSandboxAgentName,
   getAgentInferenceProviderOptions,
   getDefaultSandboxNameForAgent,
-  getEffectiveSandboxAgent,
   getRequestedSandboxAgentName,
   getSandboxAgentDrift,
   getSandboxAgentRegistryFields,

From 8d74472e1a01b83734dabf59e26eec7249ba8117 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:56:15 -0700
Subject: [PATCH 26/54] refactor(cli): extract sandbox reuse helpers

---
 src/lib/onboard.ts               | 21 +++++++------------
 src/lib/onboard/sandbox-reuse.ts | 36 ++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 14 deletions(-)
 create mode 100644 src/lib/onboard/sandbox-reuse.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index c2885c5209..9bf341d2d8 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -303,6 +303,7 @@ const {
   persistMessagingChannelConfigToSession,
 } = messagingConfig;
 const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent");
+const sandboxReuse: typeof import("./onboard/sandbox-reuse") = require("./onboard/sandbox-reuse");
 const {
   RESERVED_SANDBOX_NAMES,
   formatSandboxAgentName,
@@ -591,20 +592,12 @@ const { getGatewayReuseSnapshot, selectNamedGatewayForReuseIfNeeded } =
     cliDisplayName,
   });
 
-function getSandboxReuseState(sandboxName: string | null) {
-  if (!sandboxName) return "missing";
-  const getOutput = runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true });
-  const listOutput = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true });
-  return getSandboxStateFromOutputs(sandboxName, getOutput, listOutput);
-}
-
-function repairRecordedSandbox(sandboxName: string | null): void {
-  if (!sandboxName) return;
-  note(`  [resume] Cleaning up recorded sandbox '${sandboxName}' before recreating it.`);
-  runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true });
-  runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
-  registry.removeSandbox(sandboxName);
-}
+const { getSandboxReuseState, repairRecordedSandbox } = sandboxReuse.createSandboxReuseHelpers({
+  runCaptureOpenshell,
+  runOpenshell,
+  getSandboxStateFromOutputs,
+  note,
+});
 
 const { streamSandboxCreate } = sandboxCreateStream;
 
diff --git a/src/lib/onboard/sandbox-reuse.ts b/src/lib/onboard/sandbox-reuse.ts
new file mode 100644
index 0000000000..ac3d30b4b1
--- /dev/null
+++ b/src/lib/onboard/sandbox-reuse.ts
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { DASHBOARD_PORT } from "../core/ports";
+import * as registry from "../state/registry";
+
+export interface SandboxReuseDeps {
+  runCaptureOpenshell(args: string[], opts?: Record<string, unknown>): string;
+  runOpenshell(args: string[], opts?: Record<string, unknown>): unknown;
+  getSandboxStateFromOutputs(sandboxName: string, getOutput: string, listOutput: string): string;
+  note(message: string): void;
+}
+
+export interface SandboxReuseHelpers {
+  getSandboxReuseState(sandboxName: string | null): string;
+  repairRecordedSandbox(sandboxName: string | null): void;
+}
+
+export function createSandboxReuseHelpers(deps: SandboxReuseDeps): SandboxReuseHelpers {
+  function getSandboxReuseState(sandboxName: string | null): string {
+    if (!sandboxName) return "missing";
+    const getOutput = deps.runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true });
+    const listOutput = deps.runCaptureOpenshell(["sandbox", "list"], { ignoreError: true });
+    return deps.getSandboxStateFromOutputs(sandboxName, getOutput, listOutput);
+  }
+
+  function repairRecordedSandbox(sandboxName: string | null): void {
+    if (!sandboxName) return;
+    deps.note(`  [resume] Cleaning up recorded sandbox '${sandboxName}' before recreating it.`);
+    deps.runOpenshell(["forward", "stop", String(DASHBOARD_PORT)], { ignoreError: true });
+    deps.runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true });
+    registry.removeSandbox(sandboxName);
+  }
+
+  return { getSandboxReuseState, repairRecordedSandbox };
+}

From c47e5b8e2a24d0f4f7f90d4a2ce843eff1b367ba Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 11:59:28 -0700
Subject: [PATCH 27/54] refactor(cli): extract messaging credential helpers

---
 src/lib/onboard.ts                       | 69 +++++++--------------
 src/lib/onboard/messaging-credentials.ts | 78 ++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 48 deletions(-)
 create mode 100644 src/lib/onboard/messaging-credentials.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 9bf341d2d8..7ae0d9f303 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -297,6 +297,11 @@ const {
 const { toSessionUpdates }: typeof import("./onboard/session-updates") = require("./onboard/session-updates");
 const gatewayReuse: typeof import("./onboard/gateway-reuse") = require("./onboard/gateway-reuse");
 const messagingConfig: typeof import("./onboard/messaging-config") = require("./onboard/messaging-config");
+const {
+  detectMessagingCredentialRotation,
+  getMessagingChannelForEnvKey,
+  getRecordedMessagingChannelsForResume: getRecordedMessagingChannelsForResumeFromState,
+}: typeof import("./onboard/messaging-credentials") = require("./onboard/messaging-credentials");
 const {
   getStoredMessagingChannelConfig,
   messagingChannelConfigsEqual,
@@ -1239,54 +1244,6 @@ function providerExistsInGateway(name: string) {
   return onboardProviders.providerExistsInGateway(name, runOpenshell);
 }
 
-function getMessagingChannelForEnvKey(envKey: string): string | null {
-  if (envKey === "DISCORD_BOT_TOKEN") return "discord";
-  if (envKey === "SLACK_BOT_TOKEN") return "slack";
-  if (envKey === "TELEGRAM_BOT_TOKEN") return "telegram";
-  if (envKey === "WECHAT_BOT_TOKEN") return "wechat";
-  return null;
-}
-
-
-function getRecordedMessagingChannelsForResume(
-  resume: boolean,
-  session: Session | null, sandboxName: string | null,
-): string[] | null {
-  return require("./onboard/messaging-reuse").getNonInteractiveStoredMessagingChannels(
-    resume, session?.messagingChannels, sandboxName, MESSAGING_CHANNELS, (envKey: string) => Boolean(normalizeCredentialValue(process.env[envKey]) || getCredential(envKey)),
-    registry.getSandbox.bind(registry), registry.getDisabledChannels.bind(registry), providerExistsInGateway, isNonInteractive());
-}
-
-/**
- * Detect whether any messaging provider credential has been rotated since
- * the sandbox was created, by comparing SHA-256 hashes of the current
- * token values against hashes stored in the sandbox registry.
- *
- * Returns `changed: false` for legacy sandboxes that have no stored hashes
- * (conservative — avoids unnecessary rebuilds after upgrade).
- *
- * @param {string} sandboxName - Name of the sandbox to check.
- * @param {Array<{name: string, envKey: string, token: string|null}>} tokenDefs
- * @returns {{ changed: boolean, changedProviders: string[] }}
- */
-function detectMessagingCredentialRotation(
-  sandboxName: string,
-  tokenDefs: MessagingTokenDef[],
-): { changed: boolean; changedProviders: string[] } {
-  const sb = registry.getSandbox(sandboxName);
-  const storedHashes = sb?.providerCredentialHashes || {};
-  const changedProviders = [];
-  for (const { name, envKey, token } of tokenDefs) {
-    if (!token) continue;
-    const storedHash = storedHashes[envKey];
-    if (!storedHash) continue;
-    if (storedHash !== hashCredential(token)) {
-      changedProviders.push(name);
-    }
-  }
-  return { changed: changedProviders.length > 0, changedProviders };
-}
-
 // Tri-state probe factory for messaging-conflict backfill. An upfront liveness
 // check is necessary because `openshell provider get` exits non-zero for both
 // "provider not attached" and "gateway unreachable"; without the liveness
@@ -6934,6 +6891,22 @@ async function setupInference(
 
 const MESSAGING_CHANNELS = listChannels();
 
+function getRecordedMessagingChannelsForResume(
+  resume: boolean,
+  session: Session | null,
+  sandboxName: string | null,
+): string[] | null {
+  return getRecordedMessagingChannelsForResumeFromState({
+    resume,
+    sessionMessagingChannels: session?.messagingChannels,
+    sandboxName,
+    channels: MESSAGING_CHANNELS,
+    getCredential,
+    providerExistsInGateway,
+    isNonInteractive,
+  });
+}
+
 // Curl exit codes that indicate a network-level failure (not a token problem).
 // 35 (TLS handshake failure) covers corporate proxies that MITM HTTPS.
 const TELEGRAM_NETWORK_CURL_CODES = new Set([6, 7, 28, 35, 52, 56]);
diff --git a/src/lib/onboard/messaging-credentials.ts b/src/lib/onboard/messaging-credentials.ts
new file mode 100644
index 0000000000..fff0e7107b
--- /dev/null
+++ b/src/lib/onboard/messaging-credentials.ts
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { normalizeCredentialValue } from "../credentials/store";
+import { hashCredential } from "../security/credential-hash";
+import * as registry from "../state/registry";
+
+export interface MessagingTokenDefinition {
+  name: string;
+  envKey: string;
+  token?: string | null;
+}
+
+export interface RecordedMessagingChannelsOptions {
+  resume: boolean;
+  sessionMessagingChannels?: string[] | null;
+  sandboxName: string | null;
+  channels: unknown[];
+  getCredential(envKey: string): string | null | undefined;
+  providerExistsInGateway(name: string): boolean;
+  isNonInteractive(): boolean;
+}
+
+export function getRecordedMessagingChannelsForResume({
+  resume,
+  sessionMessagingChannels,
+  sandboxName,
+  channels,
+  getCredential,
+  providerExistsInGateway,
+  isNonInteractive,
+}: RecordedMessagingChannelsOptions): string[] | null {
+  return require("./messaging-reuse").getNonInteractiveStoredMessagingChannels(
+    resume,
+    sessionMessagingChannels,
+    sandboxName,
+    channels,
+    (envKey: string) => Boolean(normalizeCredentialValue(process.env[envKey]) || getCredential(envKey)),
+    registry.getSandbox.bind(registry),
+    registry.getDisabledChannels.bind(registry),
+    providerExistsInGateway,
+    isNonInteractive(),
+  );
+}
+
+export function getMessagingChannelForEnvKey(envKey: string): string | null {
+  if (envKey === "DISCORD_BOT_TOKEN") return "discord";
+  if (envKey === "SLACK_BOT_TOKEN") return "slack";
+  if (envKey === "TELEGRAM_BOT_TOKEN") return "telegram";
+  if (envKey === "WECHAT_BOT_TOKEN") return "wechat";
+  return null;
+}
+
+/**
+ * Detect whether any messaging provider credential has been rotated since
+ * the sandbox was created, by comparing SHA-256 hashes of the current
+ * token values against hashes stored in the sandbox registry.
+ *
+ * Returns `changed: false` for legacy sandboxes that have no stored hashes
+ * (conservative — avoids unnecessary rebuilds after upgrade).
+ */
+export function detectMessagingCredentialRotation(
+  sandboxName: string,
+  tokenDefs: MessagingTokenDefinition[],
+): { changed: boolean; changedProviders: string[] } {
+  const sb = registry.getSandbox(sandboxName);
+  const storedHashes = sb?.providerCredentialHashes || {};
+  const changedProviders = [];
+  for (const { name, envKey, token } of tokenDefs) {
+    if (!token) continue;
+    const storedHash = storedHashes[envKey];
+    if (!storedHash) continue;
+    if (storedHash !== hashCredential(token)) {
+      changedProviders.push(name);
+    }
+  }
+  return { changed: changedProviders.length > 0, changedProviders };
+}

From 66689027a2f39701eb662c8c5e14887a519d8618 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 12:02:09 -0700
Subject: [PATCH 28/54] refactor(cli): extract sandbox registry metadata
 helpers

---
 src/lib/onboard.ts                           |  61 ++---------
 src/lib/onboard/sandbox-registry-metadata.ts | 101 +++++++++++++++++++
 2 files changed, 108 insertions(+), 54 deletions(-)
 create mode 100644 src/lib/onboard/sandbox-registry-metadata.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 7ae0d9f303..0e0625e1f4 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -308,6 +308,7 @@ const {
   persistMessagingChannelConfigToSession,
 } = messagingConfig;
 const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent");
+const sandboxRegistryMetadata: typeof import("./onboard/sandbox-registry-metadata") = require("./onboard/sandbox-registry-metadata");
 const sandboxReuse: typeof import("./onboard/sandbox-reuse") = require("./onboard/sandbox-reuse");
 const {
   RESERVED_SANDBOX_NAMES,
@@ -3677,61 +3678,13 @@ async function recoverGatewayRuntime() {
 
 // ── Step 3: Sandbox ──────────────────────────────────────────────
 
-function getSandboxRuntimeRegistryFields(
-  config: SandboxGpuConfig,
-): Pick<
-  SandboxEntry,
-  | "gpuEnabled"
-  | "hostGpuDetected"
-  | "sandboxGpuEnabled"
-  | "sandboxGpuMode"
-  | "sandboxGpuDevice"
-  | "openshellDriver"
-  | "openshellVersion"
-> {
-  return {
-    gpuEnabled: config.sandboxGpuEnabled,
-    hostGpuDetected: config.hostGpuDetected,
-    sandboxGpuEnabled: config.sandboxGpuEnabled,
-    sandboxGpuMode: config.mode,
-    sandboxGpuDevice: config.sandboxGpuDevice,
-    openshellDriver: isLinuxDockerDriverGatewayEnabled() ? (process.platform === "darwin" ? "vm" : "docker") : "kubernetes",
-    openshellVersion: getInstalledOpenshellVersion(
-      runCaptureOpenshell(["--version"], { ignoreError: true }),
-    ),
-  };
-}
-
-function hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean {
-  const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName);
-  if (!existingEntry) return false;
-  return (
-    (existingEntry.sandboxGpuEnabled === true) !== config.sandboxGpuEnabled ||
-    (existingEntry.sandboxGpuMode || "auto") !== config.mode ||
-    (existingEntry.sandboxGpuDevice || null) !== config.sandboxGpuDevice
-  );
-}
-
-function updateReusedSandboxMetadata(
-  sandboxName: string,
-  agent: AgentDefinition | null | undefined,
-  model: string,
-  provider: string,
-  dashboardPort: number,
-  selectionVerified = true,
-  sandboxGpuConfig: SandboxGpuConfig | null = null,
-): void {
-  const existingEntry = registry.getSandbox(sandboxName);
-  const agentVersionKnown = existingEntry?.agentVersion !== null;
-  const selectionUpdates = selectionVerified ? { model, provider } : {};
-  registry.updateSandbox(sandboxName, {
-    ...selectionUpdates,
-    dashboardPort,
-    ...getSandboxAgentRegistryFields(agent, agentVersionKnown),
-    ...(sandboxGpuConfig ? getSandboxRuntimeRegistryFields(sandboxGpuConfig) : {}),
+const { getSandboxRuntimeRegistryFields, hasSandboxGpuDrift, updateReusedSandboxMetadata } =
+  sandboxRegistryMetadata.createSandboxRegistryMetadataHelpers({
+    isLinuxDockerDriverGatewayEnabled,
+    getInstalledOpenshellVersion,
+    runCaptureOpenshell,
   });
-  registry.setDefault(sandboxName);
-}
+
 
 async function promptValidatedSandboxName(agent: AgentDefinition | null = null) {
   const MAX_ATTEMPTS = 3;
diff --git a/src/lib/onboard/sandbox-registry-metadata.ts b/src/lib/onboard/sandbox-registry-metadata.ts
new file mode 100644
index 0000000000..bbd84db74e
--- /dev/null
+++ b/src/lib/onboard/sandbox-registry-metadata.ts
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { AgentDefinition } from "../agent/defs";
+import type { SandboxEntry } from "../state/registry";
+import * as registry from "../state/registry";
+import { getSandboxAgentRegistryFields } from "./sandbox-agent";
+import type { SandboxGpuConfig } from "./sandbox-gpu-mode";
+
+export interface SandboxRegistryMetadataDeps {
+  isLinuxDockerDriverGatewayEnabled(): boolean;
+  getInstalledOpenshellVersion(versionOutput?: string | null): string | null;
+  runCaptureOpenshell(args: string[], opts?: Record<string, unknown>): string | null;
+}
+
+export interface SandboxRegistryMetadataHelpers {
+  getSandboxRuntimeRegistryFields(config: SandboxGpuConfig): Pick<
+    SandboxEntry,
+    | "gpuEnabled"
+    | "hostGpuDetected"
+    | "sandboxGpuEnabled"
+    | "sandboxGpuMode"
+    | "sandboxGpuDevice"
+    | "openshellDriver"
+    | "openshellVersion"
+  >;
+  hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean;
+  updateReusedSandboxMetadata(
+    sandboxName: string,
+    agent: AgentDefinition | null | undefined,
+    model: string,
+    provider: string,
+    dashboardPort: number,
+    selectionVerified?: boolean,
+    sandboxGpuConfig?: SandboxGpuConfig | null,
+  ): void;
+}
+
+export function createSandboxRegistryMetadataHelpers(
+  deps: SandboxRegistryMetadataDeps,
+): SandboxRegistryMetadataHelpers {
+  function getSandboxRuntimeRegistryFields(config: SandboxGpuConfig): Pick<
+    SandboxEntry,
+    | "gpuEnabled"
+    | "hostGpuDetected"
+    | "sandboxGpuEnabled"
+    | "sandboxGpuMode"
+    | "sandboxGpuDevice"
+    | "openshellDriver"
+    | "openshellVersion"
+  > {
+    return {
+      gpuEnabled: config.sandboxGpuEnabled,
+      hostGpuDetected: config.hostGpuDetected,
+      sandboxGpuEnabled: config.sandboxGpuEnabled,
+      sandboxGpuMode: config.mode,
+      sandboxGpuDevice: config.sandboxGpuDevice,
+      openshellDriver: deps.isLinuxDockerDriverGatewayEnabled()
+        ? process.platform === "darwin"
+          ? "vm"
+          : "docker"
+        : "kubernetes",
+      openshellVersion: deps.getInstalledOpenshellVersion(
+        deps.runCaptureOpenshell(["--version"], { ignoreError: true }),
+      ),
+    };
+  }
+
+  function hasSandboxGpuDrift(sandboxName: string, config: SandboxGpuConfig): boolean {
+    const existingEntry: SandboxEntry | null = registry.getSandbox(sandboxName);
+    if (!existingEntry) return false;
+    return (
+      (existingEntry.sandboxGpuEnabled === true) !== config.sandboxGpuEnabled ||
+      (existingEntry.sandboxGpuMode || "auto") !== config.mode ||
+      (existingEntry.sandboxGpuDevice || null) !== config.sandboxGpuDevice
+    );
+  }
+
+  function updateReusedSandboxMetadata(
+    sandboxName: string,
+    agent: AgentDefinition | null | undefined,
+    model: string,
+    provider: string,
+    dashboardPort: number,
+    selectionVerified = true,
+    sandboxGpuConfig: SandboxGpuConfig | null = null,
+  ): void {
+    const existingEntry = registry.getSandbox(sandboxName);
+    const agentVersionKnown = existingEntry?.agentVersion !== null;
+    const selectionUpdates = selectionVerified ? { model, provider } : {};
+    registry.updateSandbox(sandboxName, {
+      ...selectionUpdates,
+      dashboardPort,
+      ...getSandboxAgentRegistryFields(agent, agentVersionKnown),
+      ...(sandboxGpuConfig ? getSandboxRuntimeRegistryFields(sandboxGpuConfig) : {}),
+    });
+    registry.setDefault(sandboxName);
+  }
+
+  return { getSandboxRuntimeRegistryFields, hasSandboxGpuDrift, updateReusedSandboxMetadata };
+}

From a485eec0412b04a8a7c5c705de32bebfeff99a7f Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 12:04:44 -0700
Subject: [PATCH 29/54] refactor(cli): extract openclaw setup helper

---
 src/lib/onboard.ts                | 35 ++++++++---------------
 src/lib/onboard/openclaw-setup.ts | 46 +++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 24 deletions(-)
 create mode 100644 src/lib/onboard/openclaw-setup.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 0e0625e1f4..c2f1679889 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -284,6 +284,7 @@ const {
   getResumeSandboxConflict,
 } = resumeConfig;
 const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts");
+const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
 const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version");
 const {
   getBlueprintMaxOpenshellVersion,
@@ -7081,30 +7082,16 @@ async function setupMessagingChannels(
 
 // ── Step 7: OpenClaw ─────────────────────────────────────────────
 
-async function setupOpenclaw(sandboxName: string, model: string, provider: string): Promise<void> {
-  step(7, 8, `Setting up ${agentProductName()} inside sandbox`);
-
-  const selectionConfig = getProviderSelectionConfig(provider, model);
-  if (selectionConfig) {
-    const sandboxConfig = {
-      ...selectionConfig,
-      onboardedAt: new Date().toISOString(),
-    };
-    const script = buildSandboxConfigSyncScript(sandboxConfig);
-    const scriptFile = writeSandboxConfigSyncFile(script);
-    try {
-      const scriptContent = fs.readFileSync(scriptFile, "utf-8");
-      run(openshellArgv(["sandbox", "connect", sandboxName]), {
-        stdio: ["pipe", "ignore", "inherit"],
-        input: scriptContent,
-      });
-    } finally {
-      cleanupTempDir(scriptFile, "nemoclaw-sync");
-    }
-  }
-
-  console.log(`  ✓ ${agentProductName()} gateway launched inside sandbox`);
-}
+const setupOpenclaw = createOpenclawSetup({
+  step,
+  agentProductName,
+  getProviderSelectionConfig,
+  buildSandboxConfigSyncScript,
+  writeSandboxConfigSyncFile,
+  run,
+  openshellArgv,
+  cleanupTempDir,
+});
 
 // ── Step 7: Policy presets ───────────────────────────────────────
 
diff --git a/src/lib/onboard/openclaw-setup.ts b/src/lib/onboard/openclaw-setup.ts
new file mode 100644
index 0000000000..de8fd11e3f
--- /dev/null
+++ b/src/lib/onboard/openclaw-setup.ts
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import fs from "node:fs";
+
+export interface OpenclawSetupDeps {
+  step(n: number, total: number, msg: string): void;
+  agentProductName(): string;
+  getProviderSelectionConfig(provider: string, model: string): unknown | null;
+  buildSandboxConfigSyncScript(config: any): string;
+  writeSandboxConfigSyncFile(script: string): string;
+  run(argv: string[], options: Record<string, unknown>): unknown;
+  openshellArgv(args: string[]): string[];
+  cleanupTempDir(file: string, prefix: string): void;
+}
+
+export function createOpenclawSetup(deps: OpenclawSetupDeps) {
+  return async function setupOpenclaw(
+    sandboxName: string,
+    model: string,
+    provider: string,
+  ): Promise<void> {
+    deps.step(7, 8, `Setting up ${deps.agentProductName()} inside sandbox`);
+
+    const selectionConfig = deps.getProviderSelectionConfig(provider, model);
+    if (selectionConfig) {
+      const sandboxConfig = {
+        ...(selectionConfig as Record<string, unknown>),
+        onboardedAt: new Date().toISOString(),
+      };
+      const script = deps.buildSandboxConfigSyncScript(sandboxConfig);
+      const scriptFile = deps.writeSandboxConfigSyncFile(script);
+      try {
+        const scriptContent = fs.readFileSync(scriptFile, "utf-8");
+        deps.run(deps.openshellArgv(["sandbox", "connect", sandboxName]), {
+          stdio: ["pipe", "ignore", "inherit"],
+          input: scriptContent,
+        });
+      } finally {
+        deps.cleanupTempDir(scriptFile, "nemoclaw-sync");
+      }
+    }
+
+    console.log(`  ✓ ${deps.agentProductName()} gateway launched inside sandbox`);
+  };
+}

From 46039e12489be8380139c4bc3b72f7eabd0f065e Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 13:00:06 -0700
Subject: [PATCH 30/54] refactor(cli): extract sandbox name prompt

---
 src/lib/onboard.ts               | 57 ++++-------------------------
 src/lib/onboard/sandbox-agent.ts | 61 ++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 51 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index c2f1679889..6dc0ba1b73 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -322,6 +322,12 @@ const {
   getSandboxPromptDefault,
   normalizeSandboxAgentName,
 } = sandboxAgent;
+const promptValidatedSandboxName = sandboxAgent.createPromptValidatedSandboxName({
+  promptOrDefault,
+  cliDisplayName,
+  isNonInteractive,
+  exit: process.exit,
+});
 const modelRouter: typeof import("./onboard/model-router") = require("./onboard/model-router");
 const {
   DEFAULT_MODEL_ROUTER_CREDENTIAL_ENV,
@@ -3687,57 +3693,6 @@ const { getSandboxRuntimeRegistryFields, hasSandboxGpuDrift, updateReusedSandbox
   });
 
 
-async function promptValidatedSandboxName(agent: AgentDefinition | null = null) {
-  const MAX_ATTEMPTS = 3;
-  const defaultSandboxName = getSandboxPromptDefault(agent);
-  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
-    const nameAnswer = await promptOrDefault(
-      `  Sandbox name (${NAME_ALLOWED_FORMAT}) [${defaultSandboxName}]: `,
-      "NEMOCLAW_SANDBOX_NAME",
-      defaultSandboxName,
-    );
-    const sandboxName = (nameAnswer || defaultSandboxName).trim();
-
-    try {
-      const validatedSandboxName = validateName(sandboxName, "sandbox name");
-      if (RESERVED_SANDBOX_NAMES.has(sandboxName)) {
-        console.error(`  Reserved name: '${sandboxName}' is a ${cliDisplayName()} CLI command.`);
-        console.error("  Choose a different name to avoid routing conflicts.");
-        if (isNonInteractive()) {
-          process.exit(1);
-        }
-        if (attempt < MAX_ATTEMPTS - 1) {
-          console.error("  Please try again.\n");
-        }
-        continue;
-      }
-      return validatedSandboxName;
-    } catch (error) {
-      const errorMessage = error instanceof Error ? error.message : String(error);
-      console.error(`  ${errorMessage}`);
-    }
-
-    for (const line of getNameValidationGuidance("sandbox name", sandboxName, {
-      includeAllowedFormat: false,
-    })) {
-      console.error(`  ${line}`);
-    }
-
-    // Non-interactive runs cannot re-prompt — abort so the caller can fix the
-    // NEMOCLAW_SANDBOX_NAME env var and retry.
-    if (isNonInteractive()) {
-      process.exit(1);
-    }
-
-    if (attempt < MAX_ATTEMPTS - 1) {
-      console.error("  Please try again.\n");
-    }
-  }
-
-  console.error("  Too many invalid attempts.");
-  process.exit(1);
-}
-
 // ── Step 5: Sandbox ──────────────────────────────────────────────
 
 async function createSandbox(
diff --git a/src/lib/onboard/sandbox-agent.ts b/src/lib/onboard/sandbox-agent.ts
index c17b9de0b2..f527333abf 100644
--- a/src/lib/onboard/sandbox-agent.ts
+++ b/src/lib/onboard/sandbox-agent.ts
@@ -3,6 +3,7 @@
 
 import type { AgentDefinition } from "../agent/defs";
 import { loadAgent } from "../agent/defs";
+import { getNameValidationGuidance, NAME_ALLOWED_FORMAT } from "../name-validation";
 import { validateName } from "../runner";
 import type { SandboxEntry } from "../state/registry";
 import * as registry from "../state/registry";
@@ -105,3 +106,63 @@ export function getSandboxAgentDrift(
     requestedAgentName,
   };
 }
+
+export interface PromptSandboxNameDeps {
+  promptOrDefault(question: string, envVar: string, defaultValue: string): Promise<string>;
+  cliDisplayName(): string;
+  isNonInteractive(): boolean;
+  exit(code: number): never;
+}
+
+export function createPromptValidatedSandboxName(deps: PromptSandboxNameDeps) {
+  return async function promptValidatedSandboxName(agent: AgentDefinition | null = null) {
+    const MAX_ATTEMPTS = 3;
+    const defaultSandboxName = getSandboxPromptDefault(agent);
+    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
+      const nameAnswer = await deps.promptOrDefault(
+        `  Sandbox name (${NAME_ALLOWED_FORMAT}) [${defaultSandboxName}]: `,
+        "NEMOCLAW_SANDBOX_NAME",
+        defaultSandboxName,
+      );
+      const sandboxName = (nameAnswer || defaultSandboxName).trim();
+
+      try {
+        const validatedSandboxName = validateName(sandboxName, "sandbox name");
+        if (RESERVED_SANDBOX_NAMES.has(sandboxName)) {
+          console.error(`  Reserved name: '${sandboxName}' is a ${deps.cliDisplayName()} CLI command.`);
+          console.error("  Choose a different name to avoid routing conflicts.");
+          if (deps.isNonInteractive()) {
+            deps.exit(1);
+          }
+          if (attempt < MAX_ATTEMPTS - 1) {
+            console.error("  Please try again.\n");
+          }
+          continue;
+        }
+        return validatedSandboxName;
+      } catch (error) {
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        console.error(`  ${errorMessage}`);
+      }
+
+      for (const line of getNameValidationGuidance("sandbox name", sandboxName, {
+        includeAllowedFormat: false,
+      })) {
+        console.error(`  ${line}`);
+      }
+
+      // Non-interactive runs cannot re-prompt — abort so the caller can fix the
+      // NEMOCLAW_SANDBOX_NAME env var and retry.
+      if (deps.isNonInteractive()) {
+        deps.exit(1);
+      }
+
+      if (attempt < MAX_ATTEMPTS - 1) {
+        console.error("  Please try again.\n");
+      }
+    }
+
+    console.error("  Too many invalid attempts.");
+    deps.exit(1);
+  };
+}

From 3f6e041dfd3789ba277483a82afe3ea8818284bc Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 13:02:41 -0700
Subject: [PATCH 31/54] refactor(cli): move telegram mention helper

---
 src/lib/onboard.ts                  | 13 +------------
 src/lib/onboard/messaging-config.ts | 12 ++++++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 6dc0ba1b73..a3aa436e08 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -304,6 +304,7 @@ const {
   getRecordedMessagingChannelsForResume: getRecordedMessagingChannelsForResumeFromState,
 }: typeof import("./onboard/messaging-credentials") = require("./onboard/messaging-credentials");
 const {
+  computeTelegramRequireMention,
   getStoredMessagingChannelConfig,
   messagingChannelConfigsEqual,
   persistMessagingChannelConfigToSession,
@@ -512,18 +513,6 @@ let AUTO_YES = false;
 // null means "use auto-allocation" (skip dashboard port check in preflight).
 let _preflightDashboardPort: number | null = null;
 
-// Read TELEGRAM_REQUIRE_MENTION (set either by the interactive mention prompt
-// or by the user's shell) and map it to a boolean, or null when the env var
-// is unset / invalid. Used at build time to bake groupPolicy into
-// openclaw.json and at resume time to detect drift against the recorded
-// session state. See #1737 and the CodeRabbit follow-up on #2417.
-function computeTelegramRequireMention(): boolean | null {
-  const raw = process.env.TELEGRAM_REQUIRE_MENTION;
-  if (raw === "1") return true;
-  if (raw === "0") return false;
-  return null;
-}
-
 function isNonInteractive(): boolean {
   return NON_INTERACTIVE || process.env.NEMOCLAW_NON_INTERACTIVE === "1";
 }
diff --git a/src/lib/onboard/messaging-config.ts b/src/lib/onboard/messaging-config.ts
index eefea7e901..2ac8fa7eae 100644
--- a/src/lib/onboard/messaging-config.ts
+++ b/src/lib/onboard/messaging-config.ts
@@ -10,6 +10,18 @@ import type { Session } from "../state/onboard-session";
 import * as onboardSession from "../state/onboard-session";
 import * as registry from "../state/registry";
 
+// Read TELEGRAM_REQUIRE_MENTION (set either by the interactive mention prompt
+// or by the user's shell) and map it to a boolean, or null when the env var
+// is unset / invalid. Used at build time to bake groupPolicy into
+// openclaw.json and at resume time to detect drift against the recorded
+// session state. See #1737 and the CodeRabbit follow-up on #2417.
+export function computeTelegramRequireMention(): boolean | null {
+  const raw = process.env.TELEGRAM_REQUIRE_MENTION;
+  if (raw === "1") return true;
+  if (raw === "0") return false;
+  return null;
+}
+
 export function getStoredMessagingChannelConfig(
   sandboxName: string | null,
   session: Session | null,

From 534f0d842b38fc91890f4acbc9378e27227f4bc1 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 13:05:00 -0700
Subject: [PATCH 32/54] refactor(cli): extract onboard base image helpers

---
 src/lib/onboard.ts            | 39 ++++-------------------------------
 src/lib/onboard/base-image.ts | 37 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 35 deletions(-)
 create mode 100644 src/lib/onboard/base-image.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index a3aa436e08..0bf5ce14ac 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -119,10 +119,11 @@ const sandboxBaseImage: typeof import("./sandbox-base-image") = require("./sandb
 const {
   OPENCLAW_SANDBOX_BASE_IMAGE: SANDBOX_BASE_IMAGE,
   SANDBOX_BASE_TAG,
-  defaultOpenclawBaseDockerfile,
-  buildLocalBaseTag,
-  resolveSandboxBaseImage,
 } = sandboxBaseImage;
+const {
+  getStableGatewayImageRef,
+  pullAndResolveBaseImageDigest,
+}: typeof import("./onboard/base-image") = require("./onboard/base-image");
 const errnoUtils: typeof import("./core/errno") = require("./core/errno");
 const { isErrnoException } = errnoUtils;
 
@@ -661,38 +662,6 @@ function validateSandboxGpuPreflight(config: SandboxGpuConfig): void {
   console.log(`  ✓ Docker CDI GPU support detected (${cdiSpecFiles.join(", ")})`);
 }
 
-// ── Base image resolution ───────────────────────────────────────
-// Pulls candidate sandbox-base images from GHCR and inspects them to get the
-// actual repo digest when available. This avoids the registry mismatch that
-// broke e2e tests in #1937 while still allowing PR branches to use a source-SHA
-// base image or local build before latest has been rebuilt. See #1904.
-
-/**
- * Resolve a compatible sandbox-base image and pin it to a repo digest when
- * possible. PR-branch validation first tries a source-SHA tag, then latest,
- * and finally a local Dockerfile.base build when the OpenShell Docker driver
- * requires a newer glibc than the published image provides.
- */
-function pullAndResolveBaseImageDigest(
-  options: { requireOpenshellSandboxAbi?: boolean } = {},
-): { digest: string | null; ref: string; source?: string; glibcVersion?: string | null } | null {
-  return resolveSandboxBaseImage({
-    imageName: SANDBOX_BASE_IMAGE,
-    dockerfilePath: defaultOpenclawBaseDockerfile(ROOT),
-    localTag: buildLocalBaseTag("nemoclaw-sandbox-base-local", ROOT),
-    envVar: "NEMOCLAW_SANDBOX_BASE_IMAGE_REF",
-    label: "OpenClaw sandbox base image",
-    requireOpenshellSandboxAbi: options.requireOpenshellSandboxAbi === true,
-    rootDir: ROOT,
-  });
-}
-
-function getStableGatewayImageRef(versionOutput: string | null = null): string | null {
-  const version = getInstalledOpenshellVersion(versionOutput);
-  if (!version) return null;
-  return `ghcr.io/nvidia/openshell/cluster:${version}`;
-}
-
 function getOpenshellBinary(): string {
   if (OPENSHELL_BIN) return OPENSHELL_BIN;
   const resolved = resolveOpenshell();
diff --git a/src/lib/onboard/base-image.ts b/src/lib/onboard/base-image.ts
new file mode 100644
index 0000000000..3f9f14daa3
--- /dev/null
+++ b/src/lib/onboard/base-image.ts
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { ROOT } from "../runner";
+import {
+  buildLocalBaseTag,
+  defaultOpenclawBaseDockerfile,
+  resolveSandboxBaseImage,
+  OPENCLAW_SANDBOX_BASE_IMAGE as SANDBOX_BASE_IMAGE,
+} from "../sandbox-base-image";
+import { getInstalledOpenshellVersion } from "./openshell-version";
+
+/**
+ * Resolve a compatible sandbox-base image and pin it to a repo digest when
+ * possible. PR-branch validation first tries a source-SHA tag, then latest,
+ * and finally a local Dockerfile.base build when the OpenShell Docker driver
+ * requires a newer glibc than the published image provides.
+ */
+export function pullAndResolveBaseImageDigest(
+  options: { requireOpenshellSandboxAbi?: boolean } = {},
+): { digest: string | null; ref: string; source?: string; glibcVersion?: string | null } | null {
+  return resolveSandboxBaseImage({
+    imageName: SANDBOX_BASE_IMAGE,
+    dockerfilePath: defaultOpenclawBaseDockerfile(ROOT),
+    localTag: buildLocalBaseTag("nemoclaw-sandbox-base-local", ROOT),
+    envVar: "NEMOCLAW_SANDBOX_BASE_IMAGE_REF",
+    label: "OpenClaw sandbox base image",
+    requireOpenshellSandboxAbi: options.requireOpenshellSandboxAbi === true,
+    rootDir: ROOT,
+  });
+}
+
+export function getStableGatewayImageRef(versionOutput: string | null = null): string | null {
+  const version = getInstalledOpenshellVersion(versionOutput);
+  if (!version) return null;
+  return `ghcr.io/nvidia/openshell/cluster:${version}`;
+}

From cd29f01f52cef7257c602d0e32ef9a470ce9d5ac Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 13:07:41 -0700
Subject: [PATCH 33/54] refactor(cli): extract prompt helpers

---
 src/lib/onboard.ts                | 35 +++-----------------
 src/lib/onboard/prompt-helpers.ts | 54 +++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 30 deletions(-)
 create mode 100644 src/lib/onboard/prompt-helpers.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 0bf5ce14ac..0f534c79f6 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -285,6 +285,7 @@ const {
   getResumeSandboxConflict,
 } = resumeConfig;
 const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts");
+const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
 const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
 const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version");
 const {
@@ -530,48 +531,22 @@ function note(message: string): void {
   console.log(`${DIM}${message}${RESET}`);
 }
 
-// Prompt wrapper: returns env var value or default in non-interactive mode,
-// otherwise prompts the user interactively.
+const promptHelperDeps = { isNonInteractive, note, prompt };
+
 async function promptOrDefault(
   question: string,
   envVar: string | null,
   defaultValue: string,
 ): Promise<string> {
-  if (isNonInteractive()) {
-    const val = envVar ? process.env[envVar] : null;
-    const result = val || defaultValue;
-    note(`  [non-interactive] ${question.trim()} → ${result}`);
-    return result;
-  }
-  return prompt(question);
+  return onboardPromptHelpers.promptOrDefault(promptHelperDeps, question, envVar, defaultValue);
 }
 
-// Yes/no prompt with a typed default. The `[Y/n]` / `[y/N]` indicator and
-// the non-interactive echo letter are both derived from `defaultIsYes`, so
-// the case of the indicator and the echoed default cannot drift apart.
-// Returns a boolean — callers no longer have to parse reply strings.
-// Replies of "y"/"yes" and "n"/"no" win regardless of case; empty and
-// unknown input fall back to the default.
 async function promptYesNoOrDefault(
   question: string,
   envVar: string | null,
   defaultIsYes: boolean,
 ): Promise<boolean> {
-  const fullQuestion = `${question} ${defaultIsYes ? "[Y/n]" : "[y/N]"}: `;
-  const nonInteractive = isNonInteractive();
-  const input = nonInteractive ? (envVar ? process.env[envVar] : null) : await prompt(fullQuestion);
-
-  const value = String(input ?? "")
-    .trim()
-    .toLowerCase();
-  let chosen = defaultIsYes;
-  if (value === "y" || value === "yes") chosen = true;
-  else if (value === "n" || value === "no") chosen = false;
-
-  if (nonInteractive) {
-    note(`  [non-interactive] ${fullQuestion.trim()} → ${chosen ? "Y" : "N"}`);
-  }
-  return chosen;
+  return onboardPromptHelpers.promptYesNoOrDefault(promptHelperDeps, question, envVar, defaultIsYes);
 }
 
 // ── Helpers ──────────────────────────────────────────────────────
diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts
new file mode 100644
index 0000000000..4e274fd054
--- /dev/null
+++ b/src/lib/onboard/prompt-helpers.ts
@@ -0,0 +1,54 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+export interface PromptHelperDeps {
+  isNonInteractive(): boolean;
+  note(message: string): void;
+  prompt(question: string): Promise<string>;
+}
+
+// Prompt wrapper: returns env var value or default in non-interactive mode,
+// otherwise prompts the user interactively.
+export async function promptOrDefault(
+  deps: PromptHelperDeps,
+  question: string,
+  envVar: string | null,
+  defaultValue: string,
+): Promise<string> {
+  if (deps.isNonInteractive()) {
+    const val = envVar ? process.env[envVar] : null;
+    const result = val || defaultValue;
+    deps.note(`  [non-interactive] ${question.trim()} → ${result}`);
+    return result;
+  }
+  return deps.prompt(question);
+}
+
+// Yes/no prompt with a typed default. The `[Y/n]` / `[y/N]` indicator and
+// the non-interactive echo letter are both derived from `defaultIsYes`, so
+// the case of the indicator and the echoed default cannot drift apart.
+// Returns a boolean — callers no longer have to parse reply strings.
+// Replies of "y"/"yes" and "n"/"no" win regardless of case; empty and
+// unknown input fall back to the default.
+export async function promptYesNoOrDefault(
+  deps: PromptHelperDeps,
+  question: string,
+  envVar: string | null,
+  defaultIsYes: boolean,
+): Promise<boolean> {
+  const fullQuestion = `${question} ${defaultIsYes ? "[Y/n]" : "[y/N]"}: `;
+  const nonInteractive = deps.isNonInteractive();
+  const input = nonInteractive ? (envVar ? process.env[envVar] : null) : await deps.prompt(fullQuestion);
+
+  const value = String(input ?? "")
+    .trim()
+    .toLowerCase();
+  let chosen = defaultIsYes;
+  if (value === "y" || value === "yes") chosen = true;
+  else if (value === "n" || value === "no") chosen = false;
+
+  if (nonInteractive) {
+    deps.note(`  [non-interactive] ${fullQuestion.trim()} → ${chosen ? "Y" : "N"}`);
+  }
+  return chosen;
+}

From 3fe22050c43a0f7eb0fe90279c1c4982bd73cd00 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 13:21:29 -0700
Subject: [PATCH 34/54] refactor(cli): extract sandbox gpu preflight helpers

---
 src/lib/onboard.ts                       | 59 +++------------------
 src/lib/onboard/sandbox-gpu-preflight.ts | 66 ++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 53 deletions(-)
 create mode 100644 src/lib/onboard/sandbox-gpu-preflight.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 0f534c79f6..011a7fa671 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -39,7 +39,7 @@ const dockerGpuPatch: typeof import("./onboard/docker-gpu-patch") = require("./o
 const dockerGpuLocalInference: typeof import("./onboard/docker-gpu-local-inference") = require("./onboard/docker-gpu-local-inference");
 const dockerGpuSandboxCreate: typeof import("./onboard/docker-gpu-sandbox-create") = require("./onboard/docker-gpu-sandbox-create");
 const dockerDriverGatewayLaunch: typeof import("./onboard/docker-driver-gateway-launch") = require("./onboard/docker-driver-gateway-launch");
-const { findReadableNvidiaCdiSpecFiles, getDockerCdiSpecDirs, parseDockerCdiSpecDirs }: typeof import("./onboard/docker-cdi") = require("./onboard/docker-cdi");
+const { findReadableNvidiaCdiSpecFiles, parseDockerCdiSpecDirs }: typeof import("./onboard/docker-cdi") = require("./onboard/docker-cdi");
 const { buildSandboxGpuCreateArgs, getSandboxReadyTimeoutSecs }: typeof import("./onboard/sandbox-gpu-create") = require("./onboard/sandbox-gpu-create");
 const {
   isValidProxyHost,
@@ -287,6 +287,11 @@ const {
 const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts");
 const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
 const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
+const {
+  resolveSandboxGpuFlagFromOptions,
+  sandboxGpuRemediationLines,
+  validateSandboxGpuPreflight,
+}: typeof import("./onboard/sandbox-gpu-preflight") = require("./onboard/sandbox-gpu-preflight");
 const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version");
 const {
   getBlueprintMaxOpenshellVersion,
@@ -585,58 +590,6 @@ function step(n: number, total: number, msg: string): void {
   console.log(`  ${"─".repeat(50)}`);
 }
 
-function resolveSandboxGpuFlagFromOptions(
-  opts: Pick<OnboardOptions, "sandboxGpu" | "gpu" | "noGpu">,
-): SandboxGpuFlag {
-  const requestedGpuPassthrough = opts.gpu === true;
-  const optedOutGpuPassthrough = opts.noGpu === true;
-  const sandboxGpuFlag = opts.sandboxGpu ?? null;
-  if (requestedGpuPassthrough && optedOutGpuPassthrough) {
-    console.error("  --gpu and --no-gpu cannot both be set.");
-    process.exit(1);
-  }
-  if (
-    (requestedGpuPassthrough && sandboxGpuFlag === "disable") ||
-    (optedOutGpuPassthrough && sandboxGpuFlag === "enable")
-  ) {
-    console.error("  --gpu/--no-gpu conflict with the sandbox GPU flags.");
-    process.exit(1);
-  }
-  if (sandboxGpuFlag) return sandboxGpuFlag;
-  if (requestedGpuPassthrough) return "enable";
-  if (optedOutGpuPassthrough) return "disable";
-  return null;
-}
-
-function sandboxGpuRemediationLines(): string[] {
-  return [
-    "Install/configure NVIDIA Container Toolkit CDI, then restart Docker:",
-    "  sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml",
-    "  sudo systemctl restart docker",
-    "Or force CPU sandbox behavior with NEMOCLAW_SANDBOX_GPU=0.",
-  ];
-}
-
-function validateSandboxGpuPreflight(config: SandboxGpuConfig): void {
-  if (config.errors.length > 0) {
-    console.error("");
-    for (const error of config.errors) console.error(`  ✗ ${error}`);
-    process.exit(1);
-  }
-  if (!config.sandboxGpuEnabled) return;
-  if (process.platform !== "linux") return;
-
-  const cdiSpecDirs = getDockerCdiSpecDirs();
-  const cdiSpecFiles = findReadableNvidiaCdiSpecFiles(cdiSpecDirs);
-  if (cdiSpecFiles.length === 0) {
-    console.error("");
-    console.error("  ✗ Docker CDI GPU support was not detected.");
-    for (const line of sandboxGpuRemediationLines()) console.error(`    ${line}`);
-    process.exit(1);
-  }
-  console.log(`  ✓ Docker CDI GPU support detected (${cdiSpecFiles.join(", ")})`);
-}
-
 function getOpenshellBinary(): string {
   if (OPENSHELL_BIN) return OPENSHELL_BIN;
   const resolved = resolveOpenshell();
diff --git a/src/lib/onboard/sandbox-gpu-preflight.ts b/src/lib/onboard/sandbox-gpu-preflight.ts
new file mode 100644
index 0000000000..d33a324f3c
--- /dev/null
+++ b/src/lib/onboard/sandbox-gpu-preflight.ts
@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import {
+  findReadableNvidiaCdiSpecFiles,
+  getDockerCdiSpecDirs,
+} from "./docker-cdi";
+import type { SandboxGpuConfig, SandboxGpuFlag } from "./sandbox-gpu-mode";
+
+export interface SandboxGpuFlagOptions {
+  sandboxGpu?: SandboxGpuFlag;
+  gpu?: boolean;
+  noGpu?: boolean;
+}
+
+export function resolveSandboxGpuFlagFromOptions(
+  opts: SandboxGpuFlagOptions,
+): SandboxGpuFlag {
+  const requestedGpuPassthrough = opts.gpu === true;
+  const optedOutGpuPassthrough = opts.noGpu === true;
+  const sandboxGpuFlag = opts.sandboxGpu ?? null;
+  if (requestedGpuPassthrough && optedOutGpuPassthrough) {
+    console.error("  --gpu and --no-gpu cannot both be set.");
+    process.exit(1);
+  }
+  if (
+    (requestedGpuPassthrough && sandboxGpuFlag === "disable") ||
+    (optedOutGpuPassthrough && sandboxGpuFlag === "enable")
+  ) {
+    console.error("  --gpu/--no-gpu conflict with the sandbox GPU flags.");
+    process.exit(1);
+  }
+  if (sandboxGpuFlag) return sandboxGpuFlag;
+  if (requestedGpuPassthrough) return "enable";
+  if (optedOutGpuPassthrough) return "disable";
+  return null;
+}
+
+export function sandboxGpuRemediationLines(): string[] {
+  return [
+    "Install/configure NVIDIA Container Toolkit CDI, then restart Docker:",
+    "  sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml",
+    "  sudo systemctl restart docker",
+    "Or force CPU sandbox behavior with NEMOCLAW_SANDBOX_GPU=0.",
+  ];
+}
+
+export function validateSandboxGpuPreflight(config: SandboxGpuConfig): void {
+  if (config.errors.length > 0) {
+    console.error("");
+    for (const error of config.errors) console.error(`  ✗ ${error}`);
+    process.exit(1);
+  }
+  if (!config.sandboxGpuEnabled) return;
+  if (process.platform !== "linux") return;
+
+  const cdiSpecDirs = getDockerCdiSpecDirs();
+  const cdiSpecFiles = findReadableNvidiaCdiSpecFiles(cdiSpecDirs);
+  if (cdiSpecFiles.length === 0) {
+    console.error("");
+    console.error("  ✗ Docker CDI GPU support was not detected.");
+    for (const line of sandboxGpuRemediationLines()) console.error(`    ${line}`);
+    process.exit(1);
+  }
+  console.log(`  ✓ Docker CDI GPU support detected (${cdiSpecFiles.join(", ")})`);
+}

From 55452223c12d632b52bf9cfca1bd0a3bade9c8c4 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 14:06:23 -0700
Subject: [PATCH 35/54] refactor(cli): extract remediation helpers

---
 src/lib/onboard.ts             | 46 ++++-----------------------------
 src/lib/onboard/remediation.ts | 47 ++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 41 deletions(-)
 create mode 100644 src/lib/onboard/remediation.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 011a7fa671..ea6a68f243 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -276,6 +276,11 @@ const { resolveSandboxImageTagFromCreateOutput } =
   require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag");
 const nim: typeof import("./inference/nim") = require("./inference/nim");
 const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session");
+const {
+  getFutureShellPathHint,
+  getPortConflictServiceHints,
+  printRemediationActions,
+}: typeof import("./onboard/remediation") = require("./onboard/remediation");
 const resumeConfig: typeof import("./onboard/resume-config") = require("./onboard/resume-config");
 const {
   getRequestedModelHint,
@@ -1583,51 +1588,10 @@ const {
 
 const ollamaModelSize: typeof import("./inference/ollama/model-size") = require("./inference/ollama/model-size");
 
-function printRemediationActions(
-  actions: Array<{ title: string; reason: string; commands?: string[] }> | null | undefined,
-): void {
-  if (!Array.isArray(actions) || actions.length === 0) {
-    return;
-  }
-
-  console.error("");
-  console.error("  Suggested fix:");
-  console.error("");
-  for (const action of actions) {
-    console.error(`  - ${action.title}: ${action.reason}`);
-    for (const command of action.commands || []) {
-      console.error(`    ${command}`);
-    }
-  }
-}
-
 function isOpenshellInstalled(): boolean {
   return resolveOpenshell() !== null;
 }
 
-function getFutureShellPathHint(binDir: string, pathValue = process.env.PATH || ""): string | null {
-  const parts = String(pathValue).split(path.delimiter).filter(Boolean);
-  if (parts[0] === binDir) {
-    return null;
-  }
-  return `export PATH="${binDir}:$PATH"`;
-}
-
-function getPortConflictServiceHints(platform = process.platform): string[] {
-  if (platform === "darwin") {
-    return [
-      "       # or, if it's a launchctl service (macOS):",
-      "       launchctl list | grep -i claw   # columns: PID | ExitStatus | Label",
-      `       launchctl unload ${OPENCLAW_LAUNCH_AGENT_PLIST}`,
-      "       # or: launchctl bootout gui/$(id -u)/ai.openclaw.gateway",
-    ];
-  }
-  return [
-    "       # or, if it's a systemd service:",
-    "       systemctl --user stop openclaw-gateway.service",
-  ];
-}
-
 function installOpenshell(): OpenShellInstallResult {
   return openshellPinFlow.runOpenshellInstall({
     scriptsDir: SCRIPTS,
diff --git a/src/lib/onboard/remediation.ts b/src/lib/onboard/remediation.ts
new file mode 100644
index 0000000000..256bb4f29b
--- /dev/null
+++ b/src/lib/onboard/remediation.ts
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import path from "node:path";
+
+const OPENCLAW_LAUNCH_AGENT_PLIST = "~/Library/LaunchAgents/ai.openclaw.gateway.plist";
+
+export function printRemediationActions(
+  actions: Array<{ title: string; reason: string; commands?: string[] }> | null | undefined,
+): void {
+  if (!Array.isArray(actions) || actions.length === 0) {
+    return;
+  }
+
+  console.error("");
+  console.error("  Suggested fix:");
+  console.error("");
+  for (const action of actions) {
+    console.error(`  - ${action.title}: ${action.reason}`);
+    for (const command of action.commands || []) {
+      console.error(`    ${command}`);
+    }
+  }
+}
+
+export function getFutureShellPathHint(binDir: string, pathValue = process.env.PATH || ""): string | null {
+  const parts = String(pathValue).split(path.delimiter).filter(Boolean);
+  if (parts[0] === binDir) {
+    return null;
+  }
+  return `export PATH="${binDir}:$PATH"`;
+}
+
+export function getPortConflictServiceHints(platform = process.platform): string[] {
+  if (platform === "darwin") {
+    return [
+      "       # or, if it's a launchctl service (macOS):",
+      "       launchctl list | grep -i claw   # columns: PID | ExitStatus | Label",
+      `       launchctl unload ${OPENCLAW_LAUNCH_AGENT_PLIST}`,
+      "       # or: launchctl bootout gui/$(id -u)/ai.openclaw.gateway",
+    ];
+  }
+  return [
+    "       # or, if it's a systemd service:",
+    "       systemctl --user stop openclaw-gateway.service",
+  ];
+}

From b0734c50443404cc33c96dddedf2ae5e13587f13 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 14:09:20 -0700
Subject: [PATCH 36/54] refactor(cli): extract provider recovery helpers

---
 src/lib/onboard.ts                   | 130 ++--------------------
 src/lib/onboard/provider-recovery.ts | 154 +++++++++++++++++++++++++++
 2 files changed, 161 insertions(+), 123 deletions(-)
 create mode 100644 src/lib/onboard/provider-recovery.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index ea6a68f243..bd22c2e823 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -291,6 +291,7 @@ const {
 } = resumeConfig;
 const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts");
 const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
+const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery");
 const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
 const {
   resolveSandboxGpuFlagFromOptions,
@@ -4757,131 +4758,14 @@ function providerNameToOptionKey(
   name: string | null | undefined,
   opts: { hasNimContainer?: boolean } = {},
 ): string | null {
-  if (!name) return null;
-  if (name === "nvidia-router") return "routed";
-  if (name === "ollama-local") return "ollama";
-  // Local NIM and standalone vLLM both persist as provider="vllm-local". NIM
-  // is positively identified by a nimContainer record; the absence of one in
-  // registry/session recovery reliably means standalone vLLM (the standalone
-  // path never records a container), so default to "vllm" there. Live-gateway
-  // recovery doesn't carry container info either, but the caller's
-  // option-availability check still gates on whether vllm is actually running.
-  if (name === "vllm-local") return opts.hasNimContainer ? "nim-local" : "vllm";
-  // `nvidia-nim` is a legacy alias for cloud NVIDIA Endpoints (see
-  // setupInference: it routes nvidia-nim through REMOTE_PROVIDER_CONFIG.build),
-  // not a marker for Local NIM. Local NIM persists as vllm-local + nimContainer.
-  if (name === "nvidia-nim") return "build";
-  for (const [key, cfg] of Object.entries(REMOTE_PROVIDER_CONFIG)) {
-    if ((cfg as { providerName?: string }).providerName === name) return key;
-  }
-  return null;
-}
-
-function readLiveInference(
-  sandboxName: string | null | undefined,
-): { provider: string | null; model: string | null } | null {
-  if (!sandboxName) return null;
-  try {
-    const { defaultSandbox, sandboxes } = registry.listSandboxes();
-    // The gateway holds one active inference config at a time. Trust the
-    // live read for the default sandbox, or when the registry has no
-    // entries (rebuild path: destroy wiped the entry but the gateway
-    // config persists). Other non-default sandboxes have a stored config
-    // that the gateway will swap to on their next connect.
-    const trustGateway = sandboxName === defaultSandbox || sandboxes.length === 0;
-    if (!trustGateway) return null;
-    const output = runCaptureOpenshell(["inference", "get"], { ignoreError: true });
-    return parseGatewayInference(output);
-  } catch {
-    return null;
-  }
-}
-
-function readRecordedProvider(sandboxName: string | null | undefined): string | null {
-  if (!sandboxName) return null;
-  try {
-    const entry = registry.getSandbox(sandboxName);
-    if (entry && typeof entry.provider === "string" && entry.provider) {
-      return entry.provider;
-    }
-  } catch {
-    // fall through to session
-  }
-  try {
-    const session = onboardSession.loadSession();
-    if (
-      session &&
-      session.sandboxName === sandboxName &&
-      typeof session.provider === "string" &&
-      session.provider
-    ) {
-      return session.provider;
-    }
-  } catch {
-    // fall through to live gateway
-  }
-  const live = readLiveInference(sandboxName);
-  if (live && typeof live.provider === "string" && live.provider) {
-    return live.provider;
-  }
-  return null;
+  return providerRecovery.providerNameToOptionKey(REMOTE_PROVIDER_CONFIG, name, opts);
 }
 
-function readRecordedNimContainer(sandboxName: string | null | undefined): string | null {
-  if (!sandboxName) return null;
-  try {
-    const entry = registry.getSandbox(sandboxName);
-    if (entry && typeof entry.nimContainer === "string" && entry.nimContainer) {
-      return entry.nimContainer;
-    }
-  } catch {
-    // fall through to session
-  }
-  try {
-    const session = onboardSession.loadSession();
-    if (
-      session &&
-      session.sandboxName === sandboxName &&
-      typeof session.nimContainer === "string" &&
-      session.nimContainer
-    ) {
-      return session.nimContainer;
-    }
-  } catch {
-    return null;
-  }
-  return null;
-}
-
-function readRecordedModel(sandboxName: string | null | undefined): string | null {
-  if (!sandboxName) return null;
-  try {
-    const entry = registry.getSandbox(sandboxName);
-    if (entry && typeof entry.model === "string" && entry.model) {
-      return entry.model;
-    }
-  } catch {
-    // fall through to session
-  }
-  try {
-    const session = onboardSession.loadSession();
-    if (
-      session &&
-      session.sandboxName === sandboxName &&
-      typeof session.model === "string" &&
-      session.model
-    ) {
-      return session.model;
-    }
-  } catch {
-    // fall through to live gateway
-  }
-  const live = readLiveInference(sandboxName);
-  if (live && typeof live.model === "string" && live.model) {
-    return live.model;
-  }
-  return null;
-}
+const { readLiveInference, readRecordedProvider, readRecordedNimContainer, readRecordedModel } =
+  providerRecovery.createProviderRecoveryHelpers({
+    parseGatewayInference,
+    runCaptureOpenshell,
+  });
 
 type OllamaModelSelectionOutcome =
   | { outcome: "selected"; model: string }
diff --git a/src/lib/onboard/provider-recovery.ts b/src/lib/onboard/provider-recovery.ts
new file mode 100644
index 0000000000..cf196a0f7c
--- /dev/null
+++ b/src/lib/onboard/provider-recovery.ts
@@ -0,0 +1,154 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import * as onboardSession from "../state/onboard-session";
+import * as registry from "../state/registry";
+
+export type RemoteProviderConfigEntryLike = { providerName?: string };
+
+export function providerNameToOptionKey(
+  remoteProviderConfig: Record<string, RemoteProviderConfigEntryLike>,
+  name: string | null | undefined,
+  opts: { hasNimContainer?: boolean } = {},
+): string | null {
+  if (!name) return null;
+  if (name === "nvidia-router") return "routed";
+  if (name === "ollama-local") return "ollama";
+  // Local NIM and standalone vLLM both persist as provider="vllm-local". NIM
+  // is positively identified by a nimContainer record; the absence of one in
+  // registry/session recovery reliably means standalone vLLM (the standalone
+  // path never records a container), so default to "vllm" there. Live-gateway
+  // recovery doesn't carry container info either, but the caller's
+  // option-availability check still gates on whether vllm is actually running.
+  if (name === "vllm-local") return opts.hasNimContainer ? "nim-local" : "vllm";
+  // `nvidia-nim` is a legacy alias for cloud NVIDIA Endpoints (see
+  // setupInference: it routes nvidia-nim through REMOTE_PROVIDER_CONFIG.build),
+  // not a marker for Local NIM. Local NIM persists as vllm-local + nimContainer.
+  if (name === "nvidia-nim") return "build";
+  for (const [key, cfg] of Object.entries(remoteProviderConfig)) {
+    if (cfg.providerName === name) return key;
+  }
+  return null;
+}
+
+export interface ProviderRecoveryDeps {
+  parseGatewayInference(output: string | null): { provider: string | null; model: string | null } | null;
+  runCaptureOpenshell(args: string[], opts?: Record<string, unknown>): string | null;
+}
+
+export interface ProviderRecoveryHelpers {
+  readLiveInference(sandboxName: string | null | undefined): { provider: string | null; model: string | null } | null;
+  readRecordedProvider(sandboxName: string | null | undefined): string | null;
+  readRecordedNimContainer(sandboxName: string | null | undefined): string | null;
+  readRecordedModel(sandboxName: string | null | undefined): string | null;
+}
+
+export function createProviderRecoveryHelpers(deps: ProviderRecoveryDeps): ProviderRecoveryHelpers {
+  function readLiveInference(
+    sandboxName: string | null | undefined,
+  ): { provider: string | null; model: string | null } | null {
+    if (!sandboxName) return null;
+    try {
+      const { defaultSandbox, sandboxes } = registry.listSandboxes();
+      // The gateway holds one active inference config at a time. Trust the
+      // live read for the default sandbox, or when the registry has no
+      // entries (rebuild path: destroy wiped the entry but the gateway
+      // config persists). Other non-default sandboxes have a stored config
+      // that the gateway will swap to on their next connect.
+      const trustGateway = sandboxName === defaultSandbox || sandboxes.length === 0;
+      if (!trustGateway) return null;
+      const output = deps.runCaptureOpenshell(["inference", "get"], { ignoreError: true });
+      return deps.parseGatewayInference(output);
+    } catch {
+      return null;
+    }
+  }
+
+  function readRecordedProvider(sandboxName: string | null | undefined): string | null {
+    if (!sandboxName) return null;
+    try {
+      const entry = registry.getSandbox(sandboxName);
+      if (entry && typeof entry.provider === "string" && entry.provider) {
+        return entry.provider;
+      }
+    } catch {
+      // fall through to session
+    }
+    try {
+      const session = onboardSession.loadSession();
+      if (
+        session &&
+        session.sandboxName === sandboxName &&
+        typeof session.provider === "string" &&
+        session.provider
+      ) {
+        return session.provider;
+      }
+    } catch {
+      // fall through to live gateway
+    }
+    const live = readLiveInference(sandboxName);
+    if (live && typeof live.provider === "string" && live.provider) {
+      return live.provider;
+    }
+    return null;
+  }
+
+  function readRecordedNimContainer(sandboxName: string | null | undefined): string | null {
+    if (!sandboxName) return null;
+    try {
+      const entry = registry.getSandbox(sandboxName);
+      if (entry && typeof entry.nimContainer === "string" && entry.nimContainer) {
+        return entry.nimContainer;
+      }
+    } catch {
+      // fall through to session
+    }
+    try {
+      const session = onboardSession.loadSession();
+      if (
+        session &&
+        session.sandboxName === sandboxName &&
+        typeof session.nimContainer === "string" &&
+        session.nimContainer
+      ) {
+        return session.nimContainer;
+      }
+    } catch {
+      return null;
+    }
+    return null;
+  }
+
+  function readRecordedModel(sandboxName: string | null | undefined): string | null {
+    if (!sandboxName) return null;
+    try {
+      const entry = registry.getSandbox(sandboxName);
+      if (entry && typeof entry.model === "string" && entry.model) {
+        return entry.model;
+      }
+    } catch {
+      // fall through to session
+    }
+    try {
+      const session = onboardSession.loadSession();
+      if (
+        session &&
+        session.sandboxName === sandboxName &&
+        typeof session.model === "string" &&
+        session.model
+      ) {
+        return session.model;
+      }
+    } catch {
+      // fall through to live gateway
+    }
+    const live = readLiveInference(sandboxName);
+    if (live && typeof live.model === "string" && live.model) {
+      return live.model;
+    }
+    return null;
+  }
+
+  return { readLiveInference, readRecordedProvider, readRecordedNimContainer, readRecordedModel };
+}

From 5afd6806299663ebebb4a5ccf07b509fc22a1d2a Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 14:12:08 -0700
Subject: [PATCH 37/54] refactor(cli): move Hermes tool gateway normalization

---
 src/lib/onboard.ts                      | 13 +------------
 src/lib/onboard/hermes-managed-tools.ts | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index bd22c2e823..ff388e2ed0 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -432,8 +432,8 @@ import {
 } from "./messaging-channel-config";
 import { streamGatewayStart } from "./onboard/gateway";
 import {
-  HERMES_TOOL_GATEWAY_PRESET_NAMES,
   mergeRequiredHermesToolGatewayPolicyPresets,
+  normalizeHermesToolGatewaySelections,
   setupHermesToolGateways,
   stringSetsEqual,
 } from "./onboard/hermes-managed-tools";
@@ -658,17 +658,6 @@ const {
 const { hydrateCredentialEnv }: typeof import("./onboard/credential-env") =
   require("./onboard/credential-env");
 
-function normalizeHermesToolGatewaySelections(value: unknown): string[] {
-  if (!Array.isArray(value)) return [];
-  const selected = new Set<string>();
-  for (const preset of value) {
-    if (typeof preset === "string" && HERMES_TOOL_GATEWAY_PRESET_NAMES.has(preset)) {
-      selected.add(preset);
-    }
-  }
-  return [...selected].sort();
-}
-
 const {
   summarizeCurlFailure,
   summarizeProbeFailure,
diff --git a/src/lib/onboard/hermes-managed-tools.ts b/src/lib/onboard/hermes-managed-tools.ts
index 1e90a5760e..f32afdc017 100644
--- a/src/lib/onboard/hermes-managed-tools.ts
+++ b/src/lib/onboard/hermes-managed-tools.ts
@@ -1,8 +1,8 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-import * as hermesProviderAuth from "../hermes-provider-auth";
 import type { HermesAuthMethod } from "../hermes-provider-auth";
+import * as hermesProviderAuth from "../hermes-provider-auth";
 
 type PromptFn = (message: string) => Promise<string>;
 type RawInput = NodeJS.ReadStream & {
@@ -238,6 +238,17 @@ async function selectHermesToolGatewaysInteractive(
   return [...selected];
 }
 
+export function normalizeHermesToolGatewaySelections(value: unknown): string[] {
+  if (!Array.isArray(value)) return [];
+  const selected = new Set<string>();
+  for (const preset of value) {
+    if (typeof preset === "string" && HERMES_TOOL_GATEWAY_PRESET_NAMES.has(preset)) {
+      selected.add(preset);
+    }
+  }
+  return [...selected].sort();
+}
+
 export function stringSetsEqual(
   a: string[] | null | undefined,
   b: string[] | null | undefined,

From ae593a83f3d92001972e4c535fb2129b6c9fcf9a Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 14:15:49 -0700
Subject: [PATCH 38/54] refactor(cli): move affirmative prompt helper

---
 src/lib/onboard.ts                | 13 ++++---------
 src/lib/onboard/prompt-helpers.ts |  8 ++++++++
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index ff388e2ed0..47081a1043 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -290,7 +290,10 @@ const {
   getResumeSandboxConflict,
 } = resumeConfig;
 const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts");
-const onboardPromptHelpers: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
+const {
+  isAffirmativeAnswer,
+  ...onboardPromptHelpers
+}: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
 const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery");
 const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
 const {
@@ -1222,14 +1225,6 @@ function isOpenclawReady(sandboxName: string): boolean {
   return Boolean(fetchGatewayAuthTokenFromSandbox(sandboxName));
 }
 
-function isAffirmativeAnswer(value: string | null | undefined): boolean {
-  return ["y", "yes"].includes(
-    String(value || "")
-      .trim()
-      .toLowerCase(),
-  );
-}
-
 function validateBraveSearchApiKey(apiKey: string): CurlProbeResult {
   return runCurlProbe([
     "-sS",
diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts
index 4e274fd054..5e18e2ec95 100644
--- a/src/lib/onboard/prompt-helpers.ts
+++ b/src/lib/onboard/prompt-helpers.ts
@@ -1,6 +1,14 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+export function isAffirmativeAnswer(value: string | null | undefined): boolean {
+  return ["y", "yes"].includes(
+    String(value || "")
+      .trim()
+      .toLowerCase(),
+  );
+}
+
 export interface PromptHelperDeps {
   isNonInteractive(): boolean;
   note(message: string): void;

From 3b270a06953af3f7c4394b75fe3005ca9ac7753c Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 14:22:39 -0700
Subject: [PATCH 39/54] refactor(cli): extract sandbox lifecycle helpers

---
 src/lib/onboard.ts                   | 61 +++++----------------
 src/lib/onboard/sandbox-lifecycle.ts | 80 ++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 47 deletions(-)
 create mode 100644 src/lib/onboard/sandbox-lifecycle.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 47081a1043..7a6e71de01 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -326,6 +326,7 @@ const {
   persistMessagingChannelConfigToSession,
 } = messagingConfig;
 const sandboxAgent: typeof import("./onboard/sandbox-agent") = require("./onboard/sandbox-agent");
+const sandboxLifecycle: typeof import("./onboard/sandbox-lifecycle") = require("./onboard/sandbox-lifecycle");
 const sandboxRegistryMetadata: typeof import("./onboard/sandbox-registry-metadata") = require("./onboard/sandbox-registry-metadata");
 const sandboxReuse: typeof import("./onboard/sandbox-reuse") = require("./onboard/sandbox-reuse");
 const {
@@ -1176,54 +1177,20 @@ function isInferenceRouteReady(provider: string, model: string): boolean {
   return Boolean(live && live.provider === provider && live.model === model);
 }
 
-function sandboxExistsInGateway(sandboxName: string): boolean {
-  const output = runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true });
-  return Boolean(output);
-}
-
-function pruneStaleSandboxEntry(sandboxName: string): boolean {
-  const existing = registry.getSandbox(sandboxName);
-  const liveExists = sandboxExistsInGateway(sandboxName);
-  if (existing && !liveExists) {
-    registry.removeSandbox(sandboxName);
-  }
-  return liveExists;
-}
-
-function shouldRestoreLatestBackupOnRecreate(): boolean {
-  return process.env.NEMOCLAW_RESTORE_LATEST_BACKUP_ON_RECREATE === "1";
-}
-
-async function confirmRecreateForSelectionDrift(
-  sandboxName: string,
-  drift: SelectionDrift,
-  requestedProvider: string | null,
-  requestedModel: string | null,
-): Promise<boolean> {
-  const currentProvider = drift.existingProvider || "unknown";
-  const currentModel = drift.existingModel || "unknown";
-  const nextProvider = requestedProvider || "unknown";
-  const nextModel = requestedModel || "unknown";
-
-  console.log(`  Sandbox '${sandboxName}' exists but requested inference selection changed.`);
-  console.log(`  Current:   provider=${currentProvider}  model=${currentModel}`);
-  console.log(`  Requested: provider=${nextProvider}  model=${nextModel}`);
-  console.log(
-    `  Recreating the sandbox is required to apply this change to the running ${agentProductName()} UI.`,
-  );
-
-  if (isNonInteractive()) {
-    note("  [non-interactive] Recreating sandbox due to provider/model drift.");
-    return true;
-  }
-
-  const answer = await prompt(`  Recreate sandbox '${sandboxName}' now? [y/N]: `);
-  return isAffirmativeAnswer(answer);
-}
+const {
+  sandboxExistsInGateway,
+  pruneStaleSandboxEntry,
+  shouldRestoreLatestBackupOnRecreate,
+  confirmRecreateForSelectionDrift,
+  isOpenclawReady,
+} = sandboxLifecycle.createSandboxLifecycleHelpers({
+  runCaptureOpenshell,
+  fetchGatewayAuthTokenFromSandbox: (sandboxName: string) => fetchGatewayAuthTokenFromSandbox(sandboxName),
+  agentProductName,
+  prompt,
+  isAffirmativeAnswer,
+});
 
-function isOpenclawReady(sandboxName: string): boolean {
-  return Boolean(fetchGatewayAuthTokenFromSandbox(sandboxName));
-}
 
 function validateBraveSearchApiKey(apiKey: string): CurlProbeResult {
   return runCurlProbe([
diff --git a/src/lib/onboard/sandbox-lifecycle.ts b/src/lib/onboard/sandbox-lifecycle.ts
new file mode 100644
index 0000000000..74fed0814b
--- /dev/null
+++ b/src/lib/onboard/sandbox-lifecycle.ts
@@ -0,0 +1,80 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import * as registry from "../state/registry";
+import type { SelectionDrift } from "./selection-drift";
+
+export interface SandboxLifecycleDeps {
+  runCaptureOpenshell(args: string[], opts?: Record<string, unknown>): string | null;
+  fetchGatewayAuthTokenFromSandbox(sandboxName: string): string | null;
+  agentProductName(): string;
+  prompt(question: string): Promise<string>;
+  isAffirmativeAnswer(value: string | null | undefined): boolean;
+}
+
+export interface SandboxLifecycleHelpers {
+  sandboxExistsInGateway(sandboxName: string): boolean;
+  pruneStaleSandboxEntry(sandboxName: string): boolean;
+  shouldRestoreLatestBackupOnRecreate(): boolean;
+  confirmRecreateForSelectionDrift(
+    sandboxName: string,
+    drift: SelectionDrift,
+    requestedProvider: string | null,
+    requestedModel: string | null,
+  ): Promise<boolean>;
+  isOpenclawReady(sandboxName: string): boolean;
+}
+
+export function createSandboxLifecycleHelpers(deps: SandboxLifecycleDeps): SandboxLifecycleHelpers {
+  function sandboxExistsInGateway(sandboxName: string): boolean {
+    const output = deps.runCaptureOpenshell(["sandbox", "get", sandboxName], { ignoreError: true });
+    return Boolean(output);
+  }
+
+  function pruneStaleSandboxEntry(sandboxName: string): boolean {
+    const existing = registry.getSandbox(sandboxName);
+    const liveExists = sandboxExistsInGateway(sandboxName);
+    if (existing && !liveExists) {
+      registry.removeSandbox(sandboxName);
+    }
+    return liveExists;
+  }
+
+  function shouldRestoreLatestBackupOnRecreate(): boolean {
+    return process.env.NEMOCLAW_RESTORE_LATEST_BACKUP_ON_RECREATE === "1";
+  }
+
+  async function confirmRecreateForSelectionDrift(
+    sandboxName: string,
+    drift: SelectionDrift,
+    requestedProvider: string | null,
+    requestedModel: string | null,
+  ): Promise<boolean> {
+    const currentProvider = drift.existingProvider || "unknown";
+    const currentModel = drift.existingModel || "unknown";
+    const nextProvider = requestedProvider || "unknown";
+    const nextModel = requestedModel || "unknown";
+
+    console.log(`  Sandbox '${sandboxName}' exists but requested inference selection changed.`);
+    console.log(`  Current:   provider=${currentProvider}  model=${currentModel}`);
+    console.log(`  Requested: provider=${nextProvider}  model=${nextModel}`);
+    console.log(
+      `  Recreating the sandbox is required to apply this change to the running ${deps.agentProductName()} UI.`,
+    );
+
+    const answer = await deps.prompt(`  Recreate sandbox '${sandboxName}' now? [y/N]: `);
+    return deps.isAffirmativeAnswer(answer);
+  }
+
+  function isOpenclawReady(sandboxName: string): boolean {
+    return Boolean(deps.fetchGatewayAuthTokenFromSandbox(sandboxName));
+  }
+
+  return {
+    sandboxExistsInGateway,
+    pruneStaleSandboxEntry,
+    shouldRestoreLatestBackupOnRecreate,
+    confirmRecreateForSelectionDrift,
+    isOpenclawReady,
+  };
+}

From e5503b494a2427bbb5ef5cc0ca0d786ab926a438 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 14:26:36 -0700
Subject: [PATCH 40/54] refactor(cli): extract openshell CLI helpers

---
 src/lib/onboard.ts               | 67 ++++++++------------------
 src/lib/onboard/openshell-cli.ts | 82 ++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 48 deletions(-)
 create mode 100644 src/lib/onboard/openshell-cli.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 7a6e71de01..3e8ae12bbf 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -296,6 +296,7 @@ const {
 }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
 const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery");
 const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
+const { createOpenshellCliHelpers }: typeof import("./onboard/openshell-cli") = require("./onboard/openshell-cli");
 const {
   resolveSandboxGpuFlagFromOptions,
   sandboxGpuRemediationLines,
@@ -566,6 +567,24 @@ async function promptYesNoOrDefault(
 
 // ── Helpers ──────────────────────────────────────────────────────
 
+const {
+  getOpenshellBinary,
+  openshellShellCommand,
+  openshellArgv,
+  runOpenshell,
+  runCaptureOpenshell,
+  safeOpenShellArgument,
+  getGatewayPortArg,
+  getDockerDriverGatewayEndpointArg,
+} = createOpenshellCliHelpers({
+  getCachedBinary: () => OPENSHELL_BIN,
+  setCachedBinary: (binary: string) => {
+    OPENSHELL_BIN = binary;
+  },
+  getGatewayPort: () => GATEWAY_PORT,
+  getDockerDriverGatewayEndpoint,
+});
+
 // Gateway state functions — delegated to src/lib/state/gateway.ts
 const {
   isSandboxReady,
@@ -600,54 +619,6 @@ function step(n: number, total: number, msg: string): void {
   console.log(`  ${"─".repeat(50)}`);
 }
 
-function getOpenshellBinary(): string {
-  if (OPENSHELL_BIN) return OPENSHELL_BIN;
-  const resolved = resolveOpenshell();
-  if (typeof resolved !== "string" || resolved.length === 0) {
-    console.error("  openshell CLI not found.");
-    console.error("  Install manually: https://github.com/NVIDIA/OpenShell/releases");
-    process.exit(1);
-  }
-  OPENSHELL_BIN = resolved;
-  return OPENSHELL_BIN;
-}
-
-function openshellShellCommand(args: string[], options: { openshellBinary?: string } = {}): string {
-  const openshellBinary = options.openshellBinary || getOpenshellBinary();
-  return [shellQuote(openshellBinary), ...args.map((arg) => shellQuote(arg))].join(" ");
-}
-
-function openshellArgv(args: string[], options: { openshellBinary?: string } = {}): string[] {
-  const openshellBinary = options.openshellBinary || getOpenshellBinary();
-  return [openshellBinary, ...args];
-}
-
-function runOpenshell(args: string[], opts: RunnerOptions & { openshellBinary?: string } = {}) {
-  return run(openshellArgv(args, opts), opts);
-}
-
-function runCaptureOpenshell(
-  args: string[],
-  opts: RunnerOptions & { openshellBinary?: string } = {},
-) {
-  return runCapture(openshellArgv(args, opts), opts);
-}
-
-function safeOpenShellArgument(value: string, label: string): string {
-  if (!/^[A-Za-z0-9._~:/-]+$/.test(value)) {
-    throw new Error(`Invalid ${label}: contains characters unsafe for OpenShell CLI args`);
-  }
-  return value;
-}
-
-function getGatewayPortArg(): string {
-  return safeOpenShellArgument(String(GATEWAY_PORT), "gateway port");
-}
-
-function getDockerDriverGatewayEndpointArg(): string {
-  return safeOpenShellArgument(getDockerDriverGatewayEndpoint(), "gateway endpoint");
-}
-
 const { executeSandboxCommandForVerification }: typeof import("./onboard/sandbox-verification-exec") =
   require("./onboard/sandbox-verification-exec");
 
diff --git a/src/lib/onboard/openshell-cli.ts b/src/lib/onboard/openshell-cli.ts
new file mode 100644
index 0000000000..961ec5e0dd
--- /dev/null
+++ b/src/lib/onboard/openshell-cli.ts
@@ -0,0 +1,82 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { resolveOpenshell } from "../adapters/openshell/resolve";
+import { run, runCapture, shellQuote } from "../runner";
+
+export interface OpenshellCliDeps {
+  getCachedBinary(): string | null;
+  setCachedBinary(binary: string): void;
+  getGatewayPort(): number;
+  getDockerDriverGatewayEndpoint(): string;
+}
+
+export interface OpenshellCliHelpers {
+  getOpenshellBinary(): string;
+  openshellShellCommand(args: string[], options?: { openshellBinary?: string }): string;
+  openshellArgv(args: string[], options?: { openshellBinary?: string }): string[];
+  runOpenshell(args: string[], opts?: any): ReturnType<typeof run>;
+  runCaptureOpenshell(args: string[], opts?: any): string;
+  safeOpenShellArgument(value: string, label: string): string;
+  getGatewayPortArg(): string;
+  getDockerDriverGatewayEndpointArg(): string;
+}
+
+export function createOpenshellCliHelpers(deps: OpenshellCliDeps): OpenshellCliHelpers {
+  function getOpenshellBinary(): string {
+    const cached = deps.getCachedBinary();
+    if (cached) return cached;
+    const resolved = resolveOpenshell();
+    if (typeof resolved !== "string" || resolved.length === 0) {
+      console.error("  openshell CLI not found.");
+      console.error("  Install manually: https://github.com/NVIDIA/OpenShell/releases");
+      process.exit(1);
+    }
+    deps.setCachedBinary(resolved);
+    return resolved;
+  }
+
+  function openshellShellCommand(args: string[], options: { openshellBinary?: string } = {}): string {
+    const openshellBinary = options.openshellBinary || getOpenshellBinary();
+    return [shellQuote(openshellBinary), ...args.map((arg) => shellQuote(arg))].join(" ");
+  }
+
+  function openshellArgv(args: string[], options: { openshellBinary?: string } = {}): string[] {
+    const openshellBinary = options.openshellBinary || getOpenshellBinary();
+    return [openshellBinary, ...args];
+  }
+
+  function runOpenshell(args: string[], opts: any = {}) {
+    return run(openshellArgv(args, opts), opts);
+  }
+
+  function runCaptureOpenshell(args: string[], opts: any = {}) {
+    return runCapture(openshellArgv(args, opts), opts);
+  }
+
+  function safeOpenShellArgument(value: string, label: string): string {
+    if (!/^[A-Za-z0-9._~:/-]+$/.test(value)) {
+      throw new Error(`Invalid ${label}: contains characters unsafe for OpenShell CLI args`);
+    }
+    return value;
+  }
+
+  function getGatewayPortArg(): string {
+    return safeOpenShellArgument(String(deps.getGatewayPort()), "gateway port");
+  }
+
+  function getDockerDriverGatewayEndpointArg(): string {
+    return safeOpenShellArgument(deps.getDockerDriverGatewayEndpoint(), "gateway endpoint");
+  }
+
+  return {
+    getOpenshellBinary,
+    openshellShellCommand,
+    openshellArgv,
+    runOpenshell,
+    runCaptureOpenshell,
+    safeOpenShellArgument,
+    getGatewayPortArg,
+    getDockerDriverGatewayEndpointArg,
+  };
+}

From bceab13d14fb46f4a6a6b58ee9887f2de94cb394 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 17:41:48 -0700
Subject: [PATCH 41/54] refactor(cli): move prompt navigation helpers

---
 src/lib/onboard.ts                | 16 ++--------------
 src/lib/onboard/prompt-helpers.ts | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 3e8ae12bbf..4ed3635ee1 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -291,6 +291,8 @@ const {
 } = resumeConfig;
 const { pruneKnownHostsEntries }: typeof import("./onboard/known-hosts") = require("./onboard/known-hosts");
 const {
+  exitOnboardFromPrompt,
+  getNavigationChoice,
   isAffirmativeAnswer,
   ...onboardPromptHelpers
 }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
@@ -639,20 +641,6 @@ const {
   runCurlProbe,
 } = httpProbe;
 
-function getNavigationChoice(value = ""): "back" | "exit" | null {
-  const normalized = String(value || "")
-    .trim()
-    .toLowerCase();
-  if (normalized === "back") return "back";
-  if (normalized === "exit" || normalized === "quit") return "exit";
-  return null;
-}
-
-function exitOnboardFromPrompt(): never {
-  console.log("  Exiting onboarding.");
-  process.exit(1);
-}
-
 function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null {
   const normalized = String(value || "")
     .trim()
diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts
index 5e18e2ec95..2bd3efe66e 100644
--- a/src/lib/onboard/prompt-helpers.ts
+++ b/src/lib/onboard/prompt-helpers.ts
@@ -1,6 +1,20 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+export function getNavigationChoice(value = ""): "back" | "exit" | null {
+  const normalized = String(value || "")
+    .trim()
+    .toLowerCase();
+  if (normalized === "back") return "back";
+  if (normalized === "exit" || normalized === "quit") return "exit";
+  return null;
+}
+
+export function exitOnboardFromPrompt(): never {
+  console.log("  Exiting onboarding.");
+  process.exit(1);
+}
+
 export function isAffirmativeAnswer(value: string | null | undefined): boolean {
   return ["y", "yes"].includes(
     String(value || "")

From 973ff5b19ccdb99c75af0a7bcd2fd31de6a3f697 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 17:44:40 -0700
Subject: [PATCH 42/54] refactor(cli): extract Hermes auth method helpers

---
 src/lib/onboard.ts             | 57 +++++++---------------------------
 src/lib/onboard/hermes-auth.ts | 51 ++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 45 deletions(-)
 create mode 100644 src/lib/onboard/hermes-auth.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 4ed3635ee1..ee9433ddf6 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -193,6 +193,18 @@ const {
 
 const onboardProviders = require("./onboard/providers");
 const hermesProviderAuth = require("./hermes-provider-auth");
+const hermesAuth: typeof import("./onboard/hermes-auth") = require("./onboard/hermes-auth");
+const {
+  HERMES_AUTH_METHOD_API_KEY,
+  HERMES_AUTH_METHOD_OAUTH,
+  HERMES_NOUS_API_KEY_CREDENTIAL_ENV,
+  HERMES_NOUS_API_KEY_HELP_URL,
+  getRequestedHermesAuthMethod,
+  hermesAuthMethodLabel,
+  normalizeHermesAuthMethod,
+} = hermesAuth;
+
+type HermesAuthMethod = import("./onboard/hermes-auth").HermesAuthMethod;
 
 function getHermesToolGatewayBroker(): any {
   return require("./hermes-tool-gateway-broker");
@@ -491,13 +503,6 @@ const RESET = USE_COLOR ? "\x1b[0m" : "";
 let OPENSHELL_BIN: string | null = null;
 const GATEWAY_NAME = "nemoclaw";
 const BACK_TO_SELECTION = "__NEMOCLAW_BACK_TO_SELECTION__";
-type HermesAuthMethod = "oauth" | "api_key";
-const HERMES_AUTH_METHOD_OAUTH: HermesAuthMethod = "oauth";
-const HERMES_AUTH_METHOD_API_KEY: HermesAuthMethod = "api_key";
-const HERMES_NOUS_API_KEY_CREDENTIAL_ENV =
-  hermesProviderAuth.HERMES_NOUS_API_KEY_CREDENTIAL_ENV || "NOUS_API_KEY";
-const HERMES_NOUS_API_KEY_HELP_URL = "https://portal.nousresearch.com/manage-subscription";
-
 const OPENCLAW_LAUNCH_AGENT_PLIST = "~/Library/LaunchAgents/ai.openclaw.gateway.plist";
 
 const BRAVE_SEARCH_HELP_URL = "https://brave.com/search/api/";
@@ -641,44 +646,6 @@ const {
   runCurlProbe,
 } = httpProbe;
 
-function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null {
-  const normalized = String(value || "")
-    .trim()
-    .toLowerCase()
-    .replace(/[\s-]+/g, "_");
-  if (!normalized) return null;
-  if (normalized === "oauth" || normalized === "nous_oauth" || normalized === "nous_portal_oauth") {
-    return HERMES_AUTH_METHOD_OAUTH;
-  }
-  if (
-    normalized === "api" ||
-    normalized === "key" ||
-    normalized === "api_key" ||
-    normalized === "apikey" ||
-    normalized === "nous_api_key"
-  ) {
-    return HERMES_AUTH_METHOD_API_KEY;
-  }
-  return null;
-}
-
-function hermesAuthMethodLabel(method: HermesAuthMethod | null | undefined): string {
-  return method === HERMES_AUTH_METHOD_API_KEY ? "Nous API Key" : "Nous Portal OAuth";
-}
-
-function getRequestedHermesAuthMethod(): HermesAuthMethod | null {
-  const raw =
-    process.env.NEMOCLAW_HERMES_AUTH_METHOD ||
-    process.env.NEMOCLAW_HERMES_AUTH ||
-    process.env.NEMOCLAW_NOUS_AUTH_METHOD ||
-    "";
-  const method = normalizeHermesAuthMethod(raw);
-  if (!raw || method) return method;
-  console.error(`  Unsupported Hermes Provider auth method: ${raw}`);
-  console.error("  Valid values: oauth, nous-portal-oauth, api-key, nous-api-key");
-  process.exit(1);
-}
-
 async function promptHermesAuthMethod(): Promise<HermesAuthMethod | typeof BACK_TO_SELECTION> {
   const methods: Array<{ key: HermesAuthMethod; label: string }> = [
     { key: HERMES_AUTH_METHOD_OAUTH, label: "Nous Portal OAuth (authenticate via browser)" },
diff --git a/src/lib/onboard/hermes-auth.ts b/src/lib/onboard/hermes-auth.ts
new file mode 100644
index 0000000000..4fb8b9f03f
--- /dev/null
+++ b/src/lib/onboard/hermes-auth.ts
@@ -0,0 +1,51 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { HermesAuthMethod } from "../hermes-provider-auth";
+import * as hermesProviderAuth from "../hermes-provider-auth";
+
+export type { HermesAuthMethod };
+
+export const HERMES_AUTH_METHOD_OAUTH: HermesAuthMethod = "oauth";
+export const HERMES_AUTH_METHOD_API_KEY: HermesAuthMethod = "api_key";
+export const HERMES_NOUS_API_KEY_CREDENTIAL_ENV =
+  hermesProviderAuth.HERMES_NOUS_API_KEY_CREDENTIAL_ENV || "NOUS_API_KEY";
+export const HERMES_NOUS_API_KEY_HELP_URL = "https://portal.nousresearch.com/manage-subscription";
+
+export function normalizeHermesAuthMethod(value: string | null | undefined): HermesAuthMethod | null {
+  const normalized = String(value || "")
+    .trim()
+    .toLowerCase()
+    .replace(/[\s-]+/g, "_");
+  if (!normalized) return null;
+  if (normalized === "oauth" || normalized === "nous_oauth" || normalized === "nous_portal_oauth") {
+    return HERMES_AUTH_METHOD_OAUTH;
+  }
+  if (
+    normalized === "api" ||
+    normalized === "key" ||
+    normalized === "api_key" ||
+    normalized === "apikey" ||
+    normalized === "nous_api_key"
+  ) {
+    return HERMES_AUTH_METHOD_API_KEY;
+  }
+  return null;
+}
+
+export function hermesAuthMethodLabel(method: HermesAuthMethod | null | undefined): string {
+  return method === HERMES_AUTH_METHOD_API_KEY ? "Nous API Key" : "Nous Portal OAuth";
+}
+
+export function getRequestedHermesAuthMethod(): HermesAuthMethod | null {
+  const raw =
+    process.env.NEMOCLAW_HERMES_AUTH_METHOD ||
+    process.env.NEMOCLAW_HERMES_AUTH ||
+    process.env.NEMOCLAW_NOUS_AUTH_METHOD ||
+    "";
+  const method = normalizeHermesAuthMethod(raw);
+  if (!raw || method) return method;
+  console.error(`  Unsupported Hermes Provider auth method: ${raw}`);
+  console.error("  Valid values: oauth, nous-portal-oauth, api-key, nous-api-key");
+  process.exit(1);
+}

From 48e805301424e7a8c7eaae45aa02f59f02e7e0a8 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 17:48:45 -0700
Subject: [PATCH 43/54] refactor(cli): extract Hermes auth flow helpers

---
 src/lib/onboard.ts             | 118 +++++----------------------
 src/lib/onboard/hermes-auth.ts | 141 +++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+), 97 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index ee9433ddf6..c30815a9b7 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -646,103 +646,27 @@ const {
   runCurlProbe,
 } = httpProbe;
 
-async function promptHermesAuthMethod(): Promise<HermesAuthMethod | typeof BACK_TO_SELECTION> {
-  const methods: Array<{ key: HermesAuthMethod; label: string }> = [
-    { key: HERMES_AUTH_METHOD_OAUTH, label: "Nous Portal OAuth (authenticate via browser)" },
-    {
-      key: HERMES_AUTH_METHOD_API_KEY,
-      label: "Nous API Key (paste a key from the provider dashboard)",
-    },
-  ];
-  const requested = getRequestedHermesAuthMethod();
-  if (isNonInteractive()) {
-    const method =
-      requested ||
-      (resolveHermesNousApiKey()
-        ? HERMES_AUTH_METHOD_API_KEY
-        : HERMES_AUTH_METHOD_OAUTH);
-    note(`  [non-interactive] Hermes auth: ${hermesAuthMethodLabel(method)}`);
-    return method;
-  }
-
-  console.log("");
-  console.log("  Hermes Provider authentication:");
-  methods.forEach((method, index) => {
-    console.log(`    ${index + 1}) ${method.label}`);
-  });
-  console.log("");
-
-  const defaultIdx = (requested ? methods.findIndex((method) => method.key === requested) : 0) + 1;
-  const choice = await prompt(`  Choose [${defaultIdx}]: `);
-  const navigation = getNavigationChoice(choice);
-  if (navigation === "back") return BACK_TO_SELECTION;
-  if (navigation === "exit") exitOnboardFromPrompt();
-  const idx = parseInt(choice || String(defaultIdx), 10) - 1;
-  return methods[idx]?.key || methods[defaultIdx - 1]?.key || HERMES_AUTH_METHOD_OAUTH;
-}
-
-function resolveHermesNousApiKey(): string | null {
-  return (
-    // check-direct-credential-env-ignore -- Hermes Provider API keys are read only from the invoking shell for OpenShell provider registration; do not resolve host credentials.json.
-    normalizeCredentialValue(process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV]) ||
-    normalizeCredentialValue(process.env.NEMOCLAW_PROVIDER_KEY) ||
-    null
-  );
-}
-
-function stageNousApiKeyProviderEnv(): void {
-  const key = resolveHermesNousApiKey();
-  if (key) {
-    process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key;
-  }
-}
-
-async function ensureHermesNousApiKeyEnv(): Promise<string> {
-  const existing = resolveHermesNousApiKey();
-  if (existing) {
-    process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = existing;
-    return existing;
-  }
-  console.log("");
-  console.log("  Hermes Provider Nous API Key");
-  console.log(`  Create or copy a key from ${HERMES_NOUS_API_KEY_HELP_URL}`);
-  const key = normalizeCredentialValue(
-    await prompt("  Nous API Key: ", {
-      secret: true,
-    }),
-  );
-  const validationError = validateNvidiaApiKeyValue(key, HERMES_NOUS_API_KEY_CREDENTIAL_ENV);
-  if (validationError) {
-    console.error(validationError);
-    process.exit(1);
-  }
-  process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key;
-  return key;
-}
-
-function openshellResultMessage(result: {
-  stdout?: string | Buffer | null;
-  stderr?: string | Buffer | null;
-}): string {
-  return compactText(redact(`${result.stderr || ""} ${result.stdout || ""}`));
-}
+const {
+  promptHermesAuthMethod,
+  resolveHermesNousApiKey,
+  stageNousApiKeyProviderEnv,
+  ensureHermesNousApiKeyEnv,
+  openshellResultMessage,
+  checkHermesProviderStoreReachable,
+} = hermesAuth.createHermesAuthHelpers({
+  isNonInteractive,
+  note,
+  prompt,
+  getNavigationChoice,
+  exitOnboardFromPrompt,
+  validateNvidiaApiKeyValue: (value: string, envName: string) =>
+    validateNvidiaApiKeyValue(value, envName),
+  compactText,
+  redact,
+  runOpenshell,
+  backToSelection: BACK_TO_SELECTION,
+});
 
-function checkHermesProviderStoreReachable(
-  runOpenshellImpl: typeof runOpenshell = runOpenshell,
-): { ok: true } | { ok: false; message: string } {
-  const result = runOpenshellImpl(["provider", "list"], {
-    ignoreError: true,
-    stdio: ["ignore", "pipe", "pipe"],
-    timeout: 10_000,
-  });
-  if (result.status === 0) return { ok: true };
-  return {
-    ok: false,
-    message:
-      openshellResultMessage(result) ||
-      "OpenShell provider storage is unreachable; the gateway may be stopped or refusing connections.",
-  };
-}
 
 async function selectOnboardAgent({
   agentFlag = null,
@@ -5158,7 +5082,7 @@ async function setupNim(
             console.log("");
             continue selectionLoop;
           }
-          hermesAuthMethod = selectedHermesAuthMethod;
+          hermesAuthMethod = normalizeHermesAuthMethod(selectedHermesAuthMethod);
           if (hermesAuthMethod === HERMES_AUTH_METHOD_API_KEY) {
             credentialEnv = HERMES_NOUS_API_KEY_CREDENTIAL_ENV;
             stageNousApiKeyProviderEnv();
diff --git a/src/lib/onboard/hermes-auth.ts b/src/lib/onboard/hermes-auth.ts
index 4fb8b9f03f..5f69fb4888 100644
--- a/src/lib/onboard/hermes-auth.ts
+++ b/src/lib/onboard/hermes-auth.ts
@@ -1,6 +1,7 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+import { normalizeCredentialValue } from "../credentials/store";
 import type { HermesAuthMethod } from "../hermes-provider-auth";
 import * as hermesProviderAuth from "../hermes-provider-auth";
 
@@ -49,3 +50,143 @@ export function getRequestedHermesAuthMethod(): HermesAuthMethod | null {
   console.error("  Valid values: oauth, nous-portal-oauth, api-key, nous-api-key");
   process.exit(1);
 }
+
+export interface HermesAuthFlowDeps {
+  isNonInteractive(): boolean;
+  note(message: string): void;
+  prompt(question: string, options?: { secret?: boolean }): Promise<string>;
+  getNavigationChoice(value?: string): "back" | "exit" | null;
+  exitOnboardFromPrompt(): never;
+  validateNvidiaApiKeyValue(value: string, envName: string): string | null;
+  compactText(value: string): string;
+  redact(value: unknown): string;
+  runOpenshell(args: string[], opts?: Record<string, unknown>): {
+    status?: number | null;
+    stdout?: string | Buffer | null;
+    stderr?: string | Buffer | null;
+  };
+  backToSelection: string;
+}
+
+export interface HermesAuthHelpers {
+  promptHermesAuthMethod(): Promise<HermesAuthMethod | string>;
+  resolveHermesNousApiKey(): string | null;
+  stageNousApiKeyProviderEnv(): void;
+  ensureHermesNousApiKeyEnv(): Promise<string>;
+  openshellResultMessage(result: {
+    stdout?: string | Buffer | null;
+    stderr?: string | Buffer | null;
+  }): string;
+  checkHermesProviderStoreReachable(
+    runOpenshellImpl?: HermesAuthFlowDeps["runOpenshell"],
+  ): { ok: true } | { ok: false; message: string };
+}
+
+export function createHermesAuthHelpers(deps: HermesAuthFlowDeps): HermesAuthHelpers {
+  async function promptHermesAuthMethod(): Promise<HermesAuthMethod | string> {
+    const methods: Array<{ key: HermesAuthMethod; label: string }> = [
+      { key: HERMES_AUTH_METHOD_OAUTH, label: "Nous Portal OAuth (authenticate via browser)" },
+      {
+        key: HERMES_AUTH_METHOD_API_KEY,
+        label: "Nous API Key (paste a key from the provider dashboard)",
+      },
+    ];
+    const requested = getRequestedHermesAuthMethod();
+    if (deps.isNonInteractive()) {
+      const method =
+        requested ||
+        (resolveHermesNousApiKey()
+          ? HERMES_AUTH_METHOD_API_KEY
+          : HERMES_AUTH_METHOD_OAUTH);
+      deps.note(`  [non-interactive] Hermes auth: ${hermesAuthMethodLabel(method)}`);
+      return method;
+    }
+
+    console.log("");
+    console.log("  Hermes Provider authentication:");
+    methods.forEach((method, index) => {
+      console.log(`    ${index + 1}) ${method.label}`);
+    });
+    console.log("");
+
+    const defaultIdx = (requested ? methods.findIndex((method) => method.key === requested) : 0) + 1;
+    const choice = await deps.prompt(`  Choose [${defaultIdx}]: `);
+    const navigation = deps.getNavigationChoice(choice);
+    if (navigation === "back") return deps.backToSelection;
+    if (navigation === "exit") deps.exitOnboardFromPrompt();
+    const idx = parseInt(choice || String(defaultIdx), 10) - 1;
+    return methods[idx]?.key || methods[defaultIdx - 1]?.key || HERMES_AUTH_METHOD_OAUTH;
+  }
+
+  function resolveHermesNousApiKey(): string | null {
+    return (
+      // check-direct-credential-env-ignore -- Hermes Provider API keys are read only from the invoking shell for OpenShell provider registration; do not resolve host credentials.json.
+      normalizeCredentialValue(process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV]) ||
+      normalizeCredentialValue(process.env.NEMOCLAW_PROVIDER_KEY) ||
+      null
+    );
+  }
+
+  function stageNousApiKeyProviderEnv(): void {
+    const key = resolveHermesNousApiKey();
+    if (key) {
+      process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key;
+    }
+  }
+
+  async function ensureHermesNousApiKeyEnv(): Promise<string> {
+    const existing = resolveHermesNousApiKey();
+    if (existing) {
+      process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = existing;
+      return existing;
+    }
+    console.log("");
+    console.log("  Hermes Provider Nous API Key");
+    console.log(`  Create or copy a key from ${HERMES_NOUS_API_KEY_HELP_URL}`);
+    const key = normalizeCredentialValue(
+      await deps.prompt("  Nous API Key: ", {
+        secret: true,
+      }),
+    );
+    const validationError = deps.validateNvidiaApiKeyValue(key, HERMES_NOUS_API_KEY_CREDENTIAL_ENV);
+    if (validationError) {
+      console.error(validationError);
+      process.exit(1);
+    }
+    process.env[HERMES_NOUS_API_KEY_CREDENTIAL_ENV] = key;
+    return key;
+  }
+
+  function openshellResultMessage(result: {
+    stdout?: string | Buffer | null;
+    stderr?: string | Buffer | null;
+  }): string {
+    return deps.compactText(deps.redact(`${result.stderr || ""} ${result.stdout || ""}`));
+  }
+
+  function checkHermesProviderStoreReachable(
+    runOpenshellImpl: HermesAuthFlowDeps["runOpenshell"] = deps.runOpenshell,
+  ): { ok: true } | { ok: false; message: string } {
+    const result = runOpenshellImpl(["provider", "list"], {
+      ignoreError: true,
+      stdio: ["ignore", "pipe", "pipe"],
+      timeout: 10_000,
+    });
+    if (result.status === 0) return { ok: true };
+    return {
+      ok: false,
+      message:
+        openshellResultMessage(result) ||
+        "OpenShell provider storage is unreachable; the gateway may be stopped or refusing connections.",
+    };
+  }
+
+  return {
+    promptHermesAuthMethod,
+    resolveHermesNousApiKey,
+    stageNousApiKeyProviderEnv,
+    ensureHermesNousApiKeyEnv,
+    openshellResultMessage,
+    checkHermesProviderStoreReachable,
+  };
+}

From b2179813bd23bc89fe308380abde9dac1597f539 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 17:51:02 -0700
Subject: [PATCH 44/54] refactor(cli): extract onboard agent selection

---
 src/lib/onboard.ts                 | 24 ++++++++--------------
 src/lib/onboard/agent-selection.ts | 33 ++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 16 deletions(-)
 create mode 100644 src/lib/onboard/agent-selection.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index c30815a9b7..0e5993ef78 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -15,6 +15,7 @@ const {
   cliName,
   setOnboardBrandingAgent,
 }: typeof import("./onboard/branding") = require("./onboard/branding");
+const { createSelectOnboardAgent }: typeof import("./onboard/agent-selection") = require("./onboard/agent-selection");
 const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files");
 const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup");
 const {
@@ -668,22 +669,13 @@ const {
 });
 
 
-async function selectOnboardAgent({
-  agentFlag = null,
-  session = null,
-}: {
-  agentFlag?: string | null;
-  session?: { agent?: string | null } | null;
-  resume?: boolean;
-  canPrompt?: boolean;
-} = {}): Promise<AgentDefinition | null> {
-  const agent = agentOnboard.resolveAgent({ agentFlag, session });
-  if (isNonInteractive()) {
-    const displayName = agent?.displayName || agentDefs.loadAgent("openclaw").displayName;
-    note(`  [non-interactive] Agent: ${displayName}`);
-  }
-  return agent;
-}
+const selectOnboardAgent = createSelectOnboardAgent({
+  resolveAgent: agentOnboard.resolveAgent,
+  loadAgent: agentDefs.loadAgent,
+  isNonInteractive,
+  note,
+});
+
 
 const { getTransportRecoveryMessage, getProbeRecovery } = validationRecovery;
 
diff --git a/src/lib/onboard/agent-selection.ts b/src/lib/onboard/agent-selection.ts
new file mode 100644
index 0000000000..3f38593aee
--- /dev/null
+++ b/src/lib/onboard/agent-selection.ts
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { AgentDefinition } from "../agent/defs";
+
+export interface SelectOnboardAgentDeps {
+  resolveAgent(options: {
+    agentFlag?: string | null;
+    session?: { agent?: string | null } | null;
+  }): AgentDefinition | null;
+  loadAgent(name: string): AgentDefinition;
+  isNonInteractive(): boolean;
+  note(message: string): void;
+}
+
+export function createSelectOnboardAgent(deps: SelectOnboardAgentDeps) {
+  return async function selectOnboardAgent({
+    agentFlag = null,
+    session = null,
+  }: {
+    agentFlag?: string | null;
+    session?: { agent?: string | null } | null;
+    resume?: boolean;
+    canPrompt?: boolean;
+  } = {}): Promise<AgentDefinition | null> {
+    const agent = deps.resolveAgent({ agentFlag, session });
+    if (deps.isNonInteractive()) {
+      const displayName = agent?.displayName || deps.loadAgent("openclaw").displayName;
+      deps.note(`  [non-interactive] Agent: ${displayName}`);
+    }
+    return agent;
+  };
+}

From 9e442d11b2a9266c723acc2bacd5939a6357ee54 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 17:54:03 -0700
Subject: [PATCH 45/54] refactor(cli): extract require value helper

---
 src/lib/core/require-value.ts   | 9 +++++++++
 src/lib/onboard.ts              | 7 +------
 src/lib/onboard/model-router.ts | 9 +--------
 3 files changed, 11 insertions(+), 14 deletions(-)
 create mode 100644 src/lib/core/require-value.ts

diff --git a/src/lib/core/require-value.ts b/src/lib/core/require-value.ts
new file mode 100644
index 0000000000..a61fe98e3c
--- /dev/null
+++ b/src/lib/core/require-value.ts
@@ -0,0 +1,9 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+export function requireValue<T>(value: T | null | undefined, message: string): T {
+  if (value === null || value === undefined) {
+    throw new Error(message);
+  }
+  return value;
+}
diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 0e5993ef78..040ffcd816 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -127,6 +127,7 @@ const {
 }: typeof import("./onboard/base-image") = require("./onboard/base-image");
 const errnoUtils: typeof import("./core/errno") = require("./core/errno");
 const { isErrnoException } = errnoUtils;
+const { requireValue }: typeof import("./core/require-value") = require("./core/require-value");
 
 type RunnerOptions = {
   env?: NodeJS.ProcessEnv;
@@ -137,12 +138,6 @@ type RunnerOptions = {
   openshellBinary?: string;
 };
 
-function requireValue<T>(value: T | null | undefined, message: string): T {
-  if (value == null) {
-    throw new Error(message);
-  }
-  return value;
-}
 const {
   collectBuildContextStats,
   stageOptimizedSandboxBuildContext,
diff --git a/src/lib/onboard/model-router.ts b/src/lib/onboard/model-router.ts
index 81ca0d10d7..ec35e06063 100644
--- a/src/lib/onboard/model-router.ts
+++ b/src/lib/onboard/model-router.ts
@@ -7,7 +7,7 @@ import fs from "node:fs";
 import http from "node:http";
 import os from "node:os";
 import path from "node:path";
-
+import { requireValue } from "../core/require-value";
 import {
   normalizeCredentialValue,
   resolveProviderCredential,
@@ -59,13 +59,6 @@ export type BlueprintInferenceProfile = {
   router: BlueprintRouterConfig;
 };
 
-function requireValue<T>(value: T | null | undefined, message: string): T {
-  if (value === null || value === undefined) {
-    throw new Error(message);
-  }
-  return value;
-}
-
 /**
  * Load a named inference profile and router config from blueprint.yaml.
  * Returns null if the blueprint or profile is missing.

From 58f38f7c81c24b6a4ce9f779df5d681a8483c375 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 17:56:35 -0700
Subject: [PATCH 46/54] refactor(cli): move onboard step banner helper

---
 src/lib/onboard.ts                | 7 +------
 src/lib/onboard/prompt-helpers.ts | 6 ++++++
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 040ffcd816..4068e95f47 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -302,6 +302,7 @@ const {
   exitOnboardFromPrompt,
   getNavigationChoice,
   isAffirmativeAnswer,
+  step,
   ...onboardPromptHelpers
 }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
 const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery");
@@ -616,12 +617,6 @@ const { getSandboxReuseState, repairRecordedSandbox } = sandboxReuse.createSandb
 
 const { streamSandboxCreate } = sandboxCreateStream;
 
-function step(n: number, total: number, msg: string): void {
-  console.log("");
-  console.log(`  [${n}/${total}] ${msg}`);
-  console.log(`  ${"─".repeat(50)}`);
-}
-
 const { executeSandboxCommandForVerification }: typeof import("./onboard/sandbox-verification-exec") =
   require("./onboard/sandbox-verification-exec");
 
diff --git a/src/lib/onboard/prompt-helpers.ts b/src/lib/onboard/prompt-helpers.ts
index 2bd3efe66e..c99e92f828 100644
--- a/src/lib/onboard/prompt-helpers.ts
+++ b/src/lib/onboard/prompt-helpers.ts
@@ -1,6 +1,12 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+export function step(n: number, total: number, msg: string): void {
+  console.log("");
+  console.log(`  [${n}/${total}] ${msg}`);
+  console.log(`  ${"─".repeat(50)}`);
+}
+
 export function getNavigationChoice(value = ""): "back" | "exit" | null {
   const normalized = String(value || "")
     .trim()

From 7b2d0de0cabe1f805a55490597fb2e1e7344ce2e Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Wed, 20 May 2026 19:48:33 -0700
Subject: [PATCH 47/54] refactor(cli): extract validation recovery prompts

---
 src/lib/onboard.ts                            | 133 ++-------------
 src/lib/onboard/validation-recovery-prompt.ts | 157 ++++++++++++++++++
 2 files changed, 168 insertions(+), 122 deletions(-)
 create mode 100644 src/lib/onboard/validation-recovery-prompt.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 4068e95f47..78db3e35d6 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -307,6 +307,9 @@ const {
 }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
 const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery");
 const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
+const {
+  createValidationRecoveryPromptHelpers,
+}: typeof import("./onboard/validation-recovery-prompt") = require("./onboard/validation-recovery-prompt");
 const { createOpenshellCliHelpers }: typeof import("./onboard/openshell-cli") = require("./onboard/openshell-cli");
 const {
   resolveSandboxGpuFlagFromOptions,
@@ -682,128 +685,14 @@ const {
 
 // validateNvidiaApiKeyValue — see validation import above
 
-async function replaceNamedCredential(
-  envName: string,
-  label: string,
-  helpUrl: string | null = null,
-  validator: ((value: string) => string | null) | null = null,
-): Promise<string> {
-  if (helpUrl) {
-    console.log("");
-    console.log(`  Get your ${label} from: ${helpUrl}`);
-    console.log("");
-  }
-
-  while (true) {
-    const key = normalizeCredentialValue(await prompt(`  ${label}: `, { secret: true }));
-    if (!key) {
-      console.error(`  ${label} is required.`);
-      continue;
-    }
-    const validationError = typeof validator === "function" ? validator(key) : null;
-    if (validationError) {
-      console.error(validationError);
-      continue;
-    }
-    saveCredential(envName, key);
-    process.env[envName] = key;
-    console.log("");
-    console.log("  Credential staged. Onboarding will register it with the OpenShell gateway.");
-    console.log("");
-    return key;
-  }
-}
-
-async function promptValidationRecovery(
-  label: string,
-  recovery: ProbeRecovery,
-  credentialEnv: string | null = null,
-  helpUrl: string | null = null,
-): Promise<"credential" | "selection" | "retry" | "model"> {
-  if (isNonInteractive()) {
-    process.exit(1);
-  }
-
-  if (recovery.kind === "credential" && credentialEnv) {
-    console.log(
-      `  ${label} authorization failed. Re-enter the API key or choose a different provider/model.`,
-    );
-    console.log("  ⚠️  Do NOT paste your API key here — use the options below:");
-    const choice = (
-      await prompt("  Options: retry (re-enter key), back (change provider), exit [retry]: ", {
-        secret: true,
-      })
-    )
-      .trim()
-      .toLowerCase();
-    // Guard against the user accidentally pasting an API key at this prompt.
-    // Tokens don't contain spaces; human sentences do — the no-space + length check
-    // avoids false-positives on long typed sentences.
-    const API_KEY_PREFIXES = ["nvapi-", "ghp_", "gcm-", "sk-", "gpt-", "gemini-", "nvcf-"];
-    const looksLikeToken =
-      API_KEY_PREFIXES.some((p) => choice.startsWith(p)) ||
-      (!choice.includes(" ") && choice.length > 40) ||
-      // Regex fallback: base64-safe token pattern (20+ chars, no spaces, mixed alphanum)
-      /^[A-Za-z0-9_\-\.]{20,}$/.test(choice);
-    // validateNvidiaApiKeyValue is provider-aware: it only enforces the
-    // nvapi- prefix when credentialEnv === "NVIDIA_API_KEY", so passing it
-    // unconditionally here is safe for Anthropic/OpenAI/Gemini too.
-    const validator = (key: string) => validateNvidiaApiKeyValue(key, credentialEnv);
-    if (looksLikeToken) {
-      console.log("  ⚠️  That looks like an API key — do not paste credentials here.");
-      console.log("  Treating as 'retry'. You will be prompted to enter the key securely.");
-      await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator);
-      return "credential";
-    }
-    if (choice === "back") {
-      console.log("  Returning to provider selection.");
-      console.log("");
-      return "selection";
-    }
-    if (choice === "exit" || choice === "quit") {
-      exitOnboardFromPrompt();
-    }
-    if (choice === "" || choice === "retry") {
-      await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator);
-      return "credential";
-    }
-    console.log("  Please choose a provider/model again.");
-    console.log("");
-    return "selection";
-  }
-
-  if (recovery.kind === "transport") {
-    console.log(getTransportRecoveryMessage("failure" in recovery ? recovery.failure || {} : {}));
-    const choice = (await prompt("  Type 'retry', 'back', or 'exit' [retry]: "))
-      .trim()
-      .toLowerCase();
-    if (choice === "back") {
-      console.log("  Returning to provider selection.");
-      console.log("");
-      return "selection";
-    }
-    if (choice === "exit" || choice === "quit") {
-      exitOnboardFromPrompt();
-    }
-    if (choice === "" || choice === "retry") {
-      console.log("");
-      return "retry";
-    }
-    console.log("  Please choose a provider/model again.");
-    console.log("");
-    return "selection";
-  }
-
-  if (recovery.kind === "model") {
-    console.log(`  Please enter a different ${label} model name.`);
-    console.log("");
-    return "model";
-  }
-
-  console.log("  Please choose a provider/model again.");
-  console.log("");
-  return "selection";
-}
+const { replaceNamedCredential, promptValidationRecovery } = createValidationRecoveryPromptHelpers({
+  isNonInteractive,
+  prompt,
+  validateNvidiaApiKeyValue: (key: string, credentialEnv: string | null) =>
+    validateNvidiaApiKeyValue(key, credentialEnv ?? undefined),
+  getTransportRecoveryMessage: (failure: any) => getTransportRecoveryMessage(failure),
+  exitOnboardFromPrompt,
+});
 
 // Provider CRUD — thin wrappers that inject runOpenshell to avoid circular deps.
 const { buildProviderArgs } = onboardProviders;
diff --git a/src/lib/onboard/validation-recovery-prompt.ts b/src/lib/onboard/validation-recovery-prompt.ts
new file mode 100644
index 0000000000..b44cd0676d
--- /dev/null
+++ b/src/lib/onboard/validation-recovery-prompt.ts
@@ -0,0 +1,157 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { normalizeCredentialValue, saveCredential } from "../credentials/store";
+import type { ProbeRecovery } from "../validation-recovery";
+
+export interface ValidationRecoveryPromptDeps {
+  isNonInteractive(): boolean;
+  prompt(question: string, options?: { secret?: boolean }): Promise<string>;
+  validateNvidiaApiKeyValue(key: string, credentialEnv: string | null): string | null;
+  getTransportRecoveryMessage(failure: any): string;
+  exitOnboardFromPrompt(): never;
+}
+
+export interface ValidationRecoveryPromptHelpers {
+  replaceNamedCredential(
+    envName: string,
+    label: string,
+    helpUrl?: string | null,
+    validator?: ((value: string) => string | null) | null,
+  ): Promise<string>;
+  promptValidationRecovery(
+    label: string,
+    recovery: ProbeRecovery,
+    credentialEnv?: string | null,
+    helpUrl?: string | null,
+  ): Promise<"credential" | "selection" | "retry" | "model">;
+}
+
+export function createValidationRecoveryPromptHelpers(
+  deps: ValidationRecoveryPromptDeps,
+): ValidationRecoveryPromptHelpers {
+  async function replaceNamedCredential(
+    envName: string,
+    label: string,
+    helpUrl: string | null = null,
+    validator: ((value: string) => string | null) | null = null,
+  ): Promise<string> {
+    if (helpUrl) {
+      console.log("");
+      console.log(`  Get your ${label} from: ${helpUrl}`);
+      console.log("");
+    }
+
+    while (true) {
+      const key = normalizeCredentialValue(await deps.prompt(`  ${label}: `, { secret: true }));
+      if (!key) {
+        console.error(`  ${label} is required.`);
+        continue;
+      }
+      const validationError = typeof validator === "function" ? validator(key) : null;
+      if (validationError) {
+        console.error(validationError);
+        continue;
+      }
+      saveCredential(envName, key);
+      process.env[envName] = key;
+      console.log("");
+      console.log("  Credential staged. Onboarding will register it with the OpenShell gateway.");
+      console.log("");
+      return key;
+    }
+  }
+
+  async function promptValidationRecovery(
+    label: string,
+    recovery: ProbeRecovery,
+    credentialEnv: string | null = null,
+    helpUrl: string | null = null,
+  ): Promise<"credential" | "selection" | "retry" | "model"> {
+    if (deps.isNonInteractive()) {
+      process.exit(1);
+    }
+
+    if (recovery.kind === "credential" && credentialEnv) {
+      console.log(
+        `  ${label} authorization failed. Re-enter the API key or choose a different provider/model.`,
+      );
+      console.log("  ⚠️  Do NOT paste your API key here — use the options below:");
+      const choice = (
+        await deps.prompt("  Options: retry (re-enter key), back (change provider), exit [retry]: ", {
+          secret: true,
+        })
+      )
+        .trim()
+        .toLowerCase();
+      // Guard against the user accidentally pasting an API key at this prompt.
+      // Tokens don't contain spaces; human sentences do — the no-space + length check
+      // avoids false-positives on long typed sentences.
+      const API_KEY_PREFIXES = ["nvapi-", "ghp_", "gcm-", "sk-", "gpt-", "gemini-", "nvcf-"];
+      const looksLikeToken =
+        API_KEY_PREFIXES.some((prefix) => choice.startsWith(prefix)) ||
+        (!choice.includes(" ") && choice.length > 40) ||
+        // Regex fallback: base64-safe token pattern (20+ chars, no spaces, mixed alphanum)
+        /^[A-Za-z0-9_\-.]{20,}$/.test(choice);
+      // validateNvidiaApiKeyValue is provider-aware: it only enforces the
+      // nvapi- prefix when credentialEnv === "NVIDIA_API_KEY", so passing it
+      // unconditionally here is safe for Anthropic/OpenAI/Gemini too.
+      const validator = (key: string) => deps.validateNvidiaApiKeyValue(key, credentialEnv);
+      if (looksLikeToken) {
+        console.log("  ⚠️  That looks like an API key — do not paste credentials here.");
+        console.log("  Treating as 'retry'. You will be prompted to enter the key securely.");
+        await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator);
+        return "credential";
+      }
+      if (choice === "back") {
+        console.log("  Returning to provider selection.");
+        console.log("");
+        return "selection";
+      }
+      if (choice === "exit" || choice === "quit") {
+        deps.exitOnboardFromPrompt();
+      }
+      if (choice === "" || choice === "retry") {
+        await replaceNamedCredential(credentialEnv, `${label} API key`, helpUrl, validator);
+        return "credential";
+      }
+      console.log("  Please choose a provider/model again.");
+      console.log("");
+      return "selection";
+    }
+
+    if (recovery.kind === "transport") {
+      console.log(deps.getTransportRecoveryMessage("failure" in recovery ? recovery.failure || {} : {}));
+      const choice = (await deps.prompt("  Type 'retry', 'back', or 'exit' [retry]: "))
+        .trim()
+        .toLowerCase();
+      if (choice === "back") {
+        console.log("  Returning to provider selection.");
+        console.log("");
+        return "selection";
+      }
+      if (choice === "exit" || choice === "quit") {
+        deps.exitOnboardFromPrompt();
+      }
+      if (choice === "" || choice === "retry") {
+        console.log("");
+        return "retry";
+      }
+      console.log("  Please choose a provider/model again.");
+      console.log("");
+      return "selection";
+    }
+
+    if (recovery.kind === "model") {
+      console.log(`  Please enter a different ${label} model name.`);
+      console.log("");
+      return "model";
+    }
+
+    console.log("  Please choose a provider/model again.");
+    console.log("");
+    return "selection";
+  }
+
+  return { replaceNamedCredential, promptValidationRecovery };
+}

From eef52549d69c94879b7b8ddfa6368e28bda80756 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Thu, 21 May 2026 14:42:43 -0700
Subject: [PATCH 48/54] refactor(cli): remove duplicate onboard sleep helper

---
 src/lib/onboard.ts | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 78db3e35d6..9957c2f298 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -1324,10 +1324,6 @@ function getOpenShellInstallDeps(): OpenShellInstallDeps {
   };
 }
 
-function sleep(seconds: number): void {
-  sleepSeconds(seconds);
-}
-
 function runQuietOpenshell(args: string[]) {
   return runOpenshell(args, {
     ignoreError: true,
@@ -1356,7 +1352,7 @@ function terminateDockerDriverGatewayProcess(pid: number): boolean {
     process.kill(pid, "SIGTERM");
     for (let i = 0; i < 10; i += 1) {
       if (!isPidAlive(pid)) break;
-      sleep(1);
+      sleepSeconds(1);
     }
     if (isPidAlive(pid)) process.kill(pid, "SIGKILL");
     return true;
@@ -2041,7 +2037,7 @@ function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds =
     // Package-managed OpenShell gateways report readiness through
     // `sandbox list`; legacy Kubernetes gateways may still expose pod state.
     if (isLinuxDockerDriverGatewayEnabled()) {
-      if (i < attempts - 1) sleep(delaySeconds);
+      if (i < attempts - 1) sleepSeconds(delaySeconds);
       continue;
     }
     const podPhase = runCaptureOpenshell(
@@ -2061,7 +2057,7 @@ function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds =
       { ignoreError: true },
     );
     if (podPhase === "Running") return true;
-    sleep(delaySeconds);
+    sleepSeconds(delaySeconds);
   }
   return false;
 }
@@ -2539,7 +2535,7 @@ async function preflight(
             `  Cleaning up orphaned SSH port-forward on port ${port} (PID ${portCheck.pid})...`,
           );
           run(["kill", String(portCheck.pid)], { ignoreError: true });
-          sleep(1);
+          sleepSeconds(1);
           portCheck = await checkPortAvailable(port, portCheckOptions);
           if (portCheck.ok) {
             console.log(`  ✓ Port ${port} available after orphaned forward cleanup (${label})`);
@@ -2801,7 +2797,7 @@ async function startGatewayWithOptions(
           if (isGatewayHealthy(status, namedInfo, currentInfo) && (await isGatewayHttpReady())) {
             return; // success
           }
-          if (i < healthPollCount - 1) sleep(healthPollInterval);
+          if (i < healthPollCount - 1) sleepSeconds(healthPollInterval);
         }
 
         throw new Error("Gateway failed to start");
@@ -2948,7 +2944,7 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg
       console.log(`  Restarting unhealthy Docker-driver gateway process (PID ${existingPid})...`);
       try {
         process.kill(existingPid, "SIGTERM");
-        sleep(1);
+        sleepSeconds(1);
       } catch {
         /* best effort; the new process will surface any remaining port conflict */
       }
@@ -2990,7 +2986,7 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg
       break;
     }
     if (!registerDockerDriverGatewayEndpoint()) {
-      if (i < pollCount - 1) sleep(pollInterval);
+      if (i < pollCount - 1) sleepSeconds(pollInterval);
       continue;
     }
     const status = runCaptureOpenshell(["status"], { ignoreError: true });
@@ -3005,7 +3001,7 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg
       await verifySandboxBridgeGatewayReachableOrExit(exitOnFailure, { skip: skipSandboxBridgeReachability }); console.log("  ✓ Docker-driver gateway is healthy");
       return;
     }
-    if (i < pollCount - 1) sleep(pollInterval);
+    if (i < pollCount - 1) sleepSeconds(pollInterval);
   }
 
   reportDockerDriverGatewayStartFailure(logPath, childExit, { exitOnFailure });
@@ -3200,7 +3196,7 @@ async function recoverGatewayRuntime() {
       }
       return true;
     }
-    if (i < recoveryPollCount - 1) sleep(recoveryPollInterval);
+    if (i < recoveryPollCount - 1) sleepSeconds(recoveryPollInterval);
   }
 
   return false;
@@ -4122,7 +4118,7 @@ async function createSandbox(
     sandboxName,
     gpuDevice: effectiveSandboxGpuConfig.sandboxGpuDevice,
     timeoutSecs: sandboxReadyTimeoutSecs,
-    deps: { runOpenshell, runCaptureOpenshell, sleep },
+    deps: { runOpenshell, runCaptureOpenshell, sleep: sleepSeconds },
   });
   const createResult = await streamSandboxCreate(createCommand, sandboxEnv, {
     readyCheck: () => {
@@ -4189,7 +4185,7 @@ async function createSandbox(
       ready = true;
       break;
     }
-    if (i < readyAttempts - 1) sleep(2);
+    if (i < readyAttempts - 1) sleepSeconds(2);
   }
 
   const restoreBackupPath =
@@ -4257,7 +4253,7 @@ async function createSandbox(
     if (i === 14) {
       console.warn("  Dashboard taking longer than expected to start. Continuing...");
     } else {
-      sleep(2);
+      sleepSeconds(2);
     }
   }
 
@@ -5611,7 +5607,7 @@ async function setupNim(
           runShell("set -o pipefail; curl -fsSL https://ollama.com/install.sh | sh");
           // Give the just-started ollama.service a moment to bind port
           // 11434 before we probe or apply the systemd drop-in override.
-          sleep(2);
+          sleepSeconds(2);
           // Linux native + systemd: force a loopback-only OLLAMA_HOST drop-in
           // and let systemd own the daemon (avoids racing the installer's
           // daemon with our own `ollama serve`). This also repairs older
@@ -7005,7 +7001,7 @@ const {
   note,
   isWsl,
   redact,
-  sleep,
+  sleep: sleepSeconds,
   printAgentDashboardUi: agentOnboard.printDashboardUi,
 });
 

From 5bfa612ac22941123c54694cbc30aad490a3ce5a Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Thu, 21 May 2026 18:57:54 -0700
Subject: [PATCH 49/54] refactor(cli): extract web search flow helpers

---
 src/lib/onboard.ts                 | 181 ++---------------------
 src/lib/onboard/web-search-flow.ts | 221 +++++++++++++++++++++++++++++
 2 files changed, 236 insertions(+), 166 deletions(-)
 create mode 100644 src/lib/onboard/web-search-flow.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 9957c2f298..b806f38d0e 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -307,6 +307,7 @@ const {
 }: typeof import("./onboard/prompt-helpers") = require("./onboard/prompt-helpers");
 const providerRecovery: typeof import("./onboard/provider-recovery") = require("./onboard/provider-recovery");
 const { createOpenclawSetup }: typeof import("./onboard/openclaw-setup") = require("./onboard/openclaw-setup");
+const { createWebSearchFlowHelpers }: typeof import("./onboard/web-search-flow") = require("./onboard/web-search-flow");
 const {
   createValidationRecoveryPromptHelpers,
 }: typeof import("./onboard/validation-recovery-prompt") = require("./onboard/validation-recovery-prompt");
@@ -913,173 +914,21 @@ const {
 });
 
 
-function validateBraveSearchApiKey(apiKey: string): CurlProbeResult {
-  return runCurlProbe([
-    "-sS",
-    "--compressed",
-    "-H",
-    "Accept: application/json",
-    "-H",
-    "Accept-Encoding: gzip",
-    "-H",
-    `X-Subscription-Token: ${apiKey}`,
-    "--get",
-    "--data-urlencode",
-    "q=ping",
-    "--data-urlencode",
-    "count=1",
-    "https://api.search.brave.com/res/v1/web/search",
-  ]);
-}
-
-async function promptBraveSearchRecovery(
-  validation: ValidationFailureLike,
-): Promise<"retry" | "skip"> {
-  const recovery = classifyValidationFailure(validation);
-
-  if (recovery.kind === "credential") {
-    console.log("  Brave Search rejected that API key.");
-  } else if (recovery.kind === "transport") {
-    console.log(getTransportRecoveryMessage(validation));
-  } else {
-    console.log("  Brave Search validation did not succeed.");
-  }
-
-  const answer = (await prompt("  Type 'retry', 'skip', or 'exit' [retry]: ")).trim().toLowerCase();
-  if (answer === "skip") return "skip";
-  if (answer === "exit" || answer === "quit") {
-    exitOnboardFromPrompt();
-  }
-  return "retry";
-}
-
-async function promptBraveSearchApiKey(): Promise<string> {
-  console.log("");
-  console.log(`  Get your Brave Search API key from: ${BRAVE_SEARCH_HELP_URL}`);
-  console.log("");
-
-  while (true) {
-    const key = normalizeCredentialValue(
-      await prompt("  Brave Search API key: ", { secret: true }),
-    );
-    if (!key) {
-      console.error("  Brave Search API key is required.");
-      continue;
-    }
-    return key;
-  }
-}
-
-async function ensureValidatedBraveSearchCredential(
-  nonInteractive = isNonInteractive(),
-): Promise<string | null> {
-  const savedApiKey = getCredential(webSearch.BRAVE_API_KEY_ENV);
-  let apiKey: string | null =
-    savedApiKey || normalizeCredentialValue(process.env[webSearch.BRAVE_API_KEY_ENV]);
-  let usingSavedKey = Boolean(savedApiKey);
-
-  while (true) {
-    if (!apiKey) {
-      if (nonInteractive) {
-        throw new Error(
-          "Brave Search requires BRAVE_API_KEY or a saved Brave Search credential in non-interactive mode.",
-        );
-      }
-      apiKey = await promptBraveSearchApiKey();
-      usingSavedKey = false;
-    }
-
-    const validation = validateBraveSearchApiKey(apiKey);
-    if (validation.ok) {
-      saveCredential(webSearch.BRAVE_API_KEY_ENV, apiKey);
-      process.env[webSearch.BRAVE_API_KEY_ENV] = apiKey;
-      return apiKey;
-    }
-
-    const prefix = usingSavedKey
-      ? "  Saved Brave Search API key validation failed."
-      : "  Brave Search API key validation failed.";
-    console.error(prefix);
-    if (validation.message) {
-      console.error(`  ${validation.message}`);
-    }
-
-    if (nonInteractive) {
-      throw new Error(
-        validation.message || "Brave Search API key validation failed in non-interactive mode.",
-      );
-    }
-
-    const action = await promptBraveSearchRecovery(validation);
-    if (action === "skip") {
-      console.log("  Skipping Brave Web Search setup.");
-      console.log("");
-      return null;
-    }
-
-    apiKey = null;
-    usingSavedKey = false;
-  }
-}
-
-async function configureWebSearch(
-  existingConfig: WebSearchConfig | null = null,
-  agent: AgentDefinition | null = null,
-  dockerfilePathOverride: string | null = null,
-): Promise<WebSearchConfig | null> {
-  if (!agentSupportsWebSearch(agent, dockerfilePathOverride, ROOT)) {
-    note(`  Web search is not yet supported by ${agent?.displayName ?? "this agent"}. Skipping.`);
-    return null;
-  }
-
-  if (existingConfig) {
-    return { fetchEnabled: true };
-  }
-
-  if (isNonInteractive()) {
-    const braveApiKey = normalizeCredentialValue(process.env[webSearch.BRAVE_API_KEY_ENV]);
-    if (!braveApiKey) {
-      return null;
-    }
-    note("  [non-interactive] Brave Web Search requested.");
-    const validation = validateBraveSearchApiKey(braveApiKey);
-    if (!validation.ok) {
-      console.warn(
-        `  Brave Search API key validation failed. Web search will be disabled — re-enable later via \`${cliName()} config web-search\`.`,
-      );
-      if (validation.message) {
-        console.warn(`  ${validation.message}`);
-      }
-      return null;
-    }
-    saveCredential(webSearch.BRAVE_API_KEY_ENV, braveApiKey);
-    process.env[webSearch.BRAVE_API_KEY_ENV] = braveApiKey;
-    return { fetchEnabled: true };
-  }
-  const enableAnswer = await prompt("  Enable Brave Web Search? [y/N]: ");
-  if (!isAffirmativeAnswer(enableAnswer)) {
-    return null;
-  }
-
-  const braveApiKey = await ensureValidatedBraveSearchCredential();
-  if (!braveApiKey) {
-    return null;
-  }
-
-  console.log("  ✓ Enabled Brave Web Search");
-  console.log("");
-  return { fetchEnabled: true };
-}
+const {
+  validateBraveSearchApiKey,
+  promptBraveSearchRecovery,
+  promptBraveSearchApiKey,
+  ensureValidatedBraveSearchCredential,
+  configureWebSearch,
+  verifyWebSearchInsideSandbox,
+} = createWebSearchFlowHelpers({
+  prompt,
+  note,
+  isNonInteractive,
+  cliName,
+  runCaptureOpenshell,
+});
 
-function verifyWebSearchInsideSandbox(
-  sandboxName: string,
-  agent: AgentDefinition | null | undefined,
-): void {
-  verifyWebSearchInsideSandboxWithDeps(sandboxName, agent, {
-    runCaptureOpenshell,
-    cliName,
-  });
-}
 
 // getSandboxInferenceConfig — moved to onboard-providers.ts
 
diff --git a/src/lib/onboard/web-search-flow.ts b/src/lib/onboard/web-search-flow.ts
new file mode 100644
index 0000000000..ab1af78d5f
--- /dev/null
+++ b/src/lib/onboard/web-search-flow.ts
@@ -0,0 +1,221 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import type { CurlProbeResult } from "../adapters/http/probe";
+import { runCurlProbe } from "../adapters/http/probe";
+import type { AgentDefinition } from "../agent/defs";
+import { getCredential, normalizeCredentialValue, saveCredential } from "../credentials/store";
+import type { WebSearchConfig } from "../inference/web-search";
+import { BRAVE_API_KEY_ENV } from "../inference/web-search";
+import { ROOT } from "../runner";
+import { classifyValidationFailure } from "../validation";
+import { getTransportRecoveryMessage } from "../validation-recovery";
+import { exitOnboardFromPrompt, isAffirmativeAnswer } from "./prompt-helpers";
+import type { ValidationFailureLike } from "./types";
+import { agentSupportsWebSearch } from "./web-search-support";
+import { verifyWebSearchInsideSandbox as verifyWebSearchInsideSandboxWithDeps } from "./web-search-verify";
+
+const BRAVE_SEARCH_HELP_URL = "https://brave.com/search/api/";
+
+export interface WebSearchFlowDeps {
+  prompt(question: string, options?: { secret?: boolean }): Promise<string>;
+  note(message: string): void;
+  isNonInteractive(): boolean;
+  cliName(): string;
+  runCaptureOpenshell(args: string[], opts?: Record<string, unknown>): string | null;
+}
+
+export interface WebSearchFlowHelpers {
+  validateBraveSearchApiKey(apiKey: string): CurlProbeResult;
+  promptBraveSearchRecovery(validation: ValidationFailureLike): Promise<"retry" | "skip">;
+  promptBraveSearchApiKey(): Promise<string>;
+  ensureValidatedBraveSearchCredential(nonInteractive?: boolean): Promise<string | null>;
+  configureWebSearch(
+    existingConfig?: WebSearchConfig | null,
+    agent?: AgentDefinition | null,
+    dockerfilePathOverride?: string | null,
+  ): Promise<WebSearchConfig | null>;
+  verifyWebSearchInsideSandbox(
+    sandboxName: string,
+    agent: AgentDefinition | null | undefined,
+  ): void;
+}
+
+export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFlowHelpers {
+  function validateBraveSearchApiKey(apiKey: string): CurlProbeResult {
+    return runCurlProbe([
+      "-sS",
+      "--compressed",
+      "-H",
+      "Accept: application/json",
+      "-H",
+      "Accept-Encoding: gzip",
+      "-H",
+      `X-Subscription-Token: ${apiKey}`,
+      "--get",
+      "--data-urlencode",
+      "q=ping",
+      "--data-urlencode",
+      "count=1",
+      "https://api.search.brave.com/res/v1/web/search",
+    ]);
+  }
+
+  async function promptBraveSearchRecovery(
+    validation: ValidationFailureLike,
+  ): Promise<"retry" | "skip"> {
+    const recovery = classifyValidationFailure(validation);
+
+    if (recovery.kind === "credential") {
+      console.log("  Brave Search rejected that API key.");
+    } else if (recovery.kind === "transport") {
+      console.log(getTransportRecoveryMessage(validation));
+    } else {
+      console.log("  Brave Search validation did not succeed.");
+    }
+
+    const answer = (await deps.prompt("  Type 'retry', 'skip', or 'exit' [retry]: ")).trim().toLowerCase();
+    if (answer === "skip") return "skip";
+    if (answer === "exit" || answer === "quit") {
+      exitOnboardFromPrompt();
+    }
+    return "retry";
+  }
+
+  async function promptBraveSearchApiKey(): Promise<string> {
+    console.log("");
+    console.log(`  Get your Brave Search API key from: ${BRAVE_SEARCH_HELP_URL}`);
+    console.log("");
+
+    while (true) {
+      const key = normalizeCredentialValue(
+        await deps.prompt("  Brave Search API key: ", { secret: true }),
+      );
+      if (!key) {
+        console.error("  Brave Search API key is required.");
+        continue;
+      }
+      return key;
+    }
+  }
+
+  async function ensureValidatedBraveSearchCredential(
+    nonInteractive = deps.isNonInteractive(),
+  ): Promise<string | null> {
+    const savedApiKey = getCredential(BRAVE_API_KEY_ENV);
+    let apiKey: string | null =
+      savedApiKey || normalizeCredentialValue(process.env[BRAVE_API_KEY_ENV]);
+    let usingSavedKey = Boolean(savedApiKey);
+
+    while (true) {
+      if (!apiKey) {
+        if (nonInteractive) {
+          throw new Error(
+            "Brave Search requires BRAVE_API_KEY or a saved Brave Search credential in non-interactive mode.",
+          );
+        }
+        apiKey = await promptBraveSearchApiKey();
+        usingSavedKey = false;
+      }
+
+      const validation = validateBraveSearchApiKey(apiKey);
+      if (validation.ok) {
+        saveCredential(BRAVE_API_KEY_ENV, apiKey);
+        process.env[BRAVE_API_KEY_ENV] = apiKey;
+        return apiKey;
+      }
+
+      const prefix = usingSavedKey
+        ? "  Saved Brave Search API key validation failed."
+        : "  Brave Search API key validation failed.";
+      console.error(prefix);
+      if (validation.message) {
+        console.error(`  ${validation.message}`);
+      }
+
+      if (nonInteractive) {
+        throw new Error(
+          validation.message || "Brave Search API key validation failed in non-interactive mode.",
+        );
+      }
+
+      const action = await promptBraveSearchRecovery(validation);
+      if (action === "skip") {
+        console.log("  Skipping Brave Web Search setup.");
+        console.log("");
+        return null;
+      }
+
+      apiKey = null;
+      usingSavedKey = false;
+    }
+  }
+
+  async function configureWebSearch(
+    existingConfig: WebSearchConfig | null = null,
+    agent: AgentDefinition | null = null,
+    dockerfilePathOverride: string | null = null,
+  ): Promise<WebSearchConfig | null> {
+    if (!agentSupportsWebSearch(agent, dockerfilePathOverride, ROOT)) {
+      deps.note(`  Web search is not yet supported by ${agent?.displayName ?? "this agent"}. Skipping.`);
+      return null;
+    }
+
+    if (existingConfig) {
+      return { fetchEnabled: true };
+    }
+
+    if (deps.isNonInteractive()) {
+      const braveApiKey = normalizeCredentialValue(process.env[BRAVE_API_KEY_ENV]);
+      if (!braveApiKey) {
+        return null;
+      }
+      deps.note("  [non-interactive] Brave Web Search requested.");
+      const validation = validateBraveSearchApiKey(braveApiKey);
+      if (!validation.ok) {
+        console.warn(
+          `  Brave Search API key validation failed. Web search will be disabled — re-enable later via \`${deps.cliName()} config web-search\`.`,
+        );
+        if (validation.message) {
+          console.warn(`  ${validation.message}`);
+        }
+        return null;
+      }
+      saveCredential(BRAVE_API_KEY_ENV, braveApiKey);
+      process.env[BRAVE_API_KEY_ENV] = braveApiKey;
+      return { fetchEnabled: true };
+    }
+    const enableAnswer = await deps.prompt("  Enable Brave Web Search? [y/N]: ");
+    if (!isAffirmativeAnswer(enableAnswer)) {
+      return null;
+    }
+
+    const braveApiKey = await ensureValidatedBraveSearchCredential();
+    if (!braveApiKey) {
+      return null;
+    }
+
+    console.log("  ✓ Enabled Brave Web Search");
+    console.log("");
+    return { fetchEnabled: true };
+  }
+
+  function verifyWebSearchInsideSandbox(
+    sandboxName: string,
+    agent: AgentDefinition | null | undefined,
+  ): void {
+    verifyWebSearchInsideSandboxWithDeps(sandboxName, agent, {
+      runCaptureOpenshell: deps.runCaptureOpenshell,
+      cliName: deps.cliName,
+    });
+  }
+
+  return {
+    validateBraveSearchApiKey,
+    promptBraveSearchRecovery,
+    promptBraveSearchApiKey,
+    ensureValidatedBraveSearchCredential,
+    configureWebSearch,
+    verifyWebSearchInsideSandbox,
+  };
+}

From 826d82afaa1be6031cd6041026571b1c72d2d67a Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Thu, 21 May 2026 19:02:10 -0700
Subject: [PATCH 50/54] refactor(cli): extract inference selection validation

---
 src/lib/onboard.ts                            | 156 +-----------
 .../onboard/inference-selection-validation.ts | 223 ++++++++++++++++++
 2 files changed, 236 insertions(+), 143 deletions(-)
 create mode 100644 src/lib/onboard/inference-selection-validation.ts

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index b806f38d0e..a6e9cca8b1 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -16,6 +16,9 @@ const {
   setOnboardBrandingAgent,
 }: typeof import("./onboard/branding") = require("./onboard/branding");
 const { createSelectOnboardAgent }: typeof import("./onboard/agent-selection") = require("./onboard/agent-selection");
+const {
+  createInferenceSelectionValidationHelpers,
+}: typeof import("./onboard/inference-selection-validation") = require("./onboard/inference-selection-validation");
 const { cleanupTempDir }: typeof import("./onboard/temp-files") = require("./onboard/temp-files");
 const { stopStaleDashboardListenersForSandbox } = require("./onboard/stale-gateway-cleanup");
 const {
@@ -945,150 +948,17 @@ const {
   probeAnthropicEndpoint,
 } = require("./inference/onboard-probes");
 
-async function validateOpenAiLikeSelection(
-  label: string,
-  endpointUrl: string,
-  model: string,
-  credentialEnv: string | null = null,
-  retryMessage = "Please choose a provider/model again.",
-  helpUrl: string | null = null,
-  options: {
-    authMode?: "bearer" | "query-param";
-    requireResponsesToolCalling?: boolean;
-    requireChatCompletionsToolCalling?: boolean;
-    skipResponsesProbe?: boolean;
-    probeStreaming?: boolean;
-  } = {},
-): Promise<EndpointValidationResult> {
-  const apiKey = credentialEnv ? getCredential(credentialEnv) : "";
-  const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options);
-  if (!probe.ok) {
-    console.error(`  ${label} endpoint validation failed.`);
-    console.error(`  ${probe.message}`);
-    if (isNonInteractive()) {
-      process.exit(1);
-    }
-    const retry = await promptValidationRecovery(
-      label,
-      getProbeRecovery(probe),
-      credentialEnv,
-      helpUrl,
-    );
-    if (retry === "selection") {
-      console.log(`  ${retryMessage}`);
-      console.log("");
-    }
-    return { ok: false, retry };
-  }
-  if (probe.note) {
-    console.log(`  ℹ ${probe.note}`);
-  } else {
-    console.log(`  ${probe.label} available — ${agentProductName()} will use ${probe.api}.`);
-  }
-  return { ok: true, api: probe.api ?? "openai-completions" };
-}
-
-async function validateAnthropicSelectionWithRetryMessage(
-  label: string,
-  endpointUrl: string,
-  model: string,
-  credentialEnv: string,
-  retryMessage = "Please choose a provider/model again.",
-  helpUrl: string | null = null,
-): Promise<EndpointValidationResult> {
-  const apiKey = getCredential(credentialEnv);
-  const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey);
-  if (!probe.ok) {
-    console.error(`  ${label} endpoint validation failed.`);
-    console.error(`  ${probe.message}`);
-    if (isNonInteractive()) {
-      process.exit(1);
-    }
-    const retry = await promptValidationRecovery(
-      label,
-      getProbeRecovery(probe),
-      credentialEnv,
-      helpUrl,
-    );
-    if (retry === "selection") {
-      console.log(`  ${retryMessage}`);
-      console.log("");
-    }
-    return { ok: false, retry };
-  }
-  console.log(`  ${probe.label} available — ${agentProductName()} will use ${probe.api}.`);
-  return { ok: true, api: probe.api };
-}
-
-async function validateCustomOpenAiLikeSelection(
-  label: string,
-  endpointUrl: string,
-  model: string,
-  credentialEnv: string,
-  helpUrl: string | null = null,
-): Promise<EndpointValidationResult> {
-  const apiKey = getCredential(credentialEnv);
-  const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, {
-    requireResponsesToolCalling: true,
-    skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API),
-    probeStreaming: true,
-  });
-  if (probe.ok) {
-    if (probe.note) {
-      console.log(`  ℹ ${probe.note}`);
-    } else {
-      console.log(`  ${probe.label} available — ${agentProductName()} will use ${probe.api}.`);
-    }
-    return { ok: true, api: probe.api ?? "openai-completions" };
-  }
-  console.error(`  ${label} endpoint validation failed.`);
-  console.error(`  ${probe.message}`);
-  if (isNonInteractive()) {
-    process.exit(1);
-  }
-  const retry = await promptValidationRecovery(
-    label,
-    getProbeRecovery(probe, { allowModelRetry: true }),
-    credentialEnv,
-    helpUrl,
-  );
-  if (retry === "selection") {
-    console.log("  Please choose a provider/model again.");
-    console.log("");
-  }
-  return { ok: false, retry };
-}
+const {
+  validateOpenAiLikeSelection,
+  validateAnthropicSelectionWithRetryMessage,
+  validateCustomOpenAiLikeSelection,
+  validateCustomAnthropicSelection,
+} = createInferenceSelectionValidationHelpers({
+  isNonInteractive,
+  agentProductName,
+  promptValidationRecovery,
+});
 
-async function validateCustomAnthropicSelection(
-  label: string,
-  endpointUrl: string,
-  model: string,
-  credentialEnv: string,
-  helpUrl: string | null = null,
-): Promise<EndpointValidationResult> {
-  const apiKey = getCredential(credentialEnv);
-  const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey);
-  if (probe.ok) {
-    console.log(`  ${probe.label} available — ${agentProductName()} will use ${probe.api}.`);
-    return { ok: true, api: probe.api };
-  }
-  console.error(`  ${label} endpoint validation failed.`);
-  console.error(`  ${probe.message}`);
-  if (isNonInteractive()) {
-    process.exit(1);
-  }
-  const retry = await promptValidationRecovery(
-    label,
-    getProbeRecovery(probe, { allowModelRetry: true }),
-    credentialEnv,
-    helpUrl,
-  );
-  if (retry === "selection") {
-    console.log("  Please choose a provider/model again.");
-    console.log("");
-  }
-  return { ok: false, retry };
-}
 
 const { promptCloudModel, promptRemoteModel, promptInputModel } = modelPrompts;
 const { validateAnthropicModel, validateOpenAiLikeModel } = providerModels;
diff --git a/src/lib/onboard/inference-selection-validation.ts b/src/lib/onboard/inference-selection-validation.ts
new file mode 100644
index 0000000000..b5e4ca282c
--- /dev/null
+++ b/src/lib/onboard/inference-selection-validation.ts
@@ -0,0 +1,223 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { getCredential } from "../credentials/store";
+
+const { probeAnthropicEndpoint, probeOpenAiLikeEndpoint } = require("../inference/onboard-probes") as {
+  probeAnthropicEndpoint(endpointUrl: string, model: string, apiKey: string | null | undefined): any;
+  probeOpenAiLikeEndpoint(endpointUrl: string, model: string, apiKey: string | null | undefined, options?: Record<string, unknown>): any;
+};
+
+import { shouldForceCompletionsApi } from "../validation";
+import { getProbeRecovery } from "../validation-recovery";
+
+export type EndpointValidationResult =
+  | { ok: true; api: string | null; retry?: undefined }
+  | { ok: false; retry: "credential" | "selection" | "retry" | "model"; api?: undefined };
+
+export interface InferenceSelectionValidationDeps {
+  isNonInteractive(): boolean;
+  agentProductName(): string;
+  promptValidationRecovery(
+    label: string,
+    recovery: ReturnType<typeof getProbeRecovery>,
+    credentialEnv?: string | null,
+    helpUrl?: string | null,
+  ): Promise<"credential" | "selection" | "retry" | "model">;
+}
+
+export interface InferenceSelectionValidationHelpers {
+  validateOpenAiLikeSelection(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv?: string | null,
+    retryMessage?: string,
+    helpUrl?: string | null,
+    options?: {
+      authMode?: "bearer" | "query-param";
+      requireResponsesToolCalling?: boolean;
+      requireChatCompletionsToolCalling?: boolean;
+      skipResponsesProbe?: boolean;
+      probeStreaming?: boolean;
+    },
+  ): Promise<EndpointValidationResult>;
+  validateAnthropicSelectionWithRetryMessage(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv: string,
+    retryMessage?: string,
+    helpUrl?: string | null,
+  ): Promise<EndpointValidationResult>;
+  validateCustomOpenAiLikeSelection(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv: string,
+    helpUrl?: string | null,
+  ): Promise<EndpointValidationResult>;
+  validateCustomAnthropicSelection(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv: string,
+    helpUrl?: string | null,
+  ): Promise<EndpointValidationResult>;
+}
+
+export function createInferenceSelectionValidationHelpers(
+  deps: InferenceSelectionValidationDeps,
+): InferenceSelectionValidationHelpers {
+  async function validateOpenAiLikeSelection(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv: string | null = null,
+    retryMessage = "Please choose a provider/model again.",
+    helpUrl: string | null = null,
+    options: {
+      authMode?: "bearer" | "query-param";
+      requireResponsesToolCalling?: boolean;
+      requireChatCompletionsToolCalling?: boolean;
+      skipResponsesProbe?: boolean;
+      probeStreaming?: boolean;
+    } = {},
+  ): Promise<EndpointValidationResult> {
+    const apiKey = credentialEnv ? getCredential(credentialEnv) : "";
+    const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options);
+    if (!probe.ok) {
+      console.error(`  ${label} endpoint validation failed.`);
+      console.error(`  ${probe.message}`);
+      if (deps.isNonInteractive()) {
+        process.exit(1);
+      }
+      const retry = await deps.promptValidationRecovery(
+        label,
+        getProbeRecovery(probe),
+        credentialEnv,
+        helpUrl,
+      );
+      if (retry === "selection") {
+        console.log(`  ${retryMessage}`);
+        console.log("");
+      }
+      return { ok: false, retry };
+    }
+    if (probe.note) {
+      console.log(`  ℹ ${probe.note}`);
+    } else {
+      console.log(`  ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`);
+    }
+    return { ok: true, api: probe.api ?? "openai-completions" };
+  }
+
+  async function validateAnthropicSelectionWithRetryMessage(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv: string,
+    retryMessage = "Please choose a provider/model again.",
+    helpUrl: string | null = null,
+  ): Promise<EndpointValidationResult> {
+    const apiKey = getCredential(credentialEnv);
+    const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey);
+    if (!probe.ok) {
+      console.error(`  ${label} endpoint validation failed.`);
+      console.error(`  ${probe.message}`);
+      if (deps.isNonInteractive()) {
+        process.exit(1);
+      }
+      const retry = await deps.promptValidationRecovery(
+        label,
+        getProbeRecovery(probe),
+        credentialEnv,
+        helpUrl,
+      );
+      if (retry === "selection") {
+        console.log(`  ${retryMessage}`);
+        console.log("");
+      }
+      return { ok: false, retry };
+    }
+    console.log(`  ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`);
+    return { ok: true, api: probe.api };
+  }
+
+  async function validateCustomOpenAiLikeSelection(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv: string,
+    helpUrl: string | null = null,
+  ): Promise<EndpointValidationResult> {
+    const apiKey = getCredential(credentialEnv);
+    const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, {
+      requireResponsesToolCalling: true,
+      skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API),
+      probeStreaming: true,
+    });
+    if (probe.ok) {
+      if (probe.note) {
+        console.log(`  ℹ ${probe.note}`);
+      } else {
+        console.log(`  ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`);
+      }
+      return { ok: true, api: probe.api ?? "openai-completions" };
+    }
+    console.error(`  ${label} endpoint validation failed.`);
+    console.error(`  ${probe.message}`);
+    if (deps.isNonInteractive()) {
+      process.exit(1);
+    }
+    const retry = await deps.promptValidationRecovery(
+      label,
+      getProbeRecovery(probe, { allowModelRetry: true }),
+      credentialEnv,
+      helpUrl,
+    );
+    if (retry === "selection") {
+      console.log("  Please choose a provider/model again.");
+      console.log("");
+    }
+    return { ok: false, retry };
+  }
+
+  async function validateCustomAnthropicSelection(
+    label: string,
+    endpointUrl: string,
+    model: string,
+    credentialEnv: string,
+    helpUrl: string | null = null,
+  ): Promise<EndpointValidationResult> {
+    const apiKey = getCredential(credentialEnv);
+    const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey);
+    if (probe.ok) {
+      console.log(`  ${probe.label} available — ${deps.agentProductName()} will use ${probe.api}.`);
+      return { ok: true, api: probe.api };
+    }
+    console.error(`  ${label} endpoint validation failed.`);
+    console.error(`  ${probe.message}`);
+    if (deps.isNonInteractive()) {
+      process.exit(1);
+    }
+    const retry = await deps.promptValidationRecovery(
+      label,
+      getProbeRecovery(probe, { allowModelRetry: true }),
+      credentialEnv,
+      helpUrl,
+    );
+    if (retry === "selection") {
+      console.log("  Please choose a provider/model again.");
+      console.log("");
+    }
+    return { ok: false, retry };
+  }
+
+  return {
+    validateOpenAiLikeSelection,
+    validateAnthropicSelectionWithRetryMessage,
+    validateCustomOpenAiLikeSelection,
+    validateCustomAnthropicSelection,
+  };
+}

From a7fe203f8f7f686d9275a4d389affbb69d5fb53a Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Thu, 21 May 2026 19:05:07 -0700
Subject: [PATCH 51/54] refactor(cli): move direct sandbox gpu verifier

---
 src/lib/onboard.ts                       | 33 ++++++-----------------
 src/lib/onboard/sandbox-gpu-preflight.ts | 34 ++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index a6e9cca8b1..21bc0b7411 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -315,11 +315,12 @@ const {
   createValidationRecoveryPromptHelpers,
 }: typeof import("./onboard/validation-recovery-prompt") = require("./onboard/validation-recovery-prompt");
 const { createOpenshellCliHelpers }: typeof import("./onboard/openshell-cli") = require("./onboard/openshell-cli");
+const sandboxGpuPreflight: typeof import("./onboard/sandbox-gpu-preflight") = require("./onboard/sandbox-gpu-preflight");
 const {
   resolveSandboxGpuFlagFromOptions,
   sandboxGpuRemediationLines,
   validateSandboxGpuPreflight,
-}: typeof import("./onboard/sandbox-gpu-preflight") = require("./onboard/sandbox-gpu-preflight");
+} = sandboxGpuPreflight;
 const openshellVersion: typeof import("./onboard/openshell-version") = require("./onboard/openshell-version");
 const {
   getBlueprintMaxOpenshellVersion,
@@ -807,30 +808,12 @@ type EndpointValidationResult =
   | { ok: true; api: string | null; retry?: undefined }
   | { ok: false; retry: "credential" | "selection" | "retry" | "model"; api?: undefined };
 
-function verifyDirectSandboxGpu(sandboxName: string): void {
-  console.log("  Verifying direct sandbox GPU access...");
-  for (const proof of buildDirectSandboxGpuProofCommands(sandboxName)) {
-    const result = runOpenshell(proof.args, {
-      ignoreError: true,
-      suppressOutput: true,
-      timeout: 30_000,
-    });
-    if (result.status === 0) {
-      console.log(`  ✓ GPU proof passed: ${proof.label}`);
-      continue;
-    }
-    if (proof.optional === true) return;
-    const diagnostic = compactText(redact(`${result.stderr || ""} ${result.stdout || ""}`));
-    console.error(`  ✗ GPU proof failed: ${proof.label}`);
-    if (diagnostic) console.error(`    ${diagnostic.slice(0, 300)}`);
-    for (const line of sandboxGpuRemediationLines()) {
-      console.error(`    ${line}`);
-    }
-    const statusText = String(result.status || 1);
-    const diagnosticSuffix = diagnostic ? `: ${diagnostic.slice(0, 300)}` : "";
-    throw new Error(`GPU proof failed: ${proof.label} (status ${statusText})${diagnosticSuffix}`);
-  }
-}
+const verifyDirectSandboxGpu = sandboxGpuPreflight.createDirectSandboxGpuVerifier({
+  runOpenshell,
+  compactText,
+  redact,
+});
+
 
 function upsertMessagingProviders(tokenDefs: MessagingTokenDef[]) {
   const upserted = onboardProviders.upsertMessagingProviders(tokenDefs, runOpenshell);
diff --git a/src/lib/onboard/sandbox-gpu-preflight.ts b/src/lib/onboard/sandbox-gpu-preflight.ts
index d33a324f3c..ab0ac2e822 100644
--- a/src/lib/onboard/sandbox-gpu-preflight.ts
+++ b/src/lib/onboard/sandbox-gpu-preflight.ts
@@ -5,6 +5,7 @@ import {
   findReadableNvidiaCdiSpecFiles,
   getDockerCdiSpecDirs,
 } from "./docker-cdi";
+import { buildDirectSandboxGpuProofCommands } from "./initial-policy";
 import type { SandboxGpuConfig, SandboxGpuFlag } from "./sandbox-gpu-mode";
 
 export interface SandboxGpuFlagOptions {
@@ -45,6 +46,39 @@ export function sandboxGpuRemediationLines(): string[] {
   ];
 }
 
+export interface DirectSandboxGpuVerifierDeps {
+  runOpenshell(args: string[], opts?: Record<string, unknown>): { status?: number | null; stdout?: unknown; stderr?: unknown };
+  compactText(value: string): string;
+  redact(value: unknown): string;
+}
+
+export function createDirectSandboxGpuVerifier(deps: DirectSandboxGpuVerifierDeps) {
+  return function verifyDirectSandboxGpu(sandboxName: string): void {
+    console.log("  Verifying direct sandbox GPU access...");
+    for (const proof of buildDirectSandboxGpuProofCommands(sandboxName)) {
+      const result = deps.runOpenshell(proof.args, {
+        ignoreError: true,
+        suppressOutput: true,
+        timeout: 30_000,
+      });
+      if (result.status === 0) {
+        console.log(`  ✓ GPU proof passed: ${proof.label}`);
+        continue;
+      }
+      if (proof.optional === true) return;
+      const diagnostic = deps.compactText(deps.redact(`${result.stderr || ""} ${result.stdout || ""}`));
+      console.error(`  ✗ GPU proof failed: ${proof.label}`);
+      if (diagnostic) console.error(`    ${diagnostic.slice(0, 300)}`);
+      for (const line of sandboxGpuRemediationLines()) {
+        console.error(`    ${line}`);
+      }
+      const statusText = String(result.status || 1);
+      const diagnosticSuffix = diagnostic ? `: ${diagnostic.slice(0, 300)}` : "";
+      throw new Error(`GPU proof failed: ${proof.label} (status ${statusText})${diagnosticSuffix}`);
+    }
+  };
+}
+
 export function validateSandboxGpuPreflight(config: SandboxGpuConfig): void {
   if (config.errors.length > 0) {
     console.error("");

From ab9a8d180fc6ed313939d7330fc49fb81be8b3d4 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Sat, 23 May 2026 00:59:49 -0700
Subject: [PATCH 52/54] fix(ci): address code scanning alerts in onboard
 refactor

---
 src/lib/onboard.ts                  | 17 +++++++++--------
 src/lib/security/credential-hash.ts |  3 +++
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 045a04d086..f305d0e9a2 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -2428,14 +2428,12 @@ async function startGatewayWithOptions(
   }
   // Also purge any known_hosts entries matching the gateway hostname pattern
   const knownHostsPath = path.join(os.homedir(), ".ssh", "known_hosts");
-  if (fs.existsSync(knownHostsPath)) {
-    try {
-      const kh = fs.readFileSync(knownHostsPath, "utf8");
-      const cleaned = pruneKnownHostsEntries(kh);
-      if (cleaned !== kh) fs.writeFileSync(knownHostsPath, cleaned);
-    } catch {
-      /* best-effort cleanup — ignore read/write errors */
-    }
+  try {
+    const kh = fs.readFileSync(knownHostsPath, "utf8");
+    const cleaned = pruneKnownHostsEntries(kh);
+    if (cleaned !== kh) fs.writeFileSync(knownHostsPath, cleaned);
+  } catch {
+    /* best-effort cleanup — ignore absent/read/write errors */
   }
 
   const gwArgs = ["--name", GATEWAY_NAME, "--port", getGatewayPortArg()];
@@ -2669,6 +2667,9 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg
 
   fs.mkdirSync(stateDir, { recursive: true, mode: 0o700 });
   const logPath = path.join(stateDir, "openshell-gateway.log");
+  // The gateway state directory is NemoClaw-owned; creating it before opening
+  // the append-only log is intentional and safe for this local runtime file.
+  // codeql[js/file-system-race]
   const outFd = fs.openSync(logPath, "a", 0o600);
   const errFd = fs.openSync(logPath, "a", 0o600);
   console.log("  Starting OpenShell Docker-driver gateway...");
diff --git a/src/lib/security/credential-hash.ts b/src/lib/security/credential-hash.ts
index 5554591e6f..051dfb6ba3 100644
--- a/src/lib/security/credential-hash.ts
+++ b/src/lib/security/credential-hash.ts
@@ -6,5 +6,8 @@ import crypto from "node:crypto";
 export function hashCredential(value: string | null | undefined): string | null {
   const normalized = String(value ?? "").trim();
   if (!normalized) return null;
+  // This is a non-secret change detector for credential rotation, not a
+  // password verifier or credential storage primitive.
+  // codeql[js/insufficient-password-hash]
   return crypto.createHash("sha256").update(normalized).digest("hex");
 }

From bc85d6351c6679369f694a016257d53091ce17cc Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Sat, 23 May 2026 01:05:14 -0700
Subject: [PATCH 53/54] fix(ci): place codeql suppressions on alert lines

---
 src/lib/onboard.ts                  | 5 ++---
 src/lib/security/credential-hash.ts | 3 +--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index f305d0e9a2..70e36b13b5 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -2669,9 +2669,8 @@ async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridg
   const logPath = path.join(stateDir, "openshell-gateway.log");
   // The gateway state directory is NemoClaw-owned; creating it before opening
   // the append-only log is intentional and safe for this local runtime file.
-  // codeql[js/file-system-race]
-  const outFd = fs.openSync(logPath, "a", 0o600);
-  const errFd = fs.openSync(logPath, "a", 0o600);
+  const outFd = fs.openSync(logPath, "a", 0o600); // codeql[js/file-system-race]
+  const errFd = fs.openSync(logPath, "a", 0o600); // codeql[js/file-system-race]
   console.log("  Starting OpenShell Docker-driver gateway...");
   console.log(`  Gateway log: ${logPath}`);
   const launch = gatewayLaunch ?? {
diff --git a/src/lib/security/credential-hash.ts b/src/lib/security/credential-hash.ts
index 051dfb6ba3..492806eae6 100644
--- a/src/lib/security/credential-hash.ts
+++ b/src/lib/security/credential-hash.ts
@@ -8,6 +8,5 @@ export function hashCredential(value: string | null | undefined): string | null
   if (!normalized) return null;
   // This is a non-secret change detector for credential rotation, not a
   // password verifier or credential storage primitive.
-  // codeql[js/insufficient-password-hash]
-  return crypto.createHash("sha256").update(normalized).digest("hex");
+  return crypto.createHash("sha256").update(normalized).digest("hex"); // codeql[js/insufficient-password-hash]
 }

From 80cb1647f657c6e091837018f58343707336e69c Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Sat, 23 May 2026 01:12:18 -0700
Subject: [PATCH 54/54] fix(onboard): preserve Brave credential prompt
 navigation

---
 src/lib/onboard/web-search-flow.ts | 32 ++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/lib/onboard/web-search-flow.ts b/src/lib/onboard/web-search-flow.ts
index ab1af78d5f..7700482704 100644
--- a/src/lib/onboard/web-search-flow.ts
+++ b/src/lib/onboard/web-search-flow.ts
@@ -10,6 +10,7 @@ import { BRAVE_API_KEY_ENV } from "../inference/web-search";
 import { ROOT } from "../runner";
 import { classifyValidationFailure } from "../validation";
 import { getTransportRecoveryMessage } from "../validation-recovery";
+import { BACK_TO_SELECTION, type BackToSelection, isBackToSelection } from "./credential-navigation";
 import { exitOnboardFromPrompt, isAffirmativeAnswer } from "./prompt-helpers";
 import type { ValidationFailureLike } from "./types";
 import { agentSupportsWebSearch } from "./web-search-support";
@@ -28,8 +29,8 @@ export interface WebSearchFlowDeps {
 export interface WebSearchFlowHelpers {
   validateBraveSearchApiKey(apiKey: string): CurlProbeResult;
   promptBraveSearchRecovery(validation: ValidationFailureLike): Promise<"retry" | "skip">;
-  promptBraveSearchApiKey(): Promise<string>;
-  ensureValidatedBraveSearchCredential(nonInteractive?: boolean): Promise<string | null>;
+  promptBraveSearchApiKey(): Promise<string | BackToSelection>;
+  ensureValidatedBraveSearchCredential(nonInteractive?: boolean): Promise<string | BackToSelection | null>;
   configureWebSearch(
     existingConfig?: WebSearchConfig | null,
     agent?: AgentDefinition | null,
@@ -82,15 +83,23 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl
     return "retry";
   }
 
-  async function promptBraveSearchApiKey(): Promise<string> {
+  async function promptBraveSearchApiKey(): Promise<string | BackToSelection> {
     console.log("");
     console.log(`  Get your Brave Search API key from: ${BRAVE_SEARCH_HELP_URL}`);
     console.log("");
 
     while (true) {
-      const key = normalizeCredentialValue(
-        await deps.prompt("  Brave Search API key: ", { secret: true }),
-      );
+      const value = await deps.prompt("  Brave Search API key: ", { secret: true });
+      const intent = normalizeCredentialValue(value).toLowerCase();
+      if (intent === "back") return BACK_TO_SELECTION;
+      if (intent === "exit" || intent === "quit") {
+        exitOnboardFromPrompt();
+      }
+      if (intent === "?" || intent === "help") {
+        console.log("  Type back to choose again, or exit to quit.");
+        continue;
+      }
+      const key = normalizeCredentialValue(value);
       if (!key) {
         console.error("  Brave Search API key is required.");
         continue;
@@ -101,7 +110,7 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl
 
   async function ensureValidatedBraveSearchCredential(
     nonInteractive = deps.isNonInteractive(),
-  ): Promise<string | null> {
+  ): Promise<string | BackToSelection | null> {
     const savedApiKey = getCredential(BRAVE_API_KEY_ENV);
     let apiKey: string | null =
       savedApiKey || normalizeCredentialValue(process.env[BRAVE_API_KEY_ENV]);
@@ -114,7 +123,11 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl
             "Brave Search requires BRAVE_API_KEY or a saved Brave Search credential in non-interactive mode.",
           );
         }
-        apiKey = await promptBraveSearchApiKey();
+        const promptedApiKey = await promptBraveSearchApiKey();
+        if (isBackToSelection(promptedApiKey)) {
+          return promptedApiKey;
+        }
+        apiKey = promptedApiKey;
         usingSavedKey = false;
       }
 
@@ -191,6 +204,9 @@ export function createWebSearchFlowHelpers(deps: WebSearchFlowDeps): WebSearchFl
     }
 
     const braveApiKey = await ensureValidatedBraveSearchCredential();
+    if (isBackToSelection(braveApiKey)) {
+      return configureWebSearch(existingConfig, agent, dockerfilePathOverride);
+    }
     if (!braveApiKey) {
       return null;
     }