diff --git a/src/lib/agent/onboard.ts b/src/lib/agent/onboard.ts index 2418ed3831..db4c3f5794 100644 --- a/src/lib/agent/onboard.ts +++ b/src/lib/agent/onboard.ts @@ -13,7 +13,6 @@ import { dockerBuild, dockerImageInspect } from "../adapters/docker"; import { getAgentBranding } from "../cli/branding"; import { getProviderSelectionConfig } from "../inference/config"; import type { JsonObject as LooseObject } from "../core/json-types"; -import * as onboardSession from "../state/onboard-session"; import { runSandboxConfigSync } from "../onboard/config-sync"; import { ROOT, redact, run, shellQuote } from "../runner"; import { @@ -29,7 +28,9 @@ export interface OnboardContext { runCaptureOpenshell: (args: string[], opts?: { ignoreError?: boolean }) => string | null; openshellShellCommand: (args: string[], options?: { openshellBinary?: string }) => string; openshellBinary: string; - startRecordedStep: (stepName: string, updates: LooseObject) => void; + startRecordedStep: (stepName: string, updates: LooseObject) => Promise; + recordStepComplete: (stepName: string, updates: LooseObject) => Promise; + recordStepFailed: (stepName: string, message: string | null) => Promise; skippedStepMessage: (stepName: string, sandboxName: string) => void; } @@ -348,13 +349,14 @@ export function collectHermesStartupDiagnostics( /** * Record and print an agent setup failure before exiting the onboarding flow. */ -function failAgentSetup( +async function failAgentSetup( sandboxName: string, agent: AgentDefinition, message: string, + recordStepFailed: OnboardContext["recordStepFailed"], details: string[] = [], -): never { - onboardSession.markStepFailed( +): Promise { + await recordStepFailed( "agent_setup", details.length > 0 ? 
`${message}\n${details.join("\n")}` : message, ); @@ -401,6 +403,8 @@ export async function handleAgentSetup( runCaptureOpenshell, openshellBinary: openshellBin, startRecordedStep, + recordStepComplete, + recordStepFailed, skippedStepMessage, } = ctx; @@ -433,21 +437,22 @@ export async function handleAgentSetup( // to the Dockerfile's zero-byte placeholder. Mirrors the OpenClaw // path in src/lib/onboard.ts. Fixes #3999 for non-OpenClaw agents. syncNemoClawConfig(); - onboardSession.markStepComplete("agent_setup", { sandboxName, provider, model }); + await recordStepComplete("agent_setup", { sandboxName, provider, model }); return; } } } - startRecordedStep("agent_setup", { sandboxName, provider, model }); + await startRecordedStep("agent_setup", { sandboxName, provider, model }); step(7, 8, `Setting up ${agent.displayName} inside sandbox`); const binaryAvailability = verifyAgentBinaryAvailable(sandboxName, agent, runCaptureOpenshell); if (!binaryAvailability.available) { - failAgentSetup( + await failAgentSetup( sandboxName, agent, describeAgentBinaryFailure(sandboxName, agent, binaryAvailability), + recordStepFailed, ); } @@ -478,10 +483,11 @@ export async function handleAgentSetup( agent.name === "hermes" ? 
collectHermesStartupDiagnostics(sandboxName, runCaptureOpenshell) : []; - failAgentSetup( + await failAgentSetup( sandboxName, agent, `${agent.displayName} gateway did not respond within ${timeoutSecs}s`, + recordStepFailed, diagnostics, ); } @@ -489,7 +495,7 @@ export async function handleAgentSetup( console.log(` \u2713 ${agent.displayName} configured inside sandbox`); } - onboardSession.markStepComplete("agent_setup", { sandboxName, provider, model }); + await recordStepComplete("agent_setup", { sandboxName, provider, model }); } /** diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index c9543bbb8f..0bd183d995 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -293,6 +293,7 @@ const { resolveSandboxImageTagFromCreateOutput } = require("./domain/sandbox/image-tag") as typeof import("./domain/sandbox/image-tag"); const nim: typeof import("./inference/nim") = require("./inference/nim"); const onboardSession: typeof import("./state/onboard-session") = require("./state/onboard-session"); +const { OnboardRuntimeBoundary }: typeof import("./onboard/runtime-boundary") = require("./onboard/runtime-boundary"); const policies: typeof import("./policy") = require("./policy"); const tiers: typeof import("./policy/tiers") = require("./policy/tiers"); const { ensureUsageNoticeConsent } = require("./onboard/usage-notice"); @@ -8915,27 +8916,15 @@ function toSessionUpdates( return normalized; } -function startRecordedStep( - stepName: string, - updates: { - sandboxName?: string | null; - provider?: string | null; - model?: string | null; - policyPresets?: string[] | null; - } = {}, -): void { - onboardSession.markStepStarted(stepName); - if (Object.keys(updates).length > 0) { - onboardSession.updateSession((session: Session) => { - if (updates.sandboxName !== undefined) session.sandboxName = updates.sandboxName; - if (updates.provider !== undefined) session.provider = updates.provider; - if (updates.model !== undefined) session.model = updates.model; - if 
(updates.policyPresets !== undefined) session.policyPresets = updates.policyPresets; - return session; - }); - } - maybeForceE2eStepFailure(stepName); -} +const onboardRuntimeBoundary = new OnboardRuntimeBoundary({ + toSessionUpdates, + maybeForceE2eStepFailure, +}); +const startRecordedStep = onboardRuntimeBoundary.startRecordedStep.bind(onboardRuntimeBoundary); +const recordStepComplete = onboardRuntimeBoundary.recordStepComplete.bind(onboardRuntimeBoundary); +const recordStepSkipped = onboardRuntimeBoundary.recordStepSkipped.bind(onboardRuntimeBoundary); +const recordStepFailed = onboardRuntimeBoundary.recordStepFailed.bind(onboardRuntimeBoundary); +const recordSessionComplete = onboardRuntimeBoundary.recordSessionComplete.bind(onboardRuntimeBoundary); const ONBOARD_STEP_INDEX: Record = { preflight: { number: 1, title: "Preflight checks" }, @@ -8972,6 +8961,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; _preflightDashboardPort = opts.controlUiPort || null; + onboardRuntimeBoundary.reset(); delete process.env.OPENSHELL_GATEWAY; const resume = opts.resume === true; const fresh = opts.fresh === true; @@ -9341,9 +9331,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { assertCdiNvidiaGpuSpecPresent(assessHost(), resumeOptedOutGpuPassthrough); validateSandboxGpuPreflight(resumeSandboxGpuConfig); } else { - startRecordedStep("preflight"); + await startRecordedStep("preflight"); gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }); - onboardSession.markStepComplete("preflight"); + await recordStepComplete("preflight"); } const sandboxGpuConfig = resolveSandboxGpuConfig(gpu, { flag: effectiveSandboxGpuFlag, @@ -9480,11 +9470,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { resume && session?.steps?.gateway?.status === "complete" && 
canReuseHealthyGateway; if (resumeGateway) { skippedStepMessage("gateway", "running"); - onboardSession.markStepComplete("gateway"); + await recordStepComplete("gateway"); } else if (!resume && canReuseHealthyGateway) { skippedStepMessage("gateway", "running", "reuse"); note(" Reusing healthy NemoClaw gateway."); - onboardSession.markStepComplete("gateway"); + await recordStepComplete("gateway"); } else { if (resume && session?.steps?.gateway?.status === "complete") { if (gatewayReuseState === "active-unnamed") { @@ -9502,9 +9492,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { retireLegacyGatewayForDockerDriverUpgrade(); gatewayReuseState = "missing"; } - startRecordedStep("gateway"); + await startRecordedStep("gateway"); await startGateway(gpu, { gpuPassthrough }); - onboardSession.markStepComplete("gateway"); + await recordStepComplete("gateway"); } // #2753: prefer requestedSandboxName over an unconfirmed session name. @@ -9555,7 +9545,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { // below). A SIGINT between any earlier step and createSandbox would // otherwise leave a phantom that `nemoclaw list` resurrects until // manually destroyed. 
- startRecordedStep("provider_selection"); + await startRecordedStep("provider_selection"); const selection = await setupNim(gpu, sandboxName, agent); model = selection.model; provider = selection.provider; @@ -9565,7 +9555,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { hermesToolGateways = selection.hermesToolGateways; preferredInferenceApi = selection.preferredInferenceApi; nimContainer = selection.nimContainer; - onboardSession.markStepComplete( + await recordStepComplete( "provider_selection", toSessionUpdates({ provider, @@ -9598,7 +9588,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (!sandboxName) { sandboxName = await promptValidatedSandboxName(agent); } - startRecordedStep("inference", { provider, model }); + await startRecordedStep("inference", { provider, model }); const inferenceResult = await setupInference( sandboxName, model, @@ -9612,7 +9602,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { forceProviderSelection = true; continue; } - onboardSession.markStepComplete( + await recordStepComplete( "inference", toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), ); @@ -9632,7 +9622,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (nimContainer && sandboxName) { registry.updateSandbox(sandboxName, { nimContainer }); } - onboardSession.markStepComplete( + await recordStepComplete( "inference", toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), ); @@ -9671,7 +9661,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { } } - startRecordedStep("inference", { provider, model }); + await startRecordedStep("inference", { provider, model }); const inferenceResult = await setupInference( sandboxName, model, @@ -9689,7 +9679,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (nimContainer && sandboxName) { registry.updateSandbox(sandboxName, { nimContainer }); } - 
onboardSession.markStepComplete( + await recordStepComplete( "inference", toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), ); @@ -9831,7 +9821,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { } else { nextWebSearchConfig = await configureWebSearch(null, agent, webSearchSupportProbePath); } - startRecordedStep("sandbox", { provider, model }); + await startRecordedStep("sandbox", { provider, model }); const recordedMessagingChannels = getRecordedMessagingChannelsForResume(resume, session, sandboxName); if (recordedMessagingChannels) { selectedMessagingChannels = recordedMessagingChannels; @@ -9885,7 +9875,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { ...getSandboxAgentRegistryFields(agent, !fromDockerfile), }); registry.setDefault(sandboxName); - onboardSession.markStepComplete( + await recordStepComplete( "sandbox", toSessionUpdates({ sandboxName, @@ -9915,10 +9905,12 @@ async function onboard(opts: OnboardOptions = {}): Promise { openshellShellCommand, openshellBinary: getOpenshellBinary(), startRecordedStep, + recordStepComplete, + recordStepFailed, skippedStepMessage, }); ensureAgentDashboardForward(sandboxName, agent); - onboardSession.markStepSkipped("openclaw"); + await recordStepSkipped("openclaw"); } else { const resumeOpenclaw = resume && sandboxName && isOpenclawReady(sandboxName); if (resumeOpenclaw) { @@ -9927,19 +9919,19 @@ async function onboard(opts: OnboardOptions = {}): Promise { // zero-byte placeholder; re-sync to avoid loadOnboardConfig // SyntaxError. Fixes #3999. 
syncNemoClawConfigInSandbox(sandboxName, provider, model); - onboardSession.markStepComplete( + await recordStepComplete( "openclaw", toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), ); } else { - startRecordedStep("openclaw", { sandboxName, provider, model }); + await startRecordedStep("openclaw", { sandboxName, provider, model }); await setupOpenclaw(sandboxName, model, provider); - onboardSession.markStepComplete( + await recordStepComplete( "openclaw", toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), ); } - onboardSession.markStepSkipped("agent_setup"); + await recordStepSkipped("agent_setup"); } const latestSession = onboardSession.loadSession(); @@ -9999,7 +9991,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { arePolicyPresetsApplied(sandboxName, recordedPolicyPresetsForSupport); if (resumePolicies) { skippedStepMessage("policies", recordedPolicyPresetsForSupport.join(", ")); - onboardSession.markStepComplete( + await recordStepComplete( "policies", toSessionUpdates({ sandboxName, @@ -10009,7 +10001,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { }), ); } else { - startRecordedStep("policies", { + await startRecordedStep("policies", { sandboxName, provider, model, @@ -10035,7 +10027,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { }); }, }); - onboardSession.markStepComplete( + await recordStepComplete( "policies", toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }), ); @@ -10045,7 +10037,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { ensureAgentDashboardForward(sandboxName, agent); } - onboardSession.completeSession( + await recordSessionComplete( toSessionUpdates({ sandboxName, provider, model, hermesAuthMethod, hermesToolGateways }), ); completed = true; @@ -10125,6 +10117,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { printDashboard(sandboxName, 
model, provider, nimContainer, agent); } finally { releaseOnboardLock(); + onboardRuntimeBoundary.clear(); } } diff --git a/src/lib/onboard/machine/runtime.test.ts b/src/lib/onboard/machine/runtime.test.ts index becca6028e..7b26269541 100644 --- a/src/lib/onboard/machine/runtime.test.ts +++ b/src/lib/onboard/machine/runtime.test.ts @@ -7,7 +7,9 @@ import { createSession, filterSafeUpdates, normalizeSession, + sanitizeFailure, type Session, + type SessionUpdates, } from "../../state/onboard-session"; import type { OnboardMachineEvent } from "./events"; import { OnboardRuntime, type OnboardRuntimeDeps } from "./runtime"; @@ -21,6 +23,12 @@ function createHarness(initialSession: Session | null = createSession()) { let session = initialSession ? cloneSession(initialSession) : null; const events: OnboardMachineEvent[] = []; let tick = 0; + const updateSession = (mutator: (value: Session) => Session | void): Session => { + const current = session ? cloneSession(session) : createSession(); + const next = mutator(current) ?? current; + session = cloneSession(next); + return cloneSession(session); + }; const deps: OnboardRuntimeDeps = { loadSession: () => (session ? cloneSession(session) : null), createSession: (overrides) => createSession(overrides), @@ -28,12 +36,48 @@ function createHarness(initialSession: Session | null = createSession()) { session = cloneSession(next); return cloneSession(session); }, - updateSession: (mutator) => { - const current = session ? cloneSession(session) : createSession(); - const next = mutator(current) ?? 
current; - session = cloneSession(next); - return cloneSession(session); - }, + updateSession, + markStepStarted: (stepName) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "in_progress"; + current.lastStepStarted = stepName; + current.status = "in_progress"; + return current; + }), + markStepComplete: (stepName, updates: SessionUpdates = {}) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "complete"; + current.lastCompletedStep = stepName; + Object.assign(current, filterSafeUpdates(updates)); + return current; + }), + markStepSkipped: (stepName) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "skipped"; + return current; + }), + markStepFailed: (stepName, message) => + updateSession((current) => { + const step = current.steps[stepName]; + if (!step) return current; + step.status = "failed"; + current.status = "failed"; + current.failure = sanitizeFailure({ step: stepName, message, recordedAt: "now" }); + return current; + }), + completeSession: (updates: SessionUpdates = {}) => + updateSession((current) => { + Object.assign(current, filterSafeUpdates(updates)); + current.status = "complete"; + current.resumable = false; + return current; + }), filterSafeUpdates, emitEvent: (event) => events.push(event), now: () => `2026-05-19T00:00:${String(tick++).padStart(2, "0")}.000Z`, diff --git a/src/lib/onboard/machine/runtime.ts b/src/lib/onboard/machine/runtime.ts index 3e72cd0ccc..2e5d584f3b 100644 --- a/src/lib/onboard/machine/runtime.ts +++ b/src/lib/onboard/machine/runtime.ts @@ -21,6 +21,11 @@ export interface OnboardRuntimeDeps { createSession(overrides?: Partial): Session; saveSession(session: Session): Session; updateSession(mutator: (session: Session) => Session | void): Session; + markStepStarted(stepName: string): Session; + markStepComplete(stepName: 
string, updates?: SessionUpdates): Session; + markStepSkipped(stepName: string): Session; + markStepFailed(stepName: string, message?: string | null): Session; + completeSession(updates?: SessionUpdates): Session; filterSafeUpdates(updates: SessionUpdates): Partial; emitEvent(event: OnboardMachineEvent): void; now(): string; @@ -46,6 +51,11 @@ function defaultDeps(): OnboardRuntimeDeps { createSession: onboardSession.createSession, saveSession: onboardSession.saveSession, updateSession: onboardSession.updateSession, + markStepStarted: onboardSession.markStepStarted, + markStepComplete: onboardSession.markStepComplete, + markStepSkipped: onboardSession.markStepSkipped, + markStepFailed: onboardSession.markStepFailed, + completeSession: onboardSession.completeSession, filterSafeUpdates: onboardSession.filterSafeUpdates, emitEvent: emitOnboardMachineEvent, now: () => new Date().toISOString(), @@ -91,6 +101,26 @@ export class OnboardRuntime { return session; } + async markStepStarted(stepName: string): Promise { + return this.deps.markStepStarted(stepName); + } + + async markStepComplete(stepName: string, updates: SessionUpdates = {}): Promise { + return this.deps.markStepComplete(stepName, updates); + } + + async markStepSkipped(stepName: string): Promise { + return this.deps.markStepSkipped(stepName); + } + + async markStepFailed(stepName: string, message: string | null = null): Promise { + return this.deps.markStepFailed(stepName, message); + } + + async completeSession(updates: SessionUpdates = {}): Promise { + return this.deps.completeSession(updates); + } + async transition( to: OnboardMachineState, options: OnboardRuntimeTransitionOptions = {}, diff --git a/src/lib/onboard/runtime-boundary.ts b/src/lib/onboard/runtime-boundary.ts new file mode 100644 index 0000000000..0fbd52f256 --- /dev/null +++ b/src/lib/onboard/runtime-boundary.ts @@ -0,0 +1,62 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+import type { Session, SessionUpdates } from "../state/onboard-session";
+import { OnboardRuntime } from "./machine/runtime";
+
+/**
+ * Host-supplied callbacks used by {@link OnboardRuntimeBoundary}.
+ */
+export interface OnboardRuntimeBoundaryOptions {
+  /**
+   * Converts the loosely-typed updates given to `startRecordedStep` into
+   * `SessionUpdates` before they are handed to the runtime.
+   *
+   * NOTE(review): the parameter's type arguments are reconstructed as
+   * `Record<string, unknown>` — confirm against the host's `toSessionUpdates`.
+   */
+  toSessionUpdates(updates: Record<string, unknown>): SessionUpdates;
+  /**
+   * Called with the step name as the last action of `startRecordedStep`.
+   * Presumably a failure-injection hook for end-to-end tests — confirm
+   * against the host implementation.
+   */
+  maybeForceE2eStepFailure(stepName: string): void;
+}
+
+/**
+ * Holds at most one `OnboardRuntime` and forwards step and session
+ * bookkeeping calls to it. The runtime is created on demand by
+ * `getRuntime()`, replaced by `reset()`, and dropped by `clear()`.
+ */
+export class OnboardRuntimeBoundary {
+  private runtime: OnboardRuntime | null = null;
+
+  constructor(private readonly options: OnboardRuntimeBoundaryOptions) {}
+
+  /** Replaces any held runtime with a newly constructed one. */
+  reset(): void {
+    this.runtime = new OnboardRuntime();
+  }
+
+  /** Drops the held runtime; the next `getRuntime()` call constructs a new one. */
+  clear(): void {
+    this.runtime = null;
+  }
+
+  /** Returns the held runtime, constructing it first if none is held. */
+  getRuntime(): OnboardRuntime {
+    if (!this.runtime) this.runtime = new OnboardRuntime();
+    return this.runtime;
+  }
+
+  /**
+   * Marks `stepName` as started, applies `updates` to the runtime context when
+   * at least one key is present, then invokes `maybeForceE2eStepFailure`.
+   *
+   * @param stepName - Name of the onboarding step being started.
+   * @param updates - Optional session fields to record alongside the start.
+   */
+  async startRecordedStep(
+    stepName: string,
+    updates: {
+      sandboxName?: string | null;
+      provider?: string | null;
+      model?: string | null;
+      policyPresets?: string[] | null;
+    } = {},
+  ): Promise<void> {
+    const runtime = this.getRuntime();
+    await runtime.markStepStarted(stepName);
+    // An empty updates object skips the context update entirely.
+    if (Object.keys(updates).length > 0) {
+      await runtime.updateContext(this.options.toSessionUpdates(updates));
+    }
+    // Runs after the start has been recorded.
+    this.options.maybeForceE2eStepFailure(stepName);
+  }
+
+  /** Forwards to `OnboardRuntime.markStepComplete` and returns its result. */
+  async recordStepComplete(stepName: string, updates: SessionUpdates = {}): Promise<Session> {
+    return this.getRuntime().markStepComplete(stepName, updates);
+  }
+
+  /** Forwards to `OnboardRuntime.markStepSkipped` and returns its result. */
+  async recordStepSkipped(stepName: string): Promise<Session> {
+    return this.getRuntime().markStepSkipped(stepName);
+  }
+
+  /**
+   * Forwards to `OnboardRuntime.markStepFailed` and returns its result.
+   * `message` defaults to `null`, matching the runtime method's own default.
+   */
+  async recordStepFailed(stepName: string, message: string | null = null): Promise<Session> {
+    return this.getRuntime().markStepFailed(stepName, message);
+  }
+
+  /** Forwards to `OnboardRuntime.completeSession` and returns its result. */
+  async recordSessionComplete(updates: SessionUpdates = {}): Promise<Session> {
+    return this.getRuntime().completeSession(updates);
+  }
+}