diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 7281ed598f..b31d0f12f2 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -90,6 +90,7 @@ const { setupMessagingChannels: setupMessagingChannelsImpl, readMessagingPlanFro
 const {
   clearAgentScopedResumeState,
 }: typeof import("./onboard/agent-resume-state") = require("./onboard/agent-resume-state");
+const { repairResumeMachineSnapshot }: typeof import("./onboard/resume-machine-repair") = require("./onboard/resume-machine-repair");
 const {
   stopTrackedModelRouterForAgentChange,
 }: typeof import("./onboard/model-router-process") = require("./onboard/model-router-process");
@@ -413,6 +414,7 @@ const { handlePoliciesState }: typeof import("./onboard/machine/handlers/policie
 const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
 const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference");
 const { handleSandboxState }: typeof import("./onboard/machine/handlers/sandbox") = require("./onboard/machine/handlers/sandbox");
+const { advanceTo }: typeof import("./onboard/machine/result") = require("./onboard/machine/result");
 const { getOnboardProgressStep }: typeof import("./onboard/machine/progress") = require("./onboard/machine/progress");
 const policies: typeof import("./policy") = require("./policy");
 const policyPresetCarry: typeof import("./onboard/policy-preset-persistence") = require("./onboard/policy-preset-persistence");
@@ -557,9 +559,7 @@ const RESET = USE_COLOR ? "\x1b[0m" : "";
 let OPENSHELL_BIN: string | null = null;
 const GATEWAY_NAME = gatewayBinding.resolveGatewayName(GATEWAY_PORT);
 
-import type {
-  JsonObject as LooseObject,
-} from "./core/json-types";
+import type { JsonObject as LooseObject } from "./core/json-types";
 
 type OnboardOptions = {
   nonInteractive?: boolean;
@@ -687,7 +687,6 @@ const selectOnboardAgent = createSelectOnboardAgent({
   note,
 });
 
-
 const { getTransportRecoveryMessage } = validationRecovery;
 
 // Validation functions — delegated to src/lib/validation.ts
@@ -859,7 +858,6 @@ const verifyDirectSandboxGpu = sandboxGpuPreflight.createDirectSandboxGpuVerifie
   redact,
 });
 
-
 function upsertMessagingProviders(
   tokenDefs: MessagingTokenDef[],
   options: { replaceExisting?: boolean } = {},
@@ -949,7 +947,6 @@ const {
   isAffirmativeAnswer,
 });
 
-
 const {
   ensureValidatedBraveSearchCredential,
   configureWebSearch,
@@ -962,7 +959,6 @@ const {
   runCaptureOpenshell,
 });
 
-
 // getSandboxInferenceConfig — moved to onboard-providers.ts
 
 // Inference probes — moved to inference/onboard-probes.ts
@@ -987,7 +983,6 @@ const {
   promptValidationRecovery,
 });
 
-
 const { promptCloudModel, promptRemoteModel, promptInputModel } = modelPrompts;
 const { validateAnthropicModel, validateOpenAiLikeModel } = providerModels;
 const nousModels: typeof import("./inference/nous-models") = require("./inference/nous-models");
@@ -5847,6 +5842,7 @@ const onboardRuntimeBoundary = new OnboardRuntimeBoundary({
   toSessionUpdates: (updates: Record) =>
     toSessionUpdates(updates as Parameters[0]),
   maybeForceE2eStepFailure,
+  stepMutationOptions: { updateMachine: false },
 });
 const sandboxCancelRollback = installSandboxCancelRollback({ runOpenshell, registry, clearOnboardSession: onboardSession.clearSession }); // #4614
 
@@ -6102,6 +6098,7 @@ async function onboard(opts: OnboardOptions = {}): Promise {
     process.exit(1);
   }
   onboardSession.updateSession((current: Session) => {
+    repairResumeMachineSnapshot(current);
     current.mode = isNonInteractive() ? "non-interactive" : "interactive";
     current.failure = null;
     current.status = "in_progress";
@@ -6145,6 +6142,7 @@ async function onboard(opts: OnboardOptions = {}): Promise {
     );
   }
   await onboardRuntimeBoundary.recordOnboardStarted(resume);
+  await recordStateResult(advanceTo("preflight", { metadata: { state: "init" } }));
   // Backstop for the resume path: a session may exist (so the early guard
   // skipped because resume === true) but never have recorded a sandboxName
   // — sandbox creation could have failed before that step ran. Without a
diff --git a/src/lib/onboard/resume-machine-repair.test.ts b/src/lib/onboard/resume-machine-repair.test.ts
new file mode 100644
index 0000000000..f94ea0c1e6
--- /dev/null
+++ b/src/lib/onboard/resume-machine-repair.test.ts
@@ -0,0 +1,220 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { describe, expect, it } from "vitest";
+
+import {
+  createSession,
+  filterSafeUpdates,
+  MACHINE_SNAPSHOT_VERSION,
+  normalizeSession,
+  type Session,
+  type SessionUpdates,
+} from "../state/onboard-session";
+import { advanceTo, branchTo } from "./machine/result";
+import { OnboardRuntime, type OnboardRuntimeDeps } from "./machine/runtime";
+import {
+  repairResumeMachineSnapshot,
+  resumeMachineState,
+} from "./resume-machine-repair";
+import { OnboardRuntimeBoundary } from "./runtime-boundary";
+
+/**
+ * Builds a failed durable session while letting each test set the interrupted step.
+ */
+function createFailedSession(mutator: (session: Session) => void): Session {
+  const session = createSession({
+    machine: {
+      version: MACHINE_SNAPSHOT_VERSION,
+      state: "failed",
+      stateEnteredAt: "2026-06-01T00:00:00.000Z",
+      revision: 7,
+    },
+    status: "failed",
+    failure: {
+      step: null,
+      message: "interrupted",
+      recordedAt: "2026-06-01T00:00:00.000Z",
+    },
+  });
+  mutator(session);
+  return session;
+}
+
+/**
+ * Round-trips sessions through normalization to match persisted runtime state.
+ */
+function cloneSession(session: Session): Session {
+  return normalizeSession(JSON.parse(JSON.stringify(session))) ?? session;
+}
+
+/**
+ * Creates a memory-backed runtime boundary with record-only step mutations.
+ */
+function createBoundaryHarness(initial: Session) {
+  let session = cloneSession(initial);
+  const updateSession = (mutator: (value: Session) => Session | void): Session => {
+    const current = cloneSession(session);
+    session = cloneSession(mutator(current) ?? current);
+    return cloneSession(session);
+  };
+  const deps: OnboardRuntimeDeps = {
+    loadSession: () => cloneSession(session),
+    createSession,
+    saveSession: (next) => {
+      session = cloneSession(next);
+      return cloneSession(session);
+    },
+    updateSession,
+    markStepStarted: () => cloneSession(session),
+    markStepComplete: (_stepName, updates: SessionUpdates = {}) =>
+      updateSession((current) => Object.assign(current, filterSafeUpdates(updates))),
+    markStepCompleteRecordOnly: (_stepName, updates: SessionUpdates = {}) =>
+      updateSession((current) => Object.assign(current, filterSafeUpdates(updates))),
+    markStepSkipped: () => cloneSession(session),
+    markStepFailed: () => cloneSession(session),
+    markStepFailedRecordOnly: () => cloneSession(session),
+    completeSession: (updates: SessionUpdates = {}) =>
+      updateSession((current) => {
+        Object.assign(current, filterSafeUpdates(updates));
+        current.status = "complete";
+        current.resumable = false;
+        return current;
+      }),
+    filterSafeUpdates,
+    emitEvent: () => undefined,
+    now: () => "2026-06-01T00:02:00.000Z",
+  };
+  const boundary = new OnboardRuntimeBoundary({
+    toSessionUpdates: (updates) => filterSafeUpdates(updates as SessionUpdates) as SessionUpdates,
+    maybeForceE2eStepFailure: () => undefined,
+    createRuntime: () => new OnboardRuntime(deps),
+    stepMutationOptions: { updateMachine: false },
+  });
+  return { boundary, getSession: () => cloneSession(session) };
+}
+
+/**
+ * Replays the live resume sequence from failed snapshot repair through completion.
+ */
+async function runRecordOnlyResumeSequence(initial: Session): Promise<Session> {
+  repairResumeMachineSnapshot(initial, "2026-06-01T00:01:00.000Z");
+  initial.failure = null;
+  initial.status = "in_progress";
+  const { boundary, getSession } = createBoundaryHarness(initial);
+  await boundary.recordOnboardStarted(true);
+  await boundary.recordStateResultsWithStepCompatibility([
+    advanceTo("preflight", { metadata: { state: "init" } }),
+    advanceTo("gateway", { metadata: { state: "preflight" } }),
+    advanceTo("provider_selection", { metadata: { state: "gateway" } }),
+    advanceTo("inference", { metadata: { state: "provider_selection" } }),
+    advanceTo("sandbox", { metadata: { state: "inference" } }),
+    branchTo("openclaw", { metadata: { state: "sandbox" } }),
+    advanceTo("policies", { metadata: { state: "openclaw" } }),
+    advanceTo("finalizing", { metadata: { state: "policies" } }),
+  ]);
+  await boundary.recordSessionComplete();
+  return getSession();
+}
+
+describe("resume machine repair", () => {
+  it("resumes a failed preflight session from preflight", () => {
+    const session = createFailedSession((current) => {
+      current.failure = {
+        step: "preflight",
+        message: "Docker is unavailable",
+        recordedAt: "2026-06-01T00:00:00.000Z",
+      };
+      current.lastStepStarted = "preflight";
+      current.steps.preflight.status = "failed";
+    });
+
+    expect(resumeMachineState(session)).toBe("preflight");
+    repairResumeMachineSnapshot(session, "2026-06-01T00:01:00.000Z");
+
+    expect(session.machine).toEqual({
+      version: MACHINE_SNAPSHOT_VERSION,
+      state: "preflight",
+      stateEnteredAt: "2026-06-01T00:01:00.000Z",
+      revision: 8,
+    });
+  });
+
+  it("uses the failed step before the last completed step", () => {
+    const session = createFailedSession((current) => {
+      current.lastCompletedStep = "provider_selection";
+      current.steps.provider_selection.status = "complete";
+      current.lastStepStarted = "inference";
+      current.steps.inference.status = "failed";
+      current.failure = {
+        step: "inference",
+        message: "route validation failed",
+        recordedAt: "2026-06-01T00:00:00.000Z",
+      };
+    });
+
+    expect(resumeMachineState(session)).toBe("inference");
+  });
+
+  it("derives the branch state after sandbox when no failed step is recorded", () => {
+    const session = createFailedSession((current) => {
+      current.agent = "hermes";
+      current.lastCompletedStep = "sandbox";
+      current.steps.sandbox.status = "complete";
+      current.failure = null;
+    });
+
+    expect(resumeMachineState(session)).toBe("agent_setup");
+  });
+
+  it("leaves nonterminal snapshots untouched", () => {
+    const session = createSession({
+      machine: {
+        version: MACHINE_SNAPSHOT_VERSION,
+        state: "gateway",
+        stateEnteredAt: "2026-06-01T00:00:00.000Z",
+        revision: 3,
+      },
+    });
+
+    repairResumeMachineSnapshot(session, "2026-06-01T00:01:00.000Z");
+
+    expect(session.machine).toEqual({
+      version: MACHINE_SNAPSHOT_VERSION,
+      state: "gateway",
+      stateEnteredAt: "2026-06-01T00:00:00.000Z",
+      revision: 3,
+    });
+  });
+
+  it.each([
+    ["preflight", "preflight", null],
+    ["gateway", "gateway", "preflight"],
+    ["inference", "inference", "provider_selection"],
+  ] as const)(
+    "lets record-only resume complete from failed %s",
+    async (_name, failedStep, completedStep) => {
+      const session = createFailedSession((current) => {
+        current.failure = {
+          step: failedStep,
+          message: `${failedStep} failed`,
+          recordedAt: "2026-06-01T00:00:00.000Z",
+        };
+        current.lastStepStarted = failedStep;
+        current.steps[failedStep].status = "failed";
+        if (completedStep) {
+          current.lastCompletedStep = completedStep;
+          current.steps[completedStep].status = "complete";
+        }
+      });
+
+      const completed = await runRecordOnlyResumeSequence(session);
+
+      expect(completed).toMatchObject({
+        status: "complete",
+        failure: null,
+        machine: { state: "complete" },
+      });
+    },
+  );
+});
diff --git a/src/lib/onboard/resume-machine-repair.ts b/src/lib/onboard/resume-machine-repair.ts
new file mode 100644
index 0000000000..8879abc7f7
--- /dev/null
+++ b/src/lib/onboard/resume-machine-repair.ts
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import {
+  MACHINE_SNAPSHOT_VERSION,
+  type Session,
+} from "../state/onboard-session";
+import { nextMachineStateAfterCompletedStep } from "../state/onboard-step-state";
+import { machineStateFromOnboardSessionStep } from "./machine/events";
+import type { OnboardMachineState } from "./machine/types";
+
+/**
+ * Reads the legacy step-level source of truth for interrupted sessions whose
+ * durable FSM snapshot was already collapsed to the terminal failed state.
+ */
+function activeStepMachineState(session: Session): OnboardMachineState | null {
+  const failedStepName = session.failure?.step ?? null;
+  const failedStep = failedStepName ? session.steps[failedStepName] : null;
+  const failedState = machineStateFromOnboardSessionStep(failedStepName);
+  if (failedState && (failedStep?.status === "failed" || failedStep?.status === "in_progress")) {
+    return failedState;
+  }
+
+  const startedStepName = session.lastStepStarted;
+  const startedStep = startedStepName ? session.steps[startedStepName] : null;
+  const startedState = machineStateFromOnboardSessionStep(startedStepName);
+  if (
+    startedState &&
+    (startedStep?.status === "failed" || startedStep?.status === "in_progress")
+  ) {
+    return startedState;
+  }
+
+  return null;
+}
+
+/**
+ * Computes the nonterminal state where a failed durable session should resume.
+ */
+export function resumeMachineState(session: Session): OnboardMachineState {
+  return activeStepMachineState(session) ?? nextMachineStateAfterCompletedStep(
+    session.lastCompletedStep,
+    session,
+  ) ?? "init";
+}
+
+/**
+ * Repairs the legacy failed-session/FSM boundary during --resume.
+ *
+ * Source fix constraint: failed -> resume is not a modeled FSM transition yet,
+ * and legacy step fields still act as the secondary durable source for resume.
+ * Remove this bridge once failed-session recovery is represented by explicit
+ * FSM recovery results or step fields stop being used to derive resume state.
+ */
+export function repairResumeMachineSnapshot(
+  session: Session,
+  stateEnteredAt = new Date().toISOString(),
+): Session {
+  if (session.machine.state !== "failed") return session;
+  const state = resumeMachineState(session);
+  session.machine = {
+    version: MACHINE_SNAPSHOT_VERSION,
+    state,
+    stateEnteredAt,
+    revision: session.machine.revision + 1,
+  };
+  return session;
+}