Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
6b753bf
docs(onboard): document FSM migration target
cv May 27, 2026
fb1b32d
refactor(onboard): centralize machine state metadata
cv May 27, 2026
c3e4ad6
refactor(onboard): derive session step mapping from FSM metadata
cv May 27, 2026
603832c
refactor(onboard): derive progress labels from FSM metadata
cv May 27, 2026
4fad8e7
fix(onboard): emit lifecycle events for onboarding start
cv May 28, 2026
f99e9cb
fix(onboard): emit machine events for resume conflicts
cv May 28, 2026
2b60df4
refactor(onboard): introduce explicit state result types
cv May 28, 2026
30341b0
refactor(onboard): apply explicit state results through runtime
cv May 28, 2026
d4ad2d9
refactor(onboard): make finalization return FSM result
cv May 28, 2026
356c947
refactor(onboard): make agent setup return FSM result
cv May 28, 2026
2296519
refactor(onboard): make policy setup return FSM result
cv May 28, 2026
67a9a1e
refactor(onboard): make preflight and gateway return FSM results
cv May 28, 2026
46f4a49
refactor(onboard): make sandbox return branch FSM result
cv May 28, 2026
9cc15f5
refactor(onboard): return FSM results from provider inference
cv May 28, 2026
dbbb273
refactor(onboard): add FSM runner shell
cv May 28, 2026
6b27a0b
refactor(onboard): consume handler FSM results compatibly
cv May 28, 2026
44009ad
refactor(onboard): allow step recording without machine transitions
cv May 28, 2026
cd6e5f7
refactor(onboard): plumb step mutation options through runtime
cv May 28, 2026
e266e3b
refactor(onboard): add record-only FSM runner adapter
cv May 28, 2026
bf4da0b
refactor(onboard): return ordered provider FSM results
cv May 28, 2026
212ff4d
refactor(onboard): run live sequence with record-only steps
cv May 28, 2026
9e8b1f5
merge(main): resolve PR 4471 conflicts
cv Jun 5, 2026
323ec5b
merge(provider-result-sequence): resolve PR 4472 conflicts
cv Jun 5, 2026
b2b5e1c
Merge branch 'main' into stack/onboard-fsm-provider-result-sequence
cv Jun 5, 2026
3d0507b
Merge branch 'stack/onboard-fsm-provider-result-sequence' into stack/…
cv Jun 5, 2026
7e0e47b
merge(main): resolve PR 4472 conflicts
cv Jun 7, 2026
11440c4
chore(onboard): keep entrypoint net-neutral
cv Jun 7, 2026
f5c0a22
fix(onboard): remove unused resume session assignment
cv Jun 7, 2026
5285792
test(onboard): cover failed resume record-only sequence
cv Jun 7, 2026
e0e3162
test(onboard): document resume repair harness
cv Jun 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions src/lib/onboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ const { setupMessagingChannels: setupMessagingChannelsImpl, readMessagingPlanFro
const {
clearAgentScopedResumeState,
}: typeof import("./onboard/agent-resume-state") = require("./onboard/agent-resume-state");
const { repairResumeMachineSnapshot }: typeof import("./onboard/resume-machine-repair") = require("./onboard/resume-machine-repair");
const {
stopTrackedModelRouterForAgentChange,
}: typeof import("./onboard/model-router-process") = require("./onboard/model-router-process");
Expand Down Expand Up @@ -413,6 +414,7 @@ const { handlePoliciesState }: typeof import("./onboard/machine/handlers/policie
const { handlePreflightState }: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight");
const { handleProviderInferenceState }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference");
const { handleSandboxState }: typeof import("./onboard/machine/handlers/sandbox") = require("./onboard/machine/handlers/sandbox");
const { advanceTo }: typeof import("./onboard/machine/result") = require("./onboard/machine/result");
const { getOnboardProgressStep }: typeof import("./onboard/machine/progress") = require("./onboard/machine/progress");
const policies: typeof import("./policy") = require("./policy");
const policyPresetCarry: typeof import("./onboard/policy-preset-persistence") = require("./onboard/policy-preset-persistence");
Expand Down Expand Up @@ -557,9 +559,7 @@ const RESET = USE_COLOR ? "\x1b[0m" : "";
let OPENSHELL_BIN: string | null = null;
const GATEWAY_NAME = gatewayBinding.resolveGatewayName(GATEWAY_PORT);

import type {
JsonObject as LooseObject,
} from "./core/json-types";
import type { JsonObject as LooseObject } from "./core/json-types";

type OnboardOptions = {
nonInteractive?: boolean;
Expand Down Expand Up @@ -687,7 +687,6 @@ const selectOnboardAgent = createSelectOnboardAgent({
note,
});


const { getTransportRecoveryMessage } = validationRecovery;

// Validation functions — delegated to src/lib/validation.ts
Expand Down Expand Up @@ -859,7 +858,6 @@ const verifyDirectSandboxGpu = sandboxGpuPreflight.createDirectSandboxGpuVerifie
redact,
});


function upsertMessagingProviders(
tokenDefs: MessagingTokenDef[],
options: { replaceExisting?: boolean } = {},
Expand Down Expand Up @@ -949,7 +947,6 @@ const {
isAffirmativeAnswer,
});


const {
ensureValidatedBraveSearchCredential,
configureWebSearch,
Expand All @@ -962,7 +959,6 @@ const {
runCaptureOpenshell,
});


// getSandboxInferenceConfig — moved to onboard-providers.ts

// Inference probes — moved to inference/onboard-probes.ts
Expand All @@ -987,7 +983,6 @@ const {
promptValidationRecovery,
});


const { promptCloudModel, promptRemoteModel, promptInputModel } = modelPrompts;
const { validateAnthropicModel, validateOpenAiLikeModel } = providerModels;
const nousModels: typeof import("./inference/nous-models") = require("./inference/nous-models");
Expand Down Expand Up @@ -5847,6 +5842,7 @@ const onboardRuntimeBoundary = new OnboardRuntimeBoundary({
toSessionUpdates: (updates: Record<string, unknown>) =>
toSessionUpdates(updates as Parameters<typeof toSessionUpdates>[0]),
maybeForceE2eStepFailure,
stepMutationOptions: { updateMachine: false },
});

const sandboxCancelRollback = installSandboxCancelRollback({ runOpenshell, registry, clearOnboardSession: onboardSession.clearSession }); // #4614
Expand Down Expand Up @@ -6102,6 +6098,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
process.exit(1);
}
onboardSession.updateSession((current: Session) => {
repairResumeMachineSnapshot(current);
current.mode = isNonInteractive() ? "non-interactive" : "interactive";
current.failure = null;
current.status = "in_progress";
Expand Down Expand Up @@ -6145,6 +6142,7 @@ async function onboard(opts: OnboardOptions = {}): Promise<void> {
);
}
await onboardRuntimeBoundary.recordOnboardStarted(resume);
await recordStateResult(advanceTo("preflight", { metadata: { state: "init" } }));
// Backstop for the resume path: a session may exist (so the early guard
// skipped because resume === true) but never have recorded a sandboxName
// — sandbox creation could have failed before that step ran. Without a
Expand Down
220 changes: 220 additions & 0 deletions src/lib/onboard/resume-machine-repair.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

import { describe, expect, it } from "vitest";

import {
createSession,
filterSafeUpdates,
MACHINE_SNAPSHOT_VERSION,
normalizeSession,
type Session,
type SessionUpdates,
} from "../state/onboard-session";
import { advanceTo, branchTo } from "./machine/result";
import { OnboardRuntime, type OnboardRuntimeDeps } from "./machine/runtime";
import {
repairResumeMachineSnapshot,
resumeMachineState,
} from "./resume-machine-repair";
import { OnboardRuntimeBoundary } from "./runtime-boundary";

/**
 * Test fixture: creates a session whose status and machine snapshot are both
 * "failed" (revision 7, failure step unset), then passes it to `mutator` so the
 * individual test can describe which step was interrupted.
 *
 * @param mutator - Mutates the freshly created failed session in place.
 * @returns The same session object after `mutator` has run.
 */
function createFailedSession(mutator: (session: Session) => void): Session {
  // One timestamp is used for both the snapshot entry time and the failure record.
  const failedAt = "2026-06-01T00:00:00.000Z";
  const failedSession = createSession({
    machine: {
      version: MACHINE_SNAPSHOT_VERSION,
      state: "failed",
      stateEnteredAt: failedAt,
      revision: 7,
    },
    status: "failed",
    failure: { step: null, message: "interrupted", recordedAt: failedAt },
  });

  mutator(failedSession);
  return failedSession;
}

/**
 * Copies a session by serializing it to JSON, parsing it back, and passing the
 * result through `normalizeSession`. If normalization yields null/undefined the
 * original (uncopied) session is returned instead.
 */
function cloneSession(session: Session): Session {
  const serialized = JSON.stringify(session);
  const normalized = normalizeSession(JSON.parse(serialized));
  return normalized ?? session;
}

/**
 * Wires an `OnboardRuntimeBoundary` to an in-memory session store.
 *
 * The stored session is cloned on every read and write, so callers never share
 * a reference with the store. The boundary is constructed with
 * `stepMutationOptions: { updateMachine: false }`.
 *
 * @param initial - Session used to seed the in-memory store (cloned on entry).
 * @returns The boundary plus a `getSession` accessor returning a copy of the store.
 */
function createBoundaryHarness(initial: Session) {
  let stored = cloneSession(initial);

  // Hands out a copy of whatever is currently stored.
  const snapshot = (): Session => cloneSession(stored);

  // Runs the mutator against a copy; a void result means "keep the mutated copy".
  const updateSession = (mutator: (value: Session) => Session | void): Session => {
    const draft = cloneSession(stored);
    const mutated = mutator(draft);
    stored = cloneSession(mutated ?? draft);
    return snapshot();
  };

  // Shared by both step-complete variants: merge only the filtered updates.
  const mergeSafeUpdates = (updates: SessionUpdates): Session =>
    updateSession((current) => Object.assign(current, filterSafeUpdates(updates)));

  const deps: OnboardRuntimeDeps = {
    loadSession: snapshot,
    createSession,
    saveSession: (next) => {
      stored = cloneSession(next);
      return snapshot();
    },
    updateSession,
    markStepStarted: snapshot,
    markStepComplete: (_stepName, updates: SessionUpdates = {}) => mergeSafeUpdates(updates),
    markStepCompleteRecordOnly: (_stepName, updates: SessionUpdates = {}) =>
      mergeSafeUpdates(updates),
    markStepSkipped: snapshot,
    markStepFailed: snapshot,
    markStepFailedRecordOnly: snapshot,
    completeSession: (updates: SessionUpdates = {}) =>
      updateSession((current) => {
        Object.assign(current, filterSafeUpdates(updates));
        current.status = "complete";
        current.resumable = false;
      }),
    filterSafeUpdates,
    emitEvent: () => undefined,
    now: () => "2026-06-01T00:02:00.000Z",
  };

  const boundary = new OnboardRuntimeBoundary({
    toSessionUpdates: (updates) => filterSafeUpdates(updates as SessionUpdates) as SessionUpdates,
    maybeForceE2eStepFailure: () => undefined,
    createRuntime: () => new OnboardRuntime(deps),
    stepMutationOptions: { updateMachine: false },
  });

  return { boundary, getSession: snapshot };
}

/**
 * Drives a failed session through the resume flow used by these tests: repair
 * the machine snapshot, clear the failure, record the onboarding start, replay
 * the ordered state results, and finally record session completion.
 *
 * @param initial - Failed session; it is repaired and mutated in place before
 *   being handed to the harness.
 * @returns A copy of the harness session after completion was recorded.
 */
async function runRecordOnlyResumeSequence(initial: Session): Promise<Session> {
  repairResumeMachineSnapshot(initial, "2026-06-01T00:01:00.000Z");
  initial.status = "in_progress";
  initial.failure = null;

  const harness = createBoundaryHarness(initial);
  await harness.boundary.recordOnboardStarted(true);

  // Each result names its target state and carries the originating state as metadata.
  const stateResults = [
    advanceTo("preflight", { metadata: { state: "init" } }),
    advanceTo("gateway", { metadata: { state: "preflight" } }),
    advanceTo("provider_selection", { metadata: { state: "gateway" } }),
    advanceTo("inference", { metadata: { state: "provider_selection" } }),
    advanceTo("sandbox", { metadata: { state: "inference" } }),
    branchTo("openclaw", { metadata: { state: "sandbox" } }),
    advanceTo("policies", { metadata: { state: "openclaw" } }),
    advanceTo("finalizing", { metadata: { state: "policies" } }),
  ];
  await harness.boundary.recordStateResultsWithStepCompatibility(stateResults);

  await harness.boundary.recordSessionComplete();
  return harness.getSession();
}

// Suite for the resume repair bridge: checks which state a terminal "failed"
// snapshot is mapped back to, and that a repaired session can run the
// record-only result sequence through to completion.
describe("resume machine repair", () => {
// The failure is recorded on preflight and that step is marked failed, so the
// resume state is preflight; the repaired snapshot takes the supplied timestamp
// and the fixture revision goes from 7 to 8.
it("resumes a failed preflight session from preflight", () => {
const session = createFailedSession((current) => {
current.failure = {
step: "preflight",
message: "Docker is unavailable",
recordedAt: "2026-06-01T00:00:00.000Z",
};
current.lastStepStarted = "preflight";
current.steps.preflight.status = "failed";
});

expect(resumeMachineState(session)).toBe("preflight");
repairResumeMachineSnapshot(session, "2026-06-01T00:01:00.000Z");

expect(session.machine).toEqual({
version: MACHINE_SNAPSHOT_VERSION,
state: "preflight",
stateEnteredAt: "2026-06-01T00:01:00.000Z",
revision: 8,
});
});

// Both a completed step (provider_selection) and a failed step (inference) are
// present; the failed step must win over the last completed step.
it("uses the failed step before the last completed step", () => {
const session = createFailedSession((current) => {
current.lastCompletedStep = "provider_selection";
current.steps.provider_selection.status = "complete";
current.lastStepStarted = "inference";
current.steps.inference.status = "failed";
current.failure = {
step: "inference",
message: "route validation failed",
recordedAt: "2026-06-01T00:00:00.000Z",
};
});

expect(resumeMachineState(session)).toBe("inference");
});

// No failure is recorded, so the expected state is the one that follows the
// completed sandbox step; with agent "hermes" the test expects agent_setup.
it("derives the branch state after sandbox when no failed step is recorded", () => {
const session = createFailedSession((current) => {
current.agent = "hermes";
current.lastCompletedStep = "sandbox";
current.steps.sandbox.status = "complete";
current.failure = null;
});

expect(resumeMachineState(session)).toBe("agent_setup");
});

// A snapshot that is not in the "failed" state must come back unchanged:
// same state, same entry timestamp, same revision.
it("leaves nonterminal snapshots untouched", () => {
const session = createSession({
machine: {
version: MACHINE_SNAPSHOT_VERSION,
state: "gateway",
stateEnteredAt: "2026-06-01T00:00:00.000Z",
revision: 3,
},
});

repairResumeMachineSnapshot(session, "2026-06-01T00:01:00.000Z");

expect(session.machine).toEqual({
version: MACHINE_SNAPSHOT_VERSION,
state: "gateway",
stateEnteredAt: "2026-06-01T00:00:00.000Z",
revision: 3,
});
});

// Table columns: [title label, step that failed, step completed before it (or null)].
it.each([
["preflight", "preflight", null],
["gateway", "gateway", "preflight"],
["inference", "inference", "provider_selection"],
] as const)(
"lets record-only resume complete from failed %s",
async (_name, failedStep, completedStep) => {
const session = createFailedSession((current) => {
current.failure = {
step: failedStep,
message: `${failedStep} failed`,
recordedAt: "2026-06-01T00:00:00.000Z",
};
current.lastStepStarted = failedStep;
current.steps[failedStep].status = "failed";
// Only rows with a prior completed step record one.
if (completedStep) {
current.lastCompletedStep = completedStep;
current.steps[completedStep].status = "complete";
}
});

const completed = await runRecordOnlyResumeSequence(session);

// After the full sequence the session and its machine snapshot must both be complete.
expect(completed).toMatchObject({
status: "complete",
failure: null,
machine: { state: "complete" },
});
},
);
});
68 changes: 68 additions & 0 deletions src/lib/onboard/resume-machine-repair.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

import {
MACHINE_SNAPSHOT_VERSION,
type Session,
} from "../state/onboard-session";
import { nextMachineStateAfterCompletedStep } from "../state/onboard-step-state";
import { machineStateFromOnboardSessionStep } from "./machine/events";
import type { OnboardMachineState } from "./machine/types";

/**
 * Looks at the legacy step-level fields of a session to find the step that was
 * interrupted, for sessions whose durable FSM snapshot has already collapsed
 * to the terminal failed state.
 *
 * The recorded failure step is checked first, then the last started step. A
 * candidate counts only when it maps to a machine state and its step status is
 * "failed" or "in_progress".
 *
 * @returns The machine state of the interrupted step, or null if neither
 *   candidate qualifies.
 */
function activeStepMachineState(session: Session): OnboardMachineState | null {
  // Priority order matters: the failure record outranks the last started step.
  const candidates = [session.failure?.step ?? null, session.lastStepStarted];

  for (const stepName of candidates) {
    const step = stepName ? session.steps[stepName] : null;
    const state = machineStateFromOnboardSessionStep(stepName);
    const interrupted = step?.status === "failed" || step?.status === "in_progress";
    if (state && interrupted) {
      return state;
    }
  }

  return null;
}

/**
 * Picks the nonterminal machine state a failed durable session should resume in.
 *
 * Resolution order: the interrupted step's state, then the state that follows
 * the last completed step, then "init" when neither is available.
 */
export function resumeMachineState(session: Session): OnboardMachineState {
  const interruptedState = activeStepMachineState(session);
  if (interruptedState != null) {
    return interruptedState;
  }

  const stateAfterCompleted = nextMachineStateAfterCompletedStep(
    session.lastCompletedStep,
    session,
  );
  return stateAfterCompleted ?? "init";
}

/**
 * Bridges the legacy failed-session/FSM boundary when running with --resume.
 *
 * When the machine snapshot is in the "failed" state it is replaced, in place,
 * by a snapshot at the state computed by `resumeMachineState`, stamped with
 * `stateEnteredAt` and the previous revision plus one. Any other snapshot is
 * left as it is.
 *
 * Source fix constraint: failed -> resume is not yet a modeled FSM transition,
 * and the legacy step fields still serve as the secondary durable source for
 * resume. Remove this bridge once failed-session recovery is expressed through
 * explicit FSM recovery results, or once step fields are no longer used to
 * derive the resume state.
 *
 * @param session - Session to repair; mutated in place.
 * @param stateEnteredAt - Entry timestamp for the repaired snapshot; defaults
 *   to the current time as an ISO string.
 * @returns The same `session` object that was passed in.
 */
export function repairResumeMachineSnapshot(
  session: Session,
  stateEnteredAt = new Date().toISOString(),
): Session {
  const previous = session.machine;

  if (previous.state === "failed") {
    const resumeState = resumeMachineState(session);
    session.machine = {
      version: MACHINE_SNAPSHOT_VERSION,
      state: resumeState,
      stateEnteredAt,
      revision: previous.revision + 1,
    };
  }

  return session;
}