diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 669aac9f66..eebd6fec6d 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -207,6 +207,7 @@ const { getProviderSelectionConfig, parseGatewayInference, } = inferenceConfig; +const { ensureResumeProviderReady } = require("./onboard/resume-provider-shim"); const onboardProviders = require("./onboard/providers"); const hermesProviderAuth = require("./hermes-provider-auth"); @@ -1503,8 +1504,6 @@ const { shouldForceCompletionsApi, } = validation; -// validateNvidiaApiKeyValue — see validation import above - async function replaceNamedCredential( envName: string, label: string, @@ -9439,6 +9438,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { recordStepComplete, toSessionUpdates: (updates) => toSessionUpdates(updates as Parameters[0]), skippedStepMessage, + ensureResumeProviderReady, hydrateCredentialEnv, repairLocalInferenceSystemdOverrideOrExit, isNonInteractive, @@ -9992,7 +9992,6 @@ module.exports = { recoverGatewayRuntime, buildChain, buildControlUiUrls, - startGateway, findAvailableDashboardPort, findDashboardForwardOwner, @@ -10048,4 +10047,5 @@ module.exports = { checkTelegramReachability, TELEGRAM_NETWORK_CURL_CODES, verifyCompatibleEndpointSandboxSmoke, + resumeProviderShimDeps: { isRoutedInferenceProvider, replaceNamedCredential }, }; diff --git a/src/lib/onboard/machine/handlers/provider-inference.test.ts b/src/lib/onboard/machine/handlers/provider-inference.test.ts index bdc67f1cf6..2e0a0f9c3d 100644 --- a/src/lib/onboard/machine/handlers/provider-inference.test.ts +++ b/src/lib/onboard/machine/handlers/provider-inference.test.ts @@ -32,6 +32,10 @@ function createDeps(overrides: Partial undefined), complete: vi.fn(async () => createSession()), skipped: vi.fn(), + recoverProvider: vi.fn(async (_provider: string | null | undefined, credentialEnv: string | null | undefined) => ({ + forceInferenceSetup: false, + credentialEnv: credentialEnv ?? 
null, + })), hydrate: vi.fn(), repair: vi.fn(), routeReady: vi.fn(() => false), @@ -56,6 +60,7 @@ function createDeps(overrides: Partial) => updates as SessionUpdates, skippedStepMessage: calls.skipped, + ensureResumeProviderReady: calls.recoverProvider, hydrateCredentialEnv: calls.hydrate, repairLocalInferenceSystemdOverrideOrExit: calls.repair, isNonInteractive: () => true, @@ -166,6 +171,7 @@ describe("handleProviderInferenceState", () => { expect(calls.setupNim).not.toHaveBeenCalled(); expect(calls.setupInference).not.toHaveBeenCalled(); + expect(calls.recoverProvider).toHaveBeenCalledWith("ollama-local", null); expect(calls.skipped).toHaveBeenCalledWith("provider_selection", "ollama-local / llama3.1"); expect(calls.hydrate).toHaveBeenCalledWith(null); expect(calls.repair).toHaveBeenCalledWith("ollama-local", deps.isNonInteractive); @@ -173,6 +179,40 @@ describe("handleProviderInferenceState", () => { expect(result).toMatchObject({ provider: "ollama-local", model: "llama3.1" }); }); + it("reruns inference setup when resumed provider recovery forces recreation", async () => { + const session = createSession({ + provider: "compatible-endpoint", + model: "custom-model", + credentialEnv: null, + }); + session.steps.provider_selection.status = "complete"; + const { deps, calls } = createDeps({ + isInferenceRouteReady: vi.fn(() => true), + ensureResumeProviderReady: vi.fn(async () => ({ + forceInferenceSetup: true, + credentialEnv: "COMPATIBLE_API_KEY", + })), + }); + + await handleProviderInferenceState({ + ...baseOptions(deps, session), + resume: true, + sandboxName: "my-assistant", + }); + + expect(calls.setupNim).not.toHaveBeenCalled(); + expect(calls.hydrate).toHaveBeenCalledWith("COMPATIBLE_API_KEY"); + expect(calls.setupInference).toHaveBeenCalledWith( + "my-assistant", + "custom-model", + "compatible-endpoint", + null, + "COMPATIBLE_API_KEY", + null, + [], + ); + }); + it("reconciles model router on resumed routed inference", async () => { const session = 
createSession({ provider: "nvidia-router", model: "router/model" }); session.steps.provider_selection.status = "complete"; diff --git a/src/lib/onboard/machine/handlers/provider-inference.ts b/src/lib/onboard/machine/handlers/provider-inference.ts index 161b0f0b0c..95423ccfae 100644 --- a/src/lib/onboard/machine/handlers/provider-inference.ts +++ b/src/lib/onboard/machine/handlers/provider-inference.ts @@ -61,6 +61,10 @@ export interface ProviderInferenceStateOptions { recordStepComplete(stepName: string, updates: SessionUpdates): Promise; toSessionUpdates(updates: Record): SessionUpdates; skippedStepMessage(stepName: string, detail?: string | null): void; + ensureResumeProviderReady( + provider: string | null | undefined, + credentialEnv: string | null | undefined, + ): Promise<{ forceInferenceSetup: boolean; credentialEnv: string | null }>; hydrateCredentialEnv(credentialEnv: string | null): void; repairLocalInferenceSystemdOverrideOrExit(provider: string | null, isNonInteractive: () => boolean): void; isNonInteractive(): boolean; @@ -143,6 +147,7 @@ export async function handleProviderInferenceState({ let forceProviderSelection = initialForceProviderSelection; while (true) { + let forceInferenceSetup = false; const resumeProviderSelection = !forceProviderSelection && resume && @@ -150,6 +155,9 @@ export async function handleProviderInferenceState({ typeof provider === "string" && typeof model === "string"; if (resumeProviderSelection) { + const recovery = await deps.ensureResumeProviderReady(provider, credentialEnv); + forceInferenceSetup = recovery.forceInferenceSetup; + credentialEnv = recovery.credentialEnv; deps.skippedStepMessage("provider_selection", `${provider} / ${model}`); deps.hydrateCredentialEnv(credentialEnv); deps.repairLocalInferenceSystemdOverrideOrExit(provider, deps.isNonInteractive); @@ -187,6 +195,7 @@ export async function handleProviderInferenceState({ const resumeInference = !needsBedrockRuntimeAdapter && !forceProviderSelection && + 
!forceInferenceSetup && resume && deps.isInferenceRouteReady(provider, model); if (resumeInference) { diff --git a/src/lib/onboard/resume-provider-recovery.test.ts b/src/lib/onboard/resume-provider-recovery.test.ts new file mode 100644 index 0000000000..8b72919013 --- /dev/null +++ b/src/lib/onboard/resume-provider-recovery.test.ts @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; + +import { + ensureResumeProviderReady, + type RemoteProviderConfigEntry, + type ResumeProviderRecoveryDeps, +} from "./resume-provider-recovery"; + +const COMPATIBLE_ENDPOINT_CONFIG: RemoteProviderConfigEntry = { + label: "Compatible Endpoint", + providerName: "compatible-endpoint", + providerType: "openai", + credentialEnv: "COMPATIBLE_API_KEY", + endpointUrl: "https://example/v1", + helpUrl: null, + modelMode: "input", + defaultModel: "test-model", +}; + +type DepsRecorder = { + log: string[]; + warn: string[]; + note: string[]; + exitCalls: number[]; + replaceCalls: Array<{ env: string; label: string }>; + deps: ResumeProviderRecoveryDeps; +}; + +function makeDeps(overrides: { + providerExists?: boolean; + credentialValue?: string | null; + nonInteractive?: boolean; + remoteProviderConfig?: Record; +}): DepsRecorder { + const log: string[] = []; + const warn: string[] = []; + const note: string[] = []; + const exitCalls: number[] = []; + const replaceCalls: Array<{ env: string; label: string }> = []; + const deps: ResumeProviderRecoveryDeps = { + remoteProviderConfig: overrides.remoteProviderConfig ?? { + compatible: COMPATIBLE_ENDPOINT_CONFIG, + }, + defaultRouteCredentialEnv: "OPENAI_API_KEY", + isRoutedInferenceProvider: () => false, + providerExistsInGateway: () => overrides.providerExists ?? true, + hydrateCredentialEnv: () => overrides.credentialValue ?? 
null, + getProviderLabel: (key) => key, + isNonInteractive: () => overrides.nonInteractive ?? false, + log: (m) => log.push(m), + warn: (m) => warn.push(m), + note: (m) => note.push(m), + exit: (code) => exitCalls.push(code), + replaceNamedCredential: async (env, label) => { + replaceCalls.push({ env, label }); + return "fresh-key"; + }, + validateNvidiaApiKeyValue: () => null, + }; + return { log, warn, note, exitCalls, replaceCalls, deps }; +} + +describe("ensureResumeProviderReady", () => { + it("returns false-forced when no provider is set (nothing to recover)", async () => { + const { deps } = makeDeps({ providerExists: false }); + const result = await ensureResumeProviderReady(null, null, deps); + expect(result.forceInferenceSetup).toBe(false); + expect(result.credentialEnv).toBeNull(); + }); + + it("returns false-forced when the provider is unknown and not a routed provider", async () => { + const { deps } = makeDeps({ providerExists: false }); + const result = await ensureResumeProviderReady("mystery-provider", null, deps); + expect(result.forceInferenceSetup).toBe(false); + expect(result.credentialEnv).toBeNull(); + }); + + it("returns false-forced when the provider still exists in the gateway", async () => { + const { deps } = makeDeps({ providerExists: true }); + const result = await ensureResumeProviderReady("compatible-endpoint", "COMPATIBLE_API_KEY", deps); + expect(result.forceInferenceSetup).toBe(false); + expect(result.credentialEnv).toBe("COMPATIBLE_API_KEY"); + }); + + it("emits a [resume] note and forces inference setup when credential is already hydrated", async () => { + const recorder = makeDeps({ + providerExists: false, + credentialValue: "already-hydrated-key", + }); + const result = await ensureResumeProviderReady( + "compatible-endpoint", + "COMPATIBLE_API_KEY", + recorder.deps, + ); + expect(result.forceInferenceSetup).toBe(true); + expect(result.credentialEnv).toBe("COMPATIBLE_API_KEY"); + 
expect(recorder.note.join("\n")).toContain("[resume]"); + expect(recorder.replaceCalls).toHaveLength(0); + }); + + it("returns the config credential env when the resumed session did not record one", async () => { + const recorder = makeDeps({ + providerExists: false, + credentialValue: "already-hydrated-key", + }); + const result = await ensureResumeProviderReady("compatible-endpoint", null, recorder.deps); + expect(result.forceInferenceSetup).toBe(true); + expect(result.credentialEnv).toBe("COMPATIBLE_API_KEY"); + }); + + it("re-prompts for credentials when the provider was reset and credential is missing (#3278)", async () => { + const recorder = makeDeps({ + providerExists: false, + credentialValue: null, + }); + const result = await ensureResumeProviderReady( + "compatible-endpoint", + "COMPATIBLE_API_KEY", + recorder.deps, + ); + expect(result.forceInferenceSetup).toBe(true); + expect(result.credentialEnv).toBe("COMPATIBLE_API_KEY"); + expect(recorder.replaceCalls).toEqual([ + { env: "COMPATIBLE_API_KEY", label: "Compatible Endpoint API key" }, + ]); + expect(recorder.exitCalls).toEqual([]); + }); + + it("exits 1 in non-interactive mode when the provider is missing and no credential is set", async () => { + const recorder = makeDeps({ + providerExists: false, + credentialValue: null, + nonInteractive: true, + }); + await ensureResumeProviderReady("compatible-endpoint", "COMPATIBLE_API_KEY", recorder.deps); + expect(recorder.exitCalls).toEqual([1]); + expect(recorder.warn.join("\n")).toContain("COMPATIBLE_API_KEY"); + expect(recorder.warn.join("\n")).toContain("during resume"); + expect(recorder.replaceCalls).toHaveLength(0); + }); +}); diff --git a/src/lib/onboard/resume-provider-recovery.ts b/src/lib/onboard/resume-provider-recovery.ts new file mode 100644 index 0000000000..7e1a574ffb --- /dev/null +++ b/src/lib/onboard/resume-provider-recovery.ts @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
// All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Re-prompts for a remote provider's credential during `nemoclaw onboard --resume`
// when the previously-recorded provider has been deleted from the gateway (e.g.
// after `nemoclaw credentials reset` removed it).
//
// Resume mode would otherwise short-circuit the inference setup step on the
// recorded `provider`/`model`, leaving the sandbox rebuild to fail with an
// authentication error (#3278).

/** One entry of the remote-provider table, keyed by onboard provider key. */
export type RemoteProviderConfigEntry = {
  label: string;
  providerName: string;
  providerType: string;
  credentialEnv: string;
  endpointUrl: string;
  helpUrl: string | null;
  modelMode: "catalog" | "curated" | "input";
  defaultModel: string;
  skipVerify?: boolean;
};

/**
 * Everything `ensureResumeProviderReady` needs from the outside world, injected
 * so the recovery logic can be exercised without a gateway, a TTY or `process`.
 */
export type ResumeProviderRecoveryDeps = {
  remoteProviderConfig: Record<string, RemoteProviderConfigEntry>;
  defaultRouteCredentialEnv: string;
  isRoutedInferenceProvider: (provider: string) => boolean;
  providerExistsInGateway: (name: string) => boolean;
  hydrateCredentialEnv: (envName: string) => string | null;
  getProviderLabel: (key: string) => string;
  isNonInteractive: () => boolean;
  log: (message: string) => void;
  warn: (message: string) => void;
  note: (message: string) => void;
  exit: (code: number) => void;
  // NOTE(review): the resolved value is never read by this module, so it is
  // typed `unknown`; confirm against the real `replaceNamedCredential` signature.
  replaceNamedCredential: (
    envName: string,
    label: string,
    helpUrl: string | null,
    validator: (value: string) => string | null,
  ) => Promise<unknown>;
  validateNvidiaApiKeyValue: (key: string, credentialEnv: string) => string | null;
};

/** Outcome of a resume-time provider check. */
export type ResumeProviderRecoveryResult = {
  forceInferenceSetup: boolean;
  credentialEnv: string | null;
};

/**
 * Resolve a persisted OpenShell provider name back to its onboard provider config.
 *
 * @param provider - Provider name recorded in the session; may be absent.
 * @param remoteProviderConfig - Table of known remote providers.
 * @returns The matching entry, or `null` when `provider` is empty, unknown, or
 *   (for `"nvidia-nim"`) the table has no `build` entry.
 */
export function getRemoteProviderConfigForName(
  provider: string | null | undefined,
  remoteProviderConfig: Record<string, RemoteProviderConfigEntry>,
): RemoteProviderConfigEntry | null {
  if (!provider) return null;
  // "nvidia-nim" is looked up under the `build` key rather than by providerName.
  // A missing key must surface as `null`, never `undefined`, to honour the
  // declared return type.
  if (provider === "nvidia-nim") return remoteProviderConfig.build ?? null;
  return (
    Object.values(remoteProviderConfig).find((entry) => entry.providerName === provider) ?? null
  );
}

/**
 * Choose the credential env used to recreate a missing provider during resume.
 *
 * Precedence: the env recorded in the session, then the provider config's env,
 * then the default route env (routed providers only). Returns `""` when none
 * of these applies.
 */
export function getResumeProviderCredentialEnv(
  provider: string,
  config: RemoteProviderConfigEntry | null,
  credentialEnv: string | null | undefined,
  deps: Pick<ResumeProviderRecoveryDeps, "isRoutedInferenceProvider" | "defaultRouteCredentialEnv">,
): string {
  if (credentialEnv) return credentialEnv;
  if (config?.credentialEnv) return config.credentialEnv;
  return deps.isRoutedInferenceProvider(provider) ? deps.defaultRouteCredentialEnv : "";
}

/**
 * Ensure a resumed remote provider still exists in the gateway, re-prompting
 * for credentials when needed.
 *
 * @param provider - Provider name recorded in the resumed session.
 * @param credentialEnv - Credential env recorded in the resumed session, if any.
 * @param deps - Injected collaborators (gateway lookup, prompts, logging, exit).
 * @returns `forceInferenceSetup: true` when the caller must re-run the
 *   inference setup step (provider was missing and the credential was hydrated
 *   or just re-entered). `credentialEnv` is the env var resolved for that
 *   recovery, or the recorded one when no recovery was needed.
 *
 * In non-interactive mode with a missing credential, calls `deps.exit(1)`; the
 * value returned afterwards only matters when `exit` is a stub that returns.
 */
export async function ensureResumeProviderReady(
  provider: string | null | undefined,
  credentialEnv: string | null | undefined,
  deps: ResumeProviderRecoveryDeps,
): Promise<ResumeProviderRecoveryResult> {
  const untouched: ResumeProviderRecoveryResult = {
    forceInferenceSetup: false,
    credentialEnv: credentialEnv ?? null,
  };

  const config = getRemoteProviderConfigForName(provider, deps.remoteProviderConfig);
  // Nothing to recover: no provider, or one that is neither a known remote
  // provider nor a routed one.
  if (!provider || (!config && !deps.isRoutedInferenceProvider(provider))) {
    return untouched;
  }
  if (deps.providerExistsInGateway(provider)) {
    return untouched;
  }

  const resolvedCredentialEnv = getResumeProviderCredentialEnv(
    provider,
    config,
    credentialEnv,
    deps,
  );
  const credentialValue = deps.hydrateCredentialEnv(resolvedCredentialEnv);
  // `||` (not `??`) on purpose: an empty label/help URL should fall through.
  const providerLabel = config?.label || deps.getProviderLabel(provider) || provider;
  const helpUrl = config?.helpUrl || null;

  if (credentialValue) {
    deps.note(` [resume] Provider '${provider}' is missing from the gateway; recreating it.`);
    return { forceInferenceSetup: true, credentialEnv: resolvedCredentialEnv };
  }

  if (deps.isNonInteractive()) {
    deps.warn(
      ` ${resolvedCredentialEnv} is required to recreate provider '${provider}' during resume.`,
    );
    deps.warn(
      ` Re-run without --non-interactive to enter it, or set ${resolvedCredentialEnv} and retry.`,
    );
    deps.exit(1);
    return { forceInferenceSetup: false, credentialEnv: resolvedCredentialEnv };
  }

  deps.log("");
  deps.log(` [resume] Provider '${provider}' is missing from the gateway.`);
  deps.log(" Re-enter the API key so onboarding can recreate it before rebuilding.");
  await deps.replaceNamedCredential(
    resolvedCredentialEnv,
    `${providerLabel} API key`,
    helpUrl,
    (value) => deps.validateNvidiaApiKeyValue(value, resolvedCredentialEnv),
  );
  return { forceInferenceSetup: true, credentialEnv: resolvedCredentialEnv };
}

// ---------------------------------------------------------------------------
// Next file in this diff: src/lib/onboard/resume-provider-shim.ts (new file)
// ---------------------------------------------------------------------------
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Wires `ensureResumeProviderReady` (in `./resume-provider-recovery`) to the
// dependencies it needs. Lives outside `src/lib/onboard.ts` so the wiring
// doesn't count against the entrypoint-budget gate.
import { DEFAULT_ROUTE_CREDENTIAL_ENV } from "../inference/config";
import { hydrateCredentialEnv } from "./credential-env";
import { validateNvidiaApiKeyValue } from "../validation";
import { D, R } from "../cli/terminal-style";
import {
  ensureResumeProviderReady as ensureResumeProviderReadyImpl,
  type ResumeProviderRecoveryDeps,
  type ResumeProviderRecoveryResult,
} from "./resume-provider-recovery";

// Loaded via `require` and narrowed to the two members this shim reads.
const onboardProviders = require("./providers") as {
  REMOTE_PROVIDER_CONFIG: ResumeProviderRecoveryDeps["remoteProviderConfig"];
  getProviderLabel: ResumeProviderRecoveryDeps["getProviderLabel"];
};

// Lazy require breaks the circular module load — by the time
// `ensureResumeProviderReady` is called, onboard.ts has finished loading
// and its `module.exports.resumeProviderShimDeps` is populated.
type OnboardLazy = {
  isNonInteractive: ResumeProviderRecoveryDeps["isNonInteractive"];
  providerExistsInGateway: ResumeProviderRecoveryDeps["providerExistsInGateway"];
  resumeProviderShimDeps: {
    isRoutedInferenceProvider: ResumeProviderRecoveryDeps["isRoutedInferenceProvider"];
    replaceNamedCredential: ResumeProviderRecoveryDeps["replaceNamedCredential"];
  };
};

/**
 * Production wiring for the resume-time provider recovery.
 *
 * Builds the dependency bag from the real onboard modules and delegates to the
 * implementation in `./resume-provider-recovery`.
 *
 * @param provider - Provider name recorded in the resumed session.
 * @param credentialEnv - Credential env recorded in the resumed session, if any.
 * @returns Whatever the recovery implementation resolves to.
 */
export async function ensureResumeProviderReady(
  provider: string | null | undefined,
  credentialEnv: string | null | undefined,
): Promise<ResumeProviderRecoveryResult> {
  // Resolved at call time, not module-load time; see the note on `OnboardLazy`.
  const onboard = require("../onboard") as OnboardLazy;
  const { isRoutedInferenceProvider, replaceNamedCredential } = onboard.resumeProviderShimDeps;

  const deps: ResumeProviderRecoveryDeps = {
    remoteProviderConfig: onboardProviders.REMOTE_PROVIDER_CONFIG,
    defaultRouteCredentialEnv: DEFAULT_ROUTE_CREDENTIAL_ENV,
    isRoutedInferenceProvider,
    providerExistsInGateway: onboard.providerExistsInGateway,
    hydrateCredentialEnv,
    getProviderLabel: onboardProviders.getProviderLabel,
    isNonInteractive: onboard.isNonInteractive,
    replaceNamedCredential,
    validateNvidiaApiKeyValue,
    // Notes are wrapped in the D/R terminal-style markers; plain logs go to
    // stdout and warnings to stderr.
    note: (m) => console.log(`${D}${m}${R}`),
    log: (m) => console.log(m),
    warn: (m) => console.error(m),
    exit: (c) => process.exit(c),
  };
  return ensureResumeProviderReadyImpl(provider, credentialEnv, deps);
}