it("flags runner-crash error payloads as a daemon failure (#4365)", () => {
  // Issue #4365: when Ollama's model runner crashes ("model runner has
  // unexpectedly stopped"), validateOllamaModel must set daemonFailure so the
  // wizard escapes the Ollama-model inner loop instead of asking for another tag.
  const crashSamples = [
    "model runner has unexpectedly stopped, this may be due to resource limitations or an internal error",
    "llama runner process has terminated: exit status 134",
    "model runner crashed",
    "Ollama runner process exited unexpectedly",
    "runner died: signal 9",
    "runner killed",
  ];
  for (const errText of crashSamples) {
    // Label each assertion with the sample under test; without it a failure
    // inside this loop does not say which crash string regressed.
    const label = `sample=${JSON.stringify(errText)}`;
    expect(isOllamaRunnerCrash(errText), label).toBe(true);

    // Feed the same error text through the probe path: both capture callbacks
    // hand back a JSON body whose `error` key carries the crash message.
    const payload = JSON.stringify({ error: errText });
    const captureEx = () => ({ stdout: payload, exitCode: 0, timedOut: false });
    const result = validateOllamaModel("nemotron-3-nano:30b", () => payload, undefined, captureEx);
    expect(result.ok, label).toBe(false);
    expect(result.daemonFailure, label).toBe(true);
  }
});
/**
 * Tells whether an Ollama probe error describes the model runner itself
 * having stopped, terminated, crashed, exited, died, or been killed, as
 * opposed to the selected model being unsuitable. (#4365)
 *
 * The check is textual: the word "runner" followed, within at most 80
 * characters (newlines included), by one of those words, case-insensitively.
 *
 * @param errText - Error text taken from the probe response; may be absent.
 * @returns `true` when the text matches the runner-crash pattern; `false`
 *   for empty, `null`, or `undefined` input and for any non-matching text.
 */
export function isOllamaRunnerCrash(errText: string | null | undefined): boolean {
  const runnerDownPattern =
    /\brunner\b[\s\S]{0,80}\b(?:stopped|terminated|crashed|exited|died|killed)\b/i;
  const message = String(errText || "");
  // Empty / missing text can never be a crash report.
  return message.length > 0 && runnerDownPattern.test(message);
}
{ daemonFailure: true } : {}), }; } } diff --git a/src/lib/inference/ollama/proxy.ts b/src/lib/inference/ollama/proxy.ts index 50859142e1..2d8e52581f 100644 --- a/src/lib/inference/ollama/proxy.ts +++ b/src/lib/inference/ollama/proxy.ts @@ -739,7 +739,12 @@ async function checkOllamaModelToolSupport( async function prepareOllamaModel( model, installedModels: string[] = [], -): Promise<{ ok: boolean; message?: string; allowToolsIncompatible?: boolean }> { +): Promise<{ + ok: boolean; + message?: string; + allowToolsIncompatible?: boolean; + daemonFailure?: boolean; +}> { const alreadyInstalled = installedModels.includes(model); if (!alreadyInstalled) { console.log(` Pulling Ollama model: ${model}`); diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index df337cdfcf..4bbbcee4ce 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -526,6 +526,7 @@ import { resolveQrSelectedChannels, } from "./onboard/messaging-state"; import { getValidatedMessagingToken, getValidatedMessagingTokenByEnvKey } from "./onboard/messaging-token"; +import { handleOllamaProbeFailure } from "./onboard/ollama-probe-failure"; import { runOllamaStartupOrGate } from "./onboard/ollama-startup"; import type { DockerDriverBinaryOverrides, @@ -3935,10 +3936,8 @@ async function selectAndValidateOllamaModel( } const probe = await prepareOllamaModel(selectedModel, installedModels); if (!probe.ok) { - console.error(` ${probe.message}`); - if (isNonInteractive()) abortNonInteractive(`Ollama model '${selectedModel}' unavailable.`); - console.log(" Choose a different Ollama model or select Other."); - console.log(""); + const action = handleOllamaProbeFailure(probe, selectedModel, isNonInteractive); + if (action === "back-to-selection") return { outcome: "back-to-selection" }; continue; } const allowToolsIncompatible = probe.allowToolsIncompatible === true; diff --git a/src/lib/onboard/ollama-probe-failure.test.ts b/src/lib/onboard/ollama-probe-failure.test.ts new file mode 100644 index 
describe("handleOllamaProbeFailure (#4365)", () => {
  // Values of the env vars this suite may touch, captured before every case
  // and put back by restore().
  let savedProvider: string | undefined;
  let savedNonInteractive: string | undefined;

  beforeEach(() => {
    savedProvider = process.env.NEMOCLAW_PROVIDER;
    savedNonInteractive = process.env.NEMOCLAW_NON_INTERACTIVE;
  });

  // Restores one env var, deleting it when it was originally unset.
  const putBack = (key: string, value: string | undefined): void => {
    if (value === undefined) delete process.env[key];
    else process.env[key] = value;
  };

  const restore = (): void => {
    putBack("NEMOCLAW_PROVIDER", savedProvider);
    putBack("NEMOCLAW_NON_INTERACTIVE", savedNonInteractive);
  };

  // Silences console.error / console.log while recording their calls.
  const muteConsole = () => ({
    errSpy: vi.spyOn(console, "error").mockImplementation(() => {}),
    logSpy: vi.spyOn(console, "log").mockImplementation(() => {}),
  });

  // Turns process.exit into a throw so the exit code can be asserted.
  const trapExit = () =>
    vi.spyOn(process, "exit").mockImplementation(((code?: number) => {
      throw new Error(`process.exit:${code ?? 0}`);
    }) as never);

  // First argument of every recorded call, stringified.
  const printed = (spy: { mock: { calls: unknown[][] } }): string[] =>
    spy.mock.calls.map((call) => String(call[0]));

  it("exits when a pinned Ollama provider hits a daemon failure", () => {
    process.env.NEMOCLAW_PROVIDER = "ollama";
    const { errSpy, logSpy } = muteConsole();
    const exitSpy = trapExit();

    try {
      const run = () =>
        handleOllamaProbeFailure(
          { ok: false, message: "runner crashed", daemonFailure: true },
          "nemotron-3-nano:30b",
          () => false,
        );
      expect(run).toThrow(/process\.exit:1/);

      const sawPinnedAbort = printed(errSpy).some((line) =>
        line.includes(
          "NEMOCLAW_PROVIDER pins onboarding to Ollama but the Ollama model runner is unhealthy",
        ),
      );
      expect(sawPinnedAbort).toBe(true);
    } finally {
      errSpy.mockRestore();
      logSpy.mockRestore();
      exitSpy.mockRestore();
      restore();
    }
  });

  it("aborts non-interactive runs on a daemon failure", () => {
    delete process.env.NEMOCLAW_PROVIDER;
    const { errSpy, logSpy } = muteConsole();
    const exitSpy = trapExit();

    try {
      const run = () =>
        handleOllamaProbeFailure(
          { ok: false, message: "runner died", daemonFailure: true },
          "nemotron-3-nano:30b",
          () => true,
        );
      expect(run).toThrow(/process\.exit:1/);

      const sawAbort = printed(errSpy).some((line) =>
        line.includes("Aborting: Ollama daemon is unhealthy"),
      );
      expect(sawAbort).toBe(true);
    } finally {
      errSpy.mockRestore();
      logSpy.mockRestore();
      exitSpy.mockRestore();
      restore();
    }
  });

  it("returns 'back-to-selection' with an escape hint for interactive non-pinned daemon failures", () => {
    delete process.env.NEMOCLAW_PROVIDER;
    const { errSpy, logSpy } = muteConsole();

    try {
      const action = handleOllamaProbeFailure(
        { ok: false, message: "model runner has unexpectedly stopped", daemonFailure: true },
        "qwen2.5:7b",
        () => false,
      );
      expect(action).toBe("back-to-selection");

      const logLines = printed(logSpy);
      const sawUnavailable = logLines.some((line) =>
        line.includes("Ollama itself appears unavailable"),
      );
      expect(sawUnavailable).toBe(true);
      const sawReturnHint = logLines.some((line) =>
        line.includes("Returning to provider selection; choose a non-Ollama provider"),
      );
      expect(sawReturnHint).toBe(true);
    } finally {
      errSpy.mockRestore();
      logSpy.mockRestore();
      restore();
    }
  });

  it("returns 'continue' on a model-level failure (no daemonFailure flag)", () => {
    delete process.env.NEMOCLAW_PROVIDER;
    const { errSpy, logSpy } = muteConsole();

    try {
      const action = handleOllamaProbeFailure(
        { ok: false, message: "model requires more system memory" },
        "qwen2.5:7b",
        () => false,
      );
      expect(action).toBe("continue");

      const logLines = printed(logSpy);
      const sawRetryHint = logLines.some((line) =>
        line.includes("Choose a different Ollama model"),
      );
      expect(sawRetryHint).toBe(true);
      // Daemon-escape hint MUST NOT appear in the non-daemon path.
      const sawDaemonHint = logLines.some((line) =>
        line.includes("Ollama itself appears unavailable"),
      );
      expect(sawDaemonHint).toBe(false);
    } finally {
      errSpy.mockRestore();
      logSpy.mockRestore();
      restore();
    }
  });

  it("aborts non-interactive model-level failures via the legacy message", () => {
    delete process.env.NEMOCLAW_PROVIDER;
    const { errSpy, logSpy } = muteConsole();
    const exitSpy = trapExit();

    try {
      const run = () =>
        handleOllamaProbeFailure(
          { ok: false, message: "model requires more system memory" },
          "qwen2.5:7b",
          () => true,
        );
      expect(run).toThrow(/process\.exit:1/);

      const sawLegacyAbort = printed(errSpy).some((line) =>
        line.includes("Aborting: Ollama model 'qwen2.5:7b' unavailable"),
      );
      expect(sawLegacyAbort).toBe(true);
    } finally {
      errSpy.mockRestore();
      logSpy.mockRestore();
      exitSpy.mockRestore();
      restore();
    }
  });
});
/**
 * Decides how selectAndValidateOllamaModel reacts to a failed Ollama probe.
 * (#4365)
 *
 * The probe's failure message is always printed first. Then:
 *
 * - `probe.daemonFailure` set — the Ollama daemon / model runner itself is
 *   treated as broken. When NEMOCLAW_PROVIDER pins an Ollama branch the
 *   process exits with code 1; otherwise a non-interactive run is handed to
 *   `abortNonInteractive`; otherwise an escape hint is printed and
 *   "back-to-selection" is returned, because picking another Ollama tag would
 *   hit the same failure.
 * - otherwise — the chosen model is treated as unsuitable. A non-interactive
 *   run is handed to `abortNonInteractive`; an interactive run is told to
 *   choose a different model and "continue" is returned.
 *
 * @param probe - Failed probe result; `message` and `daemonFailure` are optional.
 * @param selectedModel - Model tag the user picked; used in printed messages.
 * @param isNonInteractive - Callback reporting whether the wizard runs unattended.
 * @returns "back-to-selection" to leave the Ollama model loop, or "continue"
 *   to prompt for another Ollama model.
 */
export function handleOllamaProbeFailure(
  probe: OllamaProbeFailureInput,
  selectedModel: string,
  isNonInteractive: () => boolean,
): OllamaProbeFailureAction {
  // `message` is optional on the probe result. Fall back to a generic line
  // instead of interpolating it blindly and printing the text "undefined".
  const hasMessage = typeof probe.message === "string" && probe.message.trim() !== "";
  const detail = hasMessage
    ? probe.message
    : `Ollama model '${selectedModel}' failed the local probe.`;
  console.error(` ${detail}`);

  if (probe.daemonFailure) {
    // A pinned provider would re-select Ollama on the next selection pass, so
    // returning to the menu would loop; exit instead.
    if (isOllamaProviderPinned()) {
      console.error(
        " NEMOCLAW_PROVIDER pins onboarding to Ollama but the Ollama model runner is unhealthy; refusing to loop on Ollama model selection.",
      );
      process.exit(1);
    }
    if (isNonInteractive()) {
      abortNonInteractive(
        `Ollama daemon is unhealthy for model '${selectedModel}'.`,
        "Pick a non-Ollama provider, restart Ollama, or rerun with NEMOCLAW_PROVIDER set explicitly.",
      );
    }
    console.log(
      " Ollama itself appears unavailable — selecting a different Ollama model would hit the same failure.",
    );
    console.log(
      " Returning to provider selection; choose a non-Ollama provider to continue. (#4365)",
    );
    console.log("");
    return "back-to-selection";
  }

  // Model-level failure: unattended runs cannot re-prompt, interactive runs can.
  if (isNonInteractive()) abortNonInteractive(`Ollama model '${selectedModel}' unavailable.`);
  console.log(" Choose a different Ollama model or select Other.");
  console.log("");
  return "continue";
}
describe("runOllamaStartupOrGate (#4365 steer hint)", () => {
  // Originals of every module function / env var this suite overrides,
  // captured in beforeEach and put back by restore().
  let originalWaitForHttp: typeof wait.waitForHttp;
  let originalRunShell: typeof runner.runShell;
  let originalProviderEnv: string | undefined;
  let originalNoAutostartEnv: string | undefined;

  beforeEach(() => {
    originalWaitForHttp = wait.waitForHttp;
    originalRunShell = runner.runShell;
    originalProviderEnv = process.env.NEMOCLAW_PROVIDER;
    originalNoAutostartEnv = process.env.NEMOCLAW_OLLAMA_NO_AUTOSTART;
    // Clear NEMOCLAW_OLLAMA_NO_AUTOSTART so isOllamaAutostartDisabled() stays
    // false regardless of the caller's environment — otherwise the autostart-
    // timeout branch is bypassed and these assertions never run.
    delete process.env.NEMOCLAW_OLLAMA_NO_AUTOSTART;
    setOllamaAutostartDisabled(false);
    // Replace the shell runner with a stub that reports success.
    runner.runShell = () => ({ status: 0 });
  });

  // Undoes every override made by beforeEach or by an individual case.
  function restore() {
    wait.waitForHttp = originalWaitForHttp;
    runner.runShell = originalRunShell;
    if (originalProviderEnv === undefined) delete process.env.NEMOCLAW_PROVIDER;
    else process.env.NEMOCLAW_PROVIDER = originalProviderEnv;
    if (originalNoAutostartEnv === undefined) {
      delete process.env.NEMOCLAW_OLLAMA_NO_AUTOSTART;
    } else {
      process.env.NEMOCLAW_OLLAMA_NO_AUTOSTART = originalNoAutostartEnv;
    }
  }

  it("prints the steer hint and returns 'continue' on autostart timeout in interactive default mode", () => {
    delete process.env.NEMOCLAW_PROVIDER;
    wait.waitForHttp = () => false;
    const errSpy = vi.spyOn(console, "error").mockImplementation(() => {});
    const logSpy = vi.spyOn(console, "log").mockImplementation(() => {});

    try {
      const outcome = runOllamaStartupOrGate({
        ollamaReady: false,
        ollamaPort: 11434,
        getLocalProviderBaseUrl: () => "http://host.openshell.internal:11435/v1",
        isNonInteractive: () => false,
      });

      expect(outcome).toEqual({ kind: "continue" });
      const errLines = errSpy.mock.calls.map((c) => String(c[0]));
      expect(errLines.some((l) => l.includes("Ollama did not become ready"))).toBe(true);
      expect(
        errLines.some((l) =>
          l.includes(
            "Pick a non-Ollama provider in the next menu — re-selecting Local Ollama would hit the same timeout.",
          ),
        ),
      ).toBe(true);
    } finally {
      errSpy.mockRestore();
      logSpy.mockRestore();
      restore();
    }
  });

  it("does not print the steer hint when the provider is pinned (the wizard exits instead)", () => {
    process.env.NEMOCLAW_PROVIDER = "ollama";
    wait.waitForHttp = () => false;
    const errSpy = vi.spyOn(console, "error").mockImplementation(() => {});
    // Turn process.exit into a throw so the exit code can be asserted.
    const exitSpy = vi.spyOn(process, "exit").mockImplementation(((code?: number) => {
      throw new Error(`process.exit:${code ?? 0}`);
    }) as never);

    try {
      expect(() =>
        runOllamaStartupOrGate({
          ollamaReady: false,
          ollamaPort: 11434,
          getLocalProviderBaseUrl: () => "http://host.openshell.internal:11435/v1",
          isNonInteractive: () => false,
        }),
      ).toThrow(/process\.exit:1/);
      const errLines = errSpy.mock.calls.map((c) => String(c[0]));
      expect(
        errLines.some((l) =>
          l.includes("NEMOCLAW_PROVIDER pins onboarding to Ollama but Ollama is unreachable"),
        ),
      ).toBe(true);
      // The steer hint targets a re-prompt menu that never appears here.
      expect(
        errLines.some((l) => l.includes("Pick a non-Ollama provider in the next menu")),
      ).toBe(false);
    } finally {
      errSpy.mockRestore();
      exitSpy.mockRestore();
      restore();
    }
  });

  it("isOllamaProviderPinned recognises every Ollama-using provider key (#4365)", () => {
    // Mirror the matching logic in providers.getNonInteractiveProvider so a
    // user setting NEMOCLAW_PROVIDER to any of the Ollama-using keys still
    // triggers the pinned-provider escape paths. Without this, a casing
    // variant or an install-* pin would let the wizard return to the
    // selection menu and immediately re-pin to the same Ollama action,
    // reintroducing the #4365 loop.
    const cases: Array<[string | undefined, boolean]> = [
      ["ollama", true],
      ["OLLAMA", true],
      [" Ollama ", true],
      [" ollama\n", true],
      ["install-ollama", true],
      ["INSTALL-OLLAMA", true],
      ["install-windows-ollama", true],
      ["start-windows-ollama", true],
      ["build", false],
      ["openai", false],
      ["", false],
      [undefined, false],
    ];
    // Restore in `finally`: a failing expectation throws out of the loop, and
    // the mutated NEMOCLAW_PROVIDER must not leak into later cases.
    try {
      for (const [value, expected] of cases) {
        if (value === undefined) delete process.env.NEMOCLAW_PROVIDER;
        else process.env.NEMOCLAW_PROVIDER = value;
        expect(isOllamaProviderPinned(), `pin=${JSON.stringify(value)}`).toBe(expected);
      }
    } finally {
      restore();
    }
  });

  it("returns 'ready' immediately when Ollama already responds (no hint, no spawn)", () => {
    delete process.env.NEMOCLAW_PROVIDER;
    // Flags recording whether the readiness wait or the shell spawn was reached.
    let waitCalled = false;
    wait.waitForHttp = () => {
      waitCalled = true;
      return true;
    };
    let shellCalled = false;
    runner.runShell = () => {
      shellCalled = true;
      return { status: 0 };
    };
    const errSpy = vi.spyOn(console, "error").mockImplementation(() => {});

    try {
      const outcome = runOllamaStartupOrGate({
        ollamaReady: true,
        ollamaPort: 11434,
        getLocalProviderBaseUrl: () => "http://host.openshell.internal:11435/v1",
        isNonInteractive: () => false,
      });

      expect(outcome).toEqual({ kind: "ready" });
      expect(waitCalled).toBe(false);
      expect(shellCalled).toBe(false);
      expect(errSpy).not.toHaveBeenCalled();
    } finally {
      errSpy.mockRestore();
      restore();
    }
  });
});
// NEMOCLAW_PROVIDER values that send the wizard down an Ollama-using branch.
// Per the original note these should stay in sync with the Ollama entries in
// providers.ts validProviders (not visible here). A pin re-selects an Ollama
// path on every selection-loop pass, so a runner crash under a pin must exit
// rather than return to selection. (#4365)
const OLLAMA_PINNED_PROVIDER_KEYS = new Set([
  "ollama",
  "install-ollama",
  "install-windows-ollama",
  "start-windows-ollama",
]);

/**
 * Reports whether NEMOCLAW_PROVIDER pins onboarding to an Ollama-using branch.
 *
 * The env value is trimmed and lower-cased before the lookup, so variants
 * such as `OLLAMA` or ` ollama ` count as pinned. (#4365)
 *
 * @returns `true` when the normalized value is one of the pinned Ollama keys;
 *   `false` when the variable is unset, empty, or names anything else.
 */
export function isOllamaProviderPinned(): boolean {
  const pinned = process.env.NEMOCLAW_PROVIDER;
  // Unset and empty both mean "no pin".
  if (!pinned) return false;
  return OLLAMA_PINNED_PROVIDER_KEYS.has(pinned.trim().toLowerCase());
}
+ console.error( + " Pick a non-Ollama provider in the next menu — re-selecting Local Ollama would hit the same timeout.", + ); return { kind: "continue" }; } return { kind: "ready" }; diff --git a/test/onboard-ollama-autostart.test.ts b/test/onboard-ollama-autostart.test.ts index 9563434173..38a6d641cc 100644 --- a/test/onboard-ollama-autostart.test.ts +++ b/test/onboard-ollama-autostart.test.ts @@ -28,6 +28,22 @@ type ScenarioOptions = { // When true, stub waitForHttp to return false. Only used to verify that the // gated path does not even reach waitForHttp. waitForHttpReturnsFalse?: boolean; + // When true, allow the wizard to reach selectAndValidateOllamaModel by + // stubbing startOllamaAuthProxy to a no-op success rather than the bail-out + // sentinel. Used by the #4365 runner-crash escape scenarios. + proceedToModelSelection?: boolean; + // Body returned by the fake curl for `/api/generate` probes (used by + // validateOllamaModel). Defaults to a healthy response. Set to a runner- + // crash payload to drive the #4365 daemonFailure escape path. + ollamaGenerateBody?: string; + // Bound the subprocess wall-clock. Defaults to the scenario timeout. A + // pre-fix runner-crash loop would never exit — assert against this to catch + // regressions. + subprocessTimeoutMs?: number; + // Override the NEMOCLAW_PROVIDER env value. Default is the literal `ollama` + // (pinned). Use a casing variant ("OLLAMA", " ollama ") to exercise the + // isOllamaProviderPinned normalization path. (#4365) + providerEnv?: string; }; type WizardResult = { @@ -74,9 +90,36 @@ function runOllamaAutostartScenario(opts: ScenarioOptions): WizardResult { // which is what gates the wizard — the curl stub itself stays permissive. 
const toolCallBody = '{"choices":[{"message":{"role":"assistant","content":"","tool_calls":[{"type":"function","function":{"name":"emit_ok","arguments":"{\\"ok\\":true}"}}]}}]}'; + // /api/generate body — validateOllamaModel parses this and looks for an + // `error` key. A healthy default; #4365 scenarios override with a runner- + // crash payload to drive the daemonFailure path. + const generateBody = opts.ollamaGenerateBody ?? '{"response":"hello"}'; + // /api/tags body. local.ts destructures `runCapture` at module load time, + // BEFORE the test mutates runner.runCapture — so `getOllamaModelOptions` + // (in local.ts) still calls through to the real spawnSync and lands on + // this fake curl. Returning a tag matching the bootstrap fallback + // (smallest registry entry) keeps the menu deterministic with gpu=null + // and ensures the picked model is already-installed (no pull prompt). + const tagsBody = '{"models":[{"name":"qwen2.5:7b"}]}'; fs.writeFileSync( path.join(fakeBin, "curl"), `#!/usr/bin/env bash +url="" +for arg in "$@"; do + case "$arg" in + http://*|https://*) url="$arg" ;; + esac +done +case "$url" in + *api/generate*) + printf '%s' '${generateBody.replace(/'/g, "'\\''")}' + exit 0 + ;; + *api/tags*) + printf '%s' '${tagsBody.replace(/'/g, "'\\''")}' + exit 0 + ;; +esac body='${toolCallBody}' status="200" outfile="" @@ -99,7 +142,7 @@ printf '%s' "$status" PATH: `${fakeBin}:${process.env.PATH || ""}`, // Pin provider selection so the test deterministically enters the Ollama // branch of the wizard regardless of menu ordering changes elsewhere. - NEMOCLAW_PROVIDER: "ollama", + NEMOCLAW_PROVIDER: opts.providerEnv ?? "ollama", }; if (opts.noAutostartEnv) scenarioEnv.NEMOCLAW_OLLAMA_NO_AUTOSTART = "1"; if (opts.nonInteractive) scenarioEnv.NEMOCLAW_NON_INTERACTIVE = "1"; @@ -197,12 +240,18 @@ localInference.findReachableOllamaHost = () => (ollamaRunning ? 
"127.0.0.1" : nu // sentinel here bails out of the wizard once it has done everything that // matters for the gated-vs-spawn assertions. The fallback branch breaks out // of selectionLoop BEFORE this is reached, so Scenarios A and D never see -// the sentinel — only B and C do. +// the sentinel — only B and C do. The #4365 scenarios opt out so the wizard +// can reach selectAndValidateOllamaModel. const proxy = require(${proxyPath}); class OllamaAutostartSentinel extends Error {} -proxy.startOllamaAuthProxy = () => { - throw new OllamaAutostartSentinel("ollama-autostart-test-sentinel"); -}; +const proceedToModelSelection = ${JSON.stringify(opts.proceedToModelSelection === true)}; +if (proceedToModelSelection) { + proxy.startOllamaAuthProxy = () => true; +} else { + proxy.startOllamaAuthProxy = () => { + throw new OllamaAutostartSentinel("ollama-autostart-test-sentinel"); + }; +} // Wrap selectAndValidateOllamaModel to record whether the wizard reached it. // Access via the dist module's exported function (it's local in source, but @@ -276,6 +325,9 @@ process.exit = (code) => { NEMOCLAW_MODEL: "", NEMOCLAW_YES: "", }, + // Bound the subprocess so a pre-fix runner-crash loop cannot wedge + // the test runner. (#4365) + timeout: opts.subprocessTimeoutMs ?? OLLAMA_AUTOSTART_TEST_TIMEOUT_MS, }); assert.equal(result.status, 0, `subprocess stderr:\n${result.stderr}\n\nstdout:\n${result.stdout}`); @@ -482,6 +534,82 @@ describe("nemoclaw onboard --no-ollama-autostart (issue #3751)", () => { }, ); + it( + "Scenario G (#4365): pinned-provider runner crash exits instead of looping on Ollama model selection", + { timeout: OLLAMA_AUTOSTART_TEST_TIMEOUT_MS }, + () => { + // Reporter's second-step: Ollama responds, user reaches model selection, + // but the model runner has unexpectedly stopped. Pre-fix the wizard would + // re-prompt for another Ollama model forever (or until the user finds + // "back"). 
With the fix, daemonFailure is detected and the wizard exits + // when NEMOCLAW_PROVIDER=ollama is pinned. The bounded subprocess + // timeout catches a regression: a pre-fix subprocess would loop until + // SIGTERM and result.status would be null. + const payload = runOllamaAutostartScenario({ + ollamaRunning: true, + noAutostartEnv: false, + proceedToModelSelection: true, + ollamaGenerateBody: JSON.stringify({ + error: "model runner has unexpectedly stopped, this may be due to resource limitations or an internal error", + }), + subprocessTimeoutMs: 20_000, + }); + + assert.ok( + payload.lines.some((line) => + line.includes("model runner has unexpectedly stopped"), + ), + `expected the runner-crash error in lines; got:\n${payload.lines.join("\n")}`, + ); + assert.ok( + payload.lines.some((line) => + line.includes( + "NEMOCLAW_PROVIDER pins onboarding to Ollama but the Ollama model runner is unhealthy", + ), + ), + `expected the pinned-provider runner-crash abort message; lines:\n${payload.lines.join("\n")}`, + ); + assert.ok( + payload.processExitCalled >= 1, + `expected process.exit on runner crash with pinned provider; lines:\n${payload.lines.join("\n")}`, + ); + }, + ); + + it( + "Scenario H (#4365): pinned-provider runner crash also exits when NEMOCLAW_PROVIDER uses a casing variant", + { timeout: OLLAMA_AUTOSTART_TEST_TIMEOUT_MS }, + () => { + // NEMOCLAW_PROVIDER=OLLAMA is accepted by getNonInteractiveProvider's + // .trim().toLowerCase() normalization. The runner-crash escape must + // recognize the same variants — otherwise the wizard would return + // `back-to-selection`, re-pin Ollama on the next iteration, and loop. 
+ const payload = runOllamaAutostartScenario({ + ollamaRunning: true, + noAutostartEnv: false, + proceedToModelSelection: true, + providerEnv: " OLLAMA ", + ollamaGenerateBody: JSON.stringify({ + error: "model runner has unexpectedly stopped", + }), + subprocessTimeoutMs: 20_000, + }); + + assert.ok( + payload.lines.some((line) => + line.includes( + "NEMOCLAW_PROVIDER pins onboarding to Ollama but the Ollama model runner is unhealthy", + ), + ), + `expected the pinned-provider runner-crash abort even with a casing variant; lines:\n${payload.lines.join("\n")}`, + ); + assert.ok( + payload.processExitCalled >= 1, + `expected process.exit on runner crash with casing-variant pinned provider; lines:\n${payload.lines.join("\n")}`, + ); + }, + ); + it( "Scenario E: stopped Ollama + flag NOT set + NEMOCLAW_PROVIDER=ollama + waitForHttp timeout → process.exit, no selectionLoop re-entry", { timeout: OLLAMA_AUTOSTART_TEST_TIMEOUT_MS }, @@ -504,7 +632,7 @@ describe("nemoclaw onboard --no-ollama-autostart (issue #3751)", () => { ); assert.ok( payload.lines.some((line) => - line.includes("NEMOCLAW_PROVIDER=ollama is pinned but Ollama is unreachable"), + line.includes("NEMOCLAW_PROVIDER pins onboarding to Ollama but Ollama is unreachable"), ), `expected pinned-provider abort message; lines:\n${payload.lines.join("\n")}`, );