diff --git a/docs/inference/switch-inference-providers.mdx b/docs/inference/switch-inference-providers.mdx index 2c96efff9f..7c276df638 100644 --- a/docs/inference/switch-inference-providers.mdx +++ b/docs/inference/switch-inference-providers.mdx @@ -132,6 +132,7 @@ To change these values, set the corresponding environment variables before runni | `NEMOCLAW_AGENT_HEARTBEAT_EVERY` | Go-style duration (`30m`, `1h`, `0m` to disable) | `unset` (OpenClaw default) | Invalid values are ignored, and the default bakes into the image. +For Local Ollama, onboarding loads the selected model first and uses Ollama's reported runtime context length when `NEMOCLAW_CONTEXT_WINDOW` is unset. Use `NEMOCLAW_INFERENCE_INPUTS=text,image` only for a model that accepts image input through the selected provider. ```console diff --git a/docs/inference/use-local-inference.mdx b/docs/inference/use-local-inference.mdx index d6dfedc50c..922685caf5 100644 --- a/docs/inference/use-local-inference.mdx +++ b/docs/inference/use-local-inference.mdx @@ -73,6 +73,7 @@ NemoClaw lists installed models or offers starter models if none are installed. On hosts where the larger starter models fit the currently available GPU memory, the starter list includes `qwen3.6:35b` and selects it by default. When another GPU workload is using most of the memory at onboard time, NemoClaw downgrades the menu to the largest model that still fits. It pulls the selected model, loads it into memory, and validates it before continuing. +When Ollama reports a loaded-model context length, NemoClaw uses that value for the `contextWindow` baked into `openclaw.json` unless you set `NEMOCLAW_CONTEXT_WINDOW` yourself. If the selected model declares that it does not support tool calling, onboarding stops with guidance to choose a model whose `ollama show <model>` capabilities include `tools`. The validation also requires structured chat-completions tool calls. 
If the model leaks tool-call JSON as plain message text, onboarding stops so you can choose a model that returns tool calls in the expected response field. diff --git a/src/lib/inference/local.test.ts b/src/lib/inference/local.test.ts index a8475a0b14..ac106f89b6 100644 --- a/src/lib/inference/local.test.ts +++ b/src/lib/inference/local.test.ts @@ -36,7 +36,6 @@ import { getOllamaWarmupCommand, parseOllamaList, parseOllamaTags, - probeOllamaRuntimeModelStatus, probeLocalProviderHealth, validateOllamaModel, validateLocalProvider, @@ -654,6 +653,7 @@ describe("local inference helpers", () => { null, { type: "nvidia", totalMemoryMB: 131_072, availableMemoryMB: 131_072 }, log, + () => "", ), ).toBe(QWEN3_6_OLLAMA_MODEL); }); @@ -686,6 +686,7 @@ describe("local inference helpers", () => { null, { type: "nvidia", totalMemoryMB: 16_384, availableMemoryMB: 4_000 }, log, + () => "", ), ).toBe("qwen2.5:7b"); expect(messages.some((m) => m.includes("No known Ollama bootstrap model fits"))).toBe(true); @@ -793,23 +794,6 @@ describe("local inference helpers", () => { expect(validateOllamaModel("nemotron-3-nano:30b", () => "ok", undefined, captureEx)).toEqual({ ok: true }); }); - it("parses Ollama runtime status from /api/ps", () => { - const capture = () => - JSON.stringify({ - models: [ - { name: "qwen3.6:35b", size_vram: 0, processor: "100% CPU" }, - ], - }); - - expect(probeOllamaRuntimeModelStatus("qwen3.6:35b", capture)).toEqual({ - probed: true, - loaded: true, - cpuOnly: true, - processor: "100% CPU", - sizeVram: 0, - }); - }); - it("fails Spark Ollama validation when the model is CPU-only after warmup", () => { const payload = JSON.stringify({ model: "qwen3.6:35b", response: "hello", done: true }); const psOutput = JSON.stringify({ diff --git a/src/lib/inference/local.ts b/src/lib/inference/local.ts index 2c9f0a95f0..e79c8a3bd4 100644 --- a/src/lib/inference/local.ts +++ b/src/lib/inference/local.ts @@ -14,6 +14,16 @@ import { runCurlProbe } from 
"../adapters/http/probe"; import type { ContainerRuntime } from "../platform"; import type { CaptureResult } from "../runner"; import { buildSubprocessEnv } from "../subprocess-env"; +import { + applyOllamaRuntimeContextWindow as applyOllamaRuntimeContextWindowWithHost, + MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW, + parsePositiveInteger, + probeOllamaRuntimeModelStatus as probeOllamaRuntimeModelStatusWithHost, + resetOllamaRuntimeContextWindowAutoState, + resolveOllamaRuntimeContextWindow as resolveOllamaRuntimeContextWindowWithHost, +} from "./ollama-runtime-context"; +import type { OllamaRuntimeModelStatus } from "./ollama-runtime-context"; +export type { OllamaRuntimeModelStatus } from "./ollama-runtime-context"; const { shellQuote, runCapture, runCaptureEx } = require("../runner"); @@ -666,67 +676,32 @@ export function parseOllamaTags(output: string | null | undefined): string[] { } } -export interface OllamaRuntimeModelStatus { - probed: boolean; - loaded: boolean; - cpuOnly: boolean; - processor?: string; - sizeVram?: number; -} - -function normalizeOllamaModelName(value: unknown): string { - return String(value || "").trim(); -} +export { MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW, parsePositiveInteger }; export function probeOllamaRuntimeModelStatus( model: string, runCaptureImpl?: RunCaptureFn, ): OllamaRuntimeModelStatus { - const capture = runCaptureImpl ?? runCapture; - const host = getResolvedOllamaHost(); - const output = capture( - [ - "curl", - "-sf", - "--connect-timeout", - "3", - "--max-time", - "5", - `http://${host}:${OLLAMA_PORT}/api/ps`, - ], - { ignoreError: true }, - ); - if (!output) return { probed: false, loaded: false, cpuOnly: false }; + return probeOllamaRuntimeModelStatusWithHost(model, getResolvedOllamaHost, runCaptureImpl); +} - try { - const parsed = JSON.parse(String(output || "")); - const models = Array.isArray(parsed?.models) ? 
parsed.models : []; - const target = normalizeOllamaModelName(model); - const loaded = models.find((entry: { name?: unknown; model?: unknown }) => { - return ( - normalizeOllamaModelName(entry?.name) === target || - normalizeOllamaModelName(entry?.model) === target - ); - }); - if (!loaded) return { probed: true, loaded: false, cpuOnly: false }; +export function resolveOllamaRuntimeContextWindow( + model: string, + currentContextWindow: string | null | undefined = null, + runCaptureImpl?: RunCaptureFn, +): number | null { + return resolveOllamaRuntimeContextWindowWithHost( + model, + currentContextWindow, + getResolvedOllamaHost, + runCaptureImpl, + ); +} - const rawSizeVram = Number((loaded as { size_vram?: unknown }).size_vram); - const hasSizeVram = Number.isFinite(rawSizeVram); - const processor = normalizeOllamaModelName((loaded as { processor?: unknown }).processor); - const mentionsGpu = /\bGPU\b/i.test(processor); - const processorCpuOnly = /\bCPU\b/i.test(processor) && !mentionsGpu; - const sizeVramCpuOnly = hasSizeVram && rawSizeVram === 0 && !mentionsGpu; +export { resetOllamaRuntimeContextWindowAutoState }; - return { - probed: true, - loaded: true, - cpuOnly: processorCpuOnly || sizeVramCpuOnly, - ...(processor ? { processor } : {}), - ...(hasSizeVram ? 
{ sizeVram: rawSizeVram } : {}), - }; - } catch { - return { probed: true, loaded: false, cpuOnly: false }; - } +export function applyOllamaRuntimeContextWindow(selectedModel: string): void { + applyOllamaRuntimeContextWindowWithHost(selectedModel, getResolvedOllamaHost); } function formatOllamaCpuOnlyDiagnostic(model: string, status: OllamaRuntimeModelStatus): string { @@ -796,6 +771,7 @@ export function resolveNonInteractiveOllamaModel( recoveredModel: string | null, gpu: GpuInfo | null, log: (message: string) => void = (m) => console.warn(m), + runCaptureImpl?: RunCaptureFn, ): string { const explicit = requestedModel || recoveredModel; if (explicit && !modelFitsAvailableMemory(explicit, gpu)) { @@ -812,7 +788,7 @@ export function resolveNonInteractiveOllamaModel( if (!explicit && !anyRegistryModelFits(gpu)) { warnNoBootstrapModelFits(gpu, log); } - return explicit || getDefaultOllamaModel(gpu); + return explicit || getDefaultOllamaModel(gpu, runCaptureImpl); } function warnNoBootstrapModelFits( diff --git a/src/lib/inference/ollama-runtime-context.test.ts b/src/lib/inference/ollama-runtime-context.test.ts new file mode 100644 index 0000000000..7cee94a59f --- /dev/null +++ b/src/lib/inference/ollama-runtime-context.test.ts @@ -0,0 +1,141 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +import { afterEach, describe, expect, it } from "vitest"; + +import { + applyOllamaRuntimeContextWindow, + parseOllamaRuntimeContextLength, + probeOllamaRuntimeModelStatus, + resetOllamaRuntimeContextWindowAutoState, + resolveOllamaRuntimeContextWindow, +} from "../../../dist/lib/inference/ollama-runtime-context"; + +const getOllamaHost = () => "127.0.0.1"; + +describe("Ollama runtime context helpers", () => { + afterEach(() => { + resetOllamaRuntimeContextWindowAutoState(); + }); + + it("parses valid Ollama /api/ps context lengths", () => { + expect(parseOllamaRuntimeContextLength(262144)).toEqual({ contextLength: 262144 }); + expect(parseOllamaRuntimeContextLength("262144")).toEqual({ contextLength: 262144 }); + }); + + it("treats omitted Ollama /api/ps context lengths as compatibility no-ops", () => { + expect(parseOllamaRuntimeContextLength(undefined)).toEqual({}); + expect(parseOllamaRuntimeContextLength(null)).toEqual({}); + expect(parseOllamaRuntimeContextLength(" ")).toEqual({}); + + const status = probeOllamaRuntimeModelStatus( + "qwen3.6:35b", + getOllamaHost, + () => JSON.stringify({ models: [{ name: "qwen3.6:35b", processor: "100% GPU" }] }), + ); + + expect(status.loaded).toBe(true); + expect(status.contextLength).toBeUndefined(); + expect(status.contextLengthWarning).toBeUndefined(); + expect( + resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () => + JSON.stringify({ models: [{ name: "qwen3.6:35b" }] }), + ), + ).toBeNull(); + }); + + it("warns and ignores malformed or non-positive Ollama /api/ps context lengths", () => { + for (const value of ["bogus", "1.5", 0, -1]) { + const parsed = parseOllamaRuntimeContextLength(value); + expect(parsed.contextLength).toBeUndefined(); + expect(parsed.warning).toContain("non-positive or malformed context_length"); + } + + const status = probeOllamaRuntimeModelStatus( + "qwen3.6:35b", + getOllamaHost, + () => JSON.stringify({ models: [{ name: 
"qwen3.6:35b", context_length: "bogus" }] }), + ); + + expect(status.loaded).toBe(true); + expect(status.contextLength).toBeUndefined(); + expect(status.contextLengthWarning).toContain("non-positive or malformed context_length"); + }); + + it("warns and ignores implausibly large Ollama /api/ps context lengths", () => { + const parsed = parseOllamaRuntimeContextLength(10_000_000); + expect(parsed.contextLength).toBeUndefined(); + expect(parsed.warning).toContain("above NemoClaw's auto-detect ceiling"); + + const status = probeOllamaRuntimeModelStatus( + "qwen3.6:35b", + getOllamaHost, + () => JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }), + ); + + expect(status.loaded).toBe(true); + expect(status.contextLength).toBeUndefined(); + expect(status.contextLengthWarning).toContain("above NemoClaw's auto-detect ceiling"); + expect( + resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () => + JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }), + ), + ).toBeNull(); + }); + + it("resolves runtime context length only when no explicit override is set", () => { + const capture = () => + JSON.stringify({ + models: [{ name: "qwen3.6:35b", context_length: "262144", processor: "100% GPU" }], + }); + + expect( + resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, capture), + ).toBe(262144); + expect( + resolveOllamaRuntimeContextWindow("qwen3.6:35b", "131072", getOllamaHost, capture), + ).toBeNull(); + expect( + resolveOllamaRuntimeContextWindow("qwen3.6:35b", "bogus", getOllamaHost, capture), + ).toBeNull(); + expect( + resolveOllamaRuntimeContextWindow("qwen3.6:35b", " ", getOllamaHost, capture), + ).toBe(262144); + expect( + resolveOllamaRuntimeContextWindow("other:model", null, getOllamaHost, capture), + ).toBeNull(); + }); + + it("applies and clears only auto-detected context window state", () => { + const env: NodeJS.ProcessEnv = {}; + const messages: string[] = []; + let 
models: Array<{ name: string; context_length?: number }> = []; + const options = { + env, + logger: { + log: (message: string) => messages.push(message), + warn: (message: string) => messages.push(message), + }, + runCaptureImpl: () => JSON.stringify({ models }), + }; + + models = [{ name: "qwen3.6:35b", context_length: 262144 }]; + applyOllamaRuntimeContextWindow("qwen3.6:35b", getOllamaHost, options); + expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144"); + + models = [{ name: "qwen2.5:7b", context_length: 32768 }]; + applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options); + expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("32768"); + + models = []; + applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options); + expect(env.NEMOCLAW_CONTEXT_WINDOW).toBeUndefined(); + + resetOllamaRuntimeContextWindowAutoState(); + env.NEMOCLAW_CONTEXT_WINDOW = "262144"; + models = [{ name: "qwen2.5:7b", context_length: 32768 }]; + applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options); + expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144"); + expect(messages.at(-1)).toContain("Keeping configured context window"); + }); +}); diff --git a/src/lib/inference/ollama-runtime-context.ts b/src/lib/inference/ollama-runtime-context.ts new file mode 100644 index 0000000000..0ef1b27b11 --- /dev/null +++ b/src/lib/inference/ollama-runtime-context.ts @@ -0,0 +1,219 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Ollama runtime context-window helpers. + * + * Keep this module focused on data coming from Ollama's `/api/ps` runtime + * boundary. Onboarding should call the narrow wrappers in `local.ts` instead + * of re-implementing parsing or process-env state handling. 
+ */ + +import { OLLAMA_PORT } from "../core/ports"; + +const { runCapture } = require("../runner"); + +export type OllamaRuntimeRunCaptureFn = ( + cmd: string | string[], + opts?: { ignoreError?: boolean }, +) => string; + +export interface OllamaRuntimeModelStatus { + probed: boolean; + loaded: boolean; + cpuOnly: boolean; + contextLength?: number; + contextLengthWarning?: string; + processor?: string; + sizeVram?: number; +} + +export interface ApplyOllamaRuntimeContextWindowOptions { + env?: NodeJS.ProcessEnv; + logger?: Pick<Console, "log" | "warn">; + runCaptureImpl?: OllamaRuntimeRunCaptureFn; +} + +// Four million tokens is intentionally above today's practical local-model +// context windows while still rejecting obviously broken daemon responses. +export const MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW = 4_194_304; + +function normalizeOllamaModelName(value: unknown): string { + return String(value || "").trim(); +} + +export function parsePositiveInteger(value: unknown): number | null { + if (typeof value === "number") { + return Number.isSafeInteger(value) && value > 0 ? value : null; + } + const raw = String(value ?? "").trim(); + if (!/^[1-9][0-9]*$/.test(raw)) return null; + const parsed = Number(raw); + return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : null; +} + +export function hasExplicitContextWindow(value: unknown): boolean { + return String(value ?? "").trim() !== ""; +} + +/** + * Parse Ollama `/api/ps` `context_length` defensively. + * + * Source boundary: `context_length` is produced by the user-managed Ollama + * daemon outside this repository. NemoClaw can validate before consuming it, + * but this PR cannot make every installed daemon report a value or enforce a + * stricter schema at the producer. + * + * Tolerated invalid states: older daemons omitting the field, empty values, + * non-integer/malformed values, non-positive values, unsafe integers, and + * values above NemoClaw's auto-detect ceiling. 
Missing values are a silent + * compatibility no-op; malformed or implausible values return a warning and + * fall back to the existing NEMOCLAW_CONTEXT_WINDOW/default path. + * + * Regression coverage lives in `ollama-runtime-context.test.ts` for omitted, + * malformed, non-positive, valid string/number, and over-ceiling responses. + * Remove this fallback once NemoClaw requires an Ollama daemon contract that + * always reports a validated positive integer `context_length` for loaded + * models. + */ +export function parseOllamaRuntimeContextLength(value: unknown): { + contextLength?: number; + warning?: string; +} { + if (value === undefined || value === null || String(value).trim() === "") { + return {}; + } + const parsed = parsePositiveInteger(value); + if (!parsed) { + return { + warning: `Ollama /api/ps returned a non-positive or malformed context_length (${String(value)}); ignoring it.`, + }; + } + if (parsed > MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW) { + return { + warning: + `Ollama /api/ps returned context_length=${parsed}, above NemoClaw's ` + + `auto-detect ceiling (${MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW}); ignoring it.`, + }; + } + return { contextLength: parsed }; +} + +export function probeOllamaRuntimeModelStatus( + model: string, + getOllamaHost: () => string, + runCaptureImpl?: OllamaRuntimeRunCaptureFn, +): OllamaRuntimeModelStatus { + const capture = runCaptureImpl ?? runCapture; + const host = getOllamaHost(); + const output = capture( + [ + "curl", + "-sf", + "--connect-timeout", + "3", + "--max-time", + "5", + `http://${host}:${OLLAMA_PORT}/api/ps`, + ], + { ignoreError: true }, + ); + if (!output) return { probed: false, loaded: false, cpuOnly: false }; + + try { + const parsed = JSON.parse(String(output || "")); + const models = Array.isArray(parsed?.models) ? 
parsed.models : []; + const target = normalizeOllamaModelName(model); + const loaded = models.find((entry: { name?: unknown; model?: unknown }) => { + return ( + normalizeOllamaModelName(entry?.name) === target || + normalizeOllamaModelName(entry?.model) === target + ); + }); + if (!loaded) return { probed: true, loaded: false, cpuOnly: false }; + + const rawSizeVram = Number((loaded as { size_vram?: unknown }).size_vram); + const hasSizeVram = Number.isFinite(rawSizeVram); + const contextLengthResult = parseOllamaRuntimeContextLength( + (loaded as { context_length?: unknown }).context_length, + ); + const processor = normalizeOllamaModelName((loaded as { processor?: unknown }).processor); + const mentionsGpu = /\bGPU\b/i.test(processor); + const processorCpuOnly = /\bCPU\b/i.test(processor) && !mentionsGpu; + const sizeVramCpuOnly = hasSizeVram && rawSizeVram === 0 && !mentionsGpu; + + return { + probed: true, + loaded: true, + cpuOnly: processorCpuOnly || sizeVramCpuOnly, + ...(contextLengthResult.contextLength + ? { contextLength: contextLengthResult.contextLength } + : {}), + ...(contextLengthResult.warning + ? { contextLengthWarning: contextLengthResult.warning } + : {}), + ...(processor ? { processor } : {}), + ...(hasSizeVram ? { sizeVram: rawSizeVram } : {}), + }; + } catch { + return { probed: true, loaded: false, cpuOnly: false }; + } +} + +export function resolveOllamaRuntimeContextWindow( + model: string, + currentContextWindow: string | null | undefined, + getOllamaHost: () => string, + runCaptureImpl?: OllamaRuntimeRunCaptureFn, +): number | null { + if (hasExplicitContextWindow(currentContextWindow)) return null; + const runtimeStatus = probeOllamaRuntimeModelStatus(model, getOllamaHost, runCaptureImpl); + return runtimeStatus.loaded ? (runtimeStatus.contextLength ?? 
null) : null; +} + +let autoDetectedOllamaContextWindow: string | null = null; + +export function resetOllamaRuntimeContextWindowAutoState(): void { + autoDetectedOllamaContextWindow = null; +} + +export function applyOllamaRuntimeContextWindow( + selectedModel: string, + getOllamaHost: () => string, + options: ApplyOllamaRuntimeContextWindowOptions = {}, +): void { + const env = options.env ?? process.env; + const logger = options.logger ?? console; + const currentContextWindow = env.NEMOCLAW_CONTEXT_WINDOW; + const currentIsPreviousAuto = + !!currentContextWindow && + !!autoDetectedOllamaContextWindow && + currentContextWindow === autoDetectedOllamaContextWindow; + const userContextWindow = currentIsPreviousAuto ? null : currentContextWindow; + + if (hasExplicitContextWindow(userContextWindow)) { + logger.log(` ℹ Keeping configured context window: ${userContextWindow} tokens`); + return; + } + + const runtimeStatus = probeOllamaRuntimeModelStatus( + selectedModel, + getOllamaHost, + options.runCaptureImpl, + ); + if (runtimeStatus.contextLengthWarning) { + logger.warn(` ⚠ ${runtimeStatus.contextLengthWarning}`); + } + if (runtimeStatus.loaded && runtimeStatus.contextLength) { + const value = String(runtimeStatus.contextLength); + env.NEMOCLAW_CONTEXT_WINDOW = value; + autoDetectedOllamaContextWindow = value; + logger.log(` ✓ Using Ollama runtime context length: ${value} tokens`); + return; + } + + if (currentIsPreviousAuto) { + delete env.NEMOCLAW_CONTEXT_WINDOW; + autoDetectedOllamaContextWindow = null; + } +} diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 54799bc82a..dcb5c89baa 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -4111,7 +4111,6 @@ const { readLiveInference, readRecordedProvider, readRecordedNimContainer, readR type OllamaModelSelectionOutcome = | { outcome: "selected"; model: string } | { outcome: "back-to-selection" }; - // Pick an Ollama model, pull it if missing, and validate it via the local // proxy. 
Shared by the three Ollama provider branches (running, Windows-host // install/start, install-locally). Returns "back-to-selection" so the caller @@ -4201,6 +4200,7 @@ async function selectAndValidateOllamaModel( " ℹ Using chat completions API (Ollama tool calls require /v1/chat/completions)", ); } + localInference.applyOllamaRuntimeContextWindow(selectedModel); return { outcome: "selected", model: selectedModel }; } } diff --git a/test/onboard-selection.test.ts b/test/onboard-selection.test.ts index 6ef3bb11f1..e13d9ebfdc 100644 --- a/test/onboard-selection.test.ts +++ b/test/onboard-selection.test.ts @@ -1484,6 +1484,11 @@ runner.runCapture = (command) => { if (cmd.includes("127.0.0.1:11434/api/tags")) return JSON.stringify({ models: [{ name: "nemotron-3-nano:30b" }] }); if (cmd.includes("ollama list")) return "nemotron-3-nano:30b abc 24 GB now"; if (cmd.includes("127.0.0.1:8000/v1/models")) return ""; + if (cmd.includes("127.0.0.1:11434/api/ps")) { + return JSON.stringify({ + models: [{ name: "nemotron-3-nano:30b", context_length: 262144 }], + }); + } if (cmd.includes("api/generate")) return '{"response":"hello"}'; if (cmd.includes("-o args=")) return "node ollama-auth-proxy.js"; return ""; @@ -1499,7 +1504,15 @@ const { setupNim } = require(${onboardPath}); console.error = (...args) => lines.push(args.join(" ")); try { const result = await setupNim(null); - originalLog(JSON.stringify({ result, messages, lines, commands })); + originalLog( + JSON.stringify({ + result, + messages, + lines, + commands, + contextWindow: process.env.NEMOCLAW_CONTEXT_WINDOW, + }), + ); } finally { console.log = originalLog; console.error = originalError; @@ -1518,6 +1531,7 @@ const { setupNim } = require(${onboardPath}); ...process.env, HOME: tmpDir, PATH: `${fakeBin}:${process.env.PATH || ""}`, + NEMOCLAW_CONTEXT_WINDOW: "", }, }); @@ -1550,6 +1564,91 @@ const { setupNim } = require(${onboardPath}); command.includes("http://127.0.0.1:11434/api/generate"), ), ); + 
assert.equal(payload.contextWindow, "262144"); + }); + + it("re-resolves auto-detected Ollama context windows across model selections", () => { + const repoRoot = path.join(import.meta.dirname, ".."); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-ollama-context-")); + const scriptPath = path.join(tmpDir, "ollama-context-check.js"); + const localInferencePath = JSON.stringify(path.join(repoRoot, "dist", "lib", "inference", "local.js")); + const runnerPath = JSON.stringify(path.join(repoRoot, "dist", "lib", "runner.js")); + + const script = String.raw` +const runner = require(${runnerPath}); + +let models = []; +runner.runCapture = (command) => { + const rendered = Array.isArray(command) ? command.join(" ") : command; + if (rendered.includes("/api/ps")) { + return JSON.stringify({ models }); + } + return ""; +}; + +const { + applyOllamaRuntimeContextWindow, + resetOllamaRuntimeContextWindowAutoState, +} = require(${localInferencePath}); + +const result = {}; +const originalWarn = console.warn; +const originalLog = console.log; +console.warn = () => {}; +console.log = () => {}; +try { + resetOllamaRuntimeContextWindowAutoState(); + delete process.env.NEMOCLAW_CONTEXT_WINDOW; + + models = [{ name: "qwen3.6:35b", context_length: 262144 }]; + applyOllamaRuntimeContextWindow("qwen3.6:35b"); + result.initial = process.env.NEMOCLAW_CONTEXT_WINDOW || null; + + models = [{ name: "qwen2.5:7b", context_length: 32768 }]; + applyOllamaRuntimeContextWindow("qwen2.5:7b"); + result.updated = process.env.NEMOCLAW_CONTEXT_WINDOW || null; + + models = []; + applyOllamaRuntimeContextWindow("qwen2.5:7b"); + result.cleared = process.env.NEMOCLAW_CONTEXT_WINDOW || null; + + resetOllamaRuntimeContextWindowAutoState(); + process.env.NEMOCLAW_CONTEXT_WINDOW = "262144"; + models = [{ name: "qwen2.5:7b", context_length: 32768 }]; + applyOllamaRuntimeContextWindow("qwen2.5:7b"); + result.userOverride = process.env.NEMOCLAW_CONTEXT_WINDOW || null; + + 
resetOllamaRuntimeContextWindowAutoState(); + process.env.NEMOCLAW_CONTEXT_WINDOW = "bogus"; + models = [{ name: "qwen2.5:7b", context_length: 32768 }]; + applyOllamaRuntimeContextWindow("qwen2.5:7b"); + result.invalidOverride = process.env.NEMOCLAW_CONTEXT_WINDOW || null; +} finally { + console.warn = originalWarn; + console.log = originalLog; +} + +console.log(JSON.stringify(result)); +`; + fs.writeFileSync(scriptPath, script); + + const result = spawnSync(process.execPath, [scriptPath], { + cwd: repoRoot, + encoding: "utf-8", + env: { + ...process.env, + HOME: tmpDir, + }, + }); + + assert.equal(result.status, 0, result.stderr); + assert.deepEqual(JSON.parse(result.stdout.trim()), { + initial: "262144", + updated: "32768", + cleared: null, + userOverride: "262144", + invalidOverride: "bogus", + }); }); it("starts managed Ollama on loopback before exposing the auth proxy", () => {