NVIDIA · cv · May 27, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/docs/inference/switch-inference-providers.mdx b/docs/inference/switch-inference-providers.mdx
@@ -132,6 +132,7 @@ To change these values, set the corresponding environment variables before runni
 | `NEMOCLAW_AGENT_HEARTBEAT_EVERY` | Go-style duration (`30m`, `1h`, `0m` to disable) | `unset` (OpenClaw default) |
 
 Invalid values are ignored, and the default bakes into the image.
+For Local Ollama, onboarding loads the selected model first and, when `NEMOCLAW_CONTEXT_WINDOW` is unset, uses the runtime context length that Ollama reports for the loaded model as the context window.
 Use `NEMOCLAW_INFERENCE_INPUTS=text,image` only for a model that accepts image input through the selected provider.
 
 ```console

diff --git a/docs/inference/use-local-inference.mdx b/docs/inference/use-local-inference.mdx
@@ -73,6 +73,7 @@ NemoClaw lists installed models or offers starter models if none are installed.
 On hosts where the larger starter models fit the currently available GPU memory, the starter list includes `qwen3.6:35b` and selects it by default.
 When another GPU workload is using most of the memory at onboard time, NemoClaw downgrades the menu to the largest model that still fits.
 It pulls the selected model, loads it into memory, and validates it before continuing.
+When Ollama reports a context length for the loaded model, NemoClaw uses that value for the `contextWindow` baked into `openclaw.json` unless you set `NEMOCLAW_CONTEXT_WINDOW` yourself or the reported value is malformed, non-positive, or above NemoClaw's auto-detect ceiling.
 If the selected model declares that it does not support tool calling, onboarding stops with guidance to choose a model whose `ollama show <model>` capabilities include `tools`.
 The validation also requires structured chat-completions tool calls.
 If the model leaks tool-call JSON as plain message text, onboarding stops so you can choose a model that returns tool calls in the expected response field.

diff --git a/src/lib/inference/local.test.ts b/src/lib/inference/local.test.ts
@@ -36,7 +36,6 @@ import {
   getOllamaWarmupCommand,
   parseOllamaList,
   parseOllamaTags,
-  probeOllamaRuntimeModelStatus,
   probeLocalProviderHealth,
   validateOllamaModel,
   validateLocalProvider,
@@ -654,6 +653,7 @@ describe("local inference helpers", () => {
         null,
         { type: "nvidia", totalMemoryMB: 131_072, availableMemoryMB: 131_072 },
         log,
+        () => "",
       ),
     ).toBe(QWEN3_6_OLLAMA_MODEL);
   });
@@ -686,6 +686,7 @@ describe("local inference helpers", () => {
         null,
         { type: "nvidia", totalMemoryMB: 16_384, availableMemoryMB: 4_000 },
         log,
+        () => "",
       ),
     ).toBe("qwen2.5:7b");
     expect(messages.some((m) => m.includes("No known Ollama bootstrap model fits"))).toBe(true);
@@ -793,23 +794,6 @@ describe("local inference helpers", () => {
     expect(validateOllamaModel("nemotron-3-nano:30b", () => "ok", undefined, captureEx)).toEqual({ ok: true });
   });
 
-  it("parses Ollama runtime status from /api/ps", () => {
-    const capture = () =>
-      JSON.stringify({
-        models: [
-          { name: "qwen3.6:35b", size_vram: 0, processor: "100% CPU" },
-        ],
-      });
-
-    expect(probeOllamaRuntimeModelStatus("qwen3.6:35b", capture)).toEqual({
-      probed: true,
-      loaded: true,
-      cpuOnly: true,
-      processor: "100% CPU",
-      sizeVram: 0,
-    });
-  });
-
   it("fails Spark Ollama validation when the model is CPU-only after warmup", () => {
     const payload = JSON.stringify({ model: "qwen3.6:35b", response: "hello", done: true });
     const psOutput = JSON.stringify({

diff --git a/src/lib/inference/local.ts b/src/lib/inference/local.ts
@@ -14,6 +14,16 @@ import { runCurlProbe } from "../adapters/http/probe";
 import type { ContainerRuntime } from "../platform";
 import type { CaptureResult } from "../runner";
 import { buildSubprocessEnv } from "../subprocess-env";
+import {
+  applyOllamaRuntimeContextWindow as applyOllamaRuntimeContextWindowWithHost,
+  MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW,
+  parsePositiveInteger,
+  probeOllamaRuntimeModelStatus as probeOllamaRuntimeModelStatusWithHost,
+  resetOllamaRuntimeContextWindowAutoState,
+  resolveOllamaRuntimeContextWindow as resolveOllamaRuntimeContextWindowWithHost,
+} from "./ollama-runtime-context";
+import type { OllamaRuntimeModelStatus } from "./ollama-runtime-context";
+export type { OllamaRuntimeModelStatus } from "./ollama-runtime-context";
 
 const { shellQuote, runCapture, runCaptureEx } = require("../runner");
 
@@ -666,67 +676,32 @@ export function parseOllamaTags(output: string | null | undefined): string[] {
   }
 }
 
-export interface OllamaRuntimeModelStatus {
-  probed: boolean;
-  loaded: boolean;
-  cpuOnly: boolean;
-  processor?: string;
-  sizeVram?: number;
-}
-
-function normalizeOllamaModelName(value: unknown): string {
-  return String(value || "").trim();
-}
+export { MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW, parsePositiveInteger };
 
 export function probeOllamaRuntimeModelStatus(
   model: string,
   runCaptureImpl?: RunCaptureFn,
 ): OllamaRuntimeModelStatus {
-  const capture = runCaptureImpl ?? runCapture;
-  const host = getResolvedOllamaHost();
-  const output = capture(
-    [
-      "curl",
-      "-sf",
-      "--connect-timeout",
-      "3",
-      "--max-time",
-      "5",
-      `http://${host}:${OLLAMA_PORT}/api/ps`,
-    ],
-    { ignoreError: true },
-  );
-  if (!output) return { probed: false, loaded: false, cpuOnly: false };
+  return probeOllamaRuntimeModelStatusWithHost(model, getResolvedOllamaHost, runCaptureImpl);
+}
 
-  try {
-    const parsed = JSON.parse(String(output || ""));
-    const models = Array.isArray(parsed?.models) ? parsed.models : [];
-    const target = normalizeOllamaModelName(model);
-    const loaded = models.find((entry: { name?: unknown; model?: unknown }) => {
-      return (
-        normalizeOllamaModelName(entry?.name) === target ||
-        normalizeOllamaModelName(entry?.model) === target
-      );
-    });
-    if (!loaded) return { probed: true, loaded: false, cpuOnly: false };
+export function resolveOllamaRuntimeContextWindow(
+  model: string,
+  currentContextWindow: string | null | undefined = null,
+  runCaptureImpl?: RunCaptureFn,
+): number | null {
+  return resolveOllamaRuntimeContextWindowWithHost(
+    model,
+    currentContextWindow,
+    getResolvedOllamaHost,
+    runCaptureImpl,
+  );
+}
 
-    const rawSizeVram = Number((loaded as { size_vram?: unknown }).size_vram);
-    const hasSizeVram = Number.isFinite(rawSizeVram);
-    const processor = normalizeOllamaModelName((loaded as { processor?: unknown }).processor);
-    const mentionsGpu = /\bGPU\b/i.test(processor);
-    const processorCpuOnly = /\bCPU\b/i.test(processor) && !mentionsGpu;
-    const sizeVramCpuOnly = hasSizeVram && rawSizeVram === 0 && !mentionsGpu;
+export { resetOllamaRuntimeContextWindowAutoState };
 
-    return {
-      probed: true,
-      loaded: true,
-      cpuOnly: processorCpuOnly || sizeVramCpuOnly,
-      ...(processor ? { processor } : {}),
-      ...(hasSizeVram ? { sizeVram: rawSizeVram } : {}),
-    };
-  } catch {
-    return { probed: true, loaded: false, cpuOnly: false };
-  }
+export function applyOllamaRuntimeContextWindow(selectedModel: string): void {
+  applyOllamaRuntimeContextWindowWithHost(selectedModel, getResolvedOllamaHost);
 }
 
 function formatOllamaCpuOnlyDiagnostic(model: string, status: OllamaRuntimeModelStatus): string {
@@ -796,6 +771,7 @@ export function resolveNonInteractiveOllamaModel(
   recoveredModel: string | null,
   gpu: GpuInfo | null,
   log: (message: string) => void = (m) => console.warn(m),
+  runCaptureImpl?: RunCaptureFn,
 ): string {
   const explicit = requestedModel || recoveredModel;
   if (explicit && !modelFitsAvailableMemory(explicit, gpu)) {
@@ -812,7 +788,7 @@ export function resolveNonInteractiveOllamaModel(
   if (!explicit && !anyRegistryModelFits(gpu)) {
     warnNoBootstrapModelFits(gpu, log);
   }
-  return explicit || getDefaultOllamaModel(gpu);
+  return explicit || getDefaultOllamaModel(gpu, runCaptureImpl);
 }
 
 function warnNoBootstrapModelFits(

diff --git a/src/lib/inference/ollama-runtime-context.test.ts b/src/lib/inference/ollama-runtime-context.test.ts
@@ -0,0 +1,141 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { afterEach, describe, expect, it } from "vitest";
+
+import {
+  applyOllamaRuntimeContextWindow,
+  parseOllamaRuntimeContextLength,
+  probeOllamaRuntimeModelStatus,
+  resetOllamaRuntimeContextWindowAutoState,
+  resolveOllamaRuntimeContextWindow,
+} from "../../../dist/lib/inference/ollama-runtime-context";
+
+const getOllamaHost = () => "127.0.0.1";
+
+describe("Ollama runtime context helpers", () => {
+  afterEach(() => {
+    resetOllamaRuntimeContextWindowAutoState();
+  });
+
+  it("parses valid Ollama /api/ps context lengths", () => {
+    expect(parseOllamaRuntimeContextLength(262144)).toEqual({ contextLength: 262144 });
+    expect(parseOllamaRuntimeContextLength("262144")).toEqual({ contextLength: 262144 });
+  });
+
+  it("treats omitted Ollama /api/ps context lengths as compatibility no-ops", () => {
+    expect(parseOllamaRuntimeContextLength(undefined)).toEqual({});
+    expect(parseOllamaRuntimeContextLength(null)).toEqual({});
+    expect(parseOllamaRuntimeContextLength("   ")).toEqual({});
+
+    const status = probeOllamaRuntimeModelStatus(
+      "qwen3.6:35b",
+      getOllamaHost,
+      () => JSON.stringify({ models: [{ name: "qwen3.6:35b", processor: "100% GPU" }] }),
+    );
+
+    expect(status.loaded).toBe(true);
+    expect(status.contextLength).toBeUndefined();
+    expect(status.contextLengthWarning).toBeUndefined();
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () =>
+        JSON.stringify({ models: [{ name: "qwen3.6:35b" }] }),
+      ),
+    ).toBeNull();
+  });
+
+  it("warns and ignores malformed or non-positive Ollama /api/ps context lengths", () => {
+    for (const value of ["bogus", "1.5", 0, -1]) {
+      const parsed = parseOllamaRuntimeContextLength(value);
+      expect(parsed.contextLength).toBeUndefined();
+      expect(parsed.warning).toContain("non-positive or malformed context_length");
+    }
+
+    const status = probeOllamaRuntimeModelStatus(
+      "qwen3.6:35b",
+      getOllamaHost,
+      () => JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: "bogus" }] }),
+    );
+
+    expect(status.loaded).toBe(true);
+    expect(status.contextLength).toBeUndefined();
+    expect(status.contextLengthWarning).toContain("non-positive or malformed context_length");
+  });
+
+  it("warns and ignores implausibly large Ollama /api/ps context lengths", () => {
+    const parsed = parseOllamaRuntimeContextLength(10_000_000);
+    expect(parsed.contextLength).toBeUndefined();
+    expect(parsed.warning).toContain("above NemoClaw's auto-detect ceiling");
+
+    const status = probeOllamaRuntimeModelStatus(
+      "qwen3.6:35b",
+      getOllamaHost,
+      () => JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }),
+    );
+
+    expect(status.loaded).toBe(true);
+    expect(status.contextLength).toBeUndefined();
+    expect(status.contextLengthWarning).toContain("above NemoClaw's auto-detect ceiling");
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () =>
+        JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }),
+      ),
+    ).toBeNull();
+  });
+
+  it("resolves runtime context length only when no explicit override is set", () => {
+    const capture = () =>
+      JSON.stringify({
+        models: [{ name: "qwen3.6:35b", context_length: "262144", processor: "100% GPU" }],
+      });
+
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, capture),
+    ).toBe(262144);
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", "131072", getOllamaHost, capture),
+    ).toBeNull();
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", "bogus", getOllamaHost, capture),
+    ).toBeNull();
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", "   ", getOllamaHost, capture),
+    ).toBe(262144);
+    expect(
+      resolveOllamaRuntimeContextWindow("other:model", null, getOllamaHost, capture),
+    ).toBeNull();
+  });
+
+  it("applies and clears only auto-detected context window state", () => {
+    const env: NodeJS.ProcessEnv = {};
+    const messages: string[] = [];
+    let models: Array<{ name: string; context_length?: number }> = [];
+    const options = {
+      env,
+      logger: {
+        log: (message: string) => messages.push(message),
+        warn: (message: string) => messages.push(message),
+      },
+      runCaptureImpl: () => JSON.stringify({ models }),
+    };
+
+    models = [{ name: "qwen3.6:35b", context_length: 262144 }];
+    applyOllamaRuntimeContextWindow("qwen3.6:35b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144");
+
+    models = [{ name: "qwen2.5:7b", context_length: 32768 }];
+    applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("32768");
+
+    models = [];
+    applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBeUndefined();
+
+    resetOllamaRuntimeContextWindowAutoState();
+    env.NEMOCLAW_CONTEXT_WINDOW = "262144";
+    models = [{ name: "qwen2.5:7b", context_length: 32768 }];
+    applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144");
+    expect(messages.at(-1)).toContain("Keeping configured context window");
+  });
+});