From 902967602d6591fb9b3ef9433d14e21681285453 Mon Sep 17 00:00:00 2001
From: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
Date: Tue, 26 May 2026 07:55:09 -0700
Subject: [PATCH 1/6] fix(inference): auto-detect Ollama context window during
 onboard

Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
---
 docs/inference/switch-inference-providers.mdx |  2 +
 docs/inference/use-local-inference.mdx        |  3 +
 src/lib/inference/local.test.ts               | 47 +++++++++-
 src/lib/inference/local.ts                    | 65 ++++++++++++-
 src/lib/onboard.ts                            | 40 ++++++++
 test/onboard-selection.test.ts                | 94 ++++++++++++++++++-
 6 files changed, 248 insertions(+), 3 deletions(-)
diff --git a/docs/inference/switch-inference-providers.mdx b/docs/inference/switch-inference-providers.mdx
index 2c96efff9f..e3553d5fc6 100644
--- a/docs/inference/switch-inference-providers.mdx
+++ b/docs/inference/switch-inference-providers.mdx
@@ -132,6 +132,8 @@ To change these values, set the corresponding environment variables before runni
 | `NEMOCLAW_AGENT_HEARTBEAT_EVERY` | Go-style duration (`30m`, `1h`, `0m` to disable) | `unset` (OpenClaw default) |
 
 Invalid values are ignored, and the default bakes into the image.
+For Local Ollama, onboarding loads the selected model first and uses Ollama's
+reported runtime context length when `NEMOCLAW_CONTEXT_WINDOW` is unset.
 Use `NEMOCLAW_INFERENCE_INPUTS=text,image` only for a model that accepts image input through the selected provider.
 
 ```console
diff --git a/docs/inference/use-local-inference.mdx b/docs/inference/use-local-inference.mdx
index d6dfedc50c..6a9400c6f6 100644
--- a/docs/inference/use-local-inference.mdx
+++ b/docs/inference/use-local-inference.mdx
@@ -73,6 +73,9 @@ NemoClaw lists installed models or offers starter models if none are installed.
 On hosts where the larger starter models fit the currently available GPU memory, the starter list includes `qwen3.6:35b` and selects it by default.
 When another GPU workload is using most of the memory at onboard time, NemoClaw downgrades the menu to the largest model that still fits.
 It pulls the selected model, loads it into memory, and validates it before continuing.
+When Ollama reports a loaded-model context length, NemoClaw uses that value for
+the `contextWindow` baked into `openclaw.json` unless you set
+`NEMOCLAW_CONTEXT_WINDOW` yourself.
 If the selected model declares that it does not support tool calling, onboarding stops with guidance to choose a model whose `ollama show <model>` capabilities include `tools`.
 The validation also requires structured chat-completions tool calls.
 If the model leaks tool-call JSON as plain message text, onboarding stops so you can choose a model that returns tool calls in the expected response field.
diff --git a/src/lib/inference/local.test.ts b/src/lib/inference/local.test.ts
index a8475a0b14..da8aae7b5a 100644
--- a/src/lib/inference/local.test.ts
+++ b/src/lib/inference/local.test.ts
@@ -37,6 +37,7 @@ import {
   parseOllamaList,
   parseOllamaTags,
   probeOllamaRuntimeModelStatus,
+  resolveOllamaRuntimeContextWindow,
   probeLocalProviderHealth,
   validateOllamaModel,
   validateLocalProvider,
@@ -654,6 +655,7 @@ describe("local inference helpers", () => {
         null,
         { type: "nvidia", totalMemoryMB: 131_072, availableMemoryMB: 131_072 },
         log,
+        () => "",
       ),
     ).toBe(QWEN3_6_OLLAMA_MODEL);
   });
@@ -686,6 +688,7 @@ describe("local inference helpers", () => {
         null,
         { type: "nvidia", totalMemoryMB: 16_384, availableMemoryMB: 4_000 },
         log,
+        () => "",
       ),
     ).toBe("qwen2.5:7b");
     expect(messages.some((m) => m.includes("No known Ollama bootstrap model fits"))).toBe(true);
@@ -797,7 +800,12 @@ describe("local inference helpers", () => {
     const capture = () =>
       JSON.stringify({
         models: [
-          { name: "qwen3.6:35b", size_vram: 0, processor: "100% CPU" },
+          {
+            name: "qwen3.6:35b",
+            context_length: 262144,
+            size_vram: 0,
+            processor: "100% CPU",
+          },
         ],
       });
 
@@ -805,11 +813,48 @@ describe("local inference helpers", () => {
       probed: true,
       loaded: true,
       cpuOnly: true,
+      contextLength: 262144,
       processor: "100% CPU",
       sizeVram: 0,
     });
   });
 
+  it("ignores implausibly large Ollama runtime context lengths", () => {
+    const capture = () =>
+      JSON.stringify({
+        models: [
+          {
+            name: "qwen3.6:35b",
+            context_length: 10_000_000,
+            processor: "100% GPU",
+          },
+        ],
+      });
+
+    const status = probeOllamaRuntimeModelStatus("qwen3.6:35b", capture);
+    expect(status.loaded).toBe(true);
+    expect(status.contextLength).toBeUndefined();
+    expect(status.contextLengthWarning).toContain("above NemoClaw's auto-detect ceiling");
+    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, capture)).toBeNull();
+  });
+
+  it("resolves Ollama runtime context length only when no valid override is set", () => {
+    const capture = () =>
+      JSON.stringify({
+        models: [
+          {
+            model: "qwen3.6:35b",
+            context_length: "262144",
+            processor: "100% GPU",
+          },
+        ],
+      });
+
+    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, capture)).toBe(262144);
+    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", "131072", capture)).toBeNull();
+    expect(resolveOllamaRuntimeContextWindow("other:model", null, capture)).toBeNull();
+  });
+
   it("fails Spark Ollama validation when the model is CPU-only after warmup", () => {
     const payload = JSON.stringify({ model: "qwen3.6:35b", response: "hello", done: true });
     const psOutput = JSON.stringify({
diff --git a/src/lib/inference/local.ts b/src/lib/inference/local.ts
index 2c9f0a95f0..9fc06fdf06 100644
--- a/src/lib/inference/local.ts
+++ b/src/lib/inference/local.ts
@@ -670,14 +670,53 @@ export interface OllamaRuntimeModelStatus {
   probed: boolean;
   loaded: boolean;
   cpuOnly: boolean;
+  contextLength?: number;
+  contextLengthWarning?: string;
   processor?: string;
   sizeVram?: number;
 }
 
+// Four million tokens is intentionally above today's practical local-model
+// context windows while still rejecting obviously broken daemon responses.
+export const MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW = 4_194_304;
+
 function normalizeOllamaModelName(value: unknown): string {
   return String(value || "").trim();
 }
 
+export function parsePositiveInteger(value: unknown): number | null {
+  if (typeof value === "number") {
+    return Number.isSafeInteger(value) && value > 0 ? value : null;
+  }
+  const raw = String(value ?? "").trim();
+  if (!/^[1-9][0-9]*$/.test(raw)) return null;
+  const parsed = Number(raw);
+  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : null;
+}
+
+function parseOllamaRuntimeContextLength(value: unknown): {
+  contextLength?: number;
+  warning?: string;
+} {
+  if (value === undefined || value === null || String(value).trim() === "") {
+    return {};
+  }
+  const parsed = parsePositiveInteger(value);
+  if (!parsed) {
+    return {
+      warning: `Ollama /api/ps returned a non-positive context_length (${String(value)}); ignoring it.`,
+    };
+  }
+  if (parsed > MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW) {
+    return {
+      warning:
+        `Ollama /api/ps returned context_length=${parsed}, above NemoClaw's ` +
+        `auto-detect ceiling (${MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW}); ignoring it.`,
+    };
+  }
+  return { contextLength: parsed };
+}
+
 export function probeOllamaRuntimeModelStatus(
   model: string,
   runCaptureImpl?: RunCaptureFn,
@@ -712,6 +751,13 @@ export function probeOllamaRuntimeModelStatus(
 
     const rawSizeVram = Number((loaded as { size_vram?: unknown }).size_vram);
     const hasSizeVram = Number.isFinite(rawSizeVram);
+    // Current Ollama /api/ps responses include context_length for loaded
+    // models; older daemons may omit it. Missing or invalid values are
+    // best-effort no-ops so onboarding falls back to the normal
+    // NEMOCLAW_CONTEXT_WINDOW/default path.
+    const contextLengthResult = parseOllamaRuntimeContextLength(
+      (loaded as { context_length?: unknown }).context_length,
+    );
     const processor = normalizeOllamaModelName((loaded as { processor?: unknown }).processor);
     const mentionsGpu = /\bGPU\b/i.test(processor);
     const processorCpuOnly = /\bCPU\b/i.test(processor) && !mentionsGpu;
@@ -721,6 +767,12 @@ export function probeOllamaRuntimeModelStatus(
       probed: true,
       loaded: true,
       cpuOnly: processorCpuOnly || sizeVramCpuOnly,
+      ...(contextLengthResult.contextLength
+        ? { contextLength: contextLengthResult.contextLength }
+        : {}),
+      ...(contextLengthResult.warning
+        ? { contextLengthWarning: contextLengthResult.warning }
+        : {}),
       ...(processor ? { processor } : {}),
       ...(hasSizeVram ? { sizeVram: rawSizeVram } : {}),
     };
@@ -729,6 +781,16 @@ export function probeOllamaRuntimeModelStatus(
   }
 }
 
+export function resolveOllamaRuntimeContextWindow(
+  model: string,
+  currentContextWindow: string | null | undefined = null,
+  runCaptureImpl?: RunCaptureFn,
+): number | null {
+  if (parsePositiveInteger(currentContextWindow)) return null;
+  const runtimeStatus = probeOllamaRuntimeModelStatus(model, runCaptureImpl);
+  return runtimeStatus.loaded ? (runtimeStatus.contextLength ?? null) : null;
+}
+
 function formatOllamaCpuOnlyDiagnostic(model: string, status: OllamaRuntimeModelStatus): string {
   const observed: string[] = [];
   if (status.processor) observed.push(`processor=${status.processor}`);
@@ -796,6 +858,7 @@ export function resolveNonInteractiveOllamaModel(
   recoveredModel: string | null,
   gpu: GpuInfo | null,
   log: (message: string) => void = (m) => console.warn(m),
+  runCaptureImpl?: RunCaptureFn,
 ): string {
   const explicit = requestedModel || recoveredModel;
   if (explicit && !modelFitsAvailableMemory(explicit, gpu)) {
@@ -812,7 +875,7 @@ export function resolveNonInteractiveOllamaModel(
   if (!explicit && !anyRegistryModelFits(gpu)) {
     warnNoBootstrapModelFits(gpu, log);
   }
-  return explicit || getDefaultOllamaModel(gpu);
+  return explicit || getDefaultOllamaModel(gpu, runCaptureImpl);
 }
 
 function warnNoBootstrapModelFits(
diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 518a276843..42aa679d5d 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -4145,6 +4145,43 @@ type OllamaModelSelectionOutcome =
   | { outcome: "selected"; model: string }
   | { outcome: "back-to-selection" };
 
+let autoDetectedOllamaContextWindow: string | null = null;
+
+function resetOllamaRuntimeContextWindowAutoState(): void {
+  autoDetectedOllamaContextWindow = null;
+}
+
+function applyOllamaRuntimeContextWindow(selectedModel: string): void {
+  const currentContextWindow = process.env.NEMOCLAW_CONTEXT_WINDOW;
+  const currentIsPreviousAuto =
+    !!currentContextWindow &&
+    !!autoDetectedOllamaContextWindow &&
+    currentContextWindow === autoDetectedOllamaContextWindow;
+  const userContextWindow = currentIsPreviousAuto ? null : currentContextWindow;
+
+  if (localInference.parsePositiveInteger(userContextWindow)) {
+    console.log(`  ℹ Keeping configured context window: ${userContextWindow} tokens`);
+    return;
+  }
+
+  const runtimeStatus = localInference.probeOllamaRuntimeModelStatus(selectedModel);
+  if (runtimeStatus.contextLengthWarning) {
+    console.warn(`  ⚠ ${runtimeStatus.contextLengthWarning}`);
+  }
+  if (runtimeStatus.loaded && runtimeStatus.contextLength) {
+    const value = String(runtimeStatus.contextLength);
+    process.env.NEMOCLAW_CONTEXT_WINDOW = value;
+    autoDetectedOllamaContextWindow = value;
+    console.log(`  ✓ Using Ollama runtime context length: ${value} tokens`);
+    return;
+  }
+
+  if (currentIsPreviousAuto) {
+    delete process.env.NEMOCLAW_CONTEXT_WINDOW;
+    autoDetectedOllamaContextWindow = null;
+  }
+}
+
 // Pick an Ollama model, pull it if missing, and validate it via the local
 // proxy. Shared by the three Ollama provider branches (running, Windows-host
 // install/start, install-locally). Returns "back-to-selection" so the caller
@@ -4236,6 +4273,7 @@ async function selectAndValidateOllamaModel(
         "  ℹ Using chat completions API (Ollama tool calls require /v1/chat/completions)",
       );
     }
+    applyOllamaRuntimeContextWindow(selectedModel);
     return { outcome: "selected", model: selectedModel };
   }
 }
@@ -7538,6 +7576,8 @@ module.exports = {
   MESSAGING_CHANNELS,
   selectOnboardAgent,
   setupNim,
+  applyOllamaRuntimeContextWindow,
+  resetOllamaRuntimeContextWindowAutoState,
   providerNameToOptionKey,
   readRecordedProvider,
   readRecordedModel,
diff --git a/test/onboard-selection.test.ts b/test/onboard-selection.test.ts
index 6ef3bb11f1..f132e1d45a 100644
--- a/test/onboard-selection.test.ts
+++ b/test/onboard-selection.test.ts
@@ -1484,6 +1484,11 @@ runner.runCapture = (command) => {
   if (cmd.includes("127.0.0.1:11434/api/tags")) return JSON.stringify({ models: [{ name: "nemotron-3-nano:30b" }] });
   if (cmd.includes("ollama list")) return "nemotron-3-nano:30b  abc  24 GB  now";
   if (cmd.includes("127.0.0.1:8000/v1/models")) return "";
+  if (cmd.includes("127.0.0.1:11434/api/ps")) {
+    return JSON.stringify({
+      models: [{ name: "nemotron-3-nano:30b", context_length: 262144 }],
+    });
+  }
   if (cmd.includes("api/generate")) return '{"response":"hello"}';
   if (cmd.includes("-o args=")) return "node ollama-auth-proxy.js";
   return "";
@@ -1499,7 +1504,15 @@ const { setupNim } = require(${onboardPath});
   console.error = (...args) => lines.push(args.join(" "));
   try {
     const result = await setupNim(null);
-    originalLog(JSON.stringify({ result, messages, lines, commands }));
+    originalLog(
+      JSON.stringify({
+        result,
+        messages,
+        lines,
+        commands,
+        contextWindow: process.env.NEMOCLAW_CONTEXT_WINDOW,
+      }),
+    );
   } finally {
     console.log = originalLog;
     console.error = originalError;
@@ -1518,6 +1531,7 @@ const { setupNim } = require(${onboardPath});
         ...process.env,
         HOME: tmpDir,
         PATH: `${fakeBin}:${process.env.PATH || ""}`,
+        NEMOCLAW_CONTEXT_WINDOW: "",
       },
     });
 
@@ -1550,6 +1564,84 @@ const { setupNim } = require(${onboardPath});
         command.includes("http://127.0.0.1:11434/api/generate"),
       ),
     );
+    assert.equal(payload.contextWindow, "262144");
+  });
+
+  it("re-resolves auto-detected Ollama context windows across model selections", () => {
+    const repoRoot = path.join(import.meta.dirname, "..");
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-ollama-context-"));
+    const scriptPath = path.join(tmpDir, "ollama-context-check.js");
+    const onboardPath = JSON.stringify(path.join(repoRoot, "dist", "lib", "onboard.js"));
+    const runnerPath = JSON.stringify(path.join(repoRoot, "dist", "lib", "runner.js"));
+
+    const script = String.raw`
+const runner = require(${runnerPath});
+
+let models = [];
+runner.runCapture = (command) => {
+  const rendered = Array.isArray(command) ? command.join(" ") : command;
+  if (rendered.includes("/api/ps")) {
+    return JSON.stringify({ models });
+  }
+  return "";
+};
+
+const {
+  applyOllamaRuntimeContextWindow,
+  resetOllamaRuntimeContextWindowAutoState,
+} = require(${onboardPath});
+
+const result = {};
+const originalWarn = console.warn;
+const originalLog = console.log;
+console.warn = () => {};
+console.log = () => {};
+try {
+  resetOllamaRuntimeContextWindowAutoState();
+  delete process.env.NEMOCLAW_CONTEXT_WINDOW;
+
+  models = [{ name: "qwen3.6:35b", context_length: 262144 }];
+  applyOllamaRuntimeContextWindow("qwen3.6:35b");
+  result.initial = process.env.NEMOCLAW_CONTEXT_WINDOW || null;
+
+  models = [{ name: "qwen2.5:7b", context_length: 32768 }];
+  applyOllamaRuntimeContextWindow("qwen2.5:7b");
+  result.updated = process.env.NEMOCLAW_CONTEXT_WINDOW || null;
+
+  models = [];
+  applyOllamaRuntimeContextWindow("qwen2.5:7b");
+  result.cleared = process.env.NEMOCLAW_CONTEXT_WINDOW || null;
+
+  resetOllamaRuntimeContextWindowAutoState();
+  process.env.NEMOCLAW_CONTEXT_WINDOW = "262144";
+  models = [{ name: "qwen2.5:7b", context_length: 32768 }];
+  applyOllamaRuntimeContextWindow("qwen2.5:7b");
+  result.userOverride = process.env.NEMOCLAW_CONTEXT_WINDOW || null;
+} finally {
+  console.warn = originalWarn;
+  console.log = originalLog;
+}
+
+console.log(JSON.stringify(result));
+`;
+    fs.writeFileSync(scriptPath, script);
+
+    const result = spawnSync(process.execPath, [scriptPath], {
+      cwd: repoRoot,
+      encoding: "utf-8",
+      env: {
+        ...process.env,
+        HOME: tmpDir,
+      },
+    });
+
+    assert.equal(result.status, 0, result.stderr);
+    assert.deepEqual(JSON.parse(result.stdout.trim()), {
+      initial: "262144",
+      updated: "32768",
+      cleared: null,
+      userOverride: "262144",
+    });
   });
 
   it("starts managed Ollama on loopback before exposing the auth proxy", () => {

From d0b1feba3766614b77e8b46fe56155176056903b Mon Sep 17 00:00:00 2001
From: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
Date: Tue, 26 May 2026 09:17:33 -0700
Subject: [PATCH 2/6] refactor(inference): move Ollama context auto-detect out
 of onboard

Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
---
 src/lib/inference/local.ts     | 37 ++++++++++++++++++++++++++++++
 src/lib/onboard.ts             | 42 +---------------------------------
 test/onboard-selection.test.ts |  4 ++--
 3 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/src/lib/inference/local.ts b/src/lib/inference/local.ts
index 9fc06fdf06..6446068c38 100644
--- a/src/lib/inference/local.ts
+++ b/src/lib/inference/local.ts
@@ -791,6 +791,43 @@ export function resolveOllamaRuntimeContextWindow(
   return runtimeStatus.loaded ? (runtimeStatus.contextLength ?? null) : null;
 }
 
+let autoDetectedOllamaContextWindow: string | null = null;
+
+export function resetOllamaRuntimeContextWindowAutoState(): void {
+  autoDetectedOllamaContextWindow = null;
+}
+
+export function applyOllamaRuntimeContextWindow(selectedModel: string): void {
+  const currentContextWindow = process.env.NEMOCLAW_CONTEXT_WINDOW;
+  const currentIsPreviousAuto =
+    !!currentContextWindow &&
+    !!autoDetectedOllamaContextWindow &&
+    currentContextWindow === autoDetectedOllamaContextWindow;
+  const userContextWindow = currentIsPreviousAuto ? null : currentContextWindow;
+
+  if (parsePositiveInteger(userContextWindow)) {
+    console.log(`  ℹ Keeping configured context window: ${userContextWindow} tokens`);
+    return;
+  }
+
+  const runtimeStatus = probeOllamaRuntimeModelStatus(selectedModel);
+  if (runtimeStatus.contextLengthWarning) {
+    console.warn(`  ⚠ ${runtimeStatus.contextLengthWarning}`);
+  }
+  if (runtimeStatus.loaded && runtimeStatus.contextLength) {
+    const value = String(runtimeStatus.contextLength);
+    process.env.NEMOCLAW_CONTEXT_WINDOW = value;
+    autoDetectedOllamaContextWindow = value;
+    console.log(`  ✓ Using Ollama runtime context length: ${value} tokens`);
+    return;
+  }
+
+  if (currentIsPreviousAuto) {
+    delete process.env.NEMOCLAW_CONTEXT_WINDOW;
+    autoDetectedOllamaContextWindow = null;
+  }
+}
+
 function formatOllamaCpuOnlyDiagnostic(model: string, status: OllamaRuntimeModelStatus): string {
   const observed: string[] = [];
   if (status.processor) observed.push(`processor=${status.processor}`);
diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts
index 42aa679d5d..2a15a71a57 100644
--- a/src/lib/onboard.ts
+++ b/src/lib/onboard.ts
@@ -4144,44 +4144,6 @@ const { readLiveInference, readRecordedProvider, readRecordedNimContainer, readR
 type OllamaModelSelectionOutcome =
   | { outcome: "selected"; model: string }
   | { outcome: "back-to-selection" };
-
-let autoDetectedOllamaContextWindow: string | null = null;
-
-function resetOllamaRuntimeContextWindowAutoState(): void {
-  autoDetectedOllamaContextWindow = null;
-}
-
-function applyOllamaRuntimeContextWindow(selectedModel: string): void {
-  const currentContextWindow = process.env.NEMOCLAW_CONTEXT_WINDOW;
-  const currentIsPreviousAuto =
-    !!currentContextWindow &&
-    !!autoDetectedOllamaContextWindow &&
-    currentContextWindow === autoDetectedOllamaContextWindow;
-  const userContextWindow = currentIsPreviousAuto ? null : currentContextWindow;
-
-  if (localInference.parsePositiveInteger(userContextWindow)) {
-    console.log(`  ℹ Keeping configured context window: ${userContextWindow} tokens`);
-    return;
-  }
-
-  const runtimeStatus = localInference.probeOllamaRuntimeModelStatus(selectedModel);
-  if (runtimeStatus.contextLengthWarning) {
-    console.warn(`  ⚠ ${runtimeStatus.contextLengthWarning}`);
-  }
-  if (runtimeStatus.loaded && runtimeStatus.contextLength) {
-    const value = String(runtimeStatus.contextLength);
-    process.env.NEMOCLAW_CONTEXT_WINDOW = value;
-    autoDetectedOllamaContextWindow = value;
-    console.log(`  ✓ Using Ollama runtime context length: ${value} tokens`);
-    return;
-  }
-
-  if (currentIsPreviousAuto) {
-    delete process.env.NEMOCLAW_CONTEXT_WINDOW;
-    autoDetectedOllamaContextWindow = null;
-  }
-}
-
 // Pick an Ollama model, pull it if missing, and validate it via the local
 // proxy. Shared by the three Ollama provider branches (running, Windows-host
 // install/start, install-locally). Returns "back-to-selection" so the caller
@@ -4273,7 +4235,7 @@ async function selectAndValidateOllamaModel(
         "  ℹ Using chat completions API (Ollama tool calls require /v1/chat/completions)",
       );
     }
-    applyOllamaRuntimeContextWindow(selectedModel);
+    localInference.applyOllamaRuntimeContextWindow(selectedModel);
     return { outcome: "selected", model: selectedModel };
   }
 }
@@ -7576,8 +7538,6 @@ module.exports = {
   MESSAGING_CHANNELS,
   selectOnboardAgent,
   setupNim,
-  applyOllamaRuntimeContextWindow,
-  resetOllamaRuntimeContextWindowAutoState,
   providerNameToOptionKey,
   readRecordedProvider,
   readRecordedModel,
diff --git a/test/onboard-selection.test.ts b/test/onboard-selection.test.ts
index f132e1d45a..913110a592 100644
--- a/test/onboard-selection.test.ts
+++ b/test/onboard-selection.test.ts
@@ -1571,7 +1571,7 @@ const { setupNim } = require(${onboardPath});
     const repoRoot = path.join(import.meta.dirname, "..");
     const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-ollama-context-"));
     const scriptPath = path.join(tmpDir, "ollama-context-check.js");
-    const onboardPath = JSON.stringify(path.join(repoRoot, "dist", "lib", "onboard.js"));
+    const localInferencePath = JSON.stringify(path.join(repoRoot, "dist", "lib", "inference", "local.js"));
     const runnerPath = JSON.stringify(path.join(repoRoot, "dist", "lib", "runner.js"));
 
     const script = String.raw`
@@ -1589,7 +1589,7 @@ runner.runCapture = (command) => {
 const {
   applyOllamaRuntimeContextWindow,
   resetOllamaRuntimeContextWindowAutoState,
-} = require(${onboardPath});
+} = require(${localInferencePath});
 
 const result = {};
 const originalWarn = console.warn;

From 422587eebaf71bb0bfe43222d1f958614c66d01c Mon Sep 17 00:00:00 2001
From: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
Date: Tue, 26 May 2026 09:30:25 -0700
Subject: [PATCH 3/6] fix(inference): preserve explicit Ollama context
 overrides

Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
---
 src/lib/inference/local.test.ts | 2 ++
 src/lib/inference/local.ts      | 8 ++++++--
 test/onboard-selection.test.ts  | 7 +++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/lib/inference/local.test.ts b/src/lib/inference/local.test.ts
index da8aae7b5a..a7fbe508e0 100644
--- a/src/lib/inference/local.test.ts
+++ b/src/lib/inference/local.test.ts
@@ -852,6 +852,8 @@ describe("local inference helpers", () => {
 
     expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, capture)).toBe(262144);
     expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", "131072", capture)).toBeNull();
+    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", "bogus", capture)).toBeNull();
+    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", "   ", capture)).toBe(262144);
     expect(resolveOllamaRuntimeContextWindow("other:model", null, capture)).toBeNull();
   });
 
diff --git a/src/lib/inference/local.ts b/src/lib/inference/local.ts
index 6446068c38..8e48dea9a3 100644
--- a/src/lib/inference/local.ts
+++ b/src/lib/inference/local.ts
@@ -694,6 +694,10 @@ export function parsePositiveInteger(value: unknown): number | null {
   return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : null;
 }
 
+function hasExplicitContextWindow(value: unknown): boolean {
+  return String(value ?? "").trim() !== "";
+}
+
 function parseOllamaRuntimeContextLength(value: unknown): {
   contextLength?: number;
   warning?: string;
@@ -786,7 +790,7 @@ export function resolveOllamaRuntimeContextWindow(
   currentContextWindow: string | null | undefined = null,
   runCaptureImpl?: RunCaptureFn,
 ): number | null {
-  if (parsePositiveInteger(currentContextWindow)) return null;
+  if (hasExplicitContextWindow(currentContextWindow)) return null;
   const runtimeStatus = probeOllamaRuntimeModelStatus(model, runCaptureImpl);
   return runtimeStatus.loaded ? (runtimeStatus.contextLength ?? null) : null;
 }
@@ -805,7 +809,7 @@ export function applyOllamaRuntimeContextWindow(selectedModel: string): void {
     currentContextWindow === autoDetectedOllamaContextWindow;
   const userContextWindow = currentIsPreviousAuto ? null : currentContextWindow;
 
-  if (parsePositiveInteger(userContextWindow)) {
+  if (hasExplicitContextWindow(userContextWindow)) {
     console.log(`  ℹ Keeping configured context window: ${userContextWindow} tokens`);
     return;
   }
diff --git a/test/onboard-selection.test.ts b/test/onboard-selection.test.ts
index 913110a592..e13d9ebfdc 100644
--- a/test/onboard-selection.test.ts
+++ b/test/onboard-selection.test.ts
@@ -1617,6 +1617,12 @@ try {
   models = [{ name: "qwen2.5:7b", context_length: 32768 }];
   applyOllamaRuntimeContextWindow("qwen2.5:7b");
   result.userOverride = process.env.NEMOCLAW_CONTEXT_WINDOW || null;
+
+  resetOllamaRuntimeContextWindowAutoState();
+  process.env.NEMOCLAW_CONTEXT_WINDOW = "bogus";
+  models = [{ name: "qwen2.5:7b", context_length: 32768 }];
+  applyOllamaRuntimeContextWindow("qwen2.5:7b");
+  result.invalidOverride = process.env.NEMOCLAW_CONTEXT_WINDOW || null;
 } finally {
   console.warn = originalWarn;
   console.log = originalLog;
@@ -1641,6 +1647,7 @@ console.log(JSON.stringify(result));
       updated: "32768",
       cleared: null,
       userOverride: "262144",
+      invalidOverride: "bogus",
     });
   });
 

From 23838253b7e1774971d4243215112046e4bcf5f2 Mon Sep 17 00:00:00 2001
From: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
Date: Tue, 26 May 2026 11:15:30 -0700
Subject: [PATCH 4/6] docs(inference): normalize Ollama context-window sentence
 wrapping

Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com>
---
 docs/inference/switch-inference-providers.mdx | 3 +--
 docs/inference/use-local-inference.mdx        | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/docs/inference/switch-inference-providers.mdx b/docs/inference/switch-inference-providers.mdx
index e3553d5fc6..7c276df638 100644
--- a/docs/inference/switch-inference-providers.mdx
+++ b/docs/inference/switch-inference-providers.mdx
@@ -132,8 +132,7 @@ To change these values, set the corresponding environment variables before runni
 | `NEMOCLAW_AGENT_HEARTBEAT_EVERY` | Go-style duration (`30m`, `1h`, `0m` to disable) | `unset` (OpenClaw default) |
 
 Invalid values are ignored, and the default bakes into the image.
-For Local Ollama, onboarding loads the selected model first and uses Ollama's
-reported runtime context length when `NEMOCLAW_CONTEXT_WINDOW` is unset.
+For Local Ollama, onboarding loads the selected model first and uses Ollama's reported runtime context length when `NEMOCLAW_CONTEXT_WINDOW` is unset.
 Use `NEMOCLAW_INFERENCE_INPUTS=text,image` only for a model that accepts image input through the selected provider.
 
 ```console
diff --git a/docs/inference/use-local-inference.mdx b/docs/inference/use-local-inference.mdx
index 6a9400c6f6..922685caf5 100644
--- a/docs/inference/use-local-inference.mdx
+++ b/docs/inference/use-local-inference.mdx
@@ -73,9 +73,7 @@ NemoClaw lists installed models or offers starter models if none are installed.
 On hosts where the larger starter models fit the currently available GPU memory, the starter list includes `qwen3.6:35b` and selects it by default.
 When another GPU workload is using most of the memory at onboard time, NemoClaw downgrades the menu to the largest model that still fits.
 It pulls the selected model, loads it into memory, and validates it before continuing.
-When Ollama reports a loaded-model context length, NemoClaw uses that value for
-the `contextWindow` baked into `openclaw.json` unless you set
-`NEMOCLAW_CONTEXT_WINDOW` yourself.
+When Ollama reports a loaded-model context length, NemoClaw uses that value for the `contextWindow` baked into `openclaw.json` unless you set `NEMOCLAW_CONTEXT_WINDOW` yourself.
 If the selected model declares that it does not support tool calling, onboarding stops with guidance to choose a model whose `ollama show <model>` capabilities include `tools`.
 The validation also requires structured chat-completions tool calls.
 If the model leaks tool-call JSON as plain message text, onboarding stops so you can choose a model that returns tool calls in the expected response field.

From baeab7b91f14f24be36b00672cbf4474fa6929a8 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 26 May 2026 20:43:26 -0700
Subject: [PATCH 5/6] refactor(inference): extract Ollama runtime context
 handling

---
 src/lib/inference/local.test.ts               |  63 -----
 src/lib/inference/local.ts                    | 168 ++------------
 .../inference/ollama-runtime-context.test.ts  | 141 +++++++++++
 src/lib/inference/ollama-runtime-context.ts   | 219 ++++++++++++++++++
 4 files changed, 380 insertions(+), 211 deletions(-)
 create mode 100644 src/lib/inference/ollama-runtime-context.test.ts
 create mode 100644 src/lib/inference/ollama-runtime-context.ts

diff --git a/src/lib/inference/local.test.ts b/src/lib/inference/local.test.ts
index a7fbe508e0..ac106f89b6 100644
--- a/src/lib/inference/local.test.ts
+++ b/src/lib/inference/local.test.ts
@@ -36,8 +36,6 @@ import {
   getOllamaWarmupCommand,
   parseOllamaList,
   parseOllamaTags,
-  probeOllamaRuntimeModelStatus,
-  resolveOllamaRuntimeContextWindow,
   probeLocalProviderHealth,
   validateOllamaModel,
   validateLocalProvider,
@@ -796,67 +794,6 @@ describe("local inference helpers", () => {
     expect(validateOllamaModel("nemotron-3-nano:30b", () => "ok", undefined, captureEx)).toEqual({ ok: true });
   });
 
-  it("parses Ollama runtime status from /api/ps", () => {
-    const capture = () =>
-      JSON.stringify({
-        models: [
-          {
-            name: "qwen3.6:35b",
-            context_length: 262144,
-            size_vram: 0,
-            processor: "100% CPU",
-          },
-        ],
-      });
-
-    expect(probeOllamaRuntimeModelStatus("qwen3.6:35b", capture)).toEqual({
-      probed: true,
-      loaded: true,
-      cpuOnly: true,
-      contextLength: 262144,
-      processor: "100% CPU",
-      sizeVram: 0,
-    });
-  });
-
-  it("ignores implausibly large Ollama runtime context lengths", () => {
-    const capture = () =>
-      JSON.stringify({
-        models: [
-          {
-            name: "qwen3.6:35b",
-            context_length: 10_000_000,
-            processor: "100% GPU",
-          },
-        ],
-      });
-
-    const status = probeOllamaRuntimeModelStatus("qwen3.6:35b", capture);
-    expect(status.loaded).toBe(true);
-    expect(status.contextLength).toBeUndefined();
-    expect(status.contextLengthWarning).toContain("above NemoClaw's auto-detect ceiling");
-    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, capture)).toBeNull();
-  });
-
-  it("resolves Ollama runtime context length only when no valid override is set", () => {
-    const capture = () =>
-      JSON.stringify({
-        models: [
-          {
-            model: "qwen3.6:35b",
-            context_length: "262144",
-            processor: "100% GPU",
-          },
-        ],
-      });
-
-    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, capture)).toBe(262144);
-    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", "131072", capture)).toBeNull();
-    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", "bogus", capture)).toBeNull();
-    expect(resolveOllamaRuntimeContextWindow("qwen3.6:35b", "   ", capture)).toBe(262144);
-    expect(resolveOllamaRuntimeContextWindow("other:model", null, capture)).toBeNull();
-  });
-
   it("fails Spark Ollama validation when the model is CPU-only after warmup", () => {
     const payload = JSON.stringify({ model: "qwen3.6:35b", response: "hello", done: true });
     const psOutput = JSON.stringify({
diff --git a/src/lib/inference/local.ts b/src/lib/inference/local.ts
index 8e48dea9a3..e79c8a3bd4 100644
--- a/src/lib/inference/local.ts
+++ b/src/lib/inference/local.ts
@@ -14,6 +14,16 @@ import { runCurlProbe } from "../adapters/http/probe";
 import type { ContainerRuntime } from "../platform";
 import type { CaptureResult } from "../runner";
 import { buildSubprocessEnv } from "../subprocess-env";
+import {
+  applyOllamaRuntimeContextWindow as applyOllamaRuntimeContextWindowWithHost,
+  MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW,
+  parsePositiveInteger,
+  probeOllamaRuntimeModelStatus as probeOllamaRuntimeModelStatusWithHost,
+  resetOllamaRuntimeContextWindowAutoState,
+  resolveOllamaRuntimeContextWindow as resolveOllamaRuntimeContextWindowWithHost,
+} from "./ollama-runtime-context";
+import type { OllamaRuntimeModelStatus } from "./ollama-runtime-context";
+export type { OllamaRuntimeModelStatus } from "./ollama-runtime-context";
 
 const { shellQuote, runCapture, runCaptureEx } = require("../runner");
 
@@ -666,123 +676,13 @@ export function parseOllamaTags(output: string | null | undefined): string[] {
   }
 }
 
-export interface OllamaRuntimeModelStatus {
-  probed: boolean;
-  loaded: boolean;
-  cpuOnly: boolean;
-  contextLength?: number;
-  contextLengthWarning?: string;
-  processor?: string;
-  sizeVram?: number;
-}
-
-// Four million tokens is intentionally above today's practical local-model
-// context windows while still rejecting obviously broken daemon responses.
-export const MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW = 4_194_304;
-
-function normalizeOllamaModelName(value: unknown): string {
-  return String(value || "").trim();
-}
-
-export function parsePositiveInteger(value: unknown): number | null {
-  if (typeof value === "number") {
-    return Number.isSafeInteger(value) && value > 0 ? value : null;
-  }
-  const raw = String(value ?? "").trim();
-  if (!/^[1-9][0-9]*$/.test(raw)) return null;
-  const parsed = Number(raw);
-  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : null;
-}
-
-function hasExplicitContextWindow(value: unknown): boolean {
-  return String(value ?? "").trim() !== "";
-}
-
-function parseOllamaRuntimeContextLength(value: unknown): {
-  contextLength?: number;
-  warning?: string;
-} {
-  if (value === undefined || value === null || String(value).trim() === "") {
-    return {};
-  }
-  const parsed = parsePositiveInteger(value);
-  if (!parsed) {
-    return {
-      warning: `Ollama /api/ps returned a non-positive context_length (${String(value)}); ignoring it.`,
-    };
-  }
-  if (parsed > MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW) {
-    return {
-      warning:
-        `Ollama /api/ps returned context_length=${parsed}, above NemoClaw's ` +
-        `auto-detect ceiling (${MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW}); ignoring it.`,
-    };
-  }
-  return { contextLength: parsed };
-}
+export { MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW, parsePositiveInteger };
 
 export function probeOllamaRuntimeModelStatus(
   model: string,
   runCaptureImpl?: RunCaptureFn,
 ): OllamaRuntimeModelStatus {
-  const capture = runCaptureImpl ?? runCapture;
-  const host = getResolvedOllamaHost();
-  const output = capture(
-    [
-      "curl",
-      "-sf",
-      "--connect-timeout",
-      "3",
-      "--max-time",
-      "5",
-      `http://${host}:${OLLAMA_PORT}/api/ps`,
-    ],
-    { ignoreError: true },
-  );
-  if (!output) return { probed: false, loaded: false, cpuOnly: false };
-
-  try {
-    const parsed = JSON.parse(String(output || ""));
-    const models = Array.isArray(parsed?.models) ? parsed.models : [];
-    const target = normalizeOllamaModelName(model);
-    const loaded = models.find((entry: { name?: unknown; model?: unknown }) => {
-      return (
-        normalizeOllamaModelName(entry?.name) === target ||
-        normalizeOllamaModelName(entry?.model) === target
-      );
-    });
-    if (!loaded) return { probed: true, loaded: false, cpuOnly: false };
-
-    const rawSizeVram = Number((loaded as { size_vram?: unknown }).size_vram);
-    const hasSizeVram = Number.isFinite(rawSizeVram);
-    // Current Ollama /api/ps responses include context_length for loaded
-    // models; older daemons may omit it. Missing or invalid values are
-    // best-effort no-ops so onboarding falls back to the normal
-    // NEMOCLAW_CONTEXT_WINDOW/default path.
-    const contextLengthResult = parseOllamaRuntimeContextLength(
-      (loaded as { context_length?: unknown }).context_length,
-    );
-    const processor = normalizeOllamaModelName((loaded as { processor?: unknown }).processor);
-    const mentionsGpu = /\bGPU\b/i.test(processor);
-    const processorCpuOnly = /\bCPU\b/i.test(processor) && !mentionsGpu;
-    const sizeVramCpuOnly = hasSizeVram && rawSizeVram === 0 && !mentionsGpu;
-
-    return {
-      probed: true,
-      loaded: true,
-      cpuOnly: processorCpuOnly || sizeVramCpuOnly,
-      ...(contextLengthResult.contextLength
-        ? { contextLength: contextLengthResult.contextLength }
-        : {}),
-      ...(contextLengthResult.warning
-        ? { contextLengthWarning: contextLengthResult.warning }
-        : {}),
-      ...(processor ? { processor } : {}),
-      ...(hasSizeVram ? { sizeVram: rawSizeVram } : {}),
-    };
-  } catch {
-    return { probed: true, loaded: false, cpuOnly: false };
-  }
+  return probeOllamaRuntimeModelStatusWithHost(model, getResolvedOllamaHost, runCaptureImpl);
 }
 
 export function resolveOllamaRuntimeContextWindow(
@@ -790,46 +690,18 @@ export function resolveOllamaRuntimeContextWindow(
   currentContextWindow: string | null | undefined = null,
   runCaptureImpl?: RunCaptureFn,
 ): number | null {
-  if (hasExplicitContextWindow(currentContextWindow)) return null;
-  const runtimeStatus = probeOllamaRuntimeModelStatus(model, runCaptureImpl);
-  return runtimeStatus.loaded ? (runtimeStatus.contextLength ?? null) : null;
+  return resolveOllamaRuntimeContextWindowWithHost(
+    model,
+    currentContextWindow,
+    getResolvedOllamaHost,
+    runCaptureImpl,
+  );
 }
 
-let autoDetectedOllamaContextWindow: string | null = null;
-
-export function resetOllamaRuntimeContextWindowAutoState(): void {
-  autoDetectedOllamaContextWindow = null;
-}
+export { resetOllamaRuntimeContextWindowAutoState };
 
 export function applyOllamaRuntimeContextWindow(selectedModel: string): void {
-  const currentContextWindow = process.env.NEMOCLAW_CONTEXT_WINDOW;
-  const currentIsPreviousAuto =
-    !!currentContextWindow &&
-    !!autoDetectedOllamaContextWindow &&
-    currentContextWindow === autoDetectedOllamaContextWindow;
-  const userContextWindow = currentIsPreviousAuto ? null : currentContextWindow;
-
-  if (hasExplicitContextWindow(userContextWindow)) {
-    console.log(`  ℹ Keeping configured context window: ${userContextWindow} tokens`);
-    return;
-  }
-
-  const runtimeStatus = probeOllamaRuntimeModelStatus(selectedModel);
-  if (runtimeStatus.contextLengthWarning) {
-    console.warn(`  ⚠ ${runtimeStatus.contextLengthWarning}`);
-  }
-  if (runtimeStatus.loaded && runtimeStatus.contextLength) {
-    const value = String(runtimeStatus.contextLength);
-    process.env.NEMOCLAW_CONTEXT_WINDOW = value;
-    autoDetectedOllamaContextWindow = value;
-    console.log(`  ✓ Using Ollama runtime context length: ${value} tokens`);
-    return;
-  }
-
-  if (currentIsPreviousAuto) {
-    delete process.env.NEMOCLAW_CONTEXT_WINDOW;
-    autoDetectedOllamaContextWindow = null;
-  }
+  applyOllamaRuntimeContextWindowWithHost(selectedModel, getResolvedOllamaHost);
 }
 
 function formatOllamaCpuOnlyDiagnostic(model: string, status: OllamaRuntimeModelStatus): string {
diff --git a/src/lib/inference/ollama-runtime-context.test.ts b/src/lib/inference/ollama-runtime-context.test.ts
new file mode 100644
index 0000000000..62e9451bc4
--- /dev/null
+++ b/src/lib/inference/ollama-runtime-context.test.ts
@@ -0,0 +1,141 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+import { afterEach, describe, expect, it } from "vitest";
+
+import {
+  applyOllamaRuntimeContextWindow,
+  parseOllamaRuntimeContextLength,
+  probeOllamaRuntimeModelStatus,
+  resetOllamaRuntimeContextWindowAutoState,
+  resolveOllamaRuntimeContextWindow,
+} from "../../../dist/lib/inference/ollama-runtime-context";
+
+const getOllamaHost = () => "127.0.0.1";
+
+describe("Ollama runtime context helpers", () => {
+  afterEach(() => {
+    resetOllamaRuntimeContextWindowAutoState();
+  });
+
+  it("parses valid Ollama /api/ps context lengths", () => {
+    expect(parseOllamaRuntimeContextLength(262144)).toEqual({ contextLength: 262144 });
+    expect(parseOllamaRuntimeContextLength("262144")).toEqual({ contextLength: 262144 });
+  });
+
+  it("treats omitted Ollama /api/ps context lengths as compatibility no-ops", () => {
+    expect(parseOllamaRuntimeContextLength(undefined)).toEqual({});
+    expect(parseOllamaRuntimeContextLength(null)).toEqual({});
+    expect(parseOllamaRuntimeContextLength("   ")).toEqual({});
+
+    const status = probeOllamaRuntimeModelStatus(
+      "qwen3.6:35b",
+      getOllamaHost,
+      () => JSON.stringify({ models: [{ name: "qwen3.6:35b", processor: "100% GPU" }] }),
+    );
+
+    expect(status.loaded).toBe(true);
+    expect(status.contextLength).toBeUndefined();
+    expect(status.contextLengthWarning).toBeUndefined();
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () =>
+        JSON.stringify({ models: [{ name: "qwen3.6:35b" }] }),
+      ),
+    ).toBeNull();
+  });
+
+  it("warns and ignores malformed or non-positive Ollama /api/ps context lengths", () => {
+    for (const value of ["bogus", "1.5", 0, -1]) {
+      const parsed = parseOllamaRuntimeContextLength(value);
+      expect(parsed.contextLength).toBeUndefined();
+      expect(parsed.warning).toContain("non-positive or malformed context_length");
+    }
+
+    const status = probeOllamaRuntimeModelStatus(
+      "qwen3.6:35b",
+      getOllamaHost,
+      () => JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: "bogus" }] }),
+    );
+
+    expect(status.loaded).toBe(true);
+    expect(status.contextLength).toBeUndefined();
+    expect(status.contextLengthWarning).toContain("non-positive or malformed context_length");
+  });
+
+  it("warns and ignores implausibly large Ollama /api/ps context lengths", () => {
+    const parsed = parseOllamaRuntimeContextLength(10_000_000);
+    expect(parsed.contextLength).toBeUndefined();
+    expect(parsed.warning).toContain("above NemoClaw's auto-detect ceiling");
+
+    const status = probeOllamaRuntimeModelStatus(
+      "qwen3.6:35b",
+      getOllamaHost,
+      () => JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }),
+    );
+
+    expect(status.loaded).toBe(true);
+    expect(status.contextLength).toBeUndefined();
+    expect(status.contextLengthWarning).toContain("above NemoClaw's auto-detect ceiling");
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () =>
+        JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }),
+      ),
+    ).toBeNull();
+  });
+
+  it("resolves runtime context length only when no explicit override is set", () => {
+    const capture = () =>
+      JSON.stringify({
+        models: [{ model: "qwen3.6:35b", context_length: "262144", processor: "100% GPU" }],
+      });
+
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, capture),
+    ).toBe(262144);
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", "131072", getOllamaHost, capture),
+    ).toBeNull();
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", "bogus", getOllamaHost, capture),
+    ).toBeNull();
+    expect(
+      resolveOllamaRuntimeContextWindow("qwen3.6:35b", "   ", getOllamaHost, capture),
+    ).toBe(262144);
+    expect(
+      resolveOllamaRuntimeContextWindow("other:model", null, getOllamaHost, capture),
+    ).toBeNull();
+  });
+
+  it("applies and clears only auto-detected context window state", () => {
+    const env: NodeJS.ProcessEnv = {};
+    const messages: string[] = [];
+    let models: Array<{ name: string; context_length?: number }> = [];
+    const options = {
+      env,
+      logger: {
+        log: (message: string) => messages.push(message),
+        warn: (message: string) => messages.push(message),
+      },
+      runCaptureImpl: () => JSON.stringify({ models }),
+    };
+
+    models = [{ name: "qwen3.6:35b", context_length: 262144 }];
+    applyOllamaRuntimeContextWindow("qwen3.6:35b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144");
+
+    models = [{ name: "qwen2.5:7b", context_length: 32768 }];
+    applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("32768");
+
+    models = [];
+    applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBeUndefined();
+
+    resetOllamaRuntimeContextWindowAutoState();
+    env.NEMOCLAW_CONTEXT_WINDOW = "262144";
+    models = [{ name: "qwen2.5:7b", context_length: 32768 }];
+    applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
+    expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144");
+    expect(messages.at(-1)).toContain("Keeping configured context window");
+  });
+});
diff --git a/src/lib/inference/ollama-runtime-context.ts b/src/lib/inference/ollama-runtime-context.ts
new file mode 100644
index 0000000000..0ef1b27b11
--- /dev/null
+++ b/src/lib/inference/ollama-runtime-context.ts
@@ -0,0 +1,219 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Ollama runtime context-window helpers.
+ *
+ * Keep this module focused on data coming from Ollama's `/api/ps` runtime
+ * boundary. Onboarding should call the narrow wrappers in `local.ts` instead
+ * of re-implementing parsing or process-env state handling.
+ */
+
+import { OLLAMA_PORT } from "../core/ports";
+
+const { runCapture } = require("../runner");
+
+export type OllamaRuntimeRunCaptureFn = (
+  cmd: string | string[],
+  opts?: { ignoreError?: boolean },
+) => string;
+
+export interface OllamaRuntimeModelStatus {
+  probed: boolean;
+  loaded: boolean;
+  cpuOnly: boolean;
+  contextLength?: number;
+  contextLengthWarning?: string;
+  processor?: string;
+  sizeVram?: number;
+}
+
+export interface ApplyOllamaRuntimeContextWindowOptions {
+  env?: NodeJS.ProcessEnv;
+  logger?: Pick<Console, "log" | "warn">;
+  runCaptureImpl?: OllamaRuntimeRunCaptureFn;
+}
+
+// Four million tokens is intentionally above today's practical local-model
+// context windows while still rejecting obviously broken daemon responses.
+export const MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW = 4_194_304;
+
+function normalizeOllamaModelName(value: unknown): string {
+  return String(value || "").trim();
+}
+
+/**
+ * Parses a positive safe integer from a number or a digit string with no leading zero, else null.
+ */
+export function parsePositiveInteger(value: unknown): number | null {
+  const positiveSafe = (n: number): number | null => (Number.isSafeInteger(n) && n > 0 ? n : null);
+  if (typeof value === "number") return positiveSafe(value);
+  const digits = String(value ?? "").trim();
+  return /^[1-9][0-9]*$/.test(digits) ? positiveSafe(Number(digits)) : null;
+}
+
+export function hasExplicitContextWindow(value: unknown): boolean {
+  return String(value ?? "").trim() !== "";
+}
+
+/**
+ * Parse Ollama `/api/ps` `context_length` defensively.
+ *
+ * Source boundary: `context_length` is produced by the user-managed Ollama
+ * daemon outside this repository. NemoClaw can validate before consuming it,
+ * but it cannot make every installed daemon report a value or enforce a
+ * stricter schema at the producer.
+ *
+ * Tolerated invalid states: older daemons omitting the field, empty values,
+ * non-integer/malformed values, non-positive values, unsafe integers, and
+ * values above NemoClaw's auto-detect ceiling. Missing values are a silent
+ * compatibility no-op; malformed or implausible values return a warning and
+ * fall back to the existing NEMOCLAW_CONTEXT_WINDOW/default path.
+ *
+ * Regression coverage lives in `ollama-runtime-context.test.ts` for omitted,
+ * malformed, non-positive, valid string/number, and over-ceiling responses.
+ * Remove this fallback once NemoClaw requires an Ollama daemon contract that
+ * always reports a validated positive integer `context_length` for loaded models.
+ * @returns `{}` when absent, `{ contextLength }` when valid, otherwise `{ warning }`.
+ */
+export function parseOllamaRuntimeContextLength(value: unknown): {
+  contextLength?: number;
+  warning?: string;
+} {
+  if (value === undefined || value === null || String(value).trim() === "") {
+    return {};
+  }
+  const parsed = parsePositiveInteger(value);
+  if (!parsed) {
+    return {
+      warning: `Ollama /api/ps returned a non-positive or malformed context_length (${String(value)}); ignoring it.`,
+    };
+  }
+  if (parsed > MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW) {
+    return {
+      warning:
+        `Ollama /api/ps returned context_length=${parsed}, above NemoClaw's ` +
+        `auto-detect ceiling (${MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW}); ignoring it.`,
+    };
+  }
+  return { contextLength: parsed };
+}
+
+/**
+ * Probes Ollama `/api/ps` and reports whether `model` is loaded and CPU-only, plus any valid
+ * context length. `probed` is false only when the request yields no output.
+ */
+export function probeOllamaRuntimeModelStatus(
+  model: string,
+  getOllamaHost: () => string,
+  runCaptureImpl?: OllamaRuntimeRunCaptureFn,
+): OllamaRuntimeModelStatus {
+  const capture = runCaptureImpl ?? runCapture;
+  const output = capture(
+    [
+      "curl",
+      "-sf",
+      "--connect-timeout",
+      "3",
+      "--max-time",
+      "5",
+      `http://${getOllamaHost()}:${OLLAMA_PORT}/api/ps`,
+    ],
+    { ignoreError: true },
+  );
+  if (!output) return { probed: false, loaded: false, cpuOnly: false };
+
+  try {
+    const parsed = JSON.parse(String(output || ""));
+    const models = Array.isArray(parsed?.models) ? parsed.models : [];
+    const target = normalizeOllamaModelName(model);
+    const loaded = models.find(
+      (entry: { name?: unknown; model?: unknown }) =>
+        normalizeOllamaModelName(entry?.name) === target ||
+        normalizeOllamaModelName(entry?.model) === target,
+    );
+    if (!loaded) return { probed: true, loaded: false, cpuOnly: false };
+
+    // Number(null) and Number("") are 0; treat null/blank size_vram as unreported, not zero VRAM.
+    const sizeVramField = (loaded as { size_vram?: unknown }).size_vram ?? "";
+    const rawSizeVram = String(sizeVramField).trim() === "" ? Number.NaN : Number(sizeVramField);
+    const hasSizeVram = Number.isFinite(rawSizeVram);
+    const { contextLength, warning } = parseOllamaRuntimeContextLength(
+      (loaded as { context_length?: unknown }).context_length,
+    );
+    const processor = normalizeOllamaModelName((loaded as { processor?: unknown }).processor);
+    const mentionsGpu = /\bGPU\b/i.test(processor);
+    const processorCpuOnly = /\bCPU\b/i.test(processor) && !mentionsGpu;
+    const sizeVramCpuOnly = hasSizeVram && rawSizeVram === 0 && !mentionsGpu;
+
+    return {
+      probed: true,
+      loaded: true,
+      cpuOnly: processorCpuOnly || sizeVramCpuOnly,
+      ...(contextLength ? { contextLength } : {}),
+      ...(warning ? { contextLengthWarning: warning } : {}),
+      ...(processor ? { processor } : {}),
+      ...(hasSizeVram ? { sizeVram: rawSizeVram } : {}),
+    };
+  } catch {
+    return { probed: true, loaded: false, cpuOnly: false };
+  }
+}
+
+export function resolveOllamaRuntimeContextWindow(
+  model: string,
+  currentContextWindow: string | null | undefined,
+  getOllamaHost: () => string,
+  runCaptureImpl?: OllamaRuntimeRunCaptureFn,
+): number | null {
+  if (hasExplicitContextWindow(currentContextWindow)) return null;
+  const runtimeStatus = probeOllamaRuntimeModelStatus(model, getOllamaHost, runCaptureImpl);
+  return runtimeStatus.loaded ? (runtimeStatus.contextLength ?? null) : null;
+}
+
+let autoDetectedOllamaContextWindow: string | null = null;
+
+export function resetOllamaRuntimeContextWindowAutoState(): void {
+  autoDetectedOllamaContextWindow = null;
+}
+
+/**
+ * Sets `NEMOCLAW_CONTEXT_WINDOW` from the selected model's Ollama runtime context length
+ * unless a value that did not come from an earlier auto-detection is already configured.
+ * A stale auto-detected value is removed when the probe finds no usable length.
+ *
+ * @param options Overrides for the environment object, logger, and capture function.
+ */
+export function applyOllamaRuntimeContextWindow(
+  selectedModel: string,
+  getOllamaHost: () => string,
+  options: ApplyOllamaRuntimeContextWindowOptions = {},
+): void {
+  const env = options.env ?? process.env;
+  const logger = options.logger ?? console;
+  const configured = env.NEMOCLAW_CONTEXT_WINDOW;
+  // A value equal to the last one this module wrote is ours to replace, not a user override.
+  const wasAutoDetected = !!configured && configured === autoDetectedOllamaContextWindow;
+
+  if (!wasAutoDetected && hasExplicitContextWindow(configured)) {
+    logger.log(`  ℹ Keeping configured context window: ${configured} tokens`);
+    return;
+  }
+
+  const { loaded, contextLength, contextLengthWarning } = probeOllamaRuntimeModelStatus(
+    selectedModel,
+    getOllamaHost,
+    options.runCaptureImpl,
+  );
+  if (contextLengthWarning) logger.warn(`  ⚠ ${contextLengthWarning}`);
+
+  if (loaded && contextLength) {
+    const detected = String(contextLength);
+    env.NEMOCLAW_CONTEXT_WINDOW = detected;
+    autoDetectedOllamaContextWindow = detected;
+    logger.log(`  ✓ Using Ollama runtime context length: ${detected} tokens`);
+  } else if (wasAutoDetected) {
+    delete env.NEMOCLAW_CONTEXT_WINDOW;
+    autoDetectedOllamaContextWindow = null;
+  }
+}

From 73aa46ddef94d37df309d7fd148c2eb5bb687040 Mon Sep 17 00:00:00 2001
From: Carlos Villela <cvillela@nvidia.com>
Date: Tue, 26 May 2026 21:16:47 -0700
Subject: [PATCH 6/6] test(inference): align Ollama runtime mock field

---
 src/lib/inference/ollama-runtime-context.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib/inference/ollama-runtime-context.test.ts b/src/lib/inference/ollama-runtime-context.test.ts
index 62e9451bc4..7cee94a59f 100644
--- a/src/lib/inference/ollama-runtime-context.test.ts
+++ b/src/lib/inference/ollama-runtime-context.test.ts
@@ -86,7 +86,7 @@ describe("Ollama runtime context helpers", () => {
   it("resolves runtime context length only when no explicit override is set", () => {
     const capture = () =>
       JSON.stringify({
-        models: [{ model: "qwen3.6:35b", context_length: "262144", processor: "100% GPU" }],
+        models: [{ name: "qwen3.6:35b", context_length: "262144", processor: "100% GPU" }],
       });
 
     expect(