Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/inference/switch-inference-providers.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ To change these values, set the corresponding environment variables before runni
| `NEMOCLAW_AGENT_HEARTBEAT_EVERY` | Go-style duration (`30m`, `1h`, `0m` to disable) | `unset` (OpenClaw default) |

Invalid values are ignored, and the default is baked into the image instead.
For Local Ollama, onboarding loads the selected model first and uses Ollama's reported runtime context length when `NEMOCLAW_CONTEXT_WINDOW` is unset.
Use `NEMOCLAW_INFERENCE_INPUTS=text,image` only for a model that accepts image input through the selected provider.

```console
Expand Down
1 change: 1 addition & 0 deletions docs/inference/use-local-inference.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ NemoClaw lists installed models or offers starter models if none are installed.
On hosts where the larger starter models fit the currently available GPU memory, the starter list includes `qwen3.6:35b` and selects it by default.
When another GPU workload is using most of the memory at onboard time, NemoClaw downgrades the menu to the largest model that still fits.
It pulls the selected model, loads it into memory, and validates it before continuing.
When Ollama reports a loaded-model context length, NemoClaw uses that value for the `contextWindow` baked into `openclaw.json` unless you set `NEMOCLAW_CONTEXT_WINDOW` yourself.
If the selected model declares that it does not support tool calling, onboarding stops with guidance to choose a model whose `ollama show <model>` capabilities include `tools`.
The validation also requires structured chat-completions tool calls.
If the model leaks tool-call JSON as plain message text, onboarding stops so you can choose a model that returns tool calls in the expected response field.
Expand Down
20 changes: 2 additions & 18 deletions src/lib/inference/local.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ import {
getOllamaWarmupCommand,
parseOllamaList,
parseOllamaTags,
probeOllamaRuntimeModelStatus,
probeLocalProviderHealth,
validateOllamaModel,
validateLocalProvider,
Expand Down Expand Up @@ -654,6 +653,7 @@ describe("local inference helpers", () => {
null,
{ type: "nvidia", totalMemoryMB: 131_072, availableMemoryMB: 131_072 },
log,
() => "",
),
).toBe(QWEN3_6_OLLAMA_MODEL);
});
Expand Down Expand Up @@ -686,6 +686,7 @@ describe("local inference helpers", () => {
null,
{ type: "nvidia", totalMemoryMB: 16_384, availableMemoryMB: 4_000 },
log,
() => "",
),
).toBe("qwen2.5:7b");
expect(messages.some((m) => m.includes("No known Ollama bootstrap model fits"))).toBe(true);
Expand Down Expand Up @@ -793,23 +794,6 @@ describe("local inference helpers", () => {
expect(validateOllamaModel("nemotron-3-nano:30b", () => "ok", undefined, captureEx)).toEqual({ ok: true });
});

// Feeds probeOllamaRuntimeModelStatus a stubbed /api/ps payload in which the
// target model reports zero VRAM and a CPU-only processor string, and checks
// the full status object that comes back.
it("parses Ollama runtime status from /api/ps", () => {
// Stub capture: returns the JSON body instead of running a real probe.
const capture = () =>
JSON.stringify({
models: [
{ name: "qwen3.6:35b", size_vram: 0, processor: "100% CPU" },
],
});

// size_vram and processor are expected to surface as sizeVram and processor,
// with cpuOnly flagged true for this payload.
expect(probeOllamaRuntimeModelStatus("qwen3.6:35b", capture)).toEqual({
probed: true,
loaded: true,
cpuOnly: true,
processor: "100% CPU",
sizeVram: 0,
});
});

it("fails Spark Ollama validation when the model is CPU-only after warmup", () => {
const payload = JSON.stringify({ model: "qwen3.6:35b", response: "hello", done: true });
const psOutput = JSON.stringify({
Expand Down
84 changes: 30 additions & 54 deletions src/lib/inference/local.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ import { runCurlProbe } from "../adapters/http/probe";
import type { ContainerRuntime } from "../platform";
import type { CaptureResult } from "../runner";
import { buildSubprocessEnv } from "../subprocess-env";
import {
applyOllamaRuntimeContextWindow as applyOllamaRuntimeContextWindowWithHost,
MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW,
parsePositiveInteger,
probeOllamaRuntimeModelStatus as probeOllamaRuntimeModelStatusWithHost,
resetOllamaRuntimeContextWindowAutoState,
resolveOllamaRuntimeContextWindow as resolveOllamaRuntimeContextWindowWithHost,
} from "./ollama-runtime-context";
import type { OllamaRuntimeModelStatus } from "./ollama-runtime-context";
export type { OllamaRuntimeModelStatus } from "./ollama-runtime-context";

const { shellQuote, runCapture, runCaptureEx } = require("../runner");

Expand Down Expand Up @@ -666,67 +676,32 @@ export function parseOllamaTags(output: string | null | undefined): string[] {
}
}

/**
 * Status of a single model as reported by a probe of Ollama's `/api/ps`
 * endpoint. Field notes below describe how the probe in this file fills them.
 */
export interface OllamaRuntimeModelStatus {
// False when the `/api/ps` request produced no output; true otherwise
// (including when the output could not be parsed as JSON).
probed: boolean;
// True when an entry whose `name` or `model` equals the requested model was found.
loaded: boolean;
// True when the processor string mentions CPU but not GPU, or when
// `size_vram` is exactly 0 and the processor string does not mention GPU.
cpuOnly: boolean;
// Trimmed `processor` string of the matching entry; omitted when empty.
processor?: string;
// Numeric `size_vram` of the matching entry; omitted when it is not a finite number.
sizeVram?: number;
}

/**
 * Coerces an arbitrary value into a trimmed string for model-name comparison.
 *
 * Any falsy input (`null`, `undefined`, `""`, `0`, `false`, `NaN`) collapses
 * to the empty string; everything else is stringified and then trimmed.
 */
function normalizeOllamaModelName(value: unknown): string {
  const text = value ? String(value) : "";
  return text.trim();
}
export { MAX_AUTODETECTED_OLLAMA_CONTEXT_WINDOW, parsePositiveInteger };

export function probeOllamaRuntimeModelStatus(
model: string,
runCaptureImpl?: RunCaptureFn,
): OllamaRuntimeModelStatus {
const capture = runCaptureImpl ?? runCapture;
const host = getResolvedOllamaHost();
const output = capture(
[
"curl",
"-sf",
"--connect-timeout",
"3",
"--max-time",
"5",
`http://${host}:${OLLAMA_PORT}/api/ps`,
],
{ ignoreError: true },
);
if (!output) return { probed: false, loaded: false, cpuOnly: false };
return probeOllamaRuntimeModelStatusWithHost(model, getResolvedOllamaHost, runCaptureImpl);
}

try {
const parsed = JSON.parse(String(output || ""));
const models = Array.isArray(parsed?.models) ? parsed.models : [];
const target = normalizeOllamaModelName(model);
const loaded = models.find((entry: { name?: unknown; model?: unknown }) => {
return (
normalizeOllamaModelName(entry?.name) === target ||
normalizeOllamaModelName(entry?.model) === target
);
});
if (!loaded) return { probed: true, loaded: false, cpuOnly: false };
/**
 * Thin wrapper around `resolveOllamaRuntimeContextWindowWithHost` (imported
 * from `./ollama-runtime-context`) that supplies this module's
 * `getResolvedOllamaHost` as the host resolver.
 *
 * @param model Model name, forwarded unchanged.
 * @param currentContextWindow Currently configured context window, forwarded
 *   unchanged; defaults to `null` when omitted.
 * @param runCaptureImpl Optional capture implementation, forwarded unchanged.
 * @returns Whatever the underlying helper returns: a number or `null`.
 */
export function resolveOllamaRuntimeContextWindow(
model: string,
currentContextWindow: string | null | undefined = null,
runCaptureImpl?: RunCaptureFn,
): number | null {
return resolveOllamaRuntimeContextWindowWithHost(
model,
currentContextWindow,
getResolvedOllamaHost,
runCaptureImpl,
);
}

const rawSizeVram = Number((loaded as { size_vram?: unknown }).size_vram);
const hasSizeVram = Number.isFinite(rawSizeVram);
const processor = normalizeOllamaModelName((loaded as { processor?: unknown }).processor);
const mentionsGpu = /\bGPU\b/i.test(processor);
const processorCpuOnly = /\bCPU\b/i.test(processor) && !mentionsGpu;
const sizeVramCpuOnly = hasSizeVram && rawSizeVram === 0 && !mentionsGpu;
export { resetOllamaRuntimeContextWindowAutoState };

return {
probed: true,
loaded: true,
cpuOnly: processorCpuOnly || sizeVramCpuOnly,
...(processor ? { processor } : {}),
...(hasSizeVram ? { sizeVram: rawSizeVram } : {}),
};
} catch {
return { probed: true, loaded: false, cpuOnly: false };
}
/**
 * Thin wrapper around `applyOllamaRuntimeContextWindowWithHost` (imported from
 * `./ollama-runtime-context`) that supplies this module's
 * `getResolvedOllamaHost` as the host resolver. No options object is passed,
 * so the underlying helper runs with its own defaults.
 *
 * @param selectedModel Model name, forwarded unchanged.
 */
export function applyOllamaRuntimeContextWindow(selectedModel: string): void {
applyOllamaRuntimeContextWindowWithHost(selectedModel, getResolvedOllamaHost);
}

function formatOllamaCpuOnlyDiagnostic(model: string, status: OllamaRuntimeModelStatus): string {
Expand Down Expand Up @@ -796,6 +771,7 @@ export function resolveNonInteractiveOllamaModel(
recoveredModel: string | null,
gpu: GpuInfo | null,
log: (message: string) => void = (m) => console.warn(m),
runCaptureImpl?: RunCaptureFn,
): string {
const explicit = requestedModel || recoveredModel;
if (explicit && !modelFitsAvailableMemory(explicit, gpu)) {
Expand All @@ -812,7 +788,7 @@ export function resolveNonInteractiveOllamaModel(
if (!explicit && !anyRegistryModelFits(gpu)) {
warnNoBootstrapModelFits(gpu, log);
}
return explicit || getDefaultOllamaModel(gpu);
return explicit || getDefaultOllamaModel(gpu, runCaptureImpl);
}

function warnNoBootstrapModelFits(
Expand Down
141 changes: 141 additions & 0 deletions src/lib/inference/ollama-runtime-context.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

import { afterEach, describe, expect, it } from "vitest";

import {
applyOllamaRuntimeContextWindow,
parseOllamaRuntimeContextLength,
probeOllamaRuntimeModelStatus,
resetOllamaRuntimeContextWindowAutoState,
resolveOllamaRuntimeContextWindow,
} from "../../../dist/lib/inference/ollama-runtime-context";

// Fixed host resolver handed to the helpers under test in place of a real lookup.
const getOllamaHost = () => "127.0.0.1";

describe("Ollama runtime context helpers", () => {
// Reset the module's auto-detect state after every test
// (presumably so state recorded by one case cannot leak into the next).
afterEach(() => {
resetOllamaRuntimeContextWindowAutoState();
});

// A positive integer is accepted both as a number and as a numeric string,
// and comes back as a numeric contextLength either way.
it("parses valid Ollama /api/ps context lengths", () => {
expect(parseOllamaRuntimeContextLength(262144)).toEqual({ contextLength: 262144 });
expect(parseOllamaRuntimeContextLength("262144")).toEqual({ contextLength: 262144 });
});

// A missing context_length is not an error: the parser yields an empty result,
// the probe reports neither a length nor a warning, and resolution returns null.
it("treats omitted Ollama /api/ps context lengths as compatibility no-ops", () => {
// undefined, null and whitespace-only input all produce an empty object.
expect(parseOllamaRuntimeContextLength(undefined)).toEqual({});
expect(parseOllamaRuntimeContextLength(null)).toEqual({});
expect(parseOllamaRuntimeContextLength(" ")).toEqual({});

// Stubbed /api/ps entry for the model that carries no context_length field.
const status = probeOllamaRuntimeModelStatus(
"qwen3.6:35b",
getOllamaHost,
() => JSON.stringify({ models: [{ name: "qwen3.6:35b", processor: "100% GPU" }] }),
);

expect(status.loaded).toBe(true);
expect(status.contextLength).toBeUndefined();
expect(status.contextLengthWarning).toBeUndefined();
// With no reported length there is nothing to resolve.
expect(
resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () =>
JSON.stringify({ models: [{ name: "qwen3.6:35b" }] }),
),
).toBeNull();
});

// Non-numeric, fractional, zero and negative values are rejected: no
// contextLength is produced and a warning describes the problem.
it("warns and ignores malformed or non-positive Ollama /api/ps context lengths", () => {
for (const value of ["bogus", "1.5", 0, -1]) {
const parsed = parseOllamaRuntimeContextLength(value);
expect(parsed.contextLength).toBeUndefined();
expect(parsed.warning).toContain("non-positive or malformed context_length");
}

// The same warning is expected on the probe status when /api/ps reports a bad value.
const status = probeOllamaRuntimeModelStatus(
"qwen3.6:35b",
getOllamaHost,
() => JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: "bogus" }] }),
);

expect(status.loaded).toBe(true);
expect(status.contextLength).toBeUndefined();
expect(status.contextLengthWarning).toContain("non-positive or malformed context_length");
});

// A value of 10,000,000 is treated as above the auto-detect ceiling: it is
// dropped with a warning by the parser and the probe, and resolution returns null.
it("warns and ignores implausibly large Ollama /api/ps context lengths", () => {
const parsed = parseOllamaRuntimeContextLength(10_000_000);
expect(parsed.contextLength).toBeUndefined();
expect(parsed.warning).toContain("above NemoClaw's auto-detect ceiling");

// Same oversized value delivered through a stubbed /api/ps response.
const status = probeOllamaRuntimeModelStatus(
"qwen3.6:35b",
getOllamaHost,
() => JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }),
);

expect(status.loaded).toBe(true);
expect(status.contextLength).toBeUndefined();
expect(status.contextLengthWarning).toContain("above NemoClaw's auto-detect ceiling");
expect(
resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, () =>
JSON.stringify({ models: [{ name: "qwen3.6:35b", context_length: 10_000_000 }] }),
),
).toBeNull();
});

// Exercises resolveOllamaRuntimeContextWindow against one stubbed /api/ps
// payload while varying the current override and the requested model.
it("resolves runtime context length only when no explicit override is set", () => {
const capture = () =>
JSON.stringify({
models: [{ name: "qwen3.6:35b", context_length: "262144", processor: "100% GPU" }],
});

// No override: the reported length is returned as a number.
expect(
resolveOllamaRuntimeContextWindow("qwen3.6:35b", null, getOllamaHost, capture),
).toBe(262144);
// Any non-blank override, valid or not, suppresses auto-detection.
expect(
resolveOllamaRuntimeContextWindow("qwen3.6:35b", "131072", getOllamaHost, capture),
).toBeNull();
expect(
resolveOllamaRuntimeContextWindow("qwen3.6:35b", "bogus", getOllamaHost, capture),
).toBeNull();
// A whitespace-only override counts as unset.
expect(
resolveOllamaRuntimeContextWindow("qwen3.6:35b", " ", getOllamaHost, capture),
).toBe(262144);
// A model that is absent from the payload resolves to null.
expect(
resolveOllamaRuntimeContextWindow("other:model", null, getOllamaHost, capture),
).toBeNull();
});
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// Walks applyOllamaRuntimeContextWindow through a sequence of calls that share
// one env object; the steps are order-dependent and must stay in this order.
it("applies and clears only auto-detected context window state", () => {
const env: NodeJS.ProcessEnv = {};
const messages: string[] = [];
// Reassigned between steps; the capture stub below reads the current value on each call.
let models: Array<{ name: string; context_length?: number }> = [];
const options = {
env,
logger: {
log: (message: string) => messages.push(message),
warn: (message: string) => messages.push(message),
},
runCaptureImpl: () => JSON.stringify({ models }),
};

// Step 1: a reported length is written into the injected env.
models = [{ name: "qwen3.6:35b", context_length: 262144 }];
applyOllamaRuntimeContextWindow("qwen3.6:35b", getOllamaHost, options);
expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144");

// Step 2: switching models replaces the previously auto-detected value.
models = [{ name: "qwen2.5:7b", context_length: 32768 }];
applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("32768");

// Step 3: when nothing is reported, the auto-detected value is removed.
models = [];
applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
expect(env.NEMOCLAW_CONTEXT_WINDOW).toBeUndefined();

// Step 4: after a reset, a value already present in env is left alone
// and a "Keeping configured context window" message is logged.
resetOllamaRuntimeContextWindowAutoState();
env.NEMOCLAW_CONTEXT_WINDOW = "262144";
models = [{ name: "qwen2.5:7b", context_length: 32768 }];
applyOllamaRuntimeContextWindow("qwen2.5:7b", getOllamaHost, options);
expect(env.NEMOCLAW_CONTEXT_WINDOW).toBe("262144");
expect(messages.at(-1)).toContain("Keeping configured context window");
});
});
Loading
Loading