diff --git a/src/lib/actions/sandbox/destroy.ts b/src/lib/actions/sandbox/destroy.ts index a9383f2235..125d99abde 100644 --- a/src/lib/actions/sandbox/destroy.ts +++ b/src/lib/actions/sandbox/destroy.ts @@ -85,14 +85,19 @@ function cleanupSandboxServices( { stopHostServices = false }: { stopHostServices?: boolean } = {}, ): void { if (stopHostServices) { + // `stopAll()` already runs `unloadOllamaModels()` unconditionally — + // see src/lib/services.ts. Don't double-call here. const { stopAll } = require("../../services"); stopAll({ sandboxName }); - } - - const sb = registry.getSandbox(sandboxName); - if (sb?.provider?.includes("ollama")) { - const { unloadOllamaModels } = require("../../onboard-ollama-proxy"); - unloadOllamaModels(); + } else { + // No global stop, so `stopAll()` did not run; explicitly free Ollama + // models for this sandbox if its provider used Ollama. Without this + // branch a single-sandbox destroy would leave models loaded on the GPU. + const sb = registry.getSandbox(sandboxName); + if (sb?.provider?.includes("ollama")) { + const { unloadOllamaModels } = require("../../onboard-ollama-proxy"); + unloadOllamaModels(); + } } try { @@ -235,9 +240,13 @@ export async function destroySandbox( nim.stopNimContainer(sandboxName, { silent: true }); } + // The Ollama auth proxy is per-sandbox and only spawned when the provider + // is Ollama, so this guard scopes only `killStaleProxy()`. GPU unload is + // handled separately by `cleanupSandboxServices` above (which routes + // through `stopAll()` or directly into `unloadOllamaModels()` based on + // whether host services are being torn down). if (sb?.provider?.includes("ollama")) { - const { unloadOllamaModels, killStaleProxy } = require("../../onboard-ollama-proxy"); - unloadOllamaModels(); + const { killStaleProxy } = require("../../onboard-ollama-proxy"); killStaleProxy(); } diff --git a/src/lib/onboard-ollama-proxy.ts b/src/lib/onboard-ollama-proxy.ts index df02749a76..bf5ecf2a3a 100644 --- a/src/lib/onboard-ollama-proxy.ts +++ b/src/lib/onboard-ollama-proxy.ts @@ -9,7 +9,6 @@ const fs = require("fs"); const os = require("os"); const path = require("path"); const { spawn, spawnSync } = require("child_process"); -const http = require("http"); const { ROOT, SCRIPTS, run, runCapture, shellQuote } = require("./runner"); const { OLLAMA_PORT, OLLAMA_PROXY_PORT } = require("./core/ports"); const { @@ -631,56 +630,47 @@ async function prepareOllamaModel(model, installedModels = []) { /** * Unload all running Ollama models from GPU memory. * Best-effort operation: silently ignores errors if Ollama is not running. + * + * Uses `spawnSync` with `curl --max-time 3` rather than Node's + * `http.request`/`http.get` so the unload completes before + * `process.exit()`. The previous async version was fire-and-forget and got + * dropped by Node's event loop on fast CLI exit (e.g. `nemoclaw destroy`), + * leaving GPU memory reserved. Reverting to async HTTP would reintroduce + * that race; keep it synchronous. */ function unloadOllamaModels() { try { - const req = http.get( - { - hostname: "localhost", - port: OLLAMA_PORT, - path: "/api/ps", - timeout: 3000, - }, - (res) => { - let data = ""; - res.on("data", (chunk) => { - data += chunk; - }); - res.on("end", () => { - if (res.statusCode !== 200) return; - try { - const parsed = JSON.parse(data); - const models = parsed.models || []; - for (const entry of models) { - if (!entry.name) continue; - const unloadReq = http.request( - { - hostname: "localhost", - port: OLLAMA_PORT, - path: "/api/generate", - method: "POST", - timeout: 3000, - headers: { "Content-Type": "application/json" }, - }, - () => { - /* ignore response */ - }, - ); - unloadReq.on("error", () => { - /* best-effort */ - }); - unloadReq.write(JSON.stringify({ model: entry.name, keep_alive: 0 })); - unloadReq.end(); - } - } catch { - /* best-effort */ - } - }); - }, + const psResult = spawnSync( + "curl", + ["-sS", "--max-time", "3", `http://localhost:${OLLAMA_PORT}/api/ps`], + { encoding: "utf8" }, ); - req.on("error", () => { - /* best-effort */ - }); + if (psResult.status !== 0) return; + + const parsed = JSON.parse(psResult.stdout || "{}"); + const models = Array.isArray(parsed.models) ? parsed.models : []; + + for (const entry of models) { + if (!entry?.name) continue; + spawnSync( + "curl", + [ + "-sS", + "-o", + "/dev/null", + "--max-time", + "3", + "-X", + "POST", + "-H", + "Content-Type: application/json", + "-d", + JSON.stringify({ model: entry.name, keep_alive: 0 }), + `http://localhost:${OLLAMA_PORT}/api/generate`, + ], + { encoding: "utf8" }, + ); + } } catch { /* best-effort */ } diff --git a/src/lib/services.test.ts b/src/lib/services.test.ts index c4149014a3..4c37965de0 100644 --- a/src/lib/services.test.ts +++ b/src/lib/services.test.ts @@ -2,14 +2,23 @@ // SPDX-License-Identifier: Apache-2.0 import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; -import http from "node:http"; +import childProcess, { type SpawnSyncReturns } from "node:child_process"; import { mkdtempSync, writeFileSync, existsSync, rmSync } from "node:fs"; -import { join } from "node:path"; +import { join, resolve } from "node:path"; import { tmpdir } from "node:os"; // Import from compiled dist/ so coverage is attributed correctly. import { getServiceStatuses, showStatus, stopAll } from "../../dist/lib/services"; +const ollamaProxyDistPath = resolve( + import.meta.dirname, + "..", + "..", + "dist", + "lib", + "onboard-ollama-proxy.js", +); + describe("getServiceStatuses", () => { let pidDir: string; @@ -116,16 +125,41 @@ describe("showStatus", () => { describe("stopAll", () => { let pidDir: string; - let httpGetSpy: ReturnType; + let spawnSyncCalls: Array<{ command: string; args: readonly string[] }>; + let originalSpawnSync: typeof childProcess.spawnSync; beforeEach(() => { pidDir = mkdtempSync(join(tmpdir(), "nemoclaw-svc-test-")); - const request = { on: vi.fn(() => request) }; - httpGetSpy = vi.spyOn(http, "get").mockImplementation((() => request) as any); + spawnSyncCalls = []; + originalSpawnSync = childProcess.spawnSync; + // @ts-expect-error — partial mock signature is intentional. + childProcess.spawnSync = (command: string, args: readonly string[]) => { + spawnSyncCalls.push({ command, args }); + const reply: SpawnSyncReturns = { + pid: 0, + output: ["", "", ""], + stdout: "", + stderr: "", + status: 0, + signal: null, + }; + // Return an empty model list so the unload's for-loop is a no-op. + if (command === "curl" && args.some((a) => a.endsWith("/api/ps"))) { + reply.stdout = JSON.stringify({ models: [] }); + reply.output = ["", reply.stdout, ""]; + } + return reply; + }; + // The dist `onboard-ollama-proxy` module destructures `spawnSync` at + // require time, so to make `stopAll` pick up the patched function we + // bust its cache. `services.ts` requires the proxy lazily, so the + // next call sees the freshly-loaded module. + delete require.cache[require.resolve(ollamaProxyDistPath)]; }); afterEach(() => { - httpGetSpy.mockRestore(); + childProcess.spawnSync = originalSpawnSync; + delete require.cache[require.resolve(ollamaProxyDistPath)]; rmSync(pidDir, { recursive: true, force: true }); }); @@ -159,9 +193,12 @@ describe("stopAll", () => { stopAll({ pidDir }); logSpy.mockRestore(); - expect(httpGetSpy).toHaveBeenCalledWith( - expect.objectContaining({ hostname: "localhost", port: 11434, path: "/api/ps" }), - expect.any(Function), + const psCall = spawnSyncCalls.find( + (c) => + c.command === "curl" && + c.args.some((a) => a.endsWith("/api/ps")), ); + expect(psCall).toBeDefined(); + expect(psCall?.args).toContain("--max-time"); }); }); diff --git a/test/ollama-gpu-cleanup.test.ts b/test/ollama-gpu-cleanup.test.ts index 22671a30ef..a69031df3b 100644 --- a/test/ollama-gpu-cleanup.test.ts +++ b/test/ollama-gpu-cleanup.test.ts @@ -1,135 +1,136 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it, vi } from "vitest"; -import http from "node:http"; - -import { unloadOllamaModels } from "../dist/lib/onboard-ollama-proxy.js"; +import childProcess, { type SpawnSyncReturns } from "node:child_process"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; + +const modulePath = path.join( + import.meta.dirname, + "..", + "dist", + "lib", + "onboard-ollama-proxy.js", +); + +type SpawnCall = { command: string; args: readonly string[] }; + +function ok(stdout = ""): SpawnSyncReturns { + return { + pid: 0, + output: ["", stdout, ""], + stdout, + stderr: "", + status: 0, + signal: null, + }; +} + +function fail(stderr = "couldn't connect"): SpawnSyncReturns { + return { + pid: 0, + output: ["", "", stderr], + stdout: "", + stderr, + status: 7, + signal: null, + }; +} + +function withMockedSpawnSync( + responder: (call: SpawnCall) => SpawnSyncReturns, + fn: (calls: SpawnCall[]) => T, +): T { + const calls: SpawnCall[] = []; + const original = childProcess.spawnSync; + // @ts-expect-error — partial mock signature is intentional. + childProcess.spawnSync = (command: string, args: readonly string[]) => { + const call = { command, args }; + calls.push(call); + return responder(call); + }; + try { + delete require.cache[require.resolve(modulePath)]; + return fn(calls); + } finally { + childProcess.spawnSync = original; + delete require.cache[require.resolve(modulePath)]; + } +} describe("Ollama GPU cleanup", () => { - it("unloads all running Ollama models via the production HTTP implementation", async () => { - const mockModels = { - models: [{ name: "llama3.1:8b" }, { name: "qwen:7b" }], - }; - - const mockResponse = { - statusCode: 200, - on: vi.fn((event, handler) => { - if (event === "data") { - handler(JSON.stringify(mockModels)); - } else if (event === "end") { - handler(); + it("calls curl synchronously to unload every running model via /api/generate", () => { + withMockedSpawnSync( + ({ args }) => { + if (args.some((a) => a.endsWith("/api/ps"))) { + return ok( + JSON.stringify({ models: [{ name: "llama3.1:8b" }, { name: "qwen:7b" }] }), + ); } - return mockResponse; - }), - }; - - const mockGetRequest = { - on: vi.fn(() => mockGetRequest), - }; - - const mockUnloadRequests: Array<{ - on: ReturnType; - write: ReturnType; - end: ReturnType; - }> = []; - - const httpGetSpy = vi.spyOn(http, "get").mockImplementation(((options: any, callback: any) => { - expect(options.hostname).toBe("localhost"); - expect(options.port).toBe(11434); - expect(options.path).toBe("/api/ps"); - callback(mockResponse); - return mockGetRequest; - }) as any); - - const httpRequestSpy = vi.spyOn(http, "request").mockImplementation((( - options: any, - callback: any, - ) => { - expect(options.hostname).toBe("localhost"); - expect(options.port).toBe(11434); - expect(options.path).toBe("/api/generate"); - expect(options.method).toBe("POST"); - expect(options.headers["Content-Type"]).toBe("application/json"); - const req = { - on: vi.fn(() => req), - write: vi.fn(), - end: vi.fn(), - }; - mockUnloadRequests.push(req); - callback(); - return req; - }) as any); - - unloadOllamaModels(); - - expect(httpGetSpy).toHaveBeenCalledTimes(1); - - await new Promise((resolve) => setTimeout(resolve, 50)); - - expect(httpRequestSpy).toHaveBeenCalledTimes(2); - expect(mockUnloadRequests.map((req) => req.write.mock.calls[0]?.[0])).toEqual([ - JSON.stringify({ model: "llama3.1:8b", keep_alive: 0 }), - JSON.stringify({ model: "qwen:7b", keep_alive: 0 }), - ]); - expect(mockUnloadRequests.every((req) => req.end.mock.calls.length === 1)).toBe(true); + return ok(); + }, + (calls) => { + const { unloadOllamaModels } = require(modulePath); + unloadOllamaModels(); + + expect(calls).toHaveLength(3); + + expect(calls[0].command).toBe("curl"); + expect(calls[0].args).toContain("--max-time"); + expect(calls[0].args[calls[0].args.length - 1]).toMatch(/\/api\/ps$/); + + expect(calls[1].command).toBe("curl"); + expect(calls[1].args).toContain("-X"); + expect(calls[1].args).toContain("POST"); + expect(calls[1].args).toContain(JSON.stringify({ model: "llama3.1:8b", keep_alive: 0 })); + expect(calls[1].args[calls[1].args.length - 1]).toMatch(/\/api\/generate$/); + + expect(calls[2].args).toContain(JSON.stringify({ model: "qwen:7b", keep_alive: 0 })); + }, + ); + }); - httpGetSpy.mockRestore(); - httpRequestSpy.mockRestore(); + it("returns silently when /api/ps fails (Ollama not running)", () => { + withMockedSpawnSync( + () => fail(), + (calls) => { + const { unloadOllamaModels } = require(modulePath); + expect(() => unloadOllamaModels()).not.toThrow(); + expect(calls).toHaveLength(1); + expect(calls[0].args[calls[0].args.length - 1]).toMatch(/\/api\/ps$/); + }, + ); }); - it("handles errors gracefully when Ollama is not running", () => { - const mockGetRequest = { - on: vi.fn((event, handler) => { - if (event === "error") { - handler(new Error("Connection refused")); + it("does not unload anything when Ollama reports no loaded models", () => { + withMockedSpawnSync( + ({ args }) => { + if (args.some((a) => a.endsWith("/api/ps"))) { + return ok(JSON.stringify({ models: [] })); } - return mockGetRequest; - }), - }; - - const httpGetSpy = vi.spyOn(http, "get").mockImplementation((() => mockGetRequest) as any); - - expect(() => unloadOllamaModels()).not.toThrow(); - expect(httpGetSpy).toHaveBeenCalledTimes(1); - - httpGetSpy.mockRestore(); + return ok(); + }, + (calls) => { + const { unloadOllamaModels } = require(modulePath); + unloadOllamaModels(); + expect(calls).toHaveLength(1); + }, + ); }); - it("does not unload anything when Ollama reports no loaded models", async () => { - const mockModels = { models: [] }; - - const mockResponse = { - statusCode: 200, - on: vi.fn((event, handler) => { - if (event === "data") { - handler(JSON.stringify(mockModels)); - } else if (event === "end") { - handler(); + it("ignores malformed JSON from /api/ps without throwing", () => { + withMockedSpawnSync( + ({ args }) => { + if (args.some((a) => a.endsWith("/api/ps"))) { + return ok("not-json"); } - return mockResponse; - }), - }; - - const mockGetRequest = { - on: vi.fn(() => mockGetRequest), - }; - - const httpGetSpy = vi.spyOn(http, "get").mockImplementation(((_options: any, callback: any) => { - callback(mockResponse); - return mockGetRequest; - }) as any); - - const httpRequestSpy = vi.spyOn(http, "request"); - - unloadOllamaModels(); - - await new Promise((resolve) => setTimeout(resolve, 50)); - - expect(httpGetSpy).toHaveBeenCalledTimes(1); - expect(httpRequestSpy).not.toHaveBeenCalled(); - - httpGetSpy.mockRestore(); - httpRequestSpy.mockRestore(); + return ok(); + }, + (calls) => { + const { unloadOllamaModels } = require(modulePath); + expect(() => unloadOllamaModels()).not.toThrow(); + expect(calls).toHaveLength(1); + }, + ); }); });