Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions src/lib/actions/sandbox/destroy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,19 @@ function cleanupSandboxServices(
{ stopHostServices = false }: { stopHostServices?: boolean } = {},
): void {
if (stopHostServices) {
// `stopAll()` already runs `unloadOllamaModels()` unconditionally —
// see src/lib/services.ts. Don't double-call here.
const { stopAll } = require("../../services");
stopAll({ sandboxName });
}

const sb = registry.getSandbox(sandboxName);
if (sb?.provider?.includes("ollama")) {
const { unloadOllamaModels } = require("../../onboard-ollama-proxy");
unloadOllamaModels();
} else {
// No global stop, so `stopAll()` did not run; explicitly free Ollama
// models for this sandbox if its provider used Ollama. Without this
// branch a single-sandbox destroy would leave models loaded on the GPU.
const sb = registry.getSandbox(sandboxName);
if (sb?.provider?.includes("ollama")) {
const { unloadOllamaModels } = require("../../onboard-ollama-proxy");
unloadOllamaModels();
}
}

try {
Expand Down Expand Up @@ -235,9 +240,13 @@ export async function destroySandbox(
nim.stopNimContainer(sandboxName, { silent: true });
}

// The Ollama auth proxy is per-sandbox and only spawned when the provider
// is Ollama, so this guard scopes only `killStaleProxy()`. GPU unload is
// handled separately by `cleanupSandboxServices` above (which routes
// through `stopAll()` or directly into `unloadOllamaModels()` based on
// whether host services are being torn down).
if (sb?.provider?.includes("ollama")) {
const { unloadOllamaModels, killStaleProxy } = require("../../onboard-ollama-proxy");
unloadOllamaModels();
const { killStaleProxy } = require("../../onboard-ollama-proxy");
killStaleProxy();
}

Expand Down
84 changes: 37 additions & 47 deletions src/lib/onboard-ollama-proxy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ const fs = require("fs");
const os = require("os");
const path = require("path");
const { spawn, spawnSync } = require("child_process");
const http = require("http");
const { ROOT, SCRIPTS, run, runCapture, shellQuote } = require("./runner");
const { OLLAMA_PORT, OLLAMA_PROXY_PORT } = require("./core/ports");
const {
Expand Down Expand Up @@ -631,56 +630,47 @@ async function prepareOllamaModel(model, installedModels = []) {
/**
* Unload all running Ollama models from GPU memory.
* Best-effort operation: silently ignores errors if Ollama is not running.
*
* Uses `spawnSync` with `curl --max-time 3` rather than Node's
* `http.request`/`http.get` so the unload completes before
* `process.exit()`. The previous async version was fire-and-forget and got
* dropped by Node's event loop on fast CLI exit (e.g. `nemoclaw destroy`),
* leaving GPU memory reserved. Reverting to async HTTP would reintroduce
* that race; keep it synchronous.
*/
function unloadOllamaModels() {
try {
const req = http.get(
{
hostname: "localhost",
port: OLLAMA_PORT,
path: "/api/ps",
timeout: 3000,
},
(res) => {
let data = "";
res.on("data", (chunk) => {
data += chunk;
});
res.on("end", () => {
if (res.statusCode !== 200) return;
try {
const parsed = JSON.parse(data);
const models = parsed.models || [];
for (const entry of models) {
if (!entry.name) continue;
const unloadReq = http.request(
{
hostname: "localhost",
port: OLLAMA_PORT,
path: "/api/generate",
method: "POST",
timeout: 3000,
headers: { "Content-Type": "application/json" },
},
() => {
/* ignore response */
},
);
unloadReq.on("error", () => {
/* best-effort */
});
unloadReq.write(JSON.stringify({ model: entry.name, keep_alive: 0 }));
unloadReq.end();
}
} catch {
/* best-effort */
}
});
},
const psResult = spawnSync(
"curl",
["-sS", "--max-time", "3", `http://localhost:${OLLAMA_PORT}/api/ps`],
{ encoding: "utf8" },
);
req.on("error", () => {
/* best-effort */
});
if (psResult.status !== 0) return;

const parsed = JSON.parse(psResult.stdout || "{}");
const models = Array.isArray(parsed.models) ? parsed.models : [];

for (const entry of models) {
if (!entry?.name) continue;
spawnSync(
"curl",
[
"-sS",
"-o",
"/dev/null",
"--max-time",
"3",
"-X",
"POST",
"-H",
"Content-Type: application/json",
"-d",
JSON.stringify({ model: entry.name, keep_alive: 0 }),
`http://localhost:${OLLAMA_PORT}/api/generate`,
],
{ encoding: "utf8" },
);
}
} catch {
/* best-effort */
}
Expand Down
55 changes: 46 additions & 9 deletions src/lib/services.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,23 @@
// SPDX-License-Identifier: Apache-2.0

import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
import http from "node:http";
import childProcess, { type SpawnSyncReturns } from "node:child_process";
import { mkdtempSync, writeFileSync, existsSync, rmSync } from "node:fs";
import { join } from "node:path";
import { join, resolve } from "node:path";
import { tmpdir } from "node:os";

// Import from compiled dist/ so coverage is attributed correctly.
import { getServiceStatuses, showStatus, stopAll } from "../../dist/lib/services";

const ollamaProxyDistPath = resolve(
import.meta.dirname,
"..",
"..",
"dist",
"lib",
"onboard-ollama-proxy.js",
);

describe("getServiceStatuses", () => {
let pidDir: string;

Expand Down Expand Up @@ -116,16 +125,41 @@ describe("showStatus", () => {

describe("stopAll", () => {
let pidDir: string;
let httpGetSpy: ReturnType<typeof vi.spyOn>;
let spawnSyncCalls: Array<{ command: string; args: readonly string[] }>;
let originalSpawnSync: typeof childProcess.spawnSync;

beforeEach(() => {
pidDir = mkdtempSync(join(tmpdir(), "nemoclaw-svc-test-"));
const request = { on: vi.fn(() => request) };
httpGetSpy = vi.spyOn(http, "get").mockImplementation((() => request) as any);
spawnSyncCalls = [];
originalSpawnSync = childProcess.spawnSync;
// @ts-expect-error — partial mock signature is intentional.
childProcess.spawnSync = (command: string, args: readonly string[]) => {
spawnSyncCalls.push({ command, args });
const reply: SpawnSyncReturns<string> = {
pid: 0,
output: ["", "", ""],
stdout: "",
stderr: "",
status: 0,
signal: null,
};
// Return an empty model list so the unload's for-loop is a no-op.
if (command === "curl" && args.some((a) => a.endsWith("/api/ps"))) {
reply.stdout = JSON.stringify({ models: [] });
reply.output = ["", reply.stdout, ""];
}
return reply;
};
// The dist `onboard-ollama-proxy` module destructures `spawnSync` at
// require time, so to make `stopAll` pick up the patched function we
// bust its cache. `services.ts` requires the proxy lazily, so the
// next call sees the freshly-loaded module.
delete require.cache[require.resolve(ollamaProxyDistPath)];
});

afterEach(() => {
httpGetSpy.mockRestore();
childProcess.spawnSync = originalSpawnSync;
delete require.cache[require.resolve(ollamaProxyDistPath)];
rmSync(pidDir, { recursive: true, force: true });
});

Expand Down Expand Up @@ -159,9 +193,12 @@ describe("stopAll", () => {
stopAll({ pidDir });
logSpy.mockRestore();

expect(httpGetSpy).toHaveBeenCalledWith(
expect.objectContaining({ hostname: "localhost", port: 11434, path: "/api/ps" }),
expect.any(Function),
const psCall = spawnSyncCalls.find(
(c) =>
c.command === "curl" &&
c.args.some((a) => a.endsWith("/api/ps")),
);
expect(psCall).toBeDefined();
expect(psCall?.args).toContain("--max-time");
});
});
Loading
Loading