NVIDIA · cv · May 8, 2026 · May 7, 2026 · May 7, 2026 · May 7, 2026
diff --git a/src/lib/actions/sandbox/destroy.ts b/src/lib/actions/sandbox/destroy.ts
@@ -85,14 +85,19 @@ function cleanupSandboxServices(
   { stopHostServices = false }: { stopHostServices?: boolean } = {},
 ): void {
   if (stopHostServices) {
+    // `stopAll()` already runs `unloadOllamaModels()` unconditionally —
+    // see src/lib/services.ts. Don't double-call here.
     const { stopAll } = require("../../services");
     stopAll({ sandboxName });
-  }
-
-  const sb = registry.getSandbox(sandboxName);
-  if (sb?.provider?.includes("ollama")) {
-    const { unloadOllamaModels } = require("../../onboard-ollama-proxy");
-    unloadOllamaModels();
+  } else {
+    // No global stop, so `stopAll()` did not run; explicitly free Ollama
+    // models for this sandbox if its provider used Ollama. Without this
+    // branch a single-sandbox destroy would leave models loaded on the GPU.
+    const sb = registry.getSandbox(sandboxName);
+    if (sb?.provider?.includes("ollama")) {
+      const { unloadOllamaModels } = require("../../onboard-ollama-proxy");
+      unloadOllamaModels();
+    }
   }
 
   try {
@@ -235,9 +240,13 @@ export async function destroySandbox(
     nim.stopNimContainer(sandboxName, { silent: true });
   }
 
+  // The Ollama auth proxy is per-sandbox and only spawned when the provider
+  // is Ollama, so this guard scopes only `killStaleProxy()`. GPU unload is
+  // handled separately by `cleanupSandboxServices` above (which routes
+  // through `stopAll()` or directly into `unloadOllamaModels()` based on
+  // whether host services are being torn down).
   if (sb?.provider?.includes("ollama")) {
-    const { unloadOllamaModels, killStaleProxy } = require("../../onboard-ollama-proxy");
-    unloadOllamaModels();
+    const { killStaleProxy } = require("../../onboard-ollama-proxy");
     killStaleProxy();
   }
 

diff --git a/src/lib/onboard-ollama-proxy.ts b/src/lib/onboard-ollama-proxy.ts
@@ -9,7 +9,6 @@ const fs = require("fs");
 const os = require("os");
 const path = require("path");
 const { spawn, spawnSync } = require("child_process");
-const http = require("http");
 const { ROOT, SCRIPTS, run, runCapture, shellQuote } = require("./runner");
 const { OLLAMA_PORT, OLLAMA_PROXY_PORT } = require("./core/ports");
 const {
@@ -631,56 +630,47 @@ async function prepareOllamaModel(model, installedModels = []) {
 /**
  * Unload all running Ollama models from GPU memory.
  * Best-effort operation: silently ignores errors if Ollama is not running.
+ *
+ * Uses `spawnSync` with `curl --max-time 3` rather than Node's
+ * `http.request`/`http.get` so the unload completes before
+ * `process.exit()`. The previous async version was fire-and-forget and got
+ * dropped by Node's event loop on fast CLI exit (e.g. `nemoclaw destroy`),
+ * leaving GPU memory reserved. Reverting to async HTTP would reintroduce
+ * that race; keep it synchronous.
  */
 function unloadOllamaModels() {
   try {
-    const req = http.get(
-      {
-        hostname: "localhost",
-        port: OLLAMA_PORT,
-        path: "/api/ps",
-        timeout: 3000,
-      },
-      (res) => {
-        let data = "";
-        res.on("data", (chunk) => {
-          data += chunk;
-        });
-        res.on("end", () => {
-          if (res.statusCode !== 200) return;
-          try {
-            const parsed = JSON.parse(data);
-            const models = parsed.models || [];
-            for (const entry of models) {
-              if (!entry.name) continue;
-              const unloadReq = http.request(
-                {
-                  hostname: "localhost",
-                  port: OLLAMA_PORT,
-                  path: "/api/generate",
-                  method: "POST",
-                  timeout: 3000,
-                  headers: { "Content-Type": "application/json" },
-                },
-                () => {
-                  /* ignore response */
-                },
-              );
-              unloadReq.on("error", () => {
-                /* best-effort */
-              });
-              unloadReq.write(JSON.stringify({ model: entry.name, keep_alive: 0 }));
-              unloadReq.end();
-            }
-          } catch {
-            /* best-effort */
-          }
-        });
-      },
+    const psResult = spawnSync(
+      "curl",
+      ["-sS", "--max-time", "3", `http://localhost:${OLLAMA_PORT}/api/ps`],
+      { encoding: "utf8" },
     );
-    req.on("error", () => {
-      /* best-effort */
-    });
+    if (psResult.status !== 0) return;
+
+    const parsed = JSON.parse(psResult.stdout || "{}");
+    const models = Array.isArray(parsed.models) ? parsed.models : [];
+
+    for (const entry of models) {
+      if (!entry?.name) continue;
+      spawnSync(
+        "curl",
+        [
+          "-sS",
+          "-o",
+          "/dev/null",
+          "--max-time",
+          "3",
+          "-X",
+          "POST",
+          "-H",
+          "Content-Type: application/json",
+          "-d",
+          JSON.stringify({ model: entry.name, keep_alive: 0 }),
+          `http://localhost:${OLLAMA_PORT}/api/generate`,
+        ],
+        { encoding: "utf8" },
+      );
+    }
   } catch {
     /* best-effort */
   }

diff --git a/src/lib/services.test.ts b/src/lib/services.test.ts
@@ -2,14 +2,23 @@
 // SPDX-License-Identifier: Apache-2.0
 
 import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
-import http from "node:http";
+import childProcess, { type SpawnSyncReturns } from "node:child_process";
 import { mkdtempSync, writeFileSync, existsSync, rmSync } from "node:fs";
-import { join } from "node:path";
+import { join, resolve } from "node:path";
 import { tmpdir } from "node:os";
 
 // Import from compiled dist/ so coverage is attributed correctly.
 import { getServiceStatuses, showStatus, stopAll } from "../../dist/lib/services";
 
+const ollamaProxyDistPath = resolve(
+  import.meta.dirname,
+  "..",
+  "..",
+  "dist",
+  "lib",
+  "onboard-ollama-proxy.js",
+);
+
 describe("getServiceStatuses", () => {
   let pidDir: string;
 
@@ -116,16 +125,41 @@ describe("showStatus", () => {
 
 describe("stopAll", () => {
   let pidDir: string;
-  let httpGetSpy: ReturnType<typeof vi.spyOn>;
+  let spawnSyncCalls: Array<{ command: string; args: readonly string[] }>;
+  let originalSpawnSync: typeof childProcess.spawnSync;
 
   beforeEach(() => {
     pidDir = mkdtempSync(join(tmpdir(), "nemoclaw-svc-test-"));
-    const request = { on: vi.fn(() => request) };
-    httpGetSpy = vi.spyOn(http, "get").mockImplementation((() => request) as any);
+    spawnSyncCalls = [];
+    originalSpawnSync = childProcess.spawnSync;
+    // @ts-expect-error — partial mock signature is intentional.
+    childProcess.spawnSync = (command: string, args: readonly string[]) => {
+      spawnSyncCalls.push({ command, args });
+      const reply: SpawnSyncReturns<string> = {
+        pid: 0,
+        output: ["", "", ""],
+        stdout: "",
+        stderr: "",
+        status: 0,
+        signal: null,
+      };
+      // Return an empty model list so the unload's for-loop is a no-op.
+      if (command === "curl" && args.some((a) => a.endsWith("/api/ps"))) {
+        reply.stdout = JSON.stringify({ models: [] });
+        reply.output = ["", reply.stdout, ""];
+      }
+      return reply;
+    };
+    // The dist `onboard-ollama-proxy` module destructures `spawnSync` at
+    // require time, so to make `stopAll` pick up the patched function we
+    // bust its cache. `services.ts` requires the proxy lazily, so the
+    // next call sees the freshly-loaded module.
+    delete require.cache[require.resolve(ollamaProxyDistPath)];
   });
 
   afterEach(() => {
-    httpGetSpy.mockRestore();
+    childProcess.spawnSync = originalSpawnSync;
+    delete require.cache[require.resolve(ollamaProxyDistPath)];
     rmSync(pidDir, { recursive: true, force: true });
   });
 
@@ -159,9 +193,12 @@ describe("stopAll", () => {
     stopAll({ pidDir });
     logSpy.mockRestore();
 
-    expect(httpGetSpy).toHaveBeenCalledWith(
-      expect.objectContaining({ hostname: "localhost", port: 11434, path: "/api/ps" }),
-      expect.any(Function),
+    const psCall = spawnSyncCalls.find(
+      (c) =>
+        c.command === "curl" &&
+        c.args.some((a) => a.endsWith("/api/ps")),
     );
+    expect(psCall).toBeDefined();
+    expect(psCall?.args).toContain("--max-time");
   });
 });