From de47a857b05115ff7e21063ab4d9a1624f293060 Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Thu, 7 May 2026 12:03:30 +0000
Subject: [PATCH 1/4] refactor(sandbox): dedupe Ollama GPU unload calls in
 destroy path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address PR #2651 review follow-up captured in #2717. `destroySandbox`
called `unloadOllamaModels()` from three places that all fired in the
common `nemoclaw <name> destroy` path:

  1. Direct call inside `destroySandbox`, gated on the ollama-provider check.
  2. `cleanupSandboxServices()`, with the same provider check.
  3. `stopAll()` (when `stopHostServices` was true), unconditional.

Drop the direct call inside `destroySandbox` — `cleanupSandboxServices`
is the single hook the issue recommends. Inside `cleanupSandboxServices`,
make the `stopAll`-side and the explicit-per-sandbox unload mutually
exclusive: `stopAll()` already unloads unconditionally, so when
`stopHostServices` is true the explicit branch is suppressed; otherwise
the per-sandbox branch runs. End result: exactly one `unloadOllamaModels()`
per `destroySandbox` invocation, regardless of whether host services are
also stopping.

Add a one-line explanatory comment in `services.ts` documenting why
`stopAll` unloads unconditionally without a provider gate (the
sandbox-agnostic stop paths don't have that context — issue #2717
item 4).

Add a doc-comment note above `unloadOllamaModels` in
`onboard-ollama-proxy.ts` flagging that
`test/ollama-gpu-cleanup.test.ts` imports it directly from `dist/`, so
renames or relocations need to be coordinated with the test (issue
#2717 item 2 — adapted from the original "test mirror" wording since
the test now imports from dist rather than duplicating the function).

Item 3 (sync unload) was explicitly low-priority in the issue and is
out of scope.

No new tests added: NemoClaw enforces a zero source-shape test budget
(`ci/source-shape-test-budget.json`), behavioural coverage of the dedup
would require either exporting the private `cleanupSandboxServices`
helper or threading dependency injection through `destroySandbox`. The
dedup is structural and the explanatory comments (in both touched
files) document the invariant clearly enough for code review.

Closes #2717

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 src/lib/actions/sandbox/destroy.ts | 23 +++++++++++++++--------
 src/lib/onboard-ollama-proxy.ts    |  4 ++++
 src/lib/services.ts                |  7 +++++++
 3 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/src/lib/actions/sandbox/destroy.ts b/src/lib/actions/sandbox/destroy.ts
index 79e27da053..b496aa28b5 100644
--- a/src/lib/actions/sandbox/destroy.ts
+++ b/src/lib/actions/sandbox/destroy.ts
@@ -84,14 +84,19 @@ function cleanupSandboxServices(
   { stopHostServices = false }: { stopHostServices?: boolean } = {},
 ): void {
   if (stopHostServices) {
+    // `stopAll()` already runs `unloadOllamaModels()` unconditionally —
+    // see src/lib/services.ts. Don't double-call here.
     const { stopAll } = require("../../services");
     stopAll({ sandboxName });
-  }
-
-  const sb = registry.getSandbox(sandboxName);
-  if (sb?.provider?.includes("ollama")) {
-    const { unloadOllamaModels } = require("../../onboard-ollama-proxy");
-    unloadOllamaModels();
+  } else {
+    // No global stop, so `stopAll()` did not run; explicitly free Ollama
+    // models for this sandbox if its provider used Ollama. Without this
+    // branch a single-sandbox destroy would leave models loaded on the GPU.
+    const sb = registry.getSandbox(sandboxName);
+    if (sb?.provider?.includes("ollama")) {
+      const { unloadOllamaModels } = require("../../onboard-ollama-proxy");
+      unloadOllamaModels();
+    }
   }
 
   try {
@@ -203,8 +208,10 @@ export async function destroySandbox(
   }
 
   if (sb?.provider?.includes("ollama")) {
-    const { unloadOllamaModels, killStaleProxy } = require("../../onboard-ollama-proxy");
-    unloadOllamaModels();
+    // The Ollama unload is delegated to cleanupSandboxServices() further
+    // down — keeping it here would triple-call (direct + cleanup branch +
+    // stopAll when host services are stopping). See issue #2717.
+    const { killStaleProxy } = require("../../onboard-ollama-proxy");
     killStaleProxy();
   }
 
diff --git a/src/lib/onboard-ollama-proxy.ts b/src/lib/onboard-ollama-proxy.ts
index 6d3c8438a8..0c2475e79d 100644
--- a/src/lib/onboard-ollama-proxy.ts
+++ b/src/lib/onboard-ollama-proxy.ts
@@ -631,6 +631,10 @@ async function prepareOllamaModel(model, installedModels = []) {
 /**
  * Unload all running Ollama models from GPU memory.
  * Best-effort operation: silently ignores errors if Ollama is not running.
+ *
+ * NOTE: `test/ollama-gpu-cleanup.test.ts` imports this function directly
+ * from `dist/lib/onboard-ollama-proxy.js`. Renaming or relocating it will
+ * break that test — keep both in sync (issue #2717).
  */
 function unloadOllamaModels() {
   try {
diff --git a/src/lib/services.ts b/src/lib/services.ts
index e6dd5088e1..6962e6e397 100644
--- a/src/lib/services.ts
+++ b/src/lib/services.ts
@@ -429,6 +429,13 @@ export function stopAll(opts: ServiceOptions = {}): void {
   }
 
   try {
+    // Unconditional unload (no provider gate) because `stopAll` is invoked
+    // from sandbox-agnostic paths (`nemoclaw stop`, `nemoclaw tunnel stop`,
+    // and the destroy hook when stopping host services) where the caller
+    // doesn't know whether the active sandbox uses Ollama. The unload
+    // itself is no-op when no models are loaded, so calling it on every
+    // global stop is cheap and avoids leaking GPU memory when the registry
+    // is empty (#2717).
     const { unloadOllamaModels } = require("./onboard-ollama-proxy");
     unloadOllamaModels();
   } catch {

From 8c836ea59677d2dbc6cc72c7a845c6c90baa6b4c Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Thu, 7 May 2026 13:00:26 +0000
Subject: [PATCH 2/4] refactor(sandbox): make Ollama GPU unload durable and
 update tests

Follow-up to the previous commit on this branch addressing #2717 item 3.

Replace `http.get` + `http.request` in `unloadOllamaModels()` with
`spawnSync("curl", ...)` so the request observably completes before the
call returns: a curl child process is owned by the kernel rather than
Node's event loop, so a subsequent `process.exit()` from
destroy/uninstall paths cannot drop a half-flushed socket. The pattern
matches `probeProxyToken` and `pullOllamaModelViaHttp` in the same file.
`--max-time 3` preserves the prior timeout. The now-unused `http`
require is removed.

Update tests to match the new I/O primitive:

- `test/ollama-gpu-cleanup.test.ts` is rewritten to mock
  `child_process.spawnSync` via the require-cache-bust pattern (matches
  `test/runner.test.ts`); adds a malformed-JSON case.
- `src/lib/services.test.ts` swaps the `http.get` spy for a `spawnSync`
  mock so the existing assertion that `stopAll` triggers the Ollama
  unload still holds.

Refs #2717

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 src/lib/onboard-ollama-proxy.ts |  88 +++++-------
 src/lib/services.test.ts        |  55 ++++++--
 test/ollama-gpu-cleanup.test.ts | 241 ++++++++++++++++----------------
 3 files changed, 204 insertions(+), 180 deletions(-)

diff --git a/src/lib/onboard-ollama-proxy.ts b/src/lib/onboard-ollama-proxy.ts
index 0c2475e79d..486b20ba24 100644
--- a/src/lib/onboard-ollama-proxy.ts
+++ b/src/lib/onboard-ollama-proxy.ts
@@ -9,7 +9,6 @@ const fs = require("fs");
 const os = require("os");
 const path = require("path");
 const { spawn, spawnSync } = require("child_process");
-const http = require("http");
 const { ROOT, SCRIPTS, run, runCapture, shellQuote } = require("./runner");
 const { OLLAMA_PORT, OLLAMA_PROXY_PORT } = require("./ports");
 const {
@@ -630,61 +629,48 @@ async function prepareOllamaModel(model, installedModels = []) {
 
 /**
  * Unload all running Ollama models from GPU memory.
- * Best-effort operation: silently ignores errors if Ollama is not running.
  *
- * NOTE: `test/ollama-gpu-cleanup.test.ts` imports this function directly
- * from `dist/lib/onboard-ollama-proxy.js`. Renaming or relocating it will
- * break that test — keep both in sync (issue #2717).
+ * Uses `spawnSync("curl", ...)` rather than node's `http` module so that the
+ * request observably completes before the call returns: a curl child process
+ * is owned by the kernel, not Node's event loop, so `process.exit()` from a
+ * destroy/uninstall path cannot drop a half-flushed socket. The pattern
+ * matches `probeProxyToken` and `pullOllamaModelViaHttp` in this file.
+ *
+ * Best-effort operation: silently ignores errors if Ollama is not running.
  */
 function unloadOllamaModels() {
   try {
-    const req = http.get(
-      {
-        hostname: "localhost",
-        port: OLLAMA_PORT,
-        path: "/api/ps",
-        timeout: 3000,
-      },
-      (res) => {
-        let data = "";
-        res.on("data", (chunk) => {
-          data += chunk;
-        });
-        res.on("end", () => {
-          if (res.statusCode !== 200) return;
-          try {
-            const parsed = JSON.parse(data);
-            const models = parsed.models || [];
-            for (const entry of models) {
-              if (!entry.name) continue;
-              const unloadReq = http.request(
-                {
-                  hostname: "localhost",
-                  port: OLLAMA_PORT,
-                  path: "/api/generate",
-                  method: "POST",
-                  timeout: 3000,
-                  headers: { "Content-Type": "application/json" },
-                },
-                () => {
-                  /* ignore response */
-                },
-              );
-              unloadReq.on("error", () => {
-                /* best-effort */
-              });
-              unloadReq.write(JSON.stringify({ model: entry.name, keep_alive: 0 }));
-              unloadReq.end();
-            }
-          } catch {
-            /* best-effort */
-          }
-        });
-      },
+    const psResult = spawnSync(
+      "curl",
+      ["-sS", "--max-time", "3", `http://localhost:${OLLAMA_PORT}/api/ps`],
+      { encoding: "utf8" },
     );
-    req.on("error", () => {
-      /* best-effort */
-    });
+    if (psResult.status !== 0) return;
+
+    const parsed = JSON.parse(psResult.stdout || "{}");
+    const models = Array.isArray(parsed.models) ? parsed.models : [];
+
+    for (const entry of models) {
+      if (!entry?.name) continue;
+      spawnSync(
+        "curl",
+        [
+          "-sS",
+          "-o",
+          "/dev/null",
+          "--max-time",
+          "3",
+          "-X",
+          "POST",
+          "-H",
+          "Content-Type: application/json",
+          "-d",
+          JSON.stringify({ model: entry.name, keep_alive: 0 }),
+          `http://localhost:${OLLAMA_PORT}/api/generate`,
+        ],
+        { encoding: "utf8" },
+      );
+    }
   } catch {
     /* best-effort */
   }
diff --git a/src/lib/services.test.ts b/src/lib/services.test.ts
index c4149014a3..4c37965de0 100644
--- a/src/lib/services.test.ts
+++ b/src/lib/services.test.ts
@@ -2,14 +2,23 @@
 // SPDX-License-Identifier: Apache-2.0
 
 import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
-import http from "node:http";
+import childProcess, { type SpawnSyncReturns } from "node:child_process";
 import { mkdtempSync, writeFileSync, existsSync, rmSync } from "node:fs";
-import { join } from "node:path";
+import { join, resolve } from "node:path";
 import { tmpdir } from "node:os";
 
 // Import from compiled dist/ so coverage is attributed correctly.
 import { getServiceStatuses, showStatus, stopAll } from "../../dist/lib/services";
 
+const ollamaProxyDistPath = resolve(
+  import.meta.dirname,
+  "..",
+  "..",
+  "dist",
+  "lib",
+  "onboard-ollama-proxy.js",
+);
+
 describe("getServiceStatuses", () => {
   let pidDir: string;
 
@@ -116,16 +125,41 @@ describe("showStatus", () => {
 
 describe("stopAll", () => {
   let pidDir: string;
-  let httpGetSpy: ReturnType<typeof vi.spyOn>;
+  let spawnSyncCalls: Array<{ command: string; args: readonly string[] }>;
+  let originalSpawnSync: typeof childProcess.spawnSync;
 
   beforeEach(() => {
     pidDir = mkdtempSync(join(tmpdir(), "nemoclaw-svc-test-"));
-    const request = { on: vi.fn(() => request) };
-    httpGetSpy = vi.spyOn(http, "get").mockImplementation((() => request) as any);
+    spawnSyncCalls = [];
+    originalSpawnSync = childProcess.spawnSync;
+    // @ts-expect-error — partial mock signature is intentional.
+    childProcess.spawnSync = (command: string, args: readonly string[]) => {
+      spawnSyncCalls.push({ command, args });
+      const reply: SpawnSyncReturns<string> = {
+        pid: 0,
+        output: ["", "", ""],
+        stdout: "",
+        stderr: "",
+        status: 0,
+        signal: null,
+      };
+      // Return an empty model list so the unload's for-loop is a no-op.
+      if (command === "curl" && args.some((a) => a.endsWith("/api/ps"))) {
+        reply.stdout = JSON.stringify({ models: [] });
+        reply.output = ["", reply.stdout, ""];
+      }
+      return reply;
+    };
+    // The dist `onboard-ollama-proxy` module destructures `spawnSync` at
+    // require time, so to make `stopAll` pick up the patched function we
+    // bust its cache. `services.ts` requires the proxy lazily, so the
+    // next call sees the freshly-loaded module.
+    delete require.cache[require.resolve(ollamaProxyDistPath)];
   });
 
   afterEach(() => {
-    httpGetSpy.mockRestore();
+    childProcess.spawnSync = originalSpawnSync;
+    delete require.cache[require.resolve(ollamaProxyDistPath)];
     rmSync(pidDir, { recursive: true, force: true });
   });
 
@@ -159,9 +193,12 @@ describe("stopAll", () => {
     stopAll({ pidDir });
     logSpy.mockRestore();
 
-    expect(httpGetSpy).toHaveBeenCalledWith(
-      expect.objectContaining({ hostname: "localhost", port: 11434, path: "/api/ps" }),
-      expect.any(Function),
+    const psCall = spawnSyncCalls.find(
+      (c) =>
+        c.command === "curl" &&
+        c.args.some((a) => a.endsWith("/api/ps")),
     );
+    expect(psCall).toBeDefined();
+    expect(psCall?.args).toContain("--max-time");
   });
 });
diff --git a/test/ollama-gpu-cleanup.test.ts b/test/ollama-gpu-cleanup.test.ts
index 22671a30ef..a69031df3b 100644
--- a/test/ollama-gpu-cleanup.test.ts
+++ b/test/ollama-gpu-cleanup.test.ts
@@ -1,135 +1,136 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-import { describe, expect, it, vi } from "vitest";
-import http from "node:http";
-
-import { unloadOllamaModels } from "../dist/lib/onboard-ollama-proxy.js";
+import childProcess, { type SpawnSyncReturns } from "node:child_process";
+import path from "node:path";
+import { describe, expect, it } from "vitest";
+
+const modulePath = path.join(
+  import.meta.dirname,
+  "..",
+  "dist",
+  "lib",
+  "onboard-ollama-proxy.js",
+);
+
+type SpawnCall = { command: string; args: readonly string[] };
+
+function ok(stdout = ""): SpawnSyncReturns<string> {
+  return {
+    pid: 0,
+    output: ["", stdout, ""],
+    stdout,
+    stderr: "",
+    status: 0,
+    signal: null,
+  };
+}
+
+function fail(stderr = "couldn't connect"): SpawnSyncReturns<string> {
+  return {
+    pid: 0,
+    output: ["", "", stderr],
+    stdout: "",
+    stderr,
+    status: 7,
+    signal: null,
+  };
+}
+
+function withMockedSpawnSync<T>(
+  responder: (call: SpawnCall) => SpawnSyncReturns<string>,
+  fn: (calls: SpawnCall[]) => T,
+): T {
+  const calls: SpawnCall[] = [];
+  const original = childProcess.spawnSync;
+  // @ts-expect-error — partial mock signature is intentional.
+  childProcess.spawnSync = (command: string, args: readonly string[]) => {
+    const call = { command, args };
+    calls.push(call);
+    return responder(call);
+  };
+  try {
+    delete require.cache[require.resolve(modulePath)];
+    return fn(calls);
+  } finally {
+    childProcess.spawnSync = original;
+    delete require.cache[require.resolve(modulePath)];
+  }
+}
 
 describe("Ollama GPU cleanup", () => {
-  it("unloads all running Ollama models via the production HTTP implementation", async () => {
-    const mockModels = {
-      models: [{ name: "llama3.1:8b" }, { name: "qwen:7b" }],
-    };
-
-    const mockResponse = {
-      statusCode: 200,
-      on: vi.fn((event, handler) => {
-        if (event === "data") {
-          handler(JSON.stringify(mockModels));
-        } else if (event === "end") {
-          handler();
+  it("calls curl synchronously to unload every running model via /api/generate", () => {
+    withMockedSpawnSync(
+      ({ args }) => {
+        if (args.some((a) => a.endsWith("/api/ps"))) {
+          return ok(
+            JSON.stringify({ models: [{ name: "llama3.1:8b" }, { name: "qwen:7b" }] }),
+          );
         }
-        return mockResponse;
-      }),
-    };
-
-    const mockGetRequest = {
-      on: vi.fn(() => mockGetRequest),
-    };
-
-    const mockUnloadRequests: Array<{
-      on: ReturnType<typeof vi.fn>;
-      write: ReturnType<typeof vi.fn>;
-      end: ReturnType<typeof vi.fn>;
-    }> = [];
-
-    const httpGetSpy = vi.spyOn(http, "get").mockImplementation(((options: any, callback: any) => {
-      expect(options.hostname).toBe("localhost");
-      expect(options.port).toBe(11434);
-      expect(options.path).toBe("/api/ps");
-      callback(mockResponse);
-      return mockGetRequest;
-    }) as any);
-
-    const httpRequestSpy = vi.spyOn(http, "request").mockImplementation(((
-      options: any,
-      callback: any,
-    ) => {
-      expect(options.hostname).toBe("localhost");
-      expect(options.port).toBe(11434);
-      expect(options.path).toBe("/api/generate");
-      expect(options.method).toBe("POST");
-      expect(options.headers["Content-Type"]).toBe("application/json");
-      const req = {
-        on: vi.fn(() => req),
-        write: vi.fn(),
-        end: vi.fn(),
-      };
-      mockUnloadRequests.push(req);
-      callback();
-      return req;
-    }) as any);
-
-    unloadOllamaModels();
-
-    expect(httpGetSpy).toHaveBeenCalledTimes(1);
-
-    await new Promise((resolve) => setTimeout(resolve, 50));
-
-    expect(httpRequestSpy).toHaveBeenCalledTimes(2);
-    expect(mockUnloadRequests.map((req) => req.write.mock.calls[0]?.[0])).toEqual([
-      JSON.stringify({ model: "llama3.1:8b", keep_alive: 0 }),
-      JSON.stringify({ model: "qwen:7b", keep_alive: 0 }),
-    ]);
-    expect(mockUnloadRequests.every((req) => req.end.mock.calls.length === 1)).toBe(true);
+        return ok();
+      },
+      (calls) => {
+        const { unloadOllamaModels } = require(modulePath);
+        unloadOllamaModels();
+
+        expect(calls).toHaveLength(3);
+
+        expect(calls[0].command).toBe("curl");
+        expect(calls[0].args).toContain("--max-time");
+        expect(calls[0].args[calls[0].args.length - 1]).toMatch(/\/api\/ps$/);
+
+        expect(calls[1].command).toBe("curl");
+        expect(calls[1].args).toContain("-X");
+        expect(calls[1].args).toContain("POST");
+        expect(calls[1].args).toContain(JSON.stringify({ model: "llama3.1:8b", keep_alive: 0 }));
+        expect(calls[1].args[calls[1].args.length - 1]).toMatch(/\/api\/generate$/);
+
+        expect(calls[2].args).toContain(JSON.stringify({ model: "qwen:7b", keep_alive: 0 }));
+      },
+    );
+  });
 
-    httpGetSpy.mockRestore();
-    httpRequestSpy.mockRestore();
+  it("returns silently when /api/ps fails (Ollama not running)", () => {
+    withMockedSpawnSync(
+      () => fail(),
+      (calls) => {
+        const { unloadOllamaModels } = require(modulePath);
+        expect(() => unloadOllamaModels()).not.toThrow();
+        expect(calls).toHaveLength(1);
+        expect(calls[0].args[calls[0].args.length - 1]).toMatch(/\/api\/ps$/);
+      },
+    );
   });
 
-  it("handles errors gracefully when Ollama is not running", () => {
-    const mockGetRequest = {
-      on: vi.fn((event, handler) => {
-        if (event === "error") {
-          handler(new Error("Connection refused"));
+  it("does not unload anything when Ollama reports no loaded models", () => {
+    withMockedSpawnSync(
+      ({ args }) => {
+        if (args.some((a) => a.endsWith("/api/ps"))) {
+          return ok(JSON.stringify({ models: [] }));
         }
-        return mockGetRequest;
-      }),
-    };
-
-    const httpGetSpy = vi.spyOn(http, "get").mockImplementation((() => mockGetRequest) as any);
-
-    expect(() => unloadOllamaModels()).not.toThrow();
-    expect(httpGetSpy).toHaveBeenCalledTimes(1);
-
-    httpGetSpy.mockRestore();
+        return ok();
+      },
+      (calls) => {
+        const { unloadOllamaModels } = require(modulePath);
+        unloadOllamaModels();
+        expect(calls).toHaveLength(1);
+      },
+    );
   });
 
-  it("does not unload anything when Ollama reports no loaded models", async () => {
-    const mockModels = { models: [] };
-
-    const mockResponse = {
-      statusCode: 200,
-      on: vi.fn((event, handler) => {
-        if (event === "data") {
-          handler(JSON.stringify(mockModels));
-        } else if (event === "end") {
-          handler();
+  it("ignores malformed JSON from /api/ps without throwing", () => {
+    withMockedSpawnSync(
+      ({ args }) => {
+        if (args.some((a) => a.endsWith("/api/ps"))) {
+          return ok("not-json");
         }
-        return mockResponse;
-      }),
-    };
-
-    const mockGetRequest = {
-      on: vi.fn(() => mockGetRequest),
-    };
-
-    const httpGetSpy = vi.spyOn(http, "get").mockImplementation(((_options: any, callback: any) => {
-      callback(mockResponse);
-      return mockGetRequest;
-    }) as any);
-
-    const httpRequestSpy = vi.spyOn(http, "request");
-
-    unloadOllamaModels();
-
-    await new Promise((resolve) => setTimeout(resolve, 50));
-
-    expect(httpGetSpy).toHaveBeenCalledTimes(1);
-    expect(httpRequestSpy).not.toHaveBeenCalled();
-
-    httpGetSpy.mockRestore();
-    httpRequestSpy.mockRestore();
+        return ok();
+      },
+      (calls) => {
+        const { unloadOllamaModels } = require(modulePath);
+        expect(() => unloadOllamaModels()).not.toThrow();
+        expect(calls).toHaveLength(1);
+      },
+    );
   });
 });

From 3ddf28a3ea473f017a521c486e0f193f62e1e03d Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Thu, 7 May 2026 13:34:49 +0000
Subject: [PATCH 3/4] refactor(sandbox): drop rationale comments added on this
 branch

Remove the explanatory comments added by earlier commits on this branch:
the destroy.ts dedup hint, the `unloadOllamaModels` JSDoc paragraph in
onboard-ollama-proxy.ts, and the `stopAll()` unconditional-unload
comment in services.ts. The structural code reads cleanly without the
prose.

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 src/lib/actions/sandbox/destroy.ts | 3 ---
 src/lib/onboard-ollama-proxy.ts    | 7 -------
 src/lib/services.ts                | 7 -------
 3 files changed, 17 deletions(-)

diff --git a/src/lib/actions/sandbox/destroy.ts b/src/lib/actions/sandbox/destroy.ts
index b496aa28b5..f2375d7c9a 100644
--- a/src/lib/actions/sandbox/destroy.ts
+++ b/src/lib/actions/sandbox/destroy.ts
@@ -208,9 +208,6 @@ export async function destroySandbox(
   }
 
   if (sb?.provider?.includes("ollama")) {
-    // The Ollama unload is delegated to cleanupSandboxServices() further
-    // down — keeping it here would triple-call (direct + cleanup branch +
-    // stopAll when host services are stopping). See issue #2717.
     const { killStaleProxy } = require("../../onboard-ollama-proxy");
     killStaleProxy();
   }
diff --git a/src/lib/onboard-ollama-proxy.ts b/src/lib/onboard-ollama-proxy.ts
index 486b20ba24..d30619a760 100644
--- a/src/lib/onboard-ollama-proxy.ts
+++ b/src/lib/onboard-ollama-proxy.ts
@@ -629,13 +629,6 @@ async function prepareOllamaModel(model, installedModels = []) {
 
 /**
  * Unload all running Ollama models from GPU memory.
- *
- * Uses `spawnSync("curl", ...)` rather than node's `http` module so that the
- * request observably completes before the call returns: a curl child process
- * is owned by the kernel, not Node's event loop, so `process.exit()` from a
- * destroy/uninstall path cannot drop a half-flushed socket. The pattern
- * matches `probeProxyToken` and `pullOllamaModelViaHttp` in this file.
- *
  * Best-effort operation: silently ignores errors if Ollama is not running.
  */
 function unloadOllamaModels() {
diff --git a/src/lib/services.ts b/src/lib/services.ts
index 6962e6e397..e6dd5088e1 100644
--- a/src/lib/services.ts
+++ b/src/lib/services.ts
@@ -429,13 +429,6 @@ export function stopAll(opts: ServiceOptions = {}): void {
   }
 
   try {
-    // Unconditional unload (no provider gate) because `stopAll` is invoked
-    // from sandbox-agnostic paths (`nemoclaw stop`, `nemoclaw tunnel stop`,
-    // and the destroy hook when stopping host services) where the caller
-    // doesn't know whether the active sandbox uses Ollama. The unload
-    // itself is no-op when no models are loaded, so calling it on every
-    // global stop is cheap and avoids leaking GPU memory when the registry
-    // is empty (#2717).
     const { unloadOllamaModels } = require("./onboard-ollama-proxy");
     unloadOllamaModels();
   } catch {

From 18bf317c00807b93fbbadb0862849e32c18b9244 Mon Sep 17 00:00:00 2001
From: Tinson Lai <tinsonl@nvidia.com>
Date: Fri, 8 May 2026 02:12:44 +0000
Subject: [PATCH 4/4] docs(sandbox): explain durable Ollama unload and
 per-sandbox proxy guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a rationale comment to `unloadOllamaModels` noting why the unload
uses synchronous `spawnSync`/`curl --max-time 3` rather than async
http.* — the previous fire-and-forget version was dropped by Node's
event loop on fast CLI exit, leaving GPU memory reserved. A future
maintainer reverting to async HTTP would reintroduce that race.

In `destroySandbox`, clarify that the `sb?.provider?.includes("ollama")`
guard scopes only `killStaleProxy()` because the auth proxy is
per-sandbox, while GPU unload is already handled by
`cleanupSandboxServices`.

Per #3173 review feedback.

Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
---
 src/lib/actions/sandbox/destroy.ts | 5 +++++
 src/lib/onboard-ollama-proxy.ts    | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/src/lib/actions/sandbox/destroy.ts b/src/lib/actions/sandbox/destroy.ts
index d553915cb7..55bef9fea9 100644
--- a/src/lib/actions/sandbox/destroy.ts
+++ b/src/lib/actions/sandbox/destroy.ts
@@ -207,6 +207,11 @@ export async function destroySandbox(
     nim.stopNimContainer(sandboxName, { silent: true });
   }
 
+  // The Ollama auth proxy is per-sandbox and only spawned when the provider
+  // is Ollama, so this guard scopes only `killStaleProxy()`. GPU unload is
+  // handled separately by `cleanupSandboxServices` above (which routes
+  // through `stopAll()` or directly into `unloadOllamaModels()` based on
+  // whether host services are being torn down).
   if (sb?.provider?.includes("ollama")) {
     const { killStaleProxy } = require("../../onboard-ollama-proxy");
     killStaleProxy();
diff --git a/src/lib/onboard-ollama-proxy.ts b/src/lib/onboard-ollama-proxy.ts
index bf32a539b9..bf5ecf2a3a 100644
--- a/src/lib/onboard-ollama-proxy.ts
+++ b/src/lib/onboard-ollama-proxy.ts
@@ -630,6 +630,13 @@ async function prepareOllamaModel(model, installedModels = []) {
 /**
  * Unload all running Ollama models from GPU memory.
  * Best-effort operation: silently ignores errors if Ollama is not running.
+ *
+ * Uses `spawnSync` with `curl --max-time 3` rather than Node's
+ * `http.request`/`http.get` so the unload completes before
+ * `process.exit()`. The previous async version was fire-and-forget and got
+ * dropped by Node's event loop on fast CLI exit (e.g. `nemoclaw destroy`),
+ * leaving GPU memory reserved. Reverting to async HTTP would reintroduce
+ * that race; keep it synchronous.
  */
 function unloadOllamaModels() {
   try {