diff --git a/src/lib/actions/sandbox/snapshot.ts b/src/lib/actions/sandbox/snapshot.ts index 8a12254087..0c94d8e54d 100644 --- a/src/lib/actions/sandbox/snapshot.ts +++ b/src/lib/actions/sandbox/snapshot.ts @@ -216,6 +216,10 @@ async function autoCreateSandboxFromSource( // dst has its own lifecycle; don't inherit src's local NIM container // reference, or destroying dst would stop src's NIM. nimContainer: null, + // No CUDA proof has run for dst (this auto-create path passes no GPU flags), + // so clear src's proof rather than inheriting it — otherwise dst could show + // `Sandbox GPU: enabled (CUDA verified)` based on another sandbox's run (#4231). + sandboxGpuProof: null, }); console.log(` ${G}\u2713${R} Sandbox '${dstName}' created`); diff --git a/src/lib/actions/sandbox/status-snapshot.ts b/src/lib/actions/sandbox/status-snapshot.ts index 6073fc5237..5ad989d37e 100644 --- a/src/lib/actions/sandbox/status-snapshot.ts +++ b/src/lib/actions/sandbox/status-snapshot.ts @@ -26,8 +26,8 @@ import { import { probeSandboxInferenceGatewayHealth } from "./process-recovery"; import { getSandboxStatusPreflight, - withoutTerminalPhasePreflight, type SandboxStatusFailureLayer, + withoutTerminalPhasePreflight, } from "./status-preflight"; type ProbeProviderHealth = ( @@ -84,6 +84,9 @@ export interface SandboxStatusReport { sandboxGpuEnabled: boolean; sandboxGpuMode: string | null; sandboxGpuDevice: string | null; + // Last recorded CUDA-usability proof so `status` can distinguish a configured + // GPU from a proven-usable one instead of reporting any GPU as healthy (#4231). 
+ sandboxGpuProof: registry.SandboxGpuProofResult | null; openshellDriver: string; openshellVersion: string; policies: string[]; @@ -222,6 +225,7 @@ export async function getSandboxStatusReport( sandboxGpuEnabled, sandboxGpuMode: (sb && sb.sandboxGpuMode) || null, sandboxGpuDevice: (sb && sb.sandboxGpuDevice) || null, + sandboxGpuProof: (sb && sb.sandboxGpuProof) || null, openshellDriver: (sb && sb.openshellDriver) || "unknown", openshellVersion: (sb && sb.openshellVersion) || "unknown", policies, diff --git a/src/lib/actions/sandbox/status.test.ts b/src/lib/actions/sandbox/status.test.ts index 9722fc0399..cd5eba214c 100644 --- a/src/lib/actions/sandbox/status.test.ts +++ b/src/lib/actions/sandbox/status.test.ts @@ -2,15 +2,16 @@ // SPDX-License-Identifier: Apache-2.0 import { describe, expect, it } from "vitest"; - -import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health"; import { classifySandboxContainerFailureForStatus, classifySandboxStatusPreflightFailure, getSandboxStatusInferenceHealth, isDockerDaemonUnreachableForStatus, maybeGetSandboxStatusInferenceHealth, + sandboxGpuProofStatusSuffix, + sandboxGpuProofUnverified, } from "../../../../dist/lib/actions/sandbox/status"; +import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health"; describe("sandbox status inference health", () => { it("passes the current model with the current provider", () => { @@ -276,3 +277,38 @@ describe("classifySandboxStatusPreflightFailure", () => { expect(result).toBeNull(); }); }); + +describe("sandbox GPU proof status rendering (#4231)", () => { + it("does not call an unproven GPU healthy", () => { + expect(sandboxGpuProofUnverified(null)).toBe(true); + expect(sandboxGpuProofUnverified(undefined)).toBe(true); + expect( + sandboxGpuProofUnverified({ status: "unverified", cudaVerified: false, at: "t" }), + ).toBe(true); + expect( + sandboxGpuProofUnverified({ status: "verified", cudaVerified: true, at: "t" }), + 
).toBe(false); + expect( + sandboxGpuProofUnverified({ status: "failed", cudaVerified: false, at: "t" }), + ).toBe(false); + }); + + it("renders verified / unverified / failed suffixes distinctly", () => { + expect( + sandboxGpuProofStatusSuffix({ status: "verified", cudaVerified: true, at: "t" }), + ).toContain("CUDA verified"); + // No recorded proof (older entries) must not read as healthy. + expect(sandboxGpuProofStatusSuffix(null)).toContain("CUDA unverified"); + expect( + sandboxGpuProofStatusSuffix({ status: "unverified", cudaVerified: false, at: "t" }), + ).toContain("CUDA unverified"); + const failed = sandboxGpuProofStatusSuffix({ + status: "failed", + cudaVerified: false, + label: "cuInit(0)", + at: "t", + }); + expect(failed).toContain("last CUDA proof failed"); + expect(failed).toContain("cuInit(0)"); + }); +}); diff --git a/src/lib/actions/sandbox/status.ts b/src/lib/actions/sandbox/status.ts index 99667784ed..068bde5c36 100644 --- a/src/lib/actions/sandbox/status.ts +++ b/src/lib/actions/sandbox/status.ts @@ -12,6 +12,7 @@ import * as nim from "../../inference/nim"; import * as sandboxVersion from "../../sandbox/version"; import * as shields from "../../shields"; import { isTerminalSandboxPhase, parseSandboxPhase } from "../../state/gateway"; +import type { SandboxGpuProofResult } from "../../state/registry"; import * as registry from "../../state/registry"; import { createSystemDeps as createSessionDeps, @@ -28,26 +29,26 @@ import { printWrongGatewayActiveGuidance, } from "./gateway-state"; import { isSandboxGatewayRunningForStatus } from "./process-recovery"; -import { collectSandboxStatusSnapshot } from "./status-snapshot"; import { getSandboxStatusPreflight, printGatewayFailureLayerHeader, printSandboxStatusPreflightHeader, withoutTerminalPhasePreflight, } from "./status-preflight"; +import { collectSandboxStatusSnapshot } from "./status-snapshot"; export { + type ClassifySandboxStatusPreflightFailureDeps, 
classifySandboxContainerFailureForStatus, classifySandboxStatusPreflightFailure, - isDockerDaemonUnreachableForStatus, getSandboxStatusPreflight, + isDockerDaemonUnreachableForStatus, printGatewayFailureLayerHeader, printSandboxStatusPreflightHeader, - withoutTerminalPhasePreflight, - type ClassifySandboxStatusPreflightFailureDeps, type SandboxStatusFailureLayer, type SandboxStatusPreflightFailure, type SandboxStatusPreflightResult, + withoutTerminalPhasePreflight, } from "./status-preflight"; export { collectSandboxStatusSnapshot, @@ -68,6 +69,29 @@ function shouldProbeSandboxRuntimeVersion( return lookup.state === "present" && Boolean(sandbox.agentVersion); } +// True when sandbox GPU is enabled but no CUDA-usability proof has confirmed it +// (older entries with no recorded proof, or a run whose CUDA proof could not +// execute). Treated as not-yet-proven rather than healthy (#4231). +export function sandboxGpuProofUnverified( + proof: SandboxGpuProofResult | null | undefined, +): boolean { + return !proof || proof.status === "unverified"; +} + +// Render the proof-state suffix appended to the `Sandbox GPU: enabled` line so +// the status reflects verified/unverified/failed CUDA usability instead of +// reporting any configured GPU as healthy (#4231). +export function sandboxGpuProofStatusSuffix( + proof: SandboxGpuProofResult | null | undefined, +): string { + if (proof?.status === "verified") return ` ${G}(CUDA verified)${R}`; + if (proof?.status === "failed") { + const label = proof.label ? `: ${proof.label}` : ""; + return ` ${RD}(last CUDA proof failed${label})${R}`; + } + return ` ${YW}(CUDA unverified)${R}`; +} + /** * Render one Inference status line. The main probe and each subprobe go * through this helper so multi-hop providers (e.g. ollama-local backend + @@ -183,10 +207,26 @@ export async function showSandboxStatus(sandboxName: string): Promise { const sandboxGpu = sandboxGpuEnabled ? 
"enabled" : "disabled"; const sandboxGpuMode = sb.sandboxGpuMode ? ` (${sb.sandboxGpuMode})` : ""; const sandboxGpuDevice = sb.sandboxGpuDevice ? ` device=${sb.sandboxGpuDevice}` : ""; + const sandboxGpuProofSuffix = sandboxGpuEnabled + ? sandboxGpuProofStatusSuffix(sb.sandboxGpuProof) + : ""; const openshellDriver = sb.openshellDriver || "unknown"; const openshellVersion = sb.openshellVersion || "unknown"; console.log(` Host GPU: ${hostGpu}`); - console.log(` Sandbox GPU: ${sandboxGpu}${sandboxGpuMode}${sandboxGpuDevice}`); + console.log( + ` Sandbox GPU: ${sandboxGpu}${sandboxGpuMode}${sandboxGpuDevice}${sandboxGpuProofSuffix}`, + ); + if (sandboxGpuEnabled && sb.sandboxGpuProof?.status === "failed") { + const detail = sb.sandboxGpuProof.detail; + if (detail) console.log(` ${detail}`); + console.log( + " CUDA failed a live proof. Recreate with corrected GPU device/group access, or rerun onboard with --no-gpu.", + ); + } else if (sandboxGpuEnabled && sandboxGpuProofUnverified(sb.sandboxGpuProof)) { + console.log( + " CUDA usability has not been proven. Rerun onboard to verify, or use --no-gpu for CPU.", + ); + } console.log(` OpenShell: ${openshellVersion} (${openshellDriver})`); console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); diff --git a/src/lib/inference/gpu-trust.ts b/src/lib/inference/gpu-trust.ts index 7730e88a92..585282363f 100644 --- a/src/lib/inference/gpu-trust.ts +++ b/src/lib/inference/gpu-trust.ts @@ -25,6 +25,27 @@ export function isDenylistedNvidiaGpuName(name: string): boolean { return NVIDIA_GPU_NAME_DENYLIST_PATTERN.test(name); } +// Result of a bounded Docker `--gpus` CUDA proof. `passed` is true only when a +// real CUDA workload (not just nvidia-smi) succeeded — that is the signal that +// distinguishes a genuine Windows-ARM N1X + WSL2 + Docker Desktop GPU (#4565) +// from the Windows-on-ARM Snapdragon nvidia-smi shim (#3988/#4424), which has +// no usable NVIDIA device and so cannot pass the workload. 
+export interface DockerGpuProofResult { + passed: boolean; + timedOut: boolean; + exitCode: number | null; + diagnostic: string; +} + +// Optional accept-path used by `detectGpu()` when an ARM64 Linux host reports a +// denylisted `JMJWOA-Generic-*` placeholder. The prover returns `null` when the +// host is not a proof candidate (not ARM64 WSL Docker Desktop), preserving the +// #3988 fail-closed default; otherwise it returns the bounded Docker GPU proof +// outcome so a passing real GPU can be trusted without trusting the name alone. +export type Arm64WslDockerDesktopGpuProver = ( + gpuNames: string[], +) => DockerGpuProofResult | null; + export function isPlausibleNvidiaGpuName(name: string): boolean { return !!name && !isDenylistedNvidiaGpuName(name) && NVIDIA_GPU_NAME_PATTERN.test(name); } diff --git a/src/lib/inference/nim.test.ts b/src/lib/inference/nim.test.ts index 97fe476940..0816ebb964 100644 --- a/src/lib/inference/nim.test.ts +++ b/src/lib/inference/nim.test.ts @@ -453,6 +453,99 @@ describe("nim", () => { } }); + // #4565: a real Windows-ARM N1X + WSL2 + Docker Desktop host reports the + // same `JMJWOA-Generic-*` placeholder as the Snapdragon shim, but it can + // pass a bounded Docker `--gpus` CUDA proof. When the injected prover + // confirms the proof, the denylisted name is accepted and the detection is + // tagged `wslDockerDesktopGpuProofPassed` as a diagnostic marker of that proof. 
+ it("accepts a denylisted ARM64 GPU when the bounded Docker GPU proof passes (#4565)", () => { + const runCapture = vi.fn((cmd: string | string[]) => { + if (!Array.isArray(cmd)) throw new Error("expected argv array"); + if (cmd[0] === "nvidia-smi" && cmd.some((a: string) => a.includes("name,memory.total"))) { + return "JMJWOA-Generic-GPU, 65471, 65000\n"; + } + return ""; + }); + const { nimModule, restore } = loadNimWithMockedRunner(runCapture); + const proveArm64WslDockerDesktopGpu = vi.fn(() => ({ + passed: true, + timedOut: false, + exitCode: 0, + diagnostic: "", + })); + + try { + withFirmwareModel("Microsoft Corporation Virtual Machine", () => { + const result = nimModule.detectGpu({ proveArm64WslDockerDesktopGpu }); + expect(result).toMatchObject({ + type: "nvidia", + name: "JMJWOA-Generic-GPU", + count: 1, + totalMemoryMB: 65471, + wslDockerDesktopGpuProofPassed: true, + }); + expect(proveArm64WslDockerDesktopGpu).toHaveBeenCalledWith(["JMJWOA-Generic-GPU"]); + }); + } finally { + restore(); + } + }); + + // Snapdragon WoA fail-closed: the same placeholder name, but the bounded + // CUDA proof fails because there is no usable NVIDIA device. The detection + // must stay null so #3988/#4424 is not reopened. 
+ it("keeps rejecting a denylisted ARM64 GPU when the Docker GPU proof fails (#4565/#3988)", () => { + const runCapture = vi.fn((cmd: string | string[]) => { + if (!Array.isArray(cmd)) throw new Error("expected argv array"); + if (cmd[0] === "nvidia-smi" && cmd.some((a: string) => a.includes("name,memory.total"))) { + return "JMJWOA-Generic-GPU, 65471, 65000\n"; + } + return ""; + }); + const { nimModule, restore } = loadNimWithMockedRunner(runCapture); + const failingProver = vi.fn(() => ({ + passed: false, + timedOut: false, + exitCode: 1, + diagnostic: "no CUDA-capable device is detected", + })); + const notCandidateProver = vi.fn(() => null); + + try { + withFirmwareModel("Microsoft Corporation Virtual Machine", () => { + expect(nimModule.detectGpu({ proveArm64WslDockerDesktopGpu: failingProver })).toBeNull(); + // A host that is not an ARM64 WSL Docker Desktop candidate returns + // null from the prover and must also fail closed (no proof attempted). + expect( + nimModule.detectGpu({ proveArm64WslDockerDesktopGpu: notCandidateProver }), + ).toBeNull(); + }); + } finally { + restore(); + } + }); + + // When no prover is wired (deps explicitly null), the denylist stays + // fail-closed exactly as before the #4565 accept-path existed. 
+ it("rejects a denylisted ARM64 GPU when no Docker GPU prover is provided", () => { + const runCapture = vi.fn((cmd: string | string[]) => { + if (!Array.isArray(cmd)) throw new Error("expected argv array"); + if (cmd[0] === "nvidia-smi" && cmd.some((a: string) => a.includes("name,memory.total"))) { + return "JMJWOA-Generic-GPU, 65471, 65000\n"; + } + return ""; + }); + const { nimModule, restore } = loadNimWithMockedRunner(runCapture); + + try { + withFirmwareModel("Microsoft Corporation Virtual Machine", () => { + expect(nimModule.detectGpu({ proveArm64WslDockerDesktopGpu: null })).toBeNull(); + }); + } finally { + restore(); + } + }); + // Trust-tier gate: on ARM64 Linux with generic firmware, the absence of // `/proc/driver/nvidia/` is the Windows-on-ARM WSL shim profile and must // be rejected even when the nvidia-smi probe returns a plausible-looking diff --git a/src/lib/inference/nim.ts b/src/lib/inference/nim.ts index 49aa289200..c71ba90efb 100644 --- a/src/lib/inference/nim.ts +++ b/src/lib/inference/nim.ts @@ -21,6 +21,7 @@ const nimImages = require("../../../bin/lib/nim-images.json"); import { VLLM_PORT } from "../core/ports"; import { + type Arm64WslDockerDesktopGpuProver, isDenylistedNvidiaGpuName, isPlausibleNvidiaGpuName, nvidiaHostLooksGenuine, @@ -70,6 +71,44 @@ export interface GpuDetection { unifiedMemory?: boolean; spark?: boolean; platform?: NvidiaPlatform; + // Set when a denylisted `JMJWOA-Generic-*` placeholder name was accepted only + // because a bounded Docker `--gpus` CUDA proof passed (Windows-ARM N1X + WSL2 + // + Docker Desktop, #4565). Diagnostic marker that this detection cleared a + // live proof rather than firmware/name trust. The sandbox GPU preflight still + // reaches the Docker Desktop WSL compatibility branch via its own + // `detectWslDockerDesktopStatus()` check (consistent because the proof itself + // requires Docker Desktop WSL); this flag does not gate that branch. 
+ wslDockerDesktopGpuProofPassed?: boolean; +} + +export interface DetectGpuDeps { + // Optional accept-path for ARM64 WSL Docker Desktop `JMJWOA-Generic-*` GPUs + // (#4565). Injected in tests; in production `detectGpu()` lazily builds the + // default prover from the onboard WSL Docker Desktop module only when it is + // about to reject a denylisted ARM64 name. + proveArm64WslDockerDesktopGpu?: Arm64WslDockerDesktopGpuProver | null; +} + +// Lazily construct the default ARM64 WSL Docker Desktop GPU prover. Kept lazy +// (and behind a require) so the inference layer does not statically depend on +// the onboard layer, and so the bounded Docker proof is only wired when we +// actually reach the denylist-reject path on an ARM64 host. +function defaultArm64WslDockerDesktopGpuProver(): Arm64WslDockerDesktopGpuProver | null { + try { + return require("../onboard/wsl-docker-desktop-gpu").createArm64WslDockerDesktopGpuProver(); + } catch (error) { + // Only the optional module-resolution case should degrade to "no prover"; + // a real bug inside the prover module must bubble up rather than masquerade + // as a missing GPU on an otherwise-supported N1X host. + if ( + error && + typeof error === "object" && + (error as NodeJS.ErrnoException).code === "MODULE_NOT_FOUND" + ) { + return null; + } + throw error; + } } // Group GPUs by their nvidia-smi model name, preserving first-appearance order. 
@@ -314,7 +353,7 @@ export function canRunNimWithMemory(totalMemoryMB: number): boolean { return nimImages.models.some((m: NimModel) => m.minGpuMemoryMB <= totalMemoryMB); } -export function detectGpu(): GpuDetection | null { +export function detectGpu(deps: DetectGpuDeps = {}): GpuDetection | null { // Try NVIDIA first — query name, total, and free VRAM in a single call so // the preflight line can show the GPU model alongside the memory size and // the bootstrap-model selector can pick a model that fits currently @@ -356,20 +395,43 @@ export function detectGpu(): GpuDetection | null { // Off Spark/Station/Jetson firmware, layer a denylist check and the // trust-tier gate before trusting the nvidia-smi probe. The observed // Windows-on-ARM WSL2 nvidia-smi shim emits a `JMJWOA-Generic-*` - // placeholder name AND ships no `/proc/driver/nvidia/` directory, so - // either signal alone is sufficient to reject. Treat any denylisted - // row as a poisoned probe and reject the whole result — partial - // filtering would let a mixed-row spoof surface a non-placeholder - // row as a real GPU. + // placeholder name AND ships no `/proc/driver/nvidia/` directory. A + // denylisted row still fails closed by default; the only escape is a + // bounded Docker `--gpus` CUDA proof (#4565), which the Snapdragon shim + // cannot pass. Without that proof, any denylisted row rejects the whole + // probe — partial filtering would let a mixed-row spoof surface a + // non-placeholder row as a real GPU. const firmwareConfirmsNvidia = platform === "spark" || platform === "station" || platform === "jetson"; let trusted: ParsedGpu[]; + let wslDockerDesktopGpuProofPassed = false; if (firmwareConfirmsNvidia) { trusted = parsed; - } else { - if (parsed.some((p: ParsedGpu) => isDenylistedNvidiaGpuName(p.name))) { + } else if (parsed.some((p: ParsedGpu) => isDenylistedNvidiaGpuName(p.name))) { + // A denylisted `JMJWOA-Generic-*` placeholder. 
Both real Windows-ARM + // N1X (WSL2 + Docker Desktop) and the Snapdragon nvidia-smi shim emit + // this name, so the name and `/proc/driver/nvidia` are insufficient. + // Give the host one bounded Docker `--gpus` CUDA proof: only the real + // GPU can run the workload, so a pass safely accepts N1X while the + // shim keeps failing closed (#4565 without reopening #3988/#4424). + const prover = + deps.proveArm64WslDockerDesktopGpu === undefined + ? defaultArm64WslDockerDesktopGpuProver() + : deps.proveArm64WslDockerDesktopGpu; + const proof = prover ? prover(parsed.map((p: ParsedGpu) => p.name)) : null; + if (!proof || !proof.passed) { return null; } + // The proof confirms a usable GPU, but it does not vouch for every + // row. Keep only the placeholder rows it covers plus any plausibly- + // named NVIDIA rows; drop unrecognized garbage so a mixed-row spoof + // cannot inflate totalMemoryMB with a phantom device. + trusted = parsed.filter( + (p: ParsedGpu) => + isDenylistedNvidiaGpuName(p.name) || isPlausibleNvidiaGpuName(p.name), + ); + wslDockerDesktopGpuProofPassed = true; + } else { if (!nvidiaHostLooksGenuine()) { return null; } @@ -402,6 +464,7 @@ export function detectGpu(): GpuDetection | null { nimCapable: canRunNimWithMemory(totalMemoryMB), platform, spark: platform === "spark", + ...(wslDockerDesktopGpuProofPassed ? 
{ wslDockerDesktopGpuProofPassed: true } : {}), }; } } diff --git a/src/lib/onboard/docker-gpu-local-inference.test.ts b/src/lib/onboard/docker-gpu-local-inference.test.ts index 5b9a4b77b0..58e2d90efd 100644 --- a/src/lib/onboard/docker-gpu-local-inference.test.ts +++ b/src/lib/onboard/docker-gpu-local-inference.test.ts @@ -307,6 +307,28 @@ describe("verifyGpuSandboxAfterReady", () => { expect(verifyDirectSandboxGpu).toHaveBeenCalledWith("alpha"); }); + it("captures the CUDA-usability proof onto the config for status persistence (#4231)", () => { + const proof = { status: "verified" as const, cudaVerified: true, at: "t" }; + // Fresh config so the assignment does not leak into the shared GPU_CONFIG. + const config: { sandboxGpuEnabled: boolean; sandboxGpuProof?: typeof proof | null } = { + sandboxGpuEnabled: true, + }; + verifyGpuSandboxAfterReady( + config, + "vllm-local", + baseOptions({ + verifyDirectSandboxGpu: vi.fn(() => proof), + deps: { + findContainerIds: () => ["container-abc"], + dockerCapture: vi.fn(() => inspectWithNetworkMode("host")), + dockerRun: dockerRunWithCurl({ status: 0 }), + sleep: vi.fn(), + }, + }), + ); + expect(config.sandboxGpuProof).toEqual(proof); + }); + it("does not duplicate proof diagnostics when Docker GPU patch verifier handles them", () => { const proofError = new Error("process.exit"); const verifyGpuOrExit = vi.fn(() => { diff --git a/src/lib/onboard/docker-gpu-local-inference.ts b/src/lib/onboard/docker-gpu-local-inference.ts index 933e9e730f..c61aa77458 100644 --- a/src/lib/onboard/docker-gpu-local-inference.ts +++ b/src/lib/onboard/docker-gpu-local-inference.ts @@ -8,6 +8,7 @@ import { getLocalProviderValidationBaseUrl, LOCAL_INFERENCE_SANDBOX_HOST_URL_ENV, } from "../inference/local"; +import type { SandboxGpuProofResult } from "../state/registry"; import { DOCKER_GPU_PATCH_NETWORK_ENV, type DockerGpuPatchMode, @@ -31,6 +32,10 @@ const DOCKER_GPU_INFERENCE_PROBE_RETRY_DELAY_SECS = 2; type DockerGpuLocalInferenceConfig = { 
sandboxGpuEnabled: boolean; sandboxGpuDevice?: string | null; + // Written back by `verifyGpuSandboxAfterReady` with the CUDA-usability proof + // result so the registry/`status` can distinguish a configured GPU from a + // proven-usable one (#4231). + sandboxGpuProof?: SandboxGpuProofResult | null; }; type DockerGpuLocalInferenceOptions = { @@ -369,8 +374,10 @@ export type GpuSandboxAfterReadyOptions = { sandboxName: string; dockerDriverGateway: boolean; useDockerGpuPatch: boolean; - verifyDirectSandboxGpu: (sandboxName: string) => void; - verifyGpuOrExit?: (verifyDirectSandboxGpu: (sandboxName: string) => void) => void; + verifyDirectSandboxGpu: (sandboxName: string) => SandboxGpuProofResult; + verifyGpuOrExit?: ( + verifyDirectSandboxGpu: (sandboxName: string) => SandboxGpuProofResult, + ) => SandboxGpuProofResult; selectedMode: () => DockerGpuPatchMode | null; runCaptureOpenshell: (args: string[], opts?: Record) => string; env?: NodeJS.ProcessEnv; @@ -393,11 +400,12 @@ export function verifyGpuSandboxAfterReady( options: GpuSandboxAfterReadyOptions, ): void { try { - if (options.verifyGpuOrExit) { - options.verifyGpuOrExit(options.verifyDirectSandboxGpu); - } else { - options.verifyDirectSandboxGpu(options.sandboxName); - } + // Capture the CUDA-usability proof result and write it back onto the shared + // config so onboarding can persist it to the registry and `status` can + // report proven usability rather than mere configuration (#4231). + config.sandboxGpuProof = options.verifyGpuOrExit + ? 
options.verifyGpuOrExit(options.verifyDirectSandboxGpu) + : options.verifyDirectSandboxGpu(options.sandboxName); } catch (error) { // `verifyGpuOrExit` is supplied by the Docker GPU create patch and already // prints the richer Error-phase / patched-container diagnostics before diff --git a/src/lib/onboard/docker-gpu-sandbox-create.ts b/src/lib/onboard/docker-gpu-sandbox-create.ts index 00e7dd3fbd..2c22f46608 100644 --- a/src/lib/onboard/docker-gpu-sandbox-create.ts +++ b/src/lib/onboard/docker-gpu-sandbox-create.ts @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { getSandboxFailurePhase } from "../state/gateway"; +import type { SandboxGpuProofResult } from "../state/registry"; import type { DockerGpuPatchBackend, DockerGpuPatchDeps, @@ -19,7 +21,6 @@ import { shouldApplyDockerGpuPatch, waitForOpenShellSupervisorReconnect, } from "./docker-gpu-patch"; -import { getSandboxFailurePhase } from "../state/gateway"; type DockerGpuSandboxCreateDeps = Pick< DockerGpuPatchDeps, @@ -64,9 +65,12 @@ export type DockerGpuSandboxCreatePatch = { * Run the GPU proof while distinguishing "sandbox in terminal phase" from * "proof failed inside a live sandbox". Calls `process.exit(1)` for the * former and rethrows after printing diagnostics for the latter so the - * onboarding flow surfaces the right failure cause (#4316). + * onboarding flow surfaces the right failure cause (#4316). Returns the + * CUDA-usability proof result on success so callers can persist it (#4231). 
*/ - verifyGpuOrExit: (verifyDirectSandboxGpu: (sandboxName: string) => void) => void; + verifyGpuOrExit: ( + verifyDirectSandboxGpu: (sandboxName: string) => SandboxGpuProofResult, + ) => SandboxGpuProofResult; }; export function createDockerGpuSandboxCreatePatch( @@ -209,7 +213,7 @@ export function createDockerGpuSandboxCreatePatch( } } try { - verifyDirectSandboxGpu(sandboxName); + return verifyDirectSandboxGpu(sandboxName); } catch (error) { printDockerGpuProofFailure(sandboxName, error, result?.mode ?? null, { runCaptureOpenshell: options.deps.runCaptureOpenshell, diff --git a/src/lib/onboard/sandbox-gpu-mode.ts b/src/lib/onboard/sandbox-gpu-mode.ts index dc6034e301..fe7ef7433b 100644 --- a/src/lib/onboard/sandbox-gpu-mode.ts +++ b/src/lib/onboard/sandbox-gpu-mode.ts @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 import type { GpuDetection } from "../inference/nim"; +import type { SandboxGpuProofResult } from "../state/registry"; export type SandboxGpuMode = "auto" | "1" | "0"; export type SandboxGpuFlag = "enable" | "disable" | null; @@ -13,6 +14,10 @@ export type SandboxGpuConfig = { sandboxGpuEnabled: boolean; sandboxGpuDevice: string | null; errors: string[]; + // Outcome of the live direct sandbox GPU proof, populated after onboarding + // runs the verifier so it can be persisted to the registry (#4231). Absent + // until the proof runs; never overwrites a stored proof on reuse paths. 
+ sandboxGpuProof?: SandboxGpuProofResult | null; }; export type ResumeSandboxGpuOverrides = { diff --git a/src/lib/onboard/sandbox-gpu-preflight.test.ts b/src/lib/onboard/sandbox-gpu-preflight.test.ts index b42a790a1f..193306879d 100644 --- a/src/lib/onboard/sandbox-gpu-preflight.test.ts +++ b/src/lib/onboard/sandbox-gpu-preflight.test.ts @@ -156,25 +156,118 @@ describe("sandbox GPU preflight", () => { ); }); - it("treats optional direct sandbox GPU proof failures as non-fatal", () => { + it("treats optional direct sandbox GPU proof failures as non-fatal and reports unverified", () => { const runOpenshell = vi.fn(() => ({ status: 1, stdout: "", stderr: "optional proof failed" })); const verifier = createDirectSandboxGpuVerifier({ runOpenshell, + detectNvidiaPlatform: () => "linux", buildDirectSandboxGpuProofCommands: vi.fn(() => [ - { args: ["sandbox", "exec", "demo", "--", "nvidia-smi"], label: "nvidia-smi", optional: true }, - { args: ["sandbox", "exec", "demo", "--", "false"], label: "fatal proof" }, + { id: "nvidia-smi", args: ["sandbox", "exec", "demo", "--", "nvidia-smi"], label: "nvidia-smi", optional: true }, + { id: "cuda-init", args: ["sandbox", "exec", "demo", "--", "false"], label: "cuda-init", optional: true }, + ]), + compactText: (value) => value.trim(), + redact: (value) => String(value), + }); + + let result: ReturnType | undefined; + expect(() => { + result = verifier("demo"); + }).not.toThrow(); + // Optional failures no longer short-circuit; every optional proof runs so + // the CUDA-usability outcome is observed rather than swallowed (#4231). 
+ expect(runOpenshell).toHaveBeenCalledTimes(2); + expect(result?.status).toBe("unverified"); + expect(result?.cudaVerified).toBe(false); + }); + + it("reports failed when the CUDA usability proof reaches the driver and fails (#4231)", () => { + const verifier = createDirectSandboxGpuVerifier({ + runOpenshell: vi.fn((args: string[]) => { + if (args.includes("cuda-init-cmd")) { + return { status: 1, stdout: "cuInit(0)=999", stderr: "" }; + } + return { status: 0, stdout: "", stderr: "" }; + }), + detectNvidiaPlatform: () => "jetson", + buildDirectSandboxGpuProofCommands: vi.fn(() => [ + { id: "nvidia-smi", args: ["sandbox", "exec", "demo", "--", "nvidia-smi"], label: "nvidia-smi" }, + { id: "cuda-init", args: ["sandbox", "exec", "demo", "--", "cuda-init-cmd"], label: "cuInit(0)", optional: true }, + ]), + compactText: (value) => value.trim(), + redact: (value) => String(value), + }); + + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => undefined); + try { + const result = verifier("demo"); + expect(result.status).toBe("failed"); + expect(result.cudaVerified).toBe(false); + expect(result.detail).toContain("cuInit(0)=999"); + const warnings = warnSpy.mock.calls.map((call) => call[0]).join("\n"); + expect(warnings).toContain("/dev/nvmap"); + } finally { + warnSpy.mockRestore(); + } + }); + + it("reports verified when the CUDA usability proof passes", () => { + const verifier = createDirectSandboxGpuVerifier({ + runOpenshell: vi.fn(() => ({ status: 0, stdout: "cuInit(0)=0", stderr: "" })), + detectNvidiaPlatform: () => "linux", + buildDirectSandboxGpuProofCommands: vi.fn(() => [ + { id: "cuda-init", args: ["sandbox", "exec", "demo", "--", "cuda"], label: "cuInit(0)", optional: true }, + ]), + compactText: (value) => value.trim(), + redact: (value) => String(value), + }); + + const result = verifier("demo"); + expect(result.status).toBe("verified"); + expect(result.cudaVerified).toBe(true); + }); + + it("does not report verified when cuda-init exits 0 
without the cuInit marker", () => { + // A zero exit that never printed `cuInit(0)=` (e.g. a wrapper that swallowed + // the real exit code) must not be trusted as CUDA-verified. + const verifier = createDirectSandboxGpuVerifier({ + runOpenshell: vi.fn(() => ({ status: 0, stdout: "", stderr: "" })), + detectNvidiaPlatform: () => "linux", + buildDirectSandboxGpuProofCommands: vi.fn(() => [ + { id: "cuda-init", args: ["sandbox", "exec", "demo", "--", "cuda"], label: "cuInit(0)", optional: true }, + ]), + compactText: (value) => value.trim(), + redact: (value) => String(value), + }); + + const result = verifier("demo"); + expect(result.status).toBe("unverified"); + expect(result.cudaVerified).toBe(false); + }); + + it("treats a zero exit with a non-zero cuInit code as failed, not verified (#4231)", () => { + // A wrapper that swallows the probe's non-zero exit but still prints a + // non-zero `cuInit(0)=` reached the driver and CUDA failed; it must not + // read as verified just because the process exited 0. 
+ const verifier = createDirectSandboxGpuVerifier({ + runOpenshell: vi.fn(() => ({ status: 0, stdout: "cuInit(0)=999", stderr: "" })), + detectNvidiaPlatform: () => "linux", + buildDirectSandboxGpuProofCommands: vi.fn(() => [ + { id: "cuda-init", args: ["sandbox", "exec", "demo", "--", "cuda"], label: "cuInit(0)", optional: true }, ]), compactText: (value) => value.trim(), redact: (value) => String(value), }); - expect(() => verifier("demo")).not.toThrow(); - expect(runOpenshell).toHaveBeenCalledTimes(1); + const result = verifier("demo"); + expect(result.status).toBe("failed"); + expect(result.cudaVerified).toBe(false); + expect(result.detail).toContain("cuInit(0)=999"); }); it("throws on required direct sandbox GPU proof failures", () => { const verifier = createDirectSandboxGpuVerifier({ runOpenshell: vi.fn(() => ({ status: 1, stdout: "", stderr: "required proof failed" })), + detectNvidiaPlatform: () => "linux", buildDirectSandboxGpuProofCommands: vi.fn(() => [ { args: ["sandbox", "exec", "demo", "--", "false"], label: "fatal proof" }, ]), @@ -192,6 +285,7 @@ describe("sandbox GPU preflight", () => { env: { WSL_DISTRO_NAME: "Ubuntu" }, dockerInfoFormat: vi.fn(() => '"Docker Desktop"'), runOpenshell: vi.fn(() => ({ status: 1, stdout: "", stderr: "required proof failed" })), + detectNvidiaPlatform: () => "linux", buildDirectSandboxGpuProofCommands: vi.fn(() => [ { args: ["sandbox", "exec", "demo", "--", "false"], label: "fatal proof" }, ]), diff --git a/src/lib/onboard/sandbox-gpu-preflight.ts b/src/lib/onboard/sandbox-gpu-preflight.ts index d3ea20e369..2c9529b4a4 100644 --- a/src/lib/onboard/sandbox-gpu-preflight.ts +++ b/src/lib/onboard/sandbox-gpu-preflight.ts @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 import { dockerInfoFormat } from "../adapters/docker"; +import type { GpuDetection } from "../inference/nim"; +import type { SandboxGpuProofResult } from "../state/registry"; import { findReadableNvidiaCdiSpecFiles, getDockerCdiSpecDirs } from 
"./docker-cdi"; import type { SandboxGpuConfig, SandboxGpuFlag } from "./sandbox-gpu-mode"; import { @@ -45,6 +47,21 @@ export function resolveSandboxGpuFlagFromOptions(opts: SandboxGpuFlagOptions): S return null; } +// Jetson/Tegra CUDA failures are usually device/group permission issues rather +// than CDI/runtime misconfiguration: the sandbox sees the GPU but the agent +// user lacks access to the Tegra device nodes. Surface the concrete devices and +// groups so the user can fix the recreate rather than seeing a bare "enabled" +// status that hides an unusable GPU (#4231). +export function jetsonGpuProofRemediationLines(): string[] { + return [ + "Jetson/Tegra CUDA proof did not pass. CUDA needs access to the Tegra device", + "nodes; confirm the sandbox propagates them and the agent user's groups:", + " ls -l /dev/nvmap /dev/nvhost-* (must be readable by the sandbox)", + " add the host video/render groups via --group-add when recreating", + "Then recreate the sandbox, or force CPU behavior with NEMOCLAW_SANDBOX_GPU=0.", + ]; +} + export function sandboxGpuRemediationLines( options: { wslDockerDesktop?: boolean; wslDockerDesktopStatus?: WslDockerDesktopStatus } = {}, ): string[] { @@ -143,43 +160,136 @@ export interface DirectSandboxGpuVerifierDeps extends WslDockerDesktopDetectionD opts?: Record, ): { status?: number | null; stdout?: unknown; stderr?: unknown }; buildDirectSandboxGpuProofCommands?: (sandboxName: string) => Array<{ + id?: string; args: string[]; label: string; optional?: boolean; }>; compactText(value: string): string; redact(value: unknown): string; + // Host firmware platform resolver, used to choose Jetson-specific remediation + // when a CUDA proof fails. Defaults to the live `nim.detectNvidiaPlatform()` + // so onboarding does not have to thread the platform through. Injected in + // tests to exercise the Jetson path without Jetson firmware. 
+ detectNvidiaPlatform?: () => GpuDetection["platform"] | null; } -export function createDirectSandboxGpuVerifier(deps: DirectSandboxGpuVerifierDeps) { - return function verifyDirectSandboxGpu(sandboxName: string): void { +// The proof whose result decides CUDA usability. `cuInit(0)` via libcuda is the +// authoritative usability signal (it actually initializes the CUDA driver), so +// a clean pass means "verified" and a run that reaches the driver and fails +// means "failed" rather than merely "unverified". +const CUDA_USABILITY_PROOF_ID = "cuda-init"; +// Capture the cuInit(0) return code so we can require it to be 0 for a verified +// result. Matching only the marker text is not enough: a wrapper that swallows +// the probe's non-zero exit but still prints `cuInit(0)=` would otherwise +// read as verified for an unusable GPU (#4231). +const CUDA_INIT_RESULT_PATTERN = /cuInit\(0\)=(-?\d+)/; + +export type VerifyDirectSandboxGpu = ( + sandboxName: string, + hostGpuPlatform?: GpuDetection["platform"] | null, +) => SandboxGpuProofResult; + +export function createDirectSandboxGpuVerifier( + deps: DirectSandboxGpuVerifierDeps, +): VerifyDirectSandboxGpu { + return function verifyDirectSandboxGpu( + sandboxName: string, + hostGpuPlatform?: GpuDetection["platform"] | null, + ): SandboxGpuProofResult { console.log(" Verifying direct sandbox GPU access..."); + const resolvedPlatform = + hostGpuPlatform !== undefined + ? hostGpuPlatform + : (deps.detectNvidiaPlatform ?? require("../inference/nim").detectNvidiaPlatform)(); const buildProofCommands = deps.buildDirectSandboxGpuProofCommands ?? require("./initial-policy").buildDirectSandboxGpuProofCommands; + let cudaVerified = false; + // A CUDA-usability proof that reached the driver and failed (vs one that + // could not run at all). Records the proof that determines "failed" status. 
+ let cudaFailure: { label: string; detail: string } | null = null; for (const proof of buildProofCommands(sandboxName)) { const result = deps.runOpenshell(proof.args, { ignoreError: true, suppressOutput: true, timeout: 30_000, }); + // Test the cuInit marker against the FULL combined output; truncation to + // 300 chars is only for display/storage, so a verbose proof cannot push + // the marker past the cutoff and silently downgrade the classification. + const rawOutput = deps.redact(`${result.stderr || ""} ${result.stdout || ""}`); + const cudaInitMatch = rawOutput.match(CUDA_INIT_RESULT_PATTERN); + const cudaInitRan = cudaInitMatch !== null; + // Only `cuInit(0)=0` proves usability; any other return code means the + // driver was reached but initialization failed. + const cudaInitSucceeded = cudaInitMatch?.[1] === "0"; + const diagnostic = deps.compactText(rawOutput).slice(0, 300); if (result.status === 0) { console.log(` ✓ GPU proof passed: ${proof.label}`); + if (proof.id === CUDA_USABILITY_PROOF_ID && cudaInitRan) { + // Require the cuInit(0)=0 marker on success too, symmetric with the + // failure path: a zero exit without driver initialization, or a + // wrapper that swallowed a non-zero exit but still printed a non-zero + // cuInit code, must not read as verified — treat the latter as failed. + if (cudaInitSucceeded) { + cudaVerified = true; + } else { + cudaFailure = { label: proof.label, detail: diagnostic }; + } + } continue; } - if (proof.optional === true) return; - const diagnostic = deps.compactText(deps.redact(`${result.stderr || ""} ${result.stdout || ""}`)); - console.error(` ✗ GPU proof failed: ${proof.label}`); - if (diagnostic) console.error(` ${diagnostic.slice(0, 300)}`); - for (const line of sandboxGpuRemediationLines({ - wslDockerDesktopStatus: detectWslDockerDesktopStatus(deps), - })) { - console.error(` ${line}`); + if (proof.optional !== true) { + // Required proof (e.g. 
the sandbox-exec wrapper itself): keep the + // historical hard-fail so onboarding aborts and rolls back. + console.error(` ✗ GPU proof failed: ${proof.label}`); + if (diagnostic) console.error(` ${diagnostic}`); + for (const line of sandboxGpuRemediationLines({ + wslDockerDesktopStatus: detectWslDockerDesktopStatus(deps), + })) { + console.error(` ${line}`); + } + const statusText = String(result.status || 1); + const diagnosticSuffix = diagnostic ? `: ${diagnostic}` : ""; + throw new Error(`GPU proof failed: ${proof.label} (status ${statusText})${diagnosticSuffix}`); } - const statusText = String(result.status || 1); - const diagnosticSuffix = diagnostic ? `: ${diagnostic.slice(0, 300)}` : ""; - throw new Error(`GPU proof failed: ${proof.label} (status ${statusText})${diagnosticSuffix}`); + // Optional proof failure is non-fatal but is no longer swallowed: a + // CUDA-usability proof that reached the driver and failed marks the GPU + // as proven-unusable so `status` can report it instead of "enabled" + // (#4231, Jetson /dev/nvmap permission failures). + if (proof.id === CUDA_USABILITY_PROOF_ID && cudaInitRan) { + cudaFailure = { label: proof.label, detail: diagnostic }; + } + console.warn(` ⚠ GPU proof inconclusive: ${proof.label}`); + if (diagnostic) console.warn(` ${diagnostic}`); + } + const status: SandboxGpuProofResult["status"] = cudaVerified + ? "verified" + : cudaFailure + ? "failed" + : "unverified"; + if (status === "verified") { + console.log(" ✓ Sandbox CUDA usability proven (cuInit succeeded)."); + } else if (status === "failed") { + console.warn(` ⚠ Sandbox CUDA proof failed: ${cudaFailure?.label}`); + const lines = + resolvedPlatform === "jetson" + ? 
jetsonGpuProofRemediationLines() + : sandboxGpuRemediationLines({ wslDockerDesktopStatus: detectWslDockerDesktopStatus(deps) }); + for (const line of lines) console.warn(` ${line}`); + } else { + console.warn( + " ⚠ Sandbox GPU enabled but CUDA usability is unverified (no CUDA proof ran).", + ); } + return { + status, + cudaVerified, + label: cudaFailure?.label ?? null, + detail: cudaFailure?.detail ?? null, + at: new Date().toISOString(), + }; }; } diff --git a/src/lib/onboard/sandbox-registry-metadata.ts b/src/lib/onboard/sandbox-registry-metadata.ts index a14429d03f..4242554657 100644 --- a/src/lib/onboard/sandbox-registry-metadata.ts +++ b/src/lib/onboard/sandbox-registry-metadata.ts @@ -21,6 +21,7 @@ export interface SandboxRegistryMetadataHelpers { | "sandboxGpuEnabled" | "sandboxGpuMode" | "sandboxGpuDevice" + | "sandboxGpuProof" | "openshellDriver" | "openshellVersion" >; @@ -46,6 +47,7 @@ export function createSandboxRegistryMetadataHelpers( | "sandboxGpuEnabled" | "sandboxGpuMode" | "sandboxGpuDevice" + | "sandboxGpuProof" | "openshellDriver" | "openshellVersion" > { @@ -59,6 +61,9 @@ export function createSandboxRegistryMetadataHelpers( sandboxGpuEnabled: config.sandboxGpuEnabled, sandboxGpuMode: config.mode, sandboxGpuDevice: config.sandboxGpuDevice, + // Only persist a proof when this run produced one; omit on reuse/update + // paths so a prior proof result is preserved rather than nulled out. + ...(config.sandboxGpuProof ? { sandboxGpuProof: config.sandboxGpuProof } : {}), openshellDriver: deps.isLinuxDockerDriverGatewayEnabled() ? 
"docker" : "kubernetes", openshellVersion: deps.getInstalledOpenshellVersion( deps.runCaptureOpenshell(["--version"], { ignoreError: true }), diff --git a/src/lib/onboard/wsl-docker-desktop-gpu.test.ts b/src/lib/onboard/wsl-docker-desktop-gpu.test.ts index e0103f978b..32c482cad1 100644 --- a/src/lib/onboard/wsl-docker-desktop-gpu.test.ts +++ b/src/lib/onboard/wsl-docker-desktop-gpu.test.ts @@ -8,11 +8,15 @@ vi.mock("../adapters/docker", () => ({ })); import { + createArm64WslDockerDesktopGpuProver, detectWslDockerDesktopStatus, + isExecFormatErrorDiagnostic, isWslDockerDesktopRuntime, WSL_DOCKER_DESKTOP_GPU_COMPATIBILITY_REMOVAL_CONDITION, + WSL_DOCKER_DESKTOP_GPU_PROOF_COMMAND, wslDockerDesktopGpuCompatibilityAction, wslDockerDesktopGpuCompatibilityRemediationLines, + wslDockerDesktopGpuProofTimeoutMs, } from "./wsl-docker-desktop-gpu"; describe("WSL Docker Desktop GPU compatibility helpers", () => { @@ -64,3 +68,109 @@ describe("WSL Docker Desktop GPU compatibility helpers", () => { expect(WSL_DOCKER_DESKTOP_GPU_COMPATIBILITY_REMOVAL_CONDITION).toContain("Remove"); }); }); + +describe("createArm64WslDockerDesktopGpuProver (#4565)", () => { + const passingProof = { passed: true, timedOut: false, exitCode: 0, diagnostic: "" }; + + it("returns null on non-ARM64 hosts without running the proof", () => { + const runProof = vi.fn(() => passingProof); + const prover = createArm64WslDockerDesktopGpuProver({ + platform: "linux", + arch: "x64", + detectWslDockerDesktopStatus: () => "docker-desktop", + runProof, + log: () => undefined, + }); + expect(prover(["JMJWOA-Generic-GPU"])).toBeNull(); + expect(runProof).not.toHaveBeenCalled(); + }); + + it("returns null when the host is not Docker Desktop-backed WSL", () => { + const runProof = vi.fn(() => passingProof); + const prover = createArm64WslDockerDesktopGpuProver({ + platform: "linux", + arch: "arm64", + detectWslDockerDesktopStatus: () => "not-docker-desktop", + runProof, + log: () => undefined, + }); + 
expect(prover(["JMJWOA-Generic-GPU"])).toBeNull(); + expect(runProof).not.toHaveBeenCalled(); + }); + + it("runs the bounded proof and reports the result on ARM64 Docker Desktop WSL", () => { + const runProof = vi.fn((_argv: string[], _timeoutMs: number) => passingProof); + const prover = createArm64WslDockerDesktopGpuProver({ + platform: "linux", + arch: "arm64", + detectWslDockerDesktopStatus: () => "docker-desktop", + runProof, + log: () => undefined, + }); + expect(prover(["JMJWOA-Generic-GPU"])).toEqual(passingProof); + expect(runProof).toHaveBeenCalledTimes(1); + const argv = runProof.mock.calls[0]?.[0] ?? []; + expect(argv[0]).toBe("docker"); + expect(argv).toContain("--gpus"); + }); + + it("uses an arch-correct CUDA sample image (not the amd64-only nbody) on this ARM64 path", () => { + // The proof only runs on ARM64, so the image must ship a real aarch64 CUDA + // binary. `cuda-sample:nbody` packs an x86-64 binary in its arm64 tag and + // fails with `exec format error` on the N1X target (#4565); the chosen + // vectorAdd image ships a genuine aarch64 binary. 
+ expect(WSL_DOCKER_DESKTOP_GPU_PROOF_COMMAND).toContain("cuda-sample:vectoradd"); + expect(WSL_DOCKER_DESKTOP_GPU_PROOF_COMMAND).not.toContain("nbody"); + }); + + it("propagates a failing proof so detection stays fail-closed", () => { + const failing = { passed: false, timedOut: false, exitCode: 1, diagnostic: "no CUDA device" }; + const prover = createArm64WslDockerDesktopGpuProver({ + platform: "linux", + arch: "arm64", + detectWslDockerDesktopStatus: () => "docker-desktop", + runProof: () => failing, + log: () => undefined, + }); + expect(prover(["JMJWOA-Generic-GPU"])?.passed).toBe(false); + }); + + it("flags an exec-format-error proof as an image-arch problem, not a missing GPU (#4565)", () => { + const execFormatFailure = { + passed: false, + timedOut: false, + exitCode: 1, + diagnostic: "exec /cuda-samples/sample: exec format error", + }; + const logs: string[] = []; + const prover = createArm64WslDockerDesktopGpuProver({ + platform: "linux", + arch: "arm64", + detectWslDockerDesktopStatus: () => "docker-desktop", + runProof: () => execFormatFailure, + log: (message) => logs.push(message), + }); + // Still fail-closed (no false positive), but the operator-facing message + // must distinguish an image-architecture bug from a missing GPU. 
+ expect(prover(["JMJWOA-Generic-GPU"])?.passed).toBe(false); + const combined = logs.join("\n"); + expect(combined).toContain("architecture"); + expect(combined).not.toContain("treating GPU as unproven"); + }); + + it("honors a positive NEMOCLAW_WSL_GPU_PROOF_TIMEOUT_MS override", () => { + expect(wslDockerDesktopGpuProofTimeoutMs({ NEMOCLAW_WSL_GPU_PROOF_TIMEOUT_MS: "5000" })).toBe(5000); + expect(wslDockerDesktopGpuProofTimeoutMs({})).toBeGreaterThan(0); + expect(wslDockerDesktopGpuProofTimeoutMs({ NEMOCLAW_WSL_GPU_PROOF_TIMEOUT_MS: "-1" })).toBeGreaterThan( + 0, + ); + }); + + it("detects Docker exec-format-error diagnostics", () => { + expect(isExecFormatErrorDiagnostic("exec /cuda-samples/sample: exec format error")).toBe(true); + expect(isExecFormatErrorDiagnostic("standard_init_linux.go: exec format error")).toBe(true); + expect(isExecFormatErrorDiagnostic("no CUDA-capable device is detected")).toBe(false); + expect(isExecFormatErrorDiagnostic(null)).toBe(false); + expect(isExecFormatErrorDiagnostic(undefined)).toBe(false); + }); +}); diff --git a/src/lib/onboard/wsl-docker-desktop-gpu.ts b/src/lib/onboard/wsl-docker-desktop-gpu.ts index a8870bbac3..c30e6b43c4 100644 --- a/src/lib/onboard/wsl-docker-desktop-gpu.ts +++ b/src/lib/onboard/wsl-docker-desktop-gpu.ts @@ -3,12 +3,38 @@ import fs from "node:fs"; import os from "node:os"; - import { dockerInfoFormat as defaultDockerInfoFormat } from "../adapters/docker"; +import type { + Arm64WslDockerDesktopGpuProver, + DockerGpuProofResult, +} from "../inference/gpu-trust"; const WSL_DOCKER_DESKTOP_DETECTION_TIMEOUT_MS = 30_000; +// This prover only ever runs on ARM64 (see `createArm64WslDockerDesktopGpuProver`), +// so the proof image MUST ship a real aarch64 CUDA binary. The older +// `cuda-sample:nbody` image is unusable here: its arm64 manifest entry actually +// contains an x86-64 ELF, so on the N1X Windows-ARM target it fails with +// `exec /cuda-samples/sample: exec format error` (#4565). 
`vectoradd-cuda12.5.0` +// ships a genuine aarch64 binary and runs a real CUDA kernel (device alloc + +// add + result verification), which is a strong usability proof that still +// fails closed on the Snapdragon nvidia-smi shim (no usable CUDA device, #3988). +// The image's entrypoint runs vectorAdd directly, so no trailing args are needed. export const WSL_DOCKER_DESKTOP_GPU_PROOF_COMMAND = - "docker run --rm --gpus all nvcr.io/nvidia/k8s/cuda-sample:nbody nbody -gpu -benchmark"; + "docker run --rm --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0"; + +// The proof runs a real CUDA workload and may first pull the CUDA sample image, +// so it is bounded generously (3 min) rather than with the 30s detection +// timeout. Operators on slow links can override via +// NEMOCLAW_WSL_GPU_PROOF_TIMEOUT_MS. The timeout is the safety bound that keeps +// onboarding from hanging if Docker Desktop GPU passthrough stalls. +const WSL_DOCKER_DESKTOP_GPU_PROOF_DEFAULT_TIMEOUT_MS = 180_000; + +export function wslDockerDesktopGpuProofTimeoutMs( + env: NodeJS.ProcessEnv = process.env, +): number { + const raw = Number(env.NEMOCLAW_WSL_GPU_PROOF_TIMEOUT_MS); + return Number.isFinite(raw) && raw > 0 ? raw : WSL_DOCKER_DESKTOP_GPU_PROOF_DEFAULT_TIMEOUT_MS; +} // Source-of-truth for this compatibility branch: Docker Desktop-backed WSL can // advertise Docker CDI directories while the WSL distro cannot see a usable @@ -115,6 +141,100 @@ export function wslDockerDesktopGpuCompatibilityRemediationLines( return null; } +export type Arm64WslDockerDesktopGpuProverDeps = WslDockerDesktopDetectionDeps & { + arch?: string; + detectWslDockerDesktopStatus?: (deps: WslDockerDesktopDetectionDeps) => WslDockerDesktopStatus; + runProof?: (argv: string[], timeoutMs: number) => DockerGpuProofResult; + log?: (message: string) => void; +}; + +// Split the fixed proof command constant into an argv. 
The command is repo- +// controlled and contains no quoting, so a whitespace split is exact and avoids +// routing the bounded proof through a shell. +function wslDockerDesktopGpuProofArgv(): string[] { + return WSL_DOCKER_DESKTOP_GPU_PROOF_COMMAND.split(/\s+/).filter(Boolean); +} + +// Docker reports an architecture mismatch (proof image built for a different +// CPU than the host) as `exec ...: exec format error`. On this ARM64-only path +// that means the proof image's binary is not aarch64 — a packaging/image bug, +// not a "no GPU" condition — so we must not let it read as a missing GPU (#4565). +export function isExecFormatErrorDiagnostic(diagnostic: string | null | undefined): boolean { + return typeof diagnostic === "string" && /exec format error/i.test(diagnostic); +} + +function runWslDockerDesktopGpuProof(argv: string[], timeoutMs: number): DockerGpuProofResult { + try { + // Lazy require: keeps this onboard module from statically pulling in the + // runner (and its transitive platform require) at import time. + const { runCaptureEx } = require("../runner") as typeof import("../runner"); + const result = runCaptureEx(argv, { timeout: timeoutMs }); + // Docker daemon errors ("could not select device driver") and CUDA-sample + // failures ("no CUDA-capable device is detected") are written to stderr, so + // prefer it for the diagnostic and fall back to stdout (vectorAdd output). + const diagnosticSource = result.stderr || result.stdout; + return { + passed: result.exitCode === 0 && !result.timedOut, + timedOut: result.timedOut, + exitCode: result.exitCode, + diagnostic: diagnosticSource.slice(0, 300), + }; + } catch (err) { + return { + passed: false, + timedOut: false, + exitCode: null, + diagnostic: err instanceof Error ? err.message.slice(0, 300) : String(err).slice(0, 300), + }; + } +} + +// Build the ARM64 WSL Docker Desktop GPU prover consumed by `detectGpu()` for +// denylisted `JMJWOA-Generic-*` names (#4565). 
Returns `null` for any host that +// is not ARM64 Linux on Docker Desktop-backed WSL, so the #3988/#4424 fail- +// closed default is preserved everywhere else. When the host IS a candidate it +// runs one bounded Docker `--gpus` CUDA workload (the aarch64 vectorAdd sample): +// a real N1X GPU passes, while the Snapdragon nvidia-smi shim — which has no +// usable CUDA device — cannot, so the placeholder name alone is never trusted. +export function createArm64WslDockerDesktopGpuProver( + deps: Arm64WslDockerDesktopGpuProverDeps = {}, +): Arm64WslDockerDesktopGpuProver { + const log = deps.log ?? ((message: string) => console.log(message)); + const detectStatus = deps.detectWslDockerDesktopStatus ?? detectWslDockerDesktopStatus; + const runProof = deps.runProof ?? runWslDockerDesktopGpuProof; + return function proveArm64WslDockerDesktopGpu( + gpuNames: string[], + ): DockerGpuProofResult | null { + const platform = deps.platform ?? process.platform; + const arch = deps.arch ?? process.arch; + if (platform !== "linux" || arch !== "arm64") return null; + if (detectStatus(deps) !== "docker-desktop") return null; + const names = gpuNames.filter(Boolean).join(", ") || "generic ARM64 GPU"; + log( + ` Running bounded Docker Desktop WSL GPU proof for ${names} (may pull a CUDA sample image)...`, + ); + log(` ${WSL_DOCKER_DESKTOP_GPU_PROOF_COMMAND}`); + const result = runProof(wslDockerDesktopGpuProofArgv(), wslDockerDesktopGpuProofTimeoutMs(deps.env)); + if (result.passed) { + log(" ✓ Docker Desktop WSL GPU proof passed; trusting the reported GPU."); + } else if (result.timedOut) { + log(" ✗ Docker Desktop WSL GPU proof timed out; treating GPU as unproven (CPU fallback)."); + log(" Rerun with --no-gpu to skip GPU passthrough, or raise NEMOCLAW_WSL_GPU_PROOF_TIMEOUT_MS."); + } else if (isExecFormatErrorDiagnostic(result.diagnostic)) { + // The proof binary's architecture did not match the host. 
This is an image + // problem, not a GPU problem, so call it out explicitly rather than letting + // the host fall back to CPU as if no GPU were present (#4565). + log(" ✗ Docker Desktop WSL GPU proof could not run: CUDA sample image architecture does not"); + log(" match this host (exec format error). This is a proof-image issue, not a missing GPU."); + log(" Rerun with --no-gpu to skip GPU passthrough, or report this so the proof image can be fixed."); + } else { + log(" ✗ Docker Desktop WSL GPU proof failed; treating GPU as unproven (CPU fallback)."); + log(" Rerun with --no-gpu to skip GPU passthrough."); + } + return result; + }; +} + export function wslDockerDesktopGpuCompatibilityAction(): WslDockerDesktopGpuCompatibilityAction { return { id: "wsl_docker_desktop_gpu_compatibility", diff --git a/src/lib/runner.ts b/src/lib/runner.ts index bda565a579..1503eed20a 100644 --- a/src/lib/runner.ts +++ b/src/lib/runner.ts @@ -265,6 +265,10 @@ const { redact, redactError, writeRedactedResult } = require("./security/redact" /** Structured result returned by runCaptureEx. */ export interface CaptureResult { stdout: string; + /** Captured stderr, trimmed. Many tools (docker, CUDA samples) write their + * actionable failure text here, so callers building diagnostics need it. + * Optional so existing `runCaptureEx` test seams stay source-compatible. */ + stderr?: string; exitCode: number | null; /** True when spawnSync sets result.error due to a timeout (ETIMEDOUT). */ timedOut: boolean; @@ -299,8 +303,10 @@ function runCaptureEx(cmd: readonly string[], opts: Omit status` can report proof state instead of treating any +// configured GPU as healthy (#4231). +export type SandboxGpuProofStatus = "verified" | "unverified" | "failed"; + +export interface SandboxGpuProofResult { + status: SandboxGpuProofStatus; + // True only when a CUDA-usability proof (cuInit via libcuda) actually passed. + cudaVerified: boolean; + // Label of the last proof that determined `status`. 
+ label?: string | null; + // Redacted, truncated diagnostic captured when the proof failed. + detail?: string | null; + at: string; +} + export interface SandboxEntry { name: string; createdAt?: string; @@ -26,6 +44,7 @@ export interface SandboxEntry { sandboxGpuEnabled?: boolean; sandboxGpuMode?: "auto" | "1" | "0" | string | null; sandboxGpuDevice?: string | null; + sandboxGpuProof?: SandboxGpuProofResult | null; openshellDriver?: string | null; openshellVersion?: string | null; policies?: string[]; @@ -218,6 +237,7 @@ export function registerSandbox(entry: SandboxEntry): void { sandboxGpuEnabled: entry.sandboxGpuEnabled === true, sandboxGpuMode: entry.sandboxGpuMode || null, sandboxGpuDevice: entry.sandboxGpuDevice || null, + sandboxGpuProof: entry.sandboxGpuProof ?? null, openshellDriver: entry.openshellDriver || null, openshellVersion: entry.openshellVersion || null, policies: entry.policies || [], diff --git a/test/e2e/test-gpu-e2e.sh b/test/e2e/test-gpu-e2e.sh index bda0b4a29d..a6d7af0589 100755 --- a/test/e2e/test-gpu-e2e.sh +++ b/test/e2e/test-gpu-e2e.sh @@ -285,6 +285,16 @@ if status_output=$(nemoclaw "$SANDBOX_NAME" status 2>&1); then else fail "Sandbox GPU is not enabled in status output" fi + # #4231: status must report proven CUDA usability, not a bare "enabled". On a + # working GPU host the onboarding cuInit proof passes, so status should carry + # the "(CUDA verified)" suffix rather than "(CUDA unverified)" or a failure. + if echo "$status_output" | grep -Fq "CUDA verified"; then + pass "Sandbox GPU status reports CUDA verified" + elif echo "$status_output" | grep -Eq "CUDA unverified|last CUDA proof failed"; then + fail "Sandbox GPU status shows CUDA not proven on a working GPU host" + else + skip "Sandbox GPU CUDA proof state not present in status output" + fi else fail "Could not read sandbox GPU status" fi