From 445b61634607a3f2af78c2451180aab800443378 Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 14:48:57 -0700 Subject: [PATCH 1/9] fix(onboard): detect stale or unrefreshed NVIDIA CDI specs in host preflight Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- scripts/install.sh | 8 +- src/lib/onboard.ts | 15 +- src/lib/onboard/preflight-cdi.test.ts | 703 ++++++++++++++++++++++++++ src/lib/onboard/preflight.test.ts | 340 ------------- src/lib/onboard/preflight.ts | 329 ++++++++++-- test/install-preflight.test.ts | 2 +- 6 files changed, 1015 insertions(+), 382 deletions(-) create mode 100644 src/lib/onboard/preflight-cdi.test.ts diff --git a/scripts/install.sh b/scripts/install.sh index 6e93f6b129..c08a2d98e8 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1861,7 +1861,7 @@ repair_installer_nvidia_cdi_spec() { const host = assessHost(); if ( host && - host.cdiNvidiaGpuSpecMissing && + (host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) && !isWslDockerDesktopRuntime(host) ) { process.stdout.write(getNvidiaCdiSpecPath(host)); @@ -1886,10 +1886,10 @@ repair_installer_nvidia_cdi_spec() { fi local sudo_cmd=() - info "Generating missing NVIDIA CDI device spec at ${spec_path}." + info "Refreshing NVIDIA CDI device spec at ${spec_path}." info "NVIDIA GPU passthrough uses CDI specs so Docker/OpenShell can request nvidia.com/gpu devices." - info "Docker is configured for CDI, but the nvidia.com/gpu spec is missing." - info "Without it, OpenShell gateway startup would fail before the sandbox can use the GPU." + info "Docker is configured for CDI, but the nvidia.com/gpu spec is missing or may be stale." + info "Without a refreshed spec, OpenShell gateway startup can fail before the sandbox can use the GPU." info "NemoClaw will first enable NVIDIA's CDI refresh service." 
info "If that service does not generate the spec, NemoClaw will run nvidia-ctk cdi generate directly." if [[ "$(id -u)" -ne 0 ]]; then diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index d24a5a8a0f..3c0a59b8b3 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -1841,16 +1841,19 @@ function assertCdiNvidiaGpuSpecPresent( hostGpuPlatform: string | null | undefined = null, ): void { if (hostGpuPlatform === "jetson" || preflightUtils.isWslDockerDesktopRuntime(host)) return; - if (!host.cdiNvidiaGpuSpecMissing || optedOutGpuPassthrough) return; - console.error( - " Docker is configured for CDI device injection (CDISpecDirs is set), but no", - ); + if ( + !(host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) || + optedOutGpuPassthrough + ) { + return; + } console.error( - " nvidia.com/gpu CDI spec was found on the host. OpenShell's gateway start will", + " Docker is configured for CDI device injection (CDISpecDirs is set), but the", ); console.error( - " fail with `unresolvable CDI devices nvidia.com/gpu=all` (issue #3152).", + " NVIDIA GPU CDI spec is missing or stale. OpenShell GPU startup can fail", ); + console.error(" until the CDI spec is refreshed."); printRemediationActions(planHostRemediation(host)); process.exit(1); } diff --git a/src/lib/onboard/preflight-cdi.test.ts b/src/lib/onboard/preflight-cdi.test.ts new file mode 100644 index 0000000000..fd4f0aa4dd --- /dev/null +++ b/src/lib/onboard/preflight-cdi.test.ts @@ -0,0 +1,703 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; +// Import through the compiled dist/ output (via the bin/lib shim) so +// coverage is attributed to dist/lib/onboard/preflight.js, which is what the +// ratchet measures. 
+import { + assessHost, + getNvidiaCdiSpecPath, + parseDockerCdiSpecDirs, + planHostRemediation, +} from "../../../dist/lib/onboard/preflight"; + +type HostAssessment = Parameters<typeof planHostRemediation>[0]; + +function baseAssessment(overrides: Partial<HostAssessment> = {}): HostAssessment { + return { + platform: "linux", + isWsl: false, + runtime: "docker", + packageManager: "apt", + systemctlAvailable: true, + dockerServiceActive: true, + dockerServiceEnabled: true, + dockerInstalled: true, + dockerRunning: true, + dockerReachable: true, + nodeInstalled: true, + openshellInstalled: true, + dockerCgroupVersion: "v2", + dockerDefaultCgroupnsMode: "unknown", + isContainerRuntimeUnderProvisioned: false, + hasNestedOverlayConflict: false, + requiresHostCgroupnsFix: false, + isUnsupportedRuntime: false, + isHeadlessLikely: false, + hasNvidiaGpu: true, + dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], + cdiNvidiaGpuSpecMissing: false, + nvidiaContainerToolkitInstalled: true, + notes: [], + ...overrides, + }; +} + +describe("parseDockerCdiSpecDirs", () => { + it("extracts the dirs from `docker info --format '{{json .}}'` output", () => { + const fixture = JSON.stringify({ CDISpecDirs: ["/etc/cdi", "/var/run/cdi"] }); + expect(parseDockerCdiSpecDirs(fixture)).toEqual(["/etc/cdi", "/var/run/cdi"]); + }); + + it("returns an empty array when CDISpecDirs is absent", () => { + expect(parseDockerCdiSpecDirs(JSON.stringify({ ServerVersion: "27.0" }))).toEqual([]); + }); + + it("returns an empty array when CDISpecDirs is the empty list", () => { + expect(parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: [] }))).toEqual([]); + }); + + it("returns an empty array on empty input", () => { + expect(parseDockerCdiSpecDirs("")).toEqual([]); + }); +}); + +describe("assessHost — CDI device-spec gap (#3152)", () => { + it("flags missing nvidia.com/gpu specs on an NVIDIA Linux host with CDI dirs configured", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: 
() => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + OperatingSystem: "Ubuntu 24.04", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.dockerCdiSpecDirs).toEqual(["/etc/cdi", "/var/run/cdi"]); + expect(result.cdiNvidiaGpuSpecMissing).toBe(true); + }); + + it("does not flag the host when an nvidia.com/gpu YAML spec is present", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + }); + + it("flags disabled NVIDIA CDI refresh units even when a spec is present", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "disabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "inactive"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat") return "1f3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(true); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + expect(result.nvidiaCdiRefreshPathEnabled).toBe(false); + expect(result.nvidiaCdiRefreshPathActive).toBe(false); + }); + + it("does not flag the normal path-only refresh pattern as unhealthy", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") { + return command[2] === "nvidia-cdi-refresh.service" ? 
"disabled" : "enabled"; + } + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat") return "1f3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + expect(result.nvidiaCdiRefreshPathEnabled).toBe(true); + expect(result.nvidiaCdiRefreshPathActive).toBe(true); + expect(result.nvidiaCdiRefreshServiceEnabled).toBe(false); + }); + + it("flags a stale NVIDIA CDI spec when nvidia-uvm omits minor and its major no longer matches", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 498", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(true); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia-uvm=498:0"); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=499:0"); + }); + + it("flags a stale NVIDIA CDI spec when a non-uvm device no longer matches the live device", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia0", + " type: c", + " major: 196", + " minor: 0", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia0") return "c3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(true); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia0=196:0"); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=195:0"); + }); + + it("skips declared CDI device nodes whose live device is absent", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia1", + " type: c", + " major: 195", + " minor: 1", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia1") return ""; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(false); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + }); + + it("accepts a healthy refresh service with all CDI device nodes matching live devices", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia0", + " type: c", + " major: 195", + " minor: 0", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 499", + " - path: /dev/nvidia-uvm-tools", + " type: c", + " major: 499", + " minor: 1", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia0") return "c3 0"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm-tools") return "1f3 1"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(false); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + }); + + it("does not flag a CDI device node whose explicit minor matches the live device", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm-tools", + " type: c", + " major: 499", + " minor: 1", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm-tools") return "1f3 1"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(false); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + }); + + it("stats CDI hostPath instead of the container path when both are present", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /container/nvidia0", + " hostPath: /dev/nvidia0", + " type: c", + " major: 196", + " minor: 0", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia0") return "c3 0"; + if (command[0] === "stat" && command[3] === "/container/nvidia0") return "c4 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecStale).toBe(true); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia0=196:0"); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=195:0"); + }); + + it("accepts a JSON-serialised CDI spec as well", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.json") + ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu","devices":[]}' + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.json"] : []), + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + }); + + it("does not flag a non-NVIDIA Linux host even with CDI dirs configured", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: () => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => false, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + }); + + it("does not flag a host that does not advertise CDISpecDirs", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: () => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ ServerVersion: "24.0" }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.dockerCdiSpecDirs).toEqual([]); + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + }); + + it("does not flag macOS even when the docker info shape would otherwise match", () => { + const result = assessHost({ + platform: "darwin", + env: {}, + readFileImpl: () => "", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ CDISpecDirs: ["/etc/cdi"] }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + }); + + it("does not accept a sibling device class such as nvidia.com/gpu-extra as a satisfying spec", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + 
filePath.endsWith("nvidia-extra.yaml") + ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu-extra\ndevices: []\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.yaml"] : []), + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(true); + }); + + it("does not accept a sibling device class in JSON form either", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia-extra.json") + ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu-extra","devices":[]}' + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.json"] : []), + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(true); + }); + + it("ignores spec files whose `kind` only mentions nvidia.com/gpu in a comment", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("notes.yaml") + ? "# this used to declare nvidia.com/gpu; now stripped\nkind: example.com/cpu\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["notes.yaml"] : []), + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(true); + }); +}); + +describe("getNvidiaCdiSpecPath", () => { + it("builds the default NVIDIA CDI spec path from Docker CDI dirs", () => { + expect(getNvidiaCdiSpecPath({ dockerCdiSpecDirs: ["/etc/cdi/", "/var/run/cdi"] })).toBe( + "/etc/cdi/nvidia.yaml", + ); + }); +}); + +describe("planHostRemediation — CDI", () => { + it("emits a blocking generate_nvidia_cdi_spec action when CDI dirs are configured but no nvidia.com/gpu spec exists", () => { + const actions = planHostRemediation( + baseAssessment({ + cdiNvidiaGpuSpecMissing: true, + }), + ); + + const action = actions.find( + (entry: { id: string }) => entry.id === "generate_nvidia_cdi_spec", + ); + expect(action).toBeTruthy(); + expect(action?.kind).toBe("sudo"); + expect(action?.blocking).toBe(true); + expect(action?.commands[0]).toBe("sudo mkdir -p /etc/cdi"); + expect(action?.commands[1]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(action?.commands[2]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + expect(action?.commands[3]).toContain("nvidia-ctk cdi list"); + expect(action?.commands[4]).toContain( + "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", + ); + expect(action?.commands[5]).toContain("nvidia-ctk cdi list"); + expect(action?.commands[6]).toContain("nemoclaw onboard"); + expect(action?.reason).toContain("nvidia.com/gpu"); + }); + + it("emits a non-blocking refresh-service warning when refresh units are unhealthy", () => { + const actions = planHostRemediation( + baseAssessment({ + dockerCdiSpecDirs: ["/etc/cdi"], + cdiNvidiaGpuRefreshUnhealthy: true, + cdiNvidiaGpuSpecNeedsRepair: false, + nvidiaCdiRefreshPathEnabled: false, + nvidiaCdiRefreshPathActive: false, + 
}), + ); + + const action = actions.find( + (entry: { id: string }) => entry.id === "warn_nvidia_cdi_refresh_unhealthy", + ); + expect(action).toBeTruthy(); + expect(action?.blocking).toBe(false); + expect(action?.title).toBe("Enable NVIDIA CDI refresh service"); + expect(action?.reason).toContain("path disabled"); + expect(action?.commands[0]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(action?.commands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + }); + + it("emits an install_nvidia_container_toolkit action with apt bootstrap when nvidia-ctk is missing on apt hosts", () => { + const actions = planHostRemediation( + baseAssessment({ + cdiNvidiaGpuSpecMissing: true, + nvidiaContainerToolkitInstalled: false, + }), + ); + + expect(actions.find((entry) => entry.id === "generate_nvidia_cdi_spec")).toBeUndefined(); + const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); + expect(action).toBeTruthy(); + expect(action?.kind).toBe("sudo"); + expect(action?.blocking).toBe(true); + expect(action?.title).toContain("Install NVIDIA Container Toolkit"); + expect(action?.reason).toContain("nvidia-container-toolkit"); + expect(action?.commands.some((c) => c.includes("nvidia-container-toolkit-keyring.gpg"))).toBe( + true, + ); + expect(action?.commands.some((c) => c === "sudo apt-get install -y nvidia-container-toolkit")).toBe( + true, + ); + expect( + action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), + ).toBe(true); + const ctkInstallIndex = + action?.commands.findIndex((c) => c === "sudo apt-get install -y nvidia-container-toolkit") ?? + -1; + const ctkGenerateIndex = + action?.commands.findIndex((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")) ?? 
+ -1; + expect(ctkInstallIndex).toBeGreaterThanOrEqual(0); + expect(ctkGenerateIndex).toBeGreaterThan(ctkInstallIndex); + }); + + it("emits an install_nvidia_container_toolkit action with a docs pointer when nvidia-ctk is missing on unknown package managers", () => { + const actions = planHostRemediation( + baseAssessment({ + packageManager: "unknown", + cdiNvidiaGpuSpecMissing: true, + nvidiaContainerToolkitInstalled: false, + }), + ); + + const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); + expect(action).toBeTruthy(); + expect( + action?.commands.some((c) => + c.includes("docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide"), + ), + ).toBe(true); + expect( + action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), + ).toBe(true); + }); +}); diff --git a/src/lib/onboard/preflight.test.ts b/src/lib/onboard/preflight.test.ts index b71bbea7f1..a360c2e043 100644 --- a/src/lib/onboard/preflight.test.ts +++ b/src/lib/onboard/preflight.test.ts @@ -10,12 +10,10 @@ import { checkPortAvailable, getDockerBridgeGatewayIp, getMemoryInfo, - getNvidiaCdiSpecPath, ensureSwap, isDockerUnderProvisioned, MIN_RECOMMENDED_DOCKER_CPUS, MIN_RECOMMENDED_DOCKER_MEM_GIB, - parseDockerCdiSpecDirs, parseDockerInfoCpus, parseDockerInfoMemTotalBytes, parseDockerStorageDriver, @@ -640,208 +638,6 @@ describe("parseDockerUsesContainerdSnapshotter", () => { }); }); -describe("parseDockerCdiSpecDirs", () => { - it("extracts the dirs from `docker info --format '{{json .}}'` output", () => { - const fixture = JSON.stringify({ CDISpecDirs: ["/etc/cdi", "/var/run/cdi"] }); - expect(parseDockerCdiSpecDirs(fixture)).toEqual(["/etc/cdi", "/var/run/cdi"]); - }); - - it("returns an empty array when CDISpecDirs is absent", () => { - expect(parseDockerCdiSpecDirs(JSON.stringify({ ServerVersion: "27.0" }))).toEqual([]); - }); - - it("returns an empty array when CDISpecDirs is the empty list", () => { - 
expect(parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: [] }))).toEqual([]); - }); - - it("returns an empty array on empty input", () => { - expect(parseDockerCdiSpecDirs("")).toEqual([]); - }); -}); - -describe("assessHost — CDI device-spec gap (#3152)", () => { - it("flags missing nvidia.com/gpu specs on an NVIDIA Linux host with CDI dirs configured", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - OperatingSystem: "Ubuntu 24.04", - CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.dockerCdiSpecDirs).toEqual(["/etc/cdi", "/var/run/cdi"]); - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("does not flag the host when an nvidia.com/gpu YAML spec is present", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("accepts a JSON-serialised CDI spec as well", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.json") - ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu","devices":[]}' - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.json"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag a non-NVIDIA Linux host even with CDI dirs configured", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => false, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag a host that does not advertise CDISpecDirs", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ ServerVersion: "24.0" }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.dockerCdiSpecDirs).toEqual([]); - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag macOS even when the docker info shape would otherwise match", () => { - const result = assessHost({ - platform: "darwin", - env: {}, - readFileImpl: () => "", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ CDISpecDirs: ["/etc/cdi"] }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not accept a sibling device class such as nvidia.com/gpu-extra as a satisfying spec", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - 
filePath.endsWith("nvidia-extra.yaml") - ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu-extra\ndevices: []\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("does not accept a sibling device class in JSON form either", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia-extra.json") - ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu-extra","devices":[]}' - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.json"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("ignores spec files whose `kind` only mentions nvidia.com/gpu in a comment", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("notes.yaml") - ? "# this used to declare nvidia.com/gpu; now stripped\nkind: example.com/cpu\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["notes.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); -}); - -describe("getNvidiaCdiSpecPath", () => { - it("builds the default NVIDIA CDI spec path from Docker CDI dirs", () => { - expect(getNvidiaCdiSpecPath({ dockerCdiSpecDirs: ["/etc/cdi/", "/var/run/cdi"] })).toBe( - "/etc/cdi/nvidia.yaml", - ); - }); -}); - describe("planHostRemediation", () => { function baseAssessment( overrides: Partial[0]> = {}, @@ -1079,142 +875,6 @@ describe("planHostRemediation", () => { expect(actions.some((action: { id: string }) => action.id === "install_openshell")).toBe(true); }); - it("emits a blocking generate_nvidia_cdi_spec action when CDI dirs are configured but no nvidia.com/gpu spec exists", () => { - const actions = planHostRemediation({ - platform: "linux", - isWsl: false, - runtime: "docker", - packageManager: "apt", - systemctlAvailable: true, - dockerServiceActive: true, - dockerServiceEnabled: true, - dockerInstalled: true, - dockerRunning: true, - dockerReachable: true, - nodeInstalled: true, - openshellInstalled: true, - dockerCgroupVersion: "v2", - dockerDefaultCgroupnsMode: "unknown", - isContainerRuntimeUnderProvisioned: false, - hasNestedOverlayConflict: false, - requiresHostCgroupnsFix: false, - isUnsupportedRuntime: false, - isHeadlessLikely: false, - hasNvidiaGpu: true, - dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], - cdiNvidiaGpuSpecMissing: true, - nvidiaContainerToolkitInstalled: true, - notes: [], - }); - - const action = actions.find( - (entry: { id: string }) => entry.id === "generate_nvidia_cdi_spec", - ); - expect(action).toBeTruthy(); - expect(action?.kind).toBe("sudo"); - expect(action?.blocking).toBe(true); - expect(action?.commands[0]).toBe("sudo mkdir -p /etc/cdi"); - expect(action?.commands[1]).toBe( - "sudo nvidia-ctk cdi 
generate --output=/etc/cdi/nvidia.yaml", - ); - expect(action?.commands[2]).toContain("nvidia-ctk cdi list"); - expect(action?.commands[3]).toContain("nemoclaw onboard"); - expect(action?.reason).toContain("nvidia.com/gpu"); - }); - - it("emits an install_nvidia_container_toolkit action with apt bootstrap when nvidia-ctk is missing on apt hosts", () => { - const actions = planHostRemediation({ - platform: "linux", - isWsl: false, - runtime: "docker", - packageManager: "apt", - systemctlAvailable: true, - dockerServiceActive: true, - dockerServiceEnabled: true, - dockerInstalled: true, - dockerRunning: true, - dockerReachable: true, - nodeInstalled: true, - openshellInstalled: true, - dockerCgroupVersion: "v2", - dockerDefaultCgroupnsMode: "unknown", - isContainerRuntimeUnderProvisioned: false, - hasNestedOverlayConflict: false, - requiresHostCgroupnsFix: false, - isUnsupportedRuntime: false, - isHeadlessLikely: false, - hasNvidiaGpu: true, - dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], - cdiNvidiaGpuSpecMissing: true, - nvidiaContainerToolkitInstalled: false, - notes: [], - }); - - expect(actions.find((entry) => entry.id === "generate_nvidia_cdi_spec")).toBeUndefined(); - const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); - expect(action).toBeTruthy(); - expect(action?.kind).toBe("sudo"); - expect(action?.blocking).toBe(true); - expect(action?.title).toContain("Install NVIDIA Container Toolkit"); - expect(action?.reason).toContain("nvidia-container-toolkit"); - expect(action?.commands.some((c) => c.includes("nvidia-container-toolkit-keyring.gpg"))).toBe( - true, - ); - expect(action?.commands.some((c) => c === "sudo apt-get install -y nvidia-container-toolkit")).toBe( - true, - ); - expect( - action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), - ).toBe(true); - const ctkInstallIndex = - action?.commands.findIndex((c) => c === "sudo apt-get install -y nvidia-container-toolkit") ?? 
- -1; - const ctkGenerateIndex = - action?.commands.findIndex((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")) ?? - -1; - expect(ctkInstallIndex).toBeGreaterThanOrEqual(0); - expect(ctkGenerateIndex).toBeGreaterThan(ctkInstallIndex); - }); - - it("emits an install_nvidia_container_toolkit action with a docs pointer when nvidia-ctk is missing on unknown package managers", () => { - const actions = planHostRemediation({ - platform: "linux", - isWsl: false, - runtime: "docker", - packageManager: "unknown", - systemctlAvailable: true, - dockerServiceActive: true, - dockerServiceEnabled: true, - dockerInstalled: true, - dockerRunning: true, - dockerReachable: true, - nodeInstalled: true, - openshellInstalled: true, - dockerCgroupVersion: "v2", - dockerDefaultCgroupnsMode: "unknown", - isContainerRuntimeUnderProvisioned: false, - hasNestedOverlayConflict: false, - requiresHostCgroupnsFix: false, - isUnsupportedRuntime: false, - isHeadlessLikely: false, - hasNvidiaGpu: true, - dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], - cdiNvidiaGpuSpecMissing: true, - nvidiaContainerToolkitInstalled: false, - notes: [], - }); - - const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); - expect(action).toBeTruthy(); - expect( - action?.commands.some((c) => - c.includes("docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide"), - ), - ).toBe(true); - expect( - action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), - ).toBe(true); - }); }); describe("ensureSwap", () => { diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index 0739ee6055..e09a29e598 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -124,6 +124,14 @@ export interface HostAssessment { hasNvidiaGpu: boolean; dockerCdiSpecDirs: string[]; cdiNvidiaGpuSpecMissing: boolean; + cdiNvidiaGpuSpecStale?: boolean; + cdiNvidiaGpuSpecMismatch?: string; + cdiNvidiaGpuRefreshUnhealthy?: 
boolean; + cdiNvidiaGpuSpecNeedsRepair?: boolean; + nvidiaCdiRefreshPathActive?: boolean | null; + nvidiaCdiRefreshPathEnabled?: boolean | null; + nvidiaCdiRefreshServiceEnabled?: boolean | null; + nvidiaCdiRefreshServiceFailed?: boolean | null; nvidiaContainerToolkitInstalled: boolean; notes: string[]; } @@ -154,6 +162,17 @@ export interface AssessHostOpts { gpuProbeImpl?: () => boolean; } +type DeviceNumbers = { major: number; minor: number }; + +type CdiDeviceNode = DeviceNumbers & { + filePath: string; + path: string; +}; + +const NVIDIA_CDI_KIND_YAML_RE = + /^[ \t]*kind[ \t]*:[ \t]*(?:"nvidia\.com\/gpu"|'nvidia\.com\/gpu'|nvidia\.com\/gpu)[ \t]*(?:#.*)?$/im; +const NVIDIA_CDI_KIND_JSON_RE = /"kind"\s*:\s*"nvidia\.com\/gpu"/; + function buildCommandVArgv(commandName: string): readonly string[] { return ["sh", "-c", 'command -v "$1"', "--", commandName]; } @@ -329,9 +348,6 @@ function hasNvidiaCdiSpec( // check and suppress the preflight warning. A comment that merely mentions // `nvidia.com/gpu` is also rejected because `kindRe` only matches when the // *whole* scalar value is the device class. - const kindRe = - /^[ \t]*kind[ \t]*:[ \t]*(?:"nvidia\.com\/gpu"|'nvidia\.com\/gpu'|nvidia\.com\/gpu)[ \t]*(?:#.*)?$/im; - const jsonRe = /"kind"\s*:\s*"nvidia\.com\/gpu"/; for (const dir of specDirs) { let entries: string[]; try { @@ -347,12 +363,138 @@ function hasNvidiaCdiSpec( } catch { continue; } - if (kindRe.test(raw) || jsonRe.test(raw)) return true; + if (NVIDIA_CDI_KIND_YAML_RE.test(raw) || NVIDIA_CDI_KIND_JSON_RE.test(raw)) return true; } } return false; } +function parseIntegerLike(value: unknown): number | null { + if (typeof value === "number") { + return Number.isInteger(value) && value >= 0 ? value : null; + } + if (typeof value !== "string") return null; + const trimmed = value.trim(); + if (!trimmed) return null; + const base = /^0x/i.test(trimmed) ? 
16 : 10; + const parsed = Number.parseInt(trimmed, base); + return Number.isInteger(parsed) && parsed >= 0 ? parsed : null; +} + +function parseLinuxStatDeviceNumbers(output: string | null | undefined): DeviceNumbers | null { + const parts = String(output || "") + .trim() + .split(/\s+/) + .filter(Boolean); + if (parts.length < 2) return null; + const major = Number.parseInt(parts[0], 16); + const minor = Number.parseInt(parts[1], 16); + if (!Number.isInteger(major) || !Number.isInteger(minor) || major < 0 || minor < 0) { + return null; + } + return { major, minor }; +} + +function readLiveLinuxDeviceNumbers( + devicePath: string, + runCaptureImpl: RunCaptureFn, +): DeviceNumbers | null { + try { + return parseLinuxStatDeviceNumbers( + runCaptureImpl(["stat", "-c", "%t %T", devicePath], { ignoreError: true }), + ); + } catch { + return null; + } +} + +function parseCdiSpec(raw: string, filePath: string): unknown { + if (/\.json$/i.test(filePath)) return JSON.parse(raw); + const YAML = require("yaml"); + return YAML.parse(raw); +} + +function collectCdiDeviceNodes(value: unknown, filePath: string): CdiDeviceNode[] { + const nodes: CdiDeviceNode[] = []; + const stack: unknown[] = [value]; + + while (stack.length > 0) { + const current = stack.pop(); + if (Array.isArray(current)) { + for (const item of current) stack.push(item); + continue; + } + if (!current || typeof current !== "object") continue; + const obj = current as Record; + // We stat the host device, so prefer CDI's host-side path when present. + const nodePath = + (typeof obj.hostPath === "string" && obj.hostPath) || + (typeof obj.path === "string" && obj.path) || + ""; + const major = parseIntegerLike(obj.major); + if (nodePath.startsWith("/dev/") && major !== null) { + const minor = obj.minor === undefined ? 
0 : parseIntegerLike(obj.minor); + if (minor !== null) nodes.push({ filePath, path: nodePath, major, minor }); + } + for (const child of Object.values(obj)) stack.push(child); + } + + return nodes; +} + +function findCdiDeviceNodeMismatch( + specDirs: readonly string[], + readdirImpl: (dir: string) => string[], + readFileImpl: (filePath: string, encoding: BufferEncoding) => string, + runCaptureImpl: RunCaptureFn, +): string | null { + for (const dir of specDirs) { + let entries: string[]; + try { + entries = readdirImpl(dir); + } catch { + continue; + } + for (const entry of entries) { + if (!/\.(ya?ml|json)$/i.test(entry)) continue; + const filePath = path.join(dir, entry); + let raw: string; + try { + raw = readFileImpl(filePath, "utf-8"); + } catch { + continue; + } + if (!NVIDIA_CDI_KIND_YAML_RE.test(raw) && !NVIDIA_CDI_KIND_JSON_RE.test(raw)) { + continue; + } + let parsed: unknown; + try { + parsed = parseCdiSpec(raw, filePath); + } catch { + continue; + } + const deviceNodes = collectCdiDeviceNodes(parsed, filePath); + for (const node of deviceNodes) { + const liveDevice = readLiveLinuxDeviceNumbers(node.path, runCaptureImpl); + if (!liveDevice) continue; + if (node.major === liveDevice.major && node.minor === liveDevice.minor) continue; + return `${node.filePath} ${node.path}=${node.major}:${node.minor}, live=${liveDevice.major}:${liveDevice.minor}`; + } + } + } + return null; +} + +function parseSystemctlFailedState(value = ""): boolean | null { + const normalized = String(value || "") + .trim() + .toLowerCase(); + if (!normalized) return null; + if (normalized === "failed") return true; + if (normalized === "active" || normalized === "inactive") return false; + return null; +} + export function parseDockerInfoCpus(info = ""): number | undefined { const jsonMatch = info.match(/"NCPU"\s*:\s*(\d+)/); if (jsonMatch) { @@ -496,6 +638,63 @@ export function buildContainerToolkitBootstrapCommands( ]; } +function buildNvidiaCdiRepairCommands(assessment: 
HostAssessment, specPath: string): string[] { + const specDir = path.dirname(specPath); + const commands = [`sudo mkdir -p ${specDir}`]; + if (assessment.systemctlAvailable !== false) { + commands.push( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + "sudo systemctl start nvidia-cdi-refresh.service", + "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", + ); + } + commands.push( + `sudo nvidia-ctk cdi generate --output=${specPath} # fallback if the refresh service does not repair the spec`, + "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", + "nemoclaw onboard # or rerun with --no-gpu to skip GPU passthrough", + ); + return commands; +} + +function buildNvidiaCdiRefreshCommands(): string[] { + return [ + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + "sudo systemctl start nvidia-cdi-refresh.service", + "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", + ]; +} + +function explainNvidiaCdiRepairReason(assessment: HostAssessment): string { + const reasons: string[] = []; + if (assessment.cdiNvidiaGpuSpecMissing) { + reasons.push( + "Docker is configured for CDI device injection (CDISpecDirs is set) but no nvidia.com/gpu CDI spec is present on the host.", + ); + } + if (assessment.cdiNvidiaGpuSpecStale) { + const detail = assessment.cdiNvidiaGpuSpecMismatch + ? ` (${assessment.cdiNvidiaGpuSpecMismatch})` + : ""; + reasons.push( + `The NVIDIA CDI spec appears stale because a declared device node does not match the live device${detail}.`, + ); + } + if (assessment.cdiNvidiaGpuRefreshUnhealthy) { + const unitDetails: string[] = []; + if (assessment.nvidiaCdiRefreshPathEnabled === false) unitDetails.push("path disabled"); + if (assessment.nvidiaCdiRefreshPathActive === false) unitDetails.push("path inactive"); + if (assessment.nvidiaCdiRefreshServiceFailed === true) unitDetails.push("service failed"); + const suffix = unitDetails.length > 0 ? 
` (${unitDetails.join(", ")})` : ""; + reasons.push( + `NVIDIA's CDI refresh units are not healthy${suffix}, so Docker may keep using stale GPU device numbers after driver changes.`, + ); + } + reasons.push( + "OpenShell's `gateway start --gpu` can fail until the CDI spec is refreshed and verified.", + ); + return reasons.join(" "); +} + export function assessHost(opts: AssessHostOpts = {}): HostAssessment { const platform = opts.platform ?? process.platform; const env = opts.env ?? process.env; @@ -514,7 +713,8 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { const nvidiaContainerToolkitInstalled = opts.commandExistsImpl?.("nvidia-ctk") ?? commandExists("nvidia-ctk", runCaptureImpl); const packageManager = detectPackageManager(runCaptureImpl); - const systemctlAvailable = commandExists("systemctl", runCaptureImpl); + const systemctlAvailable = + opts.commandExistsImpl?.("systemctl") ?? commandExists("systemctl", runCaptureImpl); let dockerInfoOutput = opts.dockerInfoOutput; let dockerReachable = false; @@ -543,6 +743,7 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { if (dockerReachable && runtime === "unknown" && platform === "linux") { runtime = "docker"; } + const isWslHost = detectWsl({ platform, env, release, procVersion }); const dockerCgroupVersion = dockerReachable ? parseDockerCgroupVersion(dockerInfoOutput) : "unknown"; @@ -565,11 +766,58 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { // front so preflight can point the user at `nvidia-ctk cdi generate` before // we waste minutes downloading the gateway image. See issue #3152. const dockerCdiSpecDirs = dockerReachable ? 
parseDockerCdiSpecDirs(dockerInfoOutput) : []; - const cdiNvidiaGpuSpecMissing = - platform === "linux" && - hasNvidiaGpu && - dockerCdiSpecDirs.length > 0 && - !hasNvidiaCdiSpec(dockerCdiSpecDirs, readdirImpl, readFileImpl); + const cdiSpecPresenceApplies = + platform === "linux" && hasNvidiaGpu && dockerCdiSpecDirs.length > 0; + const cdiSpecRepairApplies = + cdiSpecPresenceApplies && !(isWslHost && runtime === "docker-desktop"); + const cdiNvidiaGpuSpecPresent = + cdiSpecPresenceApplies && hasNvidiaCdiSpec(dockerCdiSpecDirs, readdirImpl, readFileImpl); + const cdiNvidiaGpuSpecMissing = cdiSpecPresenceApplies && !cdiNvidiaGpuSpecPresent; + const refreshHealthApplies = + cdiSpecRepairApplies && systemctlAvailable && nvidiaContainerToolkitInstalled; + const nvidiaCdiRefreshPathEnabled = refreshHealthApplies + ? parseSystemctlState( + runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.path"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshPathActive = refreshHealthApplies + ? parseSystemctlState( + runCaptureImpl(["systemctl", "is-active", "nvidia-cdi-refresh.path"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshServiceEnabled = refreshHealthApplies + ? parseSystemctlState( + runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.service"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshServiceFailed = refreshHealthApplies + ? parseSystemctlFailedState( + runCaptureImpl(["systemctl", "is-failed", "nvidia-cdi-refresh.service"], { + ignoreError: true, + }), + ) + : null; + const cdiNvidiaGpuRefreshUnhealthy = + nvidiaCdiRefreshPathEnabled === false || + nvidiaCdiRefreshPathActive === false || + nvidiaCdiRefreshServiceFailed === true; + const cdiNvidiaGpuSpecMismatch = + cdiSpecRepairApplies && cdiNvidiaGpuSpecPresent + ? 
findCdiDeviceNodeMismatch( + dockerCdiSpecDirs, + readdirImpl, + readFileImpl, + runCaptureImpl, + ) + : null; + const cdiNvidiaGpuSpecStale = Boolean(cdiNvidiaGpuSpecMismatch); + const cdiNvidiaGpuSpecNeedsRepair = cdiNvidiaGpuSpecMissing || cdiNvidiaGpuSpecStale; const isContainerRuntimeUnderProvisioned = isDockerUnderProvisioned( dockerCpus, dockerMemTotalBytes, @@ -588,7 +836,6 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { // the user-confirmed reproducer. Engaging the auto-fix there could // build an unnecessary patched image; preferring to leave WSL alone // until we have a confirmed repro is the conservative call. - const isWslHost = detectWsl({ platform, env, release, procVersion }); const hasNestedOverlayConflict = platform === "linux" && !isWslHost && @@ -637,6 +884,14 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { hasNvidiaGpu, dockerCdiSpecDirs, cdiNvidiaGpuSpecMissing, + cdiNvidiaGpuSpecStale, + cdiNvidiaGpuSpecMismatch: cdiNvidiaGpuSpecMismatch ?? 
undefined, + cdiNvidiaGpuRefreshUnhealthy, + cdiNvidiaGpuSpecNeedsRepair, + nvidiaCdiRefreshPathActive, + nvidiaCdiRefreshPathEnabled, + nvidiaCdiRefreshServiceEnabled, + nvidiaCdiRefreshServiceFailed, nvidiaContainerToolkitInstalled, notes: [], }; @@ -847,26 +1102,43 @@ export function planHostRemediation(assessment: HostAssessment): RemediationActi }); } - if (assessment.cdiNvidiaGpuSpecMissing) { + if ( + assessment.cdiNvidiaGpuRefreshUnhealthy && + !assessment.cdiNvidiaGpuSpecNeedsRepair && + !assessment.cdiNvidiaGpuSpecMissing && + !isWslDockerDesktopRuntime(assessment) + ) { + actions.push({ + id: "warn_nvidia_cdi_refresh_unhealthy", + title: "Enable NVIDIA CDI refresh service", + kind: "sudo", + reason: explainNvidiaCdiRepairReason({ + ...assessment, + cdiNvidiaGpuSpecMissing: false, + cdiNvidiaGpuSpecStale: false, + cdiNvidiaGpuSpecMismatch: undefined, + }), + commands: buildNvidiaCdiRefreshCommands(), + blocking: false, + }); + } + + if (assessment.cdiNvidiaGpuSpecNeedsRepair || assessment.cdiNvidiaGpuSpecMissing) { const specPath = getNvidiaCdiSpecPath(assessment); - const specDir = path.dirname(specPath); - const generateCommands = [ - `sudo mkdir -p ${specDir}`, - `sudo nvidia-ctk cdi generate --output=${specPath}`, - "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", - "nemoclaw onboard # or rerun with --no-gpu to skip GPU passthrough", - ]; + const generateCommands = buildNvidiaCdiRepairCommands(assessment, specPath); if (isWslDockerDesktopRuntime(assessment)) { actions.push(wslDockerDesktopGpuCompatibilityAction()); } else if (assessment.nvidiaContainerToolkitInstalled) { + const title = assessment.cdiNvidiaGpuSpecMissing + ? "Generate NVIDIA CDI device specs" + : "Refresh NVIDIA CDI device specs"; actions.push({ - id: "generate_nvidia_cdi_spec", - title: "Generate NVIDIA CDI device specs", + id: assessment.cdiNvidiaGpuSpecMissing + ? 
"generate_nvidia_cdi_spec" + : "refresh_nvidia_cdi_spec", + title, kind: "sudo", - reason: - "Docker is configured for CDI device injection (CDISpecDirs is set) but no " + - "nvidia.com/gpu CDI spec is present on the host. OpenShell's `gateway start --gpu` " + - "will fail with `unresolvable CDI devices nvidia.com/gpu=all` until a spec is generated.", + reason: explainNvidiaCdiRepairReason(assessment), commands: generateCommands, blocking: true, }); @@ -875,12 +1147,7 @@ export function planHostRemediation(assessment: HostAssessment): RemediationActi id: "install_nvidia_container_toolkit", title: "Install NVIDIA Container Toolkit and generate CDI device specs", kind: "sudo", - reason: - "Docker is configured for CDI device injection (CDISpecDirs is set) but the " + - "`nvidia-container-toolkit` package (which provides `nvidia-ctk`) is not installed " + - "on the host. OpenShell's `gateway start --gpu` will fail with " + - "`unresolvable CDI devices nvidia.com/gpu=all` until the toolkit is installed and a " + - "CDI spec is generated.", + reason: `${explainNvidiaCdiRepairReason(assessment)} The nvidia-container-toolkit package (which provides nvidia-ctk) is not installed on the host.`, commands: buildContainerToolkitBootstrapCommands( assessment.packageManager, generateCommands, diff --git a/test/install-preflight.test.ts b/test/install-preflight.test.ts index 13140f5df9..062258e141 100644 --- a/test/install-preflight.test.ts +++ b/test/install-preflight.test.ts @@ -1394,7 +1394,7 @@ exit 1 }); expect(result.status, output).toBe(0); - expect(output).toMatch(/Generating missing NVIDIA CDI device spec/); + expect(output).toMatch(/Refreshing NVIDIA CDI device spec/); expect(output).toMatch(/NemoClaw will first enable NVIDIA's CDI refresh service/); expect(output).toMatch(/NemoClaw does not store your password/); expect(output).toMatch(/Generated NVIDIA CDI device spec/); From 0df653a6a3f8c28b1b8b787e70c1071b489e55e6 Mon Sep 17 00:00:00 2001 From: zyang-dev 
<267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 15:48:01 -0700 Subject: [PATCH 2/9] fix(onboard): evaluate CDI staleness on the effective spec only to avoid false-positive blocks Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- scripts/install.sh | 64 +++++- src/lib/onboard/preflight-cdi.test.ts | 267 ++++++++++++++++++++++++++ src/lib/onboard/preflight.ts | 153 ++++++++++----- test/install-preflight.test.ts | 39 +++- 4 files changed, 475 insertions(+), 48 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index c08a2d98e8..9071a11b66 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1848,11 +1848,46 @@ preinstall_backup_and_retire_legacy_gateway() { # --------------------------------------------------------------------------- # 5. Onboard # --------------------------------------------------------------------------- +repair_installer_stale_nvidia_cdi_spec() { + local flagged_file="${1:-}" + local service_spec_path="/var/run/cdi/nvidia.yaml" + local sudo_cmd=() + + info "Refreshing NVIDIA CDI device spec with NVIDIA's CDI refresh service." + info "NVIDIA GPU passthrough uses CDI specs so Docker/OpenShell can request nvidia.com/gpu devices." + info "Docker is configured for CDI, but the effective nvidia.com/gpu spec may be stale." + info "The refresh service regenerates ${service_spec_path}; re-assessment verifies that effective spec." + if [[ -n "$flagged_file" && "$flagged_file" != "$service_spec_path" ]]; then + info "The stale ${flagged_file} file is a leftover; the refreshed ${service_spec_path} overrides it." + fi + if ! command_exists systemctl; then + warn "Could not refresh the stale NVIDIA CDI spec automatically because systemctl is unavailable." + return 0 + fi + if [[ "$(id -u)" -ne 0 ]]; then + sudo_cmd=(sudo) + info "You may be asked for your password to authorize these host-level admin changes." + info "NemoClaw does not store your password." + if ! 
sudo -v; then + warn "Could not obtain sudo credentials for NVIDIA CDI refresh service repair." + return 0 + fi + fi + if "${sudo_cmd[@]}" systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service >/dev/null 2>&1 \ + && "${sudo_cmd[@]}" systemctl start nvidia-cdi-refresh.service >/dev/null 2>&1; then + ok "Enabled NVIDIA CDI refresh service and refreshed the service-managed NVIDIA CDI device spec." + return 0 + fi + warn "Could not refresh the stale NVIDIA CDI spec automatically with nvidia-cdi-refresh.service." +} + repair_installer_nvidia_cdi_spec() { local preflight_module="$1" + local repair_plan="" + local repair_kind="" local spec_path="" - spec_path="$( + repair_plan="$( # shellcheck disable=SC2016 node -e ' const preflightPath = process.argv[1]; @@ -1861,10 +1896,22 @@ repair_installer_nvidia_cdi_spec() { const host = assessHost(); if ( host && - (host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) && + host.cdiNvidiaGpuSpecMissing && + !isWslDockerDesktopRuntime(host) + ) { + process.stdout.write(`missing\t${getNvidiaCdiSpecPath(host)}`); + return; + } + if ( + host && + host.cdiNvidiaGpuSpecStale && + host.cdiNvidiaGpuSpecNeedsRepair && + !host.cdiNvidiaGpuSpecMissing && !isWslDockerDesktopRuntime(host) ) { - process.stdout.write(getNvidiaCdiSpecPath(host)); + const mismatch = String(host.cdiNvidiaGpuSpecMismatch || ""); + const flaggedFilePath = mismatch.trim().split(/\s+/, 1)[0] || ""; + process.stdout.write(`stale\t${flaggedFilePath}`); } } catch { process.exit(0); @@ -1872,9 +1919,18 @@ repair_installer_nvidia_cdi_spec() { ' "$preflight_module" 2>/dev/null || true )" - if [[ -z "$spec_path" ]]; then + if [[ -z "$repair_plan" ]]; then return 0 fi + + repair_kind="${repair_plan%%$'\t'*}" + spec_path="${repair_plan#*$'\t'}" + + if [[ "$repair_kind" == "stale" ]]; then + repair_installer_stale_nvidia_cdi_spec "$spec_path" + return 0 + fi + if ! 
command_exists nvidia-ctk; then return 0 fi diff --git a/src/lib/onboard/preflight-cdi.test.ts b/src/lib/onboard/preflight-cdi.test.ts index fd4f0aa4dd..b2ca71ca8f 100644 --- a/src/lib/onboard/preflight-cdi.test.ts +++ b/src/lib/onboard/preflight-cdi.test.ts @@ -174,6 +174,184 @@ describe("assessHost — CDI device-spec gap (#3152)", () => { expect(result.nvidiaCdiRefreshServiceEnabled).toBe(false); }); + it("ignores a stale lower-precedence /etc/cdi spec when /var/run/cdi is fresh", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => { + if (filePath === "/etc/cdi/nvidia.yaml") { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 498", + "", + ].join("\n"); + } + if (filePath === "/var/run/cdi/nvidia.yaml") { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 499", + "", + ].join("\n"); + } + return "Linux version 6.8.0-58-generic"; + }, + readdirImpl: (dir: string) => { + if (dir === "/etc/cdi") return ["nvidia.yaml"]; + if (dir === "/var/run/cdi") return ["nvidia.yaml"]; + return []; + }, + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" 
|| name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(false); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + expect(result.cdiNvidiaGpuSpecMismatch).toBeUndefined(); + }); + + it("flags a stale /etc/cdi spec when no higher-precedence /var/run/cdi spec exists", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath === "/etc/cdi/nvidia.yaml" + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 498", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(true); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/etc/cdi/nvidia.yaml"); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia-uvm=498:0"); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=499:0"); + }); + + it("flags a 
stale /var/run/cdi spec when it is the effective spec", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => { + if (filePath === "/etc/cdi/nvidia.yaml") { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 499", + "", + ].join("\n"); + } + if (filePath === "/var/run/cdi/nvidia.yaml") { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 498", + "", + ].join("\n"); + } + return "Linux version 6.8.0-58-generic"; + }, + readdirImpl: (dir: string) => { + if (dir === "/etc/cdi") return ["nvidia.yaml"]; + if (dir === "/var/run/cdi") return ["nvidia.yaml"]; + return []; + }, + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(true); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/var/run/cdi/nvidia.yaml"); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia-uvm=498:0"); + 
expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=499:0"); + }); + it("flags a stale NVIDIA CDI spec when nvidia-uvm omits minor and its major no longer matches", () => { const result = assessHost({ platform: "linux", @@ -622,6 +800,95 @@ describe("planHostRemediation — CDI", () => { expect(action?.reason).toContain("nvidia.com/gpu"); }); + it("emits stale-spec refresh commands without direct /etc/cdi generation", () => { + const actions = planHostRemediation( + baseAssessment({ + cdiNvidiaGpuSpecStale: true, + cdiNvidiaGpuSpecNeedsRepair: true, + cdiNvidiaGpuSpecMismatch: + "/etc/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", + }), + ); + + const action = actions.find( + (entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec", + ); + expect(action).toBeTruthy(); + expect(action?.kind).toBe("sudo"); + expect(action?.blocking).toBe(true); + expect(action?.commands[0]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(action?.commands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + expect(action?.commands[2]).toContain("sudo rm -f /etc/cdi/nvidia.yaml"); + expect(action?.commands[2]).toContain("optional"); + expect(action?.commands[3]).toContain("nemoclaw onboard"); + expect(action?.commands.some((command) => command.includes("mkdir -p /etc/cdi"))).toBe( + false, + ); + expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe( + false, + ); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( + false, + ); + expect(action?.reason).toContain("/etc/cdi/nvidia.yaml"); + expect(action?.reason).toContain("/var/run/cdi/nvidia.yaml"); + }); + + it("does not offer leftover removal when the stale effective spec is /var/run/cdi", () => { + const actions = planHostRemediation( + baseAssessment({ + cdiNvidiaGpuSpecStale: true, + cdiNvidiaGpuSpecNeedsRepair: true, + cdiNvidiaGpuSpecMismatch: + "/var/run/cdi/nvidia.yaml 
/dev/nvidia-uvm=498:0, live=499:0", + }), + ); + + const action = actions.find( + (entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec", + ); + expect(action).toBeTruthy(); + expect(action?.commands.some((command) => command.includes("rm -f"))).toBe(false); + expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe( + false, + ); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( + false, + ); + }); + + it("uses stale refresh commands after toolkit bootstrap when nvidia-ctk is missing", () => { + const actions = planHostRemediation( + baseAssessment({ + cdiNvidiaGpuSpecStale: true, + cdiNvidiaGpuSpecNeedsRepair: true, + cdiNvidiaGpuSpecMismatch: + "/etc/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", + nvidiaContainerToolkitInstalled: false, + }), + ); + + const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); + expect(action).toBeTruthy(); + expect(action?.title).toContain("refresh CDI"); + expect(action?.commands.some((command) => command === "sudo apt-get install -y nvidia-container-toolkit")).toBe( + true, + ); + expect( + action?.commands.some((command) => + command === "sudo systemctl start nvidia-cdi-refresh.service", + ), + ).toBe(true); + expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe( + false, + ); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( + false, + ); + }); + it("emits a non-blocking refresh-service warning when refresh units are unhealthy", () => { const actions = planHostRemediation( baseAssessment({ diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index e09a29e598..f3a17ee6ae 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -169,9 +169,15 @@ type CdiDeviceNode = DeviceNumbers & { path: string; }; +type EffectiveNvidiaCdiSpec = { + filePath: string; + parsed: unknown; +}; + const 
NVIDIA_CDI_KIND_YAML_RE = /^[ \t]*kind[ \t]*:[ \t]*(?:"nvidia\.com\/gpu"|'nvidia\.com\/gpu'|nvidia\.com\/gpu)[ \t]*(?:#.*)?$/im; const NVIDIA_CDI_KIND_JSON_RE = /"kind"\s*:\s*"nvidia\.com\/gpu"/; +const NVIDIA_CDI_REFRESH_SPEC_PATH = "/var/run/cdi/nvidia.yaml"; function buildCommandVArgv(commandName: string): readonly string[] { return ["sh", "-c", 'command -v "$1"', "--", commandName]; @@ -414,6 +420,41 @@ function parseCdiSpec(raw: string, filePath: string): unknown { return YAML.parse(raw); } +function findEffectiveNvidiaCdiSpec( + specDirs: readonly string[], + readdirImpl: (dir: string) => string[], + readFileImpl: (filePath: string, encoding: BufferEncoding) => string, +): EffectiveNvidiaCdiSpec | null { + // Docker CDI precedence is highest in the last configured directory. + for (const dir of [...specDirs].reverse()) { + let entries: string[]; + try { + entries = readdirImpl(dir); + } catch { + continue; + } + for (const entry of entries) { + if (!/\.(ya?ml|json)$/i.test(entry)) continue; + const filePath = path.join(dir, entry); + let raw: string; + try { + raw = readFileImpl(filePath, "utf-8"); + } catch { + continue; + } + if (!NVIDIA_CDI_KIND_YAML_RE.test(raw) && !NVIDIA_CDI_KIND_JSON_RE.test(raw)) { + continue; + } + try { + return { filePath, parsed: parseCdiSpec(raw, filePath) }; + } catch { + continue; + } + } + } + return null; +} + function collectCdiDeviceNodes(value: unknown, filePath: string): CdiDeviceNode[] { const nodes: CdiDeviceNode[] = []; const stack: unknown[] = [value]; @@ -448,39 +489,13 @@ function findCdiDeviceNodeMismatch( readFileImpl: (filePath: string, encoding: BufferEncoding) => string, runCaptureImpl: RunCaptureFn, ): string | null { - for (const dir of specDirs) { - let entries: string[]; - try { - entries = readdirImpl(dir); - } catch { - continue; - } - for (const entry of entries) { - if (!/\.(ya?ml|json)$/i.test(entry)) continue; - const filePath = path.join(dir, entry); - let raw: string; - try { - raw = 
readFileImpl(filePath, "utf-8"); - } catch { - continue; - } - if (!NVIDIA_CDI_KIND_YAML_RE.test(raw) && !NVIDIA_CDI_KIND_JSON_RE.test(raw)) { - continue; - } - let parsed: unknown; - try { - parsed = parseCdiSpec(raw, filePath); - } catch { - continue; - } - const deviceNodes = collectCdiDeviceNodes(parsed, filePath); - for (const node of deviceNodes) { - const liveDevice = readLiveLinuxDeviceNumbers(node.path, runCaptureImpl); - if (!liveDevice) continue; - if (node.major === liveDevice.major && node.minor === liveDevice.minor) continue; - return `${node.filePath} ${node.path}=${node.major}:${node.minor}, live=${liveDevice.major}:${liveDevice.minor}`; - } - } + const effective = findEffectiveNvidiaCdiSpec(specDirs, readdirImpl, readFileImpl); + if (!effective) return null; + for (const node of collectCdiDeviceNodes(effective.parsed, effective.filePath)) { + const liveDevice = readLiveLinuxDeviceNumbers(node.path, runCaptureImpl); + if (!liveDevice) continue; + if (node.major === liveDevice.major && node.minor === liveDevice.minor) continue; + return `${node.filePath} ${node.path}=${node.major}:${node.minor}, live=${liveDevice.major}:${liveDevice.minor}`; } return null; } @@ -664,6 +679,50 @@ function buildNvidiaCdiRefreshCommands(): string[] { ]; } +function extractCdiMismatchFilePath(mismatch: string | undefined): string { + const trimmed = String(mismatch || "").trim(); + if (!trimmed) return ""; + const firstWhitespace = trimmed.search(/\s/); + return firstWhitespace > 0 ? 
trimmed.slice(0, firstWhitespace) : trimmed; +} + +function buildStaleCdiAutoFixCommands(): string[] { + return [ + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + "sudo systemctl start nvidia-cdi-refresh.service", + ]; +} + +function buildStaleCdiWarnCommands(flaggedFilePath: string): string[] { + const commands = buildStaleCdiAutoFixCommands(); + if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) { + commands.push( + `sudo rm -f ${flaggedFilePath} # optional: remove the stale leftover (the service owns ${NVIDIA_CDI_REFRESH_SPEC_PATH})`, + ); + } + commands.push( + "nemoclaw onboard # re-run to confirm the stale-spec warning clears (or --no-gpu to skip GPU)", + ); + return commands; +} + +function explainStaleCdiReason(mismatch: string | undefined): string { + const detail = mismatch || "unknown device-node mismatch"; + const flaggedFilePath = extractCdiMismatchFilePath(mismatch); + const isLeftover = flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH; + return ( + `An NVIDIA CDI device node no longer matches the live device (${detail}). ` + + "OpenShell's `gateway start --gpu` injects devices from the CDI spec, so a stale " + + "device number points the container at the wrong device and CUDA init fails " + + "(`CUDA unknown error`). The nvidia-cdi-refresh service keeps " + + `${NVIDIA_CDI_REFRESH_SPEC_PATH} current on driver/toolkit changes` + + (isLeftover + ? 
`; the flagged ${flaggedFilePath} is a stale leftover that the refreshed ` + + `${NVIDIA_CDI_REFRESH_SPEC_PATH} overrides.` + : "; re-enable and run it to regenerate the spec.") + ); +} + function explainNvidiaCdiRepairReason(assessment: HostAssessment): string { const reasons: string[] = []; if (assessment.cdiNvidiaGpuSpecMissing) { @@ -1124,33 +1183,41 @@ export function planHostRemediation(assessment: HostAssessment): RemediationActi } if (assessment.cdiNvidiaGpuSpecNeedsRepair || assessment.cdiNvidiaGpuSpecMissing) { + const missingSpec = assessment.cdiNvidiaGpuSpecMissing; + const flaggedFilePath = extractCdiMismatchFilePath(assessment.cdiNvidiaGpuSpecMismatch); const specPath = getNvidiaCdiSpecPath(assessment); - const generateCommands = buildNvidiaCdiRepairCommands(assessment, specPath); + const repairCommands = missingSpec + ? buildNvidiaCdiRepairCommands(assessment, specPath) + : buildStaleCdiWarnCommands(flaggedFilePath); + const reason = missingSpec + ? explainNvidiaCdiRepairReason(assessment) + : explainStaleCdiReason(assessment.cdiNvidiaGpuSpecMismatch); if (isWslDockerDesktopRuntime(assessment)) { actions.push(wslDockerDesktopGpuCompatibilityAction()); } else if (assessment.nvidiaContainerToolkitInstalled) { - const title = assessment.cdiNvidiaGpuSpecMissing + const title = missingSpec ? "Generate NVIDIA CDI device specs" : "Refresh NVIDIA CDI device specs"; actions.push({ - id: assessment.cdiNvidiaGpuSpecMissing - ? "generate_nvidia_cdi_spec" - : "refresh_nvidia_cdi_spec", + id: missingSpec ? "generate_nvidia_cdi_spec" : "refresh_nvidia_cdi_spec", title, kind: "sudo", - reason: explainNvidiaCdiRepairReason(assessment), - commands: generateCommands, + reason, + commands: repairCommands, blocking: true, }); } else { + const title = missingSpec + ? 
"Install NVIDIA Container Toolkit and generate CDI device specs" + : "Install NVIDIA Container Toolkit and refresh CDI device specs"; actions.push({ id: "install_nvidia_container_toolkit", - title: "Install NVIDIA Container Toolkit and generate CDI device specs", + title, kind: "sudo", - reason: `${explainNvidiaCdiRepairReason(assessment)} The nvidia-container-toolkit package (which provides nvidia-ctk) is not installed on the host.`, + reason: `${reason} The nvidia-container-toolkit package (which provides nvidia-ctk) is not installed on the host.`, commands: buildContainerToolkitBootstrapCommands( assessment.packageManager, - generateCommands, + repairCommands, ), blocking: true, }); diff --git a/test/install-preflight.test.ts b/test/install-preflight.test.ts index 062258e141..952bbddbd0 100644 --- a/test/install-preflight.test.ts +++ b/test/install-preflight.test.ts @@ -1222,10 +1222,12 @@ fi`, systemctlScript, isWsl = false, runtime = "docker", + stale = false, }: { systemctlScript: string; isWsl?: boolean; runtime?: string; + stale?: boolean; }) { const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-install-cdi-repair-")); const fakeBin = path.join(tmp, "bin"); @@ -1246,7 +1248,10 @@ exports.assessHost = () => ({ isWsl: ${isWsl ? "true" : "false"}, notes: [], dockerCdiSpecDirs: [process.env.CDI_DIR], - cdiNvidiaGpuSpecMissing: !fs.existsSync(process.env.CDI_STATE), + cdiNvidiaGpuSpecMissing: ${stale ? "false" : "!fs.existsSync(process.env.CDI_STATE)"}, + cdiNvidiaGpuSpecStale: ${stale ? 
"!fs.existsSync(process.env.CDI_STATE)" : "false"}, + cdiNvidiaGpuSpecNeedsRepair: !fs.existsSync(process.env.CDI_STATE), + cdiNvidiaGpuSpecMismatch: process.env.CDI_STALE_FILE + " /dev/nvidia-uvm=498:0, live=499:0", }); exports.getNvidiaCdiSpecPath = (host) => String(host.dockerCdiSpecDirs[0]).replace(/\\/+$/, "") + "/nvidia.yaml"; @@ -1334,6 +1339,7 @@ run_installer_host_preflight SOURCE_ROOT: sourceRoot, CDI_DIR: cdiDir, CDI_STATE: cdiState, + CDI_STALE_FILE: path.join(cdiDir, "nvidia.yaml"), SUDO_LOG: sudoLog, SYSTEMCTL_LOG: systemctlLog, }, @@ -1383,6 +1389,37 @@ exit 99 expect(sudoLog).not.toMatch(/nvidia-ctk cdi generate/); }); + it("repairs stale NVIDIA CDI specs with the refresh service only", () => { + const { cdiStateExists, output, result, sudoLog, systemctlLog } = + runNvidiaCdiInstallerRepairTest({ + stale: true, + systemctlScript: `#!/usr/bin/env bash +set -euo pipefail +printf '%s\\n' "$*" >> "$SYSTEMCTL_LOG" +if [ "\${1:-}" = "start" ]; then + touch "$CDI_STATE" +fi +exit 0 +`, + }); + + expect(result.status, output).toBe(0); + expect(cdiStateExists).toBe(true); + expect(output).toMatch(/Refreshing NVIDIA CDI device spec with NVIDIA's CDI refresh service/); + expect(output).toMatch(/effective nvidia\.com\/gpu spec may be stale/); + expect(output).toMatch(/refreshed the service-managed NVIDIA CDI device spec/); + expect(output).not.toMatch(/falling back to direct generation/); + expect(output).not.toMatch(/Host preflight found issues/); + expect(systemctlLog).toMatch( + /^enable --now nvidia-cdi-refresh\.path nvidia-cdi-refresh\.service$/m, + ); + expect(systemctlLog).toMatch(/^start nvidia-cdi-refresh\.service$/m); + expect(sudoLog).toMatch(/^-v$/m); + expect(sudoLog).not.toMatch(/nvidia-ctk cdi generate/); + expect(sudoLog).not.toMatch(/mkdir -p/); + expect(sudoLog).not.toMatch(/rm -f/); + }); + it("falls back to direct NVIDIA CDI generation when refresh service does not repair", () => { const { cdiDir, output, result, sudoLog, systemctlLog } = 
runNvidiaCdiInstallerRepairTest({ From 366f996ce5069bdc7176b807b80124787d565ba9 Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:04:29 -0700 Subject: [PATCH 3/9] fix(install): remove invalid return from CDI repair probe Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- scripts/install.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index 9071a11b66..d8508d34ab 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1900,9 +1900,7 @@ repair_installer_nvidia_cdi_spec() { !isWslDockerDesktopRuntime(host) ) { process.stdout.write(`missing\t${getNvidiaCdiSpecPath(host)}`); - return; - } - if ( + } else if ( host && host.cdiNvidiaGpuSpecStale && host.cdiNvidiaGpuSpecNeedsRepair && From 9f74eef3762b8f3d2529a1cb1c33dedee878738d Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:10:17 -0700 Subject: [PATCH 4/9] fix(onboard): keep CDI guard line-neutral in onboard entrypoint Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- src/lib/onboard.ts | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 3c0a59b8b3..2894382e2a 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -1841,19 +1841,13 @@ function assertCdiNvidiaGpuSpecPresent( hostGpuPlatform: string | null | undefined = null, ): void { if (hostGpuPlatform === "jetson" || preflightUtils.isWslDockerDesktopRuntime(host)) return; - if ( - !(host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) || - optedOutGpuPassthrough - ) { - return; - } + if (!(host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) || optedOutGpuPassthrough) return; console.error( - " Docker is configured for CDI device injection (CDISpecDirs is set), but the", + " Docker is configured for CDI device injection 
(CDISpecDirs is set), but the NVIDIA GPU CDI spec", ); console.error( - " NVIDIA GPU CDI spec is missing or stale. OpenShell GPU startup can fail", + " is missing or stale. OpenShell GPU startup can fail until the CDI spec is refreshed.", ); - console.error(" until the CDI spec is refreshed."); printRemediationActions(planHostRemediation(host)); process.exit(1); } From 34c7b5fd94a28917a53431777d963a5e29660e5d Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:19:52 -0700 Subject: [PATCH 5/9] refactor(onboard): move CDI preflight helpers into docker-cdi module Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- src/lib/onboard/docker-cdi.test.ts | 226 ++++++++ src/lib/onboard/docker-cdi.ts | 308 ++++++++++- src/lib/onboard/preflight-cdi.test.ts | 756 ++------------------------ src/lib/onboard/preflight.ts | 321 +---------- 4 files changed, 599 insertions(+), 1012 deletions(-) create mode 100644 src/lib/onboard/docker-cdi.test.ts diff --git a/src/lib/onboard/docker-cdi.test.ts b/src/lib/onboard/docker-cdi.test.ts new file mode 100644 index 0000000000..63fb0aa9c5 --- /dev/null +++ b/src/lib/onboard/docker-cdi.test.ts @@ -0,0 +1,226 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; +// Import through dist so coverage follows the CLI build output, matching the +// neighboring preflight tests. 
+import { + buildNvidiaCdiRepairCommands, + buildStaleCdiWarnCommands, + collectCdiDeviceNodes, + findCdiDeviceNodeMismatch, + getNvidiaCdiSpecPath, + hasNvidiaCdiSpec, + parseDockerCdiSpecDirs, +} from "../../../dist/lib/onboard/docker-cdi"; + +function specWithDeviceNodes(deviceNodes: string): string { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + deviceNodes, + "", + ].join("\n"); +} + +function cdiFs(files: Record) { + return { + readdirImpl: (dir: string) => + Object.keys(files) + .filter((filePath) => filePath.startsWith(`${dir}/`)) + .map((filePath) => filePath.slice(dir.length + 1)) + .filter((entry) => entry && !entry.includes("/")), + readFileImpl: (filePath: string) => files[filePath] ?? "", + }; +} + +function statDevices(devices: Record) { + return (command: readonly string[]) => { + if (command[0] === "stat") return devices[command[3]] ?? ""; + return ""; + }; +} + +describe("docker-cdi parsing", () => { + it("extracts CDI dirs from whole docker info JSON and .CDISpecDirs JSON", () => { + expect( + parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: ["/etc/cdi", "/var/run/cdi"] })), + ).toEqual(["/etc/cdi", "/var/run/cdi"]); + expect(parseDockerCdiSpecDirs('["/etc/cdi","/var/run/cdi"]')).toEqual([ + "/etc/cdi", + "/var/run/cdi", + ]); + }); + + it("returns an empty array when CDI dirs are absent or empty", () => { + expect(parseDockerCdiSpecDirs(JSON.stringify({ ServerVersion: "27.0" }))).toEqual([]); + expect(parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: [] }))).toEqual([]); + expect(parseDockerCdiSpecDirs("")).toEqual([]); + }); + + it("builds the default NVIDIA CDI spec path from Docker CDI dirs", () => { + expect(getNvidiaCdiSpecPath({ dockerCdiSpecDirs: ["/etc/cdi/", "/var/run/cdi"] })).toBe( + "/etc/cdi/nvidia.yaml", + ); + }); + + it("accepts exact nvidia.com/gpu YAML and JSON specs only", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": 
"cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n", + "/etc/cdi/nvidia.json": '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu","devices":[]}', + "/etc/cdi/nvidia-extra.yaml": "kind: nvidia.com/gpu-extra\ndevices: []\n", + "/etc/cdi/notes.yaml": "# nvidia.com/gpu used to be here\nkind: example.com/cpu\n", + }); + + expect(hasNvidiaCdiSpec(["/etc/cdi"], fs.readdirImpl, fs.readFileImpl)).toBe(true); + expect( + hasNvidiaCdiSpec( + ["/etc/cdi"], + () => ["nvidia-extra.yaml", "notes.yaml"], + fs.readFileImpl, + ), + ).toBe(false); + }); +}); + +describe("docker-cdi staleness detection", () => { + it("ignores stale lower-precedence /etc/cdi when /var/run/cdi is fresh", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498", + ), + "/var/run/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 499", + ), + }); + + expect( + findCdiDeviceNodeMismatch( + ["/etc/cdi", "/var/run/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0" }), + ), + ).toBeNull(); + }); + + it("flags stale /etc/cdi when no higher-precedence /var/run/cdi spec exists", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498", + ), + }); + + const mismatch = findCdiDeviceNodeMismatch( + ["/etc/cdi", "/var/run/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0" }), + ); + + expect(mismatch).toContain("/etc/cdi/nvidia.yaml"); + expect(mismatch).toContain("/dev/nvidia-uvm=498:0"); + expect(mismatch).toContain("live=499:0"); + }); + + it("flags stale /var/run/cdi when it is the effective spec", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 499", + ), + 
"/var/run/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498", + ), + }); + + const mismatch = findCdiDeviceNodeMismatch( + ["/etc/cdi", "/var/run/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0" }), + ); + + expect(mismatch).toContain("/var/run/cdi/nvidia.yaml"); + expect(mismatch).toContain("/dev/nvidia-uvm=498:0"); + expect(mismatch).toContain("live=499:0"); + }); + + it("defaults omitted minor to 0 and detects non-uvm drift", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n type: c\n major: 498\n - path: /dev/nvidia0\n type: c\n major: 195\n minor: 0", + ), + }); + + expect( + findCdiDeviceNodeMismatch( + ["/etc/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0", "/dev/nvidia0": "c3 0" }), + ), + ).toContain("/dev/nvidia-uvm=498:0"); + }); + + it("skips absent devices and accepts matching explicit minors", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia1\n type: c\n major: 195\n minor: 1\n - path: /dev/nvidia-uvm-tools\n type: c\n major: 499\n minor: 1", + ), + }); + + expect( + findCdiDeviceNodeMismatch( + ["/etc/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia1": "", "/dev/nvidia-uvm-tools": "1f3 1" }), + ), + ).toBeNull(); + }); + + it("stats CDI hostPath instead of the container path when both are present", () => { + const nodes = collectCdiDeviceNodes( + { + deviceNodes: [ + { path: "/container/nvidia0", hostPath: "/dev/nvidia0", major: 196, minor: 0 }, + ], + }, + "/etc/cdi/nvidia.yaml", + ); + expect(nodes[0]).toMatchObject({ path: "/dev/nvidia0", major: 196, minor: 0 }); + }); +}); + +describe("docker-cdi remediation commands", () => { + it("keeps missing-spec remediation on the direct-generation fallback path", () => { + const commands = buildNvidiaCdiRepairCommands({ 
systemctlAvailable: true }, "/etc/cdi/nvidia.yaml"); + + expect(commands[0]).toBe("sudo mkdir -p /etc/cdi"); + expect(commands[1]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(commands[2]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + expect(commands[3]).toContain("nvidia-ctk cdi list"); + expect(commands[4]).toContain("sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"); + expect(commands[5]).toContain("nvidia-ctk cdi list"); + }); + + it("shows stale-spec refresh commands with optional leftover removal only for /etc/cdi", () => { + const leftoverCommands = buildStaleCdiWarnCommands("/etc/cdi/nvidia.yaml"); + expect(leftoverCommands[0]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(leftoverCommands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + expect(leftoverCommands[2]).toContain("sudo rm -f /etc/cdi/nvidia.yaml"); + expect(leftoverCommands.join("\n")).not.toContain("--output=/etc/cdi"); + expect(leftoverCommands.join("\n")).not.toContain("nvidia-ctk cdi list"); + + const serviceCommands = buildStaleCdiWarnCommands("/var/run/cdi/nvidia.yaml"); + expect(serviceCommands.some((command) => command.includes("rm -f"))).toBe(false); + }); +}); diff --git a/src/lib/onboard/docker-cdi.ts b/src/lib/onboard/docker-cdi.ts index b0c1d7a8d7..4a089e6d9d 100644 --- a/src/lib/onboard/docker-cdi.ts +++ b/src/lib/onboard/docker-cdi.ts @@ -6,14 +6,48 @@ import path from "node:path"; import { dockerInfoFormat } from "../adapters/docker"; +export type RunCaptureFn = typeof import("../runner").runCapture; + +export type NvidiaCdiRepairAssessment = { + cdiNvidiaGpuSpecMissing?: boolean; + cdiNvidiaGpuSpecStale?: boolean; + cdiNvidiaGpuSpecMismatch?: string; + cdiNvidiaGpuRefreshUnhealthy?: boolean; + dockerCdiSpecDirs: string[]; + nvidiaCdiRefreshPathEnabled?: boolean | null; + nvidiaCdiRefreshPathActive?: boolean | null; + 
nvidiaCdiRefreshServiceFailed?: boolean | null; + systemctlAvailable?: boolean; +}; + +type DeviceNumbers = { major: number; minor: number }; + +type CdiDeviceNode = DeviceNumbers & { + filePath: string; + path: string; +}; + +type EffectiveNvidiaCdiSpec = { + filePath: string; + parsed: unknown; +}; + +const NVIDIA_CDI_KIND_YAML_RE = + /^[ \t]*kind[ \t]*:[ \t]*(?:"nvidia\.com\/gpu"|'nvidia\.com\/gpu'|nvidia\.com\/gpu)[ \t]*(?:#.*)?$/im; +const NVIDIA_CDI_KIND_JSON_RE = /"kind"\s*:\s*"nvidia\.com\/gpu"/; +const NVIDIA_CDI_REFRESH_SPEC_PATH = "/var/run/cdi/nvidia.yaml"; + export function parseDockerCdiSpecDirs(value: string | null | undefined): string[] { const raw = String(value || "").trim(); if (!raw || raw === "") return []; try { const parsed = JSON.parse(raw); - return Array.isArray(parsed) - ? parsed.map((entry) => String(entry || "").trim()).filter(Boolean) - : []; + const dirs: unknown[] = Array.isArray(parsed) + ? parsed + : parsed && typeof parsed === "object" && Array.isArray(parsed.CDISpecDirs) + ? 
parsed.CDISpecDirs + : []; + return dirs.map((entry) => String(entry || "").trim()).filter(Boolean); } catch { return raw .split(/[\s,]+/) @@ -28,6 +62,19 @@ export function getDockerCdiSpecDirs(): string[] { ); } +function normalizeCdiSpecDir(specDir: string | undefined): string { + const trimmed = String(specDir || "/etc/cdi") + .trim() + .replace(/\/+$/, ""); + return trimmed || "/etc/cdi"; +} + +export function getNvidiaCdiSpecPath( + assessment: Pick, +): string { + return path.join(normalizeCdiSpecDir(assessment.dockerCdiSpecDirs[0]), "nvidia.yaml"); +} + function isLikelyNvidiaCdiSpecFile(filePath: string): boolean { if (!/\.(json|ya?ml)$/i.test(filePath)) return false; let content = ""; @@ -55,3 +102,258 @@ export function findReadableNvidiaCdiSpecFiles(dirs: string[]): string[] { } return specs.sort(); } + +export function hasNvidiaCdiSpec( + specDirs: readonly string[], + readdirImpl: (dir: string) => string[], + readFileImpl: (filePath: string, encoding: BufferEncoding) => string, +): boolean { + for (const dir of specDirs) { + let entries: string[]; + try { + entries = readdirImpl(dir); + } catch { + continue; + } + for (const entry of entries) { + if (!/\.(ya?ml|json)$/i.test(entry)) continue; + let raw: string; + try { + raw = readFileImpl(path.join(dir, entry), "utf-8"); + } catch { + continue; + } + if (NVIDIA_CDI_KIND_YAML_RE.test(raw) || NVIDIA_CDI_KIND_JSON_RE.test(raw)) return true; + } + } + return false; +} + +function parseIntegerLike(value: unknown): number | null { + if (typeof value === "number") { + return Number.isInteger(value) && value >= 0 ? value : null; + } + if (typeof value !== "string") return null; + const trimmed = value.trim(); + if (!trimmed) return null; + const base = /^0x/i.test(trimmed) ? 16 : 10; + const parsed = Number.parseInt(trimmed, base); + return Number.isInteger(parsed) && parsed >= 0 ? 
parsed : null; +} + +function parseLinuxStatDeviceNumbers(output: string | null | undefined): DeviceNumbers | null { + const parts = String(output || "") + .trim() + .split(/\s+/) + .filter(Boolean); + if (parts.length < 2) return null; + const major = Number.parseInt(parts[0], 16); + const minor = Number.parseInt(parts[1], 16); + if (!Number.isInteger(major) || !Number.isInteger(minor) || major < 0 || minor < 0) { + return null; + } + return { major, minor }; +} + +function readLiveLinuxDeviceNumbers( + devicePath: string, + runCaptureImpl: RunCaptureFn, +): DeviceNumbers | null { + try { + return parseLinuxStatDeviceNumbers( + runCaptureImpl(["stat", "-c", "%t %T", devicePath], { ignoreError: true }), + ); + } catch { + return null; + } +} + +function parseCdiSpec(raw: string, filePath: string): unknown { + if (/\.json$/i.test(filePath)) return JSON.parse(raw); + const YAML = require("yaml"); + return YAML.parse(raw); +} + +export function findEffectiveNvidiaCdiSpec( + specDirs: readonly string[], + readdirImpl: (dir: string) => string[], + readFileImpl: (filePath: string, encoding: BufferEncoding) => string, +): EffectiveNvidiaCdiSpec | null { + // Docker CDI precedence is highest in the last configured directory. 
/** Matches shell words that need no quoting when pasted into a POSIX shell. */
const CDI_SHELL_SAFE_ARG_RE = /^[A-Za-z0-9_@%+=:,./-]+$/;

/**
 * Quote `value` for use as a single POSIX shell word.
 *
 * Ordinary paths such as `/etc/cdi/nvidia.yaml` are returned unchanged so the
 * suggested commands stay readable; anything else is wrapped in single quotes
 * with embedded single quotes escaped.
 */
function quoteCdiShellArg(value: string): string {
  if (CDI_SHELL_SAFE_ARG_RE.test(value)) return value;
  return `'${value.replace(/'/g, "'\\''")}'`;
}

/**
 * Collect every device-node-like object found anywhere inside a parsed CDI spec.
 *
 * An object counts as a device node when it has a string `hostPath` (preferred)
 * or `path` starting with `/dev/` and an integer-like `major`. A missing
 * `minor` is treated as 0; a present but non-integer `minor` discards the node.
 *
 * The walk is iterative (explicit LIFO stack), so nodes are NOT returned in
 * document order. Each object is visited at most once, which keeps the walk
 * finite when the parsed value contains cyclic or aliased references.
 *
 * @param value    Parsed spec content (any shape; non-objects are ignored).
 * @param filePath Path of the spec file, copied onto every returned node.
 * @returns The device nodes found, possibly empty.
 */
export function collectCdiDeviceNodes(value: unknown, filePath: string): CdiDeviceNode[] {
  const nodes: CdiDeviceNode[] = [];
  const pending: unknown[] = [value];
  const visited = new Set<object>();

  while (pending.length > 0) {
    const current = pending.pop();
    if (!current || typeof current !== "object") continue;
    // Guard against cycles / shared aliases so the traversal always terminates.
    if (visited.has(current)) continue;
    visited.add(current);

    if (Array.isArray(current)) {
      for (const item of current) pending.push(item);
      continue;
    }

    const obj = current as Record<string, unknown>;
    // We stat the host device, so prefer CDI's host-side path when present.
    const nodePath =
      (typeof obj.hostPath === "string" && obj.hostPath) ||
      (typeof obj.path === "string" && obj.path) ||
      "";
    const major = parseIntegerLike(obj.major);
    if (nodePath.startsWith("/dev/") && major !== null) {
      const minor = obj.minor === undefined ? 0 : parseIntegerLike(obj.minor);
      if (minor !== null) nodes.push({ filePath, path: nodePath, major, minor });
    }
    // Keep descending: device nodes are nested under devices/containerEdits.
    for (const child of Object.values(obj)) pending.push(child);
  }

  return nodes;
}

/**
 * Compare the device numbers declared by the effective NVIDIA CDI spec with
 * the live devices on the host.
 *
 * @param specDirs       CDI spec directories to search for the effective spec.
 * @param readdirImpl    Directory lister (injected for tests).
 * @param readFileImpl   File reader (injected for tests).
 * @param runCaptureImpl Command runner used to read live device numbers.
 * @returns A description of the first mismatch encountered, formatted as
 *   `<file> <path>=<major>:<minor>, live=<major>:<minor>`, or `null` when no
 *   effective spec exists or nothing mismatches. Declared nodes whose live
 *   device numbers cannot be read are skipped.
 */
export function findCdiDeviceNodeMismatch(
  specDirs: readonly string[],
  readdirImpl: (dir: string) => string[],
  readFileImpl: (filePath: string, encoding: BufferEncoding) => string,
  runCaptureImpl: RunCaptureFn,
): string | null {
  const effective = findEffectiveNvidiaCdiSpec(specDirs, readdirImpl, readFileImpl);
  if (!effective) return null;

  for (const node of collectCdiDeviceNodes(effective.parsed, effective.filePath)) {
    const liveDevice = readLiveLinuxDeviceNumbers(node.path, runCaptureImpl);
    if (!liveDevice) continue;
    const matches = node.major === liveDevice.major && node.minor === liveDevice.minor;
    if (matches) continue;
    return `${node.filePath} ${node.path}=${node.major}:${node.minor}, live=${liveDevice.major}:${liveDevice.minor}`;
  }
  return null;
}

/**
 * Build the manual repair commands shown when the NVIDIA CDI spec must be
 * (re)generated at `specPath`.
 *
 * The systemd refresh steps are included unless `systemctlAvailable` is
 * explicitly `false`; the direct `nvidia-ctk cdi generate` fallback is always
 * included. Paths are shell-quoted only when they contain characters that are
 * unsafe in a shell word.
 *
 * @param assessment Host assessment; only `systemctlAvailable` is read.
 * @param specPath   Target path of the CDI spec file.
 * @returns The commands, in the order they should be run.
 */
export function buildNvidiaCdiRepairCommands(
  // NOTE(review): type arguments reconstructed as
  // Pick<HostAssessment, "systemctlAvailable"> — confirm against the original declaration.
  assessment: Pick<HostAssessment, "systemctlAvailable">,
  specPath: string,
): string[] {
  const verifyCommand = "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear";
  const specDir = path.dirname(specPath);
  const commands = [`sudo mkdir -p ${quoteCdiShellArg(specDir)}`];
  if (assessment.systemctlAvailable !== false) {
    commands.push(...buildNvidiaCdiRefreshCommands());
  }
  commands.push(
    `sudo nvidia-ctk cdi generate --output=${quoteCdiShellArg(specPath)} # fallback if the refresh service does not repair the spec`,
    verifyCommand,
    "nemoclaw onboard # or rerun with --no-gpu to skip GPU passthrough",
  );
  return commands;
}

/**
 * Commands that enable and run NVIDIA's CDI refresh units and then list the
 * resulting CDI devices for verification.
 *
 * @returns A fresh array on every call.
 */
export function buildNvidiaCdiRefreshCommands(): string[] {
  return [
    ...buildStaleCdiAutoFixCommands(),
    "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear",
  ];
}

/**
 * Extract the spec file path from a mismatch description produced by
 * `findCdiDeviceNodeMismatch`, i.e. everything before the first whitespace.
 *
 * @param mismatch Mismatch description, possibly undefined or blank.
 * @returns The leading path token, or `""` when there is none.
 */
export function extractCdiMismatchFilePath(mismatch: string | undefined): string {
  const trimmed = String(mismatch || "").trim();
  if (!trimmed) return "";
  const firstWhitespace = trimmed.search(/\s/);
  return firstWhitespace > 0 ? trimmed.slice(0, firstWhitespace) : trimmed;
}

/**
 * The two systemctl commands that re-enable and trigger NVIDIA's CDI refresh
 * units.
 *
 * @returns A fresh array on every call, so callers may append to it.
 */
export function buildStaleCdiAutoFixCommands(): string[] {
  return [
    "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service",
    "sudo systemctl start nvidia-cdi-refresh.service",
  ];
}

/**
 * Commands shown with the stale-spec warning: the refresh commands, an
 * optional removal of the flagged file when it is not the refresh service's
 * own spec, and a final re-run of onboarding.
 *
 * @param flaggedFilePath Path of the spec file reported as stale (may be `""`).
 * @returns The commands, in the order they should be run.
 */
export function buildStaleCdiWarnCommands(flaggedFilePath: string): string[] {
  const commands = buildStaleCdiAutoFixCommands();
  if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) {
    // The path is quoted only if needed so it cannot be split or expanded by the shell.
    commands.push(
      `sudo rm -f ${quoteCdiShellArg(flaggedFilePath)} # optional: remove the stale leftover (the service owns ${NVIDIA_CDI_REFRESH_SPEC_PATH})`,
    );
  }
  commands.push(
    "nemoclaw onboard # re-run to confirm the stale-spec warning clears (or --no-gpu to skip GPU)",
  );
  return commands;
}

/**
 * Explain why a stale NVIDIA CDI spec is a problem and how it gets refreshed.
 *
 * @param mismatch Mismatch description; a missing or blank value is reported
 *   as "unknown device-node mismatch".
 * @returns A single human-readable paragraph.
 */
export function explainStaleCdiReason(mismatch: string | undefined): string {
  const detail = String(mismatch || "").trim() || "unknown device-node mismatch";
  const flaggedFilePath = extractCdiMismatchFilePath(mismatch);
  // A "leftover" is a flagged spec other than the one the refresh service maintains.
  const isLeftover = flaggedFilePath !== "" && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH;
  const tail = isLeftover
    ? `; the flagged ${flaggedFilePath} is a stale leftover that the refreshed ` +
      `${NVIDIA_CDI_REFRESH_SPEC_PATH} overrides.`
    : "; re-enable and run it to regenerate the spec.";
  return (
    `An NVIDIA CDI device node no longer matches the live device (${detail}). ` +
    "OpenShell's `gateway start --gpu` injects devices from the CDI spec, so a stale " +
    "device number points the container at the wrong device and CUDA init fails " +
    "(`CUDA unknown error`). The nvidia-cdi-refresh service keeps " +
    `${NVIDIA_CDI_REFRESH_SPEC_PATH} current on driver/toolkit changes` +
    tail
  );
}

/**
 * Compose the reason text for an NVIDIA CDI repair action from the assessment
 * flags: missing spec, stale spec (with mismatch detail when available), and
 * unhealthy refresh units (with the specific unit states that are off).
 *
 * @param assessment CDI-related assessment fields.
 * @returns The applicable reasons joined by spaces, always ending with the
 *   note about `gateway start --gpu`.
 */
export function explainNvidiaCdiRepairReason(assessment: NvidiaCdiRepairAssessment): string {
  const reasons: string[] = [];

  if (assessment.cdiNvidiaGpuSpecMissing) {
    reasons.push(
      "Docker is configured for CDI device injection (CDISpecDirs is set) but no nvidia.com/gpu CDI spec is present on the host.",
    );
  }

  if (assessment.cdiNvidiaGpuSpecStale) {
    const detail = assessment.cdiNvidiaGpuSpecMismatch
      ? ` (${assessment.cdiNvidiaGpuSpecMismatch})`
      : "";
    reasons.push(
      `The NVIDIA CDI spec appears stale because a declared device node does not match the live device${detail}.`,
    );
  }

  if (assessment.cdiNvidiaGpuRefreshUnhealthy) {
    // Only states that were positively observed as bad are listed.
    const unitDetails: string[] = [];
    if (assessment.nvidiaCdiRefreshPathEnabled === false) unitDetails.push("path disabled");
    if (assessment.nvidiaCdiRefreshPathActive === false) unitDetails.push("path inactive");
    if (assessment.nvidiaCdiRefreshServiceFailed === true) unitDetails.push("service failed");
    const suffix = unitDetails.length > 0 ? ` (${unitDetails.join(", ")})` : "";
    reasons.push(
      `NVIDIA's CDI refresh units are not healthy${suffix}, so Docker may keep using stale GPU device numbers after driver changes.`,
    );
  }

  reasons.push(
    "OpenShell's `gateway start --gpu` can fail until the CDI spec is refreshed and verified.",
  );
  return reasons.join(" ");
}
+import { assessHost, planHostRemediation } from "../../../dist/lib/onboard/preflight"; type HostAssessment = Parameters[0]; @@ -44,26 +38,15 @@ function baseAssessment(overrides: Partial = {}): HostAssessment }; } -describe("parseDockerCdiSpecDirs", () => { - it("extracts the dirs from `docker info --format '{{json .}}'` output", () => { - const fixture = JSON.stringify({ CDISpecDirs: ["/etc/cdi", "/var/run/cdi"] }); - expect(parseDockerCdiSpecDirs(fixture)).toEqual(["/etc/cdi", "/var/run/cdi"]); - }); - - it("returns an empty array when CDISpecDirs is absent", () => { - expect(parseDockerCdiSpecDirs(JSON.stringify({ ServerVersion: "27.0" }))).toEqual([]); - }); - - it("returns an empty array when CDISpecDirs is the empty list", () => { - expect(parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: [] }))).toEqual([]); - }); - - it("returns an empty array on empty input", () => { - expect(parseDockerCdiSpecDirs("")).toEqual([]); - }); -}); +function healthySystemctlAndStat(command: readonly string[]) { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + return ""; +} -describe("assessHost — CDI device-spec gap (#3152)", () => { +describe("assessHost — CDI", () => { it("flags missing nvidia.com/gpu specs on an NVIDIA Linux host with CDI dirs configured", () => { const result = assessHost({ platform: "linux", @@ -105,76 +88,7 @@ describe("assessHost — CDI device-spec gap (#3152)", () => { expect(result.cdiNvidiaGpuSpecMissing).toBe(false); }); - it("flags disabled NVIDIA CDI refresh units even when a spec is present", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? 
"cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "disabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "inactive"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat") return "1f3 0"; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(true); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); - expect(result.nvidiaCdiRefreshPathEnabled).toBe(false); - expect(result.nvidiaCdiRefreshPathActive).toBe(false); - }); - - it("does not flag the normal path-only refresh pattern as unhealthy", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") { - return command[2] === "nvidia-cdi-refresh.service" ? 
"disabled" : "enabled"; - } - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat") return "1f3 0"; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); - expect(result.nvidiaCdiRefreshPathEnabled).toBe(true); - expect(result.nvidiaCdiRefreshPathActive).toBe(true); - expect(result.nvidiaCdiRefreshServiceEnabled).toBe(false); - }); - - it("ignores a stale lower-precedence /etc/cdi spec when /var/run/cdi is fresh", () => { + it("uses the effective CDI spec when assessing staleness", () => { const result = assessHost({ platform: "linux", env: {}, @@ -217,13 +131,7 @@ describe("assessHost — CDI device-spec gap (#3152)", () => { if (dir === "/var/run/cdi") return ["nvidia.yaml"]; return []; }, - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; - return ""; - }, + runCaptureImpl: healthySystemctlAndStat, dockerInfoOutput: JSON.stringify({ ServerVersion: "27.0", CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], @@ -236,16 +144,15 @@ describe("assessHost — CDI device-spec gap (#3152)", () => { expect(result.cdiNvidiaGpuSpecMissing).toBe(false); expect(result.cdiNvidiaGpuSpecStale).toBe(false); expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); - 
expect(result.cdiNvidiaGpuSpecMismatch).toBeUndefined(); }); - it("flags a stale /etc/cdi spec when no higher-precedence /var/run/cdi spec exists", () => { + it("records stale effective CDI specs as repair-blocking", () => { const result = assessHost({ platform: "linux", env: {}, release: "6.8.0-58-generic", readFileImpl: (filePath: string) => - filePath === "/etc/cdi/nvidia.yaml" + filePath.endsWith("nvidia.yaml") ? [ "cdiVersion: 0.5.0", "kind: nvidia.com/gpu", @@ -261,83 +168,10 @@ describe("assessHost — CDI device-spec gap (#3152)", () => { ].join("\n") : "Linux version 6.8.0-58-generic", readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - expect(result.cdiNvidiaGpuSpecStale).toBe(true); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("/etc/cdi/nvidia.yaml"); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia-uvm=498:0"); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=499:0"); - }); - - it("flags a stale /var/run/cdi spec when it is the effective spec", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => { - if (filePath === "/etc/cdi/nvidia.yaml") { - return [ - 
"cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /dev/nvidia-uvm", - " hostPath: /dev/nvidia-uvm", - " type: c", - " major: 499", - "", - ].join("\n"); - } - if (filePath === "/var/run/cdi/nvidia.yaml") { - return [ - "cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /dev/nvidia-uvm", - " hostPath: /dev/nvidia-uvm", - " type: c", - " major: 498", - "", - ].join("\n"); - } - return "Linux version 6.8.0-58-generic"; - }, - readdirImpl: (dir: string) => { - if (dir === "/etc/cdi") return ["nvidia.yaml"]; - if (dir === "/var/run/cdi") return ["nvidia.yaml"]; - return []; - }, - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; - return ""; - }, + runCaptureImpl: healthySystemctlAndStat, dockerInfoOutput: JSON.stringify({ ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + CDISpecDirs: ["/etc/cdi"], }), commandExistsImpl: (name: string) => name === "docker" || name === "systemctl" || name === "nvidia-ctk", @@ -347,38 +181,27 @@ describe("assessHost — CDI device-spec gap (#3152)", () => { expect(result.cdiNvidiaGpuSpecMissing).toBe(false); expect(result.cdiNvidiaGpuSpecStale).toBe(true); expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("/var/run/cdi/nvidia.yaml"); expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia-uvm=498:0"); expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=499:0"); }); - it("flags a stale NVIDIA CDI spec when nvidia-uvm omits minor and its major no longer matches", () => { + 
it("treats refresh-unit health as a non-repair warning", () => { const result = assessHost({ platform: "linux", env: {}, release: "6.8.0-58-generic", readFileImpl: (filePath: string) => filePath.endsWith("nvidia.yaml") - ? [ - "cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /dev/nvidia-uvm", - " hostPath: /dev/nvidia-uvm", - " type: c", - " major: 498", - "", - ].join("\n") + ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" : "Linux version 6.8.0-58-generic", readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-enabled") { + return command[2] === "nvidia-cdi-refresh.service" ? "disabled" : "enabled"; + } if (command[0] === "systemctl" && command[1] === "is-active") return "active"; if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + if (command[0] === "stat") return "1f3 0"; return ""; }, dockerInfoOutput: JSON.stringify({ @@ -390,417 +213,52 @@ describe("assessHost — CDI device-spec gap (#3152)", () => { gpuProbeImpl: () => true, }); - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); - expect(result.cdiNvidiaGpuSpecStale).toBe(true); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia-uvm=498:0"); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=499:0"); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + expect(result.nvidiaCdiRefreshServiceEnabled).toBe(false); }); - it("flags a stale NVIDIA CDI spec when a non-uvm device no longer matches the live device", () => { - const result = assessHost({ + it("does not 
apply CDI checks without an NVIDIA Linux CDI context", () => { + const linuxWithoutGpu = assessHost({ platform: "linux", env: {}, release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? [ - "cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /dev/nvidia0", - " type: c", - " major: 196", - " minor: 0", - "", - ].join("\n") - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia0") return "c3 0"; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", - gpuProbeImpl: () => true, + readFileImpl: () => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ ServerVersion: "27.0", CDISpecDirs: ["/etc/cdi"] }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => false, }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); - expect(result.cdiNvidiaGpuSpecStale).toBe(true); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia0=196:0"); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=195:0"); - }); - - it("skips declared CDI device nodes whose live device is absent", () => { - const result = assessHost({ + const noCdiDirs = assessHost({ platform: "linux", env: {}, 
release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? [ - "cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /dev/nvidia1", - " type: c", - " major: 195", - " minor: 1", - "", - ].join("\n") - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia1") return ""; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", + readFileImpl: () => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ ServerVersion: "24.0" }), + commandExistsImpl: (name: string) => name === "docker", gpuProbeImpl: () => true, }); - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); - expect(result.cdiNvidiaGpuSpecStale).toBe(false); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + expect(linuxWithoutGpu.cdiNvidiaGpuSpecMissing).toBe(false); + expect(noCdiDirs.dockerCdiSpecDirs).toEqual([]); + expect(noCdiDirs.cdiNvidiaGpuSpecMissing).toBe(false); }); - - it("accepts a healthy refresh service with all CDI device nodes matching live devices", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? 
[ - "cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /dev/nvidia0", - " type: c", - " major: 195", - " minor: 0", - " - path: /dev/nvidia-uvm", - " hostPath: /dev/nvidia-uvm", - " type: c", - " major: 499", - " - path: /dev/nvidia-uvm-tools", - " type: c", - " major: 499", - " minor: 1", - "", - ].join("\n") - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia0") return "c3 0"; - if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; - if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm-tools") return "1f3 1"; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); - expect(result.cdiNvidiaGpuSpecStale).toBe(false); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); - }); - - it("does not flag a CDI device node whose explicit minor matches the live device", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? 
[ - "cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /dev/nvidia-uvm-tools", - " type: c", - " major: 499", - " minor: 1", - "", - ].join("\n") - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm-tools") return "1f3 1"; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); - expect(result.cdiNvidiaGpuSpecStale).toBe(false); - expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); - }); - - it("stats CDI hostPath instead of the container path when both are present", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? [ - "cdiVersion: 0.5.0", - "kind: nvidia.com/gpu", - "devices:", - " - name: all", - " containerEdits:", - " deviceNodes:", - " - path: /container/nvidia0", - " hostPath: /dev/nvidia0", - " type: c", - " major: 196", - " minor: 0", - "", - ].join("\n") - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.yaml"] : []), - runCaptureImpl: (command: readonly string[]) => { - if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; - if (command[0] === "systemctl" && command[1] === "is-active") return "active"; - if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; - if (command[0] === "stat" && command[3] === "/dev/nvidia0") return "c3 0"; - if (command[0] === "stat" && command[3] === "/container/nvidia0") return "c4 0"; - return ""; - }, - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => - name === "docker" || name === "systemctl" || name === "nvidia-ctk", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecStale).toBe(true); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia0=196:0"); - expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=195:0"); - }); - - it("accepts a JSON-serialised CDI spec as well", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.json") - ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu","devices":[]}' - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["nvidia.json"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag a non-NVIDIA Linux host even with CDI dirs configured", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => false, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag a host that does not advertise CDISpecDirs", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ ServerVersion: "24.0" }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.dockerCdiSpecDirs).toEqual([]); - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag macOS even when the docker info shape would otherwise match", () => { - const result = assessHost({ - platform: "darwin", - env: {}, - readFileImpl: () => "", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ CDISpecDirs: ["/etc/cdi"] }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not accept a sibling device class such as nvidia.com/gpu-extra as a satisfying spec", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - 
filePath.endsWith("nvidia-extra.yaml") - ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu-extra\ndevices: []\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("does not accept a sibling device class in JSON form either", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia-extra.json") - ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu-extra","devices":[]}' - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.json"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("ignores spec files whose `kind` only mentions nvidia.com/gpu in a comment", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("notes.yaml") - ? "# this used to declare nvidia.com/gpu; now stripped\nkind: example.com/cpu\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? 
["notes.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); -}); - -describe("getNvidiaCdiSpecPath", () => { - it("builds the default NVIDIA CDI spec path from Docker CDI dirs", () => { - expect(getNvidiaCdiSpecPath({ dockerCdiSpecDirs: ["/etc/cdi/", "/var/run/cdi"] })).toBe( - "/etc/cdi/nvidia.yaml", - ); - }); -}); +}); describe("planHostRemediation — CDI", () => { - it("emits a blocking generate_nvidia_cdi_spec action when CDI dirs are configured but no nvidia.com/gpu spec exists", () => { - const actions = planHostRemediation( - baseAssessment({ - cdiNvidiaGpuSpecMissing: true, - }), - ); + it("emits a blocking generate action for missing nvidia.com/gpu specs", () => { + const actions = planHostRemediation(baseAssessment({ cdiNvidiaGpuSpecMissing: true })); + const action = actions.find((entry: { id: string }) => entry.id === "generate_nvidia_cdi_spec"); - const action = actions.find( - (entry: { id: string }) => entry.id === "generate_nvidia_cdi_spec", - ); expect(action).toBeTruthy(); expect(action?.kind).toBe("sudo"); expect(action?.blocking).toBe(true); - expect(action?.commands[0]).toBe("sudo mkdir -p /etc/cdi"); - expect(action?.commands[1]).toBe( - "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", - ); - expect(action?.commands[2]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); - expect(action?.commands[3]).toContain("nvidia-ctk cdi list"); - expect(action?.commands[4]).toContain( - "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", - ); - expect(action?.commands[5]).toContain("nvidia-ctk cdi list"); - expect(action?.commands[6]).toContain("nemoclaw onboard"); - expect(action?.reason).toContain("nvidia.com/gpu"); + expect(action?.commands.some((command) => 
command.includes("--output=/etc/cdi"))).toBe(true); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe(true); }); - it("emits stale-spec refresh commands without direct /etc/cdi generation", () => { + it("emits service-refresh commands for stale nvidia.com/gpu specs", () => { const actions = planHostRemediation( baseAssessment({ cdiNvidiaGpuSpecStale: true, @@ -809,81 +267,18 @@ describe("planHostRemediation — CDI", () => { "/etc/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", }), ); + const action = actions.find((entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec"); - const action = actions.find( - (entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec", - ); expect(action).toBeTruthy(); - expect(action?.kind).toBe("sudo"); expect(action?.blocking).toBe(true); expect(action?.commands[0]).toBe( "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", ); expect(action?.commands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); - expect(action?.commands[2]).toContain("sudo rm -f /etc/cdi/nvidia.yaml"); - expect(action?.commands[2]).toContain("optional"); - expect(action?.commands[3]).toContain("nemoclaw onboard"); - expect(action?.commands.some((command) => command.includes("mkdir -p /etc/cdi"))).toBe( - false, - ); - expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe( - false, - ); - expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( - false, - ); - expect(action?.reason).toContain("/etc/cdi/nvidia.yaml"); - expect(action?.reason).toContain("/var/run/cdi/nvidia.yaml"); - }); - - it("does not offer leftover removal when the stale effective spec is /var/run/cdi", () => { - const actions = planHostRemediation( - baseAssessment({ - cdiNvidiaGpuSpecStale: true, - cdiNvidiaGpuSpecNeedsRepair: true, - cdiNvidiaGpuSpecMismatch: - "/var/run/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", - }), - 
); - - const action = actions.find( - (entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec", - ); - expect(action).toBeTruthy(); - expect(action?.commands.some((command) => command.includes("rm -f"))).toBe(false); - expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe( - false, - ); - expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( - false, - ); - }); - - it("uses stale refresh commands after toolkit bootstrap when nvidia-ctk is missing", () => { - const actions = planHostRemediation( - baseAssessment({ - cdiNvidiaGpuSpecStale: true, - cdiNvidiaGpuSpecNeedsRepair: true, - cdiNvidiaGpuSpecMismatch: - "/etc/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", - nvidiaContainerToolkitInstalled: false, - }), - ); - - const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); - expect(action).toBeTruthy(); - expect(action?.title).toContain("refresh CDI"); - expect(action?.commands.some((command) => command === "sudo apt-get install -y nvidia-container-toolkit")).toBe( + expect(action?.commands.some((command) => command.includes("sudo rm -f /etc/cdi/nvidia.yaml"))).toBe( true, ); - expect( - action?.commands.some((command) => - command === "sudo systemctl start nvidia-cdi-refresh.service", - ), - ).toBe(true); - expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe( - false, - ); + expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe(false); expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( false, ); @@ -899,72 +294,31 @@ describe("planHostRemediation — CDI", () => { nvidiaCdiRefreshPathActive: false, }), ); - const action = actions.find( (entry: { id: string }) => entry.id === "warn_nvidia_cdi_refresh_unhealthy", ); + expect(action).toBeTruthy(); expect(action?.blocking).toBe(false); expect(action?.title).toBe("Enable NVIDIA CDI refresh service"); 
expect(action?.reason).toContain("path disabled"); - expect(action?.commands[0]).toBe( - "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", - ); - expect(action?.commands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); }); - it("emits an install_nvidia_container_toolkit action with apt bootstrap when nvidia-ctk is missing on apt hosts", () => { + it("bootstraps nvidia-container-toolkit before missing-spec generation", () => { const actions = planHostRemediation( baseAssessment({ cdiNvidiaGpuSpecMissing: true, nvidiaContainerToolkitInstalled: false, }), ); - - expect(actions.find((entry) => entry.id === "generate_nvidia_cdi_spec")).toBeUndefined(); const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); + expect(action).toBeTruthy(); - expect(action?.kind).toBe("sudo"); - expect(action?.blocking).toBe(true); - expect(action?.title).toContain("Install NVIDIA Container Toolkit"); - expect(action?.reason).toContain("nvidia-container-toolkit"); - expect(action?.commands.some((c) => c.includes("nvidia-container-toolkit-keyring.gpg"))).toBe( + expect(action?.commands.some((command) => command === "sudo apt-get install -y nvidia-container-toolkit")).toBe( true, ); - expect(action?.commands.some((c) => c === "sudo apt-get install -y nvidia-container-toolkit")).toBe( + expect(action?.commands.some((command) => command.startsWith("sudo nvidia-ctk cdi generate --output="))).toBe( true, ); - expect( - action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), - ).toBe(true); - const ctkInstallIndex = - action?.commands.findIndex((c) => c === "sudo apt-get install -y nvidia-container-toolkit") ?? - -1; - const ctkGenerateIndex = - action?.commands.findIndex((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")) ?? 
- -1; - expect(ctkInstallIndex).toBeGreaterThanOrEqual(0); - expect(ctkGenerateIndex).toBeGreaterThan(ctkInstallIndex); - }); - - it("emits an install_nvidia_container_toolkit action with a docs pointer when nvidia-ctk is missing on unknown package managers", () => { - const actions = planHostRemediation( - baseAssessment({ - packageManager: "unknown", - cdiNvidiaGpuSpecMissing: true, - nvidiaContainerToolkitInstalled: false, - }), - ); - - const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); - expect(action).toBeTruthy(); - expect( - action?.commands.some((c) => - c.includes("docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide"), - ), - ).toBe(true); - expect( - action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), - ).toBe(true); }); }); diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index f3a17ee6ae..fdc4761d15 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -16,10 +16,23 @@ import os from "node:os"; import path from "node:path"; import { DASHBOARD_PORT } from "../core/ports"; +import { + buildNvidiaCdiRefreshCommands, + buildNvidiaCdiRepairCommands, + buildStaleCdiWarnCommands, + explainNvidiaCdiRepairReason, + explainStaleCdiReason, + extractCdiMismatchFilePath, + findCdiDeviceNodeMismatch, + getNvidiaCdiSpecPath, + hasNvidiaCdiSpec, + parseDockerCdiSpecDirs, +} from "./docker-cdi"; import { isWslDockerDesktopRuntime, wslDockerDesktopGpuCompatibilityAction, } from "./wsl-docker-desktop-gpu"; +export { getNvidiaCdiSpecPath, parseDockerCdiSpecDirs } from "./docker-cdi"; export { isWslDockerDesktopRuntime } from "./wsl-docker-desktop-gpu"; // runner.ts still uses CommonJS-style exports — use require here. 
@@ -162,23 +175,6 @@ export interface AssessHostOpts { gpuProbeImpl?: () => boolean; } -type DeviceNumbers = { major: number; minor: number }; - -type CdiDeviceNode = DeviceNumbers & { - filePath: string; - path: string; -}; - -type EffectiveNvidiaCdiSpec = { - filePath: string; - parsed: unknown; -}; - -const NVIDIA_CDI_KIND_YAML_RE = - /^[ \t]*kind[ \t]*:[ \t]*(?:"nvidia\.com\/gpu"|'nvidia\.com\/gpu'|nvidia\.com\/gpu)[ \t]*(?:#.*)?$/im; -const NVIDIA_CDI_KIND_JSON_RE = /"kind"\s*:\s*"nvidia\.com\/gpu"/; -const NVIDIA_CDI_REFRESH_SPEC_PATH = "/var/run/cdi/nvidia.yaml"; - function buildCommandVArgv(commandName: string): readonly string[] { return ["sh", "-c", 'command -v "$1"', "--", commandName]; } @@ -310,196 +306,6 @@ export function parseDockerUsesContainerdSnapshotter(info = ""): boolean { return /io\.containerd\.snapshotter\.v1/.test(info); } -// Parses the Docker daemon's configured CDI spec directories from `docker -// info --format '{{json .}}'` output. Docker 25+ surfaces these as -// `"CDISpecDirs": ["/etc/cdi", "/var/run/cdi"]` whenever the daemon is built -// with CDI support and `features.cdi=true` (the default on recent installs). -// An empty list means CDI device injection is not enabled, so OpenShell will -// fall back to the legacy `nvidia` runtime path and there is no spec gap to -// worry about. 
-export function parseDockerCdiSpecDirs(info = ""): string[] { - const match = info.match(/"CDISpecDirs"\s*:\s*\[([^\]]*)\]/); - if (!match) return []; - return Array.from(match[1].matchAll(/"([^"]+)"/g), (m) => m[1]).filter(Boolean); -} - -function normalizeCdiSpecDir(specDir: string | undefined): string { - const trimmed = String(specDir || "/etc/cdi") - .trim() - .replace(/\/+$/, ""); - return trimmed || "/etc/cdi"; -} - -export function getNvidiaCdiSpecPath( - assessment: Pick, -): string { - return path.join(normalizeCdiSpecDir(assessment.dockerCdiSpecDirs[0]), "nvidia.yaml"); -} - -// True when at least one CDI spec under the configured directories declares -// `kind: nvidia.com/gpu` (the device class OpenShell injects with `--gpu`). -// Specs are typically YAML, but the JSON shape is also accepted because -// `nvidia-ctk cdi generate --format=json` is supported. Errors reading any -// individual file or directory are tolerated — a missing dir is the same -// shape as "no spec found there". -function hasNvidiaCdiSpec( - specDirs: readonly string[], - readdirImpl: (dir: string) => string[], - readFileImpl: (filePath: string, encoding: BufferEncoding) => string, -): boolean { - // YAML keys are unquoted; JSON quotes the kind value. Anchor both patterns - // to the *exact* device-class string `nvidia.com/gpu` and require a value - // terminator (end of line, whitespace + comment, or whitespace + EOL) so a - // sibling spec like `nvidia.com/gpu-extra` does not silently satisfy the - // check and suppress the preflight warning. A comment that merely mentions - // `nvidia.com/gpu` is also rejected because `kindRe` only matches when the - // *whole* scalar value is the device class. 
- for (const dir of specDirs) { - let entries: string[]; - try { - entries = readdirImpl(dir); - } catch { - continue; - } - for (const entry of entries) { - if (!/\.(ya?ml|json)$/i.test(entry)) continue; - let raw: string; - try { - raw = readFileImpl(path.join(dir, entry), "utf-8"); - } catch { - continue; - } - if (NVIDIA_CDI_KIND_YAML_RE.test(raw) || NVIDIA_CDI_KIND_JSON_RE.test(raw)) return true; - } - } - return false; -} - -function parseIntegerLike(value: unknown): number | null { - if (typeof value === "number") { - return Number.isInteger(value) && value >= 0 ? value : null; - } - if (typeof value !== "string") return null; - const trimmed = value.trim(); - if (!trimmed) return null; - const base = /^0x/i.test(trimmed) ? 16 : 10; - const parsed = Number.parseInt(trimmed, base); - return Number.isInteger(parsed) && parsed >= 0 ? parsed : null; -} - -function parseLinuxStatDeviceNumbers(output: string | null | undefined): DeviceNumbers | null { - const parts = String(output || "") - .trim() - .split(/\s+/) - .filter(Boolean); - if (parts.length < 2) return null; - const major = Number.parseInt(parts[0], 16); - const minor = Number.parseInt(parts[1], 16); - if (!Number.isInteger(major) || !Number.isInteger(minor) || major < 0 || minor < 0) { - return null; - } - return { major, minor }; -} - -function readLiveLinuxDeviceNumbers( - devicePath: string, - runCaptureImpl: RunCaptureFn, -): DeviceNumbers | null { - try { - return parseLinuxStatDeviceNumbers( - runCaptureImpl(["stat", "-c", "%t %T", devicePath], { ignoreError: true }), - ); - } catch { - return null; - } -} - -function parseCdiSpec(raw: string, filePath: string): unknown { - if (/\.json$/i.test(filePath)) return JSON.parse(raw); - const YAML = require("yaml"); - return YAML.parse(raw); -} - -function findEffectiveNvidiaCdiSpec( - specDirs: readonly string[], - readdirImpl: (dir: string) => string[], - readFileImpl: (filePath: string, encoding: BufferEncoding) => string, -): EffectiveNvidiaCdiSpec 
| null { - // Docker CDI precedence is highest in the last configured directory. - for (const dir of [...specDirs].reverse()) { - let entries: string[]; - try { - entries = readdirImpl(dir); - } catch { - continue; - } - for (const entry of entries) { - if (!/\.(ya?ml|json)$/i.test(entry)) continue; - const filePath = path.join(dir, entry); - let raw: string; - try { - raw = readFileImpl(filePath, "utf-8"); - } catch { - continue; - } - if (!NVIDIA_CDI_KIND_YAML_RE.test(raw) && !NVIDIA_CDI_KIND_JSON_RE.test(raw)) { - continue; - } - try { - return { filePath, parsed: parseCdiSpec(raw, filePath) }; - } catch { - continue; - } - } - } - return null; -} - -function collectCdiDeviceNodes(value: unknown, filePath: string): CdiDeviceNode[] { - const nodes: CdiDeviceNode[] = []; - const stack: unknown[] = [value]; - - while (stack.length > 0) { - const current = stack.pop(); - if (Array.isArray(current)) { - for (const item of current) stack.push(item); - continue; - } - if (!current || typeof current !== "object") continue; - const obj = current as Record; - // We stat the host device, so prefer CDI's host-side path when present. - const nodePath = - (typeof obj.hostPath === "string" && obj.hostPath) || - (typeof obj.path === "string" && obj.path) || - ""; - const major = parseIntegerLike(obj.major); - if (nodePath.startsWith("/dev/") && major !== null) { - const minor = obj.minor === undefined ? 
0 : parseIntegerLike(obj.minor); - if (minor !== null) nodes.push({ filePath, path: nodePath, major, minor }); - } - for (const child of Object.values(obj)) stack.push(child); - } - - return nodes; -} - -function findCdiDeviceNodeMismatch( - specDirs: readonly string[], - readdirImpl: (dir: string) => string[], - readFileImpl: (filePath: string, encoding: BufferEncoding) => string, - runCaptureImpl: RunCaptureFn, -): string | null { - const effective = findEffectiveNvidiaCdiSpec(specDirs, readdirImpl, readFileImpl); - if (!effective) return null; - for (const node of collectCdiDeviceNodes(effective.parsed, effective.filePath)) { - const liveDevice = readLiveLinuxDeviceNumbers(node.path, runCaptureImpl); - if (!liveDevice) continue; - if (node.major === liveDevice.major && node.minor === liveDevice.minor) continue; - return `${node.filePath} ${node.path}=${node.major}:${node.minor}, live=${liveDevice.major}:${liveDevice.minor}`; - } - return null; -} - function parseSystemctlFailedState(value = ""): boolean | null { const normalized = String(value || "") .trim() @@ -653,107 +459,6 @@ export function buildContainerToolkitBootstrapCommands( ]; } -function buildNvidiaCdiRepairCommands(assessment: HostAssessment, specPath: string): string[] { - const specDir = path.dirname(specPath); - const commands = [`sudo mkdir -p ${specDir}`]; - if (assessment.systemctlAvailable !== false) { - commands.push( - "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", - "sudo systemctl start nvidia-cdi-refresh.service", - "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", - ); - } - commands.push( - `sudo nvidia-ctk cdi generate --output=${specPath} # fallback if the refresh service does not repair the spec`, - "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", - "nemoclaw onboard # or rerun with --no-gpu to skip GPU passthrough", - ); - return commands; -} - -function buildNvidiaCdiRefreshCommands(): string[] { - return [ - "sudo 
systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", - "sudo systemctl start nvidia-cdi-refresh.service", - "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", - ]; -} - -function extractCdiMismatchFilePath(mismatch: string | undefined): string { - const trimmed = String(mismatch || "").trim(); - if (!trimmed) return ""; - const firstWhitespace = trimmed.search(/\s/); - return firstWhitespace > 0 ? trimmed.slice(0, firstWhitespace) : trimmed; -} - -function buildStaleCdiAutoFixCommands(): string[] { - return [ - "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", - "sudo systemctl start nvidia-cdi-refresh.service", - ]; -} - -function buildStaleCdiWarnCommands(flaggedFilePath: string): string[] { - const commands = buildStaleCdiAutoFixCommands(); - if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) { - commands.push( - `sudo rm -f ${flaggedFilePath} # optional: remove the stale leftover (the service owns ${NVIDIA_CDI_REFRESH_SPEC_PATH})`, - ); - } - commands.push( - "nemoclaw onboard # re-run to confirm the stale-spec warning clears (or --no-gpu to skip GPU)", - ); - return commands; -} - -function explainStaleCdiReason(mismatch: string | undefined): string { - const detail = mismatch || "unknown device-node mismatch"; - const flaggedFilePath = extractCdiMismatchFilePath(mismatch); - const isLeftover = flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH; - return ( - `An NVIDIA CDI device node no longer matches the live device (${detail}). ` + - "OpenShell's `gateway start --gpu` injects devices from the CDI spec, so a stale " + - "device number points the container at the wrong device and CUDA init fails " + - "(`CUDA unknown error`). The nvidia-cdi-refresh service keeps " + - `${NVIDIA_CDI_REFRESH_SPEC_PATH} current on driver/toolkit changes` + - (isLeftover - ? 
`; the flagged ${flaggedFilePath} is a stale leftover that the refreshed ` + - `${NVIDIA_CDI_REFRESH_SPEC_PATH} overrides.` - : "; re-enable and run it to regenerate the spec.") - ); -} - -function explainNvidiaCdiRepairReason(assessment: HostAssessment): string { - const reasons: string[] = []; - if (assessment.cdiNvidiaGpuSpecMissing) { - reasons.push( - "Docker is configured for CDI device injection (CDISpecDirs is set) but no nvidia.com/gpu CDI spec is present on the host.", - ); - } - if (assessment.cdiNvidiaGpuSpecStale) { - const detail = assessment.cdiNvidiaGpuSpecMismatch - ? ` (${assessment.cdiNvidiaGpuSpecMismatch})` - : ""; - reasons.push( - `The NVIDIA CDI spec appears stale because a declared device node does not match the live device${detail}.`, - ); - } - if (assessment.cdiNvidiaGpuRefreshUnhealthy) { - const unitDetails: string[] = []; - if (assessment.nvidiaCdiRefreshPathEnabled === false) unitDetails.push("path disabled"); - if (assessment.nvidiaCdiRefreshPathActive === false) unitDetails.push("path inactive"); - if (assessment.nvidiaCdiRefreshServiceFailed === true) unitDetails.push("service failed"); - const suffix = unitDetails.length > 0 ? ` (${unitDetails.join(", ")})` : ""; - reasons.push( - `NVIDIA's CDI refresh units are not healthy${suffix}, so Docker may keep using stale GPU device numbers after driver changes.`, - ); - } - reasons.push( - "OpenShell's `gateway start --gpu` can fail until the CDI spec is refreshed and verified.", - ); - return reasons.join(" "); -} - export function assessHost(opts: AssessHostOpts = {}): HostAssessment { const platform = opts.platform ?? process.platform; const env = opts.env ?? 
process.env; From ead83f78392d0e7248ca0bd2d96d4461a0e680ab Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:24:59 -0700 Subject: [PATCH 6/9] fix(onboard): guard stale CDI repair on toolkit and systemd availability Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- scripts/install.sh | 1 + src/lib/onboard.ts | 7 +----- src/lib/onboard/docker-cdi.test.ts | 10 +++++++++ src/lib/onboard/docker-cdi.ts | 15 +++++++++++++ src/lib/onboard/preflight-cdi.test.ts | 19 ++++++++++++++++ src/lib/onboard/preflight.ts | 7 ++++-- test/install-preflight.test.ts | 32 +++++++++++++++++++++++++++ 7 files changed, 83 insertions(+), 8 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index d8508d34ab..56b1bd826b 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1905,6 +1905,7 @@ repair_installer_nvidia_cdi_spec() { host.cdiNvidiaGpuSpecStale && host.cdiNvidiaGpuSpecNeedsRepair && !host.cdiNvidiaGpuSpecMissing && + host.nvidiaContainerToolkitInstalled && !isWslDockerDesktopRuntime(host) ) { const mismatch = String(host.cdiNvidiaGpuSpecMismatch || ""); diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 2894382e2a..c138e3ce68 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -1842,12 +1842,7 @@ function assertCdiNvidiaGpuSpecPresent( ): void { if (hostGpuPlatform === "jetson" || preflightUtils.isWslDockerDesktopRuntime(host)) return; if (!(host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) || optedOutGpuPassthrough) return; - console.error( - " Docker is configured for CDI device injection (CDISpecDirs is set), but the NVIDIA GPU CDI spec", - ); - console.error( - " is missing or stale. OpenShell GPU startup can fail until the CDI spec is refreshed.", - ); + console.error(" Docker is configured for CDI device injection (CDISpecDirs is set), but the NVIDIA GPU CDI spec is missing or stale. 
OpenShell GPU startup can fail until the CDI spec is refreshed."); printRemediationActions(planHostRemediation(host)); process.exit(1); } diff --git a/src/lib/onboard/docker-cdi.test.ts b/src/lib/onboard/docker-cdi.test.ts index 63fb0aa9c5..972132952c 100644 --- a/src/lib/onboard/docker-cdi.test.ts +++ b/src/lib/onboard/docker-cdi.test.ts @@ -6,6 +6,7 @@ import { describe, expect, it } from "vitest"; // neighboring preflight tests. import { buildNvidiaCdiRepairCommands, + buildStaleCdiManualWarnCommands, buildStaleCdiWarnCommands, collectCdiDeviceNodes, findCdiDeviceNodeMismatch, @@ -223,4 +224,13 @@ describe("docker-cdi remediation commands", () => { const serviceCommands = buildStaleCdiWarnCommands("/var/run/cdi/nvidia.yaml"); expect(serviceCommands.some((command) => command.includes("rm -f"))).toBe(false); }); + + it("shows manual stale-spec guidance without systemctl on non-systemd hosts", () => { + const commands = buildStaleCdiManualWarnCommands("/etc/cdi/nvidia.yaml"); + + expect(commands.join("\n")).toContain("/var/run/cdi/nvidia.yaml"); + expect(commands.join("\n")).toContain("sudo rm -f /etc/cdi/nvidia.yaml"); + expect(commands.join("\n")).not.toContain("systemctl"); + expect(commands.join("\n")).not.toContain("nvidia-ctk cdi list"); + }); }); diff --git a/src/lib/onboard/docker-cdi.ts b/src/lib/onboard/docker-cdi.ts index 4a089e6d9d..d5bfe0e585 100644 --- a/src/lib/onboard/docker-cdi.ts +++ b/src/lib/onboard/docker-cdi.ts @@ -310,6 +310,21 @@ export function buildStaleCdiWarnCommands(flaggedFilePath: string): string[] { return commands; } +export function buildStaleCdiManualWarnCommands(flaggedFilePath: string): string[] { + const commands = [ + `Refresh NVIDIA CDI specs using your host's service manager so ${NVIDIA_CDI_REFRESH_SPEC_PATH} is current.`, + ]; + if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) { + commands.push( + `Optionally remove the stale leftover after the refresh: sudo rm -f ${flaggedFilePath}`, + ); + } + 
commands.push( + "nemoclaw onboard # re-run to confirm the stale-spec warning clears (or --no-gpu to skip GPU)", + ); + return commands; +} + export function explainStaleCdiReason(mismatch: string | undefined): string { const detail = mismatch || "unknown device-node mismatch"; const flaggedFilePath = extractCdiMismatchFilePath(mismatch); diff --git a/src/lib/onboard/preflight-cdi.test.ts b/src/lib/onboard/preflight-cdi.test.ts index 53664122a7..e02412405d 100644 --- a/src/lib/onboard/preflight-cdi.test.ts +++ b/src/lib/onboard/preflight-cdi.test.ts @@ -284,6 +284,25 @@ describe("planHostRemediation — CDI", () => { ); }); + it("emits manual stale-spec guidance without systemctl on non-systemd hosts", () => { + const actions = planHostRemediation( + baseAssessment({ + systemctlAvailable: false, + cdiNvidiaGpuSpecStale: true, + cdiNvidiaGpuSpecNeedsRepair: true, + cdiNvidiaGpuSpecMismatch: + "/etc/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", + }), + ); + const action = actions.find((entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec"); + + expect(action).toBeTruthy(); + expect(action?.blocking).toBe(true); + expect(action?.kind).toBe("manual"); + expect(action?.commands.join("\n")).toContain("/var/run/cdi/nvidia.yaml"); + expect(action?.commands.join("\n")).not.toContain("systemctl"); + }); + it("emits a non-blocking refresh-service warning when refresh units are unhealthy", () => { const actions = planHostRemediation( baseAssessment({ diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index fdc4761d15..2b70dbf3ba 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -19,6 +19,7 @@ import { DASHBOARD_PORT } from "../core/ports"; import { buildNvidiaCdiRefreshCommands, buildNvidiaCdiRepairCommands, + buildStaleCdiManualWarnCommands, buildStaleCdiWarnCommands, explainNvidiaCdiRepairReason, explainStaleCdiReason, @@ -893,7 +894,9 @@ export function planHostRemediation(assessment: HostAssessment): 
RemediationActi const specPath = getNvidiaCdiSpecPath(assessment); const repairCommands = missingSpec ? buildNvidiaCdiRepairCommands(assessment, specPath) - : buildStaleCdiWarnCommands(flaggedFilePath); + : assessment.systemctlAvailable + ? buildStaleCdiWarnCommands(flaggedFilePath) + : buildStaleCdiManualWarnCommands(flaggedFilePath); const reason = missingSpec ? explainNvidiaCdiRepairReason(assessment) : explainStaleCdiReason(assessment.cdiNvidiaGpuSpecMismatch); @@ -906,7 +909,7 @@ export function planHostRemediation(assessment: HostAssessment): RemediationActi actions.push({ id: missingSpec ? "generate_nvidia_cdi_spec" : "refresh_nvidia_cdi_spec", title, - kind: "sudo", + kind: missingSpec || assessment.systemctlAvailable ? "sudo" : "manual", reason, commands: repairCommands, blocking: true, diff --git a/test/install-preflight.test.ts b/test/install-preflight.test.ts index 952bbddbd0..66a3871e9d 100644 --- a/test/install-preflight.test.ts +++ b/test/install-preflight.test.ts @@ -1223,11 +1223,13 @@ fi`, isWsl = false, runtime = "docker", stale = false, + toolkitInstalled = true, }: { systemctlScript: string; isWsl?: boolean; runtime?: string; stale?: boolean; + toolkitInstalled?: boolean; }) { const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-install-cdi-repair-")); const fakeBin = path.join(tmp, "bin"); @@ -1252,6 +1254,7 @@ exports.assessHost = () => ({ cdiNvidiaGpuSpecStale: ${stale ? "!fs.existsSync(process.env.CDI_STATE)" : "false"}, cdiNvidiaGpuSpecNeedsRepair: !fs.existsSync(process.env.CDI_STATE), cdiNvidiaGpuSpecMismatch: process.env.CDI_STALE_FILE + " /dev/nvidia-uvm=498:0, live=499:0", + nvidiaContainerToolkitInstalled: ${toolkitInstalled ? 
"true" : "false"}, }); exports.getNvidiaCdiSpecPath = (host) => String(host.dockerCdiSpecDirs[0]).replace(/\\/+$/, "") + "/nvidia.yaml"; @@ -1272,6 +1275,13 @@ exports.planHostRemediation = (host) => commands: ["sudo nvidia-ctk cdi generate --output=" + exports.getNvidiaCdiSpecPath(host)], blocking: true, }] + : host.cdiNvidiaGpuSpecStale && !host.nvidiaContainerToolkitInstalled + ? [{ + title: "Install NVIDIA Container Toolkit and refresh CDI device specs", + reason: "nvidia-container-toolkit missing", + commands: ["sudo apt-get install -y nvidia-container-toolkit"], + blocking: true, + }] : []; `, ); @@ -1420,6 +1430,28 @@ exit 0 expect(sudoLog).not.toMatch(/rm -f/); }); + it("does not auto-repair stale NVIDIA CDI specs before toolkit installation", () => { + const { cdiStateExists, output, result, sudoLog, systemctlLog } = + runNvidiaCdiInstallerRepairTest({ + stale: true, + toolkitInstalled: false, + systemctlScript: `#!/usr/bin/env bash +set -euo pipefail +printf '%s\\n' "$*" >> "$SYSTEMCTL_LOG" +touch "$CDI_STATE" +exit 0 +`, + }); + + expect(result.status, output).toBe(1); + expect(cdiStateExists).toBe(false); + expect(output).toMatch(/Host preflight found issues/); + expect(output).toMatch(/Install NVIDIA Container Toolkit and refresh CDI device specs/); + expect(output).not.toMatch(/Refreshing NVIDIA CDI device spec with NVIDIA's CDI refresh service/); + expect(systemctlLog).toBe(""); + expect(sudoLog).toBe(""); + }); + it("falls back to direct NVIDIA CDI generation when refresh service does not repair", () => { const { cdiDir, output, result, sudoLog, systemctlLog } = runNvidiaCdiInstallerRepairTest({ From 529f1de1b8537c1da70b9132414ac86374c4185b Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:43:43 -0700 Subject: [PATCH 7/9] fix(onboard): quote CDI remediation command paths Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- src/lib/onboard/docker-cdi.test.ts | 32 
+++++++++++++++++++++++++----- src/lib/onboard/docker-cdi.ts | 13 ++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/src/lib/onboard/docker-cdi.test.ts b/src/lib/onboard/docker-cdi.test.ts index 972132952c..27ad45487a 100644 --- a/src/lib/onboard/docker-cdi.test.ts +++ b/src/lib/onboard/docker-cdi.test.ts @@ -199,25 +199,38 @@ describe("docker-cdi staleness detection", () => { describe("docker-cdi remediation commands", () => { it("keeps missing-spec remediation on the direct-generation fallback path", () => { - const commands = buildNvidiaCdiRepairCommands({ systemctlAvailable: true }, "/etc/cdi/nvidia.yaml"); + const commands = buildNvidiaCdiRepairCommands( + { systemctlAvailable: true }, + "/etc/cdi/nvidia.yaml", + ); - expect(commands[0]).toBe("sudo mkdir -p /etc/cdi"); + expect(commands[0]).toBe("sudo mkdir -p '/etc/cdi'"); expect(commands[1]).toBe( "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", ); expect(commands[2]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); expect(commands[3]).toContain("nvidia-ctk cdi list"); - expect(commands[4]).toContain("sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"); + expect(commands[4]).toContain("sudo nvidia-ctk cdi generate --output='/etc/cdi/nvidia.yaml'"); expect(commands[5]).toContain("nvidia-ctk cdi list"); }); + it("shell-quotes CDI repair paths in generated commands", () => { + const commands = buildNvidiaCdiRepairCommands( + { systemctlAvailable: false }, + "/tmp/cdi dir/nvidia;bad.yaml", + ); + + expect(commands[0]).toBe("sudo mkdir -p '/tmp/cdi dir'"); + expect(commands[1]).toContain("--output='/tmp/cdi dir/nvidia;bad.yaml'"); + }); + it("shows stale-spec refresh commands with optional leftover removal only for /etc/cdi", () => { const leftoverCommands = buildStaleCdiWarnCommands("/etc/cdi/nvidia.yaml"); expect(leftoverCommands[0]).toBe( "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", ); 
expect(leftoverCommands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); - expect(leftoverCommands[2]).toContain("sudo rm -f /etc/cdi/nvidia.yaml"); + expect(leftoverCommands[2]).toContain("sudo rm -f '/etc/cdi/nvidia.yaml'"); expect(leftoverCommands.join("\n")).not.toContain("--output=/etc/cdi"); expect(leftoverCommands.join("\n")).not.toContain("nvidia-ctk cdi list"); @@ -229,8 +242,17 @@ describe("docker-cdi remediation commands", () => { const commands = buildStaleCdiManualWarnCommands("/etc/cdi/nvidia.yaml"); expect(commands.join("\n")).toContain("/var/run/cdi/nvidia.yaml"); - expect(commands.join("\n")).toContain("sudo rm -f /etc/cdi/nvidia.yaml"); + expect(commands.join("\n")).toContain("sudo rm -f '/etc/cdi/nvidia.yaml'"); expect(commands.join("\n")).not.toContain("systemctl"); expect(commands.join("\n")).not.toContain("nvidia-ctk cdi list"); }); + + it("shell-quotes stale leftover paths in displayed guidance", () => { + expect(buildStaleCdiWarnCommands("/tmp/cdi dir/nvidia;bad.yaml").join("\n")).toContain( + "sudo rm -f '/tmp/cdi dir/nvidia;bad.yaml'", + ); + expect(buildStaleCdiManualWarnCommands("/tmp/cdi dir/nvidia;bad.yaml").join("\n")).toContain( + "sudo rm -f '/tmp/cdi dir/nvidia;bad.yaml'", + ); + }); }); diff --git a/src/lib/onboard/docker-cdi.ts b/src/lib/onboard/docker-cdi.ts index d5bfe0e585..8fe6dc532d 100644 --- a/src/lib/onboard/docker-cdi.ts +++ b/src/lib/onboard/docker-cdi.ts @@ -5,6 +5,7 @@ import fs from "node:fs"; import path from "node:path"; import { dockerInfoFormat } from "../adapters/docker"; +import { shellQuote } from "../core/shell-quote"; export type RunCaptureFn = typeof import("../runner").runCapture; @@ -259,7 +260,9 @@ export function buildNvidiaCdiRepairCommands( specPath: string, ): string[] { const specDir = path.dirname(specPath); - const commands = [`sudo mkdir -p ${specDir}`]; + const quotedSpecDir = shellQuote(specDir); + const quotedSpecPath = shellQuote(specPath); + const commands = [`sudo mkdir -p 
${quotedSpecDir}`]; if (assessment.systemctlAvailable !== false) { commands.push( "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", @@ -268,7 +271,7 @@ export function buildNvidiaCdiRepairCommands( ); } commands.push( - `sudo nvidia-ctk cdi generate --output=${specPath} # fallback if the refresh service does not repair the spec`, + `sudo nvidia-ctk cdi generate --output=${quotedSpecPath} # fallback if the refresh service does not repair the spec`, "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", "nemoclaw onboard # or rerun with --no-gpu to skip GPU passthrough", ); @@ -300,8 +303,9 @@ export function buildStaleCdiAutoFixCommands(): string[] { export function buildStaleCdiWarnCommands(flaggedFilePath: string): string[] { const commands = buildStaleCdiAutoFixCommands(); if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) { + const quotedFlaggedFilePath = shellQuote(flaggedFilePath); commands.push( - `sudo rm -f ${flaggedFilePath} # optional: remove the stale leftover (the service owns ${NVIDIA_CDI_REFRESH_SPEC_PATH})`, + `sudo rm -f ${quotedFlaggedFilePath} # optional: remove the stale leftover (the service owns ${NVIDIA_CDI_REFRESH_SPEC_PATH})`, ); } commands.push( @@ -315,8 +319,9 @@ export function buildStaleCdiManualWarnCommands(flaggedFilePath: string): string `Refresh NVIDIA CDI specs using your host's service manager so ${NVIDIA_CDI_REFRESH_SPEC_PATH} is current.`, ]; if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) { + const quotedFlaggedFilePath = shellQuote(flaggedFilePath); commands.push( - `Optionally remove the stale leftover after the refresh: sudo rm -f ${flaggedFilePath}`, + `Optionally remove the stale leftover after the refresh: sudo rm -f ${quotedFlaggedFilePath}`, ); } commands.push( From 1279b2383babbf03d8160ab3df5a962f7ab4711b Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 19:44:31 -0700 
Subject: [PATCH 8/9] Updated CDI preflight expectations to match the newly shell-quoted remediation command paths Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- src/lib/onboard/preflight-cdi.test.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/lib/onboard/preflight-cdi.test.ts b/src/lib/onboard/preflight-cdi.test.ts index e02412405d..869c3a55ce 100644 --- a/src/lib/onboard/preflight-cdi.test.ts +++ b/src/lib/onboard/preflight-cdi.test.ts @@ -254,7 +254,9 @@ describe("planHostRemediation — CDI", () => { expect(action).toBeTruthy(); expect(action?.kind).toBe("sudo"); expect(action?.blocking).toBe(true); - expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe(true); + expect(action?.commands.some((command) => command.includes("--output='/etc/cdi"))).toBe( + true, + ); expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe(true); }); @@ -275,9 +277,9 @@ describe("planHostRemediation — CDI", () => { "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", ); expect(action?.commands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); - expect(action?.commands.some((command) => command.includes("sudo rm -f /etc/cdi/nvidia.yaml"))).toBe( - true, - ); + expect( + action?.commands.some((command) => command.includes("sudo rm -f '/etc/cdi/nvidia.yaml'")), + ).toBe(true); expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe(false); expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( false, From 416376564cce401b609199868401152d8976ff10 Mon Sep 17 00:00:00 2001 From: zyang-dev <267119621+zyang-dev@users.noreply.github.com> Date: Wed, 3 Jun 2026 19:56:23 -0700 Subject: [PATCH 9/9] refactor(onboard): move CDI host assessment into docker helper Signed-off-by: zyang-dev <267119621+zyang-dev@users.noreply.github.com> --- src/lib/onboard/docker-cdi.ts 
| 127 ++++++++++++++++++++++++++++++++++ src/lib/onboard/preflight.ts | 99 ++++---------------------- 2 files changed, 142 insertions(+), 84 deletions(-) diff --git a/src/lib/onboard/docker-cdi.ts b/src/lib/onboard/docker-cdi.ts index 8fe6dc532d..b467ec3101 100644 --- a/src/lib/onboard/docker-cdi.ts +++ b/src/lib/onboard/docker-cdi.ts @@ -21,6 +21,33 @@ export type NvidiaCdiRepairAssessment = { systemctlAvailable?: boolean; }; +export type NvidiaCdiHostAssessmentOpts = { + dockerInfoOutput?: string; + dockerReachable: boolean; + hasNvidiaGpu: boolean; + isWsl: boolean; + nvidiaContainerToolkitInstalled: boolean; + platform: NodeJS.Platform | string; + readFileImpl: (filePath: string, encoding: BufferEncoding) => string; + readdirImpl: (dir: string) => string[]; + runCaptureImpl: RunCaptureFn; + runtime: string; + systemctlAvailable?: boolean; +}; + +export type NvidiaCdiHostAssessment = { + dockerCdiSpecDirs: string[]; + cdiNvidiaGpuSpecMissing: boolean; + cdiNvidiaGpuSpecStale: boolean; + cdiNvidiaGpuSpecMismatch?: string; + cdiNvidiaGpuRefreshUnhealthy: boolean; + cdiNvidiaGpuSpecNeedsRepair: boolean; + nvidiaCdiRefreshPathActive: boolean | null; + nvidiaCdiRefreshPathEnabled: boolean | null; + nvidiaCdiRefreshServiceEnabled: boolean | null; + nvidiaCdiRefreshServiceFailed: boolean | null; +}; + type DeviceNumbers = { major: number; minor: number }; type CdiDeviceNode = DeviceNumbers & { @@ -255,6 +282,106 @@ export function findCdiDeviceNodeMismatch( return null; } +function parseSystemctlState(value = ""): boolean | null { + const normalized = String(value || "") + .trim() + .toLowerCase(); + if (!normalized) return null; + if (normalized === "active" || normalized === "enabled") return true; + if ( + normalized === "inactive" || + normalized === "failed" || + normalized === "disabled" || + normalized === "masked" + ) { + return false; + } + return null; +} + +function parseSystemctlFailedState(value = ""): boolean | null { + const normalized = String(value || 
"") + .trim() + .toLowerCase(); + if (!normalized) return null; + if (normalized === "failed") return true; + if (normalized === "active" || normalized === "inactive") return false; + return null; +} + +export function assessNvidiaCdiHost(opts: NvidiaCdiHostAssessmentOpts): NvidiaCdiHostAssessment { + const dockerCdiSpecDirs = opts.dockerReachable + ? parseDockerCdiSpecDirs(opts.dockerInfoOutput) + : []; + const cdiSpecPresenceApplies = + opts.platform === "linux" && opts.hasNvidiaGpu && dockerCdiSpecDirs.length > 0; + const cdiSpecRepairApplies = + cdiSpecPresenceApplies && !(opts.isWsl && opts.runtime === "docker-desktop"); + const cdiNvidiaGpuSpecPresent = + cdiSpecPresenceApplies && + hasNvidiaCdiSpec(dockerCdiSpecDirs, opts.readdirImpl, opts.readFileImpl); + const cdiNvidiaGpuSpecMissing = cdiSpecPresenceApplies && !cdiNvidiaGpuSpecPresent; + const refreshHealthApplies = + cdiSpecRepairApplies && + Boolean(opts.systemctlAvailable) && + opts.nvidiaContainerToolkitInstalled; + const nvidiaCdiRefreshPathEnabled = refreshHealthApplies + ? parseSystemctlState( + opts.runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.path"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshPathActive = refreshHealthApplies + ? parseSystemctlState( + opts.runCaptureImpl(["systemctl", "is-active", "nvidia-cdi-refresh.path"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshServiceEnabled = refreshHealthApplies + ? parseSystemctlState( + opts.runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.service"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshServiceFailed = refreshHealthApplies + ? 
parseSystemctlFailedState( + opts.runCaptureImpl(["systemctl", "is-failed", "nvidia-cdi-refresh.service"], { + ignoreError: true, + }), + ) + : null; + const cdiNvidiaGpuRefreshUnhealthy = + nvidiaCdiRefreshPathEnabled === false || + nvidiaCdiRefreshPathActive === false || + nvidiaCdiRefreshServiceFailed === true; + const cdiNvidiaGpuSpecMismatch = + cdiSpecRepairApplies && cdiNvidiaGpuSpecPresent + ? findCdiDeviceNodeMismatch( + dockerCdiSpecDirs, + opts.readdirImpl, + opts.readFileImpl, + opts.runCaptureImpl, + ) + : null; + const cdiNvidiaGpuSpecStale = Boolean(cdiNvidiaGpuSpecMismatch); + + return { + dockerCdiSpecDirs, + cdiNvidiaGpuSpecMissing, + cdiNvidiaGpuSpecStale, + cdiNvidiaGpuSpecMismatch: cdiNvidiaGpuSpecMismatch ?? undefined, + cdiNvidiaGpuRefreshUnhealthy, + cdiNvidiaGpuSpecNeedsRepair: cdiNvidiaGpuSpecMissing || cdiNvidiaGpuSpecStale, + nvidiaCdiRefreshPathActive, + nvidiaCdiRefreshPathEnabled, + nvidiaCdiRefreshServiceEnabled, + nvidiaCdiRefreshServiceFailed, + }; +} + export function buildNvidiaCdiRepairCommands( assessment: Pick, specPath: string, diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index 2b70dbf3ba..d417e87e28 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -17,6 +17,7 @@ import path from "node:path"; import { DASHBOARD_PORT } from "../core/ports"; import { + assessNvidiaCdiHost, buildNvidiaCdiRefreshCommands, buildNvidiaCdiRepairCommands, buildStaleCdiManualWarnCommands, @@ -24,10 +25,7 @@ import { explainNvidiaCdiRepairReason, explainStaleCdiReason, extractCdiMismatchFilePath, - findCdiDeviceNodeMismatch, getNvidiaCdiSpecPath, - hasNvidiaCdiSpec, - parseDockerCdiSpecDirs, } from "./docker-cdi"; import { isWslDockerDesktopRuntime, @@ -307,16 +305,6 @@ export function parseDockerUsesContainerdSnapshotter(info = ""): boolean { return /io\.containerd\.snapshotter\.v1/.test(info); } -function parseSystemctlFailedState(value = ""): boolean | null { - const normalized = 
String(value || "") - .trim() - .toLowerCase(); - if (!normalized) return null; - if (normalized === "failed") return true; - if (normalized === "active" || normalized === "inactive") return false; - return null; -} - export function parseDockerInfoCpus(info = ""): number | undefined { const jsonMatch = info.match(/"NCPU"\s*:\s*(\d+)/); if (jsonMatch) { @@ -522,67 +510,19 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { const dockerMemTotalBytes = dockerReachable ? parseDockerInfoMemTotalBytes(dockerInfoOutput) : undefined; - // CDI spec gap: Docker 25+ on hosts with `nvidia-container-toolkit` installed - // typically advertises `"CDISpecDirs": ["/etc/cdi", "/var/run/cdi"]` in its - // info output. OpenShell's `gateway start --gpu` then opportunistically - // selects CDI mode and tries to inject `nvidia.com/gpu=all`. If no spec has - // been generated yet (`/etc/cdi/nvidia.yaml` is missing), the gateway start - // fails with `unresolvable CDI devices nvidia.com/gpu=all`. Detect this up - // front so preflight can point the user at `nvidia-ctk cdi generate` before - // we waste minutes downloading the gateway image. See issue #3152. - const dockerCdiSpecDirs = dockerReachable ? parseDockerCdiSpecDirs(dockerInfoOutput) : []; - const cdiSpecPresenceApplies = - platform === "linux" && hasNvidiaGpu && dockerCdiSpecDirs.length > 0; - const cdiSpecRepairApplies = - cdiSpecPresenceApplies && !(isWslHost && runtime === "docker-desktop"); - const cdiNvidiaGpuSpecPresent = - cdiSpecPresenceApplies && hasNvidiaCdiSpec(dockerCdiSpecDirs, readdirImpl, readFileImpl); - const cdiNvidiaGpuSpecMissing = cdiSpecPresenceApplies && !cdiNvidiaGpuSpecPresent; - const refreshHealthApplies = - cdiSpecRepairApplies && systemctlAvailable && nvidiaContainerToolkitInstalled; - const nvidiaCdiRefreshPathEnabled = refreshHealthApplies - ? 
parseSystemctlState( - runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.path"], { - ignoreError: true, - }), - ) - : null; - const nvidiaCdiRefreshPathActive = refreshHealthApplies - ? parseSystemctlState( - runCaptureImpl(["systemctl", "is-active", "nvidia-cdi-refresh.path"], { - ignoreError: true, - }), - ) - : null; - const nvidiaCdiRefreshServiceEnabled = refreshHealthApplies - ? parseSystemctlState( - runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.service"], { - ignoreError: true, - }), - ) - : null; - const nvidiaCdiRefreshServiceFailed = refreshHealthApplies - ? parseSystemctlFailedState( - runCaptureImpl(["systemctl", "is-failed", "nvidia-cdi-refresh.service"], { - ignoreError: true, - }), - ) - : null; - const cdiNvidiaGpuRefreshUnhealthy = - nvidiaCdiRefreshPathEnabled === false || - nvidiaCdiRefreshPathActive === false || - nvidiaCdiRefreshServiceFailed === true; - const cdiNvidiaGpuSpecMismatch = - cdiSpecRepairApplies && cdiNvidiaGpuSpecPresent - ? findCdiDeviceNodeMismatch( - dockerCdiSpecDirs, - readdirImpl, - readFileImpl, - runCaptureImpl, - ) - : null; - const cdiNvidiaGpuSpecStale = Boolean(cdiNvidiaGpuSpecMismatch); - const cdiNvidiaGpuSpecNeedsRepair = cdiNvidiaGpuSpecMissing || cdiNvidiaGpuSpecStale; + const cdiAssessment = assessNvidiaCdiHost({ + dockerInfoOutput, + dockerReachable, + hasNvidiaGpu, + isWsl: isWslHost, + nvidiaContainerToolkitInstalled, + platform, + readFileImpl, + readdirImpl, + runCaptureImpl, + runtime, + systemctlAvailable, + }); const isContainerRuntimeUnderProvisioned = isDockerUnderProvisioned( dockerCpus, dockerMemTotalBytes, @@ -647,16 +587,7 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { isUnsupportedRuntime: runtime === "podman", isHeadlessLikely: isHeadlessLikely(env), hasNvidiaGpu, - dockerCdiSpecDirs, - cdiNvidiaGpuSpecMissing, - cdiNvidiaGpuSpecStale, - cdiNvidiaGpuSpecMismatch: cdiNvidiaGpuSpecMismatch ?? 
undefined, - cdiNvidiaGpuRefreshUnhealthy, - cdiNvidiaGpuSpecNeedsRepair, - nvidiaCdiRefreshPathActive, - nvidiaCdiRefreshPathEnabled, - nvidiaCdiRefreshServiceEnabled, - nvidiaCdiRefreshServiceFailed, + ...cdiAssessment, nvidiaContainerToolkitInstalled, notes: [], };