diff --git a/scripts/install.sh b/scripts/install.sh index 6e93f6b129..56b1bd826b 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1848,11 +1848,46 @@ preinstall_backup_and_retire_legacy_gateway() { # --------------------------------------------------------------------------- # 5. Onboard # --------------------------------------------------------------------------- +repair_installer_stale_nvidia_cdi_spec() { + local flagged_file="${1:-}" + local service_spec_path="/var/run/cdi/nvidia.yaml" + local sudo_cmd=() + + info "Refreshing NVIDIA CDI device spec with NVIDIA's CDI refresh service." + info "NVIDIA GPU passthrough uses CDI specs so Docker/OpenShell can request nvidia.com/gpu devices." + info "Docker is configured for CDI, but the effective nvidia.com/gpu spec may be stale." + info "The refresh service regenerates ${service_spec_path}; re-assessment verifies that effective spec." + if [[ -n "$flagged_file" && "$flagged_file" != "$service_spec_path" ]]; then + info "The stale ${flagged_file} file is a leftover; the refreshed ${service_spec_path} overrides it." + fi + if ! command_exists systemctl; then + warn "Could not refresh the stale NVIDIA CDI spec automatically because systemctl is unavailable." + return 0 + fi + if [[ "$(id -u)" -ne 0 ]]; then + sudo_cmd=(sudo) + info "You may be asked for your password to authorize these host-level admin changes." + info "NemoClaw does not store your password." + if ! sudo -v; then + warn "Could not obtain sudo credentials for NVIDIA CDI refresh service repair." + return 0 + fi + fi + if "${sudo_cmd[@]}" systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service >/dev/null 2>&1 \ + && "${sudo_cmd[@]}" systemctl start nvidia-cdi-refresh.service >/dev/null 2>&1; then + ok "Enabled NVIDIA CDI refresh service and refreshed the service-managed NVIDIA CDI device spec." + return 0 + fi + warn "Could not refresh the stale NVIDIA CDI spec automatically with nvidia-cdi-refresh.service." +} + repair_installer_nvidia_cdi_spec() { local preflight_module="$1" + local repair_plan="" + local repair_kind="" local spec_path="" - spec_path="$( + repair_plan="$( # shellcheck disable=SC2016 node -e ' const preflightPath = process.argv[1]; @@ -1864,7 +1899,18 @@ repair_installer_nvidia_cdi_spec() { host.cdiNvidiaGpuSpecMissing && !isWslDockerDesktopRuntime(host) ) { - process.stdout.write(getNvidiaCdiSpecPath(host)); + process.stdout.write(`missing\t${getNvidiaCdiSpecPath(host)}`); + } else if ( + host && + host.cdiNvidiaGpuSpecStale && + host.cdiNvidiaGpuSpecNeedsRepair && + !host.cdiNvidiaGpuSpecMissing && + host.nvidiaContainerToolkitInstalled && + !isWslDockerDesktopRuntime(host) + ) { + const mismatch = String(host.cdiNvidiaGpuSpecMismatch || ""); + const flaggedFilePath = mismatch.trim().split(/\s+/, 1)[0] || ""; + process.stdout.write(`stale\t${flaggedFilePath}`); } } catch { process.exit(0); @@ -1872,9 +1918,18 @@ repair_installer_nvidia_cdi_spec() { ' "$preflight_module" 2>/dev/null || true )" - if [[ -z "$spec_path" ]]; then + if [[ -z "$repair_plan" ]]; then + return 0 + fi + + repair_kind="${repair_plan%%$'\t'*}" + spec_path="${repair_plan#*$'\t'}" + + if [[ "$repair_kind" == "stale" ]]; then + repair_installer_stale_nvidia_cdi_spec "$spec_path" return 0 fi + if ! command_exists nvidia-ctk; then return 0 fi @@ -1886,10 +1941,10 @@ repair_installer_nvidia_cdi_spec() { fi local sudo_cmd=() - info "Generating missing NVIDIA CDI device spec at ${spec_path}." + info "Refreshing NVIDIA CDI device spec at ${spec_path}." info "NVIDIA GPU passthrough uses CDI specs so Docker/OpenShell can request nvidia.com/gpu devices." - info "Docker is configured for CDI, but the nvidia.com/gpu spec is missing." - info "Without it, OpenShell gateway startup would fail before the sandbox can use the GPU." + info "Docker is configured for CDI, but the nvidia.com/gpu spec is missing or may be stale." + info "Without a refreshed spec, OpenShell gateway startup can fail before the sandbox can use the GPU." info "NemoClaw will first enable NVIDIA's CDI refresh service." info "If that service does not generate the spec, NemoClaw will run nvidia-ctk cdi generate directly." if [[ "$(id -u)" -ne 0 ]]; then diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index d24a5a8a0f..c138e3ce68 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -1841,16 +1841,8 @@ function assertCdiNvidiaGpuSpecPresent( hostGpuPlatform: string | null | undefined = null, ): void { if (hostGpuPlatform === "jetson" || preflightUtils.isWslDockerDesktopRuntime(host)) return; - if (!host.cdiNvidiaGpuSpecMissing || optedOutGpuPassthrough) return; - console.error( - " Docker is configured for CDI device injection (CDISpecDirs is set), but no", - ); - console.error( - " nvidia.com/gpu CDI spec was found on the host. OpenShell's gateway start will", - ); - console.error( - " fail with `unresolvable CDI devices nvidia.com/gpu=all` (issue #3152).", - ); + if (!(host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) || optedOutGpuPassthrough) return; + console.error(" Docker is configured for CDI device injection (CDISpecDirs is set), but the NVIDIA GPU CDI spec is missing or stale. OpenShell GPU startup can fail until the CDI spec is refreshed."); printRemediationActions(planHostRemediation(host)); process.exit(1); } diff --git a/src/lib/onboard/docker-cdi.test.ts b/src/lib/onboard/docker-cdi.test.ts new file mode 100644 index 0000000000..27ad45487a --- /dev/null +++ b/src/lib/onboard/docker-cdi.test.ts @@ -0,0 +1,258 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; +// Import through dist so coverage follows the CLI build output, matching the +// neighboring preflight tests. +import { + buildNvidiaCdiRepairCommands, + buildStaleCdiManualWarnCommands, + buildStaleCdiWarnCommands, + collectCdiDeviceNodes, + findCdiDeviceNodeMismatch, + getNvidiaCdiSpecPath, + hasNvidiaCdiSpec, + parseDockerCdiSpecDirs, +} from "../../../dist/lib/onboard/docker-cdi"; + +function specWithDeviceNodes(deviceNodes: string): string { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + deviceNodes, + "", + ].join("\n"); +} + +function cdiFs(files: Record) { + return { + readdirImpl: (dir: string) => + Object.keys(files) + .filter((filePath) => filePath.startsWith(`${dir}/`)) + .map((filePath) => filePath.slice(dir.length + 1)) + .filter((entry) => entry && !entry.includes("/")), + readFileImpl: (filePath: string) => files[filePath] ?? "", + }; +} + +function statDevices(devices: Record) { + return (command: readonly string[]) => { + if (command[0] === "stat") return devices[command[3]] ?? ""; + return ""; + }; +} + +describe("docker-cdi parsing", () => { + it("extracts CDI dirs from whole docker info JSON and .CDISpecDirs JSON", () => { + expect( + parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: ["/etc/cdi", "/var/run/cdi"] })), + ).toEqual(["/etc/cdi", "/var/run/cdi"]); + expect(parseDockerCdiSpecDirs('["/etc/cdi","/var/run/cdi"]')).toEqual([ + "/etc/cdi", + "/var/run/cdi", + ]); + }); + + it("returns an empty array when CDI dirs are absent or empty", () => { + expect(parseDockerCdiSpecDirs(JSON.stringify({ ServerVersion: "27.0" }))).toEqual([]); + expect(parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: [] }))).toEqual([]); + expect(parseDockerCdiSpecDirs("")).toEqual([]); + }); + + it("builds the default NVIDIA CDI spec path from Docker CDI dirs", () => { + expect(getNvidiaCdiSpecPath({ dockerCdiSpecDirs: ["/etc/cdi/", "/var/run/cdi"] })).toBe( + "/etc/cdi/nvidia.yaml", + ); + }); + + it("accepts exact nvidia.com/gpu YAML and JSON specs only", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n", + "/etc/cdi/nvidia.json": '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu","devices":[]}', + "/etc/cdi/nvidia-extra.yaml": "kind: nvidia.com/gpu-extra\ndevices: []\n", + "/etc/cdi/notes.yaml": "# nvidia.com/gpu used to be here\nkind: example.com/cpu\n", + }); + + expect(hasNvidiaCdiSpec(["/etc/cdi"], fs.readdirImpl, fs.readFileImpl)).toBe(true); + expect( + hasNvidiaCdiSpec( + ["/etc/cdi"], + () => ["nvidia-extra.yaml", "notes.yaml"], + fs.readFileImpl, + ), + ).toBe(false); + }); +}); + +describe("docker-cdi staleness detection", () => { + it("ignores stale lower-precedence /etc/cdi when /var/run/cdi is fresh", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498", + ), + "/var/run/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 499", + ), + }); + + expect( + findCdiDeviceNodeMismatch( + ["/etc/cdi", "/var/run/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0" }), + ), + ).toBeNull(); + }); + + it("flags stale /etc/cdi when no higher-precedence /var/run/cdi spec exists", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498", + ), + }); + + const mismatch = findCdiDeviceNodeMismatch( + ["/etc/cdi", "/var/run/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0" }), + ); + + expect(mismatch).toContain("/etc/cdi/nvidia.yaml"); + expect(mismatch).toContain("/dev/nvidia-uvm=498:0"); + expect(mismatch).toContain("live=499:0"); + }); + + it("flags stale /var/run/cdi when it is the effective spec", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 499", + ), + "/var/run/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498", + ), + }); + + const mismatch = findCdiDeviceNodeMismatch( + ["/etc/cdi", "/var/run/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0" }), + ); + + expect(mismatch).toContain("/var/run/cdi/nvidia.yaml"); + expect(mismatch).toContain("/dev/nvidia-uvm=498:0"); + expect(mismatch).toContain("live=499:0"); + }); + + it("defaults omitted minor to 0 and detects non-uvm drift", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia-uvm\n type: c\n major: 498\n - path: /dev/nvidia0\n type: c\n major: 195\n minor: 0", + ), + }); + + expect( + findCdiDeviceNodeMismatch( + ["/etc/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia-uvm": "1f3 0", "/dev/nvidia0": "c3 0" }), + ), + ).toContain("/dev/nvidia-uvm=498:0"); + }); + + it("skips absent devices and accepts matching explicit minors", () => { + const fs = cdiFs({ + "/etc/cdi/nvidia.yaml": specWithDeviceNodes( + " - path: /dev/nvidia1\n type: c\n major: 195\n minor: 1\n - path: /dev/nvidia-uvm-tools\n type: c\n major: 499\n minor: 1", + ), + }); + + expect( + findCdiDeviceNodeMismatch( + ["/etc/cdi"], + fs.readdirImpl, + fs.readFileImpl, + statDevices({ "/dev/nvidia1": "", "/dev/nvidia-uvm-tools": "1f3 1" }), + ), + ).toBeNull(); + }); + + it("stats CDI hostPath instead of the container path when both are present", () => { + const nodes = collectCdiDeviceNodes( + { + deviceNodes: [ + { path: "/container/nvidia0", hostPath: "/dev/nvidia0", major: 196, minor: 0 }, + ], + }, + "/etc/cdi/nvidia.yaml", + ); + expect(nodes[0]).toMatchObject({ path: "/dev/nvidia0", major: 196, minor: 0 }); + }); +}); + +describe("docker-cdi remediation commands", () => { + it("keeps missing-spec remediation on the direct-generation fallback path", () => { + const commands = buildNvidiaCdiRepairCommands( + { systemctlAvailable: true }, + "/etc/cdi/nvidia.yaml", + ); + + expect(commands[0]).toBe("sudo mkdir -p '/etc/cdi'"); + expect(commands[1]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(commands[2]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + expect(commands[3]).toContain("nvidia-ctk cdi list"); + expect(commands[4]).toContain("sudo nvidia-ctk cdi generate --output='/etc/cdi/nvidia.yaml'"); + expect(commands[5]).toContain("nvidia-ctk cdi list"); + }); + + it("shell-quotes CDI repair paths in generated commands", () => { + const commands = buildNvidiaCdiRepairCommands( + { systemctlAvailable: false }, + "/tmp/cdi dir/nvidia;bad.yaml", + ); + + expect(commands[0]).toBe("sudo mkdir -p '/tmp/cdi dir'"); + expect(commands[1]).toContain("--output='/tmp/cdi dir/nvidia;bad.yaml'"); + }); + + it("shows stale-spec refresh commands with optional leftover removal only for /etc/cdi", () => { + const leftoverCommands = buildStaleCdiWarnCommands("/etc/cdi/nvidia.yaml"); + expect(leftoverCommands[0]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(leftoverCommands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + expect(leftoverCommands[2]).toContain("sudo rm -f '/etc/cdi/nvidia.yaml'"); + expect(leftoverCommands.join("\n")).not.toContain("--output=/etc/cdi"); + expect(leftoverCommands.join("\n")).not.toContain("nvidia-ctk cdi list"); + + const serviceCommands = buildStaleCdiWarnCommands("/var/run/cdi/nvidia.yaml"); + expect(serviceCommands.some((command) => command.includes("rm -f"))).toBe(false); + }); + + it("shows manual stale-spec guidance without systemctl on non-systemd hosts", () => { + const commands = buildStaleCdiManualWarnCommands("/etc/cdi/nvidia.yaml"); + + expect(commands.join("\n")).toContain("/var/run/cdi/nvidia.yaml"); + expect(commands.join("\n")).toContain("sudo rm -f '/etc/cdi/nvidia.yaml'"); + expect(commands.join("\n")).not.toContain("systemctl"); + expect(commands.join("\n")).not.toContain("nvidia-ctk cdi list"); + }); + + it("shell-quotes stale leftover paths in displayed guidance", () => { + expect(buildStaleCdiWarnCommands("/tmp/cdi dir/nvidia;bad.yaml").join("\n")).toContain( + "sudo rm -f '/tmp/cdi dir/nvidia;bad.yaml'", + ); + expect(buildStaleCdiManualWarnCommands("/tmp/cdi dir/nvidia;bad.yaml").join("\n")).toContain( + "sudo rm -f '/tmp/cdi dir/nvidia;bad.yaml'", + ); + }); +}); diff --git a/src/lib/onboard/docker-cdi.ts b/src/lib/onboard/docker-cdi.ts index b0c1d7a8d7..b467ec3101 100644 --- a/src/lib/onboard/docker-cdi.ts +++ b/src/lib/onboard/docker-cdi.ts @@ -5,15 +5,77 @@ import fs from "node:fs"; import path from "node:path"; import { dockerInfoFormat } from "../adapters/docker"; +import { shellQuote } from "../core/shell-quote"; + +export type RunCaptureFn = typeof import("../runner").runCapture; + +export type NvidiaCdiRepairAssessment = { + cdiNvidiaGpuSpecMissing?: boolean; + cdiNvidiaGpuSpecStale?: boolean; + cdiNvidiaGpuSpecMismatch?: string; + cdiNvidiaGpuRefreshUnhealthy?: boolean; + dockerCdiSpecDirs: string[]; + nvidiaCdiRefreshPathEnabled?: boolean | null; + nvidiaCdiRefreshPathActive?: boolean | null; + nvidiaCdiRefreshServiceFailed?: boolean | null; + systemctlAvailable?: boolean; +}; + +export type NvidiaCdiHostAssessmentOpts = { + dockerInfoOutput?: string; + dockerReachable: boolean; + hasNvidiaGpu: boolean; + isWsl: boolean; + nvidiaContainerToolkitInstalled: boolean; + platform: NodeJS.Platform | string; + readFileImpl: (filePath: string, encoding: BufferEncoding) => string; + readdirImpl: (dir: string) => string[]; + runCaptureImpl: RunCaptureFn; + runtime: string; + systemctlAvailable?: boolean; +}; + +export type NvidiaCdiHostAssessment = { + dockerCdiSpecDirs: string[]; + cdiNvidiaGpuSpecMissing: boolean; + cdiNvidiaGpuSpecStale: boolean; + cdiNvidiaGpuSpecMismatch?: string; + cdiNvidiaGpuRefreshUnhealthy: boolean; + cdiNvidiaGpuSpecNeedsRepair: boolean; + nvidiaCdiRefreshPathActive: boolean | null; + nvidiaCdiRefreshPathEnabled: boolean | null; + nvidiaCdiRefreshServiceEnabled: boolean | null; + nvidiaCdiRefreshServiceFailed: boolean | null; +}; + +type DeviceNumbers = { major: number; minor: number }; + +type CdiDeviceNode = DeviceNumbers & { + filePath: string; + path: string; +}; + +type EffectiveNvidiaCdiSpec = { + filePath: string; + parsed: unknown; +}; + +const NVIDIA_CDI_KIND_YAML_RE = + /^[ \t]*kind[ \t]*:[ \t]*(?:"nvidia\.com\/gpu"|'nvidia\.com\/gpu'|nvidia\.com\/gpu)[ \t]*(?:#.*)?$/im; +const NVIDIA_CDI_KIND_JSON_RE = /"kind"\s*:\s*"nvidia\.com\/gpu"/; +const NVIDIA_CDI_REFRESH_SPEC_PATH = "/var/run/cdi/nvidia.yaml"; export function parseDockerCdiSpecDirs(value: string | null | undefined): string[] { const raw = String(value || "").trim(); if (!raw || raw === "") return []; try { const parsed = JSON.parse(raw); - return Array.isArray(parsed) - ? parsed.map((entry) => String(entry || "").trim()).filter(Boolean) - : []; + const dirs: unknown[] = Array.isArray(parsed) + ? parsed + : parsed && typeof parsed === "object" && Array.isArray(parsed.CDISpecDirs) + ? parsed.CDISpecDirs + : []; + return dirs.map((entry) => String(entry || "").trim()).filter(Boolean); } catch { return raw .split(/[\s,]+/) @@ -28,6 +90,19 @@ export function getDockerCdiSpecDirs(): string[] { ); } +function normalizeCdiSpecDir(specDir: string | undefined): string { + const trimmed = String(specDir || "/etc/cdi") + .trim() + .replace(/\/+$/, ""); + return trimmed || "/etc/cdi"; +} + +export function getNvidiaCdiSpecPath( + assessment: Pick, +): string { + return path.join(normalizeCdiSpecDir(assessment.dockerCdiSpecDirs[0]), "nvidia.yaml"); +} + function isLikelyNvidiaCdiSpecFile(filePath: string): boolean { if (!/\.(json|ya?ml)$/i.test(filePath)) return false; let content = ""; @@ -55,3 +130,377 @@ export function findReadableNvidiaCdiSpecFiles(dirs: string[]): string[] { } return specs.sort(); } + +export function hasNvidiaCdiSpec( + specDirs: readonly string[], + readdirImpl: (dir: string) => string[], + readFileImpl: (filePath: string, encoding: BufferEncoding) => string, +): boolean { + for (const dir of specDirs) { + let entries: string[]; + try { + entries = readdirImpl(dir); + } catch { + continue; + } + for (const entry of entries) { + if (!/\.(ya?ml|json)$/i.test(entry)) continue; + let raw: string; + try { + raw = readFileImpl(path.join(dir, entry), "utf-8"); + } catch { + continue; + } + if (NVIDIA_CDI_KIND_YAML_RE.test(raw) || NVIDIA_CDI_KIND_JSON_RE.test(raw)) return true; + } + } + return false; +} + +function parseIntegerLike(value: unknown): number | null { + if (typeof value === "number") { + return Number.isInteger(value) && value >= 0 ? value : null; + } + if (typeof value !== "string") return null; + const trimmed = value.trim(); + if (!trimmed) return null; + const base = /^0x/i.test(trimmed) ? 16 : 10; + const parsed = Number.parseInt(trimmed, base); + return Number.isInteger(parsed) && parsed >= 0 ? parsed : null; +} + +function parseLinuxStatDeviceNumbers(output: string | null | undefined): DeviceNumbers | null { + const parts = String(output || "") + .trim() + .split(/\s+/) + .filter(Boolean); + if (parts.length < 2) return null; + const major = Number.parseInt(parts[0], 16); + const minor = Number.parseInt(parts[1], 16); + if (!Number.isInteger(major) || !Number.isInteger(minor) || major < 0 || minor < 0) { + return null; + } + return { major, minor }; +} + +function readLiveLinuxDeviceNumbers( + devicePath: string, + runCaptureImpl: RunCaptureFn, +): DeviceNumbers | null { + try { + return parseLinuxStatDeviceNumbers( + runCaptureImpl(["stat", "-c", "%t %T", devicePath], { ignoreError: true }), + ); + } catch { + return null; + } +} + +function parseCdiSpec(raw: string, filePath: string): unknown { + if (/\.json$/i.test(filePath)) return JSON.parse(raw); + const YAML = require("yaml"); + return YAML.parse(raw); +} + +export function findEffectiveNvidiaCdiSpec( + specDirs: readonly string[], + readdirImpl: (dir: string) => string[], + readFileImpl: (filePath: string, encoding: BufferEncoding) => string, +): EffectiveNvidiaCdiSpec | null { + // Docker CDI precedence is highest in the last configured directory. + for (const dir of [...specDirs].reverse()) { + let entries: string[]; + try { + entries = readdirImpl(dir); + } catch { + continue; + } + for (const entry of entries) { + if (!/\.(ya?ml|json)$/i.test(entry)) continue; + const filePath = path.join(dir, entry); + let raw: string; + try { + raw = readFileImpl(filePath, "utf-8"); + } catch { + continue; + } + if (!NVIDIA_CDI_KIND_YAML_RE.test(raw) && !NVIDIA_CDI_KIND_JSON_RE.test(raw)) { + continue; + } + try { + return { filePath, parsed: parseCdiSpec(raw, filePath) }; + } catch { + continue; + } + } + } + return null; +} + +export function collectCdiDeviceNodes(value: unknown, filePath: string): CdiDeviceNode[] { + const nodes: CdiDeviceNode[] = []; + const stack: unknown[] = [value]; + + while (stack.length > 0) { + const current = stack.pop(); + if (Array.isArray(current)) { + for (const item of current) stack.push(item); + continue; + } + if (!current || typeof current !== "object") continue; + const obj = current as Record; + // We stat the host device, so prefer CDI's host-side path when present. + const nodePath = + (typeof obj.hostPath === "string" && obj.hostPath) || + (typeof obj.path === "string" && obj.path) || + ""; + const major = parseIntegerLike(obj.major); + if (nodePath.startsWith("/dev/") && major !== null) { + const minor = obj.minor === undefined ? 0 : parseIntegerLike(obj.minor); + if (minor !== null) nodes.push({ filePath, path: nodePath, major, minor }); + } + for (const child of Object.values(obj)) stack.push(child); + } + + return nodes; +} + +export function findCdiDeviceNodeMismatch( + specDirs: readonly string[], + readdirImpl: (dir: string) => string[], + readFileImpl: (filePath: string, encoding: BufferEncoding) => string, + runCaptureImpl: RunCaptureFn, +): string | null { + const effective = findEffectiveNvidiaCdiSpec(specDirs, readdirImpl, readFileImpl); + if (!effective) return null; + for (const node of collectCdiDeviceNodes(effective.parsed, effective.filePath)) { + const liveDevice = readLiveLinuxDeviceNumbers(node.path, runCaptureImpl); + if (!liveDevice) continue; + if (node.major === liveDevice.major && node.minor === liveDevice.minor) continue; + return `${node.filePath} ${node.path}=${node.major}:${node.minor}, live=${liveDevice.major}:${liveDevice.minor}`; + } + return null; +} + +function parseSystemctlState(value = ""): boolean | null { + const normalized = String(value || "") + .trim() + .toLowerCase(); + if (!normalized) return null; + if (normalized === "active" || normalized === "enabled") return true; + if ( + normalized === "inactive" || + normalized === "failed" || + normalized === "disabled" || + normalized === "masked" + ) { + return false; + } + return null; +} + +function parseSystemctlFailedState(value = ""): boolean | null { + const normalized = String(value || "") + .trim() + .toLowerCase(); + if (!normalized) return null; + if (normalized === "failed") return true; + if (normalized === "active" || normalized === "inactive") return false; + return null; +} + +export function assessNvidiaCdiHost(opts: NvidiaCdiHostAssessmentOpts): NvidiaCdiHostAssessment { + const dockerCdiSpecDirs = opts.dockerReachable + ? parseDockerCdiSpecDirs(opts.dockerInfoOutput) + : []; + const cdiSpecPresenceApplies = + opts.platform === "linux" && opts.hasNvidiaGpu && dockerCdiSpecDirs.length > 0; + const cdiSpecRepairApplies = + cdiSpecPresenceApplies && !(opts.isWsl && opts.runtime === "docker-desktop"); + const cdiNvidiaGpuSpecPresent = + cdiSpecPresenceApplies && + hasNvidiaCdiSpec(dockerCdiSpecDirs, opts.readdirImpl, opts.readFileImpl); + const cdiNvidiaGpuSpecMissing = cdiSpecPresenceApplies && !cdiNvidiaGpuSpecPresent; + const refreshHealthApplies = + cdiSpecRepairApplies && + Boolean(opts.systemctlAvailable) && + opts.nvidiaContainerToolkitInstalled; + const nvidiaCdiRefreshPathEnabled = refreshHealthApplies + ? parseSystemctlState( + opts.runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.path"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshPathActive = refreshHealthApplies + ? parseSystemctlState( + opts.runCaptureImpl(["systemctl", "is-active", "nvidia-cdi-refresh.path"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshServiceEnabled = refreshHealthApplies + ? parseSystemctlState( + opts.runCaptureImpl(["systemctl", "is-enabled", "nvidia-cdi-refresh.service"], { + ignoreError: true, + }), + ) + : null; + const nvidiaCdiRefreshServiceFailed = refreshHealthApplies + ? parseSystemctlFailedState( + opts.runCaptureImpl(["systemctl", "is-failed", "nvidia-cdi-refresh.service"], { + ignoreError: true, + }), + ) + : null; + const cdiNvidiaGpuRefreshUnhealthy = + nvidiaCdiRefreshPathEnabled === false || + nvidiaCdiRefreshPathActive === false || + nvidiaCdiRefreshServiceFailed === true; + const cdiNvidiaGpuSpecMismatch = + cdiSpecRepairApplies && cdiNvidiaGpuSpecPresent + ? findCdiDeviceNodeMismatch( + dockerCdiSpecDirs, + opts.readdirImpl, + opts.readFileImpl, + opts.runCaptureImpl, + ) + : null; + const cdiNvidiaGpuSpecStale = Boolean(cdiNvidiaGpuSpecMismatch); + + return { + dockerCdiSpecDirs, + cdiNvidiaGpuSpecMissing, + cdiNvidiaGpuSpecStale, + cdiNvidiaGpuSpecMismatch: cdiNvidiaGpuSpecMismatch ?? undefined, + cdiNvidiaGpuRefreshUnhealthy, + cdiNvidiaGpuSpecNeedsRepair: cdiNvidiaGpuSpecMissing || cdiNvidiaGpuSpecStale, + nvidiaCdiRefreshPathActive, + nvidiaCdiRefreshPathEnabled, + nvidiaCdiRefreshServiceEnabled, + nvidiaCdiRefreshServiceFailed, + }; +} + +export function buildNvidiaCdiRepairCommands( + assessment: Pick, + specPath: string, +): string[] { + const specDir = path.dirname(specPath); + const quotedSpecDir = shellQuote(specDir); + const quotedSpecPath = shellQuote(specPath); + const commands = [`sudo mkdir -p ${quotedSpecDir}`]; + if (assessment.systemctlAvailable !== false) { + commands.push( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + "sudo systemctl start nvidia-cdi-refresh.service", + "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", + ); + } + commands.push( + `sudo nvidia-ctk cdi generate --output=${quotedSpecPath} # fallback if the refresh service does not repair the spec`, + "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", + "nemoclaw onboard # or rerun with --no-gpu to skip GPU passthrough", + ); + return commands; +} + +export function buildNvidiaCdiRefreshCommands(): string[] { + return [ + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + "sudo systemctl start nvidia-cdi-refresh.service", + "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", + ]; +} + +export function extractCdiMismatchFilePath(mismatch: string | undefined): string { + const trimmed = String(mismatch || "").trim(); + if (!trimmed) return ""; + const firstWhitespace = trimmed.search(/\s/); + return firstWhitespace > 0 ? trimmed.slice(0, firstWhitespace) : trimmed; +} + +export function buildStaleCdiAutoFixCommands(): string[] { + return [ + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + "sudo systemctl start nvidia-cdi-refresh.service", + ]; +} + +export function buildStaleCdiWarnCommands(flaggedFilePath: string): string[] { + const commands = buildStaleCdiAutoFixCommands(); + if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) { + const quotedFlaggedFilePath = shellQuote(flaggedFilePath); + commands.push( + `sudo rm -f ${quotedFlaggedFilePath} # optional: remove the stale leftover (the service owns ${NVIDIA_CDI_REFRESH_SPEC_PATH})`, + ); + } + commands.push( + "nemoclaw onboard # re-run to confirm the stale-spec warning clears (or --no-gpu to skip GPU)", + ); + return commands; +} + +export function buildStaleCdiManualWarnCommands(flaggedFilePath: string): string[] { + const commands = [ + `Refresh NVIDIA CDI specs using your host's service manager so ${NVIDIA_CDI_REFRESH_SPEC_PATH} is current.`, + ]; + if (flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH) { + const quotedFlaggedFilePath = shellQuote(flaggedFilePath); + commands.push( + `Optionally remove the stale leftover after the refresh: sudo rm -f ${quotedFlaggedFilePath}`, + ); + } + commands.push( + "nemoclaw onboard # re-run to confirm the stale-spec warning clears (or --no-gpu to skip GPU)", + ); + return commands; +} + +export function explainStaleCdiReason(mismatch: string | undefined): string { + const detail = mismatch || "unknown device-node mismatch"; + const flaggedFilePath = extractCdiMismatchFilePath(mismatch); + const isLeftover = flaggedFilePath && flaggedFilePath !== NVIDIA_CDI_REFRESH_SPEC_PATH; + return ( + `An NVIDIA CDI device node no longer matches the live device (${detail}). ` + + "OpenShell's `gateway start --gpu` injects devices from the CDI spec, so a stale " + + "device number points the container at the wrong device and CUDA init fails " + + "(`CUDA unknown error`). The nvidia-cdi-refresh service keeps " + + `${NVIDIA_CDI_REFRESH_SPEC_PATH} current on driver/toolkit changes` + + (isLeftover + ? `; the flagged ${flaggedFilePath} is a stale leftover that the refreshed ` + + `${NVIDIA_CDI_REFRESH_SPEC_PATH} overrides.` + : "; re-enable and run it to regenerate the spec.") + ); +} + +export function explainNvidiaCdiRepairReason(assessment: NvidiaCdiRepairAssessment): string { + const reasons: string[] = []; + if (assessment.cdiNvidiaGpuSpecMissing) { + reasons.push( + "Docker is configured for CDI device injection (CDISpecDirs is set) but no nvidia.com/gpu CDI spec is present on the host.", + ); + } + if (assessment.cdiNvidiaGpuSpecStale) { + const detail = assessment.cdiNvidiaGpuSpecMismatch + ? ` (${assessment.cdiNvidiaGpuSpecMismatch})` + : ""; + reasons.push( + `The NVIDIA CDI spec appears stale because a declared device node does not match the live device${detail}.`, + ); + } + if (assessment.cdiNvidiaGpuRefreshUnhealthy) { + const unitDetails: string[] = []; + if (assessment.nvidiaCdiRefreshPathEnabled === false) unitDetails.push("path disabled"); + if (assessment.nvidiaCdiRefreshPathActive === false) unitDetails.push("path inactive"); + if (assessment.nvidiaCdiRefreshServiceFailed === true) unitDetails.push("service failed"); + const suffix = unitDetails.length > 0 ? ` (${unitDetails.join(", ")})` : ""; + reasons.push( + `NVIDIA's CDI refresh units are not healthy${suffix}, so Docker may keep using stale GPU device numbers after driver changes.`, + ); + } + reasons.push( + "OpenShell's `gateway start --gpu` can fail until the CDI spec is refreshed and verified.", + ); + return reasons.join(" "); +} diff --git a/src/lib/onboard/preflight-cdi.test.ts b/src/lib/onboard/preflight-cdi.test.ts new file mode 100644 index 0000000000..869c3a55ce --- /dev/null +++ b/src/lib/onboard/preflight-cdi.test.ts @@ -0,0 +1,345 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; +// Import through the compiled dist/ output so coverage is attributed to the +// CLI build output that the ratchet measures. +import { assessHost, planHostRemediation } from "../../../dist/lib/onboard/preflight"; + +type HostAssessment = Parameters[0]; + +function baseAssessment(overrides: Partial = {}): HostAssessment { + return { + platform: "linux", + isWsl: false, + runtime: "docker", + packageManager: "apt", + systemctlAvailable: true, + dockerServiceActive: true, + dockerServiceEnabled: true, + dockerInstalled: true, + dockerRunning: true, + dockerReachable: true, + nodeInstalled: true, + openshellInstalled: true, + dockerCgroupVersion: "v2", + dockerDefaultCgroupnsMode: "unknown", + isContainerRuntimeUnderProvisioned: false, + hasNestedOverlayConflict: false, + requiresHostCgroupnsFix: false, + isUnsupportedRuntime: false, + isHeadlessLikely: false, + hasNvidiaGpu: true, + dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], + cdiNvidiaGpuSpecMissing: false, + nvidiaContainerToolkitInstalled: true, + notes: [], + ...overrides, + }; +} + +function healthySystemctlAndStat(command: readonly string[]) { + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat" && command[3] === "/dev/nvidia-uvm") return "1f3 0"; + return ""; +} + +describe("assessHost — CDI", () => { + it("flags missing nvidia.com/gpu specs on an NVIDIA Linux host with CDI dirs configured", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: () => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + OperatingSystem: "Ubuntu 24.04", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.dockerCdiSpecDirs).toEqual(["/etc/cdi", "/var/run/cdi"]); + expect(result.cdiNvidiaGpuSpecMissing).toBe(true); + }); + + it("does not flag the host when an nvidia.com/gpu YAML spec is present", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + }); + + it("uses the effective CDI spec when assessing staleness", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => { + if (filePath === "/etc/cdi/nvidia.yaml") { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 498", + "", + ].join("\n"); + } + if (filePath === "/var/run/cdi/nvidia.yaml") { + return [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 499", + "", + ].join("\n"); + } + return "Linux version 6.8.0-58-generic"; + }, + readdirImpl: (dir: string) => { + if (dir === "/etc/cdi") return ["nvidia.yaml"]; + if (dir === "/var/run/cdi") return ["nvidia.yaml"]; + return []; + }, + runCaptureImpl: healthySystemctlAndStat, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(false); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + }); + + it("records stale effective CDI specs as repair-blocking", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? [ + "cdiVersion: 0.5.0", + "kind: nvidia.com/gpu", + "devices:", + " - name: all", + " containerEdits:", + " deviceNodes:", + " - path: /dev/nvidia-uvm", + " hostPath: /dev/nvidia-uvm", + " type: c", + " major: 498", + "", + ].join("\n") + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), + runCaptureImpl: healthySystemctlAndStat, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuSpecMissing).toBe(false); + expect(result.cdiNvidiaGpuSpecStale).toBe(true); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(true); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("/dev/nvidia-uvm=498:0"); + expect(result.cdiNvidiaGpuSpecMismatch).toContain("live=499:0"); + }); + + it("treats refresh-unit health as a non-repair warning", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("nvidia.yaml") + ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command[0] === "systemctl" && command[1] === "is-enabled") { + return command[2] === "nvidia-cdi-refresh.service" ? "disabled" : "enabled"; + } + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-failed") return "inactive"; + if (command[0] === "stat") return "1f3 0"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + CDISpecDirs: ["/etc/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "systemctl" || name === "nvidia-ctk", + gpuProbeImpl: () => true, + }); + + expect(result.cdiNvidiaGpuRefreshUnhealthy).toBe(false); + expect(result.cdiNvidiaGpuSpecNeedsRepair).toBe(false); + expect(result.nvidiaCdiRefreshServiceEnabled).toBe(false); + }); + + it("does not apply CDI checks without an NVIDIA Linux CDI context", () => { + const linuxWithoutGpu = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: () => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ ServerVersion: "27.0", CDISpecDirs: ["/etc/cdi"] }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => false, + }); + const noCdiDirs = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: () => "Linux version 6.8.0-58-generic", + readdirImpl: () => [], + dockerInfoOutput: JSON.stringify({ ServerVersion: "24.0" }), + commandExistsImpl: (name: string) => name === "docker", + gpuProbeImpl: () => true, + }); + + expect(linuxWithoutGpu.cdiNvidiaGpuSpecMissing).toBe(false); + expect(noCdiDirs.dockerCdiSpecDirs).toEqual([]); + expect(noCdiDirs.cdiNvidiaGpuSpecMissing).toBe(false); + }); +}); + +describe("planHostRemediation — CDI", () => { + it("emits a blocking generate action for missing nvidia.com/gpu specs", () => { + const actions = planHostRemediation(baseAssessment({ cdiNvidiaGpuSpecMissing: true })); + const action = actions.find((entry: { id: string }) => entry.id === "generate_nvidia_cdi_spec"); + + expect(action).toBeTruthy(); + expect(action?.kind).toBe("sudo"); + expect(action?.blocking).toBe(true); + expect(action?.commands.some((command) => command.includes("--output='/etc/cdi"))).toBe( + true, + ); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe(true); + }); + + it("emits service-refresh commands for stale nvidia.com/gpu specs", () => { + const actions = planHostRemediation( + baseAssessment({ + cdiNvidiaGpuSpecStale: true, + cdiNvidiaGpuSpecNeedsRepair: true, + cdiNvidiaGpuSpecMismatch: + "/etc/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", + }), + ); + const action = actions.find((entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec"); + + expect(action).toBeTruthy(); + expect(action?.blocking).toBe(true); + expect(action?.commands[0]).toBe( + "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service", + ); + expect(action?.commands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service"); + expect( + action?.commands.some((command) => command.includes("sudo rm -f '/etc/cdi/nvidia.yaml'")), + ).toBe(true); + expect(action?.commands.some((command) => command.includes("--output=/etc/cdi"))).toBe(false); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe( + false, + ); + }); + + it("emits manual stale-spec guidance without systemctl on non-systemd hosts", () => { + const actions = planHostRemediation( + baseAssessment({ + systemctlAvailable: false, + cdiNvidiaGpuSpecStale: true, + cdiNvidiaGpuSpecNeedsRepair: true, + cdiNvidiaGpuSpecMismatch: + "/etc/cdi/nvidia.yaml /dev/nvidia-uvm=498:0, live=499:0", + }), + ); + const action = actions.find((entry: { id: string }) => entry.id === "refresh_nvidia_cdi_spec"); + + expect(action).toBeTruthy(); + expect(action?.blocking).toBe(true); + expect(action?.kind).toBe("manual"); + expect(action?.commands.join("\n")).toContain("/var/run/cdi/nvidia.yaml"); + expect(action?.commands.join("\n")).not.toContain("systemctl"); + }); + + it("emits a non-blocking refresh-service warning when refresh units are unhealthy", () => { + const actions = planHostRemediation( + baseAssessment({ + dockerCdiSpecDirs: ["/etc/cdi"], + cdiNvidiaGpuRefreshUnhealthy: true, + cdiNvidiaGpuSpecNeedsRepair: false, + nvidiaCdiRefreshPathEnabled: false, + nvidiaCdiRefreshPathActive: false, + }), + ); + const action = actions.find( + (entry: { id: string }) => entry.id === "warn_nvidia_cdi_refresh_unhealthy", + ); + + expect(action).toBeTruthy(); + expect(action?.blocking).toBe(false); + expect(action?.title).toBe("Enable NVIDIA CDI refresh service"); + expect(action?.reason).toContain("path disabled"); + }); + + it("bootstraps nvidia-container-toolkit before missing-spec generation", () => { + const actions = planHostRemediation( + baseAssessment({ + cdiNvidiaGpuSpecMissing: true, + nvidiaContainerToolkitInstalled: false, + }), + ); + const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); + + expect(action).toBeTruthy(); + expect(action?.commands.some((command) => command === "sudo apt-get install -y nvidia-container-toolkit")).toBe( + true, + ); + expect(action?.commands.some((command) => command.startsWith("sudo nvidia-ctk cdi generate --output="))).toBe( + true, + ); + }); +}); diff --git a/src/lib/onboard/preflight.test.ts b/src/lib/onboard/preflight.test.ts index b71bbea7f1..a360c2e043 100644 --- a/src/lib/onboard/preflight.test.ts +++ b/src/lib/onboard/preflight.test.ts @@ -10,12 +10,10 @@ import { checkPortAvailable, getDockerBridgeGatewayIp, getMemoryInfo, - getNvidiaCdiSpecPath, ensureSwap, isDockerUnderProvisioned, MIN_RECOMMENDED_DOCKER_CPUS, MIN_RECOMMENDED_DOCKER_MEM_GIB, - parseDockerCdiSpecDirs, parseDockerInfoCpus, parseDockerInfoMemTotalBytes, parseDockerStorageDriver, @@ -640,208 +638,6 @@ describe("parseDockerUsesContainerdSnapshotter", () => { }); }); -describe("parseDockerCdiSpecDirs", () => { - it("extracts the dirs from `docker info --format '{{json .}}'` output", () => { - const fixture = JSON.stringify({ CDISpecDirs: ["/etc/cdi", "/var/run/cdi"] }); - expect(parseDockerCdiSpecDirs(fixture)).toEqual(["/etc/cdi", "/var/run/cdi"]); - }); - - it("returns an empty array when CDISpecDirs is absent", () => { - expect(parseDockerCdiSpecDirs(JSON.stringify({ ServerVersion: "27.0" }))).toEqual([]); - }); - - it("returns an empty array when CDISpecDirs is the empty list", () => { - expect(parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: [] }))).toEqual([]); - }); - - it("returns an empty array on empty input", () => { - expect(parseDockerCdiSpecDirs("")).toEqual([]); - }); -}); - -describe("assessHost — CDI device-spec gap (#3152)", () => { - it("flags missing nvidia.com/gpu specs on an NVIDIA Linux host with CDI dirs configured", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - OperatingSystem: "Ubuntu 24.04", - CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.dockerCdiSpecDirs).toEqual(["/etc/cdi", "/var/run/cdi"]); - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("does not flag the host when an nvidia.com/gpu YAML spec is present", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.yaml") - ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("accepts a JSON-serialised CDI spec as well", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia.json") - ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu","devices":[]}' - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia.json"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag a non-NVIDIA Linux host even with CDI dirs configured", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => false, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag a host that does not advertise CDISpecDirs", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: () => "Linux version 6.8.0-58-generic", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ ServerVersion: "24.0" }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.dockerCdiSpecDirs).toEqual([]); - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not flag macOS even when the docker info shape would otherwise match", () => { - const result = assessHost({ - platform: "darwin", - env: {}, - readFileImpl: () => "", - readdirImpl: () => [], - dockerInfoOutput: JSON.stringify({ CDISpecDirs: ["/etc/cdi"] }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(false); - }); - - it("does not accept a sibling device class such as nvidia.com/gpu-extra as a satisfying spec", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia-extra.yaml") - ? "cdiVersion: 0.5.0\nkind: nvidia.com/gpu-extra\ndevices: []\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("does not accept a sibling device class in JSON form either", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("nvidia-extra.json") - ? '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu-extra","devices":[]}' - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["nvidia-extra.json"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); - - it("ignores spec files whose `kind` only mentions nvidia.com/gpu in a comment", () => { - const result = assessHost({ - platform: "linux", - env: {}, - release: "6.8.0-58-generic", - readFileImpl: (filePath: string) => - filePath.endsWith("notes.yaml") - ? "# this used to declare nvidia.com/gpu; now stripped\nkind: example.com/cpu\n" - : "Linux version 6.8.0-58-generic", - readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["notes.yaml"] : []), - dockerInfoOutput: JSON.stringify({ - ServerVersion: "27.0", - CDISpecDirs: ["/etc/cdi"], - }), - commandExistsImpl: (name: string) => name === "docker", - gpuProbeImpl: () => true, - }); - - expect(result.cdiNvidiaGpuSpecMissing).toBe(true); - }); -}); - -describe("getNvidiaCdiSpecPath", () => { - it("builds the default NVIDIA CDI spec path from Docker CDI dirs", () => { - expect(getNvidiaCdiSpecPath({ dockerCdiSpecDirs: ["/etc/cdi/", "/var/run/cdi"] })).toBe( - "/etc/cdi/nvidia.yaml", - ); - }); -}); - describe("planHostRemediation", () => { function baseAssessment( overrides: Partial[0]> = {}, @@ -1079,142 +875,6 @@ describe("planHostRemediation", () => { expect(actions.some((action: { id: string }) => action.id === "install_openshell")).toBe(true); }); - it("emits a blocking generate_nvidia_cdi_spec action when CDI dirs are configured but no nvidia.com/gpu spec exists", () => { - const actions = planHostRemediation({ - platform: "linux", - isWsl: false, - runtime: "docker", - packageManager: "apt", - systemctlAvailable: true, - dockerServiceActive: true, - dockerServiceEnabled: true, - dockerInstalled: true, - dockerRunning: true, - dockerReachable: true, - nodeInstalled: true, - openshellInstalled: true, - dockerCgroupVersion: "v2", - dockerDefaultCgroupnsMode: "unknown", - isContainerRuntimeUnderProvisioned: false, - hasNestedOverlayConflict: false, - requiresHostCgroupnsFix: false, - isUnsupportedRuntime: false, - isHeadlessLikely: false, - hasNvidiaGpu: true, - dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], - cdiNvidiaGpuSpecMissing: true, - nvidiaContainerToolkitInstalled: true, - notes: [], - }); - - const action = actions.find( - (entry: { id: string }) => entry.id === "generate_nvidia_cdi_spec", - ); - expect(action).toBeTruthy(); - expect(action?.kind).toBe("sudo"); - expect(action?.blocking).toBe(true); - expect(action?.commands[0]).toBe("sudo mkdir -p /etc/cdi"); - expect(action?.commands[1]).toBe( - "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", - ); - expect(action?.commands[2]).toContain("nvidia-ctk cdi list"); - expect(action?.commands[3]).toContain("nemoclaw onboard"); - expect(action?.reason).toContain("nvidia.com/gpu"); - }); - - it("emits an install_nvidia_container_toolkit action with apt bootstrap when nvidia-ctk is missing on apt hosts", () => { - const actions = planHostRemediation({ - platform: "linux", - isWsl: false, - runtime: "docker", - packageManager: "apt", - systemctlAvailable: true, - dockerServiceActive: true, - dockerServiceEnabled: true, - dockerInstalled: true, - dockerRunning: true, - dockerReachable: true, - nodeInstalled: true, - openshellInstalled: true, - dockerCgroupVersion: "v2", - dockerDefaultCgroupnsMode: "unknown", - isContainerRuntimeUnderProvisioned: false, - hasNestedOverlayConflict: false, - requiresHostCgroupnsFix: false, - isUnsupportedRuntime: false, - isHeadlessLikely: false, - hasNvidiaGpu: true, - dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], - cdiNvidiaGpuSpecMissing: true, - nvidiaContainerToolkitInstalled: false, - notes: [], - }); - - expect(actions.find((entry) => entry.id === "generate_nvidia_cdi_spec")).toBeUndefined(); - const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); - expect(action).toBeTruthy(); - expect(action?.kind).toBe("sudo"); - expect(action?.blocking).toBe(true); - expect(action?.title).toContain("Install NVIDIA Container Toolkit"); - expect(action?.reason).toContain("nvidia-container-toolkit"); - expect(action?.commands.some((c) => c.includes("nvidia-container-toolkit-keyring.gpg"))).toBe( - true, - ); - expect(action?.commands.some((c) => c === "sudo apt-get install -y nvidia-container-toolkit")).toBe( - true, - ); - expect( - action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), - ).toBe(true); - const ctkInstallIndex = - action?.commands.findIndex((c) => c === "sudo apt-get install -y nvidia-container-toolkit") ?? - -1; - const ctkGenerateIndex = - action?.commands.findIndex((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")) ?? - -1; - expect(ctkInstallIndex).toBeGreaterThanOrEqual(0); - expect(ctkGenerateIndex).toBeGreaterThan(ctkInstallIndex); - }); - - it("emits an install_nvidia_container_toolkit action with a docs pointer when nvidia-ctk is missing on unknown package managers", () => { - const actions = planHostRemediation({ - platform: "linux", - isWsl: false, - runtime: "docker", - packageManager: "unknown", - systemctlAvailable: true, - dockerServiceActive: true, - dockerServiceEnabled: true, - dockerInstalled: true, - dockerRunning: true, - dockerReachable: true, - nodeInstalled: true, - openshellInstalled: true, - dockerCgroupVersion: "v2", - dockerDefaultCgroupnsMode: "unknown", - isContainerRuntimeUnderProvisioned: false, - hasNestedOverlayConflict: false, - requiresHostCgroupnsFix: false, - isUnsupportedRuntime: false, - isHeadlessLikely: false, - hasNvidiaGpu: true, - dockerCdiSpecDirs: ["/etc/cdi", "/var/run/cdi"], - cdiNvidiaGpuSpecMissing: true, - nvidiaContainerToolkitInstalled: false, - notes: [], - }); - - const action = actions.find((entry) => entry.id === "install_nvidia_container_toolkit"); - expect(action).toBeTruthy(); - expect( - action?.commands.some((c) => - c.includes("docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide"), - ), - ).toBe(true); - expect( - action?.commands.some((c) => c.startsWith("sudo nvidia-ctk cdi generate --output=")), - ).toBe(true); - }); }); describe("ensureSwap", () => { diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index 0739ee6055..d417e87e28 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -16,10 +16,22 @@ import os from "node:os"; import path from "node:path"; import { DASHBOARD_PORT } from "../core/ports"; +import { + assessNvidiaCdiHost, + buildNvidiaCdiRefreshCommands, + buildNvidiaCdiRepairCommands, + buildStaleCdiManualWarnCommands, + buildStaleCdiWarnCommands, + explainNvidiaCdiRepairReason, + explainStaleCdiReason, + extractCdiMismatchFilePath, + getNvidiaCdiSpecPath, +} from "./docker-cdi"; import { isWslDockerDesktopRuntime, wslDockerDesktopGpuCompatibilityAction, } from "./wsl-docker-desktop-gpu"; +export { getNvidiaCdiSpecPath, parseDockerCdiSpecDirs } from "./docker-cdi"; export { isWslDockerDesktopRuntime } from "./wsl-docker-desktop-gpu"; // runner.ts still uses CommonJS-style exports — use require here. @@ -124,6 +136,14 @@ export interface HostAssessment { hasNvidiaGpu: boolean; dockerCdiSpecDirs: string[]; cdiNvidiaGpuSpecMissing: boolean; + cdiNvidiaGpuSpecStale?: boolean; + cdiNvidiaGpuSpecMismatch?: string; + cdiNvidiaGpuRefreshUnhealthy?: boolean; + cdiNvidiaGpuSpecNeedsRepair?: boolean; + nvidiaCdiRefreshPathActive?: boolean | null; + nvidiaCdiRefreshPathEnabled?: boolean | null; + nvidiaCdiRefreshServiceEnabled?: boolean | null; + nvidiaCdiRefreshServiceFailed?: boolean | null; nvidiaContainerToolkitInstalled: boolean; notes: string[]; } @@ -285,74 +305,6 @@ export function parseDockerUsesContainerdSnapshotter(info = ""): boolean { return /io\.containerd\.snapshotter\.v1/.test(info); } -// Parses the Docker daemon's configured CDI spec directories from `docker -// info --format '{{json .}}'` output. Docker 25+ surfaces these as -// `"CDISpecDirs": ["/etc/cdi", "/var/run/cdi"]` whenever the daemon is built -// with CDI support and `features.cdi=true` (the default on recent installs). -// An empty list means CDI device injection is not enabled, so OpenShell will -// fall back to the legacy `nvidia` runtime path and there is no spec gap to -// worry about. -export function parseDockerCdiSpecDirs(info = ""): string[] { - const match = info.match(/"CDISpecDirs"\s*:\s*\[([^\]]*)\]/); - if (!match) return []; - return Array.from(match[1].matchAll(/"([^"]+)"/g), (m) => m[1]).filter(Boolean); -} - -function normalizeCdiSpecDir(specDir: string | undefined): string { - const trimmed = String(specDir || "/etc/cdi") - .trim() - .replace(/\/+$/, ""); - return trimmed || "/etc/cdi"; -} - -export function getNvidiaCdiSpecPath( - assessment: Pick, -): string { - return path.join(normalizeCdiSpecDir(assessment.dockerCdiSpecDirs[0]), "nvidia.yaml"); -} - -// True when at least one CDI spec under the configured directories declares -// `kind: nvidia.com/gpu` (the device class OpenShell injects with `--gpu`). -// Specs are typically YAML, but the JSON shape is also accepted because -// `nvidia-ctk cdi generate --format=json` is supported. Errors reading any -// individual file or directory are tolerated — a missing dir is the same -// shape as "no spec found there". -function hasNvidiaCdiSpec( - specDirs: readonly string[], - readdirImpl: (dir: string) => string[], - readFileImpl: (filePath: string, encoding: BufferEncoding) => string, -): boolean { - // YAML keys are unquoted; JSON quotes the kind value. Anchor both patterns - // to the *exact* device-class string `nvidia.com/gpu` and require a value - // terminator (end of line, whitespace + comment, or whitespace + EOL) so a - // sibling spec like `nvidia.com/gpu-extra` does not silently satisfy the - // check and suppress the preflight warning. A comment that merely mentions - // `nvidia.com/gpu` is also rejected because `kindRe` only matches when the - // *whole* scalar value is the device class. - const kindRe = - /^[ \t]*kind[ \t]*:[ \t]*(?:"nvidia\.com\/gpu"|'nvidia\.com\/gpu'|nvidia\.com\/gpu)[ \t]*(?:#.*)?$/im; - const jsonRe = /"kind"\s*:\s*"nvidia\.com\/gpu"/; - for (const dir of specDirs) { - let entries: string[]; - try { - entries = readdirImpl(dir); - } catch { - continue; - } - for (const entry of entries) { - if (!/\.(ya?ml|json)$/i.test(entry)) continue; - let raw: string; - try { - raw = readFileImpl(path.join(dir, entry), "utf-8"); - } catch { - continue; - } - if (kindRe.test(raw) || jsonRe.test(raw)) return true; - } - } - return false; -} - export function parseDockerInfoCpus(info = ""): number | undefined { const jsonMatch = info.match(/"NCPU"\s*:\s*(\d+)/); if (jsonMatch) { @@ -514,7 +466,8 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { const nvidiaContainerToolkitInstalled = opts.commandExistsImpl?.("nvidia-ctk") ?? commandExists("nvidia-ctk", runCaptureImpl); const packageManager = detectPackageManager(runCaptureImpl); - const systemctlAvailable = commandExists("systemctl", runCaptureImpl); + const systemctlAvailable = + opts.commandExistsImpl?.("systemctl") ?? commandExists("systemctl", runCaptureImpl); let dockerInfoOutput = opts.dockerInfoOutput; let dockerReachable = false; @@ -543,6 +496,7 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { if (dockerReachable && runtime === "unknown" && platform === "linux") { runtime = "docker"; } + const isWslHost = detectWsl({ platform, env, release, procVersion }); const dockerCgroupVersion = dockerReachable ? parseDockerCgroupVersion(dockerInfoOutput) : "unknown"; @@ -556,20 +510,19 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { const dockerMemTotalBytes = dockerReachable ? parseDockerInfoMemTotalBytes(dockerInfoOutput) : undefined; - // CDI spec gap: Docker 25+ on hosts with `nvidia-container-toolkit` installed - // typically advertises `"CDISpecDirs": ["/etc/cdi", "/var/run/cdi"]` in its - // info output. OpenShell's `gateway start --gpu` then opportunistically - // selects CDI mode and tries to inject `nvidia.com/gpu=all`. If no spec has - // been generated yet (`/etc/cdi/nvidia.yaml` is missing), the gateway start - // fails with `unresolvable CDI devices nvidia.com/gpu=all`. Detect this up - // front so preflight can point the user at `nvidia-ctk cdi generate` before - // we waste minutes downloading the gateway image. See issue #3152. - const dockerCdiSpecDirs = dockerReachable ? parseDockerCdiSpecDirs(dockerInfoOutput) : []; - const cdiNvidiaGpuSpecMissing = - platform === "linux" && - hasNvidiaGpu && - dockerCdiSpecDirs.length > 0 && - !hasNvidiaCdiSpec(dockerCdiSpecDirs, readdirImpl, readFileImpl); + const cdiAssessment = assessNvidiaCdiHost({ + dockerInfoOutput, + dockerReachable, + hasNvidiaGpu, + isWsl: isWslHost, + nvidiaContainerToolkitInstalled, + platform, + readFileImpl, + readdirImpl, + runCaptureImpl, + runtime, + systemctlAvailable, + }); const isContainerRuntimeUnderProvisioned = isDockerUnderProvisioned( dockerCpus, dockerMemTotalBytes, @@ -588,7 +541,6 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { // the user-confirmed reproducer. Engaging the auto-fix there could // build an unnecessary patched image; preferring to leave WSL alone // until we have a confirmed repro is the conservative call. - const isWslHost = detectWsl({ platform, env, release, procVersion }); const hasNestedOverlayConflict = platform === "linux" && !isWslHost && @@ -635,8 +587,7 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { isUnsupportedRuntime: runtime === "podman", isHeadlessLikely: isHeadlessLikely(env), hasNvidiaGpu, - dockerCdiSpecDirs, - cdiNvidiaGpuSpecMissing, + ...cdiAssessment, nvidiaContainerToolkitInstalled, notes: [], }; @@ -847,43 +798,65 @@ export function planHostRemediation(assessment: HostAssessment): RemediationActi }); } - if (assessment.cdiNvidiaGpuSpecMissing) { + if ( + assessment.cdiNvidiaGpuRefreshUnhealthy && + !assessment.cdiNvidiaGpuSpecNeedsRepair && + !assessment.cdiNvidiaGpuSpecMissing && + !isWslDockerDesktopRuntime(assessment) + ) { + actions.push({ + id: "warn_nvidia_cdi_refresh_unhealthy", + title: "Enable NVIDIA CDI refresh service", + kind: "sudo", + reason: explainNvidiaCdiRepairReason({ + ...assessment, + cdiNvidiaGpuSpecMissing: false, + cdiNvidiaGpuSpecStale: false, + cdiNvidiaGpuSpecMismatch: undefined, + }), + commands: buildNvidiaCdiRefreshCommands(), + blocking: false, + }); + } + + if (assessment.cdiNvidiaGpuSpecNeedsRepair || assessment.cdiNvidiaGpuSpecMissing) { + const missingSpec = assessment.cdiNvidiaGpuSpecMissing; + const flaggedFilePath = extractCdiMismatchFilePath(assessment.cdiNvidiaGpuSpecMismatch); const specPath = getNvidiaCdiSpecPath(assessment); - const specDir = path.dirname(specPath); - const generateCommands = [ - `sudo mkdir -p ${specDir}`, - `sudo nvidia-ctk cdi generate --output=${specPath}`, - "nvidia-ctk cdi list # verify nvidia.com/gpu entries appear", - "nemoclaw onboard # or rerun with --no-gpu to skip GPU passthrough", - ]; + const repairCommands = missingSpec + ? buildNvidiaCdiRepairCommands(assessment, specPath) + : assessment.systemctlAvailable + ? buildStaleCdiWarnCommands(flaggedFilePath) + : buildStaleCdiManualWarnCommands(flaggedFilePath); + const reason = missingSpec + ? explainNvidiaCdiRepairReason(assessment) + : explainStaleCdiReason(assessment.cdiNvidiaGpuSpecMismatch); if (isWslDockerDesktopRuntime(assessment)) { actions.push(wslDockerDesktopGpuCompatibilityAction()); } else if (assessment.nvidiaContainerToolkitInstalled) { + const title = missingSpec + ? "Generate NVIDIA CDI device specs" + : "Refresh NVIDIA CDI device specs"; actions.push({ - id: "generate_nvidia_cdi_spec", - title: "Generate NVIDIA CDI device specs", - kind: "sudo", - reason: - "Docker is configured for CDI device injection (CDISpecDirs is set) but no " + - "nvidia.com/gpu CDI spec is present on the host. OpenShell's `gateway start --gpu` " + - "will fail with `unresolvable CDI devices nvidia.com/gpu=all` until a spec is generated.", - commands: generateCommands, + id: missingSpec ? "generate_nvidia_cdi_spec" : "refresh_nvidia_cdi_spec", + title, + kind: missingSpec || assessment.systemctlAvailable ? "sudo" : "manual", + reason, + commands: repairCommands, blocking: true, }); } else { + const title = missingSpec + ? "Install NVIDIA Container Toolkit and generate CDI device specs" + : "Install NVIDIA Container Toolkit and refresh CDI device specs"; actions.push({ id: "install_nvidia_container_toolkit", - title: "Install NVIDIA Container Toolkit and generate CDI device specs", + title, kind: "sudo", - reason: - "Docker is configured for CDI device injection (CDISpecDirs is set) but the " + - "`nvidia-container-toolkit` package (which provides `nvidia-ctk`) is not installed " + - "on the host. OpenShell's `gateway start --gpu` will fail with " + - "`unresolvable CDI devices nvidia.com/gpu=all` until the toolkit is installed and a " + - "CDI spec is generated.", + reason: `${reason} The nvidia-container-toolkit package (which provides nvidia-ctk) is not installed on the host.`, commands: buildContainerToolkitBootstrapCommands( assessment.packageManager, - generateCommands, + repairCommands, ), blocking: true, }); diff --git a/test/install-preflight.test.ts b/test/install-preflight.test.ts index 13140f5df9..66a3871e9d 100644 --- a/test/install-preflight.test.ts +++ b/test/install-preflight.test.ts @@ -1222,10 +1222,14 @@ fi`, systemctlScript, isWsl = false, runtime = "docker", + stale = false, + toolkitInstalled = true, }: { systemctlScript: string; isWsl?: boolean; runtime?: string; + stale?: boolean; + toolkitInstalled?: boolean; }) { const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-install-cdi-repair-")); const fakeBin = path.join(tmp, "bin"); @@ -1246,7 +1250,11 @@ exports.assessHost = () => ({ isWsl: ${isWsl ? "true" : "false"}, notes: [], dockerCdiSpecDirs: [process.env.CDI_DIR], - cdiNvidiaGpuSpecMissing: !fs.existsSync(process.env.CDI_STATE), + cdiNvidiaGpuSpecMissing: ${stale ? "false" : "!fs.existsSync(process.env.CDI_STATE)"}, + cdiNvidiaGpuSpecStale: ${stale ? "!fs.existsSync(process.env.CDI_STATE)" : "false"}, + cdiNvidiaGpuSpecNeedsRepair: !fs.existsSync(process.env.CDI_STATE), + cdiNvidiaGpuSpecMismatch: process.env.CDI_STALE_FILE + " /dev/nvidia-uvm=498:0, live=499:0", + nvidiaContainerToolkitInstalled: ${toolkitInstalled ? "true" : "false"}, }); exports.getNvidiaCdiSpecPath = (host) => String(host.dockerCdiSpecDirs[0]).replace(/\\/+$/, "") + "/nvidia.yaml"; @@ -1267,6 +1275,13 @@ exports.planHostRemediation = (host) => commands: ["sudo nvidia-ctk cdi generate --output=" + exports.getNvidiaCdiSpecPath(host)], blocking: true, }] + : host.cdiNvidiaGpuSpecStale && !host.nvidiaContainerToolkitInstalled + ? [{ + title: "Install NVIDIA Container Toolkit and refresh CDI device specs", + reason: "nvidia-container-toolkit missing", + commands: ["sudo apt-get install -y nvidia-container-toolkit"], + blocking: true, + }] : []; `, ); @@ -1334,6 +1349,7 @@ run_installer_host_preflight SOURCE_ROOT: sourceRoot, CDI_DIR: cdiDir, CDI_STATE: cdiState, + CDI_STALE_FILE: path.join(cdiDir, "nvidia.yaml"), SUDO_LOG: sudoLog, SYSTEMCTL_LOG: systemctlLog, }, @@ -1383,6 +1399,59 @@ exit 99 expect(sudoLog).not.toMatch(/nvidia-ctk cdi generate/); }); + it("repairs stale NVIDIA CDI specs with the refresh service only", () => { + const { cdiStateExists, output, result, sudoLog, systemctlLog } = + runNvidiaCdiInstallerRepairTest({ + stale: true, + systemctlScript: `#!/usr/bin/env bash +set -euo pipefail +printf '%s\\n' "$*" >> "$SYSTEMCTL_LOG" +if [ "\${1:-}" = "start" ]; then + touch "$CDI_STATE" +fi +exit 0 +`, + }); + + expect(result.status, output).toBe(0); + expect(cdiStateExists).toBe(true); + expect(output).toMatch(/Refreshing NVIDIA CDI device spec with NVIDIA's CDI refresh service/); + expect(output).toMatch(/effective nvidia\.com\/gpu spec may be stale/); + expect(output).toMatch(/refreshed the service-managed NVIDIA CDI device spec/); + expect(output).not.toMatch(/falling back to direct generation/); + expect(output).not.toMatch(/Host preflight found issues/); + expect(systemctlLog).toMatch( + /^enable --now nvidia-cdi-refresh\.path nvidia-cdi-refresh\.service$/m, + ); + expect(systemctlLog).toMatch(/^start nvidia-cdi-refresh\.service$/m); + expect(sudoLog).toMatch(/^-v$/m); + expect(sudoLog).not.toMatch(/nvidia-ctk cdi generate/); + expect(sudoLog).not.toMatch(/mkdir -p/); + expect(sudoLog).not.toMatch(/rm -f/); + }); + + it("does not auto-repair stale NVIDIA CDI specs before toolkit installation", () => { + const { cdiStateExists, output, result, sudoLog, systemctlLog } = + runNvidiaCdiInstallerRepairTest({ + stale: true, + toolkitInstalled: false, + systemctlScript: `#!/usr/bin/env bash +set -euo pipefail +printf '%s\\n' "$*" >> "$SYSTEMCTL_LOG" +touch "$CDI_STATE" +exit 0 +`, + }); + + expect(result.status, output).toBe(1); + expect(cdiStateExists).toBe(false); + expect(output).toMatch(/Host preflight found issues/); + expect(output).toMatch(/Install NVIDIA Container Toolkit and refresh CDI device specs/); + expect(output).not.toMatch(/Refreshing NVIDIA CDI device spec with NVIDIA's CDI refresh service/); + expect(systemctlLog).toBe(""); + expect(sudoLog).toBe(""); + }); + it("falls back to direct NVIDIA CDI generation when refresh service does not repair", () => { const { cdiDir, output, result, sudoLog, systemctlLog } = runNvidiaCdiInstallerRepairTest({ @@ -1394,7 +1463,7 @@ exit 1 }); expect(result.status, output).toBe(0); - expect(output).toMatch(/Generating missing NVIDIA CDI device spec/); + expect(output).toMatch(/Refreshing NVIDIA CDI device spec/); expect(output).toMatch(/NemoClaw will first enable NVIDIA's CDI refresh service/); expect(output).toMatch(/NemoClaw does not store your password/); expect(output).toMatch(/Generated NVIDIA CDI device spec/);