Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 61 additions & 6 deletions scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1848,11 +1848,46 @@ preinstall_backup_and_retire_legacy_gateway() {
# ---------------------------------------------------------------------------
# 5. Onboard
# ---------------------------------------------------------------------------
# Best-effort refresh of a stale NVIDIA CDI device spec through NVIDIA's
# systemd units (nvidia-cdi-refresh.path / nvidia-cdi-refresh.service).
#
# Arguments:
#   $1 - optional path of the spec file that was flagged as stale. When it is
#        set and differs from the service-managed spec path, an extra note
#        explains that the flagged file is a leftover.
#
# Returns:
#   0 when the refresh succeeds, when systemctl is unavailable, or when sudo
#   credentials cannot be obtained (a warning is printed in the latter two
#   cases). When the systemctl calls themselves fail, a warning is printed and
#   the function returns whatever status `warn` produced.
repair_installer_stale_nvidia_cdi_spec() {
  local leftover_spec="${1:-}"
  local managed_spec="/var/run/cdi/nvidia.yaml"
  local privileged=()

  info "Refreshing NVIDIA CDI device spec with NVIDIA's CDI refresh service."
  info "NVIDIA GPU passthrough uses CDI specs so Docker/OpenShell can request nvidia.com/gpu devices."
  info "Docker is configured for CDI, but the effective nvidia.com/gpu spec may be stale."
  info "The refresh service regenerates ${managed_spec}; re-assessment verifies that effective spec."

  # Only mention the flagged file when it is not the service-managed spec itself.
  if [[ -n "$leftover_spec" ]] && [[ "$leftover_spec" != "$managed_spec" ]]; then
    info "The stale ${leftover_spec} file is a leftover; the refreshed ${managed_spec} overrides it."
  fi

  # Without systemctl there is no refresh service to drive; warn and move on.
  command_exists systemctl || {
    warn "Could not refresh the stale NVIDIA CDI spec automatically because systemctl is unavailable."
    return 0
  }

  # Non-root callers go through sudo; validate credentials up front so the
  # silenced systemctl calls below do not hide a password prompt failure.
  if [[ "$(id -u)" -ne 0 ]]; then
    privileged=(sudo)
    info "You may be asked for your password to authorize these host-level admin changes."
    info "NemoClaw does not store your password."
    sudo -v || {
      warn "Could not obtain sudo credentials for NVIDIA CDI refresh service repair."
      return 0
    }
  fi

  # Enable both units, then explicitly start the service; either step failing
  # counts as a failed refresh.
  if ! "${privileged[@]}" systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service >/dev/null 2>&1 \
    || ! "${privileged[@]}" systemctl start nvidia-cdi-refresh.service >/dev/null 2>&1; then
    warn "Could not refresh the stale NVIDIA CDI spec automatically with nvidia-cdi-refresh.service."
    return
  fi

  ok "Enabled NVIDIA CDI refresh service and refreshed the service-managed NVIDIA CDI device spec."
  return 0
}

repair_installer_nvidia_cdi_spec() {
local preflight_module="$1"
local repair_plan=""
local repair_kind=""
local spec_path=""

spec_path="$(
repair_plan="$(
# shellcheck disable=SC2016
node -e '
const preflightPath = process.argv[1];
Expand All @@ -1864,17 +1899,37 @@ repair_installer_nvidia_cdi_spec() {
host.cdiNvidiaGpuSpecMissing &&
!isWslDockerDesktopRuntime(host)
) {
process.stdout.write(getNvidiaCdiSpecPath(host));
process.stdout.write(`missing\t${getNvidiaCdiSpecPath(host)}`);
} else if (
host &&
host.cdiNvidiaGpuSpecStale &&
host.cdiNvidiaGpuSpecNeedsRepair &&
!host.cdiNvidiaGpuSpecMissing &&
host.nvidiaContainerToolkitInstalled &&
!isWslDockerDesktopRuntime(host)
) {
const mismatch = String(host.cdiNvidiaGpuSpecMismatch || "");
const flaggedFilePath = mismatch.trim().split(/\s+/, 1)[0] || "";
process.stdout.write(`stale\t${flaggedFilePath}`);
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
} catch {
process.exit(0);
}
' "$preflight_module" 2>/dev/null || true
)"

if [[ -z "$spec_path" ]]; then
if [[ -z "$repair_plan" ]]; then
return 0
fi

repair_kind="${repair_plan%%$'\t'*}"
spec_path="${repair_plan#*$'\t'}"

if [[ "$repair_kind" == "stale" ]]; then
repair_installer_stale_nvidia_cdi_spec "$spec_path"
return 0
fi

if ! command_exists nvidia-ctk; then
return 0
fi
Expand All @@ -1886,10 +1941,10 @@ repair_installer_nvidia_cdi_spec() {
fi

local sudo_cmd=()
info "Generating missing NVIDIA CDI device spec at ${spec_path}."
info "Refreshing NVIDIA CDI device spec at ${spec_path}."
info "NVIDIA GPU passthrough uses CDI specs so Docker/OpenShell can request nvidia.com/gpu devices."
info "Docker is configured for CDI, but the nvidia.com/gpu spec is missing."
info "Without it, OpenShell gateway startup would fail before the sandbox can use the GPU."
info "Docker is configured for CDI, but the nvidia.com/gpu spec is missing or may be stale."
info "Without a refreshed spec, OpenShell gateway startup can fail before the sandbox can use the GPU."
info "NemoClaw will first enable NVIDIA's CDI refresh service."
info "If that service does not generate the spec, NemoClaw will run nvidia-ctk cdi generate directly."
if [[ "$(id -u)" -ne 0 ]]; then
Expand Down
12 changes: 2 additions & 10 deletions src/lib/onboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1841,16 +1841,8 @@ function assertCdiNvidiaGpuSpecPresent(
hostGpuPlatform: string | null | undefined = null,
): void {
if (hostGpuPlatform === "jetson" || preflightUtils.isWslDockerDesktopRuntime(host)) return;
if (!host.cdiNvidiaGpuSpecMissing || optedOutGpuPassthrough) return;
console.error(
" Docker is configured for CDI device injection (CDISpecDirs is set), but no",
);
console.error(
" nvidia.com/gpu CDI spec was found on the host. OpenShell's gateway start will",
);
console.error(
" fail with `unresolvable CDI devices nvidia.com/gpu=all` (issue #3152).",
);
if (!(host.cdiNvidiaGpuSpecNeedsRepair || host.cdiNvidiaGpuSpecMissing) || optedOutGpuPassthrough) return;
console.error(" Docker is configured for CDI device injection (CDISpecDirs is set), but the NVIDIA GPU CDI spec is missing or stale. OpenShell GPU startup can fail until the CDI spec is refreshed.");
printRemediationActions(planHostRemediation(host));
process.exit(1);
}
Expand Down
258 changes: 258 additions & 0 deletions src/lib/onboard/docker-cdi.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

import { describe, expect, it } from "vitest";
// Import through dist so coverage follows the CLI build output, matching the
// neighboring preflight tests.
import {
buildNvidiaCdiRepairCommands,
buildStaleCdiManualWarnCommands,
buildStaleCdiWarnCommands,
collectCdiDeviceNodes,
findCdiDeviceNodeMismatch,
getNvidiaCdiSpecPath,
hasNvidiaCdiSpec,
parseDockerCdiSpecDirs,
} from "../../../dist/lib/onboard/docker-cdi";

/**
 * Builds the text of an nvidia.com/gpu CDI spec fixture with a single device
 * named `all`.
 *
 * @param deviceNodes - Pre-formatted text placed directly under the
 *   `deviceNodes:` key; it is inserted verbatim as one line group.
 * @returns The fixture text, always terminated by a trailing newline.
 */
function specWithDeviceNodes(deviceNodes: string): string {
  const lines: string[] = [];
  lines.push(
    "cdiVersion: 0.5.0",
    "kind: nvidia.com/gpu",
    "devices:",
    " - name: all",
    " containerEdits:",
    " deviceNodes:",
  );
  lines.push(deviceNodes);
  return `${lines.join("\n")}\n`;
}

/**
 * Creates in-memory stand-ins for directory listing and file reading, backed
 * by a map of absolute file path to file contents.
 *
 * @param files - Map from absolute file path to that file's text.
 * @returns `readdirImpl`, which lists the direct children of a directory
 *   (nested paths and the empty entry are excluded), and `readFileImpl`,
 *   which returns a file's text or `""` when the path is not in the map.
 */
function cdiFs(files: Record<string, string>) {
  const readdirImpl = (dir: string) => {
    const prefix = `${dir}/`;
    const entries: string[] = [];
    for (const filePath of Object.keys(files)) {
      if (!filePath.startsWith(prefix)) continue;
      const entry = filePath.slice(prefix.length);
      // Keep direct children only: drop the bare directory key and nested paths.
      if (entry !== "" && !entry.includes("/")) entries.push(entry);
    }
    return entries;
  };
  const readFileImpl = (filePath: string) => files[filePath] ?? "";
  return { readdirImpl, readFileImpl };
}

/**
 * Creates a fake command runner that only answers `stat` invocations.
 *
 * @param devices - Map from device path to the text the fake `stat` should
 *   return for it.
 * @returns A runner that, for commands whose first element is `"stat"`, looks
 *   up the fourth element (index 3) in `devices` and returns its text, or `""`
 *   when there is no entry. Every other command yields `""`.
 */
function statDevices(devices: Record<string, string>) {
  return (command: readonly string[]) => {
    const program = command[0];
    const devicePath = command[3];
    return program === "stat" ? (devices[devicePath] ?? "") : "";
  };
}

// Covers the parsing helpers: extracting CDI spec dirs from Docker output,
// deriving the default NVIDIA spec path, and recognising nvidia.com/gpu specs.
describe("docker-cdi parsing", () => {
it("extracts CDI dirs from whole docker info JSON and .CDISpecDirs JSON", () => {
// Input shape 1: an object carrying a CDISpecDirs array.
expect(
parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: ["/etc/cdi", "/var/run/cdi"] })),
).toEqual(["/etc/cdi", "/var/run/cdi"]);
// Input shape 2: the bare JSON array on its own.
expect(parseDockerCdiSpecDirs('["/etc/cdi","/var/run/cdi"]')).toEqual([
"/etc/cdi",
"/var/run/cdi",
]);
});

it("returns an empty array when CDI dirs are absent or empty", () => {
// Missing key, empty list, and empty input must all yield [].
expect(parseDockerCdiSpecDirs(JSON.stringify({ ServerVersion: "27.0" }))).toEqual([]);
expect(parseDockerCdiSpecDirs(JSON.stringify({ CDISpecDirs: [] }))).toEqual([]);
expect(parseDockerCdiSpecDirs("")).toEqual([]);
});

it("builds the default NVIDIA CDI spec path from Docker CDI dirs", () => {
// The expected path is under the first listed dir, and that dir's trailing
// slash must not produce a doubled separator.
expect(getNvidiaCdiSpecPath({ dockerCdiSpecDirs: ["/etc/cdi/", "/var/run/cdi"] })).toBe(
"/etc/cdi/nvidia.yaml",
);
});

it("accepts exact nvidia.com/gpu YAML and JSON specs only", () => {
// Two genuine specs plus two near-misses: a different kind that merely
// starts with nvidia.com/gpu, and a file that only mentions it in a comment.
const fs = cdiFs({
"/etc/cdi/nvidia.yaml": "cdiVersion: 0.5.0\nkind: nvidia.com/gpu\ndevices: []\n",
"/etc/cdi/nvidia.json": '{"cdiVersion":"0.5.0","kind":"nvidia.com/gpu","devices":[]}',
"/etc/cdi/nvidia-extra.yaml": "kind: nvidia.com/gpu-extra\ndevices: []\n",
"/etc/cdi/notes.yaml": "# nvidia.com/gpu used to be here\nkind: example.com/cpu\n",
});

// NOTE(review): this listing exposes both nvidia.yaml and nvidia.json, so it
// does not show that a JSON spec alone is accepted — consider a JSON-only
// listing if that case matters.
expect(hasNvidiaCdiSpec(["/etc/cdi"], fs.readdirImpl, fs.readFileImpl)).toBe(true);
// With the listing restricted to the two near-misses, nothing may match.
expect(
hasNvidiaCdiSpec(
["/etc/cdi"],
() => ["nvidia-extra.yaml", "notes.yaml"],
fs.readFileImpl,
),
).toBe(false);
});
});

// Covers detection of CDI specs whose recorded device numbers no longer match
// the live device nodes. The fake stat output is "<major> <minor>" and the
// expectations below read it as hexadecimal: "1f3 0" is reported as live=499:0.
describe("docker-cdi staleness detection", () => {
it("ignores stale lower-precedence /etc/cdi when /var/run/cdi is fresh", () => {
// /etc/cdi records major 498 (stale) while /var/run/cdi records 499 (matches live).
const fs = cdiFs({
"/etc/cdi/nvidia.yaml": specWithDeviceNodes(
" - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498",
),
"/var/run/cdi/nvidia.yaml": specWithDeviceNodes(
" - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 499",
),
});

expect(
findCdiDeviceNodeMismatch(
["/etc/cdi", "/var/run/cdi"],
fs.readdirImpl,
fs.readFileImpl,
statDevices({ "/dev/nvidia-uvm": "1f3 0" }),
),
).toBeNull();
});

it("flags stale /etc/cdi when no higher-precedence /var/run/cdi spec exists", () => {
// Only /etc/cdi has a spec, so its stale major 498 must be reported.
const fs = cdiFs({
"/etc/cdi/nvidia.yaml": specWithDeviceNodes(
" - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498",
),
});

const mismatch = findCdiDeviceNodeMismatch(
["/etc/cdi", "/var/run/cdi"],
fs.readdirImpl,
fs.readFileImpl,
statDevices({ "/dev/nvidia-uvm": "1f3 0" }),
);

// The message names the offending file, the recorded numbers, and the live numbers.
expect(mismatch).toContain("/etc/cdi/nvidia.yaml");
expect(mismatch).toContain("/dev/nvidia-uvm=498:0");
expect(mismatch).toContain("live=499:0");
});

it("flags stale /var/run/cdi when it is the effective spec", () => {
// Mirror of the first case: here /var/run/cdi is the stale one (498) and
// /etc/cdi is current (499); the /var/run/cdi file must still be flagged.
const fs = cdiFs({
"/etc/cdi/nvidia.yaml": specWithDeviceNodes(
" - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 499",
),
"/var/run/cdi/nvidia.yaml": specWithDeviceNodes(
" - path: /dev/nvidia-uvm\n hostPath: /dev/nvidia-uvm\n type: c\n major: 498",
),
});

const mismatch = findCdiDeviceNodeMismatch(
["/etc/cdi", "/var/run/cdi"],
fs.readdirImpl,
fs.readFileImpl,
statDevices({ "/dev/nvidia-uvm": "1f3 0" }),
);

expect(mismatch).toContain("/var/run/cdi/nvidia.yaml");
expect(mismatch).toContain("/dev/nvidia-uvm=498:0");
expect(mismatch).toContain("live=499:0");
});

// NOTE(review): in this fixture only /dev/nvidia-uvm drifts (498 recorded vs
// 0x1f3 = 499 live); /dev/nvidia0 is recorded as 195 and stat returns c3
// (0xc3 = 195), so the "non-uvm drift" half of the title does not appear to be
// exercised — confirm intent, or add a drifting non-uvm node.
it("defaults omitted minor to 0 and detects non-uvm drift", () => {
const fs = cdiFs({
"/etc/cdi/nvidia.yaml": specWithDeviceNodes(
" - path: /dev/nvidia-uvm\n type: c\n major: 498\n - path: /dev/nvidia0\n type: c\n major: 195\n minor: 0",
),
});

// The uvm node has no minor in the spec; the expected text shows it as ":0".
expect(
findCdiDeviceNodeMismatch(
["/etc/cdi"],
fs.readdirImpl,
fs.readFileImpl,
statDevices({ "/dev/nvidia-uvm": "1f3 0", "/dev/nvidia0": "c3 0" }),
),
).toContain("/dev/nvidia-uvm=498:0");
});

it("skips absent devices and accepts matching explicit minors", () => {
const fs = cdiFs({
"/etc/cdi/nvidia.yaml": specWithDeviceNodes(
" - path: /dev/nvidia1\n type: c\n major: 195\n minor: 1\n - path: /dev/nvidia-uvm-tools\n type: c\n major: 499\n minor: 1",
),
});

// /dev/nvidia1 yields empty stat output (treated as absent per the title);
// /dev/nvidia-uvm-tools reports 1f3 1, i.e. 499:1, matching the spec.
expect(
findCdiDeviceNodeMismatch(
["/etc/cdi"],
fs.readdirImpl,
fs.readFileImpl,
statDevices({ "/dev/nvidia1": "", "/dev/nvidia-uvm-tools": "1f3 1" }),
),
).toBeNull();
});

it("stats CDI hostPath instead of the container path when both are present", () => {
// The collected node's path must be the hostPath value, not the container path.
const nodes = collectCdiDeviceNodes(
{
deviceNodes: [
{ path: "/container/nvidia0", hostPath: "/dev/nvidia0", major: 196, minor: 0 },
],
},
"/etc/cdi/nvidia.yaml",
);
expect(nodes[0]).toMatchObject({ path: "/dev/nvidia0", major: 196, minor: 0 });
});
});

// Covers the builders that produce the shell commands shown to (or run for)
// the user: the missing-spec repair sequence and the two stale-spec guidance
// variants (with and without systemctl).
describe("docker-cdi remediation commands", () => {
  it("keeps missing-spec remediation on the direct-generation fallback path", () => {
    const commands = buildNvidiaCdiRepairCommands(
      { systemctlAvailable: true },
      "/etc/cdi/nvidia.yaml",
    );

    // Expected order: create the dir, enable and start the refresh service,
    // verify, then fall back to direct generation and verify again.
    expect(commands[0]).toBe("sudo mkdir -p '/etc/cdi'");
    expect(commands[1]).toBe(
      "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service",
    );
    expect(commands[2]).toBe("sudo systemctl start nvidia-cdi-refresh.service");
    expect(commands[3]).toContain("nvidia-ctk cdi list");
    expect(commands[4]).toContain("sudo nvidia-ctk cdi generate --output='/etc/cdi/nvidia.yaml'");
    expect(commands[5]).toContain("nvidia-ctk cdi list");
  });

  it("shell-quotes CDI repair paths in generated commands", () => {
    // A path with a space and a ';' must end up inside single quotes.
    const commands = buildNvidiaCdiRepairCommands(
      { systemctlAvailable: false },
      "/tmp/cdi dir/nvidia;bad.yaml",
    );

    expect(commands[0]).toBe("sudo mkdir -p '/tmp/cdi dir'");
    expect(commands[1]).toContain("--output='/tmp/cdi dir/nvidia;bad.yaml'");
  });

  it("shows stale-spec refresh commands with optional leftover removal only for /etc/cdi", () => {
    const leftoverCommands = buildStaleCdiWarnCommands("/etc/cdi/nvidia.yaml");
    const leftoverText = leftoverCommands.join("\n");

    expect(leftoverCommands[0]).toBe(
      "sudo systemctl enable --now nvidia-cdi-refresh.path nvidia-cdi-refresh.service",
    );
    expect(leftoverCommands[1]).toBe("sudo systemctl start nvidia-cdi-refresh.service");
    expect(leftoverCommands[2]).toContain("sudo rm -f '/etc/cdi/nvidia.yaml'");
    // Stale-spec guidance must not regenerate into /etc/cdi. The repair builder
    // shell-quotes its --output path (see the expectations above), so the
    // unquoted check on its own could never fail; assert the quoted form too.
    expect(leftoverText).not.toContain("--output=/etc/cdi");
    expect(leftoverText).not.toContain("--output='/etc/cdi");
    expect(leftoverText).not.toContain("nvidia-ctk cdi list");

    // When the flagged file is the service-managed spec, no removal is suggested.
    const serviceCommands = buildStaleCdiWarnCommands("/var/run/cdi/nvidia.yaml");
    expect(serviceCommands.some((command) => command.includes("rm -f"))).toBe(false);
  });

  it("shows manual stale-spec guidance without systemctl on non-systemd hosts", () => {
    const commands = buildStaleCdiManualWarnCommands("/etc/cdi/nvidia.yaml");
    const text = commands.join("\n");

    expect(text).toContain("/var/run/cdi/nvidia.yaml");
    expect(text).toContain("sudo rm -f '/etc/cdi/nvidia.yaml'");
    expect(text).not.toContain("systemctl");
    expect(text).not.toContain("nvidia-ctk cdi list");
  });

  it("shell-quotes stale leftover paths in displayed guidance", () => {
    expect(buildStaleCdiWarnCommands("/tmp/cdi dir/nvidia;bad.yaml").join("\n")).toContain(
      "sudo rm -f '/tmp/cdi dir/nvidia;bad.yaml'",
    );
    expect(buildStaleCdiManualWarnCommands("/tmp/cdi dir/nvidia;bad.yaml").join("\n")).toContain(
      "sudo rm -f '/tmp/cdi dir/nvidia;bad.yaml'",
    );
  });
});
Loading
Loading