Skip to content

Commit 9be7d68

Browse files
committed
fix: bundle NVIDIA 580.95.05 userspace libs for Akash CDI GPU access
1 parent 5f397bb commit 9be7d68

4 files changed

Lines changed: 42 additions & 3 deletions

File tree

docker/train_entrypoint.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,24 @@ if [[ "${PSYCHE_MAIN_HOST:-}" != "" ]]; then
99
exec /bin/sidecar_entrypoint.sh
1010
fi
1111

12+
# GPU diagnostics
13+
echo "=== GPU DIAGNOSTICS ==="
14+
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-<not set>}"
15+
echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-<not set>}"
16+
echo "NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-<not set>}"
17+
ls /usr/local/nvidia/lib64/ 2>/dev/null && echo "Found /usr/local/nvidia/lib64" || echo "No /usr/local/nvidia/lib64"
18+
ls /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
19+
echo "Host kernel driver version:"
20+
cat /proc/driver/nvidia/version 2>/dev/null || \
21+
cat /sys/module/nvidia/version 2>/dev/null || \
22+
echo "Cannot determine driver version"
23+
echo "Searching for libcuda in LD_LIBRARY_PATH dirs:"
24+
IFS=':' read -ra _LIB_DIRS <<< "${LD_LIBRARY_PATH:-}"
25+
for _dir in "${_LIB_DIRS[@]}"; do
26+
ls "$_dir"/libcuda* 2>/dev/null && echo " ^ in $_dir"
27+
done
28+
echo "=== END DIAGNOSTICS ==="
29+
1230
# Some sanity checks before starting
1331

1432
if [[ "${NVIDIA_DRIVER_CAPABILITIES:-}" == "" ]]; then

nix/docker.nix

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,19 @@ let
102102
]
103103
);
104104

105+
nvidiaUserSpaceLibs = lib.optionals pkgs.config.cudaSupport [
106+
((pkgs.linuxPackages.nvidiaPackages.dc.override {
107+
libsOnly = true;
108+
kernel = null;
109+
}).overrideAttrs (old: let version = "580.95.05"; in {
110+
inherit version;
111+
src = pkgs.fetchurl {
112+
url = "https://us.download.nvidia.com/tesla/${version}/NVIDIA-Linux-x86_64-${version}.run";
113+
sha256 = "sha256-hJ7w746EK5gGss3p8RwTA9VPGpp2lGfk5dlhsv4Rgqc=";
114+
};
115+
}))
116+
];
117+
105118
dockerPackages = {
106119
docker-psyche-solana-client = pkgs.dockerTools.streamLayeredImage {
107120
name = "psyche-solana-client";
@@ -115,7 +128,9 @@ let
115128
coreutils
116129
stdenv.cc
117130
rdma-core
118-
rustPackages."psyche-solana-client"
131+
dockerTools.fakeNss
132+
dockerTools.usrBinEnv
133+
rustPackages."psyche-solana-client".unwrapped
119134
rustPackages."psyche-centralized-client"
120135
rustPackages."inference"
121136
rustPackages."train"
@@ -128,15 +143,19 @@ let
128143
cp ${../docker/sidecar_entrypoint.sh} $out/bin/sidecar_entrypoint.sh
129144
chmod +x $out/bin/train_entrypoint.sh
130145
chmod +x $out/bin/sidecar_entrypoint.sh
146+
ln -s ${bashInteractive}/bin/bash $out/bin/bash
147+
ln -s ${bashInteractive}/bin/bash $out/bin/sh
131148
'')
132149
]
133-
++ cudaRuntimeLibs;
150+
++ cudaRuntimeLibs
151+
++ nvidiaUserSpaceLibs;
134152

135153
config = {
136154
Env = [
137155
"NVIDIA_DRIVER_CAPABILITIES=all"
138156
"NVIDIA_VISIBLE_DEVICES=all"
139-
"LD_LIBRARY_PATH=${lib.makeLibraryPath cudaRuntimeLibs}:/lib:/usr/lib"
157+
"LD_LIBRARY_PATH=${lib.makeLibraryPath (cudaRuntimeLibs ++ nvidiaUserSpaceLibs)}:/usr/local/nvidia/lib64:/lib:/usr/lib"
158+
"PATH=${lib.makeBinPath (with pkgs; [ bashInteractive coreutils findutils gnugrep ])}:/bin:/usr/bin"
140159
"LOGNAME=root"
141160
"TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor"
142161
"PYTHONUNBUFFERED=1"

nix/lib.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ let
531531
{
532532
nativeBuildInputs = [ pkgs.makeWrapper ];
533533
meta.mainProgram = package.meta.mainProgram;
534+
passthru = { unwrapped = package; };
534535
}
535536
''
536537
mkdir -p $out/bin

nix/nixpkgs.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ in
4646

4747
config = {
4848
allowUnfree = true;
49+
nvidia.acceptLicense = true;
4950
metalSupport = lib.mkDefault false;
5051
}
5152
// lib.optionalAttrs cudaSupported {

0 commit comments

Comments
 (0)