File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -9,6 +9,24 @@ if [[ "${PSYCHE_MAIN_HOST:-}" != "" ]]; then
99 exec /bin/sidecar_entrypoint.sh
1010fi
1111
12+ # GPU diagnostics
13+ echo " === GPU DIAGNOSTICS ==="
14+ echo " LD_LIBRARY_PATH=${LD_LIBRARY_PATH:- <not set>} "
15+ echo " NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:- <not set>} "
16+ echo " NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:- <not set>} "
17+ ls /usr/local/nvidia/lib64/ 2> /dev/null && echo " Found /usr/local/nvidia/lib64" || echo " No /usr/local/nvidia/lib64"
18+ ls /dev/nvidia* 2> /dev/null || echo " No /dev/nvidia* devices found"
19+ echo " Host kernel driver version:"
20+ cat /proc/driver/nvidia/version 2> /dev/null || \
21+ cat /sys/module/nvidia/version 2> /dev/null || \
22+ echo " Cannot determine driver version"
23+ echo " Searching for libcuda in LD_LIBRARY_PATH dirs:"
24+ IFS=' :' read -ra _LIB_DIRS <<< " ${LD_LIBRARY_PATH:-}"
25+ for _dir in " ${_LIB_DIRS[@]} " ; do
26+ ls " $_dir " /libcuda* 2> /dev/null && echo " ^ in $_dir "
27+ done
28+ echo " === END DIAGNOSTICS ==="
29+
1230# Some sanity checks before starting
1331
1432if [[ " ${NVIDIA_DRIVER_CAPABILITIES:- } " == " " ]]; then
Original file line number Diff line number Diff line change 102102 ]
103103 ) ;
104104
105+ nvidiaUserSpaceLibs = lib . optionals pkgs . config . cudaSupport [
106+ ( ( pkgs . linuxPackages . nvidiaPackages . dc . override {
107+ libsOnly = true ;
108+ kernel = null ;
109+ } ) . overrideAttrs ( old : let version = "580.95.05" ; in {
110+ inherit version ;
111+ src = pkgs . fetchurl {
112+ url = "https://us.download.nvidia.com/tesla/${ version } /NVIDIA-Linux-x86_64-${ version } .run" ;
113+ sha256 = "sha256-hJ7w746EK5gGss3p8RwTA9VPGpp2lGfk5dlhsv4Rgqc=" ;
114+ } ;
115+ } ) )
116+ ] ;
117+
105118 dockerPackages = {
106119 docker-psyche-solana-client = pkgs . dockerTools . streamLayeredImage {
107120 name = "psyche-solana-client" ;
115128 coreutils
116129 stdenv . cc
117130 rdma-core
118- rustPackages . "psyche-solana-client"
131+ dockerTools . fakeNss
132+ dockerTools . usrBinEnv
133+ rustPackages . "psyche-solana-client" . unwrapped
119134 rustPackages . "psyche-centralized-client"
120135 rustPackages . "inference"
121136 rustPackages . "train"
@@ -128,15 +143,19 @@ let
128143 cp ${ ../docker/sidecar_entrypoint.sh } $out/bin/sidecar_entrypoint.sh
129144 chmod +x $out/bin/train_entrypoint.sh
130145 chmod +x $out/bin/sidecar_entrypoint.sh
146+ ln -s ${ bashInteractive } /bin/bash $out/bin/bash
147+ ln -s ${ bashInteractive } /bin/bash $out/bin/sh
131148 '' )
132149 ]
133- ++ cudaRuntimeLibs ;
150+ ++ cudaRuntimeLibs
151+ ++ nvidiaUserSpaceLibs ;
134152
135153 config = {
136154 Env = [
137155 "NVIDIA_DRIVER_CAPABILITIES=all"
138156 "NVIDIA_VISIBLE_DEVICES=all"
139- "LD_LIBRARY_PATH=${ lib . makeLibraryPath cudaRuntimeLibs } :/lib:/usr/lib"
157+ "LD_LIBRARY_PATH=${ lib . makeLibraryPath ( cudaRuntimeLibs ++ nvidiaUserSpaceLibs ) } :/usr/local/nvidia/lib64:/lib:/usr/lib"
158+ "PATH=${ lib . makeBinPath ( with pkgs ; [ bashInteractive coreutils findutils gnugrep ] ) } :/bin:/usr/bin"
140159 "LOGNAME=root"
141160 "TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor"
142161 "PYTHONUNBUFFERED=1"
Original file line number Diff line number Diff line change 531531 {
532532 nativeBuildInputs = [ pkgs . makeWrapper ] ;
533533 meta . mainProgram = package . meta . mainProgram ;
534+ passthru = { unwrapped = package ; } ;
534535 }
535536 ''
536537 mkdir -p $out/bin
Original file line number Diff line number Diff line change 4646
4747 config = {
4848 allowUnfree = true ;
49+ nvidia . acceptLicense = true ;
4950 metalSupport = lib . mkDefault false ;
5051 }
5152 // lib . optionalAttrs cudaSupported {
You can’t perform that action at this time.
0 commit comments