diff --git a/README.md b/README.md index cdfead8..63a4649 100644 --- a/README.md +++ b/README.md @@ -29,25 +29,24 @@ The model is trained from a random initialization until convergence, which is de 1. Clone the repository: `git clone https://github.com/LBANN/ScaFFold.git && cd ScaFFold` +1. Build the ccl plugin (if not using WCI wheel) + `. scripts/install-rccl.sh` + 1. Create and activate a python venv for running the benchmark: `ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv && source .venvs/scaffoldvenv/bin/activate && pip install --upgrade pip` 1. Necessary LLNL settings: - CUDA (matrix): - 1. `ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7` + 1. `ml cuda/12.9.1 gcc/13.3.1 mvapich2/2.3.7` 1. `export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH` - ROCm (elcap): - 1. `ml load rocm/6.4.2 rccl/fast-env-slows-mpi` - - If using generic wheel: - 1. `export LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH` - 1. `export LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH` # Necessary to use libfabric plugin (Only necessary if using generic install, wci already links correctly) + 1. `ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi` - If using WCI wheel: - 1. `export LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce-clang/x86_64/lib/:$LD_LIBRARY_PATH` # for libomp.so - 1. `export SPINDLE_FLUXOPT=off` # Avoid spindle error + 1. `export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so` # for libomp.so 1. 
Install the benchmark in the python venv: - - CUDA: `pip install --no-binary=mpi4py .[cuda] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/cu126 2>&1 | tee install.log` - - ROCm (generic): `pip install --no-binary=mpi4py .[rocm] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/rocm6.4 2>&1 | tee install.log` + - CUDA: `pip install --no-binary=mpi4py .[cuda] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/cu129 2>&1 | tee install.log` + - ROCm (generic): `pip install --no-binary=mpi4py .[rocm] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/rocm7.1 2>&1 | tee install.log` - ROCm (LLNL): `pip install .[rocmwci] --prefix=.venvs/scaffoldvenv 2>&1 | tee install.log` @@ -227,8 +226,8 @@ make && make install git clone https://github.com/LLNL/Caliper.git cd Caliper mkdir pybuild && cd pybuild -ml rocm/6.4.0 -ml cuda/12.6.0 +ml rocm/7.1.0 +ml cuda/12.9.1 cmake -DWITH_PYTHON_BINDINGS=ON \ -DWITH_ROCPROFILER=ON \ -DWITH_CUPTI=ON \ diff --git a/ScaFFold/utils/create_restart_script.py b/ScaFFold/utils/create_restart_script.py index a4bd618..27a892a 100644 --- a/ScaFFold/utils/create_restart_script.py +++ b/ScaFFold/utils/create_restart_script.py @@ -98,7 +98,7 @@ def _get_env_setup() -> str: # --- Begin Environment Setup --- # Load Modules if command -v module &> /dev/null; then - module load rocm/6.4.2 rccl/fast-env-slows-mpi + ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi fi # Activate Virtual Environment @@ -111,7 +111,7 @@ def _get_env_setup() -> str: # Environment variables export SPINDLE_FLUXOPT=off -export LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so +export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so export PROFILE_TORCH=ON # --- End Environment Setup --- diff --git a/pyproject.toml b/pyproject.toml index b715d47..1caa3a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ "wandb>=0.19.6", 
"open3d>=0.18.0", "PyYAML>=6.0.2", - "distconv @ git+https://github.com/LBANN/DistConv.git", + "distconv @ git+https://github.com/LBANN/DistConv.git@232cba6", ] requires-python = ">=3.9" @@ -60,16 +60,16 @@ profiling = [ "pybind11>=3.0.0" ] cuda = [ - "torch==2.8.0+cu126", - "mpi4py==4.0.2", + "torch==2.10.0+cu129", + "mpi4py==4.1.1", ] rocm = [ - "torch==2.8.0+rocm6.4", - "mpi4py==4.0.2", + "torch==2.10.0+rocm7.1", + "mpi4py==4.1.1", ] rocmwci = [ - "torch==2.8.0+rocm642", - "mpi4py==4.1.1.dev0+mpich.8.1.32", + "torch==2.10.0+rocm710", + "mpi4py==4.1.1+mpich.9.1.0", ] [project.entry-points.console_scripts] diff --git a/requirements.txt b/requirements.txt index 8af677c..8361868 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,17 +7,5 @@ tqdm>=4.67.1 wandb>=0.19.6 open3d>=0.18.0 PyYAML>=6.0.2 -mpi4py==4.0.2 --no-binary mpi4py -distconv @ git+https://github.com/LBANN/DistConv.git - -# cuda -# torch==2.7.1+cu126 -# torchvision==0.22.1+cu126 -# torchaudio==2.7.1+cu126 -# --extra-index-url https://download.pytorch.org/whl/cu126 - -# rocm -# torch==2.8.0+rocm6. 
-# torchvision==0.23.0+rocm6.4 -# torchaudio==2.8.0+rocm6.4 -# --extra-index-url https://download.pytorch.org/whl/test/rocm6.4 +mpi4py==4.1.1 --no-binary mpi4py +distconv @ git+https://github.com/LBANN/DistConv.git@232cba6 diff --git a/scripts/install-matrix.sh b/scripts/install-matrix.sh index f8c7e61..15c4e6d 100644 --- a/scripts/install-matrix.sh +++ b/scripts/install-matrix.sh @@ -1,4 +1,4 @@ ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-matrix && source .venvs/scaffoldvenv-matrix/bin/activate && pip install --upgrade pip -ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7 +ml cuda/12.9.1 gcc/13.3.1 mvapich2/2.3.7 export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH -pip install --no-binary=mpi4py -e .[cuda] --prefix=.venvs/scaffoldvenv-matrix --extra-index-url https://download.pytorch.org/whl/cu126 2>&1 | tee install.log +pip install --no-binary=mpi4py -e .[cuda] --prefix=.venvs/scaffoldvenv-matrix --extra-index-url https://download.pytorch.org/whl/cu129 2>&1 | tee install.log diff --git a/scripts/install-rccl.sh b/scripts/install-rccl.sh new file mode 100644 index 0000000..306486a --- /dev/null +++ b/scripts/install-rccl.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Exit if target directory already exists +if [ -d "aws-ofi-nccl.git" ]; then + echo "Directory 'aws-ofi-nccl.git' already exists. Exiting to avoid overwrite." 
+ return 1 2>/dev/null || exit 1 +fi + +rocm_version=7.1.0 + +module swap PrgEnv-cray PrgEnv-gnu +module load rocm/$rocm_version + +git clone --recursive --branch v1.18.0 https://github.com/aws/aws-ofi-nccl.git aws-ofi-nccl.git + +cd aws-ofi-nccl.git + +installdir=$(pwd)/install + +./autogen.sh + +export LD_LIBRARY_PATH=$PWD/../rccl/install/lib:/opt/rocm-$rocm_version/lib:$LD_LIBRARY_PATH + +#CC=hipcc CXX=hipcc CFLAGS=-I$PWD/../rccl/install/include/rccl ./configure \ +./configure \ + --with-libfabric=/opt/cray/libfabric/2.1 \ + --with-rocm=$ROCM_PATH \ + --prefix=$installdir + +make +make install \ No newline at end of file diff --git a/scripts/install-tuolumne-torchpypi.sh b/scripts/install-tuolumne-torchpypi.sh new file mode 100644 index 0000000..87c8473 --- /dev/null +++ b/scripts/install-tuolumne-torchpypi.sh @@ -0,0 +1,4 @@ +. install-rccl.sh +ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo-pypi && source .venvs/scaffoldvenv-tuo-pypi/bin/activate && pip install --upgrade pip +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi +pip install -e .[rocm] --prefix=.venvs/scaffoldvenv-tuo-pypi --extra-index-url https://download.pytorch.org/whl/rocm7.1 2>&1 | tee install.log diff --git a/scripts/install-tuolumne.sh b/scripts/install-tuolumne.sh index 3e03de3..d8f5da1 100644 --- a/scripts/install-tuolumne.sh +++ b/scripts/install-tuolumne.sh @@ -1,3 +1,25 @@ ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo && source .venvs/scaffoldvenv-tuo/bin/activate && pip install --upgrade pip -ml load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi pip install -e .[rocmwci] --prefix=.venvs/scaffoldvenv-tuo 2>&1 | tee install.log +# Needed until new wheel exists for torch using mpich 9.1.0 +TORCH_LIB_DIR=".venvs/scaffoldvenv-tuo/lib/python3.11/site-packages/torch/lib" +OLD="libmpi_gnu_112.so.12" +NEW="libmpi_gnu.so.12" +cd "$TORCH_LIB_DIR" || exit 1 +# Patch every file 
that has OLD in its DT_NEEDED +for f in *.so*; do + [ -f "$f" ] || continue + + if patchelf --print-needed "$f" 2>/dev/null | grep -Fxq "$OLD"; then + echo "Patching $f" + patchelf --replace-needed "$OLD" "$NEW" "$f" + fi +done +echo +echo "Verification (should show no $OLD):" +for f in *.so*; do + [ -f "$f" ] || continue + if patchelf --print-needed "$f" 2>/dev/null | grep -Fxq "$OLD"; then + echo "STILL NEEDS $OLD -> $f" + fi +done \ No newline at end of file diff --git a/scripts/scaffold-matrix.job b/scripts/scaffold-matrix.job index 65f2e85..d194aa8 100644 --- a/scripts/scaffold-matrix.job +++ b/scripts/scaffold-matrix.job @@ -8,8 +8,6 @@ #SBATCH -A fractale #SBATCH -perl -ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7 - . .venvs/scaffoldvenv-matrix/bin/activate export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job new file mode 100644 index 0000000..3b25274 --- /dev/null +++ b/scripts/scaffold-tuolumne-torchpypi.job @@ -0,0 +1,24 @@ +#!/bin/bash + +# flux: --exclusive +# flux: -N 1 +# flux: -g=1 +# flux: -t 60m +# flux: -qpdebug +# flux: -B fractale + +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi + +. 
.venvs/scaffoldvenv-tuo-pypi/bin/activate + +# Use ccl plugin that we manually built with install-rccl.sh +export NCCL_NET_PLUGIN=../aws-ofi-nccl.git/install/lib/librccl-net.so +export NCCL_NET="AWS Libfabric" + +torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml + +# Uncomment if you want torch profiling +#export PROFILE_TORCH=ON + +torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml +#torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job index 4eb3715..79604f7 100644 --- a/scripts/scaffold-tuolumne.job +++ b/scripts/scaffold-tuolumne.job @@ -7,15 +7,16 @@ # flux: -qpdebug # flux: -B fractale -ml rocm/6.4.2 rccl/fast-env-slows-mpi +ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi . .venvs/scaffoldvenv-tuo/bin/activate -# Avoid spindle error -export SPINDLE_FLUXOPT=off +# (1) Avoid libmagma error +# (2) Removing libmpi may cause segfault on mpi4py import +export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12" -# Avoid libmagma error -export LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so +# Ensure using libfabric. NCCL_NET_PLUGIN should be unnecessary to set for WCI wheel. +export NCCL_NET="AWS Libfabric" torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml