Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 10 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,24 @@ The model is trained from a random initialization until convergence, which is de
1. Clone the repository:
`git clone https://github.com/LBANN/ScaFFold.git && cd ScaFFold`

1. Build the ccl plugin (if not using WCI wheel)
`. scripts/install-rccl.sh`

1. Create and activate a python venv for running the benchmark:
`ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv && source .venvs/scaffoldvenv/bin/activate && pip install --upgrade pip`

1. Necessary LLNL settings:
- CUDA (matrix):
1. `ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7`
1. `ml cuda/12.9.1 gcc/13.3.1 mvapich2/2.3.7`
1. `export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH`
- ROCm (elcap):
1. `ml load rocm/6.4.2 rccl/fast-env-slows-mpi`
- If using generic wheel:
1. `export LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH`
1. `export LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH` # Necessary to use libfabric plugin (Only necessary if using generic install, wci already links correctly)
1. `ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi`
- If using WCI wheel:
1. `export LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce-clang/x86_64/lib/:$LD_LIBRARY_PATH` # for libomp.so
1. `export SPINDLE_FLUXOPT=off` # Avoid spindle error
1. `export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so` # for libomp.so

1. Install the benchmark in the python venv:
- CUDA: `pip install --no-binary=mpi4py .[cuda] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/cu126 2>&1 | tee install.log`
- ROCm (generic): `pip install --no-binary=mpi4py .[rocm] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/rocm6.4 2>&1 | tee install.log`
- CUDA: `pip install --no-binary=mpi4py .[cuda] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/cu129 2>&1 | tee install.log`
- ROCm (generic): `pip install --no-binary=mpi4py .[rocm] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/rocm7.1 2>&1 | tee install.log`
- ROCm (LLNL): `pip install .[rocmwci] --prefix=.venvs/scaffoldvenv 2>&1 | tee install.log`


Expand Down Expand Up @@ -227,8 +226,8 @@ make && make install
git clone https://github.com/LLNL/Caliper.git
cd Caliper
mkdir pybuild && cd pybuild
ml rocm/6.4.0
ml cuda/12.6.0
ml rocm/7.1.0
ml cuda/12.9.1
cmake -DWITH_PYTHON_BINDINGS=ON \
-DWITH_ROCPROFILER=ON \
-DWITH_CUPTI=ON \
Expand Down
4 changes: 2 additions & 2 deletions ScaFFold/utils/create_restart_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _get_env_setup() -> str:
# --- Begin Environment Setup ---
# Load Modules
if command -v module &> /dev/null; then
module load rocm/6.4.2 rccl/fast-env-slows-mpi
ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi
fi

# Activate Virtual Environment
Expand All @@ -111,7 +111,7 @@ def _get_env_setup() -> str:

# Environment variables
export SPINDLE_FLUXOPT=off
export LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so
export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so

export PROFILE_TORCH=ON
# --- End Environment Setup ---
Expand Down
14 changes: 7 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ dependencies = [
"wandb>=0.19.6",
"open3d>=0.18.0",
"PyYAML>=6.0.2",
"distconv @ git+https://github.com/LBANN/DistConv.git",
"distconv @ git+https://github.com/LBANN/DistConv.git@232cba6",
]
requires-python = ">=3.9"

Expand All @@ -60,16 +60,16 @@ profiling = [
"pybind11>=3.0.0"
]
cuda = [
"torch==2.8.0+cu126",
"mpi4py==4.0.2",
"torch==2.10.0+cu129",
"mpi4py==4.1.1",
]
rocm = [
"torch==2.8.0+rocm6.4",
"mpi4py==4.0.2",
"torch==2.10.0+rocm7.1",
"mpi4py==4.1.1",
]
rocmwci = [
"torch==2.8.0+rocm642",
"mpi4py==4.1.1.dev0+mpich.8.1.32",
"torch==2.10.0+rocm710",
"mpi4py==4.1.1+mpich.9.1.0",
]

[project.entry-points.console_scripts]
Expand Down
16 changes: 2 additions & 14 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,5 @@ tqdm>=4.67.1
wandb>=0.19.6
open3d>=0.18.0
PyYAML>=6.0.2
mpi4py==4.0.2 --no-binary mpi4py
distconv @ git+https://github.com/LBANN/DistConv.git

# cuda
# torch==2.7.1+cu126
# torchvision==0.22.1+cu126
# torchaudio==2.7.1+cu126
# --extra-index-url https://download.pytorch.org/whl/cu126

# rocm
# torch==2.8.0+rocm6.
# torchvision==0.23.0+rocm6.4
# torchaudio==2.8.0+rocm6.4
# --extra-index-url https://download.pytorch.org/whl/test/rocm6.4
mpi4py==4.1.1 --no-binary mpi4py
distconv @ git+https://github.com/LBANN/DistConv.git@232cba6
4 changes: 2 additions & 2 deletions scripts/install-matrix.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-matrix && source .venvs/scaffoldvenv-matrix/bin/activate && pip install --upgrade pip
ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7
ml cuda/12.9.1 gcc/13.3.1 mvapich2/2.3.7
export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
pip install --no-binary=mpi4py -e .[cuda] --prefix=.venvs/scaffoldvenv-matrix --extra-index-url https://download.pytorch.org/whl/cu126 2>&1 | tee install.log
pip install --no-binary=mpi4py -e .[cuda] --prefix=.venvs/scaffoldvenv-matrix --extra-index-url https://download.pytorch.org/whl/cu129 2>&1 | tee install.log
31 changes: 31 additions & 0 deletions scripts/install-rccl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
#
# Clone aws-ofi-nccl v1.18.0 into ./aws-ofi-nccl.git (relative to the directory
# this script is invoked from), build it against ROCm and libfabric, and
# install it into ./aws-ofi-nccl.git/install.
#
# The script may be sourced (". scripts/install-rccl.sh") or executed. Every
# bail-out therefore uses "return ... || exit ...", and the starting directory
# is restored at the end so that follow-up commands using paths relative to it
# keep working in the caller's shell.
#
# Status: 0 on success, non-zero if the target directory already exists or any
# clone/build step fails.

# Exit if target directory already exists
if [ -d "aws-ofi-nccl.git" ]; then
  echo "Directory 'aws-ofi-nccl.git' already exists. Exiting to avoid overwrite."
  return 1 2>/dev/null || exit 1
fi

rocm_version=7.1.0

module swap PrgEnv-cray PrgEnv-gnu
module load "rocm/$rocm_version"

# Stop if the clone fails; otherwise the build steps below would run in
# whatever directory the caller happens to be in.
if ! git clone --recursive --branch v1.18.0 https://github.com/aws/aws-ofi-nccl.git aws-ofi-nccl.git; then
  echo "Failed to clone aws-ofi-nccl." >&2
  return 1 2>/dev/null || exit 1
fi

# Remember where we started so the caller's working directory can be restored.
rccl_start_dir=$PWD

if ! cd aws-ofi-nccl.git; then
  echo "Cannot enter aws-ofi-nccl.git." >&2
  return 1 2>/dev/null || exit 1
fi

installdir=$PWD/install

./autogen.sh
rccl_build_status=$?

export LD_LIBRARY_PATH="$PWD/../rccl/install/lib:/opt/rocm-$rocm_version/lib:$LD_LIBRARY_PATH"

# Only continue with configure/make/install while every previous step worked.
if [ "$rccl_build_status" -eq 0 ]; then
  #CC=hipcc CXX=hipcc CFLAGS=-I$PWD/../rccl/install/include/rccl ./configure \
  # Fall back to the versioned /opt path (the same one used for
  # LD_LIBRARY_PATH above) if the rocm module did not set ROCM_PATH.
  ./configure \
    --with-libfabric=/opt/cray/libfabric/2.1 \
    --with-rocm="${ROCM_PATH:-/opt/rocm-$rocm_version}" \
    --prefix="$installdir" &&
    make &&
    make install
  rccl_build_status=$?
fi

# Return to the starting directory (matters when this file is sourced).
cd "$rccl_start_dir" || rccl_build_status=1

return "$rccl_build_status" 2>/dev/null || exit "$rccl_build_status"
4 changes: 4 additions & 0 deletions scripts/install-tuolumne-torchpypi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Build the ccl plugin, then create a python venv and install ScaFFold with the
# generic ROCm torch wheel from the PyTorch index.
# The venv path and "pip install -e ." are relative to the current directory,
# so run this from the repository root.

# Source install-rccl.sh from the directory this script lives in. A bare
# ". install-rccl.sh" is looked up via PATH and the current directory, so it is
# not found when this script is started from the repository root.
scaffold_repo_dir=$PWD
. "$(dirname "${BASH_SOURCE[0]}")/install-rccl.sh"

# install-rccl.sh changes into its build tree; go back so the relative paths
# below resolve against the directory we started in.
if ! cd "$scaffold_repo_dir"; then
  echo "Cannot return to $scaffold_repo_dir" >&2
  return 1 2>/dev/null || exit 1
fi

ml load python/3.11.5 \
  && python3 -m venv .venvs/scaffoldvenv-tuo-pypi \
  && source .venvs/scaffoldvenv-tuo-pypi/bin/activate \
  && pip install --upgrade pip
ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi

# Quote the extras spec: unquoted, ".[rocm]" is a glob pattern that the shell
# would expand if a file named .r, .o, .c or .m exists in this directory.
pip install -e '.[rocm]' --prefix=.venvs/scaffoldvenv-tuo-pypi --extra-index-url https://download.pytorch.org/whl/rocm7.1 2>&1 | tee install.log
24 changes: 23 additions & 1 deletion scripts/install-tuolumne.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo && source .venvs/scaffoldvenv-tuo/bin/activate && pip install --upgrade pip
ml load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric
ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi
pip install -e .[rocmwci] --prefix=.venvs/scaffoldvenv-tuo 2>&1 | tee install.log
# Needed until new wheel exists for torch using mpich 9.1.0
#
# Rewrites the DT_NEEDED entry $OLD to $NEW in every shared object inside the
# venv's torch/lib directory, then re-scans the directory and lists any file
# that still references $OLD. The script's final status is non-zero if any
# such file remains.
TORCH_LIB_DIR=".venvs/scaffoldvenv-tuo/lib/python3.11/site-packages/torch/lib"
OLD="libmpi_gnu_112.so.12"
NEW="libmpi_gnu.so.12"

# The checks below discard patchelf's stderr, so without this guard a missing
# patchelf would patch nothing and still report a clean verification.
if ! command -v patchelf >/dev/null 2>&1; then
  echo "patchelf not found; cannot patch torch libraries" >&2
  exit 1
fi

cd "$TORCH_LIB_DIR" || exit 1

# Succeeds when file $1 lists exactly $OLD among its DT_NEEDED entries.
needs_old_mpi() {
  patchelf --print-needed "$1" 2>/dev/null | grep -Fxq -- "$OLD"
}

# Patch every file that has OLD in its DT_NEEDED
for f in *.so*; do
  [ -f "$f" ] || continue

  if needs_old_mpi "$f"; then
    echo "Patching $f"
    patchelf --replace-needed "$OLD" "$NEW" "$f"
  fi
done

echo
echo "Verification (should show no $OLD):"
still_linked=0
for f in *.so*; do
  [ -f "$f" ] || continue
  if needs_old_mpi "$f"; then
    echo "STILL NEEDS $OLD -> $f"
    still_linked=$((still_linked + 1))
  fi
done

# Last command: its status becomes the script's status, so leftovers are
# reported as a failure without calling exit.
[ "$still_linked" -eq 0 ]
Comment on lines +4 to +25
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will go away once an updated wheel is provided, hopefully soon.

2 changes: 0 additions & 2 deletions scripts/scaffold-matrix.job
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
#SBATCH -A fractale
#SBATCH -perl

ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7

. .venvs/scaffoldvenv-matrix/bin/activate

export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
Expand Down
24 changes: 24 additions & 0 deletions scripts/scaffold-tuolumne-torchpypi.job
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# flux: --exclusive
# flux: -N 1
# flux: -g=1
# flux: -t 60m
# flux: -qpdebug
# flux: -B fractale

# Job script: load the toolchain modules, activate the scaffoldvenv-tuo-pypi
# venv, generate the fractal dataset with one process, then run the benchmark
# with four processes (one GPU each) using the default benchmark config.
# The venv and config paths are relative to the directory the job starts in.

ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi

. .venvs/scaffoldvenv-tuo-pypi/bin/activate

# Use ccl plugin that we manually built with install-rccl.sh
# NOTE(review): this path is relative to the job's working directory and points
# one level above it; confirm it matches where install-rccl.sh was run.
export NCCL_NET_PLUGIN=../aws-ofi-nccl.git/install/lib/librccl-net.so
export NCCL_NET="AWS Libfabric"

# Resolve the entry point once and fail early if it is missing; an empty
# result would otherwise silently shift the torchrun-hpc arguments.
scaffold_bin=$(command -v scaffold) || {
  echo "scaffold entry point not found; is the venv installed and activated?" >&2
  exit 1
}
# Quoted wherever it is used, so a working directory containing whitespace
# still yields a single argument.
config_file="$PWD/ScaFFold/configs/benchmark_default.yml"

torchrun-hpc -N 1 -n 1 "$scaffold_bin" generate_fractals -c "$config_file"

# Uncomment if you want torch profiling
#export PROFILE_TORCH=ON

torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 "$scaffold_bin" benchmark -c "$config_file"
#torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 "$scaffold_bin" benchmark -c "$config_file"
11 changes: 6 additions & 5 deletions scripts/scaffold-tuolumne.job
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@
# flux: -qpdebug
# flux: -B fractale

ml rocm/6.4.2 rccl/fast-env-slows-mpi
ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi

. .venvs/scaffoldvenv-tuo/bin/activate

# Avoid spindle error
export SPINDLE_FLUXOPT=off
# (1) Avoid libmagma error
# (2) Removing libmpi may cause segfault on mpi4py import
export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12"
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The second path will go away once an updated wheel is provided, hopefully soon.


# Avoid libmagma error
export LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so
# Ensure using libfabric. NCCL_NET_PLUGIN should be unnecessary to set for WCI wheel.
export NCCL_NET="AWS Libfabric"

torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml

Expand Down