LBANN · PatrickRMiles · Mar 5, 2026 · Feb 12, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/README.md b/README.md
@@ -29,25 +29,24 @@ The model is trained from a random initialization until convergence, which is de
 1. Clone the repository:  
     `git clone https://github.com/LBANN/ScaFFold.git && cd ScaFFold`
 
+1. Build the aws-ofi-nccl (RCCL) network plugin (skip if using the WCI wheel):  
+    `. scripts/install-rccl.sh`
+
 1. Create and activate a python venv for running the benchmark:  
     `ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv && source .venvs/scaffoldvenv/bin/activate && pip install --upgrade pip`
 
 1. Necessary LLNL settings:
     - CUDA (matrix):
-        1. `ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7`
+        1. `ml cuda/12.9.1 gcc/13.3.1 mvapich2/2.3.7`
         1. `export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH`
     - ROCm (elcap):
-        1. `ml load rocm/6.4.2 rccl/fast-env-slows-mpi`
-            - If using generic wheel:
-                1. `export LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH`
-                1. `export LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH` # Necessary to use libfabric plugin (Only necessary if using generic install, wci already links correctly)
+        1. `ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi`
             - If using WCI wheel:
-                1. `export LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce-clang/x86_64/lib/:$LD_LIBRARY_PATH` # for libomp.so
-                1. `export SPINDLE_FLUXOPT=off` # Avoid spindle error
+                1. `export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so` # for libomp.so
 
 1. Install the benchmark in the python venv:
-    - CUDA: `pip install --no-binary=mpi4py .[cuda] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/cu126 2>&1 | tee install.log`
-    - ROCm (generic): `pip install --no-binary=mpi4py .[rocm] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/rocm6.4 2>&1 | tee install.log`
+    - CUDA: `pip install --no-binary=mpi4py .[cuda] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/cu129 2>&1 | tee install.log`
+    - ROCm (generic): `pip install --no-binary=mpi4py .[rocm] --prefix=.venvs/scaffoldvenv --extra-index-url https://download.pytorch.org/whl/rocm7.1 2>&1 | tee install.log`
     - ROCm (LLNL): `pip install .[rocmwci] --prefix=.venvs/scaffoldvenv 2>&1 | tee install.log`
 
 
@@ -227,8 +226,8 @@ make && make install
 git clone https://github.com/LLNL/Caliper.git
 cd Caliper
 mkdir pybuild && cd pybuild
-ml rocm/6.4.0
-ml cuda/12.6.0
+ml rocm/7.1.0
+ml cuda/12.9.1
 cmake -DWITH_PYTHON_BINDINGS=ON \
    -DWITH_ROCPROFILER=ON \
    -DWITH_CUPTI=ON \

diff --git a/ScaFFold/utils/create_restart_script.py b/ScaFFold/utils/create_restart_script.py
@@ -98,7 +98,7 @@ def _get_env_setup() -> str:
 # --- Begin Environment Setup ---
 # Load Modules
 if command -v module &> /dev/null; then
-    module load rocm/6.4.2 rccl/fast-env-slows-mpi
+    ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi
 fi
 
 # Activate Virtual Environment
@@ -111,7 +111,7 @@ def _get_env_setup() -> str:
 
 # Environment variables
 export SPINDLE_FLUXOPT=off
-export LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so
+export LD_PRELOAD=/opt/rocm-7.1.0/llvm/lib/libomp.so
 
 export PROFILE_TORCH=ON
 # --- End Environment Setup ---

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,7 +51,7 @@ dependencies = [
   "wandb>=0.19.6",
   "open3d>=0.18.0",
   "PyYAML>=6.0.2",
-  "distconv @ git+https://github.com/LBANN/DistConv.git",
+  "distconv @ git+https://github.com/LBANN/DistConv.git@232cba6",
 ]
 requires-python = ">=3.9"
 
@@ -60,16 +60,16 @@ profiling = [
   "pybind11>=3.0.0"
 ]
 cuda = [
-    "torch==2.8.0+cu126",
-    "mpi4py==4.0.2",
+    "torch==2.10.0+cu129",
+    "mpi4py==4.1.1",
 ]
 rocm = [
-    "torch==2.8.0+rocm6.4",
-    "mpi4py==4.0.2",
+    "torch==2.10.0+rocm7.1",
+    "mpi4py==4.1.1",
 ]
 rocmwci = [
-    "torch==2.8.0+rocm642",
-    "mpi4py==4.1.1.dev0+mpich.8.1.32",
+    "torch==2.10.0+rocm710",
+    "mpi4py==4.1.1+mpich.9.1.0",
 ]
 
 [project.entry-points.console_scripts]

diff --git a/requirements.txt b/requirements.txt
@@ -7,17 +7,5 @@ tqdm>=4.67.1
 wandb>=0.19.6
 open3d>=0.18.0
 PyYAML>=6.0.2
-mpi4py==4.0.2 --no-binary mpi4py
-distconv @ git+https://github.com/LBANN/DistConv.git
-
-# cuda
-#     torch==2.7.1+cu126
-#     torchvision==0.22.1+cu126
-#     torchaudio==2.7.1+cu126
-# --extra-index-url https://download.pytorch.org/whl/cu126
-
-# rocm
-#     torch==2.8.0+rocm6.
-#     torchvision==0.23.0+rocm6.4
-#     torchaudio==2.8.0+rocm6.4
-# --extra-index-url https://download.pytorch.org/whl/test/rocm6.4
+mpi4py==4.1.1 --no-binary mpi4py
+distconv @ git+https://github.com/LBANN/DistConv.git@232cba6
diff --git a/scripts/install-matrix.sh b/scripts/install-matrix.sh
@@ -1,4 +1,4 @@
 ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-matrix && source .venvs/scaffoldvenv-matrix/bin/activate && pip install --upgrade pip
-ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7
+ml cuda/12.9.1 gcc/13.3.1 mvapich2/2.3.7
 export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
-pip install --no-binary=mpi4py -e .[cuda] --prefix=.venvs/scaffoldvenv-matrix --extra-index-url https://download.pytorch.org/whl/cu126 2>&1 | tee install.log
+pip install --no-binary=mpi4py -e .[cuda] --prefix=.venvs/scaffoldvenv-matrix --extra-index-url https://download.pytorch.org/whl/cu129 2>&1 | tee install.log
diff --git a/scripts/install-rccl.sh b/scripts/install-rccl.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#
+# Build the aws-ofi-nccl network plugin against ROCm.
+#
+# Clones aws-ofi-nccl v1.18.0 into ./aws-ofi-nccl.git and installs it under
+# ./aws-ofi-nccl.git/install. Intended to be sourced from the directory that
+# should hold the checkout, e.g. `. scripts/install-rccl.sh`.
+#
+# Every early exit uses `return ... || exit ...` so it works both when sourced
+# and when executed, and the caller's working directory is restored afterwards.
+
+# Exit if target directory already exists
+if [ -d "aws-ofi-nccl.git" ]; then
+  echo "Directory 'aws-ofi-nccl.git' already exists. Exiting to avoid overwrite."
+  return 1 2>/dev/null || exit 1
+fi
+
+rocm_version=7.1.0
+
+module swap PrgEnv-cray PrgEnv-gnu
+module load rocm/$rocm_version
+
+if ! git clone --recursive --branch v1.18.0 https://github.com/aws/aws-ofi-nccl.git aws-ofi-nccl.git; then
+  echo "Failed to clone aws-ofi-nccl." >&2
+  return 1 2>/dev/null || exit 1
+fi
+
+# Remember the starting directory: when sourced, a bare `cd` would otherwise
+# leave the caller's shell inside the checkout.
+rccl_start_dir=$PWD
+
+if ! cd aws-ofi-nccl.git; then
+  echo "Cannot enter aws-ofi-nccl.git." >&2
+  return 1 2>/dev/null || exit 1
+fi
+
+installdir=$(pwd)/install
+
+rccl_build_status=0
+./autogen.sh || rccl_build_status=$?
+
+export LD_LIBRARY_PATH=$PWD/../rccl/install/lib:/opt/rocm-$rocm_version/lib:$LD_LIBRARY_PATH
+
+# Stop at the first failing step instead of running make after a failed configure.
+if [ "$rccl_build_status" -eq 0 ]; then
+  #CC=hipcc CXX=hipcc CFLAGS=-I$PWD/../rccl/install/include/rccl ./configure \
+  ./configure \
+    --with-libfabric=/opt/cray/libfabric/2.1 \
+    --with-rocm="$ROCM_PATH" \
+    --prefix="$installdir" &&
+    make &&
+    make install
+  rccl_build_status=$?
+fi
+
+cd "$rccl_start_dir" || rccl_build_status=1
+
+if [ "$rccl_build_status" -ne 0 ]; then
+  echo "aws-ofi-nccl build failed." >&2
+  return "$rccl_build_status" 2>/dev/null || exit "$rccl_build_status"
+fi
diff --git a/scripts/install-tuolumne-torchpypi.sh b/scripts/install-tuolumne-torchpypi.sh
@@ -0,0 +1,9 @@
+# Build the RCCL network plugin, then create the PyPI-torch venv for Tuolumne.
+# Run from the repository root: the venv path and `pip install -e .` are relative to it.
+scaffold_root=$PWD
+. scripts/install-rccl.sh
+# Sourcing the plugin build may leave the shell inside its checkout; return to the repo root.
+cd "$scaffold_root" || return 1 2>/dev/null || exit 1
+ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo-pypi && source .venvs/scaffoldvenv-tuo-pypi/bin/activate && pip install --upgrade pip
+ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi
+pip install -e .[rocm] --prefix=.venvs/scaffoldvenv-tuo-pypi --extra-index-url https://download.pytorch.org/whl/rocm7.1 2>&1 | tee install.log
diff --git a/scripts/install-tuolumne.sh b/scripts/install-tuolumne.sh
@@ -1,3 +1,25 @@
 ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo && source .venvs/scaffoldvenv-tuo/bin/activate && pip install --upgrade pip
-ml load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric
+ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi
 pip install -e .[rocmwci] --prefix=.venvs/scaffoldvenv-tuo 2>&1 | tee install.log
+# Needed until new wheel exists for torch using mpich 9.1.0
+TORCH_LIB_DIR=".venvs/scaffoldvenv-tuo/lib/python3.11/site-packages/torch/lib"
+OLD="libmpi_gnu_112.so.12"
+NEW="libmpi_gnu.so.12"
+cd "$TORCH_LIB_DIR" || exit 1
+# Patch every file that has OLD in its DT_NEEDED
+for f in *.so*; do
+  [ -f "$f" ] || continue
+
+  if patchelf --print-needed "$f" 2>/dev/null | grep -Fxq "$OLD"; then
+    echo "Patching $f"
+    patchelf --replace-needed "$OLD" "$NEW" "$f"
+  fi
+done
+echo
+echo "Verification (should show no $OLD):"
+for f in *.so*; do
+  [ -f "$f" ] || continue
+  if patchelf --print-needed "$f" 2>/dev/null | grep -Fxq "$OLD"; then
+    echo "STILL NEEDS $OLD -> $f"
+  fi
+done
diff --git a/scripts/scaffold-matrix.job b/scripts/scaffold-matrix.job
@@ -8,8 +8,6 @@
 #SBATCH -A fractale
 #SBATCH -perl
 
-ml cuda/12.6.0 gcc/12.1.1 mvapich2/2.3.7
-
 . .venvs/scaffoldvenv-matrix/bin/activate
 
 export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH

diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# flux: --exclusive
+# flux: -N 1
+# flux: -g=1
+# flux: -t 60m
+# flux: -qpdebug
+# flux: -B fractale
+
+ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi
+
+. .venvs/scaffoldvenv-tuo-pypi/bin/activate
+
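+# Use the aws-ofi-nccl plugin built manually with install-rccl.sh. NOTE(review): this relative path expects the checkout one level above the job's working directory, while the README sources the script from the repo root — confirm.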
+export NCCL_NET_PLUGIN=../aws-ofi-nccl.git/install/lib/librccl-net.so
+export NCCL_NET="AWS Libfabric"
+
+torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml
+
+# Uncomment if you want torch profiling
+#export PROFILE_TORCH=ON
+
+torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
+#torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job
@@ -7,15 +7,16 @@
 # flux: -qpdebug
 # flux: -B fractale
 
-ml rocm/6.4.2 rccl/fast-env-slows-mpi
+ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi
 
 . .venvs/scaffoldvenv-tuo/bin/activate
 
-# Avoid spindle error
-export SPINDLE_FLUXOPT=off
+# (1) Preload libomp to avoid libmagma error
+# (2) Preload libmpi; removing it may cause a segfault on mpi4py import
+export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12"
 
-# Avoid libmagma error
-export LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so
+# Ensure libfabric is used. Setting NCCL_NET_PLUGIN should be unnecessary for the WCI wheel.
+export NCCL_NET="AWS Libfabric"
 
 torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml