Skip to content

Commit f425465

Browse files
ivanbasov and claude committed
Fix training script portability and documentation issues
- sbatch_train.sh: resolve REPO_ROOT from script location, not $(pwd)
- sbatch_train.sh: consolidate PREDECODER_DISABLE_SDR/TORCH_COMPILE defaults so Docker and bare-metal paths behave identically
- sbatch_train.sh: log message before chmod 1777; add --nodes=1 to multi-GPU examples
- cluster_install_deps.sh: arch-aware Miniconda URL (supports aarch64/ARM)
- cluster_install_deps.sh: single TORCH_CUDA default (remove redundant fallback)
- TRAINING.md: document SHARED_LOG_DIR; correct cluster defaults for SDR/compile vars
- conf/config_qec_decoder_r{9,13}_fp8.yaml: note that training hyperparams come from internal defaults, point to config_public.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 6925a23 commit f425465

5 files changed

Lines changed: 33 additions & 17 deletions

File tree

TRAINING.md

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -145,8 +145,8 @@ export CONFIG_NAME=config_qec_decoder_r13_fp8
145145
| `PREDECODER_TRAIN_SAMPLES` | config-defined | Samples per epoch. Bypasses auto-scaling when set explicitly. |
146146
| `PREDECODER_LR_MILESTONES` | config-defined | Comma-separated LR schedule milestone fractions (e.g. `0.25,0.5,1.0`). |
147147
| `PREDECODER_TIMING_RUN` | unset | Set `1` for timing/benchmarking mode (disables some overhead). |
148-
| `PREDECODER_TORCH_COMPILE` | unset | `0` to disable `torch.compile`, `1` to enable. |
149-
| `PREDECODER_DISABLE_SDR` | unset | `1` to skip Syndrome Density Reduction computation (saves time). |
148+
| `PREDECODER_TORCH_COMPILE` | `0` when run via `sbatch_train.sh`, otherwise unset | `0` to disable `torch.compile`, `1` to enable. |
149+
| `PREDECODER_DISABLE_SDR` | `1` when run via `sbatch_train.sh`, otherwise unset | `1` to skip Syndrome Density Reduction computation (saves time on cluster). |
150150
| `TORCH_COMPILE` | unset | Alternative way to control `torch.compile` (`0`/`1`). |
151151
| `TORCH_COMPILE_MODE` | unset | `default`, `reduce-overhead`, or `max-autotune`. |
152152

@@ -159,8 +159,9 @@ export CONFIG_NAME=config_qec_decoder_r13_fp8
159159
| `TORCH_CUDA` | `cu121` | PyTorch CUDA wheel tag (e.g. `cu121`, `cu124`, `cu130`). |
160160
| `DOCKER_IMAGE` | `predecoder-train` | Pre-built Docker image name. |
161161
| `DOCKER_BASE_IMAGE` | `nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04` | Fallback CUDA base image. |
162+
| `SHARED_LOG_DIR` | `$SHARED_OUTPUT_DIR/logs` | Override the logs root directory (advanced). |
162163
| `PREDECODER_BASE_OUTPUT_DIR` | `$SHARED_OUTPUT_DIR/outputs` | Override the outputs root (advanced). |
163-
| `PREDECODER_LOG_BASE_DIR` | `$SHARED_OUTPUT_DIR/logs` | Override the logs root (advanced). |
164+
| `PREDECODER_LOG_BASE_DIR` | `$SHARED_OUTPUT_DIR/logs` | Override the logs root (advanced, set by `cluster_train.sh` from `SHARED_LOG_DIR`). |
164165

165166
## Example SLURM configurations
166167

@@ -190,7 +191,7 @@ EXPERIMENT_NAME=qec-decoder-depolarizing-r13-fp8-4gpu \
190191
CONFIG_NAME=config_qec_decoder_r13_fp8 \
191192
GPUS=4 FRESH_START=1 \
192193
sbatch --partition=<your-4gpu-partition> \
193-
--gres=gpu:4 --cpus-per-task=80 --mem=240G \
194+
--nodes=1 --gres=gpu:4 --cpus-per-task=80 --mem=240G \
194195
code/scripts/sbatch_train.sh
195196
```
196197

@@ -207,7 +208,7 @@ GPUS=4 \
207208
PREDECODER_TRAIN_SAMPLES=8388608 \
208209
PREDECODER_LR_MILESTONES="1.0,2.0,4.0" \
209210
sbatch --partition=<your-4gpu-partition> \
210-
--gres=gpu:4 --cpus-per-task=80 --mem=240G \
211+
--nodes=1 --gres=gpu:4 --cpus-per-task=80 --mem=240G \
211212
code/scripts/sbatch_train.sh
212213
```
213214

code/scripts/cluster_install_deps.sh

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
set -euo pipefail
1616
INSTALL_DIR="${INSTALL_DIR:-$HOME/predecoder_env}"
1717
PYTHON_VERSION="${PYTHON_VERSION:-3.11}"
18-
TORCH_CUDA="${TORCH_CUDA:-}"
18+
TORCH_CUDA="${TORCH_CUDA:-cu121}"
1919
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
2020
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." && pwd)"
2121

@@ -43,7 +43,8 @@ if [ -z "$PYTHON_BIN" ]; then
4343
mkdir -p "$INSTALL_DIR" && cd "$INSTALL_DIR"
4444
MINICONDA_DIR="${INSTALL_DIR}/miniconda3"
4545
if [ ! -d "$MINICONDA_DIR" ]; then
46-
MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"
46+
ARCH=$(uname -m)
47+
MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${ARCH}.sh"
4748
[ -n "$(command -v wget)" ] && wget -q "$MINICONDA_URL" -O miniconda.sh || curl -sL -o miniconda.sh "$MINICONDA_URL"
4849
bash miniconda.sh -b -p "$MINICONDA_DIR" && rm -f miniconda.sh
4950
fi
@@ -68,11 +69,10 @@ cd "$REPO_ROOT"
6869

6970
# Use PyTorch CUDA index so torch is CUDA-built (on aarch64, PyPI serves CPU-only).
7071
# TORCH_CUDA e.g. cu121 or cu124; default cu121 to match nvidia/cuda:12.1 base image.
71-
PYTORCH_CUDA_TAG="${TORCH_CUDA:-cu121}"
72-
PYTORCH_INDEX="https://download.pytorch.org/whl/${PYTORCH_CUDA_TAG}"
73-
echo "Installing requirements (torch from CUDA index: ${PYTORCH_CUDA_TAG})..."
72+
PYTORCH_INDEX="https://download.pytorch.org/whl/${TORCH_CUDA}"
73+
echo "Installing requirements (torch from CUDA index: ${TORCH_CUDA})..."
7474
"$PYTHON_BIN" -m pip install -r code/requirements_public_train.txt \
75-
--index-url "$PYTORCH_INDEX" --extra-index-url https://pypi.org/simple
75+
--index-url "${PYTORCH_INDEX}" --extra-index-url https://pypi.org/simple
7676

7777
"$PYTHON_BIN" -c "
7878
import torch

code/scripts/sbatch_train.sh

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@
4646
# EXPERIMENT_NAME=qec-decoder-depolarizing-r13-fp8 \
4747
# CONFIG_NAME=config_qec_decoder_r13_fp8 \
4848
# GPUS=4 FRESH_START=1 \
49-
# sbatch --partition=<4gpu-partition> --gres=gpu:4 \
49+
# sbatch --partition=<4gpu-partition> --nodes=1 --gres=gpu:4 \
5050
# --cpus-per-task=80 --mem=240G \
5151
# code/scripts/sbatch_train.sh
5252
#
@@ -56,7 +56,7 @@
5656
# GPUS=4 \
5757
# PREDECODER_TRAIN_SAMPLES=8388608 \
5858
# PREDECODER_LR_MILESTONES="1.0,2.0,4.0" \
59-
# sbatch --partition=<4gpu-partition> --gres=gpu:4 \
59+
# sbatch --partition=<4gpu-partition> --nodes=1 --gres=gpu:4 \
6060
# --cpus-per-task=80 --mem=240G \
6161
# code/scripts/sbatch_train.sh
6262
# ──────────────────────────────────────────────────────────────
@@ -77,7 +77,8 @@ log() { echo "[$(date -Iseconds)] $*"; }
7777
export PREDECODER_VERBOSE=1
7878
export PYTHONUNBUFFERED=1
7979

80-
REPO_ROOT="${REPO_ROOT:-$(pwd)}"
80+
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
81+
REPO_ROOT="${REPO_ROOT:-$(cd -- "${SCRIPT_DIR}/../.." && pwd)}"
8182
SHARED_OUTPUT_DIR="${SHARED_OUTPUT_DIR:-$HOME/predecoder_outputs}"
8283
GPUS="${GPUS:-1}"
8384
EXPERIMENT_NAME="${EXPERIMENT_NAME:-qec-decoder-depolarizing-r9-fp8}"
@@ -119,12 +120,17 @@ log "========== Checking for Docker =========="
119120
HOST_UID=$(id -u)
120121
HOST_GID=$(id -g)
121122

123+
# Cluster-run defaults: disable SDR (expensive) and torch.compile (can crash in some envs).
124+
# Users can override either by setting the variable before calling sbatch.
125+
PREDECODER_DISABLE_SDR="${PREDECODER_DISABLE_SDR:-1}"
126+
PREDECODER_TORCH_COMPILE="${PREDECODER_TORCH_COMPILE:-0}"
127+
122128
COMMON_ENV=(
123129
-e SHARED_OUTPUT_DIR=/data
124130
-e PYTHONUNBUFFERED=1
125131
-e PREDECODER_VERBOSE=1
126-
-e "PREDECODER_TORCH_COMPILE=${PREDECODER_TORCH_COMPILE:-0}"
127-
-e "PREDECODER_DISABLE_SDR=${PREDECODER_DISABLE_SDR:-1}"
132+
-e "PREDECODER_TORCH_COMPILE=${PREDECODER_TORCH_COMPILE}"
133+
-e "PREDECODER_DISABLE_SDR=${PREDECODER_DISABLE_SDR}"
128134
-e "GPUS=${GPUS}"
129135
-e "EXPERIMENT_NAME=${EXPERIMENT_NAME}"
130136
-e "CONFIG_NAME=${CONFIG_NAME}"
@@ -146,6 +152,7 @@ if command -v docker >/dev/null 2>&1 && docker image inspect "$DOCKER_IMAGE" >/d
146152
"${COMMON_ENV[@]}" \
147153
"$DOCKER_IMAGE" 2>&1
148154
elif command -v docker >/dev/null 2>&1 && docker run --rm --gpus all "$DOCKER_BASE_IMAGE" nvidia-smi -L >/dev/null 2>&1; then
155+
log "Setting sticky bit on $SHARED_OUTPUT_DIR for NFS/Docker UID compatibility."
149156
chmod 1777 "$SHARED_OUTPUT_DIR" 2>/dev/null || true
150157
log "Using Docker base image $DOCKER_BASE_IMAGE (install + train)."
151158
docker run --rm --gpus all \
@@ -159,7 +166,7 @@ else
159166
log "No Docker. Running install + train on node."
160167
export SHARED_OUTPUT_DIR GPUS EXPERIMENT_NAME CONFIG_NAME
161168
export FRESH_START="${FRESH_START:-0}"
162-
export PREDECODER_DISABLE_SDR="${PREDECODER_DISABLE_SDR:-1}"
169+
export PREDECODER_DISABLE_SDR PREDECODER_TORCH_COMPILE
163170
INSTALL_DIR="${INSTALL_DIR:-$HOME/predecoder_env}"
164171
bash code/scripts/cluster_install_deps.sh
165172
export PREDECODER_PYTHON="${INSTALL_DIR}/venv/bin/python"

conf/config_qec_decoder_r13_fp8.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,10 @@
44
# Config for: Model 4 (R=13), depolarizing p=0.006, experiment qec-decoder-depolarizing-r13-fp8.
55
# Training uses receptive field 13; evaluation targets distance/n_rounds 13.
66
# fp8 in the name is for the intended export target; training runs in fp32.
7+
#
8+
# This file only overrides model/data fields. Training hyperparameters (epochs, batch
9+
# size, LR schedule, etc.) come from internal defaults — see conf/config_public.yaml
10+
# for the user-facing training fields and their documented defaults.
711

812
# === Model selection (Model 4 → receptive field 13) ===
913
model_id: 4

conf/config_qec_decoder_r9_fp8.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,10 @@
44
# Config for: Model 1 (R=9), depolarizing p=0.006, experiment qec-decoder-depolarizing-r9-fp8.
55
# Training uses receptive field 9; evaluation targets distance/n_rounds 9.
66
# fp8 in the name is for the intended export target; training runs in fp32.
7+
#
8+
# This file only overrides model/data fields. Training hyperparameters (epochs, batch
9+
# size, LR schedule, etc.) come from internal defaults — see conf/config_public.yaml
10+
# for the user-facing training fields and their documented defaults.
711

812
# === Model selection (Model 1 → receptive field 9) ===
913
model_id: 1

0 commit comments

Comments
 (0)