Fix training script portability and documentation issues

ivanbasov · claude · ivanbasov · commit f4254653fe46 · 2026-03-09T13:45:39.000-07:00
- sbatch_train.sh: resolve REPO_ROOT from script location, not $(pwd)
- sbatch_train.sh: consolidate PREDECODER_DISABLE_SDR / PREDECODER_TORCH_COMPILE
  defaults so Docker and bare-metal paths behave identically
- sbatch_train.sh: log message before chmod 1777; add --nodes=1 to multi-GPU examples (also in TRAINING.md)
- cluster_install_deps.sh: arch-aware Miniconda URL (supports aarch64/ARM)
- cluster_install_deps.sh: single TORCH_CUDA default (remove redundant fallback)
- TRAINING.md: document SHARED_LOG_DIR; correct cluster defaults for SDR/compile vars
- conf/config_qec_decoder_r{9,13}_fp8.yaml: note that training hyperparams come
  from internal defaults, point to config_public.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/TRAINING.md b/TRAINING.md
@@ -145,8 +145,8 @@ export CONFIG_NAME=config_qec_decoder_r13_fp8
 | `PREDECODER_TRAIN_SAMPLES` | config-defined | Samples per epoch. Bypasses auto-scaling when set explicitly. |
 | `PREDECODER_LR_MILESTONES` | config-defined | Comma-separated LR schedule milestone fractions (e.g. `0.25,0.5,1.0`). |
 | `PREDECODER_TIMING_RUN` | unset | Set `1` for timing/benchmarking mode (disables some overhead). |
-| `PREDECODER_TORCH_COMPILE` | unset | `0` to disable `torch.compile`, `1` to enable. |
-| `PREDECODER_DISABLE_SDR` | unset | `1` to skip Syndrome Density Reduction computation (saves time). |
+| `PREDECODER_TORCH_COMPILE` | `0` when run via `sbatch_train.sh`, otherwise unset | `0` to disable `torch.compile`, `1` to enable. |
+| `PREDECODER_DISABLE_SDR` | `1` when run via `sbatch_train.sh`, otherwise unset | `1` to skip Syndrome Density Reduction computation (saves time on cluster). |
 | `TORCH_COMPILE` | unset | Alternative way to control `torch.compile` (`0`/`1`). |
 | `TORCH_COMPILE_MODE` | unset | `default`, `reduce-overhead`, or `max-autotune`. |
 
@@ -159,8 +159,9 @@ export CONFIG_NAME=config_qec_decoder_r13_fp8
 | `TORCH_CUDA` | `cu121` | PyTorch CUDA wheel tag (e.g. `cu121`, `cu124`, `cu130`). |
 | `DOCKER_IMAGE` | `predecoder-train` | Pre-built Docker image name. |
 | `DOCKER_BASE_IMAGE` | `nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04` | Fallback CUDA base image. |
+| `SHARED_LOG_DIR` | `$SHARED_OUTPUT_DIR/logs` | Override the logs root directory (advanced). |
 | `PREDECODER_BASE_OUTPUT_DIR` | `$SHARED_OUTPUT_DIR/outputs` | Override the outputs root (advanced). |
-| `PREDECODER_LOG_BASE_DIR` | `$SHARED_OUTPUT_DIR/logs` | Override the logs root (advanced). |
+| `PREDECODER_LOG_BASE_DIR` | `$SHARED_OUTPUT_DIR/logs` | Override the logs root (advanced, set by `cluster_train.sh` from `SHARED_LOG_DIR`). |
 
 ## Example SLURM configurations
 
@@ -190,7 +191,7 @@ EXPERIMENT_NAME=qec-decoder-depolarizing-r13-fp8-4gpu \
 CONFIG_NAME=config_qec_decoder_r13_fp8 \
 GPUS=4 FRESH_START=1 \
   sbatch --partition=<your-4gpu-partition> \
-         --gres=gpu:4 --cpus-per-task=80 --mem=240G \
+         --nodes=1 --gres=gpu:4 --cpus-per-task=80 --mem=240G \
          code/scripts/sbatch_train.sh
 ```
 
@@ -207,7 +208,7 @@ GPUS=4 \
 PREDECODER_TRAIN_SAMPLES=8388608 \
 PREDECODER_LR_MILESTONES="1.0,2.0,4.0" \
   sbatch --partition=<your-4gpu-partition> \
-         --gres=gpu:4 --cpus-per-task=80 --mem=240G \
+         --nodes=1 --gres=gpu:4 --cpus-per-task=80 --mem=240G \
          code/scripts/sbatch_train.sh
 ```
 
diff --git a/code/scripts/cluster_install_deps.sh b/code/scripts/cluster_install_deps.sh
@@ -15,7 +15,7 @@
 set -euo pipefail
 INSTALL_DIR="${INSTALL_DIR:-$HOME/predecoder_env}"
 PYTHON_VERSION="${PYTHON_VERSION:-3.11}"
-TORCH_CUDA="${TORCH_CUDA:-}"
+TORCH_CUDA="${TORCH_CUDA:-cu121}"
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." && pwd)"
 
@@ -43,7 +43,8 @@ if [ -z "$PYTHON_BIN" ]; then
   mkdir -p "$INSTALL_DIR" && cd "$INSTALL_DIR"
   MINICONDA_DIR="${INSTALL_DIR}/miniconda3"
   if [ ! -d "$MINICONDA_DIR" ]; then
-    MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"
+    ARCH=$(uname -m)
+    MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${ARCH}.sh"
     [ -n "$(command -v wget)" ] && wget -q "$MINICONDA_URL" -O miniconda.sh || curl -sL -o miniconda.sh "$MINICONDA_URL"
     bash miniconda.sh -b -p "$MINICONDA_DIR" && rm -f miniconda.sh
   fi
@@ -68,11 +69,10 @@ cd "$REPO_ROOT"
 
 # Use PyTorch CUDA index so torch is CUDA-built (on aarch64, PyPI serves CPU-only).
 # TORCH_CUDA e.g. cu121 or cu124; default cu121 to match nvidia/cuda:12.1 base image.
-PYTORCH_CUDA_TAG="${TORCH_CUDA:-cu121}"
-PYTORCH_INDEX="https://download.pytorch.org/whl/${PYTORCH_CUDA_TAG}"
-echo "Installing requirements (torch from CUDA index: ${PYTORCH_CUDA_TAG})..."
+PYTORCH_INDEX="https://download.pytorch.org/whl/${TORCH_CUDA}"
+echo "Installing requirements (torch from CUDA index: ${TORCH_CUDA})..."
 "$PYTHON_BIN" -m pip install -r code/requirements_public_train.txt \
-  --index-url "$PYTORCH_INDEX" --extra-index-url https://pypi.org/simple
+  --index-url "${PYTORCH_INDEX}" --extra-index-url https://pypi.org/simple
 
 "$PYTHON_BIN" -c "
 import torch
diff --git a/code/scripts/sbatch_train.sh b/code/scripts/sbatch_train.sh
@@ -46,7 +46,7 @@
 #   EXPERIMENT_NAME=qec-decoder-depolarizing-r13-fp8 \
 #   CONFIG_NAME=config_qec_decoder_r13_fp8 \
 #   GPUS=4 FRESH_START=1 \
-#     sbatch --partition=<4gpu-partition> --gres=gpu:4 \
+#     sbatch --partition=<4gpu-partition> --nodes=1 --gres=gpu:4 \
 #            --cpus-per-task=80 --mem=240G \
 #            code/scripts/sbatch_train.sh
 #
@@ -56,7 +56,7 @@
 #   GPUS=4 \
 #   PREDECODER_TRAIN_SAMPLES=8388608 \
 #   PREDECODER_LR_MILESTONES="1.0,2.0,4.0" \
-#     sbatch --partition=<4gpu-partition> --gres=gpu:4 \
+#     sbatch --partition=<4gpu-partition> --nodes=1 --gres=gpu:4 \
 #            --cpus-per-task=80 --mem=240G \
 #            code/scripts/sbatch_train.sh
 # ──────────────────────────────────────────────────────────────
@@ -77,7 +77,8 @@ log() { echo "[$(date -Iseconds)] $*"; }
 export PREDECODER_VERBOSE=1
 export PYTHONUNBUFFERED=1
 
-REPO_ROOT="${REPO_ROOT:-$(pwd)}"
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"  # directory of the running script file (under sbatch: Slurm's spooled copy, not the repo)
+REPO_ROOT="${REPO_ROOT:-$(cd -- "${SLURM_SUBMIT_DIR:-${SCRIPT_DIR}/../..}" && pwd)}"  # explicit REPO_ROOT wins; else sbatch submit dir (submit from repo root); else two levels above this script
 SHARED_OUTPUT_DIR="${SHARED_OUTPUT_DIR:-$HOME/predecoder_outputs}"
 GPUS="${GPUS:-1}"
 EXPERIMENT_NAME="${EXPERIMENT_NAME:-qec-decoder-depolarizing-r9-fp8}"
@@ -119,12 +120,17 @@ log "========== Checking for Docker =========="
 HOST_UID=$(id -u)
 HOST_GID=$(id -g)
 
+# Cluster-run defaults: disable SDR (expensive) and torch.compile (can crash in some envs).
+# Users can override either by setting the variable before calling sbatch.
+PREDECODER_DISABLE_SDR="${PREDECODER_DISABLE_SDR:-1}"
+PREDECODER_TORCH_COMPILE="${PREDECODER_TORCH_COMPILE:-0}"
+
 COMMON_ENV=(
   -e SHARED_OUTPUT_DIR=/data
   -e PYTHONUNBUFFERED=1
   -e PREDECODER_VERBOSE=1
-  -e "PREDECODER_TORCH_COMPILE=${PREDECODER_TORCH_COMPILE:-0}"
-  -e "PREDECODER_DISABLE_SDR=${PREDECODER_DISABLE_SDR:-1}"
+  -e "PREDECODER_TORCH_COMPILE=${PREDECODER_TORCH_COMPILE}"
+  -e "PREDECODER_DISABLE_SDR=${PREDECODER_DISABLE_SDR}"
   -e "GPUS=${GPUS}"
   -e "EXPERIMENT_NAME=${EXPERIMENT_NAME}"
   -e "CONFIG_NAME=${CONFIG_NAME}"
@@ -146,6 +152,7 @@ if command -v docker >/dev/null 2>&1 && docker image inspect "$DOCKER_IMAGE" >/d
     "${COMMON_ENV[@]}" \
     "$DOCKER_IMAGE" 2>&1
 elif command -v docker >/dev/null 2>&1 && docker run --rm --gpus all "$DOCKER_BASE_IMAGE" nvidia-smi -L >/dev/null 2>&1; then
+  log "Setting sticky bit on $SHARED_OUTPUT_DIR for NFS/Docker UID compatibility."
   chmod 1777 "$SHARED_OUTPUT_DIR" 2>/dev/null || true
   log "Using Docker base image $DOCKER_BASE_IMAGE (install + train)."
   docker run --rm --gpus all \
@@ -159,7 +166,7 @@ else
   log "No Docker. Running install + train on node."
   export SHARED_OUTPUT_DIR GPUS EXPERIMENT_NAME CONFIG_NAME
   export FRESH_START="${FRESH_START:-0}"
-  export PREDECODER_DISABLE_SDR="${PREDECODER_DISABLE_SDR:-1}"
+  export PREDECODER_DISABLE_SDR PREDECODER_TORCH_COMPILE
   INSTALL_DIR="${INSTALL_DIR:-$HOME/predecoder_env}"
   bash code/scripts/cluster_install_deps.sh
   export PREDECODER_PYTHON="${INSTALL_DIR}/venv/bin/python"
diff --git a/conf/config_qec_decoder_r13_fp8.yaml b/conf/config_qec_decoder_r13_fp8.yaml
@@ -4,6 +4,10 @@
 # Config for: Model 4 (R=13), depolarizing p=0.006, experiment qec-decoder-depolarizing-r13-fp8.
 # Training uses receptive field 13; evaluation targets distance/n_rounds 13.
 # fp8 in the name is for the intended export target; training runs in fp32.
+#
+# This file only overrides model/data fields. Training hyperparameters (epochs, batch
+# size, LR schedule, etc.) come from internal defaults — see conf/config_public.yaml
+# for the user-facing training fields and their documented defaults.
 
 # === Model selection (Model 4 → receptive field 13) ===
 model_id: 4
diff --git a/conf/config_qec_decoder_r9_fp8.yaml b/conf/config_qec_decoder_r9_fp8.yaml
@@ -4,6 +4,10 @@
 # Config for: Model 1 (R=9), depolarizing p=0.006, experiment qec-decoder-depolarizing-r9-fp8.
 # Training uses receptive field 9; evaluation targets distance/n_rounds 9.
 # fp8 in the name is for the intended export target; training runs in fp32.
+#
+# This file only overrides model/data fields. Training hyperparameters (epochs, batch
+# size, LR schedule, etc.) come from internal defaults — see conf/config_public.yaml
+# for the user-facing training fields and their documented defaults.
 
 # === Model selection (Model 1 → receptive field 9) ===
 model_id: 1