NVIDIA · ivanbasov · Apr 3, 2026 · Mar 30, 2026 · Mar 30, 2026 · Apr 1, 2026
diff --git a/.github/workflows/ci-gpu.yml b/.github/workflows/ci-gpu.yml
@@ -46,7 +46,9 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.11", "3.12", "3.13"]
-    name: "gpu / py${{ matrix.python-version }}"
+        # cu128 = representative CUDA 12.x wheel; cu130 = CUDA 13.0 wheel.
+        torch-cuda: ["cu128", "cu130"]
+    name: "gpu / py${{ matrix.python-version }} / ${{ matrix.torch-cuda }}"
     steps:
       - name: Setup proxy cache
         uses: nv-gha-runners/setup-proxy-cache@main
@@ -80,11 +82,14 @@ jobs:
           MODE: train
           SKIP_TESTS: "0"
           REQUIRE_GPU: "1"
+          TORCH_CUDA: ${{ matrix.torch-cuda }}
+          VENV_DIR: .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}
+          REQ_FILE: code/requirements_public_gpu_${{ matrix.torch-cuda == 'cu130' && 'cu13' || 'cu12' }}.txt
 
       - name: Training + inference with LER check
         shell: bash
         run: |
-          source .venv_train_${{ matrix.python-version }}/bin/activate
+          source .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}/bin/activate
           bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_train.log
           r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
           # 0.35: short run (16k samples, 2 epochs for stable LER across py versions)
@@ -99,7 +104,7 @@ jobs:
       - name: Training + inference with multi-worker DataLoader (num_workers=2)
         shell: bash
         run: |
-          source .venv_train_${{ matrix.python-version }}/bin/activate
+          source .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}/bin/activate
           bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_multiworker.log
           r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
           python code/scripts/check_ler_from_log.py /tmp/ci_multiworker.log --max-ler 0.35

diff --git a/Dockerfile b/Dockerfile
@@ -5,7 +5,7 @@
 #
 # Build:
 #   docker build -t predecoder-train .
-#   docker build -t predecoder-train --build-arg TORCH_CUDA=cu124 .   # different CUDA
+#   docker build -t predecoder-train --build-arg TORCH_CUDA=cu128 .   # different CUDA
 #
 # Run:
 #   docker run --rm --gpus all \

diff --git a/TRAINING.md b/TRAINING.md
@@ -19,7 +19,7 @@ For local single-machine usage, see `README.md`.
 docker build -t predecoder-train .
 
 # Optionally, for a different CUDA version:
-docker build -t predecoder-train --build-arg TORCH_CUDA=cu124 .
+docker build -t predecoder-train --build-arg TORCH_CUDA=cu128 .
 
 # Train
 docker run --rm --gpus all \
@@ -156,7 +156,7 @@ export CONFIG_NAME=config_qec_decoder_r13_fp8
 |----------|---------|-------------|
 | `INSTALL_DIR` | `$HOME/predecoder_env` | Where `cluster_install_deps.sh` creates the Python environment. |
 | `PREDECODER_PYTHON` | auto-detect | Explicit path to the Python binary. |
-| `TORCH_CUDA` | `cu121` | PyTorch CUDA wheel tag (e.g. `cu121`, `cu124`, `cu130`). |
+| `TORCH_CUDA` | `cu121` | PyTorch CUDA wheel tag (e.g. `cu121`, `cu128`, `cu130`). |
 | `DOCKER_IMAGE` | `predecoder-train` | Pre-built Docker image name. |
 | `DOCKER_BASE_IMAGE` | `nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04` | Fallback CUDA base image. |
 | `SHARED_LOG_DIR` | `$SHARED_OUTPUT_DIR/logs` | Override the logs root directory (advanced). |
@@ -257,7 +257,7 @@ $SHARED_OUTPUT_DIR/
    - `--cpus-per-task=`, `--mem=`, `--time=` as appropriate
 
 2. **CUDA version**: set `TORCH_CUDA=cuXXX` to match your driver
-   (e.g. `cu121` for CUDA 12.1, `cu124` for CUDA 12.4).
+   (e.g. `cu121` for CUDA 12.1, `cu128` for CUDA 12.8).
 
 3. **Docker base image**: set `DOCKER_BASE_IMAGE` if your cluster uses a different CUDA runtime.
 

diff --git a/code/requirements_public_gpu_cu12.txt b/code/requirements_public_gpu_cu12.txt
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# GPU environment for CUDA 12.x (any cu12x wheel tag, e.g. cu121 or cu128).
+# Adds cupy-cuda12x on top of the training requirements to enable zero-copy
+# DLPack GPU transfers in the cuStabilizer (BitMatrixSampler) path.
+#
+# Install PyTorch from the matching wheel index, e.g. for cu128:
+#   pip install -r requirements_public_gpu_cu12.txt \
+#     --index-url https://download.pytorch.org/whl/cu128 \
+#     --extra-index-url https://pypi.org/simple
+-r requirements_public_train-cu12.txt
+cupy-cuda12x
diff --git a/code/requirements_public_gpu_cu13.txt b/code/requirements_public_gpu_cu13.txt
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# GPU environment for CUDA 13.x (cu130+).
+# Adds cupy-cuda13x on top of the training requirements to enable zero-copy
+# DLPack GPU transfers in the cuStabilizer (BitMatrixSampler) path.
+#
+# Install PyTorch from the matching wheel index, e.g. for cu130:
+#   pip install -r requirements_public_gpu_cu13.txt \
+#     --index-url https://download.pytorch.org/whl/cu130 \
+#     --extra-index-url https://pypi.org/simple
+-r requirements_public_train-cu13.txt
+cupy-cuda13x
diff --git a/code/scripts/cluster_install_deps.sh b/code/scripts/cluster_install_deps.sh
@@ -78,7 +78,7 @@ fi
 cd "$REPO_ROOT"
 
 # Use PyTorch CUDA index so torch is CUDA-built (on aarch64, PyPI serves CPU-only).
-# TORCH_CUDA e.g. cu121 or cu124; default cu121 to match nvidia/cuda:12.1 base image.
+# TORCH_CUDA e.g. cu121 or cu128; default cu121 to match nvidia/cuda:12.1 base image.
 PYTORCH_INDEX="https://download.pytorch.org/whl/${TORCH_CUDA}"
 echo "Installing requirements (torch from CUDA index: ${TORCH_CUDA})..."
 "$PYTHON_BIN" -m pip install -r code/requirements_public_train-cu${CUDA_MAJOR_VERSION}.txt \

diff --git a/code/scripts/sbatch_train.sh b/code/scripts/sbatch_train.sh
@@ -25,7 +25,7 @@
 #   DOCKER_IMAGE        Pre-built Docker image name. (default: predecoder-train)
 #   DOCKER_BASE_IMAGE   Fallback CUDA base image for install-from-scratch.
 #                       (default: nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04)
-#   TORCH_CUDA          PyTorch CUDA wheel tag, e.g. cu121 or cu124.
+#   TORCH_CUDA          PyTorch CUDA wheel tag, e.g. cu121 or cu128.
 #   INSTALL_DIR         Where to install Python/venv on bare-metal nodes.
 #                       (default: $HOME/predecoder_env)
 #   PREDECODER_DISABLE_SDR  Set to 1 to skip SDR (useful for cluster runs).