ci: exercise multi-worker DataLoader (num_workers > 0) in CI

kvmto · kvmto · commit 2db609ef5567 · 2026-03-25T16:12:58.000Z
Production defaults use num_workers=4 with spawn multiprocessing, but all
CI jobs and tests forced num_workers=0. Add coverage for both layers:

- New test (test_dataloader_multiprocessing.py): verifies the Stim inference
  datapipe is pickle-safe and produces correct results with num_workers=2
  across X, Z, and mixed bases. Runs on CPU in a dedicated ci.yml job.

- New ci-gpu.yml step: re-runs the smoke run (training + inference) with
  PREDECODER_INFERENCE_NUM_WORKERS=2 after the existing smoke run, exercising
  the full logical_error_rate.py pipeline (multi-worker DataLoader → model
  forward → PyMatching → LER check).

Signed-off-by: kvmto <kmato@nvidia.com>
diff --git a/.github/workflows/ci-gpu.yml b/.github/workflows/ci-gpu.yml
@@ -96,6 +96,21 @@ jobs:
           PREDECODER_TEST_SAMPLES: "2048"
           PREDECODER_TRAIN_EPOCHS: "2"
 
+      # Runs smoke_run.sh with PREDECODER_INFERENCE_NUM_WORKERS=2, keeps its
+      # output in a log file, and then checks the LER reported in that log.
+      - name: Training + inference with multi-worker DataLoader (num_workers=2)
+        shell: bash
+        run: |
+          source .venv_train_${{ matrix.python-version }}/bin/activate
+          bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_multiworker.log
+          # Propagate smoke_run.sh's own exit status rather than tee's.
+          status=${PIPESTATUS[0]}
+          if [ "$status" -ne 0 ]; then
+            exit "$status"
+          fi
+          python code/scripts/check_ler_from_log.py /tmp/ci_multiworker.log --max-ler 0.35
+        env:
+          EXPERIMENT_NAME: ci_multiworker
+          PREDECODER_TRAIN_SAMPLES: "16384"
+          PREDECODER_VAL_SAMPLES: "2048"
+          PREDECODER_TEST_SAMPLES: "2048"
+          PREDECODER_TRAIN_EPOCHS: "2"
+          PREDECODER_INFERENCE_NUM_WORKERS: "2"
+
   # ---------------------------------------------------------------------------
   # Mid-tier (~5-10 min): extended training + inference with LER check.
   # Runs only after merge to main (not on PR branches) to save GPU time.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -71,6 +71,30 @@ jobs:
           SKIP_TESTS: "0"
           PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"
 
+  # ---------------------------------------------------------------------------
+  # Multi-worker DataLoader: verifies the Stim inference datapipe works with
+  # num_workers > 0 (spawn multiprocessing context), matching the production
+  # default of num_workers=4 but never exercised in other CI jobs.
+  # ---------------------------------------------------------------------------
+  multiprocessing-dataloader:
+    runs-on: linux-amd64-cpu4
+    # A wedged DataLoader worker process would otherwise hold the runner
+    # until GitHub's default job timeout (360 minutes) expires.
+    timeout-minutes: 20
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          lfs: true
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          python -m pip install -r code/requirements_public_inference.txt \
+            --extra-index-url https://download.pytorch.org/whl/cpu
+      # Only the multi-worker test module is discovered, so this job stays
+      # independent of the rest of the unit-test suite.
+      - name: Run multi-worker DataLoader tests
+        run: >
+          PYTHONPATH=code python -m unittest discover
+          -s code/tests -p "test_dataloader_multiprocessing.py" -v
+
   unit-tests-coverage:
     runs-on: linux-amd64-cpu4
     steps:
diff --git a/code/tests/test_dataloader_multiprocessing.py b/code/tests/test_dataloader_multiprocessing.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+"""
+Multi-worker DataLoader tests for the Stim inference datapipe.
+
+Verifies QCDataPipePreDecoder_Memory_inference is pickle-safe and correct
+under num_workers > 0 with spawn multiprocessing (CPU-only, no GPU needed).
+"""
+
+import sys
+import unittest
+from pathlib import Path
+
+# Put the directory that contains this tests folder on sys.path so the
+# `data.datapipe_stim` import below resolves when this file is run directly.
+_repo_code = Path(__file__).resolve().parent.parent
+if str(_repo_code) not in sys.path:
+    sys.path.insert(0, str(_repo_code))
+
+import torch
+from torch.utils.data import DataLoader
+
+from data.datapipe_stim import QCDataPipePreDecoder_Memory_inference
+
+# Shared test sizes, in order: code distance, number of rounds, total samples,
+# DataLoader batch size, and DataLoader worker count.
+_D, _T, _N, _BS, _W = 3, 3, 32, 8, 2
+
+
+def _make_loader(basis, num_workers=_W, **kw):
+    ds = QCDataPipePreDecoder_Memory_inference(
+        distance=_D,
+        n_rounds=_T,
+        num_samples=_N,
+        error_mode="circuit_level_surface_custom",
+        p_error=0.01,
+        measure_basis=basis,
+        code_rotation="XV",
+    )
+    opts = dict(batch_size=_BS, shuffle=False)
+    if num_workers > 0:
+        opts["multiprocessing_context"] = "spawn"
+    opts.update(kw)
+    return ds, DataLoader(ds, num_workers=num_workers, **opts)
+
+
+class TestMultiWorkerDataLoader(unittest.TestCase):
+
+    def test_iteration_completes_all_bases(self):
+        for basis in ("X", "Z", "both"):
+            with self.subTest(basis=basis):
+                _, loader = _make_loader(basis)
+                total = sum(b["trainX"].shape[0] for b in loader)
+                self.assertEqual(total, _N)
+
+    def test_matches_single_worker_all_bases(self):
+        for basis in ("X", "Z", "both"):
+            with self.subTest(basis=basis):
+                ds, _ = _make_loader(basis, num_workers=0)
+                loader_0 = DataLoader(ds, batch_size=_BS, shuffle=False)
+                loader_n = DataLoader(
+                    ds,
+                    batch_size=_BS,
+                    shuffle=False,
+                    num_workers=_W,
+                    multiprocessing_context="spawn",
+                )
+                for b0, bn in zip(loader_0, loader_n):
+                    for k in ("trainX", "x_syn_diff", "z_syn_diff", "dets_and_obs"):
+                        torch.testing.assert_close(b0[k], bn[k])
+
+    def test_persistent_workers_with_prefetch(self):
+        _, loader = _make_loader("X", persistent_workers=True, prefetch_factor=2)
+        total = sum(b["trainX"].shape[0] for b in loader)
+        self.assertEqual(total, _N)
+
+
+if __name__ == "__main__":
+    unittest.main()