fix(mid): seed BitMatrixSampler explicitly to restore test reproducibility

ivanbasov · claude · ivanbasov · commit ec4e8bf09b3d · 2026-04-06T08:51:45.000-07:00
torch.manual_seed() does not control cuQuantum's BitMatrixSampler internal
RNG, so the two mid-GPU tests that relied on it for reproducibility were
non-deterministic and intermittently failing.

Add an optional `seed` parameter to `dem_sampling()` and
`MemoryCircuitTorch.generate_batch()`. When a seed is provided, a fresh
BitMatrixSampler is always created with `Options(seed=N)`, resetting its
internal RNG and guaranteeing identical outputs on every call with the same
seed. Production paths (seed=None) are unaffected — the cached sampler is
reused as before.

Update the two failing tests to use the explicit seed kwarg instead of
torch.manual_seed():
- test_he_reduces_error_weight: seed=123
- test_full_pipeline_w2_reproducible: seed=100

Fixes: NVIDIA/Ising-Decoding CI run 23963347042

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/code/qec/dem_sampling.py b/code/qec/dem_sampling.py
@@ -59,6 +59,7 @@ def _custab_available() -> bool:
 _cached_HT: "torch.Tensor | None" = None
 _cached_max_shots: int = 0
 _cached_device_id: int | None = None
+_cached_seed: "int | None" = None
 
 _DEM_TIMINGS_S: deque[float] = deque(maxlen=200)
 _custab_path_logged: bool = False
@@ -75,19 +76,21 @@ def get_dem_sampling_avg_ms() -> float:
 
 def _reset_sampler_cache() -> None:
     """Reset the module-level sampler cache."""
-    global _cached_sampler, _cached_H, _cached_HT, _cached_max_shots, _cached_device_id
+    global _cached_sampler, _cached_H, _cached_HT, _cached_max_shots, _cached_device_id, _cached_seed
     _cached_sampler = None
     _cached_H = None
     _cached_HT = None
     _cached_max_shots = 0
     _cached_device_id = None
+    _cached_seed = None
 
 
 def dem_sampling(
     H: torch.Tensor,
     p: torch.Tensor,
     batch_size: int,
-    device_id: int | None = None
+    device_id: int | None = None,
+    seed: int | None = None,
 ) -> torch.Tensor:
     """
     Sample errors from a detector error model (DEM) via cuST BitMatrixSampler.
@@ -98,6 +101,10 @@ def dem_sampling(
         batch_size: int - Number of samples to generate
         device_id: Optional int - Device ID for cuST. If omitted, infer from
             H.device when H is on CUDA.
+        seed: Optional int - RNG seed for the BitMatrixSampler. When provided,
+            a fresh sampler is always created so that the same seed produces
+            identical outputs on repeated calls (useful for reproducible tests).
+            When None (default), the cached sampler is reused across calls.
 
     Returns:
         frames_xz: (batch_size, 2*num_detectors) uint8 - Detector outcomes
@@ -106,7 +113,7 @@ def dem_sampling(
     from cuquantum.stabilizer.simulator import Options
 
     global _cached_sampler, _cached_H, _cached_HT, _cached_max_shots
-    global _cached_device_id, _custab_path_logged
+    global _cached_device_id, _cached_seed, _custab_path_logged
 
     if H.ndim != 2:
         raise ValueError(f"H must be 2-D, got ndim={H.ndim}")
@@ -129,9 +136,14 @@ def dem_sampling(
         _cached_H = H
         _cached_sampler = None
         _cached_device_id = None
+        _cached_seed = None
 
+    # When a seed is requested we always create a fresh sampler so that the
+    # BitMatrixSampler's internal RNG is reset to that seed, giving bit-for-bit
+    # reproducibility across repeated calls with the same seed value.
     need_new = (
-        _cached_sampler is None or batch_size > _cached_max_shots or _cached_device_id != device_id
+        _cached_sampler is None or batch_size > _cached_max_shots
+        or _cached_device_id != device_id or seed is not None
     )
 
     if need_new:
@@ -146,15 +158,19 @@ def dem_sampling(
             H_in = _cached_HT.detach().cpu().numpy().astype(np.uint8)
             p_in = p.detach().cpu().numpy().astype(np.float64)
             pkg = "numpy"
+        opt_kwargs: dict = {"device_id": device_id}
+        if seed is not None:
+            opt_kwargs["seed"] = seed
         _cached_sampler = BitMatrixSampler(
             H_in,
             p_in,
             max_shots,
             package=pkg,
-            options=Options(device_id=device_id),
+            options=Options(**opt_kwargs),
         )
         _cached_max_shots = max_shots
         _cached_device_id = device_id
+        _cached_seed = seed
 
     t0 = time.perf_counter()
     if gpu_native:
diff --git a/code/qec/surface_code/memory_circuit_torch.py b/code/qec/surface_code/memory_circuit_torch.py
@@ -234,6 +234,7 @@ def generate_batch(
         batch_size: int,
         return_aux: bool = False,
         collect_timing: bool = False,
+        seed: int | None = None,
     ) -> Union[
         tuple[torch.Tensor, torch.Tensor],
         tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
@@ -248,6 +249,8 @@ def generate_batch(
           build Stim dets_and_obs from circuit-order measurements.
         - If collect_timing=True, also return timing breakdown in milliseconds:
           data generation, HE, format, and total.
+        - If seed is given, the BitMatrixSampler is re-created with that seed so
+          repeated calls with the same seed produce identical outputs.
         """
         if self._compile_thread is not None:
             # torch.compile warmup can be slow; 20 min cap prevents silent hangs.
@@ -269,6 +272,7 @@ def generate_batch(
             self.p,
             int(batch_size),
             device_id=device_id,
+            seed=seed,
         )  # (B, 2*num_detectors)
         meas_old = measure_from_stacked_frames(
             frames_xz, self.meas_qubits, self.meas_bases, nq=self.nq
diff --git a/code/tests/mid/test_homological_equivalence.py b/code/tests/mid/test_homological_equivalence.py
@@ -1250,10 +1250,8 @@ def test_he_reduces_error_weight(self):
         gen0 = self._make_generator(num_he_cycles=0)
         gen1 = self._make_generator(num_he_cycles=2)
 
-        torch.manual_seed(123)
-        trainX0, trainY0 = gen0.generate_batch(batch_size=self.batch_size)
-        torch.manual_seed(123)
-        trainX1, trainY1 = gen1.generate_batch(batch_size=self.batch_size)
+        trainX0, trainY0 = gen0.generate_batch(batch_size=self.batch_size, seed=123)
+        trainX1, trainY1 = gen1.generate_batch(batch_size=self.batch_size, seed=123)
 
         # trainX is derived from meas_old and should be identical for same sample.
         self.assertTrue(torch.equal(trainX0, trainX1))
diff --git a/code/tests/mid/test_w2_verify.py b/code/tests/mid/test_w2_verify.py
@@ -88,9 +88,7 @@ def test_full_pipeline_w2_reproducible(self):
             A=None,
         )
         B = 128
-        torch.manual_seed(100)
-        tX_a, tY_a = gen_w2.generate_batch(batch_size=B)
-        torch.manual_seed(100)
-        tX_b, tY_b = gen_w2.generate_batch(batch_size=B)
+        tX_a, tY_a = gen_w2.generate_batch(batch_size=B, seed=100)
+        tX_b, tY_b = gen_w2.generate_batch(batch_size=B, seed=100)
         self.assertTrue(torch.allclose(tX_a, tX_b), "trainX should match for same seed")
         self.assertTrue(torch.allclose(tY_a, tY_b), "trainY should match for same seed")