diff --git a/.github/actions/build-lib/action.yaml b/.github/actions/build-lib/action.yaml index 67dd960b..a6ec4f01 100644 --- a/.github/actions/build-lib/action.yaml +++ b/.github/actions/build-lib/action.yaml @@ -23,6 +23,14 @@ inputs: description: 'Install prefix' default: '~/.cudaqx' required: false + require-custabilizer: + description: 'Enable cuStabilizer GPU DEM sampling (ON or OFF)' + default: 'ON' + required: false + require-custabilizer-gpu-torch: + description: 'Install PyTorch for GPU Torch tensor path (ON or OFF, requires custabilizer ON)' + default: 'ON' + required: false outputs: build-dir: description: 'Build dir.' @@ -66,6 +74,8 @@ runs: env: CCACHE_DIR: /ccache-${{ inputs.lib }} GH_TOKEN: ${{ github.token }} # REVERT-WITH-CUDAQ-REALTIME-BUILD + REQUIRE_CUSTABILIZER: ${{ inputs.require-custabilizer }} + REQUIRE_CUSTABILIZER_GPU_TORCH: ${{ inputs.require-custabilizer-gpu-torch }} run: | build_dir=build_${{ inputs.lib }} .github/actions/build-lib/build_${{ inputs.lib }}.sh $build_dir ${{ inputs.install-prefix }} diff --git a/.github/actions/build-lib/build_all.sh b/.github/actions/build-lib/build_all.sh index 03fc901d..17e5022d 100755 --- a/.github/actions/build-lib/build_all.sh +++ b/.github/actions/build-lib/build_all.sh @@ -18,6 +18,87 @@ if [ -z "$CUDAQ_REALTIME_ROOT" ]; then cd "$_build_cwd" fi +REQUIRE_CUSTABILIZER=${REQUIRE_CUSTABILIZER:-ON} +REQUIRE_CUSTABILIZER_GPU_TORCH=${REQUIRE_CUSTABILIZER_GPU_TORCH:-ON} + +if [ "$REQUIRE_CUSTABILIZER" = "OFF" ]; then + REQUIRE_CUSTABILIZER_GPU_TORCH="OFF" +fi + +if [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -z "$CUSTABILIZER_ROOT" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + CUDA_MAJOR="" + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + CUSTAB_PIP="custabilizer-cu${CUDA_MAJOR:-12}" + CUQPY_PIP="cuquantum-python-cu${CUDA_MAJOR:-12}" + pip install --upgrade 
"${CUSTAB_PIP}>=0.3.0" "${CUQPY_PIP}>=26.3.0" +fi + +if [ "$REQUIRE_CUSTABILIZER_GPU_TORCH" = "ON" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + cuda_version=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p' | head -n 1) + cuda_no_dot=$(echo "$cuda_version" | tr -d '.') + pip install torch --index-url "https://download.pytorch.org/whl/cu${cuda_no_dot}" || true + fi +fi + +if [ -z "$CUSTABILIZER_ROOT" ] && [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -x "$(command -v python3)" ]; then + if [ -z "$CUSTABILIZER_PIP_PACKAGE" ]; then + CUDA_MAJOR="" + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + if [ -n "$CUDA_MAJOR" ]; then + CUSTABILIZER_PIP_PACKAGE="custabilizer-cu${CUDA_MAJOR}" + else + for candidate in custabilizer-cu13 custabilizer-cu12; do + if python3 -m pip show "$candidate" >/dev/null 2>&1; then + CUSTABILIZER_PIP_PACKAGE="$candidate" + break + fi + done + fi + fi + + if [ -n "$CUSTABILIZER_PIP_PACKAGE" ] && \ + python3 -m pip show "$CUSTABILIZER_PIP_PACKAGE" >/dev/null 2>&1; then + CUSTABILIZER_ROOT=$(python3 - "$CUSTABILIZER_PIP_PACKAGE" <<'PY' +import pathlib, subprocess, sys +package = sys.argv[1] +show = subprocess.check_output([sys.executable, "-m", "pip", "show", package], text=True) +location = "" +for line in show.splitlines(): + if line.startswith("Location:"): + location = line.split(":", 1)[1].strip(); break +if not location: + print(""); raise SystemExit(0) +base = pathlib.Path(location) +for root in [base / "custabilizer", base / "cuquantum", base]: + h = root / "include" / "custabilizer.h" + lib = root / "lib" / "libcustabilizer.so.0" + lib64 = root / "lib64" / "libcustabilizer.so.0" + if h.exists() and (lib.exists() or lib64.exists()): + print(root); raise SystemExit(0) +print("") +PY +) + fi 
+fi + +CUSTABILIZER_CMAKE_ARG="" +if [ -n "$CUSTABILIZER_ROOT" ]; then + echo "Resolved cuStabilizer root: $CUSTABILIZER_ROOT" + CUSTABILIZER_CMAKE_ARG="-DCUSTABILIZER_ROOT=$CUSTABILIZER_ROOT" + export LD_LIBRARY_PATH="$CUSTABILIZER_ROOT/lib:$CUSTABILIZER_ROOT/lib64:${LD_LIBRARY_PATH:-}" + export CPATH="$CUSTABILIZER_ROOT/include:${CPATH:-}" +else + echo "Unable to resolve CUSTABILIZER_ROOT from pip wheel." +fi + cmake -S . -B "$1" \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=gcc-11 \ @@ -25,6 +106,8 @@ cmake -S . -B "$1" \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCUDAQ_DIR=/cudaq-install/lib/cmake/cudaq/ \ + ${CUSTABILIZER_CMAKE_ARG:+$CUSTABILIZER_CMAKE_ARG} \ + -DCUDAQ_QEC_REQUIRE_CUSTABILIZER=$REQUIRE_CUSTABILIZER \ -DCUDAQX_ENABLE_LIBS="all" \ -DCUDAQX_INCLUDE_TESTS=ON \ -DCUDAQX_BINDINGS_PYTHON=ON \ diff --git a/.github/actions/build-lib/build_qec.sh b/.github/actions/build-lib/build_qec.sh index 9e74a59c..b34b2b8e 100755 --- a/.github/actions/build-lib/build_qec.sh +++ b/.github/actions/build-lib/build_qec.sh @@ -19,6 +19,106 @@ if [ -z "$CUDAQ_REALTIME_ROOT" ]; then cd "$_build_cwd" fi +REQUIRE_CUSTABILIZER=${REQUIRE_CUSTABILIZER:-ON} +REQUIRE_CUSTABILIZER_GPU_TORCH=${REQUIRE_CUSTABILIZER_GPU_TORCH:-ON} + +# Enforce: cuStabilizer OFF forces GPU Torch OFF +if [ "$REQUIRE_CUSTABILIZER" = "OFF" ]; then + REQUIRE_CUSTABILIZER_GPU_TORCH="OFF" +fi + +# Install cuStabilizer dependencies if required +if [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -z "$CUSTABILIZER_ROOT" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + CUDA_MAJOR="" + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + CUSTAB_PIP="custabilizer-cu${CUDA_MAJOR:-12}" + CUQPY_PIP="cuquantum-python-cu${CUDA_MAJOR:-12}" + pip install --upgrade "${CUSTAB_PIP}>=0.3.0" "${CUQPY_PIP}>=26.3.0" +fi + +# Install Torch if GPU Torch path is 
required +if [ "$REQUIRE_CUSTABILIZER_GPU_TORCH" = "ON" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + cuda_version=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p' | head -n 1) + cuda_no_dot=$(echo "$cuda_version" | tr -d '.') + pip install torch --index-url "https://download.pytorch.org/whl/cu${cuda_no_dot}" || true + fi +fi + +if [ -z "$CUSTABILIZER_ROOT" ] && [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -x "$(command -v python3)" ]; then + if [ -z "$CUSTABILIZER_PIP_PACKAGE" ]; then + CUDA_MAJOR="" + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + + if [ -n "$CUDA_MAJOR" ]; then + CUSTABILIZER_PIP_PACKAGE="custabilizer-cu${CUDA_MAJOR}" + else + for candidate in custabilizer-cu13 custabilizer-cu12; do + if python3 -m pip show "$candidate" >/dev/null 2>&1; then + CUSTABILIZER_PIP_PACKAGE="$candidate" + break + fi + done + fi + fi + + if [ -n "$CUSTABILIZER_PIP_PACKAGE" ] && \ + python3 -m pip show "$CUSTABILIZER_PIP_PACKAGE" >/dev/null 2>&1; then + CUSTABILIZER_ROOT=$(python3 - "$CUSTABILIZER_PIP_PACKAGE" <<'PY' +import pathlib +import subprocess +import sys + +package = sys.argv[1] +show = subprocess.check_output( + [sys.executable, "-m", "pip", "show", package], text=True +) +location = "" +for line in show.splitlines(): + if line.startswith("Location:"): + location = line.split(":", 1)[1].strip() + break + +if not location: + print("") + raise SystemExit(0) + +base = pathlib.Path(location) +candidates = [base / "custabilizer", base / "cuquantum", base] +for root in candidates: + include = root / "include" / "custabilizer.h" + if not include.exists(): + continue + lib = root / "lib" / "libcustabilizer.so.0" + lib64 = root / "lib64" / "libcustabilizer.so.0" + if lib.exists() or lib64.exists(): + print(root) + raise 
SystemExit(0)
+
+print("")
+PY
+)
+  fi
+fi
+
+CUSTABILIZER_CMAKE_ARG=""
+if [ -n "$CUSTABILIZER_ROOT" ]; then
+  echo "Resolved cuStabilizer root: $CUSTABILIZER_ROOT"
+  CUSTABILIZER_CMAKE_ARG="-DCUSTABILIZER_ROOT=$CUSTABILIZER_ROOT"
+  # Ensure the resolved path wins over any stale container paths.
+  export LD_LIBRARY_PATH="$CUSTABILIZER_ROOT/lib:$CUSTABILIZER_ROOT/lib64:${LD_LIBRARY_PATH:-}"
+  export CPATH="$CUSTABILIZER_ROOT/include:${CPATH:-}"
+else
+  echo "Unable to resolve CUSTABILIZER_ROOT from pip wheel."
+fi
+
 cmake -S libs/qec -B "$1" \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_C_COMPILER=gcc-11 \
@@ -26,8 +126,10 @@ cmake -S libs/qec -B "$1" \
   -DCMAKE_C_COMPILER_LAUNCHER=ccache \
   -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
   -DCUDAQ_DIR=/cudaq-install/lib/cmake/cudaq/ \
+  ${CUSTABILIZER_CMAKE_ARG:+$CUSTABILIZER_CMAKE_ARG} \
   -DCUDAQX_INCLUDE_TESTS=ON \
   -DCUDAQX_BINDINGS_PYTHON=ON \
+  -DCUDAQ_QEC_REQUIRE_CUSTABILIZER=$REQUIRE_CUSTABILIZER \
   -DCMAKE_INSTALL_PREFIX="$2" \
   -DCUDAQ_REALTIME_ROOT=$CUDAQ_REALTIME_ROOT
diff --git a/.github/workflows/all_libs.yaml b/.github/workflows/all_libs.yaml
index d9a3bfcb..4f8feb10 100644
--- a/.github/workflows/all_libs.yaml
+++ b/.github/workflows/all_libs.yaml
@@ -65,6 +65,7 @@ jobs:
       - name: Install build requirements
         run: |
           apt install -y --no-install-recommends gfortran libblas-dev wget
+          pip install --upgrade "cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0" "custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0"

       - name: Install TensorRT (amd64)
         if: matrix.platform == 'amd64'
@@ -108,10 +109,10 @@ jobs:
          LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }}
        shell: bash
        run: |
-          # Install the correct torch first.
-          cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//')
-          pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot}
-          pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==26.01.0
+          pip install --upgrade numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas "cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0" "custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0"
+          echo "=== cuStabilizer verification ==="
+          pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version
+          find / -name "libcustabilizer.so*" 2>/dev/null || true
           # The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py.
if [ "$(uname -m)" == "x86_64" ]; then # Stim is not currently available on manylinux ARM wheels, so only @@ -119,9 +120,22 @@ jobs: pip install stim beliefmatching fi + - name: Set cuStabilizer runtime paths + run: | + CUSTAB_ROOT=$(python3 -c " + import subprocess, pathlib, sys + pkg = 'custabilizer-cu' + '${{ steps.config.outputs.cuda_major }}' + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', pkg], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'lib' / 'libcustabilizer.so.0').exists() else '') + ") + if [ -n "$CUSTAB_ROOT" ]; then + echo "LD_LIBRARY_PATH=$CUSTAB_ROOT/lib:$CUSTAB_ROOT/lib64:${{ env.MPI_PATH }}/lib:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + fi + - name: Run Python tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: cmake --build ${{ steps.build.outputs.build-dir }} --target run_python_tests @@ -131,6 +145,5 @@ jobs: - name: Run example tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: bash scripts/ci/test_examples.sh all diff --git a/.github/workflows/all_libs_release.yaml b/.github/workflows/all_libs_release.yaml index 6df0a3fe..64d15c1a 100644 --- a/.github/workflows/all_libs_release.yaml +++ b/.github/workflows/all_libs_release.yaml @@ -41,8 +41,12 @@ jobs: permissions: write-all steps: - name: Install dependencies + id: deps run: | apt update && apt install -y --no-install-recommends zip unzip patchelf git-lfs + cuda_major=$(echo ${{ matrix.cuda_version }} | cut -d . 
-f1) + echo "cuda_major=$cuda_major" >> $GITHUB_OUTPUT + pip install --upgrade cuquantum-python-cu${cuda_major}>=26.3.0 custabilizer-cu${cuda_major}>=0.3.0 - name: Checkout repository uses: actions/checkout@v4 @@ -130,10 +134,10 @@ jobs: LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} shell: bash run: | - # Install the correct torch first. - cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//') - pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot} - pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==26.01.0 + pip install --upgrade numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0 custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version + find / -name "libcustabilizer.so*" 2>/dev/null || true # The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py. 
if [ "$(uname -m)" == "x86_64" ]; then # Stim is not currently available on manylinux ARM wheels, so only @@ -141,13 +145,25 @@ jobs: pip install stim beliefmatching fi + - name: Set cuStabilizer runtime paths + run: | + CUSTAB_ROOT=$(python3 -c " + import subprocess, pathlib, sys + pkg = 'custabilizer-cu' + '${{ steps.config.outputs.cuda_major }}' + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', pkg], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'lib' / 'libcustabilizer.so.0').exists() else '') + ") + if [ -n "$CUSTAB_ROOT" ]; then + echo "LD_LIBRARY_PATH=$CUSTAB_ROOT/lib:$CUSTAB_ROOT/lib64:${{ env.MPI_PATH }}/lib:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + fi + - name: Run Python tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: | if [[ -n "${{ env.QEC_EXTERNAL_DECODERS }}" ]]; then - # Verify that external decoder is available if applicable export PYTHONPATH="/usr/local/cudaq:$HOME/.cudaqx" python3 -c "import cudaq_qec as qec; print(qec.__version__); d = qec.get_decoder('nv-qldpc-decoder', qec.get_code('steane').get_parity()); print(d.get_version())" fi @@ -160,7 +176,6 @@ jobs: - name: Run example tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: | ln -s /usr/local/cudaq /cudaq-install diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 03aec19e..7459d782 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -60,12 +60,24 @@ jobs: apt install -y --no-install-recommends \ gfortran libblas-dev doxygen - python3 -m pip install IPython breathe enum_tools myst_parser nbsphinx \ + python3 -m pip install --break-system-packages --upgrade IPython breathe enum_tools myst_parser nbsphinx \ sphinx_copybutton sphinx_inline_tabs sphinx_gallery sphinx_rtd_theme \ - 
sphinx_reredirects sphinx_toolbox cupy-cuda12x cuquantum-python-cu12 + sphinx_reredirects "sphinx-autodoc-typehints<3.8" sphinx_toolbox cupy-cuda12x cuquantum-python-cu12>=26.3.0 custabilizer-cu12>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu12 | grep Version + find / -name "libcustabilizer.so*" 2>/dev/null || true - name: Build docs run: | + # Resolve cuStabilizer root from the pip wheel. + CUSTABILIZER_ROOT=$(python3 -c " + import subprocess, pathlib, sys + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', 'custabilizer-cu12'], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'include' / 'custabilizer.h').exists() else '') + ") + # Note: TRT decoder is disabled because all docs for the TRT decoder # are generated independently of the source code (no Doxygen # documentation). @@ -74,6 +86,7 @@ jobs: -DCMAKE_C_COMPILER=gcc-11 \ -DCMAKE_CXX_COMPILER=g++-11 \ -DCUDAQ_DIR=/cudaq-install/lib/cmake/cudaq/ \ + ${CUSTABILIZER_ROOT:+-DCUSTABILIZER_ROOT=$CUSTABILIZER_ROOT} \ -DCUDAQX_ENABLE_LIBS="all" \ -DCUDAQX_INCLUDE_DOCS=ON \ -DCUDAQX_BINDINGS_PYTHON=ON \ diff --git a/.github/workflows/lib_qec.yaml b/.github/workflows/lib_qec.yaml index cd9ac571..aed1fb4b 100644 --- a/.github/workflows/lib_qec.yaml +++ b/.github/workflows/lib_qec.yaml @@ -12,6 +12,8 @@ jobs: matrix: platform: ['amd64', 'arm64'] cuda_version: ['12.6', '13.0'] + custabilizer: ['ON'] + custabilizer_gpu_torch: ['ON'] runs-on: ${{ startsWith(github.repository, 'NVIDIA/cudaqx') && format('linux-{0}-cpu8', matrix.platform) || 'ubuntu-latest' }} container: ghcr.io/nvidia/cuda-quantum-devcontainer:${{ matrix.platform }}-cu${{ matrix.cuda_version }}-gcc11-main permissions: @@ -88,11 +90,27 @@ jobs: pr-number: ${{ steps.export-pr-info.outputs.pr_number }} save-ccache: true platform: ${{ matrix.platform }} + require-custabilizer: ${{ 
matrix.custabilizer }} + require-custabilizer-gpu-torch: ${{ matrix.custabilizer_gpu_torch }} # ======================================================================== # Run tests # ======================================================================== - + + - name: Set cuStabilizer build-time paths + run: | + CUSTAB_ROOT=$(python3 -c " + import subprocess, pathlib, sys + pkg = 'custabilizer-cu' + '${{ steps.config.outputs.cuda_major }}' + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', pkg], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'lib' / 'libcustabilizer.so.0').exists() else '') + ") + if [ -n "$CUSTAB_ROOT" ]; then + echo "LD_LIBRARY_PATH=$CUSTAB_ROOT/lib:$CUSTAB_ROOT/lib64:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + fi + - name: Run tests run: cmake --build ${{ steps.build.outputs.build-dir }} --target run_tests @@ -103,10 +121,10 @@ jobs: - name: Install python requirements shell: bash run: | - # Install the correct torch first. 
- cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//') - pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot} - pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==26.01.0 + pip install --upgrade numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0 custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version + find / -name "libcustabilizer.so*" 2>/dev/null || true # The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py. if [ "$(uname -m)" == "x86_64" ]; then # Stim is not currently available on manylinux ARM wheels, so only @@ -173,8 +191,29 @@ jobs: run: | tar xzf /tmp/qec-build-artifacts.tar.gz -C / + - name: Configure + id: config + run: | + cuda_major=$(echo ${{ matrix.cuda_version }} | cut -d . 
-f1) + echo "cuda_major=$cuda_major" >> $GITHUB_OUTPUT + + - name: Install cuStabilizer runtime + run: | + pip install --upgrade cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0 custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version + CUSTABILIZER_ROOT=$(python3 -c " + import subprocess, pathlib, sys + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', 'custabilizer-cu${{ steps.config.outputs.cuda_major }}'], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'include' / 'custabilizer.h').exists() else '') + ") + echo "CUSTABILIZER_ROOT=$CUSTABILIZER_ROOT" + echo "LD_LIBRARY_PATH=$CUSTABILIZER_ROOT/lib:$CUSTABILIZER_ROOT/lib64:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + - name: Run GPU tests run: | cd $GITHUB_WORKSPACE/build_qec - ctest --output-on-failure -R "GpuKernelsTest|test_realtime_decoding" + ctest --output-on-failure -R "GpuKernelsTest|test_realtime_decoding|DemSamplingGPU" diff --git a/cmake/Modules/FindcuStabilizer.cmake b/cmake/Modules/FindcuStabilizer.cmake new file mode 100644 index 00000000..b143db82 --- /dev/null +++ b/cmake/Modules/FindcuStabilizer.cmake @@ -0,0 +1,84 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # + +#[=======================================================================[.rst: +FindcuStabilizer +---------------- + +Find the cuStabilizer library (shipped as a pip wheel: custabilizer-cu12 / cu13). + +Imported Targets +^^^^^^^^^^^^^^^^ + +``cuStabilizer::cuStabilizer`` + The cuStabilizer library. + +Result Variables +^^^^^^^^^^^^^^^^ + +``cuStabilizer_FOUND`` +``cuStabilizer_INCLUDE_DIR`` +``cuStabilizer_LIBRARY`` + +Hints +^^^^^ + +``CUSTABILIZER_ROOT`` + Preferred search prefix (set by CI from ``pip show custabilizer-cuXX``). + +#]=======================================================================] + +find_path(cuStabilizer_INCLUDE_DIR + NAMES custabilizer.h + HINTS + ${CUSTABILIZER_ROOT}/include +) + +find_library(cuStabilizer_LIBRARY + NAMES custabilizer libcustabilizer.so.0 + HINTS + ${CUSTABILIZER_ROOT}/lib64 + ${CUSTABILIZER_ROOT}/lib +) + +set(cuStabilizer_VERSION "") +if(cuStabilizer_INCLUDE_DIR AND EXISTS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h") + file(STRINGS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h" + _custab_major_line REGEX "^#define[ \t]+CUSTABILIZER_MAJOR[ \t]+[0-9]+") + file(STRINGS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h" + _custab_minor_line REGEX "^#define[ \t]+CUSTABILIZER_MINOR[ \t]+[0-9]+") + file(STRINGS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h" + _custab_patch_line REGEX "^#define[ \t]+CUSTABILIZER_PATCH[ \t]+[0-9]+") + if(_custab_major_line AND _custab_minor_line AND _custab_patch_line) + string(REGEX REPLACE "^#define[ \t]+CUSTABILIZER_MAJOR[ \t]+([0-9]+).*$" "\\1" + _custab_major "${_custab_major_line}") + string(REGEX REPLACE "^#define[ \t]+CUSTABILIZER_MINOR[ \t]+([0-9]+).*$" "\\1" + _custab_minor "${_custab_minor_line}") + string(REGEX REPLACE "^#define[ \t]+CUSTABILIZER_PATCH[ \t]+([0-9]+).*$" "\\1" + _custab_patch "${_custab_patch_line}") + set(cuStabilizer_VERSION "${_custab_major}.${_custab_minor}.${_custab_patch}") + 
endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(cuStabilizer + REQUIRED_VARS cuStabilizer_INCLUDE_DIR cuStabilizer_LIBRARY + VERSION_VAR cuStabilizer_VERSION +) + +if(cuStabilizer_FOUND AND NOT TARGET cuStabilizer::cuStabilizer) + add_library(cuStabilizer::cuStabilizer INTERFACE IMPORTED GLOBAL) + set_target_properties(cuStabilizer::cuStabilizer PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${cuStabilizer_INCLUDE_DIR}" + ) + if(cuStabilizer_LIBRARY) + set_target_properties(cuStabilizer::cuStabilizer PROPERTIES + INTERFACE_LINK_LIBRARIES "${cuStabilizer_LIBRARY}" + ) + endif() +endif() diff --git a/libs/core/include/cuda-qx/core/kwargs_utils.h b/libs/core/include/cuda-qx/core/kwargs_utils.h index ce50f3cc..1357f8d1 100644 --- a/libs/core/include/cuda-qx/core/kwargs_utils.h +++ b/libs/core/include/cuda-qx/core/kwargs_utils.h @@ -105,7 +105,7 @@ template tensor toTensor(const py::array_t &H, bool perform_pcm_checks = false) { py::buffer_info buf = H.request(); - if (buf.ndim >= 1 && buf.strides[0] == buf.itemsize) { + if (buf.ndim >= 2 && buf.strides[0] < buf.strides[buf.ndim - 1]) { throw std::runtime_error("toTensor: data must be in row-major order, but " "column-major order was detected."); } diff --git a/libs/qec/CMakeLists.txt b/libs/qec/CMakeLists.txt index 6ceafa9a..7b8ac405 100644 --- a/libs/qec/CMakeLists.txt +++ b/libs/qec/CMakeLists.txt @@ -61,6 +61,11 @@ option(CUDAQX_QEC_INSTALL_PYTHON "Install python files alongside the library." ${CUDAQX_INSTALL_PYTHON}) +# Require cuStabilizer + CUDA (with sparse DEM APIs) for QEC builds. +option(CUDAQ_QEC_REQUIRE_CUSTABILIZER + "Fail configuration when cuStabilizer/CUDA or required DEM APIs are unavailable." 
+ ON) + # Option to control TRT decoder build (default: ON) option(CUDAQ_QEC_BUILD_TRT_DECODER "Build the TensorRT decoder plugin" ON) diff --git a/libs/qec/include/cudaq/qec/dem_sampling.h b/libs/qec/include/cudaq/qec/dem_sampling.h new file mode 100644 index 00000000..a37499df --- /dev/null +++ b/libs/qec/include/cudaq/qec/dem_sampling.h @@ -0,0 +1,71 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#pragma once + +#include "cuda-qx/core/tensor.h" +#include +#include +#include +#include + +namespace cudaq::qec::dem_sampler { + +/// CPU implementation of DEM sampling. This is the existing implementation +/// that uses std::bernoulli_distribution and tensor dot product. +namespace cpu { + +/// @brief Sample measurements from a check matrix (CPU, per-mechanism probs) +/// @param check_matrix Binary matrix [num_checks × num_error_mechanisms] +/// @param numShots Number of measurement shots +/// @param error_probabilities Per-error-mechanism probabilities +/// @return (checks [numShots × num_checks], errors [numShots × num_mechanisms]) +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities); + +/// @brief Sample measurements from a check matrix with seed (CPU) +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities, unsigned seed); + +} // namespace cpu + +/// GPU implementation of DEM sampling via cuStabilizer C API. +/// +/// The caller provides device pointers. The function composes: +/// 1. 
pack_check_matrix_rowwise (dense uint8 → bitpacked uint32) +/// 2. custabilizerSampleProbArraySparseCompute (sparse Bernoulli sampling) +/// 3. custabilizerGF2SparseDenseMatrixMultiply (syndrome = errors * H^T) +/// 4. unpack_syndromes_gpu (bitpacked uint32 → uint8) +/// 5. csr_to_dense_fused (CSR errors → dense uint8) +/// +/// All GPU memory is allocated and freed internally per call. +namespace gpu { + +/// @brief GPU DEM sampling with device pointers +/// +/// @param d_check_matrix Device pointer [num_checks × num_error_mechanisms] +/// @param num_checks Number of checks (rows of H) +/// @param num_error_mechanisms Number of error mechanisms (columns of H) +/// @param d_error_probabilities Device pointer [num_error_mechanisms] +/// @param num_shots Number of samples +/// @param seed RNG seed +/// @param d_checks_out Device pointer [num_shots × num_checks] (OUTPUT) +/// @param d_errors_out Device pointer [num_shots × num_error_mechanisms] (OUT) +/// @param stream_handle Optional CUDA stream handle (uintptr_t cast), 0 for +/// default stream +/// @return true on success, false if cuStabilizer is unavailable +bool sample_dem(const uint8_t *d_check_matrix, size_t num_checks, + size_t num_error_mechanisms, + const double *d_error_probabilities, size_t num_shots, + unsigned seed, uint8_t *d_checks_out, uint8_t *d_errors_out, + std::uintptr_t stream_handle = 0); + +} // namespace gpu + +} // namespace cudaq::qec::dem_sampler diff --git a/libs/qec/include/cudaq/qec/experiments.h b/libs/qec/include/cudaq/qec/experiments.h index 74ba476f..5cb0b3fe 100644 --- a/libs/qec/include/cudaq/qec/experiments.h +++ b/libs/qec/include/cudaq/qec/experiments.h @@ -9,6 +9,8 @@ #include "cudaq/qec/code.h" #include "cudaq/qec/detector_error_model.h" +#include +#include namespace cudaq::qec { @@ -61,6 +63,25 @@ std::tuple, cudaqx::tensor> sample_code_capacity(const code &code, std::size_t numShots, double error_probability); +/// @brief Sample a detector error model on CPU (legacy API). 
+/// @param check_matrix Binary matrix [num_checks x num_error_mechanisms] +/// @param numShots Number of independent Monte-Carlo shots +/// @param error_probabilities Per-error-mechanism Bernoulli probabilities +/// @return Tuple of (checks, errors) +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities); + +/// @brief Sample a detector error model on CPU (legacy API, seeded). +/// @param check_matrix Binary matrix [num_checks x num_error_mechanisms] +/// @param numShots Number of independent Monte-Carlo shots +/// @param error_probabilities Per-error-mechanism Bernoulli probabilities +/// @param seed RNG seed for reproducibility +/// @return Tuple of (checks, errors) +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities, unsigned seed); + /// @brief Sample syndrome measurements with circuit-level noise /// @param statePrep Initial state preparation operation /// @param numShots Number of measurement shots diff --git a/libs/qec/lib/CMakeLists.txt b/libs/qec/lib/CMakeLists.txt index 86b4e5dd..b81441cd 100644 --- a/libs/qec/lib/CMakeLists.txt +++ b/libs/qec/lib/CMakeLists.txt @@ -10,8 +10,93 @@ set(LIBRARY_NAME cudaq-qec) add_compile_options(-Wno-attributes) -# FIXME?: This must be a shared library. Trying to build a static one will fail. -add_library(${LIBRARY_NAME} SHARED +# cuStabilizer GPU DEM sampling support (required by default). +find_package(cuStabilizer QUIET) +set(_cuStabilizer_location + "include='${cuStabilizer_INCLUDE_DIR}', library='${cuStabilizer_LIBRARY}'") +if(cuStabilizer_VERSION) + string(APPEND _cuStabilizer_location ", version='${cuStabilizer_VERSION}'") +endif() + +# Force a fresh probe each configure invocation. 
+unset(CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS CACHE) +unset(CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS) +if(cuStabilizer_FOUND AND CMAKE_CUDA_COMPILER) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_INCLUDES "${cuStabilizer_INCLUDE_DIR}") + if(CUDAToolkit_INCLUDE_DIRS) + list(APPEND CMAKE_REQUIRED_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) + endif() + # check_cxx_source_compiles links executables by default. Probe using a + # static library target so we only validate header/API availability. + set(_cudaqx_prev_try_compile_target_type "${CMAKE_TRY_COMPILE_TARGET_TYPE}") + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + check_cxx_source_compiles( + " + #include + int main() { + (void)CUSTABILIZER_STATUS_INSUFFICIENT_SPARSE_STORAGE; + using prepare_t = decltype(&custabilizerSampleProbArraySparsePrepare); + using compute_t = decltype(&custabilizerSampleProbArraySparseCompute); + using spmm_t = decltype(&custabilizerGF2SparseDenseMatrixMultiply); + (void)sizeof(prepare_t); + (void)sizeof(compute_t); + (void)sizeof(spmm_t); + return 0; + } + " + CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS + ) + if(DEFINED _cudaqx_prev_try_compile_target_type AND + NOT _cudaqx_prev_try_compile_target_type STREQUAL "") + set(CMAKE_TRY_COMPILE_TARGET_TYPE "${_cudaqx_prev_try_compile_target_type}") + else() + unset(CMAKE_TRY_COMPILE_TARGET_TYPE) + endif() + unset(_cudaqx_prev_try_compile_target_type) + unset(CMAKE_REQUIRED_INCLUDES) +endif() + +if(NOT DEFINED CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS) + set(CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS FALSE) +endif() + +if(cuStabilizer_FOUND AND CMAKE_CUDA_COMPILER AND + CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS) + message(STATUS + "cuStabilizer found (${_cuStabilizer_location}) - GPU DEM sampling enabled") + set(CUDAQ_QEC_HAS_CUSTABILIZER TRUE) + set(CUDAQ_QEC_HAS_CUSTABILIZER TRUE PARENT_SCOPE) +elseif(cuStabilizer_FOUND AND CMAKE_CUDA_COMPILER) + message(WARNING + "cuStabilizer found (${_cuStabilizer_location}), but sparse DEM 
APIs are " + "missing. GPU DEM sampling disabled. Required APIs: " + "custabilizerSampleProbArraySparsePrepare, " + "custabilizerSampleProbArraySparseCompute, " + "custabilizerGF2SparseDenseMatrixMultiply." + ) + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE) + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE PARENT_SCOPE) +else() + message(STATUS "cuStabilizer not found or CUDA unavailable – GPU DEM sampling disabled") + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE) + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE PARENT_SCOPE) +endif() + +if(CUDAQ_QEC_REQUIRE_CUSTABILIZER AND NOT CUDAQ_QEC_HAS_CUSTABILIZER) + message(FATAL_ERROR + "CUDAQ_QEC_REQUIRE_CUSTABILIZER is ON, but required cuStabilizer/CUDA " + "support was not detected.\n" + "Detected: cuStabilizer_FOUND='${cuStabilizer_FOUND}', " + "${_cuStabilizer_location}, " + "cuda_compiler='${CMAKE_CUDA_COMPILER}', " + "sparse_dem_apis='${CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS}'.\n" + "Required APIs: custabilizerSampleProbArraySparsePrepare, " + "custabilizerSampleProbArraySparseCompute, " + "custabilizerGF2SparseDenseMatrixMultiply.") +endif() + +set(QEC_SOURCES code.cpp decoder.cpp detector_error_model.cpp @@ -24,6 +109,16 @@ add_library(${LIBRARY_NAME} SHARED version.cpp ) +if(CUDAQ_QEC_HAS_CUSTABILIZER) + list(APPEND QEC_SOURCES + dem_sampling/dem_sampling_utils.cu + dem_sampling/dem_sampling_gpu.cpp + ) +endif() + +# FIXME?: This must be a shared library. Trying to build a static one will fail. 
+add_library(${LIBRARY_NAME} SHARED ${QEC_SOURCES}) + add_subdirectory(decoders/plugins/example) add_subdirectory(decoders/plugins/pymatching) @@ -38,7 +133,10 @@ if (CUDAQX_QEC_USE_FLOAT) target_compile_definitions(${LIBRARY_NAME} PUBLIC -DCUDAQX_QEC_FLOAT_TYPE=float) endif() + + target_include_directories(${LIBRARY_NAME} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} PUBLIC $ $ @@ -57,6 +155,18 @@ target_link_libraries(${LIBRARY_NAME} cudaq::cudaq-common ) +if(CUDAQ_QEC_HAS_CUSTABILIZER) + target_compile_definitions(${LIBRARY_NAME} PUBLIC CUDAQ_QEC_HAS_CUSTABILIZER) + target_link_libraries(${LIBRARY_NAME} PRIVATE + cuStabilizer::cuStabilizer + CUDA::cudart + ) + set_source_files_properties( + dem_sampling/dem_sampling_utils.cu + PROPERTIES LANGUAGE CUDA + ) +endif() + set_target_properties(${LIBRARY_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/libs/qec/lib/dem_sampling/dem_sampling_gpu.cpp b/libs/qec/lib/dem_sampling/dem_sampling_gpu.cpp new file mode 100644 index 00000000..e173a1ae --- /dev/null +++ b/libs/qec/lib/dem_sampling/dem_sampling_gpu.cpp @@ -0,0 +1,193 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "cudaq/qec/dem_sampling.h" + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER +#include "dem_sampling_utils.h" +#include +#include +#include +#include +#include + +namespace { + +void cuda_check(cudaError_t err, const char *msg) { + if (err != cudaSuccess) + throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(err)); +} + +void custab_check(custabilizerStatus_t s, const char *msg) { + if (s != CUSTABILIZER_STATUS_SUCCESS) + throw std::runtime_error(std::string(msg) + ": " + + custabilizerGetErrorString(s)); +} + +struct StreamCudaDeleter { + cudaStream_t stream = nullptr; + void operator()(void *p) const { + if (p) + cudaFreeAsync(p, stream); + } +}; + +template +using DevicePtr = std::unique_ptr; + +template +DevicePtr device_alloc(size_t count, cudaStream_t stream) { + T *p = nullptr; + cuda_check(cudaMallocAsync(&p, count * sizeof(T), stream), "cudaMallocAsync"); + return DevicePtr(p, StreamCudaDeleter{stream}); +} + +struct HandleRAII { + custabilizerHandle_t h = nullptr; + HandleRAII() { custab_check(custabilizerCreate(&h), "custabilizerCreate"); } + ~HandleRAII() { + if (h) + custabilizerDestroy(h); + } + HandleRAII(const HandleRAII &) = delete; + HandleRAII &operator=(const HandleRAII &) = delete; +}; + +} // namespace +#endif // CUDAQ_QEC_HAS_CUSTABILIZER + +namespace cudaq::qec::dem_sampler::gpu { + +bool sample_dem(const uint8_t *d_check_matrix, size_t num_checks, + size_t num_error_mechanisms, + const double *d_error_probabilities, size_t num_shots, + unsigned seed, uint8_t *d_checks_out, uint8_t *d_errors_out, + std::uintptr_t stream_handle) { +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + if (num_checks == 0 || num_error_mechanisms == 0 || num_shots == 0) + return true; + + using namespace cudaq::qec::dem_sampling_utils; + + cudaStream_t stream = reinterpret_cast(stream_handle); + + HandleRAII handle; + + // ── 1. 
Pack H^T from H directly on device ──────────────────────────────── + // For GF2SparseDenseMatrixMultiply: + // C = A @ B where A is CSR [shots × errors], and B is dense packed + // [errors × n_checks_padded/32]. Here B = H^T packed row-wise. + // + // We avoid a host round-trip by packing H^T directly from H in a single + // kernel. + uint64_t n_checks_padded = ((num_checks + 31) / 32) * 32; + uint64_t check_words = n_checks_padded / 32; + + // Pack H^T row-wise: [num_error_mechanisms × num_checks] -> [num_err × words] + auto d_ht_packed = + device_alloc(num_error_mechanisms * check_words, stream); + pack_check_matrix_transposed_rowwise(d_check_matrix, d_ht_packed.get(), + num_checks, num_error_mechanisms, + stream); + + // ── 2. Sparse Bernoulli sampling ────────────────────────────────────────── + size_t ws_size = 0; + custab_check(custabilizerSampleProbArraySparsePrepare( + handle.h, static_cast(num_shots), + static_cast(num_error_mechanisms), &ws_size), + "SparsePrepare"); + + auto d_workspace = device_alloc(ws_size, stream); + + // Initial capacity estimate: E[nnz] * 1.25 + 1024 + // Copy probabilities to host via the caller's stream to respect stream + // ordering (the data may have been produced by a kernel on this stream). 
+ std::vector h_probs(num_error_mechanisms); + cuda_check(cudaMemcpyAsync(h_probs.data(), d_error_probabilities, + num_error_mechanisms * sizeof(double), + cudaMemcpyDeviceToHost, stream), + "copy probs"); + cuda_check(cudaStreamSynchronize(stream), "sync probs"); + double sum_p = 0.0; + for (auto p : h_probs) + sum_p += p; + uint64_t capacity = static_cast(num_shots * sum_p * 1.25 + 1024); + uint64_t max_cap = num_shots * num_error_mechanisms; + if (capacity > max_cap) + capacity = max_cap; + if (capacity < num_shots) + capacity = num_shots; + + auto d_row_offsets = device_alloc(num_shots + 1, stream); + auto d_col_indices = device_alloc(capacity, stream); + + uint64_t nnz = capacity; + custabilizerStatus_t status = custabilizerSampleProbArraySparseCompute( + handle.h, static_cast(num_shots), + static_cast(num_error_mechanisms), d_error_probabilities, + static_cast(seed), &nnz, d_col_indices.get(), + d_row_offsets.get(), d_workspace.get(), ws_size, stream); + + if (status == CUSTABILIZER_STATUS_INSUFFICIENT_SPARSE_STORAGE) { + // nnz now contains the required capacity + capacity = nnz; + d_col_indices = device_alloc(capacity, stream); + nnz = capacity; + custab_check(custabilizerSampleProbArraySparseCompute( + handle.h, static_cast(num_shots), + static_cast(num_error_mechanisms), + d_error_probabilities, static_cast(seed), &nnz, + d_col_indices.get(), d_row_offsets.get(), + d_workspace.get(), ws_size, stream), + "SparseCompute retry"); + } else { + custab_check(status, "SparseCompute"); + } + + // ── 3. 
GF(2) sparse-dense matmul: syndromes = errors * H^T ────────────── + // A = CSR errors [num_shots × num_error_mechanisms] + // B = H^T packed [num_error_mechanisms × n_checks_padded/32] + // C = packed syndromes [num_shots × n_checks_padded/32] + auto d_syndromes_packed = + device_alloc(num_shots * check_words, stream); + + custab_check(custabilizerGF2SparseDenseMatrixMultiply( + handle.h, static_cast(num_shots), + static_cast(n_checks_padded), + static_cast(num_error_mechanisms), nnz, + d_col_indices.get(), d_row_offsets.get(), d_ht_packed.get(), + 0, // beta=0: assign + d_syndromes_packed.get(), stream), + "GF2SparseDenseMatrixMultiply"); + + // ── 4. Unpack syndromes: bitpacked → uint8 ────────────────────────────── + unpack_syndromes_gpu(d_syndromes_packed.get(), d_checks_out, num_shots, + num_checks, stream); + + // ── 5. CSR errors → dense uint8 ───────────────────────────────────────── + csr_to_dense_fused(d_row_offsets.get(), d_col_indices.get(), num_shots, + num_error_mechanisms, d_errors_out, stream); + + cuda_check(cudaStreamSynchronize(stream), "sync"); + + return true; +#else + (void)d_check_matrix; + (void)num_checks; + (void)num_error_mechanisms; + (void)d_error_probabilities; + (void)num_shots; + (void)seed; + (void)d_checks_out; + (void)d_errors_out; + (void)stream_handle; + return false; +#endif +} + +} // namespace cudaq::qec::dem_sampler::gpu diff --git a/libs/qec/lib/dem_sampling/dem_sampling_utils.cu b/libs/qec/lib/dem_sampling/dem_sampling_utils.cu new file mode 100644 index 00000000..8d2dac0e --- /dev/null +++ b/libs/qec/lib/dem_sampling/dem_sampling_utils.cu @@ -0,0 +1,223 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "dem_sampling_utils.h" +#include +#include +#include + +namespace cudaq::qec::dem_sampling_utils { + +namespace { + +constexpr uint64_t kMaxGridDimX = static_cast(INT32_MAX); +constexpr uint64_t kMaxGridDimYZ = 65535u; + +void check_kernel_launch(const char *kernel_name) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + throw std::runtime_error(std::string(kernel_name) + + " launch: " + cudaGetErrorString(err)); +} + +void check_grid_dims(uint64_t grid_x, uint64_t grid_y, + const char *kernel_name) { + if (grid_x > kMaxGridDimX || grid_y > kMaxGridDimYZ) + throw std::runtime_error(std::string(kernel_name) + ": grid dimensions (" + + std::to_string(grid_x) + ", " + + std::to_string(grid_y) + ") exceed CUDA limits (" + + std::to_string(kMaxGridDimX) + ", " + + std::to_string(kMaxGridDimYZ) + ")"); +} + +} // namespace + +// ── Row-wise packing: [M × N] uint8 → [M × words] uint32 ────────────────── +__global__ void +pack_check_matrix_rowwise_kernel(const uint8_t *__restrict__ d_check_matrix, + uint32_t *__restrict__ d_check_packed, + uint32_t M, uint32_t N, + uint32_t errors_words) { + const uint32_t row = blockIdx.x; + const uint32_t word_idx = blockIdx.y * blockDim.x + threadIdx.x; + + if (row >= M || word_idx >= errors_words) + return; + + const uint8_t *row_ptr = d_check_matrix + static_cast(row) * N; + const uint32_t bit_start = word_idx * 32; + + uint32_t packed = 0; +#pragma unroll + for (uint32_t bit = 0; bit < 32; bit++) { + const uint32_t col = bit_start + bit; + if (col < N) { + packed |= (static_cast(__ldg(&row_ptr[col]) & 1u) << bit); + } + } + + d_check_packed[static_cast(row) * errors_words + word_idx] = packed; +} + +void pack_check_matrix_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_packed, uint64_t M, uint64_t N, + cudaStream_t stream) { + if (M == 0 || N == 0) + return; + + uint64_t words = bitpack_words(N); + dim3 
block(256); + uint64_t grid_x = M; + uint64_t grid_y = (words + block.x - 1) / block.x; + check_grid_dims(grid_x, grid_y, "pack_check_matrix_rowwise"); + dim3 grid(static_cast(grid_x), static_cast(grid_y)); + + pack_check_matrix_rowwise_kernel<<>>( + d_check_matrix, d_check_packed, static_cast(M), + static_cast(N), static_cast(words)); + check_kernel_launch("pack_check_matrix_rowwise"); +} + +// ── Direct transpose+pack: H [checks x errors] → packed H^T [errors x words] +__global__ void pack_check_matrix_transposed_rowwise_kernel( + const uint8_t *__restrict__ d_check_matrix, + uint32_t *__restrict__ d_check_t_packed, uint32_t num_checks, + uint32_t num_error_mechanisms, uint32_t checks_words) { + const uint32_t error_mech = blockIdx.x; + const uint32_t word_idx = blockIdx.y * blockDim.x + threadIdx.x; + + if (error_mech >= num_error_mechanisms || word_idx >= checks_words) + return; + + const uint32_t bit_start = word_idx * 32; + uint32_t packed = 0; +#pragma unroll + for (uint32_t bit = 0; bit < 32; ++bit) { + const uint32_t check = bit_start + bit; + if (check < num_checks) { + const uint64_t idx = + static_cast(check) * num_error_mechanisms + error_mech; + packed |= + (static_cast(__ldg(&d_check_matrix[idx]) & 1u) << bit); + } + } + + d_check_t_packed[static_cast(error_mech) * checks_words + + word_idx] = packed; +} + +void pack_check_matrix_transposed_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_t_packed, + uint64_t num_checks, + uint64_t num_error_mechanisms, + cudaStream_t stream) { + if (num_checks == 0 || num_error_mechanisms == 0) + return; + + const uint64_t checks_words = bitpack_words(num_checks); + dim3 block(256); + uint64_t grid_x = num_error_mechanisms; + uint64_t grid_y = (checks_words + block.x - 1) / block.x; + check_grid_dims(grid_x, grid_y, "pack_check_matrix_transposed_rowwise"); + dim3 grid(static_cast(grid_x), static_cast(grid_y)); + + pack_check_matrix_transposed_rowwise_kernel<<>>( + d_check_matrix, d_check_t_packed, 
static_cast(num_checks), + static_cast(num_error_mechanisms), + static_cast(checks_words)); + check_kernel_launch("pack_check_matrix_transposed_rowwise"); +} + +// ── Syndrome unpacking: [S × words] uint32 → [S × M] uint8 ──────────────── +__global__ void unpack_syndromes_kernel(const uint32_t *__restrict__ d_packed, + uint8_t *__restrict__ d_unpacked, + uint64_t S, uint64_t M) { + const uint64_t shot = blockIdx.x; + const uint64_t check = + static_cast(blockIdx.y) * blockDim.x + threadIdx.x; + + if (check >= M) + return; + + const uint64_t words_per_row = (M + 31) >> 5; + const uint32_t word_idx = static_cast(check >> 5); + const uint32_t bit_pos = static_cast(check & 31); + + const uint32_t word = __ldg(&d_packed[shot * words_per_row + word_idx]); + d_unpacked[shot * M + check] = static_cast((word >> bit_pos) & 1u); +} + +void unpack_syndromes_gpu(const uint32_t *d_packed, uint8_t *d_unpacked, + uint64_t num_shots, uint64_t num_checks, + cudaStream_t stream) { + if (num_shots == 0 || num_checks == 0) + return; + + const int threads = 256; + uint64_t grid_x = num_shots; + uint64_t grid_y = (num_checks + threads - 1) / threads; + check_grid_dims(grid_x, grid_y, "unpack_syndromes"); + dim3 grid(static_cast(grid_x), static_cast(grid_y)); + + unpack_syndromes_kernel<<>>(d_packed, d_unpacked, + num_shots, num_checks); + check_kernel_launch("unpack_syndromes"); +} + +// ── CSR → dense (uint64_t indices, warp-cooperative fused zeroing) ───────── +__global__ void +csr_to_dense_fused_kernel(const uint64_t *__restrict__ d_row_offsets, + const uint64_t *__restrict__ d_col_indices, + uint64_t num_shots, uint64_t num_error_mechanisms, + uint8_t *__restrict__ d_errors_out) { + const uint32_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + const uint32_t lane = threadIdx.x % 32; + + if (warp_id >= num_shots) + return; + + const uint64_t shot = warp_id; + uint8_t *shot_out = d_errors_out + shot * num_error_mechanisms; + + for (uint64_t i = lane; i < 
num_error_mechanisms; i += 32) + shot_out[i] = 0; + + __syncwarp(); + + const uint64_t start = d_row_offsets[shot]; + const uint64_t end = d_row_offsets[shot + 1]; + const uint64_t num_errors = end - start; + + for (uint64_t e = lane; e < num_errors; e += 32) { + uint64_t idx = d_col_indices[start + e]; + if (idx < num_error_mechanisms) { + shot_out[idx] = 1; + } + } +} + +void csr_to_dense_fused(const uint64_t *d_row_offsets, + const uint64_t *d_col_indices, uint64_t num_shots, + uint64_t num_error_mechanisms, uint8_t *d_errors_out, + cudaStream_t stream) { + if (num_shots == 0 || num_error_mechanisms == 0) + return; + + const int warps_per_block = 8; + const int threads_per_block = warps_per_block * 32; + uint64_t num_blocks_u64 = (num_shots + warps_per_block - 1) / warps_per_block; + check_grid_dims(num_blocks_u64, 1, "csr_to_dense_fused"); + const int num_blocks = static_cast(num_blocks_u64); + + csr_to_dense_fused_kernel<<>>( + d_row_offsets, d_col_indices, num_shots, num_error_mechanisms, + d_errors_out); + check_kernel_launch("csr_to_dense_fused"); +} + +} // namespace cudaq::qec::dem_sampling_utils diff --git a/libs/qec/lib/dem_sampling/dem_sampling_utils.h b/libs/qec/lib/dem_sampling/dem_sampling_utils.h new file mode 100644 index 00000000..cb24c2b5 --- /dev/null +++ b/libs/qec/lib/dem_sampling/dem_sampling_utils.h @@ -0,0 +1,47 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ +#pragma once + +#include +#include + +namespace cudaq::qec::dem_sampling_utils { + +inline __host__ __device__ uint64_t bitpack_words(uint64_t num_bits) { + return (num_bits + 31) / 32; +} + +/// Pack [M x N] uint8 matrix row-wise into [M x words] uint32, where +/// words = ceil(N/32). Bit i in word w encodes column w*32+i. +void pack_check_matrix_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_packed, uint64_t M, uint64_t N, + cudaStream_t stream = 0); + +/// Pack H^T directly from H without an explicit transpose buffer. +/// Input H is [num_checks x num_error_mechanisms] uint8. +/// Output packed H^T is [num_error_mechanisms x ceil(num_checks/32)] uint32. +void pack_check_matrix_transposed_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_t_packed, + uint64_t num_checks, + uint64_t num_error_mechanisms, + cudaStream_t stream = 0); + +/// Unpack [S x words] uint32 syndromes into [S x M] uint8. +void unpack_syndromes_gpu(const uint32_t *d_packed, uint8_t *d_unpacked, + uint64_t num_shots, uint64_t num_checks, + cudaStream_t stream = 0); + +/// Convert CSR errors (uint64_t offsets/indices from cuStabilizer) to +/// dense [num_shots x num_error_mechanisms] uint8, fusing zeroing with +/// scatter. The warp-cooperative kernel avoids a separate cudaMemset. +void csr_to_dense_fused(const uint64_t *d_row_offsets, + const uint64_t *d_col_indices, uint64_t num_shots, + uint64_t num_error_mechanisms, uint8_t *d_errors_out, + cudaStream_t stream = 0); + +} // namespace cudaq::qec::dem_sampling_utils diff --git a/libs/qec/lib/experiments.cpp b/libs/qec/lib/experiments.cpp index 6c4155e4..6c899dde 100644 --- a/libs/qec/lib/experiments.cpp +++ b/libs/qec/lib/experiments.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. 
* + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -8,6 +8,8 @@ #include "cudaq/qec/experiments.h" #include "device/memory_circuit.h" +#include "cudaq/qec/dem_sampling.h" +#include using namespace cudaqx; @@ -83,6 +85,90 @@ sample_code_capacity(const code &code, std::size_t nShots, seed); } +} // namespace cudaq::qec + +namespace cudaq::qec::dem_sampler::cpu { + +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities, unsigned seed) { + if (check_matrix.rank() != 2) + throw std::invalid_argument("check_matrix must be rank-2"); + + size_t num_checks = check_matrix.shape()[0]; + size_t num_error_mechanisms = check_matrix.shape()[1]; + + if (error_probabilities.size() != num_error_mechanisms) + throw std::invalid_argument( + "error_probabilities size must match number of error mechanisms"); + + for (double p : error_probabilities) { + if (!std::isfinite(p) || p < 0.0 || p > 1.0) + throw std::invalid_argument( + "error_probabilities entries must be finite values in [0, 1]"); + } + + if (numShots == 0) { + cudaqx::tensor errors({0, num_error_mechanisms}); + cudaqx::tensor checks({0, num_checks}); + return std::make_tuple(checks, errors); + } + + std::mt19937 rng(seed); + std::vector distributions; + distributions.reserve(num_error_mechanisms); + for (double p : error_probabilities) + distributions.emplace_back(p); + + cudaqx::tensor H_binary({num_checks, num_error_mechanisms}); + std::vector h_bin(num_checks * num_error_mechanisms); + for (size_t i = 0; i < num_checks; ++i) + for (size_t j = 0; j < num_error_mechanisms; ++j) + h_bin[i * num_error_mechanisms + j] = + check_matrix.at({i, j}) & static_cast(1); + H_binary.copy(h_bin.data(), H_binary.shape()); + + cudaqx::tensor errors({numShots, num_error_mechanisms}); + cudaqx::tensor checks({numShots, num_checks}); + 
+ std::vector error_bits(numShots * num_error_mechanisms); + for (size_t shot = 0; shot < numShots; ++shot) { + for (size_t err = 0; err < num_error_mechanisms; ++err) { + error_bits[shot * num_error_mechanisms + err] = distributions[err](rng); + } + } + + errors.copy(error_bits.data(), errors.shape()); + checks = errors.dot(H_binary.transpose()) % 2; + + return std::make_tuple(checks, errors); +} + +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities) { + return sample_dem(check_matrix, numShots, error_probabilities, + std::random_device()()); +} + +} // namespace cudaq::qec::dem_sampler::cpu + +namespace cudaq::qec { + +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t nShots, + const std::vector &error_probabilities) { + return dem_sampler::cpu::sample_dem(check_matrix, nShots, + error_probabilities); +} + +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t nShots, + const std::vector &error_probabilities, unsigned seed) { + return dem_sampler::cpu::sample_dem(check_matrix, nShots, error_probabilities, + seed); +} + std::tuple, cudaqx::tensor> sample_memory_circuit(const code &code, operation statePrep, std::size_t numShots, std::size_t numRounds, diff --git a/libs/qec/pyproject.toml.cu12 b/libs/qec/pyproject.toml.cu12 index 63ba5aee..ed18e8f9 100644 --- a/libs/qec/pyproject.toml.cu12 +++ b/libs/qec/pyproject.toml.cu12 @@ -13,6 +13,7 @@ readme = "README.md" license = "LicenseRef-NVIDIA-Proprietary" dependencies = [ 'cuda-quantum-cu12 == 0.14.*', + 'cuquantum-python-cu12>=26.03.0', ] classifiers = [ 'Intended Audience :: Science/Research', @@ -49,6 +50,7 @@ ninja.version = ">=1.10" [tool.scikit-build.cmake.define] CUDAQX_QEC_INCLUDE_TESTS = false CUDAQX_QEC_BINDINGS_PYTHON = true +CUDAQ_QEC_REQUIRE_CUSTABILIZER = true [tool.setuptools_scm] write_to = "_version.py" @@ -57,16 +59,17 @@ write_to = 
"_version.py" tensor_network_decoder = [ "quimb", "opt_einsum", - "torch", - "cuquantum-python-cu12==26.01.0" + "cuquantum-python-cu12>=26.3.0" ] trt_decoder = [ "tensorrt-cu12" ] +dem_sampling = [ + "cuquantum-python-cu12>=26.3.0" +] all = [ "quimb", "opt_einsum", - "torch", - "cuquantum-python-cu12==26.01.0", + "cuquantum-python-cu12>=26.3.0", "tensorrt-cu12; platform_machine == 'x86_64'" ] diff --git a/libs/qec/pyproject.toml.cu13 b/libs/qec/pyproject.toml.cu13 index 5179eb90..fcbd2fb6 100644 --- a/libs/qec/pyproject.toml.cu13 +++ b/libs/qec/pyproject.toml.cu13 @@ -13,6 +13,7 @@ readme = "README.md" license = "LicenseRef-NVIDIA-Proprietary" dependencies = [ 'cuda-quantum-cu13 == 0.14.*', + 'cuquantum-python-cu13>=26.03.0', ] classifiers = [ 'Intended Audience :: Science/Research', @@ -49,6 +50,7 @@ ninja.version = ">=1.10" [tool.scikit-build.cmake.define] CUDAQX_QEC_INCLUDE_TESTS = false CUDAQX_QEC_BINDINGS_PYTHON = true +CUDAQ_QEC_REQUIRE_CUSTABILIZER = true [tool.setuptools_scm] write_to = "_version.py" @@ -57,16 +59,17 @@ write_to = "_version.py" tensor_network_decoder = [ "quimb", "opt_einsum", - "torch>=2.9.0", - "cuquantum-python-cu13==26.01.0" + "cuquantum-python-cu13>=26.3.0" ] trt_decoder = [ "tensorrt-cu13" ] +dem_sampling = [ + "cuquantum-python-cu13>=26.3.0" +] all = [ "quimb", "opt_einsum", - "torch>=2.9.0", - "cuquantum-python-cu13==26.01.0", + "cuquantum-python-cu13>=26.3.0", "tensorrt-cu13" ] diff --git a/libs/qec/python/CMakeLists.txt b/libs/qec/python/CMakeLists.txt index ae23fd4b..01e1f348 100644 --- a/libs/qec/python/CMakeLists.txt +++ b/libs/qec/python/CMakeLists.txt @@ -29,6 +29,7 @@ cudaqx_add_pymodule(${MODULE_NAME} bindings/py_decoder.cpp bindings/py_decoding_config.cpp bindings/py_decoding.cpp + bindings/py_dem_sampling.cpp bindings/py_surface_code.cpp bindings/py_patch.cpp ) @@ -40,6 +41,12 @@ target_link_libraries(${MODULE_NAME} cudaq::cudaq-python-interop ) +if(CUDAQ_QEC_HAS_CUSTABILIZER) + find_package(CUDAToolkit REQUIRED) + 
target_link_libraries(${MODULE_NAME} PRIVATE CUDA::cudart) + target_include_directories(${MODULE_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +endif() + set_target_properties(${MODULE_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/python/cudaq_qec") diff --git a/libs/qec/python/bindings/cudaqx_qec.cpp b/libs/qec/python/bindings/cudaqx_qec.cpp index a52df904..bd8c514b 100644 --- a/libs/qec/python/bindings/cudaqx_qec.cpp +++ b/libs/qec/python/bindings/cudaqx_qec.cpp @@ -10,6 +10,7 @@ #include "py_decoder.h" #include "py_decoding.h" #include "py_decoding_config.h" +#include "py_dem_sampling.h" #include "py_surface_code.h" #include @@ -21,5 +22,6 @@ PYBIND11_MODULE(_pycudaqx_qec_the_suffix_matters_cudaq_qec, mod) { cudaq::qec::bindDecoder(mod); cudaq::qec::decoding::config::bindDecodingConfig(mod); cudaq::qec::decoding::bindDecoding(mod); + cudaq::qec::dem_sampler::bindDemSampling(mod); cudaq::qec::surface_code::bindSurfaceCode(mod); } diff --git a/libs/qec/python/bindings/py_decoder.cpp b/libs/qec/python/bindings/py_decoder.cpp index bd40560c..ef17d3a4 100644 --- a/libs/qec/python/bindings/py_decoder.cpp +++ b/libs/qec/python/bindings/py_decoder.cpp @@ -322,7 +322,7 @@ void bindDecoder(py::module &mod) { "Parity check matrix must be an array of uint8_t."); } - if (buf.strides[0] == buf.itemsize) { + if (buf.strides[0] < buf.strides[buf.ndim - 1]) { throw std::runtime_error( "Parity check matrix must be in row-major order, but " "column-major order was detected."); diff --git a/libs/qec/python/bindings/py_dem_sampling.cpp b/libs/qec/python/bindings/py_dem_sampling.cpp new file mode 100644 index 00000000..b6ddffc0 --- /dev/null +++ b/libs/qec/python/bindings/py_dem_sampling.cpp @@ -0,0 +1,628 @@ +/******************************************************************************* + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "type_casters.h" +#include "cudaq/qec/dem_sampling.h" + +#include "cuda-qx/core/kwargs_utils.h" + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER +#include +#endif + +namespace py = pybind11; +using namespace pybind11::literals; + +namespace cudaq::qec::dem_sampler { + +namespace { + +/// Convert a py::object (numpy array or PyTorch tensor) to a numpy uint8 array. +py::array_t asNumpyUint8(py::object obj) { + py::module_ np = py::module_::import("numpy"); + try { + py::module_ torch = py::module_::import("torch"); + if (py::isinstance(obj, torch.attr("Tensor"))) { + py::object t = obj.attr("detach")(); + if (t.attr("is_cuda").cast()) + t = t.attr("cpu")(); + obj = t.attr("numpy")(); + } + } catch (py::error_already_set &) { + PyErr_Clear(); + } + return np.attr("ascontiguousarray")(obj, "dtype"_a = np.attr("uint8")) + .cast>(); +} + +/// Convert a py::object (numpy array or PyTorch tensor) to a numpy float64 +/// array. 
+py::array_t asNumpyFloat64(py::object obj) { + py::module_ np = py::module_::import("numpy"); + try { + py::module_ torch = py::module_::import("torch"); + if (py::isinstance(obj, torch.attr("Tensor"))) { + py::object t = obj.attr("detach")(); + if (t.attr("is_cuda").cast()) + t = t.attr("cpu")(); + obj = t.attr("numpy")(); + } + } catch (py::error_already_set &) { + PyErr_Clear(); + } + auto arr = np.attr("ascontiguousarray")(obj, "dtype"_a = np.attr("float64")) + .cast>(); + py::buffer_info info = arr.request(); + if (info.ndim != 1) + throw std::runtime_error( + "error_probabilities must be a 1-D array, got ndim=" + + std::to_string(info.ndim)); + return arr; +} + +enum class DemSamplingBackend { Auto, Cpu, Gpu }; + +DemSamplingBackend parseBackend(std::string backend) { + std::transform( + backend.begin(), backend.end(), backend.begin(), + [](unsigned char ch) { return static_cast(std::tolower(ch)); }); + + if (backend == "auto") + return DemSamplingBackend::Auto; + if (backend == "cpu") + return DemSamplingBackend::Cpu; + if (backend == "gpu") + return DemSamplingBackend::Gpu; + + throw std::runtime_error("dem_sampling: invalid backend '" + backend + + "'. 
Expected one of: auto, cpu, gpu."); +} + +void validateInputShapes(const py::array_t &check_matrix_np, + const py::array_t &error_probs_np) { + py::buffer_info h_buf = check_matrix_np.request(); + if (h_buf.ndim != 2) + throw std::runtime_error("check_matrix must be rank-2"); + + py::buffer_info p_buf = error_probs_np.request(); + auto num_mechanisms = static_cast(h_buf.shape[1]); + auto num_probs = static_cast(p_buf.shape[0]); + if (num_probs != num_mechanisms) { + throw std::runtime_error("dem_sampling: error_probabilities length (" + + std::to_string(num_probs) + + ") != check_matrix columns (" + + std::to_string(num_mechanisms) + ")"); + } +} + +void validateProbabilityValues(const py::array_t &error_probs_np) { + py::buffer_info p_buf = error_probs_np.request(); + auto *probs = static_cast(p_buf.ptr); + auto count = static_cast(p_buf.shape[0]); + + for (std::size_t i = 0; i < count; ++i) { + const double p = probs[i]; + if (!std::isfinite(p) || p < 0.0 || p > 1.0) { + throw std::runtime_error( + "dem_sampling: error_probabilities[" + std::to_string(i) + + "] is invalid. 
Values must be finite and in [0, 1]."); + } + } +} + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + +struct CudaDeleter { + void operator()(void *p) const { + if (p) + cudaFree(p); + } +}; + +template +using DevicePtr = std::unique_ptr; + +template +DevicePtr device_alloc(std::size_t count) { + T *p = nullptr; + if (cudaMalloc(&p, count * sizeof(T)) != cudaSuccess) { + cudaGetLastError(); + return DevicePtr(nullptr); + } + return DevicePtr(p); +} + +class ScopedCudaDevice { +public: + ScopedCudaDevice(int target_device, std::string &failure_reason) { + auto get_device_status = cudaGetDevice(&previous_device); + if (get_device_status != cudaSuccess) { + failure_reason = std::string("cudaGetDevice failed: ") + + cudaGetErrorString(get_device_status); + return; + } + + if (target_device < 0) + target_device = previous_device; + active_device = target_device; + + if (previous_device != active_device) { + auto set_device_status = cudaSetDevice(active_device); + if (set_device_status != cudaSuccess) { + failure_reason = std::string("cudaSetDevice failed: ") + + cudaGetErrorString(set_device_status); + return; + } + } + + is_active = true; + } + + ~ScopedCudaDevice() { + if (is_active && previous_device != active_device) + cudaSetDevice(previous_device); + } + + bool ok() const { return is_active; } + +private: + int previous_device = -1; + int active_device = -1; + bool is_active = false; +}; + +bool getTorchDeviceIndex(const py::module_ &torch, const py::object &device, + int &device_index, std::string &failure_reason) { + if (!py::isinstance(device, torch.attr("device"))) { + failure_reason = "torch device object is invalid"; + return false; + } + + py::object index_obj = device.attr("index"); + if (!index_obj.is_none()) { + device_index = index_obj.cast(); + } else { + auto get_device_status = cudaGetDevice(&device_index); + if (get_device_status != cudaSuccess) { + failure_reason = std::string("cudaGetDevice failed: ") + + cudaGetErrorString(get_device_status); + return false; 
+ } + } + + int device_count = 0; + auto count_status = cudaGetDeviceCount(&device_count); + if (count_status != cudaSuccess) { + failure_reason = std::string("cudaGetDeviceCount failed: ") + + cudaGetErrorString(count_status); + return false; + } + if (device_index < 0 || device_index >= device_count) { + failure_reason = "invalid CUDA device index " + + std::to_string(device_index) + + " (device_count=" + std::to_string(device_count) + ")"; + return false; + } + + return true; +} + +bool getTorchCurrentStreamHandle(const py::module_ &torch, + const py::object &device, + std::uintptr_t &stream_handle, + std::string &failure_reason) { + try { + py::object stream_obj = torch.attr("cuda").attr("current_stream")(device); + stream_handle = stream_obj.attr("cuda_stream").cast(); + return true; + } catch (py::error_already_set &e) { + failure_reason = + std::string("failed to query torch CUDA stream: ") + e.what(); + PyErr_Clear(); + return false; + } +} + +bool tryGpuSampling(const py::array_t &check_matrix_np, + const py::array_t &error_probs_np, + std::size_t numShots, unsigned seed, + py::array_t &syndromes_out, + py::array_t &errors_out, + std::string &failure_reason) { + int device_count = 0; + auto device_count_status = cudaGetDeviceCount(&device_count); + if (device_count_status != cudaSuccess) { + failure_reason = std::string("cudaGetDeviceCount failed: ") + + cudaGetErrorString(device_count_status); + return false; + } + if (device_count == 0) { + failure_reason = "no CUDA device available"; + return false; + } + + py::buffer_info h_buf = check_matrix_np.request(); + if (h_buf.ndim != 2) { + failure_reason = "check_matrix must be rank-2"; + return false; + } + auto num_checks = static_cast(h_buf.shape[0]); + auto num_mechanisms = static_cast(h_buf.shape[1]); + + py::buffer_info p_buf = error_probs_np.request(); + auto num_probs = static_cast(p_buf.shape[0]); + if (num_probs != num_mechanisms) { + failure_reason = "error_probabilities length mismatch"; + return 
false; + } + + std::size_t h_bytes = num_checks * num_mechanisms * sizeof(uint8_t); + std::size_t p_bytes = num_mechanisms * sizeof(double); + std::size_t syn_bytes = numShots * num_checks * sizeof(uint8_t); + std::size_t err_bytes = numShots * num_mechanisms * sizeof(uint8_t); + + auto d_H = device_alloc(num_checks * num_mechanisms); + if (!d_H) { + failure_reason = "failed to allocate device memory for check_matrix"; + return false; + } + auto d_probs = device_alloc(num_mechanisms); + if (!d_probs) { + failure_reason = "failed to allocate device memory for error_probabilities"; + return false; + } + auto d_syn = device_alloc(numShots * num_checks); + if (!d_syn) { + failure_reason = "failed to allocate device memory for syndromes"; + return false; + } + auto d_err = device_alloc(numShots * num_mechanisms); + if (!d_err) { + failure_reason = "failed to allocate device memory for errors"; + return false; + } + + auto copy_h_status = + cudaMemcpy(d_H.get(), h_buf.ptr, h_bytes, cudaMemcpyHostToDevice); + if (copy_h_status != cudaSuccess) { + failure_reason = std::string("failed to copy check_matrix to device: ") + + cudaGetErrorString(copy_h_status); + cudaGetLastError(); + return false; + } + auto copy_p_status = + cudaMemcpy(d_probs.get(), p_buf.ptr, p_bytes, cudaMemcpyHostToDevice); + if (copy_p_status != cudaSuccess) { + failure_reason = std::string("failed to copy probabilities to device: ") + + cudaGetErrorString(copy_p_status); + cudaGetLastError(); + return false; + } + + bool ok = + gpu::sample_dem(d_H.get(), num_checks, num_mechanisms, d_probs.get(), + numShots, seed, d_syn.get(), d_err.get()); + + if (!ok) { + failure_reason = "cuStabilizer GPU sampler unavailable at runtime"; + return false; + } + + if (ok) { + syndromes_out = + py::array_t({static_cast(numShots), + static_cast(num_checks)}); + errors_out = + py::array_t({static_cast(numShots), + static_cast(num_mechanisms)}); + + auto copy_syn_status = cudaMemcpy(syndromes_out.mutable_data(), 
d_syn.get(), + syn_bytes, cudaMemcpyDeviceToHost); + if (copy_syn_status != cudaSuccess) { + failure_reason = std::string("failed to copy syndromes to host: ") + + cudaGetErrorString(copy_syn_status); + cudaGetLastError(); + return false; + } + + auto copy_err_status = cudaMemcpy(errors_out.mutable_data(), d_err.get(), + err_bytes, cudaMemcpyDeviceToHost); + if (copy_err_status != cudaSuccess) { + failure_reason = std::string("failed to copy errors to host: ") + + cudaGetErrorString(copy_err_status); + cudaGetLastError(); + return false; + } + } + + failure_reason.clear(); + return ok; +} + +bool tryTorchGpuSampling(py::object check_matrix_obj, + py::object error_probs_obj, std::size_t numShots, + unsigned seed, py::tuple &result_out, + std::string &failure_reason, bool allow_move_to_cuda) { + py::module_ torch; + try { + torch = py::module_::import("torch"); + } catch (py::error_already_set &) { + PyErr_Clear(); + failure_reason = "PyTorch is not available"; + return false; + } + + if (!py::isinstance(check_matrix_obj, torch.attr("Tensor")) || + !py::isinstance(error_probs_obj, torch.attr("Tensor"))) { + failure_reason = "inputs are not both torch.Tensor"; + return false; + } + + py::object check_t = check_matrix_obj; + py::object probs_t = error_probs_obj; + + bool check_is_cuda = check_t.attr("is_cuda").cast(); + bool probs_is_cuda = probs_t.attr("is_cuda").cast(); + + py::object device; + if (check_is_cuda) { + device = check_t.attr("device"); + } else if (probs_is_cuda) { + device = probs_t.attr("device"); + } else if (allow_move_to_cuda) { + int device_count = 0; + auto device_count_status = cudaGetDeviceCount(&device_count); + if (device_count_status != cudaSuccess) { + failure_reason = std::string("cudaGetDeviceCount failed: ") + + cudaGetErrorString(device_count_status); + return false; + } + if (device_count == 0) { + failure_reason = "no CUDA device available"; + return false; + } + device = torch.attr("device")("cuda"); + } else { + failure_reason = 
"torch tensors are not on CUDA device"; + return false; + } + + check_t = + check_t.attr("to")("device"_a = device, "dtype"_a = torch.attr("uint8")); + probs_t = probs_t.attr("to")("device"_a = device, + "dtype"_a = torch.attr("float64")); + check_t = check_t.attr("contiguous")(); + probs_t = probs_t.attr("contiguous")(); + + auto check_shape = check_t.attr("shape").cast(); + if (check_shape.size() != 2) { + failure_reason = "check_matrix must be rank-2"; + return false; + } + + auto probs_shape = probs_t.attr("shape").cast(); + if (probs_shape.size() != 1) { + failure_reason = "error_probabilities must be rank-1"; + return false; + } + + auto num_checks = + static_cast(check_shape[0].cast()); + auto num_mechanisms = + static_cast(check_shape[1].cast()); + auto num_probs = static_cast(probs_shape[0].cast()); + + if (num_probs != num_mechanisms) { + failure_reason = "error_probabilities length mismatch"; + return false; + } + + bool all_finite = + torch.attr("isfinite")(probs_t).attr("all")().attr("item")().cast(); + bool any_below_zero = + probs_t.attr("lt")(0.0).attr("any")().attr("item")().cast(); + bool any_above_one = + probs_t.attr("gt")(1.0).attr("any")().attr("item")().cast(); + if (!all_finite || any_below_zero || any_above_one) { + failure_reason = "error_probabilities must be finite values in [0, 1]"; + return false; + } + + py::object actual_device = check_t.attr("device"); + int target_device = -1; + if (!getTorchDeviceIndex(torch, actual_device, target_device, failure_reason)) + return false; + + ScopedCudaDevice device_guard(target_device, failure_reason); + if (!device_guard.ok()) + return false; + + std::uintptr_t stream_handle = 0; + if (!getTorchCurrentStreamHandle(torch, actual_device, stream_handle, + failure_reason)) + return false; + + py::object syndromes_t = + torch.attr("empty")(py::make_tuple(static_cast(numShots), + static_cast(num_checks)), + "dtype"_a = torch.attr("uint8"), "device"_a = device); + py::object errors_t = 
torch.attr("empty")( + py::make_tuple(static_cast(numShots), + static_cast(num_mechanisms)), + "dtype"_a = torch.attr("uint8"), "device"_a = device); + + auto d_H = reinterpret_cast( + check_t.attr("data_ptr")().cast()); + auto d_probs = reinterpret_cast( + probs_t.attr("data_ptr")().cast()); + auto d_syn = reinterpret_cast( + syndromes_t.attr("data_ptr")().cast()); + auto d_err = reinterpret_cast( + errors_t.attr("data_ptr")().cast()); + + bool ok = gpu::sample_dem(d_H, num_checks, num_mechanisms, d_probs, numShots, + seed, d_syn, d_err, stream_handle); + if (!ok) { + failure_reason = "cuStabilizer GPU sampler unavailable at runtime"; + return false; + } + + result_out = py::make_tuple(syndromes_t, errors_t); + failure_reason.clear(); + return true; +} +#endif // CUDAQ_QEC_HAS_CUSTABILIZER + +} // namespace + +void bindDemSampling(py::module &mod) { + auto qecmod = py::hasattr(mod, "qecrt") + ? mod.attr("qecrt").cast() + : mod.def_submodule("qecrt"); + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + qecmod.attr("dem_sampling_has_gpu_compiled") = py::bool_(true); +#else + qecmod.attr("dem_sampling_has_gpu_compiled") = py::bool_(false); +#endif + + qecmod.def( + "dem_sampling", + [](py::object check_matrix_obj, std::size_t numShots, + py::object error_probs_obj, std::optional seed, + std::string backend) -> py::tuple { + const auto backend_mode = parseBackend(std::move(backend)); + + if (numShots == 0) { + auto check_matrix_np = asNumpyUint8(check_matrix_obj); + auto error_probs_np = asNumpyFloat64(error_probs_obj); + validateInputShapes(check_matrix_np, error_probs_np); + py::buffer_info h_buf = check_matrix_np.request(); + auto num_checks = static_cast(h_buf.shape[0]); + auto num_mechanisms = static_cast(h_buf.shape[1]); + py::module_ np = py::module_::import("numpy"); + return py::make_tuple( + np.attr("empty")(py::make_tuple(0, num_checks), + "dtype"_a = np.attr("uint8")), + np.attr("empty")(py::make_tuple(0, num_mechanisms), + "dtype"_a = np.attr("uint8"))); + } + + 
unsigned actual_seed = + seed.has_value() ? seed.value() + : static_cast(std::random_device{}()); + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + std::string torch_gpu_failure_reason = "not attempted"; + if (backend_mode != DemSamplingBackend::Cpu) { + py::tuple torch_result; + // In "gpu" mode we may move torch CPU tensors to CUDA. In "auto" + // mode we preserve existing behavior and only use this path for + // CUDA tensors. + bool allow_move_to_cuda = backend_mode == DemSamplingBackend::Gpu; + if (tryTorchGpuSampling(check_matrix_obj, error_probs_obj, numShots, + actual_seed, torch_result, + torch_gpu_failure_reason, allow_move_to_cuda)) + return torch_result; + } +#endif + + auto check_matrix_np = asNumpyUint8(check_matrix_obj); + auto error_probs_np = asNumpyFloat64(error_probs_obj); + validateInputShapes(check_matrix_np, error_probs_np); + validateProbabilityValues(error_probs_np); + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + if (backend_mode != DemSamplingBackend::Cpu) { + py::array_t syndromes_gpu, errors_gpu; + std::string gpu_failure_reason; + if (tryGpuSampling(check_matrix_np, error_probs_np, numShots, + actual_seed, syndromes_gpu, errors_gpu, + gpu_failure_reason)) + return py::make_tuple(syndromes_gpu, errors_gpu); + + if (backend_mode == DemSamplingBackend::Gpu) { + throw std::runtime_error( + "dem_sampling: GPU backend requested but unavailable. 
" + "torch path: " + + torch_gpu_failure_reason + + "; numpy path: " + gpu_failure_reason); + } + } +#elif !defined(CUDAQ_QEC_HAS_CUSTABILIZER) + if (backend_mode == DemSamplingBackend::Gpu) { + throw std::runtime_error( + "dem_sampling: GPU backend requested, but this build does not " + "include cuStabilizer support."); + } +#endif + + auto H = cudaqx::toTensor(check_matrix_np); + + py::buffer_info p_buf = error_probs_np.request(); + std::vector probs(static_cast(p_buf.ptr), + static_cast(p_buf.ptr) + + p_buf.shape[0]); + + auto [syndromes, errors] = + cpu::sample_dem(H, numShots, probs, actual_seed); + + return py::make_tuple( + cudaq::python::copyCUDAQXTensorToPyArray(syndromes), + cudaq::python::copyCUDAQXTensorToPyArray(errors)); + }, + R"pbdoc( + Sample syndrome measurements from a detector error model. + + Generates random error vectors according to the given per-mechanism + error probabilities, then computes syndromes via the check matrix. + Backend selection is controlled by the `backend` argument. + - "auto" (default): try GPU first, fall back to CPU. + - "cpu": always run the CPU implementation. + - "gpu": require GPU; raise if unavailable. + + The check_matrix and error_probabilities arguments accept both + NumPy arrays and PyTorch tensors. + + If PyTorch CUDA tensors are used on the GPU path, outputs are returned + as PyTorch CUDA tensors (device pointers preserved). Otherwise outputs + are NumPy arrays. + + Args: + check_matrix: Binary check matrix [num_checks x num_error_mechanisms], + as a NumPy uint8 array or PyTorch tensor. + numShots: Number of measurement shots to sample. + error_probabilities: Per-error-mechanism probabilities + [num_error_mechanisms], as a NumPy float64 + array or PyTorch tensor. + seed: Optional RNG seed for reproducibility. + backend: "auto", "cpu", or "gpu". Default is "auto". + + Returns: + A tuple (syndromes, errors) with shapes [numShots x num_checks] + and [numShots x num_error_mechanisms]. 
+ )pbdoc", + py::arg("check_matrix"), py::arg("numShots"), + py::arg("error_probabilities"), py::arg("seed") = py::none(), + py::arg("backend") = "auto"); +} + +} // namespace cudaq::qec::dem_sampler diff --git a/libs/qec/python/bindings/py_dem_sampling.h b/libs/qec/python/bindings/py_dem_sampling.h new file mode 100644 index 00000000..1d290210 --- /dev/null +++ b/libs/qec/python/bindings/py_dem_sampling.h @@ -0,0 +1,15 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include + +namespace py = pybind11; + +namespace cudaq::qec::dem_sampler { +void bindDemSampling(py::module &mod); +} // namespace cudaq::qec::dem_sampler diff --git a/libs/qec/python/cudaq_qec/__init__.py b/libs/qec/python/cudaq_qec/__init__.py index 19a2c9d3..faa7fe3c 100644 --- a/libs/qec/python/cudaq_qec/__init__.py +++ b/libs/qec/python/cudaq_qec/__init__.py @@ -7,7 +7,18 @@ # ============================================================================ # from .patch import patch -from ._pycudaqx_qec_the_suffix_matters_cudaq_qec import * +try: + from ._pycudaqx_qec_the_suffix_matters_cudaq_qec import * +except ImportError as exc: + err = str(exc) + if "libcustabilizer" in err or "cuStabilizer" in err: + raise ImportError( + "Failed to load cudaq_qec native extension because cuStabilizer " + "runtime libraries are missing. 
Install the matching cuQuantum " + "package for your CUDA wheel (for example, " + "'cuquantum-python-cu12>=26.03.0' or " + "'cuquantum-python-cu13>=26.03.0').") from exc + raise __version__ = qecrt.__version__ code = qecrt.code @@ -57,6 +68,8 @@ stabilizer_grid = qecrt.stabilizer_grid role_to_str = qecrt.role_to_str +from .dem_sampling import dem_sampling + from .plugins import decoders, codes import pkgutil, importlib, traceback diff --git a/libs/qec/python/cudaq_qec/dem_sampling.py b/libs/qec/python/cudaq_qec/dem_sampling.py new file mode 100644 index 00000000..99b29063 --- /dev/null +++ b/libs/qec/python/cudaq_qec/dem_sampling.py @@ -0,0 +1,66 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +"""DEM sampling via the C++ pybind11 binding (GPU with cuStabilizer + CPU fallback). + +Public API: + dem_sampling( + check_matrix, num_shots, error_probabilities, seed=None, backend="auto" + ) + +When compiled with cuStabilizer, the function delegates to the GPU sampler. +Otherwise it falls back to the CPU implementation. Both paths are handled +transparently by the C++ binding. + +The check_matrix and error_probabilities arguments accept both NumPy arrays +and PyTorch tensors, enabling direct integration with AI/ML workflows. +""" + +from __future__ import annotations + +from typing import Optional, Tuple + +__all__ = ["dem_sampling"] + + +def dem_sampling( + check_matrix, + num_shots: int, + error_probabilities, + seed: Optional[int] = None, + backend: str = "auto", +) -> Tuple[object, object]: + """Sample errors and syndromes from a Detector Error Model. 
+ + Args: + check_matrix: Binary matrix [num_checks x num_error_mechanisms], + as a NumPy uint8 array or PyTorch tensor. + num_shots: Number of independent Monte-Carlo shots. + error_probabilities: 1-D array of length num_error_mechanisms with + independent Bernoulli probabilities for each mechanism. + Accepts NumPy float64 array or PyTorch tensor. + seed: Optional RNG seed for reproducibility. + backend: Backend selection policy: + - "auto" (default): try GPU, fall back to CPU. + - "cpu": force CPU implementation. + - "gpu": force GPU implementation and raise if unavailable. + + Returns: + (syndromes, errors) where + syndromes: uint8 array/tensor [num_shots x num_checks] + errors: uint8 array/tensor [num_shots x num_error_mechanisms] + + When compiled with cuStabilizer the function uses GPU-accelerated + sampling. Otherwise it falls back to the CPU implementation. + Both NumPy arrays and PyTorch tensors are accepted as inputs. + For PyTorch CUDA tensors on GPU path, outputs are PyTorch CUDA tensors; + otherwise outputs are NumPy arrays. + """ + from . import _pycudaqx_qec_the_suffix_matters_cudaq_qec as _qecmod + + return _qecmod.qecrt.dem_sampling(check_matrix, num_shots, + error_probabilities, seed, backend) diff --git a/libs/qec/python/tests/test_dem_sampling.py b/libs/qec/python/tests/test_dem_sampling.py new file mode 100644 index 00000000..c1079c35 --- /dev/null +++ b/libs/qec/python/tests/test_dem_sampling.py @@ -0,0 +1,608 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # +"""Tests for cudaq_qec.dem_sampling (GPU with cuStabilizer + CPU fallback).""" + +import numpy as np +import pytest + +import cudaq_qec as qec +from cudaq_qec.dem_sampling import dem_sampling + + +def _compute_syndrome(H, errors): + """Reference: syndromes = errors @ H^T mod 2.""" + return (errors @ H.T) % 2 + + +def _has_runtime_gpu_backend(): + """True only when GPU backend is both compiled and usable at runtime.""" + if not getattr(qec.qecrt, "dem_sampling_has_gpu_compiled", False): + return False + + H = np.array([[1]], dtype=np.uint8) + probs = np.array([0.0], dtype=np.float64) + try: + dem_sampling(H, 1, probs, seed=0, backend="gpu") + return True + except RuntimeError: + return False + + +_HAS_RUNTIME_GPU_BACKEND = _has_runtime_gpu_backend() + +# --------------------------------------------------------------------------- +# Deterministic tests (numpy input) +# --------------------------------------------------------------------------- + + +class TestAllZeroProbabilities: + + def test_all_zeros(self): + H = np.array([[1, 0, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], dtype=np.uint8) + probs = np.zeros(4) + syndromes, errors = dem_sampling(H, 10, probs, seed=0) + + assert syndromes.shape == (10, 3) + assert errors.shape == (10, 4) + np.testing.assert_array_equal(errors, 0) + np.testing.assert_array_equal(syndromes, 0) + + +class TestAllOneProbabilities: + + def test_all_ones(self): + H = np.array([[1, 0, 1, 0], [1, 1, 0, 1], [0, 1, 1, 0]], dtype=np.uint8) + probs = np.ones(4) + syndromes, errors = dem_sampling(H, 5, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + for shot in range(5): + assert syndromes[shot, 0] == 0 + assert syndromes[shot, 1] == 1 + assert syndromes[shot, 2] == 0 + + +class TestMixedDeterministicProbs: + + def test_mixed(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 0.0, 1.0]) + syndromes, errors = 
dem_sampling(H, 8, probs, seed=42) + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + + for shot in range(8): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +class TestIdentityMatrix: + + def test_identity_all_ones(self): + H = np.eye(5, dtype=np.uint8) + probs = np.ones(5) + syndromes, errors = dem_sampling(H, 3, probs, seed=7) + + np.testing.assert_array_equal(errors, 1) + np.testing.assert_array_equal(syndromes, errors) + + def test_identity_syndrome_equals_errors(self): + H = np.eye(10, dtype=np.uint8) + probs = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.float64) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + np.testing.assert_array_equal(syndromes, errors) + expected = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected) + + +class TestAllOnesMatrix: + + def test_even_columns(self): + H = np.ones((3, 4), dtype=np.uint8) + probs = np.ones(4) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + np.testing.assert_array_equal(syndromes, 0) + + def test_odd_columns(self): + H = np.ones((3, 3), dtype=np.uint8) + probs = np.ones(3) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + np.testing.assert_array_equal(syndromes, 1) + + +class TestSingleColumnMatrix: + + def test_single_column(self): + H = np.array([[1], [0], [1]], dtype=np.uint8) + probs = np.array([1.0]) + syndromes, errors = dem_sampling(H, 6, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + expected_syn = np.array([1, 0, 1], dtype=np.uint8) + for shot in range(6): + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +class TestSingleRowMatrix: + + def test_single_row(self): + H = np.array([[1, 1, 0, 1, 0]], dtype=np.uint8) + probs = 
np.array([1.0, 0.0, 1.0, 0.0, 1.0]) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + expected_errors = np.array([1, 0, 1, 0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected_errors) + assert syndromes[shot, 0] == 1 + + +class TestSingleShot: + + def test_single_shot(self): + H = np.array([[1, 1, 0], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 1.0, 0.0]) + syndromes, errors = dem_sampling(H, 1, probs, seed=0) + + assert errors.shape == (1, 3) + np.testing.assert_array_equal(errors[0], [1, 1, 0]) + np.testing.assert_array_equal(syndromes[0], [0, 1]) + + +class TestRepetitionCode: + + def test_single_error(self): + H = np.array([[1, 1, 0, 0], [0, 1, 1, 0], [0, 0, 1, 1]], dtype=np.uint8) + probs = np.array([0.0, 1.0, 0.0, 0.0]) + syndromes, errors = dem_sampling(H, 3, probs, seed=0) + + expected_errors = np.array([0, 1, 0, 0], dtype=np.uint8) + expected_syn = np.array([1, 1, 0], dtype=np.uint8) + + for shot in range(3): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +class TestSyndromeConsistency: + + def test_fixed_matrix(self): + H = np.array([ + [1, 0, 1, 0, 0, 1, 0, 0], + [0, 1, 0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 1, 0, 0, 0, 0, 1, 1], + ], + dtype=np.uint8) + probs = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + syndromes, errors = dem_sampling(H, 500, probs, seed=7) + + expected = _compute_syndrome(H, errors) + np.testing.assert_array_equal(syndromes, expected) + + +class TestSeedReproducibility: + + def test_same_seed(self): + H = np.eye(5, dtype=np.uint8) + probs = np.array([0.1, 0.3, 0.5, 0.7, 0.9]) + + s1, e1 = dem_sampling(H, 100, probs, seed=42) + s2, e2 = dem_sampling(H, 100, probs, seed=42) + + np.testing.assert_array_equal(s1, s2) + np.testing.assert_array_equal(e1, e2) + + def test_different_seeds_differ(self): + H = np.eye(5, dtype=np.uint8) + probs = np.full(5, 0.5) + + _, e1 = 
dem_sampling(H, 1000, probs, seed=1) + _, e2 = dem_sampling(H, 1000, probs, seed=2) + + assert not np.array_equal(e1, e2) + + +class TestOutputProperties: + + def test_shapes_and_dtypes(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([0.1, 0.2, 0.3]) + syndromes, errors = dem_sampling(H, 10, probs, seed=0) + + assert syndromes.shape == (10, 2) + assert errors.shape == (10, 3) + assert syndromes.dtype == np.uint8 + assert errors.dtype == np.uint8 + + def test_binary_output(self): + H = np.eye(5, dtype=np.uint8) + probs = np.full(5, 0.5) + syndromes, errors = dem_sampling(H, 100, probs, seed=42) + + assert set(np.unique(syndromes)).issubset({0, 1}) + assert set(np.unique(errors)).issubset({0, 1}) + + +class TestInputValidation: + + def test_non_2d_matrix(self): + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(np.zeros(5, dtype=np.uint8), 10, np.zeros(5)) + + def test_mismatched_probs(self): + H = np.eye(5, dtype=np.uint8) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, np.zeros(3)) + + def test_non_1d_probs(self): + H = np.eye(5, dtype=np.uint8) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, np.zeros((5, 1))) + + def test_probability_below_zero(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, -0.2, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="cpu") + + def test_probability_above_one(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, 1.2, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="cpu") + + def test_probability_nan(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, np.nan, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="cpu") + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + 
def test_force_gpu_rejects_invalid_probability_numpy(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, 1.1, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="gpu") + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + def test_force_gpu_rejects_invalid_probability_torch_cuda(self): + torch_mod = pytest.importorskip("torch") + if not torch_mod.cuda.is_available(): + pytest.skip("PyTorch CUDA unavailable") + + H = torch_mod.eye(3, dtype=torch_mod.uint8, device="cuda") + probs = torch_mod.tensor([0.1, 1.1, 0.3], + dtype=torch_mod.float64, + device="cuda") + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="gpu") + + +class TestBackendSelection: + + def test_invalid_backend(self): + H = np.eye(2, dtype=np.uint8) + probs = np.array([0.1, 0.2], dtype=np.float64) + with pytest.raises(RuntimeError): + dem_sampling(H, 4, probs, seed=1, backend="not-a-backend") + + def test_force_cpu_backend(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 0.0, 1.0], dtype=np.float64) + syndromes, errors = dem_sampling(H, 5, probs, seed=11, backend="cpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(5): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + def test_force_gpu_backend(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 0.0, 1.0], dtype=np.float64) + syndromes, errors = dem_sampling(H, 5, probs, seed=11, backend="gpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(5): + 
np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +# --------------------------------------------------------------------------- +# PyTorch tensor input tests +# --------------------------------------------------------------------------- + +try: + import torch + _HAS_TORCH = True +except ImportError: + _HAS_TORCH = False + + +@pytest.mark.skipif(not _HAS_TORCH, reason="PyTorch not installed") +class TestPyTorchInput: + + def test_cpu_tensors(self): + H = torch.tensor([[1, 0, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], + dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0, 0.0], dtype=torch.float64) + syndromes, errors = dem_sampling(H, 5, probs, seed=42) + + assert isinstance(syndromes, np.ndarray) + assert isinstance(errors, np.ndarray) + assert syndromes.shape == (5, 3) + assert errors.shape == (5, 4) + + expected_errors = np.array([1, 0, 1, 0], dtype=np.uint8) + for shot in range(5): + np.testing.assert_array_equal(errors[shot], expected_errors) + + def test_matches_numpy_input(self): + """Verify PyTorch and NumPy inputs produce identical results.""" + H_np = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs_np = np.array([0.3, 0.5, 0.7]) + + H_torch = torch.from_numpy(H_np) + probs_torch = torch.from_numpy(probs_np) + + s_np, e_np = dem_sampling(H_np, 100, probs_np, seed=99) + s_torch, e_torch = dem_sampling(H_torch, 100, probs_torch, seed=99) + + np.testing.assert_array_equal(s_np, s_torch) + np.testing.assert_array_equal(e_np, e_torch) + + def test_force_cpu_backend(self): + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float64) + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="cpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected_errors) + 
np.testing.assert_array_equal(syndromes[shot], expected_syn) + + def test_cpu_torch_probs_with_grad(self): + """CPU path should accept torch tensors that require gradients.""" + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0], + dtype=torch.float64, + requires_grad=True) + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="cpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + def test_force_gpu_backend(self): + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float64) + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="gpu") + + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert syndromes.is_cuda + assert errors.is_cuda + assert syndromes.data_ptr() > 0 + assert errors.data_ptr() > 0 + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot].cpu().numpy(), + expected_errors) + np.testing.assert_array_equal(syndromes[shot].cpu().numpy(), + expected_syn) + + @pytest.mark.skipif(not (_HAS_TORCH and torch.cuda.is_available()), + reason="CUDA not available for PyTorch") + def test_cuda_tensors(self): + """CUDA tensors should stay on device in torch GPU path.""" + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8).cuda() + probs = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float64).cuda() + syndromes, errors = dem_sampling(H, 4, probs, seed=7) + + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert 
syndromes.is_cuda + assert errors.is_cuda + assert tuple(syndromes.shape) == (4, 2) + assert tuple(errors.shape) == (4, 3) + assert syndromes.data_ptr() > 0 + assert errors.data_ptr() > 0 + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot].cpu().numpy(), + expected_errors) + np.testing.assert_array_equal(syndromes[shot].cpu().numpy(), + expected_syn) + + @pytest.mark.skipif( + not (_HAS_RUNTIME_GPU_BACKEND and _HAS_TORCH and + torch.cuda.is_available()), + reason="GPU backend unavailable for CUDA stream test", + ) + def test_non_default_cuda_stream(self): + """GPU path must honor torch's current (non-default) CUDA stream.""" + device = torch.device("cuda") + H = torch.tensor([[1, 0, 1], [0, 1, 1]], + dtype=torch.uint8, + device=device) + probs = torch.tensor([1.0, 0.0, 1.0], + dtype=torch.float64, + device=device) + stream = torch.cuda.Stream(device=device) + + with torch.cuda.stream(stream): + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="gpu") + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert syndromes.device.type == "cuda" + assert errors.device.type == "cuda" + + expected_errors = torch.tensor([1, 0, 1], + dtype=torch.uint8, + device=device) + expected_syn = torch.tensor([0, 1], + dtype=torch.uint8, + device=device) + for shot in range(4): + assert torch.equal(errors[shot], expected_errors) + assert torch.equal(syndromes[shot], expected_syn) + + stream.synchronize() + + @pytest.mark.skipif( + not (_HAS_RUNTIME_GPU_BACKEND and _HAS_TORCH and + torch.cuda.device_count() > 1), + reason="Requires runtime GPU backend and at least two CUDA devices", + ) + def test_multi_gpu_device_routing(self): + """GPU path should execute on and return tensors from the input device.""" + device = torch.device("cuda:1") + H = torch.tensor([[1, 0, 1], [0, 1, 1]], + dtype=torch.uint8, + 
device=device) + probs = torch.tensor([1.0, 0.0, 1.0], + dtype=torch.float64, + device=device) + + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="gpu") + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert syndromes.device == device + assert errors.device == device + + expected_errors = torch.tensor([1, 0, 1], + dtype=torch.uint8, + device=device) + expected_syn = torch.tensor([0, 1], dtype=torch.uint8, device=device) + for shot in range(4): + assert torch.equal(errors[shot], expected_errors) + assert torch.equal(syndromes[shot], expected_syn) + + def test_syndrome_consistency_torch(self): + """Verify syndrome = errors @ H^T mod 2 with PyTorch input.""" + H_np = np.array([ + [1, 0, 1, 0, 0, 1, 0, 0], + [0, 1, 0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 1, 0, 0, 0, 0, 1, 1], + ], + dtype=np.uint8) + probs_np = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + + H = torch.from_numpy(H_np) + probs = torch.from_numpy(probs_np) + + syndromes, errors = dem_sampling(H, 200, probs, seed=7) + expected = _compute_syndrome(H_np, errors) + np.testing.assert_array_equal(syndromes, expected) + + +# --------------------------------------------------------------------------- +# Edge-case tests +# --------------------------------------------------------------------------- + + +class TestZeroShots: + + def test_zero_shots_numpy(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([0.1, 0.2, 0.3]) + syndromes, errors = dem_sampling(H, 0, probs, seed=0) + + assert syndromes.shape == (0, 2) + assert errors.shape == (0, 3) + assert syndromes.dtype == np.uint8 + assert errors.dtype == np.uint8 + + def test_zero_shots_cpu_backend(self): + H = np.array([[1, 0], [0, 1]], dtype=np.uint8) + probs = np.array([0.5, 0.5]) + syndromes, errors = dem_sampling(H, 0, probs, seed=0, backend="cpu") + + assert syndromes.shape == (0, 2) + assert errors.shape == (0, 2) + + @pytest.mark.skipif(not 
_HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable") + def test_zero_shots_gpu_backend(self): + H = np.array([[1, 0], [0, 1]], dtype=np.uint8) + probs = np.array([0.5, 0.5]) + syndromes, errors = dem_sampling(H, 0, probs, seed=0, backend="gpu") + + assert syndromes.shape == (0, 2) + assert errors.shape == (0, 2) + + +class TestNonBinaryCheckMatrix: + + def test_non_binary_entries_masked(self): + """H entries > 1 should be treated as H & 1 (binary).""" + H = np.array([[2, 3], [1, 0]], dtype=np.uint8) + probs = np.array([1.0, 1.0]) + syndromes, errors = dem_sampling(H, 4, probs, seed=0, backend="cpu") + + np.testing.assert_array_equal(errors, 1) + for shot in range(4): + assert syndromes[shot, 0] == 1, "Binarized [0,1]: sum=1 mod 2=1" + assert syndromes[shot, 1] == 1, "Binarized [1,0]: sum=1 mod 2=1" + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable") + def test_non_binary_cpu_gpu_match(self): + """CPU and GPU must agree on non-binary H after binarization.""" + H = np.array([[2, 3], [1, 0]], dtype=np.uint8) + probs = np.array([1.0, 1.0]) + + syn_cpu, err_cpu = dem_sampling(H, 4, probs, seed=0, backend="cpu") + syn_gpu, err_gpu = dem_sampling(H, 4, probs, seed=0, backend="gpu") + + np.testing.assert_array_equal(err_cpu, err_gpu) + np.testing.assert_array_equal(syn_cpu, syn_gpu) + + +class TestSeedlessPath: + + def test_seedless_runs(self): + """Calling without seed should succeed and produce valid output.""" + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([0.3, 0.5, 0.7]) + syndromes, errors = dem_sampling(H, 50, probs) + + assert syndromes.shape == (50, 2) + assert errors.shape == (50, 3) + assert set(np.unique(syndromes)).issubset({0, 1}) + assert set(np.unique(errors)).issubset({0, 1}) + + def test_seedless_nondeterministic(self): + """Two seedless calls should very likely produce different results.""" + H = np.eye(10, dtype=np.uint8) + probs = np.full(10, 0.5) + _, e1 = dem_sampling(H, 500, 
probs) + _, e2 = dem_sampling(H, 500, probs) + assert not np.array_equal(e1, e2) diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 430e8fdd..1d435d22 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -48,6 +48,15 @@ target_link_libraries(test_qec PRIVATE GTest::gtest_main cudaq-qec cudaq::cudaq- add_dependencies(CUDAQXQECUnitTests test_qec) gtest_discover_tests(test_qec) +add_executable(test_dem_sampling test_dem_sampling.cpp) +target_link_libraries(test_dem_sampling PRIVATE GTest::gtest_main cudaq-qec cudaq::cudaq) +if(TARGET cuStabilizer::cuStabilizer) + target_link_libraries(test_dem_sampling PRIVATE CUDA::cudart) + target_include_directories(test_dem_sampling PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +endif() +add_dependencies(CUDAQXQECUnitTests test_dem_sampling) +gtest_discover_tests(test_dem_sampling) + # TensorRT decoder test is only built for x86 architectures if(CUDAQ_QEC_BUILD_TRT_DECODER AND CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") diff --git a/libs/qec/unittests/test_dem_sampling.cpp b/libs/qec/unittests/test_dem_sampling.cpp new file mode 100644 index 00000000..16847c92 --- /dev/null +++ b/libs/qec/unittests/test_dem_sampling.cpp @@ -0,0 +1,801 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "cudaq/qec/dem_sampling.h" +#include "cudaq/qec/experiments.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER +#include +#endif + +namespace { + +/// Reference syndrome: checks = errors * H^T mod 2 (single shot, flat layout) +std::vector compute_syndrome(const std::vector &H, + const std::vector &errors, + size_t num_checks, size_t num_errors) { + std::vector syndrome(num_checks, 0); + for (size_t col = 0; col < num_errors; col++) { + if (errors[col]) { + for (size_t row = 0; row < num_checks; row++) { + syndrome[row] ^= H[row * num_errors + col]; + } + } + } + return syndrome; +} + +/// Build a cudaqx tensor from a flat uint8 vector. +cudaqx::tensor make_tensor(const std::vector &data, + size_t rows, size_t cols) { + cudaqx::tensor t({rows, cols}); + t.copy(data.data(), t.shape()); + return t; +} + +} // namespace + +// ============================================================================= +// CPU tests +// ============================================================================= + +TEST(DemSamplingCPU, AllZeroProbabilities) { + // H = | 1 0 1 0 | probs = [0, 0, 0, 0] + // | 0 1 1 0 | + // | 0 0 0 1 | + // No errors should ever fire, so errors=0 and checks=0. 
+ auto H = make_tensor({1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1}, 3, 4); + std::vector probs(4, 0.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 10, probs, 42); + + EXPECT_EQ(checks.shape()[0], 10u); + EXPECT_EQ(checks.shape()[1], 3u); + EXPECT_EQ(errors.shape()[0], 10u); + EXPECT_EQ(errors.shape()[1], 4u); + + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 3; j++) + EXPECT_EQ(checks.at({i, j}), 0); + for (size_t j = 0; j < 4; j++) + EXPECT_EQ(errors.at({i, j}), 0); + } +} + +TEST(DemSamplingCPU, AllOneProbabilities) { + // H = | 1 0 1 0 | probs = [1, 1, 1, 1] + // | 1 1 0 1 | + // | 0 1 1 0 | + // Every error fires. Syndrome for all-ones error = each row summed mod 2. + // row 0: 1+0+1+0 = 2 mod 2 = 0 + // row 1: 1+1+0+1 = 3 mod 2 = 1 + // row 2: 0+1+1+0 = 2 mod 2 = 0 + std::vector H_data = {1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0}; + auto H = make_tensor(H_data, 3, 4); + std::vector probs(4, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 5, probs, 42); + + for (size_t shot = 0; shot < 5; shot++) { + for (size_t e = 0; e < 4; e++) + EXPECT_EQ(errors.at({shot, e}), 1); + EXPECT_EQ(checks.at({shot, 0}), 0); + EXPECT_EQ(checks.at({shot, 1}), 1); + EXPECT_EQ(checks.at({shot, 2}), 0); + } +} + +TEST(DemSamplingCPU, MixedDeterministicProbs) { + // H = | 1 0 1 | probs = [1, 0, 1] + // | 0 1 1 | + // Errors = [1, 0, 1] every shot. 
+ // check 0: 1*1 + 0*0 + 1*1 = 2 mod 2 = 0 + // check 1: 0*1 + 1*0 + 1*1 = 1 mod 2 = 1 + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + std::vector probs = {1.0, 0.0, 1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 8, probs, 99); + + for (size_t shot = 0; shot < 8; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(errors.at({shot, 1}), 0); + EXPECT_EQ(errors.at({shot, 2}), 1); + EXPECT_EQ(checks.at({shot, 0}), 0); + EXPECT_EQ(checks.at({shot, 1}), 1); + } +} + +TEST(DemSamplingCPU, IdentityMatrix) { + // H = I_5 with p=1: syndromes must equal errors (all ones). + auto H = make_tensor({1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1}, + 5, 5); + std::vector probs(5, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 3, probs, 7); + + for (size_t shot = 0; shot < 3; shot++) + for (size_t j = 0; j < 5; j++) { + EXPECT_EQ(errors.at({shot, j}), 1); + EXPECT_EQ(checks.at({shot, j}), 1); + } +} + +TEST(DemSamplingCPU, AllOnesMatrixEvenColumns) { + // H = 3x4 all-ones. p = [1,1,1,1]. + // Every error fires: 4 ones per row, 4 mod 2 = 0. All syndromes = 0. + std::vector H_data(3 * 4, 1); + auto H = make_tensor(H_data, 3, 4); + std::vector probs(4, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) + for (size_t c = 0; c < 3; c++) + EXPECT_EQ(checks.at({shot, c}), 0) << "Even column count => syndrome 0"; +} + +TEST(DemSamplingCPU, AllOnesMatrixOddColumns) { + // H = 3x3 all-ones. p = [1,1,1]. + // Every error fires: 3 ones per row, 3 mod 2 = 1. All syndromes = 1. 
+ std::vector H_data(3 * 3, 1); + auto H = make_tensor(H_data, 3, 3); + std::vector probs(3, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) + for (size_t c = 0; c < 3; c++) + EXPECT_EQ(checks.at({shot, c}), 1) << "Odd column count => syndrome 1"; +} + +TEST(DemSamplingCPU, SingleColumnMatrix) { + // H = | 1 | p = [1.0] + // | 0 | + // | 1 | + // The single error always fires. Syndrome = column = [1, 0, 1]. + auto H = make_tensor({1, 0, 1}, 3, 1); + std::vector probs = {1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 6, probs, 0); + + for (size_t shot = 0; shot < 6; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(checks.at({shot, 0}), 1); + EXPECT_EQ(checks.at({shot, 1}), 0); + EXPECT_EQ(checks.at({shot, 2}), 1); + } +} + +TEST(DemSamplingCPU, SingleRowMatrix) { + // H = | 1 1 0 1 0 | p = [1, 0, 1, 0, 1] + // Errors = [1, 0, 1, 0, 1]. + // Single check: 1*1 + 1*0 + 0*1 + 1*0 + 0*1 = 1 mod 2 = 1. 
+ auto H = make_tensor({1, 1, 0, 1, 0}, 1, 5); + std::vector probs = {1.0, 0.0, 1.0, 0.0, 1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(errors.at({shot, 1}), 0); + EXPECT_EQ(errors.at({shot, 2}), 1); + EXPECT_EQ(errors.at({shot, 3}), 0); + EXPECT_EQ(errors.at({shot, 4}), 1); + EXPECT_EQ(checks.at({shot, 0}), 1); + } +} + +TEST(DemSamplingCPU, SingleShot) { + auto H = make_tensor({1, 1, 0, 0, 1, 1}, 2, 3); + std::vector probs = {1.0, 1.0, 0.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 1, probs, 0); + + EXPECT_EQ(errors.shape()[0], 1u); + EXPECT_EQ(errors.at({0, 0}), 1); + EXPECT_EQ(errors.at({0, 1}), 1); + EXPECT_EQ(errors.at({0, 2}), 0); + // check 0: 1*1 + 1*1 + 0*0 = 2 mod 2 = 0 + // check 1: 0*1 + 1*1 + 1*0 = 1 mod 2 = 1 + EXPECT_EQ(checks.at({0, 0}), 0); + EXPECT_EQ(checks.at({0, 1}), 1); +} + +TEST(DemSamplingCPU, RejectOutOfRangeProbabilities) { + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + + std::vector probs_negative = {0.1, -0.2, 0.3}; + EXPECT_THROW( + cudaq::qec::dem_sampler::cpu::sample_dem(H, 16, probs_negative, 1), + std::invalid_argument); + + std::vector probs_above_one = {0.1, 1.2, 0.3}; + EXPECT_THROW( + cudaq::qec::dem_sampler::cpu::sample_dem(H, 16, probs_above_one, 1), + std::invalid_argument); + + std::vector probs_nan = { + 0.1, std::numeric_limits::quiet_NaN(), 0.3}; + EXPECT_THROW(cudaq::qec::dem_sampler::cpu::sample_dem(H, 16, probs_nan, 1), + std::invalid_argument); +} + +TEST(DemSamplingCPU, SeedReproducibility) { + auto H = make_tensor({1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}, 3, 5); + std::vector probs = {0.1, 0.3, 0.5, 0.7, 0.9}; + + auto [c1, e1] = cudaq::qec::dem_sampler::cpu::sample_dem(H, 100, probs, 42); + auto [c2, e2] = cudaq::qec::dem_sampler::cpu::sample_dem(H, 100, probs, 42); + + for (size_t i = 0; i < 100; i++) { + for (size_t j = 0; j < 
3; j++) + EXPECT_EQ(c1.at({i, j}), c2.at({i, j})); + for (size_t j = 0; j < 5; j++) + EXPECT_EQ(e1.at({i, j}), e2.at({i, j})); + } +} + +TEST(DemSamplingCPU, SyndromeConsistency) { + // Use a fixed, known H and verify syndrome = errors * H^T mod 2 per shot. + std::vector H_data = {1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, + 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, + 0, 1, 1, 1, 0, 0, 0, 0, 1, 1}; + const size_t num_checks = 4, num_errors = 8, num_shots = 200; + auto H = make_tensor(H_data, num_checks, num_errors); + std::vector probs = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, num_shots, probs, 42); + + for (size_t shot = 0; shot < num_shots; shot++) { + std::vector shot_errors(num_errors); + for (size_t e = 0; e < num_errors; e++) + shot_errors[e] = errors.at({shot, e}); + + auto expected = + compute_syndrome(H_data, shot_errors, num_checks, num_errors); + for (size_t c = 0; c < num_checks; c++) + EXPECT_EQ(checks.at({shot, c}), expected[c]) + << "Shot " << shot << " check " << c; + } +} + +TEST(DemSamplingCPU, RepetitionCodeParity) { + // Repetition code: H = [[1,1,0,0], [0,1,1,0], [0,0,1,1]] + // Single error on qubit 1 (index 1): flips checks 0 and 1. 
+ auto H = make_tensor({1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1}, 3, 4); + std::vector probs = {0.0, 1.0, 0.0, 0.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 3, probs, 0); + + for (size_t shot = 0; shot < 3; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 0); + EXPECT_EQ(errors.at({shot, 1}), 1); + EXPECT_EQ(errors.at({shot, 2}), 0); + EXPECT_EQ(errors.at({shot, 3}), 0); + EXPECT_EQ(checks.at({shot, 0}), 1); + EXPECT_EQ(checks.at({shot, 1}), 1); + EXPECT_EQ(checks.at({shot, 2}), 0); + } +} + +TEST(DemSamplingCPU, BackwardsCompatibility) { + auto H = make_tensor({1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}, 3, 5); + std::vector probs = {0.1, 0.2, 0.15, 0.05, 0.25}; + unsigned seed = 42; + + auto [old_checks, old_errors] = cudaq::qec::dem_sampling(H, 100, probs, seed); + auto [new_checks, new_errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 100, probs, seed); + + for (size_t i = 0; i < 100; i++) { + for (size_t j = 0; j < 3; j++) + EXPECT_EQ(old_checks.at({i, j}), new_checks.at({i, j})); + for (size_t j = 0; j < 5; j++) + EXPECT_EQ(old_errors.at({i, j}), new_errors.at({i, j})); + } +} + +TEST(DemSamplingCPU, ZeroShots) { + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + std::vector probs = {0.5, 0.5, 0.5}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 0, probs, 42); + + EXPECT_EQ(checks.shape()[0], 0u); + EXPECT_EQ(checks.shape()[1], 2u); + EXPECT_EQ(errors.shape()[0], 0u); + EXPECT_EQ(errors.shape()[1], 3u); +} + +TEST(DemSamplingCPU, NonBinaryCheckMatrixMasked) { + // H entries > 1 should be treated as H & 1 (matching GPU behavior). 
+ // H = [[2, 3], [1, 0]] -> binarized [[0, 1], [1, 0]] + // probs = [1, 1] -> errors = [1, 1] + // syndromes with binarized H: [0^1, 1^0] = [1, 1] + auto H = make_tensor({2, 3, 1, 0}, 2, 2); + std::vector probs = {1.0, 1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(errors.at({shot, 1}), 1); + EXPECT_EQ(checks.at({shot, 0}), 1) + << "Binarized row 0: [0,1], sum = 1 mod 2 = 1"; + EXPECT_EQ(checks.at({shot, 1}), 1) + << "Binarized row 1: [1,0], sum = 1 mod 2 = 1"; + } +} + +TEST(DemSamplingCPU, SeedlessPathRuns) { + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + std::vector probs = {0.5, 0.5, 0.5}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 10, probs); + + EXPECT_EQ(checks.shape()[0], 10u); + EXPECT_EQ(errors.shape()[0], 10u); + for (size_t shot = 0; shot < 10; shot++) + for (size_t c = 0; c < 2; c++) + EXPECT_TRUE(checks.at({shot, c}) == 0 || checks.at({shot, c}) == 1); +} + +// ============================================================================= +// GPU tests (guarded by CUDAQ_QEC_HAS_CUSTABILIZER) +// ============================================================================= + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + +class DemSamplingGPU : public ::testing::Test { +protected: + static bool has_gpu() { + int count = 0; + return cudaGetDeviceCount(&count) == cudaSuccess && count > 0; + } + void SetUp() override { + if (!has_gpu()) + GTEST_SKIP() << "No GPU available"; + } + + /// RAII wrapper for a set of GPU DEM sampling buffers. 
+ struct GpuBuffers { + uint8_t *d_H = nullptr, *d_checks = nullptr, *d_errors = nullptr; + double *d_probs = nullptr; + size_t num_checks, num_errors, num_shots; + + GpuBuffers(const std::vector &H, const std::vector &probs, + size_t checks, size_t errors, size_t shots) + : num_checks(checks), num_errors(errors), num_shots(shots) { + EXPECT_EQ(cudaMalloc(&d_H, checks * errors), cudaSuccess); + EXPECT_EQ(cudaMalloc(&d_probs, errors * sizeof(double)), cudaSuccess); + EXPECT_EQ(cudaMalloc(&d_checks, shots * checks), cudaSuccess); + EXPECT_EQ(cudaMalloc(&d_errors, shots * errors), cudaSuccess); + EXPECT_EQ( + cudaMemcpy(d_H, H.data(), checks * errors, cudaMemcpyHostToDevice), + cudaSuccess); + EXPECT_EQ(cudaMemcpy(d_probs, probs.data(), errors * sizeof(double), + cudaMemcpyHostToDevice), + cudaSuccess); + } + + ~GpuBuffers() { + cudaFree(d_H); + cudaFree(d_probs); + cudaFree(d_checks); + cudaFree(d_errors); + } + + bool run(unsigned seed) { + return cudaq::qec::dem_sampler::gpu::sample_dem( + d_H, num_checks, num_errors, d_probs, num_shots, seed, d_checks, + d_errors); + } + + std::vector get_checks() { + std::vector out(num_shots * num_checks); + EXPECT_EQ( + cudaMemcpy(out.data(), d_checks, out.size(), cudaMemcpyDeviceToHost), + cudaSuccess); + return out; + } + + std::vector get_errors() { + std::vector out(num_shots * num_errors); + EXPECT_EQ( + cudaMemcpy(out.data(), d_errors, out.size(), cudaMemcpyDeviceToHost), + cudaSuccess); + return out; + } + + GpuBuffers(const GpuBuffers &) = delete; + GpuBuffers &operator=(const GpuBuffers &) = delete; + }; +}; + +TEST_F(DemSamplingGPU, AllZeroProbabilities) { + const size_t num_checks = 3, num_errors = 5, num_shots = 10; + std::vector H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}; + std::vector probs(num_errors, 0.0); + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(42)); + + for (auto v : buf.get_checks()) + EXPECT_EQ(v, 0); + for (auto v : buf.get_errors()) + EXPECT_EQ(v, 0); 
+} + +TEST_F(DemSamplingGPU, AllOneProbabilities) { + // H = | 1 0 1 0 | probs = [1,1,1,1] + // | 1 1 0 1 | + // | 0 1 1 0 | + // Expected syndromes: [0, 1, 0] (same as CPU test). + const size_t num_checks = 3, num_errors = 4, num_shots = 5; + std::vector H = {1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0}; + std::vector probs(num_errors, 1.0); + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(42)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + for (size_t e = 0; e < num_errors; e++) + EXPECT_EQ(h_errors[shot * num_errors + e], 1); + EXPECT_EQ(h_checks[shot * num_checks + 0], 0); + EXPECT_EQ(h_checks[shot * num_checks + 1], 1); + EXPECT_EQ(h_checks[shot * num_checks + 2], 0); + } +} + +TEST_F(DemSamplingGPU, MixedDeterministicProbs) { + // H = | 1 0 1 | probs = [1, 0, 1] + // | 0 1 1 | + // Expected: errors = [1,0,1], checks = [0, 1] + const size_t num_checks = 2, num_errors = 3, num_shots = 8; + std::vector H = {1, 0, 1, 0, 1, 1}; + std::vector probs = {1.0, 0.0, 1.0}; + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(99)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + EXPECT_EQ(h_errors[shot * num_errors + 0], 1); + EXPECT_EQ(h_errors[shot * num_errors + 1], 0); + EXPECT_EQ(h_errors[shot * num_errors + 2], 1); + EXPECT_EQ(h_checks[shot * num_checks + 0], 0); + EXPECT_EQ(h_checks[shot * num_checks + 1], 1); + } +} + +TEST_F(DemSamplingGPU, IdentityMatrix) { + const size_t N = 5, num_shots = 3; + std::vector H(N * N, 0); + for (size_t i = 0; i < N; i++) + H[i * N + i] = 1; + std::vector probs(N, 1.0); + + GpuBuffers buf(H, probs, N, N, num_shots); + ASSERT_TRUE(buf.run(7)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) + for (size_t j = 0; j < N; j++) { + 
EXPECT_EQ(h_errors[shot * N + j], 1); + EXPECT_EQ(h_checks[shot * N + j], 1); + } +} + +TEST_F(DemSamplingGPU, AllOnesMatrixEvenColumns) { + // H = 3x4 all-ones, p=1. Even column count => all syndromes = 0. + const size_t num_checks = 3, num_errors = 4, num_shots = 4; + std::vector H(num_checks * num_errors, 1); + std::vector probs(num_errors, 1.0); + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(0)); + + for (auto v : buf.get_checks()) + EXPECT_EQ(v, 0) << "Even column count => syndrome 0"; +} + +TEST_F(DemSamplingGPU, AllOnesMatrixOddColumns) { + // H = 3x3 all-ones, p=1. Odd column count => all syndromes = 1. + const size_t N = 3, num_shots = 4; + std::vector H(N * N, 1); + std::vector probs(N, 1.0); + + GpuBuffers buf(H, probs, N, N, num_shots); + ASSERT_TRUE(buf.run(0)); + + for (auto v : buf.get_checks()) + EXPECT_EQ(v, 1) << "Odd column count => syndrome 1"; +} + +TEST_F(DemSamplingGPU, RepetitionCodeParity) { + // H = [[1,1,0,0],[0,1,1,0],[0,0,1,1]], single error on qubit 1. + const size_t num_checks = 3, num_errors = 4, num_shots = 3; + std::vector H = {1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1}; + std::vector probs = {0.0, 1.0, 0.0, 0.0}; + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(0)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + EXPECT_EQ(h_errors[shot * num_errors + 0], 0); + EXPECT_EQ(h_errors[shot * num_errors + 1], 1); + EXPECT_EQ(h_errors[shot * num_errors + 2], 0); + EXPECT_EQ(h_errors[shot * num_errors + 3], 0); + EXPECT_EQ(h_checks[shot * num_checks + 0], 1); + EXPECT_EQ(h_checks[shot * num_checks + 1], 1); + EXPECT_EQ(h_checks[shot * num_checks + 2], 0); + } +} + +TEST_F(DemSamplingGPU, SyndromeConsistency) { + // Fixed H, verify syndrome = errors * H^T mod 2 for every shot. 
+ const size_t num_checks = 4, num_errors = 8, num_shots = 100; + std::vector H = {1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, + 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1}; + std::vector probs = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}; + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(42)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + std::vector shot_err(h_errors.begin() + shot * num_errors, + h_errors.begin() + (shot + 1) * num_errors); + auto expected = compute_syndrome(H, shot_err, num_checks, num_errors); + for (size_t c = 0; c < num_checks; c++) + EXPECT_EQ(h_checks[shot * num_checks + c], expected[c]) + << "Shot " << shot << " check " << c; + } +} + +TEST_F(DemSamplingGPU, SeedReproducibility) { + const size_t num_checks = 3, num_errors = 5, num_shots = 100; + std::vector H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}; + std::vector probs = {0.1, 0.3, 0.5, 0.7, 0.9}; + + GpuBuffers buf1(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf1.run(42)); + auto checks1 = buf1.get_checks(); + auto errors1 = buf1.get_errors(); + + GpuBuffers buf2(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf2.run(42)); + auto checks2 = buf2.get_checks(); + auto errors2 = buf2.get_errors(); + + EXPECT_EQ(checks1, checks2) << "Same seed must produce identical syndromes"; + EXPECT_EQ(errors1, errors2) << "Same seed must produce identical errors"; +} + +TEST_F(DemSamplingGPU, CpuGpuCrossValidation) { + // Use deterministic probs (0 and 1) so CPU and GPU must match exactly. 
+ const size_t num_checks = 3, num_errors = 6, num_shots = 20;
+ std::vector<uint8_t> H_data = {1, 0, 1, 0, 0, 1, 0, 1, 0,
+ 1, 0, 0, 1, 1, 0, 0, 1, 1};
+ std::vector<double> probs = {1.0, 0.0, 1.0, 0.0, 1.0, 0.0};
+
+ auto H_tensor = make_tensor(H_data, num_checks, num_errors);
+ auto [cpu_checks, cpu_errors] =
+ cudaq::qec::dem_sampler::cpu::sample_dem(H_tensor, num_shots, probs, 42);
+
+ GpuBuffers buf(H_data, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(42));
+ auto gpu_checks = buf.get_checks();
+ auto gpu_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ for (size_t e = 0; e < num_errors; e++)
+ EXPECT_EQ(gpu_errors[shot * num_errors + e], cpu_errors.at({shot, e}))
+ << "Shot " << shot << " error " << e;
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(gpu_checks[shot * num_checks + c], cpu_checks.at({shot, c}))
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, BinaryOutputOnly) {
+ const size_t num_checks = 3, num_errors = 5, num_shots = 200;
+ std::vector<uint8_t> H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1};
+ std::vector<double> probs = {0.2, 0.4, 0.6, 0.8, 0.5};
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(123));
+
+ for (auto v : buf.get_checks())
+ EXPECT_TRUE(v == 0 || v == 1)
+ << "Syndrome value must be 0 or 1, got " << (int)v;
+ for (auto v : buf.get_errors())
+ EXPECT_TRUE(v == 0 || v == 1)
+ << "Error value must be 0 or 1, got " << (int)v;
+}
+
+TEST_F(DemSamplingGPU, BitpackBoundary32Columns) {
+ // 32 columns lands exactly on a uint32 word boundary — no padding bits.
+ const size_t num_checks = 4, num_errors = 32, num_shots = 50;
+ std::vector<uint8_t> H(num_checks * num_errors, 0);
+ for (size_t i = 0; i < num_checks; i++)
+ for (size_t j = 0; j < num_errors; j++)
+ H[i * num_errors + j] = ((i + j) % 3 == 0) ?
1 : 0;
+ std::vector<double> probs(num_errors, 1.0);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(77));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t e = 0; e < num_errors; e++)
+ EXPECT_EQ(h_errors[e], 1) << "p=1 => all errors fire";
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, BitpackBoundary33Columns) {
+ // 33 columns = 1 word + 1 bit, tests partial-word handling.
+ const size_t num_checks = 3, num_errors = 33, num_shots = 50;
+ std::vector<uint8_t> H(num_checks * num_errors, 0);
+ for (size_t i = 0; i < num_checks; i++)
+ for (size_t j = 0; j < num_errors; j++)
+ H[i * num_errors + j] = ((i + j) % 2 == 0) ? 1 : 0;
+ std::vector<double> probs(num_errors, 1.0);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(88));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, BitpackBoundary64Columns) {
+ // 64 columns = exactly 2 uint32 words.
+ const size_t num_checks = 4, num_errors = 64, num_shots = 40;
+ std::vector<uint8_t> H(num_checks * num_errors, 0);
+ for (size_t i = 0; i < num_checks; i++)
+ for (size_t j = 0; j < num_errors; j++)
+ H[i * num_errors + j] = ((i * 7 + j * 3) % 5 == 0) ?
1 : 0;
+ std::vector<double> probs(num_errors, 1.0);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(99));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, LargeScaleSyndromeConsistency) {
+ // Larger problem: 20 checks, 100 errors, 1000 shots.
+ const size_t num_checks = 20, num_errors = 100, num_shots = 1000;
+
+ std::mt19937 gen(12345);
+ std::bernoulli_distribution dist(0.3);
+ std::vector<uint8_t> H(num_checks * num_errors);
+ for (auto &v : H)
+ v = dist(gen) ? 1 : 0;
+
+ std::uniform_real_distribution<double> pdist(0.01, 0.5);
+ std::vector<double> probs(num_errors);
+ for (auto &p : probs)
+ p = pdist(gen);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(42));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, ZeroShots) {
+ const size_t num_checks = 3, num_errors = 5;
+ std::vector<uint8_t> H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1};
+ std::vector<double> probs(num_errors, 0.5);
+
+ // num_shots = 0 should return true immediately with no work
+ ASSERT_TRUE(cudaq::qec::dem_sampler::gpu::sample_dem(
+ nullptr, num_checks, num_errors, nullptr, 0, 42, nullptr, nullptr));
+}
+
+TEST_F(DemSamplingGPU, NonBinaryCheckMatrixCpuGpuMatch) {
+ // H with entries > 1: both paths should agree after binarization.
+ const size_t num_checks = 2, num_errors = 2, num_shots = 4;
+ std::vector<uint8_t> H_data = {2, 3, 1, 0};
+ std::vector<double> probs = {1.0, 1.0};
+
+ auto H_tensor = make_tensor(H_data, num_checks, num_errors);
+ auto [cpu_checks, cpu_errors] =
+ cudaq::qec::dem_sampler::cpu::sample_dem(H_tensor, num_shots, probs, 0);
+
+ GpuBuffers buf(H_data, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(0));
+ auto gpu_checks = buf.get_checks();
+ auto gpu_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ for (size_t e = 0; e < num_errors; e++)
+ EXPECT_EQ(gpu_errors[shot * num_errors + e], cpu_errors.at({shot, e}))
+ << "Shot " << shot << " error " << e;
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(gpu_checks[shot * num_checks + c], cpu_checks.at({shot, c}))
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+#endif // CUDAQ_QEC_HAS_CUSTABILIZER