diff --git a/.github/actions/build-lib/action.yaml b/.github/actions/build-lib/action.yaml index 67dd960b..a6ec4f01 100644 --- a/.github/actions/build-lib/action.yaml +++ b/.github/actions/build-lib/action.yaml @@ -23,6 +23,14 @@ inputs: description: 'Install prefix' default: '~/.cudaqx' required: false + require-custabilizer: + description: 'Enable cuStabilizer GPU DEM sampling (ON or OFF)' + default: 'ON' + required: false + require-custabilizer-gpu-torch: + description: 'Install PyTorch for GPU Torch tensor path (ON or OFF, requires custabilizer ON)' + default: 'ON' + required: false outputs: build-dir: description: 'Build dir.' @@ -66,6 +74,8 @@ runs: env: CCACHE_DIR: /ccache-${{ inputs.lib }} GH_TOKEN: ${{ github.token }} # REVERT-WITH-CUDAQ-REALTIME-BUILD + REQUIRE_CUSTABILIZER: ${{ inputs.require-custabilizer }} + REQUIRE_CUSTABILIZER_GPU_TORCH: ${{ inputs.require-custabilizer-gpu-torch }} run: | build_dir=build_${{ inputs.lib }} .github/actions/build-lib/build_${{ inputs.lib }}.sh $build_dir ${{ inputs.install-prefix }} diff --git a/.github/actions/build-lib/build_all.sh b/.github/actions/build-lib/build_all.sh index 03fc901d..17e5022d 100755 --- a/.github/actions/build-lib/build_all.sh +++ b/.github/actions/build-lib/build_all.sh @@ -18,6 +18,87 @@ if [ -z "$CUDAQ_REALTIME_ROOT" ]; then cd "$_build_cwd" fi +REQUIRE_CUSTABILIZER=${REQUIRE_CUSTABILIZER:-ON} +REQUIRE_CUSTABILIZER_GPU_TORCH=${REQUIRE_CUSTABILIZER_GPU_TORCH:-ON} + +if [ "$REQUIRE_CUSTABILIZER" = "OFF" ]; then + REQUIRE_CUSTABILIZER_GPU_TORCH="OFF" +fi + +if [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -z "$CUSTABILIZER_ROOT" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + CUDA_MAJOR="" + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + CUSTAB_PIP="custabilizer-cu${CUDA_MAJOR:-12}" + CUQPY_PIP="cuquantum-python-cu${CUDA_MAJOR:-12}" + pip install --upgrade 
"${CUSTAB_PIP}>=0.3.0" "${CUQPY_PIP}>=26.3.0" +fi + +if [ "$REQUIRE_CUSTABILIZER_GPU_TORCH" = "ON" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + cuda_version=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p' | head -n 1) + cuda_no_dot=$(echo "$cuda_version" | tr -d '.') + pip install torch --index-url "https://download.pytorch.org/whl/cu${cuda_no_dot}" || true + fi +fi + +if [ -z "$CUSTABILIZER_ROOT" ] && [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -x "$(command -v python3)" ]; then + if [ -z "$CUSTABILIZER_PIP_PACKAGE" ]; then + CUDA_MAJOR="" + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + if [ -n "$CUDA_MAJOR" ]; then + CUSTABILIZER_PIP_PACKAGE="custabilizer-cu${CUDA_MAJOR}" + else + for candidate in custabilizer-cu13 custabilizer-cu12; do + if python3 -m pip show "$candidate" >/dev/null 2>&1; then + CUSTABILIZER_PIP_PACKAGE="$candidate" + break + fi + done + fi + fi + + if [ -n "$CUSTABILIZER_PIP_PACKAGE" ] && \ + python3 -m pip show "$CUSTABILIZER_PIP_PACKAGE" >/dev/null 2>&1; then + CUSTABILIZER_ROOT=$(python3 - "$CUSTABILIZER_PIP_PACKAGE" <<'PY' +import pathlib, subprocess, sys +package = sys.argv[1] +show = subprocess.check_output([sys.executable, "-m", "pip", "show", package], text=True) +location = "" +for line in show.splitlines(): + if line.startswith("Location:"): + location = line.split(":", 1)[1].strip(); break +if not location: + print(""); raise SystemExit(0) +base = pathlib.Path(location) +for root in [base / "custabilizer", base / "cuquantum", base]: + h = root / "include" / "custabilizer.h" + lib = root / "lib" / "libcustabilizer.so.0" + lib64 = root / "lib64" / "libcustabilizer.so.0" + if h.exists() and (lib.exists() or lib64.exists()): + print(root); raise SystemExit(0) +print("") +PY +) + fi 
+fi + +CUSTABILIZER_CMAKE_ARG="" +if [ -n "$CUSTABILIZER_ROOT" ]; then + echo "Resolved cuStabilizer root: $CUSTABILIZER_ROOT" + CUSTABILIZER_CMAKE_ARG="-DCUSTABILIZER_ROOT=$CUSTABILIZER_ROOT" + export LD_LIBRARY_PATH="$CUSTABILIZER_ROOT/lib:$CUSTABILIZER_ROOT/lib64:${LD_LIBRARY_PATH:-}" + export CPATH="$CUSTABILIZER_ROOT/include:${CPATH:-}" +else + echo "Unable to resolve CUSTABILIZER_ROOT from pip wheel." +fi + cmake -S . -B "$1" \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=gcc-11 \ @@ -25,6 +106,8 @@ cmake -S . -B "$1" \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCUDAQ_DIR=/cudaq-install/lib/cmake/cudaq/ \ + ${CUSTABILIZER_CMAKE_ARG:+$CUSTABILIZER_CMAKE_ARG} \ + -DCUDAQ_QEC_REQUIRE_CUSTABILIZER=$REQUIRE_CUSTABILIZER \ -DCUDAQX_ENABLE_LIBS="all" \ -DCUDAQX_INCLUDE_TESTS=ON \ -DCUDAQX_BINDINGS_PYTHON=ON \ diff --git a/.github/actions/build-lib/build_qec.sh b/.github/actions/build-lib/build_qec.sh index 9e74a59c..b34b2b8e 100755 --- a/.github/actions/build-lib/build_qec.sh +++ b/.github/actions/build-lib/build_qec.sh @@ -19,6 +19,106 @@ if [ -z "$CUDAQ_REALTIME_ROOT" ]; then cd "$_build_cwd" fi +REQUIRE_CUSTABILIZER=${REQUIRE_CUSTABILIZER:-ON} +REQUIRE_CUSTABILIZER_GPU_TORCH=${REQUIRE_CUSTABILIZER_GPU_TORCH:-ON} + +# Enforce: cuStabilizer OFF forces GPU Torch OFF +if [ "$REQUIRE_CUSTABILIZER" = "OFF" ]; then + REQUIRE_CUSTABILIZER_GPU_TORCH="OFF" +fi + +# Install cuStabilizer dependencies if required +if [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -z "$CUSTABILIZER_ROOT" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + CUDA_MAJOR="" + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + CUSTAB_PIP="custabilizer-cu${CUDA_MAJOR:-12}" + CUQPY_PIP="cuquantum-python-cu${CUDA_MAJOR:-12}" + pip install --upgrade "${CUSTAB_PIP}>=0.3.0" "${CUQPY_PIP}>=26.3.0" +fi + +# Install Torch if GPU Torch path is 
required +if [ "$REQUIRE_CUSTABILIZER_GPU_TORCH" = "ON" ] && [ -x "$(command -v python3)" ]; then + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + cuda_version=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p' | head -n 1) + cuda_no_dot=$(echo "$cuda_version" | tr -d '.') + pip install torch --index-url "https://download.pytorch.org/whl/cu${cuda_no_dot}" || true + fi +fi + +if [ -z "$CUSTABILIZER_ROOT" ] && [ "$REQUIRE_CUSTABILIZER" = "ON" ] && [ -x "$(command -v python3)" ]; then + if [ -z "$CUSTABILIZER_PIP_PACKAGE" ]; then + CUDA_MAJOR="" + NVCC_BIN=${CUDACXX:-$(command -v nvcc)} + if [ -n "$NVCC_BIN" ] && [ -x "$NVCC_BIN" ]; then + CUDA_MAJOR=$("$NVCC_BIN" --version | sed -nE 's/.*release ([0-9]+)\..*/\1/p' | head -n 1) + fi + + if [ -n "$CUDA_MAJOR" ]; then + CUSTABILIZER_PIP_PACKAGE="custabilizer-cu${CUDA_MAJOR}" + else + for candidate in custabilizer-cu13 custabilizer-cu12; do + if python3 -m pip show "$candidate" >/dev/null 2>&1; then + CUSTABILIZER_PIP_PACKAGE="$candidate" + break + fi + done + fi + fi + + if [ -n "$CUSTABILIZER_PIP_PACKAGE" ] && \ + python3 -m pip show "$CUSTABILIZER_PIP_PACKAGE" >/dev/null 2>&1; then + CUSTABILIZER_ROOT=$(python3 - "$CUSTABILIZER_PIP_PACKAGE" <<'PY' +import pathlib +import subprocess +import sys + +package = sys.argv[1] +show = subprocess.check_output( + [sys.executable, "-m", "pip", "show", package], text=True +) +location = "" +for line in show.splitlines(): + if line.startswith("Location:"): + location = line.split(":", 1)[1].strip() + break + +if not location: + print("") + raise SystemExit(0) + +base = pathlib.Path(location) +candidates = [base / "custabilizer", base / "cuquantum", base] +for root in candidates: + include = root / "include" / "custabilizer.h" + if not include.exists(): + continue + lib = root / "lib" / "libcustabilizer.so.0" + lib64 = root / "lib64" / "libcustabilizer.so.0" + if lib.exists() or lib64.exists(): + print(root) + raise 
SystemExit(0)
+
+print("")
+PY
+)
+  fi
+fi
+
+CUSTABILIZER_CMAKE_ARG=""
+if [ -n "$CUSTABILIZER_ROOT" ]; then
+  echo "Resolved cuStabilizer root: $CUSTABILIZER_ROOT"
+  CUSTABILIZER_CMAKE_ARG="-DCUSTABILIZER_ROOT=$CUSTABILIZER_ROOT"
+  # Ensure the resolved path wins over any stale container paths.
+  export LD_LIBRARY_PATH="$CUSTABILIZER_ROOT/lib:$CUSTABILIZER_ROOT/lib64:${LD_LIBRARY_PATH:-}"
+  export CPATH="$CUSTABILIZER_ROOT/include:${CPATH:-}"
+else
+  echo "Unable to resolve CUSTABILIZER_ROOT from pip wheel."
+fi
+
 cmake -S libs/qec -B "$1" \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_C_COMPILER=gcc-11 \
@@ -26,8 +126,10 @@ cmake -S libs/qec -B "$1" \
   -DCMAKE_C_COMPILER_LAUNCHER=ccache \
   -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
   -DCUDAQ_DIR=/cudaq-install/lib/cmake/cudaq/ \
+  ${CUSTABILIZER_CMAKE_ARG:+$CUSTABILIZER_CMAKE_ARG} \
   -DCUDAQX_INCLUDE_TESTS=ON \
   -DCUDAQX_BINDINGS_PYTHON=ON \
+  -DCUDAQ_QEC_REQUIRE_CUSTABILIZER=$REQUIRE_CUSTABILIZER \
   -DCMAKE_INSTALL_PREFIX="$2" \
   -DCUDAQ_REALTIME_ROOT=$CUDAQ_REALTIME_ROOT
diff --git a/.github/workflows/all_libs.yaml b/.github/workflows/all_libs.yaml
index d9a3bfcb..4f8feb10 100644
--- a/.github/workflows/all_libs.yaml
+++ b/.github/workflows/all_libs.yaml
@@ -65,6 +65,7 @@ jobs:
       - name: Install build requirements
         run: |
           apt install -y --no-install-recommends gfortran libblas-dev wget
+          pip install --upgrade "cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0" "custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0"

       - name: Install TensorRT (amd64)
         if: matrix.platform == 'amd64'
@@ -108,10 +109,10 @@ jobs:
          LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }}
        shell: bash
        run: |
-          # Install the correct torch first.
-          cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//')
-          pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot}
-          pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==26.01.0
+          pip install --upgrade numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas "cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0" "custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0"
+          echo "=== cuStabilizer verification ==="
+          pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version
+          find / -name "libcustabilizer.so*" 2>/dev/null || true
           # The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py.
if [ "$(uname -m)" == "x86_64" ]; then # Stim is not currently available on manylinux ARM wheels, so only @@ -119,9 +120,22 @@ jobs: pip install stim beliefmatching fi + - name: Set cuStabilizer runtime paths + run: | + CUSTAB_ROOT=$(python3 -c " + import subprocess, pathlib, sys + pkg = 'custabilizer-cu' + '${{ steps.config.outputs.cuda_major }}' + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', pkg], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'lib' / 'libcustabilizer.so.0').exists() else '') + ") + if [ -n "$CUSTAB_ROOT" ]; then + echo "LD_LIBRARY_PATH=$CUSTAB_ROOT/lib:$CUSTAB_ROOT/lib64:${{ env.MPI_PATH }}/lib:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + fi + - name: Run Python tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: cmake --build ${{ steps.build.outputs.build-dir }} --target run_python_tests @@ -131,6 +145,5 @@ jobs: - name: Run example tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: bash scripts/ci/test_examples.sh all diff --git a/.github/workflows/all_libs_release.yaml b/.github/workflows/all_libs_release.yaml index 6df0a3fe..64d15c1a 100644 --- a/.github/workflows/all_libs_release.yaml +++ b/.github/workflows/all_libs_release.yaml @@ -41,8 +41,12 @@ jobs: permissions: write-all steps: - name: Install dependencies + id: deps run: | apt update && apt install -y --no-install-recommends zip unzip patchelf git-lfs + cuda_major=$(echo ${{ matrix.cuda_version }} | cut -d . 
-f1) + echo "cuda_major=$cuda_major" >> $GITHUB_OUTPUT + pip install --upgrade cuquantum-python-cu${cuda_major}>=26.3.0 custabilizer-cu${cuda_major}>=0.3.0 - name: Checkout repository uses: actions/checkout@v4 @@ -130,10 +134,10 @@ jobs: LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} shell: bash run: | - # Install the correct torch first. - cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//') - pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot} - pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==26.01.0 + pip install --upgrade numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} lightning ml_collections mpi4py transformers quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0 custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version + find / -name "libcustabilizer.so*" 2>/dev/null || true # The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py. 
if [ "$(uname -m)" == "x86_64" ]; then # Stim is not currently available on manylinux ARM wheels, so only @@ -141,13 +145,25 @@ jobs: pip install stim beliefmatching fi + - name: Set cuStabilizer runtime paths + run: | + CUSTAB_ROOT=$(python3 -c " + import subprocess, pathlib, sys + pkg = 'custabilizer-cu' + '${{ steps.config.outputs.cuda_major }}' + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', pkg], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'lib' / 'libcustabilizer.so.0').exists() else '') + ") + if [ -n "$CUSTAB_ROOT" ]; then + echo "LD_LIBRARY_PATH=$CUSTAB_ROOT/lib:$CUSTAB_ROOT/lib64:${{ env.MPI_PATH }}/lib:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + fi + - name: Run Python tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: | if [[ -n "${{ env.QEC_EXTERNAL_DECODERS }}" ]]; then - # Verify that external decoder is available if applicable export PYTHONPATH="/usr/local/cudaq:$HOME/.cudaqx" python3 -c "import cudaq_qec as qec; print(qec.__version__); d = qec.get_decoder('nv-qldpc-decoder', qec.get_code('steane').get_parity()); print(d.get_version())" fi @@ -160,7 +176,6 @@ jobs: - name: Run example tests env: - LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }} OMPI_MCA_pml: ob1 run: | ln -s /usr/local/cudaq /cudaq-install diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 03aec19e..7459d782 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -60,12 +60,24 @@ jobs: apt install -y --no-install-recommends \ gfortran libblas-dev doxygen - python3 -m pip install IPython breathe enum_tools myst_parser nbsphinx \ + python3 -m pip install --break-system-packages --upgrade IPython breathe enum_tools myst_parser nbsphinx \ sphinx_copybutton sphinx_inline_tabs sphinx_gallery sphinx_rtd_theme \ - 
sphinx_reredirects sphinx_toolbox cupy-cuda12x cuquantum-python-cu12 + sphinx_reredirects "sphinx-autodoc-typehints<3.8" sphinx_toolbox cupy-cuda12x cuquantum-python-cu12>=26.3.0 custabilizer-cu12>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu12 | grep Version + find / -name "libcustabilizer.so*" 2>/dev/null || true - name: Build docs run: | + # Resolve cuStabilizer root from the pip wheel. + CUSTABILIZER_ROOT=$(python3 -c " + import subprocess, pathlib, sys + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', 'custabilizer-cu12'], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'include' / 'custabilizer.h').exists() else '') + ") + # Note: TRT decoder is disabled because all docs for the TRT decoder # are generated independently of the source code (no Doxygen # documentation). @@ -74,6 +86,7 @@ jobs: -DCMAKE_C_COMPILER=gcc-11 \ -DCMAKE_CXX_COMPILER=g++-11 \ -DCUDAQ_DIR=/cudaq-install/lib/cmake/cudaq/ \ + ${CUSTABILIZER_ROOT:+-DCUSTABILIZER_ROOT=$CUSTABILIZER_ROOT} \ -DCUDAQX_ENABLE_LIBS="all" \ -DCUDAQX_INCLUDE_DOCS=ON \ -DCUDAQX_BINDINGS_PYTHON=ON \ diff --git a/.github/workflows/lib_qec.yaml b/.github/workflows/lib_qec.yaml index cd9ac571..aed1fb4b 100644 --- a/.github/workflows/lib_qec.yaml +++ b/.github/workflows/lib_qec.yaml @@ -12,6 +12,8 @@ jobs: matrix: platform: ['amd64', 'arm64'] cuda_version: ['12.6', '13.0'] + custabilizer: ['ON'] + custabilizer_gpu_torch: ['ON'] runs-on: ${{ startsWith(github.repository, 'NVIDIA/cudaqx') && format('linux-{0}-cpu8', matrix.platform) || 'ubuntu-latest' }} container: ghcr.io/nvidia/cuda-quantum-devcontainer:${{ matrix.platform }}-cu${{ matrix.cuda_version }}-gcc11-main permissions: @@ -88,11 +90,27 @@ jobs: pr-number: ${{ steps.export-pr-info.outputs.pr_number }} save-ccache: true platform: ${{ matrix.platform }} + require-custabilizer: ${{ 
matrix.custabilizer }} + require-custabilizer-gpu-torch: ${{ matrix.custabilizer_gpu_torch }} # ======================================================================== # Run tests # ======================================================================== - + + - name: Set cuStabilizer build-time paths + run: | + CUSTAB_ROOT=$(python3 -c " + import subprocess, pathlib, sys + pkg = 'custabilizer-cu' + '${{ steps.config.outputs.cuda_major }}' + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', pkg], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'lib' / 'libcustabilizer.so.0').exists() else '') + ") + if [ -n "$CUSTAB_ROOT" ]; then + echo "LD_LIBRARY_PATH=$CUSTAB_ROOT/lib:$CUSTAB_ROOT/lib64:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + fi + - name: Run tests run: cmake --build ${{ steps.build.outputs.build-dir }} --target run_tests @@ -103,10 +121,10 @@ jobs: - name: Install python requirements shell: bash run: | - # Install the correct torch first. 
- cuda_no_dot=$(echo ${{ matrix.cuda_version }} | sed 's/\.//') - pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/cu${cuda_no_dot} - pip install numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==26.01.0 + pip install --upgrade numpy pytest onnxscript cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0 custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version + find / -name "libcustabilizer.so*" 2>/dev/null || true # The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py. if [ "$(uname -m)" == "x86_64" ]; then # Stim is not currently available on manylinux ARM wheels, so only @@ -173,8 +191,29 @@ jobs: run: | tar xzf /tmp/qec-build-artifacts.tar.gz -C / + - name: Configure + id: config + run: | + cuda_major=$(echo ${{ matrix.cuda_version }} | cut -d . 
-f1) + echo "cuda_major=$cuda_major" >> $GITHUB_OUTPUT + + - name: Install cuStabilizer runtime + run: | + pip install --upgrade cuquantum-python-cu${{ steps.config.outputs.cuda_major }}>=26.3.0 custabilizer-cu${{ steps.config.outputs.cuda_major }}>=0.3.0 + echo "=== cuStabilizer verification ===" + pip show custabilizer-cu${{ steps.config.outputs.cuda_major }} | grep Version + CUSTABILIZER_ROOT=$(python3 -c " + import subprocess, pathlib, sys + show = subprocess.check_output([sys.executable, '-m', 'pip', 'show', 'custabilizer-cu${{ steps.config.outputs.cuda_major }}'], text=True) + loc = [l.split(':',1)[1].strip() for l in show.splitlines() if l.startswith('Location:')][0] + root = pathlib.Path(loc) / 'cuquantum' + print(root if (root / 'include' / 'custabilizer.h').exists() else '') + ") + echo "CUSTABILIZER_ROOT=$CUSTABILIZER_ROOT" + echo "LD_LIBRARY_PATH=$CUSTABILIZER_ROOT/lib:$CUSTABILIZER_ROOT/lib64:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV + - name: Run GPU tests run: | cd $GITHUB_WORKSPACE/build_qec - ctest --output-on-failure -R "GpuKernelsTest|test_realtime_decoding" + ctest --output-on-failure -R "GpuKernelsTest|test_realtime_decoding|DemSamplingGPU" diff --git a/cmake/Modules/FindcuStabilizer.cmake b/cmake/Modules/FindcuStabilizer.cmake new file mode 100644 index 00000000..b143db82 --- /dev/null +++ b/cmake/Modules/FindcuStabilizer.cmake @@ -0,0 +1,84 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # + +#[=======================================================================[.rst: +FindcuStabilizer +---------------- + +Find the cuStabilizer library (shipped as a pip wheel: custabilizer-cu12 / cu13). + +Imported Targets +^^^^^^^^^^^^^^^^ + +``cuStabilizer::cuStabilizer`` + The cuStabilizer library. + +Result Variables +^^^^^^^^^^^^^^^^ + +``cuStabilizer_FOUND`` +``cuStabilizer_INCLUDE_DIR`` +``cuStabilizer_LIBRARY`` + +Hints +^^^^^ + +``CUSTABILIZER_ROOT`` + Preferred search prefix (set by CI from ``pip show custabilizer-cuXX``). + +#]=======================================================================] + +find_path(cuStabilizer_INCLUDE_DIR + NAMES custabilizer.h + HINTS + ${CUSTABILIZER_ROOT}/include +) + +find_library(cuStabilizer_LIBRARY + NAMES custabilizer libcustabilizer.so.0 + HINTS + ${CUSTABILIZER_ROOT}/lib64 + ${CUSTABILIZER_ROOT}/lib +) + +set(cuStabilizer_VERSION "") +if(cuStabilizer_INCLUDE_DIR AND EXISTS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h") + file(STRINGS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h" + _custab_major_line REGEX "^#define[ \t]+CUSTABILIZER_MAJOR[ \t]+[0-9]+") + file(STRINGS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h" + _custab_minor_line REGEX "^#define[ \t]+CUSTABILIZER_MINOR[ \t]+[0-9]+") + file(STRINGS "${cuStabilizer_INCLUDE_DIR}/custabilizer.h" + _custab_patch_line REGEX "^#define[ \t]+CUSTABILIZER_PATCH[ \t]+[0-9]+") + if(_custab_major_line AND _custab_minor_line AND _custab_patch_line) + string(REGEX REPLACE "^#define[ \t]+CUSTABILIZER_MAJOR[ \t]+([0-9]+).*$" "\\1" + _custab_major "${_custab_major_line}") + string(REGEX REPLACE "^#define[ \t]+CUSTABILIZER_MINOR[ \t]+([0-9]+).*$" "\\1" + _custab_minor "${_custab_minor_line}") + string(REGEX REPLACE "^#define[ \t]+CUSTABILIZER_PATCH[ \t]+([0-9]+).*$" "\\1" + _custab_patch "${_custab_patch_line}") + set(cuStabilizer_VERSION "${_custab_major}.${_custab_minor}.${_custab_patch}") + 
endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(cuStabilizer + REQUIRED_VARS cuStabilizer_INCLUDE_DIR cuStabilizer_LIBRARY + VERSION_VAR cuStabilizer_VERSION +) + +if(cuStabilizer_FOUND AND NOT TARGET cuStabilizer::cuStabilizer) + add_library(cuStabilizer::cuStabilizer INTERFACE IMPORTED GLOBAL) + set_target_properties(cuStabilizer::cuStabilizer PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${cuStabilizer_INCLUDE_DIR}" + ) + if(cuStabilizer_LIBRARY) + set_target_properties(cuStabilizer::cuStabilizer PROPERTIES + INTERFACE_LINK_LIBRARIES "${cuStabilizer_LIBRARY}" + ) + endif() +endif() diff --git a/libs/core/include/cuda-qx/core/kwargs_utils.h b/libs/core/include/cuda-qx/core/kwargs_utils.h index ce50f3cc..1357f8d1 100644 --- a/libs/core/include/cuda-qx/core/kwargs_utils.h +++ b/libs/core/include/cuda-qx/core/kwargs_utils.h @@ -105,7 +105,7 @@ template tensor toTensor(const py::array_t &H, bool perform_pcm_checks = false) { py::buffer_info buf = H.request(); - if (buf.ndim >= 1 && buf.strides[0] == buf.itemsize) { + if (buf.ndim >= 2 && buf.strides[0] < buf.strides[buf.ndim - 1]) { throw std::runtime_error("toTensor: data must be in row-major order, but " "column-major order was detected."); } diff --git a/libs/qec/CMakeLists.txt b/libs/qec/CMakeLists.txt index 6ceafa9a..7b8ac405 100644 --- a/libs/qec/CMakeLists.txt +++ b/libs/qec/CMakeLists.txt @@ -61,6 +61,11 @@ option(CUDAQX_QEC_INSTALL_PYTHON "Install python files alongside the library." ${CUDAQX_INSTALL_PYTHON}) +# Require cuStabilizer + CUDA (with sparse DEM APIs) for QEC builds. +option(CUDAQ_QEC_REQUIRE_CUSTABILIZER + "Fail configuration when cuStabilizer/CUDA or required DEM APIs are unavailable." 
+ ON) + # Option to control TRT decoder build (default: ON) option(CUDAQ_QEC_BUILD_TRT_DECODER "Build the TensorRT decoder plugin" ON) diff --git a/libs/qec/include/cudaq/qec/dem_sampling.h b/libs/qec/include/cudaq/qec/dem_sampling.h new file mode 100644 index 00000000..a37499df --- /dev/null +++ b/libs/qec/include/cudaq/qec/dem_sampling.h @@ -0,0 +1,71 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#pragma once + +#include "cuda-qx/core/tensor.h" +#include +#include +#include +#include + +namespace cudaq::qec::dem_sampler { + +/// CPU implementation of DEM sampling. This is the existing implementation +/// that uses std::bernoulli_distribution and tensor dot product. +namespace cpu { + +/// @brief Sample measurements from a check matrix (CPU, per-mechanism probs) +/// @param check_matrix Binary matrix [num_checks × num_error_mechanisms] +/// @param numShots Number of measurement shots +/// @param error_probabilities Per-error-mechanism probabilities +/// @return (checks [numShots × num_checks], errors [numShots × num_mechanisms]) +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities); + +/// @brief Sample measurements from a check matrix with seed (CPU) +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities, unsigned seed); + +} // namespace cpu + +/// GPU implementation of DEM sampling via cuStabilizer C API. +/// +/// The caller provides device pointers. The function composes: +/// 1. 
pack_check_matrix_rowwise (dense uint8 → bitpacked uint32) +/// 2. custabilizerSampleProbArraySparseCompute (sparse Bernoulli sampling) +/// 3. custabilizerGF2SparseDenseMatrixMultiply (syndrome = errors * H^T) +/// 4. unpack_syndromes_gpu (bitpacked uint32 → uint8) +/// 5. csr_to_dense_fused (CSR errors → dense uint8) +/// +/// All GPU memory is allocated and freed internally per call. +namespace gpu { + +/// @brief GPU DEM sampling with device pointers +/// +/// @param d_check_matrix Device pointer [num_checks × num_error_mechanisms] +/// @param num_checks Number of checks (rows of H) +/// @param num_error_mechanisms Number of error mechanisms (columns of H) +/// @param d_error_probabilities Device pointer [num_error_mechanisms] +/// @param num_shots Number of samples +/// @param seed RNG seed +/// @param d_checks_out Device pointer [num_shots × num_checks] (OUTPUT) +/// @param d_errors_out Device pointer [num_shots × num_error_mechanisms] (OUT) +/// @param stream_handle Optional CUDA stream handle (uintptr_t cast), 0 for +/// default stream +/// @return true on success, false if cuStabilizer is unavailable +bool sample_dem(const uint8_t *d_check_matrix, size_t num_checks, + size_t num_error_mechanisms, + const double *d_error_probabilities, size_t num_shots, + unsigned seed, uint8_t *d_checks_out, uint8_t *d_errors_out, + std::uintptr_t stream_handle = 0); + +} // namespace gpu + +} // namespace cudaq::qec::dem_sampler diff --git a/libs/qec/include/cudaq/qec/experiments.h b/libs/qec/include/cudaq/qec/experiments.h index 74ba476f..5cb0b3fe 100644 --- a/libs/qec/include/cudaq/qec/experiments.h +++ b/libs/qec/include/cudaq/qec/experiments.h @@ -9,6 +9,8 @@ #include "cudaq/qec/code.h" #include "cudaq/qec/detector_error_model.h" +#include +#include namespace cudaq::qec { @@ -61,6 +63,25 @@ std::tuple, cudaqx::tensor> sample_code_capacity(const code &code, std::size_t numShots, double error_probability); +/// @brief Sample a detector error model on CPU (legacy API). 
+/// @param check_matrix Binary matrix [num_checks x num_error_mechanisms] +/// @param numShots Number of independent Monte-Carlo shots +/// @param error_probabilities Per-error-mechanism Bernoulli probabilities +/// @return Tuple of (checks, errors) +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities); + +/// @brief Sample a detector error model on CPU (legacy API, seeded). +/// @param check_matrix Binary matrix [num_checks x num_error_mechanisms] +/// @param numShots Number of independent Monte-Carlo shots +/// @param error_probabilities Per-error-mechanism Bernoulli probabilities +/// @param seed RNG seed for reproducibility +/// @return Tuple of (checks, errors) +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities, unsigned seed); + /// @brief Sample syndrome measurements with circuit-level noise /// @param statePrep Initial state preparation operation /// @param numShots Number of measurement shots diff --git a/libs/qec/lib/CMakeLists.txt b/libs/qec/lib/CMakeLists.txt index 86b4e5dd..b81441cd 100644 --- a/libs/qec/lib/CMakeLists.txt +++ b/libs/qec/lib/CMakeLists.txt @@ -10,8 +10,93 @@ set(LIBRARY_NAME cudaq-qec) add_compile_options(-Wno-attributes) -# FIXME?: This must be a shared library. Trying to build a static one will fail. -add_library(${LIBRARY_NAME} SHARED +# cuStabilizer GPU DEM sampling support (required by default). +find_package(cuStabilizer QUIET) +set(_cuStabilizer_location + "include='${cuStabilizer_INCLUDE_DIR}', library='${cuStabilizer_LIBRARY}'") +if(cuStabilizer_VERSION) + string(APPEND _cuStabilizer_location ", version='${cuStabilizer_VERSION}'") +endif() + +# Force a fresh probe each configure invocation. 
+unset(CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS CACHE) +unset(CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS) +if(cuStabilizer_FOUND AND CMAKE_CUDA_COMPILER) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_INCLUDES "${cuStabilizer_INCLUDE_DIR}") + if(CUDAToolkit_INCLUDE_DIRS) + list(APPEND CMAKE_REQUIRED_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) + endif() + # check_cxx_source_compiles links executables by default. Probe using a + # static library target so we only validate header/API availability. + set(_cudaqx_prev_try_compile_target_type "${CMAKE_TRY_COMPILE_TARGET_TYPE}") + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + check_cxx_source_compiles( + " + #include + int main() { + (void)CUSTABILIZER_STATUS_INSUFFICIENT_SPARSE_STORAGE; + using prepare_t = decltype(&custabilizerSampleProbArraySparsePrepare); + using compute_t = decltype(&custabilizerSampleProbArraySparseCompute); + using spmm_t = decltype(&custabilizerGF2SparseDenseMatrixMultiply); + (void)sizeof(prepare_t); + (void)sizeof(compute_t); + (void)sizeof(spmm_t); + return 0; + } + " + CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS + ) + if(DEFINED _cudaqx_prev_try_compile_target_type AND + NOT _cudaqx_prev_try_compile_target_type STREQUAL "") + set(CMAKE_TRY_COMPILE_TARGET_TYPE "${_cudaqx_prev_try_compile_target_type}") + else() + unset(CMAKE_TRY_COMPILE_TARGET_TYPE) + endif() + unset(_cudaqx_prev_try_compile_target_type) + unset(CMAKE_REQUIRED_INCLUDES) +endif() + +if(NOT DEFINED CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS) + set(CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS FALSE) +endif() + +if(cuStabilizer_FOUND AND CMAKE_CUDA_COMPILER AND + CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS) + message(STATUS + "cuStabilizer found (${_cuStabilizer_location}) - GPU DEM sampling enabled") + set(CUDAQ_QEC_HAS_CUSTABILIZER TRUE) + set(CUDAQ_QEC_HAS_CUSTABILIZER TRUE PARENT_SCOPE) +elseif(cuStabilizer_FOUND AND CMAKE_CUDA_COMPILER) + message(WARNING + "cuStabilizer found (${_cuStabilizer_location}), but sparse DEM 
APIs are " + "missing. GPU DEM sampling disabled. Required APIs: " + "custabilizerSampleProbArraySparsePrepare, " + "custabilizerSampleProbArraySparseCompute, " + "custabilizerGF2SparseDenseMatrixMultiply." + ) + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE) + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE PARENT_SCOPE) +else() + message(STATUS "cuStabilizer not found or CUDA unavailable – GPU DEM sampling disabled") + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE) + set(CUDAQ_QEC_HAS_CUSTABILIZER FALSE PARENT_SCOPE) +endif() + +if(CUDAQ_QEC_REQUIRE_CUSTABILIZER AND NOT CUDAQ_QEC_HAS_CUSTABILIZER) + message(FATAL_ERROR + "CUDAQ_QEC_REQUIRE_CUSTABILIZER is ON, but required cuStabilizer/CUDA " + "support was not detected.\n" + "Detected: cuStabilizer_FOUND='${cuStabilizer_FOUND}', " + "${_cuStabilizer_location}, " + "cuda_compiler='${CMAKE_CUDA_COMPILER}', " + "sparse_dem_apis='${CUDAQ_QEC_CUSTABILIZER_HAS_SPARSE_DEM_APIS}'.\n" + "Required APIs: custabilizerSampleProbArraySparsePrepare, " + "custabilizerSampleProbArraySparseCompute, " + "custabilizerGF2SparseDenseMatrixMultiply.") +endif() + +set(QEC_SOURCES code.cpp decoder.cpp detector_error_model.cpp @@ -24,6 +109,16 @@ add_library(${LIBRARY_NAME} SHARED version.cpp ) +if(CUDAQ_QEC_HAS_CUSTABILIZER) + list(APPEND QEC_SOURCES + dem_sampling/dem_sampling_utils.cu + dem_sampling/dem_sampling_gpu.cpp + ) +endif() + +# FIXME?: This must be a shared library. Trying to build a static one will fail. 
+add_library(${LIBRARY_NAME} SHARED ${QEC_SOURCES}) + add_subdirectory(decoders/plugins/example) add_subdirectory(decoders/plugins/pymatching) @@ -38,7 +133,10 @@ if (CUDAQX_QEC_USE_FLOAT) target_compile_definitions(${LIBRARY_NAME} PUBLIC -DCUDAQX_QEC_FLOAT_TYPE=float) endif() + + target_include_directories(${LIBRARY_NAME} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} PUBLIC $ $ @@ -57,6 +155,18 @@ target_link_libraries(${LIBRARY_NAME} cudaq::cudaq-common ) +if(CUDAQ_QEC_HAS_CUSTABILIZER) + target_compile_definitions(${LIBRARY_NAME} PUBLIC CUDAQ_QEC_HAS_CUSTABILIZER) + target_link_libraries(${LIBRARY_NAME} PRIVATE + cuStabilizer::cuStabilizer + CUDA::cudart + ) + set_source_files_properties( + dem_sampling/dem_sampling_utils.cu + PROPERTIES LANGUAGE CUDA + ) +endif() + set_target_properties(${LIBRARY_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/libs/qec/lib/dem_sampling/dem_sampling_gpu.cpp b/libs/qec/lib/dem_sampling/dem_sampling_gpu.cpp new file mode 100644 index 00000000..e173a1ae --- /dev/null +++ b/libs/qec/lib/dem_sampling/dem_sampling_gpu.cpp @@ -0,0 +1,193 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "cudaq/qec/dem_sampling.h" + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER +#include "dem_sampling_utils.h" +#include +#include +#include +#include +#include + +namespace { + +void cuda_check(cudaError_t err, const char *msg) { + if (err != cudaSuccess) + throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(err)); +} + +void custab_check(custabilizerStatus_t s, const char *msg) { + if (s != CUSTABILIZER_STATUS_SUCCESS) + throw std::runtime_error(std::string(msg) + ": " + + custabilizerGetErrorString(s)); +} + +struct StreamCudaDeleter { + cudaStream_t stream = nullptr; + void operator()(void *p) const { + if (p) + cudaFreeAsync(p, stream); + } +}; + +template +using DevicePtr = std::unique_ptr; + +template +DevicePtr device_alloc(size_t count, cudaStream_t stream) { + T *p = nullptr; + cuda_check(cudaMallocAsync(&p, count * sizeof(T), stream), "cudaMallocAsync"); + return DevicePtr(p, StreamCudaDeleter{stream}); +} + +struct HandleRAII { + custabilizerHandle_t h = nullptr; + HandleRAII() { custab_check(custabilizerCreate(&h), "custabilizerCreate"); } + ~HandleRAII() { + if (h) + custabilizerDestroy(h); + } + HandleRAII(const HandleRAII &) = delete; + HandleRAII &operator=(const HandleRAII &) = delete; +}; + +} // namespace +#endif // CUDAQ_QEC_HAS_CUSTABILIZER + +namespace cudaq::qec::dem_sampler::gpu { + +bool sample_dem(const uint8_t *d_check_matrix, size_t num_checks, + size_t num_error_mechanisms, + const double *d_error_probabilities, size_t num_shots, + unsigned seed, uint8_t *d_checks_out, uint8_t *d_errors_out, + std::uintptr_t stream_handle) { +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + if (num_checks == 0 || num_error_mechanisms == 0 || num_shots == 0) + return true; + + using namespace cudaq::qec::dem_sampling_utils; + + cudaStream_t stream = reinterpret_cast(stream_handle); + + HandleRAII handle; + + // ── 1. 
Pack H^T from H directly on device ──────────────────────────────── + // For GF2SparseDenseMatrixMultiply: + // C = A @ B where A is CSR [shots × errors], and B is dense packed + // [errors × n_checks_padded/32]. Here B = H^T packed row-wise. + // + // We avoid a host round-trip by packing H^T directly from H in a single + // kernel. + uint64_t n_checks_padded = ((num_checks + 31) / 32) * 32; + uint64_t check_words = n_checks_padded / 32; + + // Pack H^T row-wise: [num_error_mechanisms × num_checks] -> [num_err × words] + auto d_ht_packed = + device_alloc(num_error_mechanisms * check_words, stream); + pack_check_matrix_transposed_rowwise(d_check_matrix, d_ht_packed.get(), + num_checks, num_error_mechanisms, + stream); + + // ── 2. Sparse Bernoulli sampling ────────────────────────────────────────── + size_t ws_size = 0; + custab_check(custabilizerSampleProbArraySparsePrepare( + handle.h, static_cast(num_shots), + static_cast(num_error_mechanisms), &ws_size), + "SparsePrepare"); + + auto d_workspace = device_alloc(ws_size, stream); + + // Initial capacity estimate: E[nnz] * 1.25 + 1024 + // Copy probabilities to host via the caller's stream to respect stream + // ordering (the data may have been produced by a kernel on this stream). 
+ std::vector h_probs(num_error_mechanisms); + cuda_check(cudaMemcpyAsync(h_probs.data(), d_error_probabilities, + num_error_mechanisms * sizeof(double), + cudaMemcpyDeviceToHost, stream), + "copy probs"); + cuda_check(cudaStreamSynchronize(stream), "sync probs"); + double sum_p = 0.0; + for (auto p : h_probs) + sum_p += p; + uint64_t capacity = static_cast(num_shots * sum_p * 1.25 + 1024); + uint64_t max_cap = num_shots * num_error_mechanisms; + if (capacity > max_cap) + capacity = max_cap; + if (capacity < num_shots) + capacity = num_shots; + + auto d_row_offsets = device_alloc(num_shots + 1, stream); + auto d_col_indices = device_alloc(capacity, stream); + + uint64_t nnz = capacity; + custabilizerStatus_t status = custabilizerSampleProbArraySparseCompute( + handle.h, static_cast(num_shots), + static_cast(num_error_mechanisms), d_error_probabilities, + static_cast(seed), &nnz, d_col_indices.get(), + d_row_offsets.get(), d_workspace.get(), ws_size, stream); + + if (status == CUSTABILIZER_STATUS_INSUFFICIENT_SPARSE_STORAGE) { + // nnz now contains the required capacity + capacity = nnz; + d_col_indices = device_alloc(capacity, stream); + nnz = capacity; + custab_check(custabilizerSampleProbArraySparseCompute( + handle.h, static_cast(num_shots), + static_cast(num_error_mechanisms), + d_error_probabilities, static_cast(seed), &nnz, + d_col_indices.get(), d_row_offsets.get(), + d_workspace.get(), ws_size, stream), + "SparseCompute retry"); + } else { + custab_check(status, "SparseCompute"); + } + + // ── 3. 
GF(2) sparse-dense matmul: syndromes = errors * H^T ────────────── + // A = CSR errors [num_shots × num_error_mechanisms] + // B = H^T packed [num_error_mechanisms × n_checks_padded/32] + // C = packed syndromes [num_shots × n_checks_padded/32] + auto d_syndromes_packed = + device_alloc(num_shots * check_words, stream); + + custab_check(custabilizerGF2SparseDenseMatrixMultiply( + handle.h, static_cast(num_shots), + static_cast(n_checks_padded), + static_cast(num_error_mechanisms), nnz, + d_col_indices.get(), d_row_offsets.get(), d_ht_packed.get(), + 0, // beta=0: assign + d_syndromes_packed.get(), stream), + "GF2SparseDenseMatrixMultiply"); + + // ── 4. Unpack syndromes: bitpacked → uint8 ────────────────────────────── + unpack_syndromes_gpu(d_syndromes_packed.get(), d_checks_out, num_shots, + num_checks, stream); + + // ── 5. CSR errors → dense uint8 ───────────────────────────────────────── + csr_to_dense_fused(d_row_offsets.get(), d_col_indices.get(), num_shots, + num_error_mechanisms, d_errors_out, stream); + + cuda_check(cudaStreamSynchronize(stream), "sync"); + + return true; +#else + (void)d_check_matrix; + (void)num_checks; + (void)num_error_mechanisms; + (void)d_error_probabilities; + (void)num_shots; + (void)seed; + (void)d_checks_out; + (void)d_errors_out; + (void)stream_handle; + return false; +#endif +} + +} // namespace cudaq::qec::dem_sampler::gpu diff --git a/libs/qec/lib/dem_sampling/dem_sampling_utils.cu b/libs/qec/lib/dem_sampling/dem_sampling_utils.cu new file mode 100644 index 00000000..8d2dac0e --- /dev/null +++ b/libs/qec/lib/dem_sampling/dem_sampling_utils.cu @@ -0,0 +1,223 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "dem_sampling_utils.h" +#include +#include +#include + +namespace cudaq::qec::dem_sampling_utils { + +namespace { + +constexpr uint64_t kMaxGridDimX = static_cast(INT32_MAX); +constexpr uint64_t kMaxGridDimYZ = 65535u; + +void check_kernel_launch(const char *kernel_name) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + throw std::runtime_error(std::string(kernel_name) + + " launch: " + cudaGetErrorString(err)); +} + +void check_grid_dims(uint64_t grid_x, uint64_t grid_y, + const char *kernel_name) { + if (grid_x > kMaxGridDimX || grid_y > kMaxGridDimYZ) + throw std::runtime_error(std::string(kernel_name) + ": grid dimensions (" + + std::to_string(grid_x) + ", " + + std::to_string(grid_y) + ") exceed CUDA limits (" + + std::to_string(kMaxGridDimX) + ", " + + std::to_string(kMaxGridDimYZ) + ")"); +} + +} // namespace + +// ── Row-wise packing: [M × N] uint8 → [M × words] uint32 ────────────────── +__global__ void +pack_check_matrix_rowwise_kernel(const uint8_t *__restrict__ d_check_matrix, + uint32_t *__restrict__ d_check_packed, + uint32_t M, uint32_t N, + uint32_t errors_words) { + const uint32_t row = blockIdx.x; + const uint32_t word_idx = blockIdx.y * blockDim.x + threadIdx.x; + + if (row >= M || word_idx >= errors_words) + return; + + const uint8_t *row_ptr = d_check_matrix + static_cast(row) * N; + const uint32_t bit_start = word_idx * 32; + + uint32_t packed = 0; +#pragma unroll + for (uint32_t bit = 0; bit < 32; bit++) { + const uint32_t col = bit_start + bit; + if (col < N) { + packed |= (static_cast(__ldg(&row_ptr[col]) & 1u) << bit); + } + } + + d_check_packed[static_cast(row) * errors_words + word_idx] = packed; +} + +void pack_check_matrix_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_packed, uint64_t M, uint64_t N, + cudaStream_t stream) { + if (M == 0 || N == 0) + return; + + uint64_t words = bitpack_words(N); + dim3 
block(256); + uint64_t grid_x = M; + uint64_t grid_y = (words + block.x - 1) / block.x; + check_grid_dims(grid_x, grid_y, "pack_check_matrix_rowwise"); + dim3 grid(static_cast(grid_x), static_cast(grid_y)); + + pack_check_matrix_rowwise_kernel<<>>( + d_check_matrix, d_check_packed, static_cast(M), + static_cast(N), static_cast(words)); + check_kernel_launch("pack_check_matrix_rowwise"); +} + +// ── Direct transpose+pack: H [checks x errors] → packed H^T [errors x words] +__global__ void pack_check_matrix_transposed_rowwise_kernel( + const uint8_t *__restrict__ d_check_matrix, + uint32_t *__restrict__ d_check_t_packed, uint32_t num_checks, + uint32_t num_error_mechanisms, uint32_t checks_words) { + const uint32_t error_mech = blockIdx.x; + const uint32_t word_idx = blockIdx.y * blockDim.x + threadIdx.x; + + if (error_mech >= num_error_mechanisms || word_idx >= checks_words) + return; + + const uint32_t bit_start = word_idx * 32; + uint32_t packed = 0; +#pragma unroll + for (uint32_t bit = 0; bit < 32; ++bit) { + const uint32_t check = bit_start + bit; + if (check < num_checks) { + const uint64_t idx = + static_cast(check) * num_error_mechanisms + error_mech; + packed |= + (static_cast(__ldg(&d_check_matrix[idx]) & 1u) << bit); + } + } + + d_check_t_packed[static_cast(error_mech) * checks_words + + word_idx] = packed; +} + +void pack_check_matrix_transposed_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_t_packed, + uint64_t num_checks, + uint64_t num_error_mechanisms, + cudaStream_t stream) { + if (num_checks == 0 || num_error_mechanisms == 0) + return; + + const uint64_t checks_words = bitpack_words(num_checks); + dim3 block(256); + uint64_t grid_x = num_error_mechanisms; + uint64_t grid_y = (checks_words + block.x - 1) / block.x; + check_grid_dims(grid_x, grid_y, "pack_check_matrix_transposed_rowwise"); + dim3 grid(static_cast(grid_x), static_cast(grid_y)); + + pack_check_matrix_transposed_rowwise_kernel<<>>( + d_check_matrix, d_check_t_packed, 
static_cast(num_checks), + static_cast(num_error_mechanisms), + static_cast(checks_words)); + check_kernel_launch("pack_check_matrix_transposed_rowwise"); +} + +// ── Syndrome unpacking: [S × words] uint32 → [S × M] uint8 ──────────────── +__global__ void unpack_syndromes_kernel(const uint32_t *__restrict__ d_packed, + uint8_t *__restrict__ d_unpacked, + uint64_t S, uint64_t M) { + const uint64_t shot = blockIdx.x; + const uint64_t check = + static_cast(blockIdx.y) * blockDim.x + threadIdx.x; + + if (check >= M) + return; + + const uint64_t words_per_row = (M + 31) >> 5; + const uint32_t word_idx = static_cast(check >> 5); + const uint32_t bit_pos = static_cast(check & 31); + + const uint32_t word = __ldg(&d_packed[shot * words_per_row + word_idx]); + d_unpacked[shot * M + check] = static_cast((word >> bit_pos) & 1u); +} + +void unpack_syndromes_gpu(const uint32_t *d_packed, uint8_t *d_unpacked, + uint64_t num_shots, uint64_t num_checks, + cudaStream_t stream) { + if (num_shots == 0 || num_checks == 0) + return; + + const int threads = 256; + uint64_t grid_x = num_shots; + uint64_t grid_y = (num_checks + threads - 1) / threads; + check_grid_dims(grid_x, grid_y, "unpack_syndromes"); + dim3 grid(static_cast(grid_x), static_cast(grid_y)); + + unpack_syndromes_kernel<<>>(d_packed, d_unpacked, + num_shots, num_checks); + check_kernel_launch("unpack_syndromes"); +} + +// ── CSR → dense (uint64_t indices, warp-cooperative fused zeroing) ───────── +__global__ void +csr_to_dense_fused_kernel(const uint64_t *__restrict__ d_row_offsets, + const uint64_t *__restrict__ d_col_indices, + uint64_t num_shots, uint64_t num_error_mechanisms, + uint8_t *__restrict__ d_errors_out) { + const uint32_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + const uint32_t lane = threadIdx.x % 32; + + if (warp_id >= num_shots) + return; + + const uint64_t shot = warp_id; + uint8_t *shot_out = d_errors_out + shot * num_error_mechanisms; + + for (uint64_t i = lane; i < 
num_error_mechanisms; i += 32) + shot_out[i] = 0; + + __syncwarp(); + + const uint64_t start = d_row_offsets[shot]; + const uint64_t end = d_row_offsets[shot + 1]; + const uint64_t num_errors = end - start; + + for (uint64_t e = lane; e < num_errors; e += 32) { + uint64_t idx = d_col_indices[start + e]; + if (idx < num_error_mechanisms) { + shot_out[idx] = 1; + } + } +} + +void csr_to_dense_fused(const uint64_t *d_row_offsets, + const uint64_t *d_col_indices, uint64_t num_shots, + uint64_t num_error_mechanisms, uint8_t *d_errors_out, + cudaStream_t stream) { + if (num_shots == 0 || num_error_mechanisms == 0) + return; + + const int warps_per_block = 8; + const int threads_per_block = warps_per_block * 32; + uint64_t num_blocks_u64 = (num_shots + warps_per_block - 1) / warps_per_block; + check_grid_dims(num_blocks_u64, 1, "csr_to_dense_fused"); + const int num_blocks = static_cast(num_blocks_u64); + + csr_to_dense_fused_kernel<<>>( + d_row_offsets, d_col_indices, num_shots, num_error_mechanisms, + d_errors_out); + check_kernel_launch("csr_to_dense_fused"); +} + +} // namespace cudaq::qec::dem_sampling_utils diff --git a/libs/qec/lib/dem_sampling/dem_sampling_utils.h b/libs/qec/lib/dem_sampling/dem_sampling_utils.h new file mode 100644 index 00000000..cb24c2b5 --- /dev/null +++ b/libs/qec/lib/dem_sampling/dem_sampling_utils.h @@ -0,0 +1,47 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ +#pragma once + +#include +#include + +namespace cudaq::qec::dem_sampling_utils { + +inline __host__ __device__ uint64_t bitpack_words(uint64_t num_bits) { + return (num_bits + 31) / 32; +} + +/// Pack [M x N] uint8 matrix row-wise into [M x words] uint32, where +/// words = ceil(N/32). Bit i in word w encodes column w*32+i. +void pack_check_matrix_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_packed, uint64_t M, uint64_t N, + cudaStream_t stream = 0); + +/// Pack H^T directly from H without an explicit transpose buffer. +/// Input H is [num_checks x num_error_mechanisms] uint8. +/// Output packed H^T is [num_error_mechanisms x ceil(num_checks/32)] uint32. +void pack_check_matrix_transposed_rowwise(const uint8_t *d_check_matrix, + uint32_t *d_check_t_packed, + uint64_t num_checks, + uint64_t num_error_mechanisms, + cudaStream_t stream = 0); + +/// Unpack [S x words] uint32 syndromes into [S x M] uint8. +void unpack_syndromes_gpu(const uint32_t *d_packed, uint8_t *d_unpacked, + uint64_t num_shots, uint64_t num_checks, + cudaStream_t stream = 0); + +/// Convert CSR errors (uint64_t offsets/indices from cuStabilizer) to +/// dense [num_shots x num_error_mechanisms] uint8, fusing zeroing with +/// scatter. The warp-cooperative kernel avoids a separate cudaMemset. +void csr_to_dense_fused(const uint64_t *d_row_offsets, + const uint64_t *d_col_indices, uint64_t num_shots, + uint64_t num_error_mechanisms, uint8_t *d_errors_out, + cudaStream_t stream = 0); + +} // namespace cudaq::qec::dem_sampling_utils diff --git a/libs/qec/lib/experiments.cpp b/libs/qec/lib/experiments.cpp index 6c4155e4..6c899dde 100644 --- a/libs/qec/lib/experiments.cpp +++ b/libs/qec/lib/experiments.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. 
* + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -8,6 +8,8 @@ #include "cudaq/qec/experiments.h" #include "device/memory_circuit.h" +#include "cudaq/qec/dem_sampling.h" +#include using namespace cudaqx; @@ -83,6 +85,90 @@ sample_code_capacity(const code &code, std::size_t nShots, seed); } +} // namespace cudaq::qec + +namespace cudaq::qec::dem_sampler::cpu { + +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities, unsigned seed) { + if (check_matrix.rank() != 2) + throw std::invalid_argument("check_matrix must be rank-2"); + + size_t num_checks = check_matrix.shape()[0]; + size_t num_error_mechanisms = check_matrix.shape()[1]; + + if (error_probabilities.size() != num_error_mechanisms) + throw std::invalid_argument( + "error_probabilities size must match number of error mechanisms"); + + for (double p : error_probabilities) { + if (!std::isfinite(p) || p < 0.0 || p > 1.0) + throw std::invalid_argument( + "error_probabilities entries must be finite values in [0, 1]"); + } + + if (numShots == 0) { + cudaqx::tensor errors({0, num_error_mechanisms}); + cudaqx::tensor checks({0, num_checks}); + return std::make_tuple(checks, errors); + } + + std::mt19937 rng(seed); + std::vector distributions; + distributions.reserve(num_error_mechanisms); + for (double p : error_probabilities) + distributions.emplace_back(p); + + cudaqx::tensor H_binary({num_checks, num_error_mechanisms}); + std::vector h_bin(num_checks * num_error_mechanisms); + for (size_t i = 0; i < num_checks; ++i) + for (size_t j = 0; j < num_error_mechanisms; ++j) + h_bin[i * num_error_mechanisms + j] = + check_matrix.at({i, j}) & static_cast(1); + H_binary.copy(h_bin.data(), H_binary.shape()); + + cudaqx::tensor errors({numShots, num_error_mechanisms}); + cudaqx::tensor checks({numShots, num_checks}); + 
+ std::vector error_bits(numShots * num_error_mechanisms); + for (size_t shot = 0; shot < numShots; ++shot) { + for (size_t err = 0; err < num_error_mechanisms; ++err) { + error_bits[shot * num_error_mechanisms + err] = distributions[err](rng); + } + } + + errors.copy(error_bits.data(), errors.shape()); + checks = errors.dot(H_binary.transpose()) % 2; + + return std::make_tuple(checks, errors); +} + +std::tuple, cudaqx::tensor> +sample_dem(const cudaqx::tensor &check_matrix, std::size_t numShots, + const std::vector &error_probabilities) { + return sample_dem(check_matrix, numShots, error_probabilities, + std::random_device()()); +} + +} // namespace cudaq::qec::dem_sampler::cpu + +namespace cudaq::qec { + +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t nShots, + const std::vector &error_probabilities) { + return dem_sampler::cpu::sample_dem(check_matrix, nShots, + error_probabilities); +} + +std::tuple, cudaqx::tensor> +dem_sampling(const cudaqx::tensor &check_matrix, std::size_t nShots, + const std::vector &error_probabilities, unsigned seed) { + return dem_sampler::cpu::sample_dem(check_matrix, nShots, error_probabilities, + seed); +} + std::tuple, cudaqx::tensor> sample_memory_circuit(const code &code, operation statePrep, std::size_t numShots, std::size_t numRounds, diff --git a/libs/qec/pyproject.toml.cu12 b/libs/qec/pyproject.toml.cu12 index 63ba5aee..ed18e8f9 100644 --- a/libs/qec/pyproject.toml.cu12 +++ b/libs/qec/pyproject.toml.cu12 @@ -13,6 +13,7 @@ readme = "README.md" license = "LicenseRef-NVIDIA-Proprietary" dependencies = [ 'cuda-quantum-cu12 == 0.14.*', + 'cuquantum-python-cu12>=26.03.0', ] classifiers = [ 'Intended Audience :: Science/Research', @@ -49,6 +50,7 @@ ninja.version = ">=1.10" [tool.scikit-build.cmake.define] CUDAQX_QEC_INCLUDE_TESTS = false CUDAQX_QEC_BINDINGS_PYTHON = true +CUDAQ_QEC_REQUIRE_CUSTABILIZER = true [tool.setuptools_scm] write_to = "_version.py" @@ -57,16 +59,17 @@ write_to = 
"_version.py" tensor_network_decoder = [ "quimb", "opt_einsum", - "torch", - "cuquantum-python-cu12==26.01.0" + "cuquantum-python-cu12>=26.3.0" ] trt_decoder = [ "tensorrt-cu12" ] +dem_sampling = [ + "cuquantum-python-cu12>=26.3.0" +] all = [ "quimb", "opt_einsum", - "torch", - "cuquantum-python-cu12==26.01.0", + "cuquantum-python-cu12>=26.3.0", "tensorrt-cu12; platform_machine == 'x86_64'" ] diff --git a/libs/qec/pyproject.toml.cu13 b/libs/qec/pyproject.toml.cu13 index 5179eb90..fcbd2fb6 100644 --- a/libs/qec/pyproject.toml.cu13 +++ b/libs/qec/pyproject.toml.cu13 @@ -13,6 +13,7 @@ readme = "README.md" license = "LicenseRef-NVIDIA-Proprietary" dependencies = [ 'cuda-quantum-cu13 == 0.14.*', + 'cuquantum-python-cu13>=26.03.0', ] classifiers = [ 'Intended Audience :: Science/Research', @@ -49,6 +50,7 @@ ninja.version = ">=1.10" [tool.scikit-build.cmake.define] CUDAQX_QEC_INCLUDE_TESTS = false CUDAQX_QEC_BINDINGS_PYTHON = true +CUDAQ_QEC_REQUIRE_CUSTABILIZER = true [tool.setuptools_scm] write_to = "_version.py" @@ -57,16 +59,17 @@ write_to = "_version.py" tensor_network_decoder = [ "quimb", "opt_einsum", - "torch>=2.9.0", - "cuquantum-python-cu13==26.01.0" + "cuquantum-python-cu13>=26.3.0" ] trt_decoder = [ "tensorrt-cu13" ] +dem_sampling = [ + "cuquantum-python-cu13>=26.3.0" +] all = [ "quimb", "opt_einsum", - "torch>=2.9.0", - "cuquantum-python-cu13==26.01.0", + "cuquantum-python-cu13>=26.3.0", "tensorrt-cu13" ] diff --git a/libs/qec/python/CMakeLists.txt b/libs/qec/python/CMakeLists.txt index ae23fd4b..01e1f348 100644 --- a/libs/qec/python/CMakeLists.txt +++ b/libs/qec/python/CMakeLists.txt @@ -29,6 +29,7 @@ cudaqx_add_pymodule(${MODULE_NAME} bindings/py_decoder.cpp bindings/py_decoding_config.cpp bindings/py_decoding.cpp + bindings/py_dem_sampling.cpp bindings/py_surface_code.cpp bindings/py_patch.cpp ) @@ -40,6 +41,12 @@ target_link_libraries(${MODULE_NAME} cudaq::cudaq-python-interop ) +if(CUDAQ_QEC_HAS_CUSTABILIZER) + find_package(CUDAToolkit REQUIRED) + 
target_link_libraries(${MODULE_NAME} PRIVATE CUDA::cudart) + target_include_directories(${MODULE_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +endif() + set_target_properties(${MODULE_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/python/cudaq_qec") diff --git a/libs/qec/python/bindings/cudaqx_qec.cpp b/libs/qec/python/bindings/cudaqx_qec.cpp index a52df904..bd8c514b 100644 --- a/libs/qec/python/bindings/cudaqx_qec.cpp +++ b/libs/qec/python/bindings/cudaqx_qec.cpp @@ -10,6 +10,7 @@ #include "py_decoder.h" #include "py_decoding.h" #include "py_decoding_config.h" +#include "py_dem_sampling.h" #include "py_surface_code.h" #include @@ -21,5 +22,6 @@ PYBIND11_MODULE(_pycudaqx_qec_the_suffix_matters_cudaq_qec, mod) { cudaq::qec::bindDecoder(mod); cudaq::qec::decoding::config::bindDecodingConfig(mod); cudaq::qec::decoding::bindDecoding(mod); + cudaq::qec::dem_sampler::bindDemSampling(mod); cudaq::qec::surface_code::bindSurfaceCode(mod); } diff --git a/libs/qec/python/bindings/py_decoder.cpp b/libs/qec/python/bindings/py_decoder.cpp index bd40560c..ef17d3a4 100644 --- a/libs/qec/python/bindings/py_decoder.cpp +++ b/libs/qec/python/bindings/py_decoder.cpp @@ -322,7 +322,7 @@ void bindDecoder(py::module &mod) { "Parity check matrix must be an array of uint8_t."); } - if (buf.strides[0] == buf.itemsize) { + if (buf.strides[0] < buf.strides[buf.ndim - 1]) { throw std::runtime_error( "Parity check matrix must be in row-major order, but " "column-major order was detected."); diff --git a/libs/qec/python/bindings/py_dem_sampling.cpp b/libs/qec/python/bindings/py_dem_sampling.cpp new file mode 100644 index 00000000..b6ddffc0 --- /dev/null +++ b/libs/qec/python/bindings/py_dem_sampling.cpp @@ -0,0 +1,628 @@ +/******************************************************************************* + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "type_casters.h" +#include "cudaq/qec/dem_sampling.h" + +#include "cuda-qx/core/kwargs_utils.h" + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER +#include +#endif + +namespace py = pybind11; +using namespace pybind11::literals; + +namespace cudaq::qec::dem_sampler { + +namespace { + +/// Convert a py::object (numpy array or PyTorch tensor) to a numpy uint8 array. +py::array_t asNumpyUint8(py::object obj) { + py::module_ np = py::module_::import("numpy"); + try { + py::module_ torch = py::module_::import("torch"); + if (py::isinstance(obj, torch.attr("Tensor"))) { + py::object t = obj.attr("detach")(); + if (t.attr("is_cuda").cast()) + t = t.attr("cpu")(); + obj = t.attr("numpy")(); + } + } catch (py::error_already_set &) { + PyErr_Clear(); + } + return np.attr("ascontiguousarray")(obj, "dtype"_a = np.attr("uint8")) + .cast>(); +} + +/// Convert a py::object (numpy array or PyTorch tensor) to a numpy float64 +/// array. 
+py::array_t asNumpyFloat64(py::object obj) { + py::module_ np = py::module_::import("numpy"); + try { + py::module_ torch = py::module_::import("torch"); + if (py::isinstance(obj, torch.attr("Tensor"))) { + py::object t = obj.attr("detach")(); + if (t.attr("is_cuda").cast()) + t = t.attr("cpu")(); + obj = t.attr("numpy")(); + } + } catch (py::error_already_set &) { + PyErr_Clear(); + } + auto arr = np.attr("ascontiguousarray")(obj, "dtype"_a = np.attr("float64")) + .cast>(); + py::buffer_info info = arr.request(); + if (info.ndim != 1) + throw std::runtime_error( + "error_probabilities must be a 1-D array, got ndim=" + + std::to_string(info.ndim)); + return arr; +} + +enum class DemSamplingBackend { Auto, Cpu, Gpu }; + +DemSamplingBackend parseBackend(std::string backend) { + std::transform( + backend.begin(), backend.end(), backend.begin(), + [](unsigned char ch) { return static_cast(std::tolower(ch)); }); + + if (backend == "auto") + return DemSamplingBackend::Auto; + if (backend == "cpu") + return DemSamplingBackend::Cpu; + if (backend == "gpu") + return DemSamplingBackend::Gpu; + + throw std::runtime_error("dem_sampling: invalid backend '" + backend + + "'. 
Expected one of: auto, cpu, gpu."); +} + +void validateInputShapes(const py::array_t &check_matrix_np, + const py::array_t &error_probs_np) { + py::buffer_info h_buf = check_matrix_np.request(); + if (h_buf.ndim != 2) + throw std::runtime_error("check_matrix must be rank-2"); + + py::buffer_info p_buf = error_probs_np.request(); + auto num_mechanisms = static_cast(h_buf.shape[1]); + auto num_probs = static_cast(p_buf.shape[0]); + if (num_probs != num_mechanisms) { + throw std::runtime_error("dem_sampling: error_probabilities length (" + + std::to_string(num_probs) + + ") != check_matrix columns (" + + std::to_string(num_mechanisms) + ")"); + } +} + +void validateProbabilityValues(const py::array_t &error_probs_np) { + py::buffer_info p_buf = error_probs_np.request(); + auto *probs = static_cast(p_buf.ptr); + auto count = static_cast(p_buf.shape[0]); + + for (std::size_t i = 0; i < count; ++i) { + const double p = probs[i]; + if (!std::isfinite(p) || p < 0.0 || p > 1.0) { + throw std::runtime_error( + "dem_sampling: error_probabilities[" + std::to_string(i) + + "] is invalid. 
Values must be finite and in [0, 1]."); + } + } +} + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + +struct CudaDeleter { + void operator()(void *p) const { + if (p) + cudaFree(p); + } +}; + +template +using DevicePtr = std::unique_ptr; + +template +DevicePtr device_alloc(std::size_t count) { + T *p = nullptr; + if (cudaMalloc(&p, count * sizeof(T)) != cudaSuccess) { + cudaGetLastError(); + return DevicePtr(nullptr); + } + return DevicePtr(p); +} + +class ScopedCudaDevice { +public: + ScopedCudaDevice(int target_device, std::string &failure_reason) { + auto get_device_status = cudaGetDevice(&previous_device); + if (get_device_status != cudaSuccess) { + failure_reason = std::string("cudaGetDevice failed: ") + + cudaGetErrorString(get_device_status); + return; + } + + if (target_device < 0) + target_device = previous_device; + active_device = target_device; + + if (previous_device != active_device) { + auto set_device_status = cudaSetDevice(active_device); + if (set_device_status != cudaSuccess) { + failure_reason = std::string("cudaSetDevice failed: ") + + cudaGetErrorString(set_device_status); + return; + } + } + + is_active = true; + } + + ~ScopedCudaDevice() { + if (is_active && previous_device != active_device) + cudaSetDevice(previous_device); + } + + bool ok() const { return is_active; } + +private: + int previous_device = -1; + int active_device = -1; + bool is_active = false; +}; + +bool getTorchDeviceIndex(const py::module_ &torch, const py::object &device, + int &device_index, std::string &failure_reason) { + if (!py::isinstance(device, torch.attr("device"))) { + failure_reason = "torch device object is invalid"; + return false; + } + + py::object index_obj = device.attr("index"); + if (!index_obj.is_none()) { + device_index = index_obj.cast(); + } else { + auto get_device_status = cudaGetDevice(&device_index); + if (get_device_status != cudaSuccess) { + failure_reason = std::string("cudaGetDevice failed: ") + + cudaGetErrorString(get_device_status); + return false; 
+ } + } + + int device_count = 0; + auto count_status = cudaGetDeviceCount(&device_count); + if (count_status != cudaSuccess) { + failure_reason = std::string("cudaGetDeviceCount failed: ") + + cudaGetErrorString(count_status); + return false; + } + if (device_index < 0 || device_index >= device_count) { + failure_reason = "invalid CUDA device index " + + std::to_string(device_index) + + " (device_count=" + std::to_string(device_count) + ")"; + return false; + } + + return true; +} + +bool getTorchCurrentStreamHandle(const py::module_ &torch, + const py::object &device, + std::uintptr_t &stream_handle, + std::string &failure_reason) { + try { + py::object stream_obj = torch.attr("cuda").attr("current_stream")(device); + stream_handle = stream_obj.attr("cuda_stream").cast(); + return true; + } catch (py::error_already_set &e) { + failure_reason = + std::string("failed to query torch CUDA stream: ") + e.what(); + PyErr_Clear(); + return false; + } +} + +bool tryGpuSampling(const py::array_t &check_matrix_np, + const py::array_t &error_probs_np, + std::size_t numShots, unsigned seed, + py::array_t &syndromes_out, + py::array_t &errors_out, + std::string &failure_reason) { + int device_count = 0; + auto device_count_status = cudaGetDeviceCount(&device_count); + if (device_count_status != cudaSuccess) { + failure_reason = std::string("cudaGetDeviceCount failed: ") + + cudaGetErrorString(device_count_status); + return false; + } + if (device_count == 0) { + failure_reason = "no CUDA device available"; + return false; + } + + py::buffer_info h_buf = check_matrix_np.request(); + if (h_buf.ndim != 2) { + failure_reason = "check_matrix must be rank-2"; + return false; + } + auto num_checks = static_cast(h_buf.shape[0]); + auto num_mechanisms = static_cast(h_buf.shape[1]); + + py::buffer_info p_buf = error_probs_np.request(); + auto num_probs = static_cast(p_buf.shape[0]); + if (num_probs != num_mechanisms) { + failure_reason = "error_probabilities length mismatch"; + return 
false; + } + + std::size_t h_bytes = num_checks * num_mechanisms * sizeof(uint8_t); + std::size_t p_bytes = num_mechanisms * sizeof(double); + std::size_t syn_bytes = numShots * num_checks * sizeof(uint8_t); + std::size_t err_bytes = numShots * num_mechanisms * sizeof(uint8_t); + + auto d_H = device_alloc(num_checks * num_mechanisms); + if (!d_H) { + failure_reason = "failed to allocate device memory for check_matrix"; + return false; + } + auto d_probs = device_alloc(num_mechanisms); + if (!d_probs) { + failure_reason = "failed to allocate device memory for error_probabilities"; + return false; + } + auto d_syn = device_alloc(numShots * num_checks); + if (!d_syn) { + failure_reason = "failed to allocate device memory for syndromes"; + return false; + } + auto d_err = device_alloc(numShots * num_mechanisms); + if (!d_err) { + failure_reason = "failed to allocate device memory for errors"; + return false; + } + + auto copy_h_status = + cudaMemcpy(d_H.get(), h_buf.ptr, h_bytes, cudaMemcpyHostToDevice); + if (copy_h_status != cudaSuccess) { + failure_reason = std::string("failed to copy check_matrix to device: ") + + cudaGetErrorString(copy_h_status); + cudaGetLastError(); + return false; + } + auto copy_p_status = + cudaMemcpy(d_probs.get(), p_buf.ptr, p_bytes, cudaMemcpyHostToDevice); + if (copy_p_status != cudaSuccess) { + failure_reason = std::string("failed to copy probabilities to device: ") + + cudaGetErrorString(copy_p_status); + cudaGetLastError(); + return false; + } + + bool ok = + gpu::sample_dem(d_H.get(), num_checks, num_mechanisms, d_probs.get(), + numShots, seed, d_syn.get(), d_err.get()); + + if (!ok) { + failure_reason = "cuStabilizer GPU sampler unavailable at runtime"; + return false; + } + + if (ok) { + syndromes_out = + py::array_t({static_cast(numShots), + static_cast(num_checks)}); + errors_out = + py::array_t({static_cast(numShots), + static_cast(num_mechanisms)}); + + auto copy_syn_status = cudaMemcpy(syndromes_out.mutable_data(), 
d_syn.get(), + syn_bytes, cudaMemcpyDeviceToHost); + if (copy_syn_status != cudaSuccess) { + failure_reason = std::string("failed to copy syndromes to host: ") + + cudaGetErrorString(copy_syn_status); + cudaGetLastError(); + return false; + } + + auto copy_err_status = cudaMemcpy(errors_out.mutable_data(), d_err.get(), + err_bytes, cudaMemcpyDeviceToHost); + if (copy_err_status != cudaSuccess) { + failure_reason = std::string("failed to copy errors to host: ") + + cudaGetErrorString(copy_err_status); + cudaGetLastError(); + return false; + } + } + + failure_reason.clear(); + return ok; +} + +bool tryTorchGpuSampling(py::object check_matrix_obj, + py::object error_probs_obj, std::size_t numShots, + unsigned seed, py::tuple &result_out, + std::string &failure_reason, bool allow_move_to_cuda) { + py::module_ torch; + try { + torch = py::module_::import("torch"); + } catch (py::error_already_set &) { + PyErr_Clear(); + failure_reason = "PyTorch is not available"; + return false; + } + + if (!py::isinstance(check_matrix_obj, torch.attr("Tensor")) || + !py::isinstance(error_probs_obj, torch.attr("Tensor"))) { + failure_reason = "inputs are not both torch.Tensor"; + return false; + } + + py::object check_t = check_matrix_obj; + py::object probs_t = error_probs_obj; + + bool check_is_cuda = check_t.attr("is_cuda").cast(); + bool probs_is_cuda = probs_t.attr("is_cuda").cast(); + + py::object device; + if (check_is_cuda) { + device = check_t.attr("device"); + } else if (probs_is_cuda) { + device = probs_t.attr("device"); + } else if (allow_move_to_cuda) { + int device_count = 0; + auto device_count_status = cudaGetDeviceCount(&device_count); + if (device_count_status != cudaSuccess) { + failure_reason = std::string("cudaGetDeviceCount failed: ") + + cudaGetErrorString(device_count_status); + return false; + } + if (device_count == 0) { + failure_reason = "no CUDA device available"; + return false; + } + device = torch.attr("device")("cuda"); + } else { + failure_reason = 
"torch tensors are not on CUDA device"; + return false; + } + + check_t = + check_t.attr("to")("device"_a = device, "dtype"_a = torch.attr("uint8")); + probs_t = probs_t.attr("to")("device"_a = device, + "dtype"_a = torch.attr("float64")); + check_t = check_t.attr("contiguous")(); + probs_t = probs_t.attr("contiguous")(); + + auto check_shape = check_t.attr("shape").cast(); + if (check_shape.size() != 2) { + failure_reason = "check_matrix must be rank-2"; + return false; + } + + auto probs_shape = probs_t.attr("shape").cast(); + if (probs_shape.size() != 1) { + failure_reason = "error_probabilities must be rank-1"; + return false; + } + + auto num_checks = + static_cast(check_shape[0].cast()); + auto num_mechanisms = + static_cast(check_shape[1].cast()); + auto num_probs = static_cast(probs_shape[0].cast()); + + if (num_probs != num_mechanisms) { + failure_reason = "error_probabilities length mismatch"; + return false; + } + + bool all_finite = + torch.attr("isfinite")(probs_t).attr("all")().attr("item")().cast(); + bool any_below_zero = + probs_t.attr("lt")(0.0).attr("any")().attr("item")().cast(); + bool any_above_one = + probs_t.attr("gt")(1.0).attr("any")().attr("item")().cast(); + if (!all_finite || any_below_zero || any_above_one) { + failure_reason = "error_probabilities must be finite values in [0, 1]"; + return false; + } + + py::object actual_device = check_t.attr("device"); + int target_device = -1; + if (!getTorchDeviceIndex(torch, actual_device, target_device, failure_reason)) + return false; + + ScopedCudaDevice device_guard(target_device, failure_reason); + if (!device_guard.ok()) + return false; + + std::uintptr_t stream_handle = 0; + if (!getTorchCurrentStreamHandle(torch, actual_device, stream_handle, + failure_reason)) + return false; + + py::object syndromes_t = + torch.attr("empty")(py::make_tuple(static_cast(numShots), + static_cast(num_checks)), + "dtype"_a = torch.attr("uint8"), "device"_a = device); + py::object errors_t = 
torch.attr("empty")( + py::make_tuple(static_cast(numShots), + static_cast(num_mechanisms)), + "dtype"_a = torch.attr("uint8"), "device"_a = device); + + auto d_H = reinterpret_cast( + check_t.attr("data_ptr")().cast()); + auto d_probs = reinterpret_cast( + probs_t.attr("data_ptr")().cast()); + auto d_syn = reinterpret_cast( + syndromes_t.attr("data_ptr")().cast()); + auto d_err = reinterpret_cast( + errors_t.attr("data_ptr")().cast()); + + bool ok = gpu::sample_dem(d_H, num_checks, num_mechanisms, d_probs, numShots, + seed, d_syn, d_err, stream_handle); + if (!ok) { + failure_reason = "cuStabilizer GPU sampler unavailable at runtime"; + return false; + } + + result_out = py::make_tuple(syndromes_t, errors_t); + failure_reason.clear(); + return true; +} +#endif // CUDAQ_QEC_HAS_CUSTABILIZER + +} // namespace + +void bindDemSampling(py::module &mod) { + auto qecmod = py::hasattr(mod, "qecrt") + ? mod.attr("qecrt").cast() + : mod.def_submodule("qecrt"); + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + qecmod.attr("dem_sampling_has_gpu_compiled") = py::bool_(true); +#else + qecmod.attr("dem_sampling_has_gpu_compiled") = py::bool_(false); +#endif + + qecmod.def( + "dem_sampling", + [](py::object check_matrix_obj, std::size_t numShots, + py::object error_probs_obj, std::optional seed, + std::string backend) -> py::tuple { + const auto backend_mode = parseBackend(std::move(backend)); + + if (numShots == 0) { + auto check_matrix_np = asNumpyUint8(check_matrix_obj); + auto error_probs_np = asNumpyFloat64(error_probs_obj); + validateInputShapes(check_matrix_np, error_probs_np); + py::buffer_info h_buf = check_matrix_np.request(); + auto num_checks = static_cast(h_buf.shape[0]); + auto num_mechanisms = static_cast(h_buf.shape[1]); + py::module_ np = py::module_::import("numpy"); + return py::make_tuple( + np.attr("empty")(py::make_tuple(0, num_checks), + "dtype"_a = np.attr("uint8")), + np.attr("empty")(py::make_tuple(0, num_mechanisms), + "dtype"_a = np.attr("uint8"))); + } + + 
unsigned actual_seed = + seed.has_value() ? seed.value() + : static_cast(std::random_device{}()); + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + std::string torch_gpu_failure_reason = "not attempted"; + if (backend_mode != DemSamplingBackend::Cpu) { + py::tuple torch_result; + // In "gpu" mode we may move torch CPU tensors to CUDA. In "auto" + // mode we preserve existing behavior and only use this path for + // CUDA tensors. + bool allow_move_to_cuda = backend_mode == DemSamplingBackend::Gpu; + if (tryTorchGpuSampling(check_matrix_obj, error_probs_obj, numShots, + actual_seed, torch_result, + torch_gpu_failure_reason, allow_move_to_cuda)) + return torch_result; + } +#endif + + auto check_matrix_np = asNumpyUint8(check_matrix_obj); + auto error_probs_np = asNumpyFloat64(error_probs_obj); + validateInputShapes(check_matrix_np, error_probs_np); + validateProbabilityValues(error_probs_np); + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + if (backend_mode != DemSamplingBackend::Cpu) { + py::array_t syndromes_gpu, errors_gpu; + std::string gpu_failure_reason; + if (tryGpuSampling(check_matrix_np, error_probs_np, numShots, + actual_seed, syndromes_gpu, errors_gpu, + gpu_failure_reason)) + return py::make_tuple(syndromes_gpu, errors_gpu); + + if (backend_mode == DemSamplingBackend::Gpu) { + throw std::runtime_error( + "dem_sampling: GPU backend requested but unavailable. 
" + "torch path: " + + torch_gpu_failure_reason + + "; numpy path: " + gpu_failure_reason); + } + } +#elif !defined(CUDAQ_QEC_HAS_CUSTABILIZER) + if (backend_mode == DemSamplingBackend::Gpu) { + throw std::runtime_error( + "dem_sampling: GPU backend requested, but this build does not " + "include cuStabilizer support."); + } +#endif + + auto H = cudaqx::toTensor(check_matrix_np); + + py::buffer_info p_buf = error_probs_np.request(); + std::vector probs(static_cast(p_buf.ptr), + static_cast(p_buf.ptr) + + p_buf.shape[0]); + + auto [syndromes, errors] = + cpu::sample_dem(H, numShots, probs, actual_seed); + + return py::make_tuple( + cudaq::python::copyCUDAQXTensorToPyArray(syndromes), + cudaq::python::copyCUDAQXTensorToPyArray(errors)); + }, + R"pbdoc( + Sample syndrome measurements from a detector error model. + + Generates random error vectors according to the given per-mechanism + error probabilities, then computes syndromes via the check matrix. + Backend selection is controlled by the `backend` argument. + - "auto" (default): try GPU first, fall back to CPU. + - "cpu": always run the CPU implementation. + - "gpu": require GPU; raise if unavailable. + + The check_matrix and error_probabilities arguments accept both + NumPy arrays and PyTorch tensors. + + If PyTorch CUDA tensors are used on the GPU path, outputs are returned + as PyTorch CUDA tensors (device pointers preserved). Otherwise outputs + are NumPy arrays. + + Args: + check_matrix: Binary check matrix [num_checks x num_error_mechanisms], + as a NumPy uint8 array or PyTorch tensor. + numShots: Number of measurement shots to sample. + error_probabilities: Per-error-mechanism probabilities + [num_error_mechanisms], as a NumPy float64 + array or PyTorch tensor. + seed: Optional RNG seed for reproducibility. + backend: "auto", "cpu", or "gpu". Default is "auto". + + Returns: + A tuple (syndromes, errors) with shapes [numShots x num_checks] + and [numShots x num_error_mechanisms]. 
+ )pbdoc", + py::arg("check_matrix"), py::arg("numShots"), + py::arg("error_probabilities"), py::arg("seed") = py::none(), + py::arg("backend") = "auto"); +} + +} // namespace cudaq::qec::dem_sampler diff --git a/libs/qec/python/bindings/py_dem_sampling.h b/libs/qec/python/bindings/py_dem_sampling.h new file mode 100644 index 00000000..1d290210 --- /dev/null +++ b/libs/qec/python/bindings/py_dem_sampling.h @@ -0,0 +1,15 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include + +namespace py = pybind11; + +namespace cudaq::qec::dem_sampler { +void bindDemSampling(py::module &mod); +} // namespace cudaq::qec::dem_sampler diff --git a/libs/qec/python/cudaq_qec/__init__.py b/libs/qec/python/cudaq_qec/__init__.py index 19a2c9d3..faa7fe3c 100644 --- a/libs/qec/python/cudaq_qec/__init__.py +++ b/libs/qec/python/cudaq_qec/__init__.py @@ -7,7 +7,18 @@ # ============================================================================ # from .patch import patch -from ._pycudaqx_qec_the_suffix_matters_cudaq_qec import * +try: + from ._pycudaqx_qec_the_suffix_matters_cudaq_qec import * +except ImportError as exc: + err = str(exc) + if "libcustabilizer" in err or "cuStabilizer" in err: + raise ImportError( + "Failed to load cudaq_qec native extension because cuStabilizer " + "runtime libraries are missing. 
Install the matching cuQuantum " + "package for your CUDA wheel (for example, " + "'cuquantum-python-cu12>=26.03.0' or " + "'cuquantum-python-cu13>=26.03.0').") from exc + raise __version__ = qecrt.__version__ code = qecrt.code @@ -57,6 +68,8 @@ stabilizer_grid = qecrt.stabilizer_grid role_to_str = qecrt.role_to_str +from .dem_sampling import dem_sampling + from .plugins import decoders, codes import pkgutil, importlib, traceback diff --git a/libs/qec/python/cudaq_qec/dem_sampling.py b/libs/qec/python/cudaq_qec/dem_sampling.py new file mode 100644 index 00000000..99b29063 --- /dev/null +++ b/libs/qec/python/cudaq_qec/dem_sampling.py @@ -0,0 +1,66 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +"""DEM sampling via the C++ pybind11 binding (GPU with cuStabilizer + CPU fallback). + +Public API: + dem_sampling( + check_matrix, num_shots, error_probabilities, seed=None, backend="auto" + ) + +When compiled with cuStabilizer, the function delegates to the GPU sampler. +Otherwise it falls back to the CPU implementation. Both paths are handled +transparently by the C++ binding. + +The check_matrix and error_probabilities arguments accept both NumPy arrays +and PyTorch tensors, enabling direct integration with AI/ML workflows. +""" + +from __future__ import annotations + +from typing import Optional, Tuple + +__all__ = ["dem_sampling"] + + +def dem_sampling( + check_matrix, + num_shots: int, + error_probabilities, + seed: Optional[int] = None, + backend: str = "auto", +) -> Tuple[object, object]: + """Sample errors and syndromes from a Detector Error Model. 
+ + Args: + check_matrix: Binary matrix [num_checks x num_error_mechanisms], + as a NumPy uint8 array or PyTorch tensor. + num_shots: Number of independent Monte-Carlo shots. + error_probabilities: 1-D array of length num_error_mechanisms with + independent Bernoulli probabilities for each mechanism. + Accepts NumPy float64 array or PyTorch tensor. + seed: Optional RNG seed for reproducibility. + backend: Backend selection policy: + - "auto" (default): try GPU, fall back to CPU. + - "cpu": force CPU implementation. + - "gpu": force GPU implementation and raise if unavailable. + + Returns: + (syndromes, errors) where + syndromes: uint8 array/tensor [num_shots x num_checks] + errors: uint8 array/tensor [num_shots x num_error_mechanisms] + + When compiled with cuStabilizer the function uses GPU-accelerated + sampling. Otherwise it falls back to the CPU implementation. + Both NumPy arrays and PyTorch tensors are accepted as inputs. + For PyTorch CUDA tensors on GPU path, outputs are PyTorch CUDA tensors; + otherwise outputs are NumPy arrays. + """ + from . import _pycudaqx_qec_the_suffix_matters_cudaq_qec as _qecmod + + return _qecmod.qecrt.dem_sampling(check_matrix, num_shots, + error_probabilities, seed, backend) diff --git a/libs/qec/python/tests/test_dem_sampling.py b/libs/qec/python/tests/test_dem_sampling.py new file mode 100644 index 00000000..c1079c35 --- /dev/null +++ b/libs/qec/python/tests/test_dem_sampling.py @@ -0,0 +1,608 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # +"""Tests for cudaq_qec.dem_sampling (GPU with cuStabilizer + CPU fallback).""" + +import numpy as np +import pytest + +import cudaq_qec as qec +from cudaq_qec.dem_sampling import dem_sampling + + +def _compute_syndrome(H, errors): + """Reference: syndromes = errors @ H^T mod 2.""" + return (errors @ H.T) % 2 + + +def _has_runtime_gpu_backend(): + """True only when GPU backend is both compiled and usable at runtime.""" + if not getattr(qec.qecrt, "dem_sampling_has_gpu_compiled", False): + return False + + H = np.array([[1]], dtype=np.uint8) + probs = np.array([0.0], dtype=np.float64) + try: + dem_sampling(H, 1, probs, seed=0, backend="gpu") + return True + except RuntimeError: + return False + + +_HAS_RUNTIME_GPU_BACKEND = _has_runtime_gpu_backend() + +# --------------------------------------------------------------------------- +# Deterministic tests (numpy input) +# --------------------------------------------------------------------------- + + +class TestAllZeroProbabilities: + + def test_all_zeros(self): + H = np.array([[1, 0, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], dtype=np.uint8) + probs = np.zeros(4) + syndromes, errors = dem_sampling(H, 10, probs, seed=0) + + assert syndromes.shape == (10, 3) + assert errors.shape == (10, 4) + np.testing.assert_array_equal(errors, 0) + np.testing.assert_array_equal(syndromes, 0) + + +class TestAllOneProbabilities: + + def test_all_ones(self): + H = np.array([[1, 0, 1, 0], [1, 1, 0, 1], [0, 1, 1, 0]], dtype=np.uint8) + probs = np.ones(4) + syndromes, errors = dem_sampling(H, 5, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + for shot in range(5): + assert syndromes[shot, 0] == 0 + assert syndromes[shot, 1] == 1 + assert syndromes[shot, 2] == 0 + + +class TestMixedDeterministicProbs: + + def test_mixed(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 0.0, 1.0]) + syndromes, errors = 
dem_sampling(H, 8, probs, seed=42) + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + + for shot in range(8): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +class TestIdentityMatrix: + + def test_identity_all_ones(self): + H = np.eye(5, dtype=np.uint8) + probs = np.ones(5) + syndromes, errors = dem_sampling(H, 3, probs, seed=7) + + np.testing.assert_array_equal(errors, 1) + np.testing.assert_array_equal(syndromes, errors) + + def test_identity_syndrome_equals_errors(self): + H = np.eye(10, dtype=np.uint8) + probs = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.float64) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + np.testing.assert_array_equal(syndromes, errors) + expected = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected) + + +class TestAllOnesMatrix: + + def test_even_columns(self): + H = np.ones((3, 4), dtype=np.uint8) + probs = np.ones(4) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + np.testing.assert_array_equal(syndromes, 0) + + def test_odd_columns(self): + H = np.ones((3, 3), dtype=np.uint8) + probs = np.ones(3) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + np.testing.assert_array_equal(syndromes, 1) + + +class TestSingleColumnMatrix: + + def test_single_column(self): + H = np.array([[1], [0], [1]], dtype=np.uint8) + probs = np.array([1.0]) + syndromes, errors = dem_sampling(H, 6, probs, seed=0) + + np.testing.assert_array_equal(errors, 1) + expected_syn = np.array([1, 0, 1], dtype=np.uint8) + for shot in range(6): + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +class TestSingleRowMatrix: + + def test_single_row(self): + H = np.array([[1, 1, 0, 1, 0]], dtype=np.uint8) + probs = 
np.array([1.0, 0.0, 1.0, 0.0, 1.0]) + syndromes, errors = dem_sampling(H, 4, probs, seed=0) + + expected_errors = np.array([1, 0, 1, 0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected_errors) + assert syndromes[shot, 0] == 1 + + +class TestSingleShot: + + def test_single_shot(self): + H = np.array([[1, 1, 0], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 1.0, 0.0]) + syndromes, errors = dem_sampling(H, 1, probs, seed=0) + + assert errors.shape == (1, 3) + np.testing.assert_array_equal(errors[0], [1, 1, 0]) + np.testing.assert_array_equal(syndromes[0], [0, 1]) + + +class TestRepetitionCode: + + def test_single_error(self): + H = np.array([[1, 1, 0, 0], [0, 1, 1, 0], [0, 0, 1, 1]], dtype=np.uint8) + probs = np.array([0.0, 1.0, 0.0, 0.0]) + syndromes, errors = dem_sampling(H, 3, probs, seed=0) + + expected_errors = np.array([0, 1, 0, 0], dtype=np.uint8) + expected_syn = np.array([1, 1, 0], dtype=np.uint8) + + for shot in range(3): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +class TestSyndromeConsistency: + + def test_fixed_matrix(self): + H = np.array([ + [1, 0, 1, 0, 0, 1, 0, 0], + [0, 1, 0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 1, 0, 0, 0, 0, 1, 1], + ], + dtype=np.uint8) + probs = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + syndromes, errors = dem_sampling(H, 500, probs, seed=7) + + expected = _compute_syndrome(H, errors) + np.testing.assert_array_equal(syndromes, expected) + + +class TestSeedReproducibility: + + def test_same_seed(self): + H = np.eye(5, dtype=np.uint8) + probs = np.array([0.1, 0.3, 0.5, 0.7, 0.9]) + + s1, e1 = dem_sampling(H, 100, probs, seed=42) + s2, e2 = dem_sampling(H, 100, probs, seed=42) + + np.testing.assert_array_equal(s1, s2) + np.testing.assert_array_equal(e1, e2) + + def test_different_seeds_differ(self): + H = np.eye(5, dtype=np.uint8) + probs = np.full(5, 0.5) + + _, e1 = 
dem_sampling(H, 1000, probs, seed=1) + _, e2 = dem_sampling(H, 1000, probs, seed=2) + + assert not np.array_equal(e1, e2) + + +class TestOutputProperties: + + def test_shapes_and_dtypes(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([0.1, 0.2, 0.3]) + syndromes, errors = dem_sampling(H, 10, probs, seed=0) + + assert syndromes.shape == (10, 2) + assert errors.shape == (10, 3) + assert syndromes.dtype == np.uint8 + assert errors.dtype == np.uint8 + + def test_binary_output(self): + H = np.eye(5, dtype=np.uint8) + probs = np.full(5, 0.5) + syndromes, errors = dem_sampling(H, 100, probs, seed=42) + + assert set(np.unique(syndromes)).issubset({0, 1}) + assert set(np.unique(errors)).issubset({0, 1}) + + +class TestInputValidation: + + def test_non_2d_matrix(self): + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(np.zeros(5, dtype=np.uint8), 10, np.zeros(5)) + + def test_mismatched_probs(self): + H = np.eye(5, dtype=np.uint8) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, np.zeros(3)) + + def test_non_1d_probs(self): + H = np.eye(5, dtype=np.uint8) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, np.zeros((5, 1))) + + def test_probability_below_zero(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, -0.2, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="cpu") + + def test_probability_above_one(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, 1.2, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="cpu") + + def test_probability_nan(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, np.nan, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="cpu") + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + 
def test_force_gpu_rejects_invalid_probability_numpy(self): + H = np.eye(3, dtype=np.uint8) + probs = np.array([0.1, 1.1, 0.3], dtype=np.float64) + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="gpu") + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + def test_force_gpu_rejects_invalid_probability_torch_cuda(self): + torch_mod = pytest.importorskip("torch") + if not torch_mod.cuda.is_available(): + pytest.skip("PyTorch CUDA unavailable") + + H = torch_mod.eye(3, dtype=torch_mod.uint8, device="cuda") + probs = torch_mod.tensor([0.1, 1.1, 0.3], + dtype=torch_mod.float64, + device="cuda") + with pytest.raises((ValueError, RuntimeError)): + dem_sampling(H, 10, probs, backend="gpu") + + +class TestBackendSelection: + + def test_invalid_backend(self): + H = np.eye(2, dtype=np.uint8) + probs = np.array([0.1, 0.2], dtype=np.float64) + with pytest.raises(RuntimeError): + dem_sampling(H, 4, probs, seed=1, backend="not-a-backend") + + def test_force_cpu_backend(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 0.0, 1.0], dtype=np.float64) + syndromes, errors = dem_sampling(H, 5, probs, seed=11, backend="cpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(5): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + def test_force_gpu_backend(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([1.0, 0.0, 1.0], dtype=np.float64) + syndromes, errors = dem_sampling(H, 5, probs, seed=11, backend="gpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(5): + 
np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + +# --------------------------------------------------------------------------- +# PyTorch tensor input tests +# --------------------------------------------------------------------------- + +try: + import torch + _HAS_TORCH = True +except ImportError: + _HAS_TORCH = False + + +@pytest.mark.skipif(not _HAS_TORCH, reason="PyTorch not installed") +class TestPyTorchInput: + + def test_cpu_tensors(self): + H = torch.tensor([[1, 0, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], + dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0, 0.0], dtype=torch.float64) + syndromes, errors = dem_sampling(H, 5, probs, seed=42) + + assert isinstance(syndromes, np.ndarray) + assert isinstance(errors, np.ndarray) + assert syndromes.shape == (5, 3) + assert errors.shape == (5, 4) + + expected_errors = np.array([1, 0, 1, 0], dtype=np.uint8) + for shot in range(5): + np.testing.assert_array_equal(errors[shot], expected_errors) + + def test_matches_numpy_input(self): + """Verify PyTorch and NumPy inputs produce identical results.""" + H_np = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs_np = np.array([0.3, 0.5, 0.7]) + + H_torch = torch.from_numpy(H_np) + probs_torch = torch.from_numpy(probs_np) + + s_np, e_np = dem_sampling(H_np, 100, probs_np, seed=99) + s_torch, e_torch = dem_sampling(H_torch, 100, probs_torch, seed=99) + + np.testing.assert_array_equal(s_np, s_torch) + np.testing.assert_array_equal(e_np, e_torch) + + def test_force_cpu_backend(self): + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float64) + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="cpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected_errors) + 
np.testing.assert_array_equal(syndromes[shot], expected_syn) + + def test_cpu_torch_probs_with_grad(self): + """CPU path should accept torch tensors that require gradients.""" + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0], + dtype=torch.float64, + requires_grad=True) + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="cpu") + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot], expected_errors) + np.testing.assert_array_equal(syndromes[shot], expected_syn) + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable in this environment") + def test_force_gpu_backend(self): + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8) + probs = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float64) + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="gpu") + + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert syndromes.is_cuda + assert errors.is_cuda + assert syndromes.data_ptr() > 0 + assert errors.data_ptr() > 0 + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot].cpu().numpy(), + expected_errors) + np.testing.assert_array_equal(syndromes[shot].cpu().numpy(), + expected_syn) + + @pytest.mark.skipif(not (_HAS_TORCH and torch.cuda.is_available()), + reason="CUDA not available for PyTorch") + def test_cuda_tensors(self): + """CUDA tensors should stay on device in torch GPU path.""" + H = torch.tensor([[1, 0, 1], [0, 1, 1]], dtype=torch.uint8).cuda() + probs = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float64).cuda() + syndromes, errors = dem_sampling(H, 4, probs, seed=7) + + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert 
syndromes.is_cuda + assert errors.is_cuda + assert tuple(syndromes.shape) == (4, 2) + assert tuple(errors.shape) == (4, 3) + assert syndromes.data_ptr() > 0 + assert errors.data_ptr() > 0 + + expected_errors = np.array([1, 0, 1], dtype=np.uint8) + expected_syn = np.array([0, 1], dtype=np.uint8) + for shot in range(4): + np.testing.assert_array_equal(errors[shot].cpu().numpy(), + expected_errors) + np.testing.assert_array_equal(syndromes[shot].cpu().numpy(), + expected_syn) + + @pytest.mark.skipif( + not (_HAS_RUNTIME_GPU_BACKEND and _HAS_TORCH and + torch.cuda.is_available()), + reason="GPU backend unavailable for CUDA stream test", + ) + def test_non_default_cuda_stream(self): + """GPU path must honor torch's current (non-default) CUDA stream.""" + device = torch.device("cuda") + H = torch.tensor([[1, 0, 1], [0, 1, 1]], + dtype=torch.uint8, + device=device) + probs = torch.tensor([1.0, 0.0, 1.0], + dtype=torch.float64, + device=device) + stream = torch.cuda.Stream(device=device) + + with torch.cuda.stream(stream): + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="gpu") + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert syndromes.device.type == "cuda" + assert errors.device.type == "cuda" + + expected_errors = torch.tensor([1, 0, 1], + dtype=torch.uint8, + device=device) + expected_syn = torch.tensor([0, 1], + dtype=torch.uint8, + device=device) + for shot in range(4): + assert torch.equal(errors[shot], expected_errors) + assert torch.equal(syndromes[shot], expected_syn) + + stream.synchronize() + + @pytest.mark.skipif( + not (_HAS_RUNTIME_GPU_BACKEND and _HAS_TORCH and + torch.cuda.device_count() > 1), + reason="Requires runtime GPU backend and at least two CUDA devices", + ) + def test_multi_gpu_device_routing(self): + """GPU path should execute on and return tensors from the input device.""" + device = torch.device("cuda:1") + H = torch.tensor([[1, 0, 1], [0, 1, 1]], + dtype=torch.uint8, + 
device=device) + probs = torch.tensor([1.0, 0.0, 1.0], + dtype=torch.float64, + device=device) + + syndromes, errors = dem_sampling(H, 4, probs, seed=7, backend="gpu") + assert isinstance(syndromes, torch.Tensor) + assert isinstance(errors, torch.Tensor) + assert syndromes.device == device + assert errors.device == device + + expected_errors = torch.tensor([1, 0, 1], + dtype=torch.uint8, + device=device) + expected_syn = torch.tensor([0, 1], dtype=torch.uint8, device=device) + for shot in range(4): + assert torch.equal(errors[shot], expected_errors) + assert torch.equal(syndromes[shot], expected_syn) + + def test_syndrome_consistency_torch(self): + """Verify syndrome = errors @ H^T mod 2 with PyTorch input.""" + H_np = np.array([ + [1, 0, 1, 0, 0, 1, 0, 0], + [0, 1, 0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 1, 0, 0, 0, 0, 1, 1], + ], + dtype=np.uint8) + probs_np = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + + H = torch.from_numpy(H_np) + probs = torch.from_numpy(probs_np) + + syndromes, errors = dem_sampling(H, 200, probs, seed=7) + expected = _compute_syndrome(H_np, errors) + np.testing.assert_array_equal(syndromes, expected) + + +# --------------------------------------------------------------------------- +# Edge-case tests +# --------------------------------------------------------------------------- + + +class TestZeroShots: + + def test_zero_shots_numpy(self): + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([0.1, 0.2, 0.3]) + syndromes, errors = dem_sampling(H, 0, probs, seed=0) + + assert syndromes.shape == (0, 2) + assert errors.shape == (0, 3) + assert syndromes.dtype == np.uint8 + assert errors.dtype == np.uint8 + + def test_zero_shots_cpu_backend(self): + H = np.array([[1, 0], [0, 1]], dtype=np.uint8) + probs = np.array([0.5, 0.5]) + syndromes, errors = dem_sampling(H, 0, probs, seed=0, backend="cpu") + + assert syndromes.shape == (0, 2) + assert errors.shape == (0, 2) + + @pytest.mark.skipif(not 
_HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable") + def test_zero_shots_gpu_backend(self): + H = np.array([[1, 0], [0, 1]], dtype=np.uint8) + probs = np.array([0.5, 0.5]) + syndromes, errors = dem_sampling(H, 0, probs, seed=0, backend="gpu") + + assert syndromes.shape == (0, 2) + assert errors.shape == (0, 2) + + +class TestNonBinaryCheckMatrix: + + def test_non_binary_entries_masked(self): + """H entries > 1 should be treated as H & 1 (binary).""" + H = np.array([[2, 3], [1, 0]], dtype=np.uint8) + probs = np.array([1.0, 1.0]) + syndromes, errors = dem_sampling(H, 4, probs, seed=0, backend="cpu") + + np.testing.assert_array_equal(errors, 1) + for shot in range(4): + assert syndromes[shot, 0] == 1, "Binarized [0,1]: sum=1 mod 2=1" + assert syndromes[shot, 1] == 1, "Binarized [1,0]: sum=1 mod 2=1" + + @pytest.mark.skipif(not _HAS_RUNTIME_GPU_BACKEND, + reason="GPU backend unavailable") + def test_non_binary_cpu_gpu_match(self): + """CPU and GPU must agree on non-binary H after binarization.""" + H = np.array([[2, 3], [1, 0]], dtype=np.uint8) + probs = np.array([1.0, 1.0]) + + syn_cpu, err_cpu = dem_sampling(H, 4, probs, seed=0, backend="cpu") + syn_gpu, err_gpu = dem_sampling(H, 4, probs, seed=0, backend="gpu") + + np.testing.assert_array_equal(err_cpu, err_gpu) + np.testing.assert_array_equal(syn_cpu, syn_gpu) + + +class TestSeedlessPath: + + def test_seedless_runs(self): + """Calling without seed should succeed and produce valid output.""" + H = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8) + probs = np.array([0.3, 0.5, 0.7]) + syndromes, errors = dem_sampling(H, 50, probs) + + assert syndromes.shape == (50, 2) + assert errors.shape == (50, 3) + assert set(np.unique(syndromes)).issubset({0, 1}) + assert set(np.unique(errors)).issubset({0, 1}) + + def test_seedless_nondeterministic(self): + """Two seedless calls should very likely produce different results.""" + H = np.eye(10, dtype=np.uint8) + probs = np.full(10, 0.5) + _, e1 = dem_sampling(H, 500, 
probs) + _, e2 = dem_sampling(H, 500, probs) + assert not np.array_equal(e1, e2) diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 430e8fdd..1d435d22 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -48,6 +48,15 @@ target_link_libraries(test_qec PRIVATE GTest::gtest_main cudaq-qec cudaq::cudaq- add_dependencies(CUDAQXQECUnitTests test_qec) gtest_discover_tests(test_qec) +add_executable(test_dem_sampling test_dem_sampling.cpp) +target_link_libraries(test_dem_sampling PRIVATE GTest::gtest_main cudaq-qec cudaq::cudaq) +if(TARGET cuStabilizer::cuStabilizer) + target_link_libraries(test_dem_sampling PRIVATE CUDA::cudart) + target_include_directories(test_dem_sampling PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +endif() +add_dependencies(CUDAQXQECUnitTests test_dem_sampling) +gtest_discover_tests(test_dem_sampling) + # TensorRT decoder test is only built for x86 architectures if(CUDAQ_QEC_BUILD_TRT_DECODER AND CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") diff --git a/libs/qec/unittests/test_dem_sampling.cpp b/libs/qec/unittests/test_dem_sampling.cpp new file mode 100644 index 00000000..16847c92 --- /dev/null +++ b/libs/qec/unittests/test_dem_sampling.cpp @@ -0,0 +1,801 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "cudaq/qec/dem_sampling.h" +#include "cudaq/qec/experiments.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER +#include +#endif + +namespace { + +/// Reference syndrome: checks = errors * H^T mod 2 (single shot, flat layout) +std::vector compute_syndrome(const std::vector &H, + const std::vector &errors, + size_t num_checks, size_t num_errors) { + std::vector syndrome(num_checks, 0); + for (size_t col = 0; col < num_errors; col++) { + if (errors[col]) { + for (size_t row = 0; row < num_checks; row++) { + syndrome[row] ^= H[row * num_errors + col]; + } + } + } + return syndrome; +} + +/// Build a cudaqx tensor from a flat uint8 vector. +cudaqx::tensor make_tensor(const std::vector &data, + size_t rows, size_t cols) { + cudaqx::tensor t({rows, cols}); + t.copy(data.data(), t.shape()); + return t; +} + +} // namespace + +// ============================================================================= +// CPU tests +// ============================================================================= + +TEST(DemSamplingCPU, AllZeroProbabilities) { + // H = | 1 0 1 0 | probs = [0, 0, 0, 0] + // | 0 1 1 0 | + // | 0 0 0 1 | + // No errors should ever fire, so errors=0 and checks=0. 
+ auto H = make_tensor({1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1}, 3, 4); + std::vector probs(4, 0.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 10, probs, 42); + + EXPECT_EQ(checks.shape()[0], 10u); + EXPECT_EQ(checks.shape()[1], 3u); + EXPECT_EQ(errors.shape()[0], 10u); + EXPECT_EQ(errors.shape()[1], 4u); + + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 3; j++) + EXPECT_EQ(checks.at({i, j}), 0); + for (size_t j = 0; j < 4; j++) + EXPECT_EQ(errors.at({i, j}), 0); + } +} + +TEST(DemSamplingCPU, AllOneProbabilities) { + // H = | 1 0 1 0 | probs = [1, 1, 1, 1] + // | 1 1 0 1 | + // | 0 1 1 0 | + // Every error fires. Syndrome for all-ones error = each row summed mod 2. + // row 0: 1+0+1+0 = 2 mod 2 = 0 + // row 1: 1+1+0+1 = 3 mod 2 = 1 + // row 2: 0+1+1+0 = 2 mod 2 = 0 + std::vector H_data = {1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0}; + auto H = make_tensor(H_data, 3, 4); + std::vector probs(4, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 5, probs, 42); + + for (size_t shot = 0; shot < 5; shot++) { + for (size_t e = 0; e < 4; e++) + EXPECT_EQ(errors.at({shot, e}), 1); + EXPECT_EQ(checks.at({shot, 0}), 0); + EXPECT_EQ(checks.at({shot, 1}), 1); + EXPECT_EQ(checks.at({shot, 2}), 0); + } +} + +TEST(DemSamplingCPU, MixedDeterministicProbs) { + // H = | 1 0 1 | probs = [1, 0, 1] + // | 0 1 1 | + // Errors = [1, 0, 1] every shot. 
+ // check 0: 1*1 + 0*0 + 1*1 = 2 mod 2 = 0 + // check 1: 0*1 + 1*0 + 1*1 = 1 mod 2 = 1 + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + std::vector probs = {1.0, 0.0, 1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 8, probs, 99); + + for (size_t shot = 0; shot < 8; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(errors.at({shot, 1}), 0); + EXPECT_EQ(errors.at({shot, 2}), 1); + EXPECT_EQ(checks.at({shot, 0}), 0); + EXPECT_EQ(checks.at({shot, 1}), 1); + } +} + +TEST(DemSamplingCPU, IdentityMatrix) { + // H = I_5 with p=1: syndromes must equal errors (all ones). + auto H = make_tensor({1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1}, + 5, 5); + std::vector probs(5, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 3, probs, 7); + + for (size_t shot = 0; shot < 3; shot++) + for (size_t j = 0; j < 5; j++) { + EXPECT_EQ(errors.at({shot, j}), 1); + EXPECT_EQ(checks.at({shot, j}), 1); + } +} + +TEST(DemSamplingCPU, AllOnesMatrixEvenColumns) { + // H = 3x4 all-ones. p = [1,1,1,1]. + // Every error fires: 4 ones per row, 4 mod 2 = 0. All syndromes = 0. + std::vector H_data(3 * 4, 1); + auto H = make_tensor(H_data, 3, 4); + std::vector probs(4, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) + for (size_t c = 0; c < 3; c++) + EXPECT_EQ(checks.at({shot, c}), 0) << "Even column count => syndrome 0"; +} + +TEST(DemSamplingCPU, AllOnesMatrixOddColumns) { + // H = 3x3 all-ones. p = [1,1,1]. + // Every error fires: 3 ones per row, 3 mod 2 = 1. All syndromes = 1. 
+ std::vector H_data(3 * 3, 1); + auto H = make_tensor(H_data, 3, 3); + std::vector probs(3, 1.0); + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) + for (size_t c = 0; c < 3; c++) + EXPECT_EQ(checks.at({shot, c}), 1) << "Odd column count => syndrome 1"; +} + +TEST(DemSamplingCPU, SingleColumnMatrix) { + // H = | 1 | p = [1.0] + // | 0 | + // | 1 | + // The single error always fires. Syndrome = column = [1, 0, 1]. + auto H = make_tensor({1, 0, 1}, 3, 1); + std::vector probs = {1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 6, probs, 0); + + for (size_t shot = 0; shot < 6; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(checks.at({shot, 0}), 1); + EXPECT_EQ(checks.at({shot, 1}), 0); + EXPECT_EQ(checks.at({shot, 2}), 1); + } +} + +TEST(DemSamplingCPU, SingleRowMatrix) { + // H = | 1 1 0 1 0 | p = [1, 0, 1, 0, 1] + // Errors = [1, 0, 1, 0, 1]. + // Single check: 1*1 + 1*0 + 0*1 + 1*0 + 0*1 = 1 mod 2 = 1. 
+ auto H = make_tensor({1, 1, 0, 1, 0}, 1, 5); + std::vector probs = {1.0, 0.0, 1.0, 0.0, 1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(errors.at({shot, 1}), 0); + EXPECT_EQ(errors.at({shot, 2}), 1); + EXPECT_EQ(errors.at({shot, 3}), 0); + EXPECT_EQ(errors.at({shot, 4}), 1); + EXPECT_EQ(checks.at({shot, 0}), 1); + } +} + +TEST(DemSamplingCPU, SingleShot) { + auto H = make_tensor({1, 1, 0, 0, 1, 1}, 2, 3); + std::vector probs = {1.0, 1.0, 0.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 1, probs, 0); + + EXPECT_EQ(errors.shape()[0], 1u); + EXPECT_EQ(errors.at({0, 0}), 1); + EXPECT_EQ(errors.at({0, 1}), 1); + EXPECT_EQ(errors.at({0, 2}), 0); + // check 0: 1*1 + 1*1 + 0*0 = 2 mod 2 = 0 + // check 1: 0*1 + 1*1 + 1*0 = 1 mod 2 = 1 + EXPECT_EQ(checks.at({0, 0}), 0); + EXPECT_EQ(checks.at({0, 1}), 1); +} + +TEST(DemSamplingCPU, RejectOutOfRangeProbabilities) { + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + + std::vector probs_negative = {0.1, -0.2, 0.3}; + EXPECT_THROW( + cudaq::qec::dem_sampler::cpu::sample_dem(H, 16, probs_negative, 1), + std::invalid_argument); + + std::vector probs_above_one = {0.1, 1.2, 0.3}; + EXPECT_THROW( + cudaq::qec::dem_sampler::cpu::sample_dem(H, 16, probs_above_one, 1), + std::invalid_argument); + + std::vector probs_nan = { + 0.1, std::numeric_limits::quiet_NaN(), 0.3}; + EXPECT_THROW(cudaq::qec::dem_sampler::cpu::sample_dem(H, 16, probs_nan, 1), + std::invalid_argument); +} + +TEST(DemSamplingCPU, SeedReproducibility) { + auto H = make_tensor({1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}, 3, 5); + std::vector probs = {0.1, 0.3, 0.5, 0.7, 0.9}; + + auto [c1, e1] = cudaq::qec::dem_sampler::cpu::sample_dem(H, 100, probs, 42); + auto [c2, e2] = cudaq::qec::dem_sampler::cpu::sample_dem(H, 100, probs, 42); + + for (size_t i = 0; i < 100; i++) { + for (size_t j = 0; j < 
3; j++) + EXPECT_EQ(c1.at({i, j}), c2.at({i, j})); + for (size_t j = 0; j < 5; j++) + EXPECT_EQ(e1.at({i, j}), e2.at({i, j})); + } +} + +TEST(DemSamplingCPU, SyndromeConsistency) { + // Use a fixed, known H and verify syndrome = errors * H^T mod 2 per shot. + std::vector H_data = {1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, + 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, + 0, 1, 1, 1, 0, 0, 0, 0, 1, 1}; + const size_t num_checks = 4, num_errors = 8, num_shots = 200; + auto H = make_tensor(H_data, num_checks, num_errors); + std::vector probs = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, num_shots, probs, 42); + + for (size_t shot = 0; shot < num_shots; shot++) { + std::vector shot_errors(num_errors); + for (size_t e = 0; e < num_errors; e++) + shot_errors[e] = errors.at({shot, e}); + + auto expected = + compute_syndrome(H_data, shot_errors, num_checks, num_errors); + for (size_t c = 0; c < num_checks; c++) + EXPECT_EQ(checks.at({shot, c}), expected[c]) + << "Shot " << shot << " check " << c; + } +} + +TEST(DemSamplingCPU, RepetitionCodeParity) { + // Repetition code: H = [[1,1,0,0], [0,1,1,0], [0,0,1,1]] + // Single error on qubit 1 (index 1): flips checks 0 and 1. 
+ auto H = make_tensor({1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1}, 3, 4); + std::vector probs = {0.0, 1.0, 0.0, 0.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 3, probs, 0); + + for (size_t shot = 0; shot < 3; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 0); + EXPECT_EQ(errors.at({shot, 1}), 1); + EXPECT_EQ(errors.at({shot, 2}), 0); + EXPECT_EQ(errors.at({shot, 3}), 0); + EXPECT_EQ(checks.at({shot, 0}), 1); + EXPECT_EQ(checks.at({shot, 1}), 1); + EXPECT_EQ(checks.at({shot, 2}), 0); + } +} + +TEST(DemSamplingCPU, BackwardsCompatibility) { + auto H = make_tensor({1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}, 3, 5); + std::vector probs = {0.1, 0.2, 0.15, 0.05, 0.25}; + unsigned seed = 42; + + auto [old_checks, old_errors] = cudaq::qec::dem_sampling(H, 100, probs, seed); + auto [new_checks, new_errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 100, probs, seed); + + for (size_t i = 0; i < 100; i++) { + for (size_t j = 0; j < 3; j++) + EXPECT_EQ(old_checks.at({i, j}), new_checks.at({i, j})); + for (size_t j = 0; j < 5; j++) + EXPECT_EQ(old_errors.at({i, j}), new_errors.at({i, j})); + } +} + +TEST(DemSamplingCPU, ZeroShots) { + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + std::vector probs = {0.5, 0.5, 0.5}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 0, probs, 42); + + EXPECT_EQ(checks.shape()[0], 0u); + EXPECT_EQ(checks.shape()[1], 2u); + EXPECT_EQ(errors.shape()[0], 0u); + EXPECT_EQ(errors.shape()[1], 3u); +} + +TEST(DemSamplingCPU, NonBinaryCheckMatrixMasked) { + // H entries > 1 should be treated as H & 1 (matching GPU behavior). 
+ // H = [[2, 3], [1, 0]] -> binarized [[0, 1], [1, 0]] + // probs = [1, 1] -> errors = [1, 1] + // syndromes with binarized H: [0^1, 1^0] = [1, 1] + auto H = make_tensor({2, 3, 1, 0}, 2, 2); + std::vector probs = {1.0, 1.0}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 4, probs, 0); + + for (size_t shot = 0; shot < 4; shot++) { + EXPECT_EQ(errors.at({shot, 0}), 1); + EXPECT_EQ(errors.at({shot, 1}), 1); + EXPECT_EQ(checks.at({shot, 0}), 1) + << "Binarized row 0: [0,1], sum = 1 mod 2 = 1"; + EXPECT_EQ(checks.at({shot, 1}), 1) + << "Binarized row 1: [1,0], sum = 1 mod 2 = 1"; + } +} + +TEST(DemSamplingCPU, SeedlessPathRuns) { + auto H = make_tensor({1, 0, 1, 0, 1, 1}, 2, 3); + std::vector probs = {0.5, 0.5, 0.5}; + + auto [checks, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, 10, probs); + + EXPECT_EQ(checks.shape()[0], 10u); + EXPECT_EQ(errors.shape()[0], 10u); + for (size_t shot = 0; shot < 10; shot++) + for (size_t c = 0; c < 2; c++) + EXPECT_TRUE(checks.at({shot, c}) == 0 || checks.at({shot, c}) == 1); +} + +// ============================================================================= +// GPU tests (guarded by CUDAQ_QEC_HAS_CUSTABILIZER) +// ============================================================================= + +#ifdef CUDAQ_QEC_HAS_CUSTABILIZER + +class DemSamplingGPU : public ::testing::Test { +protected: + static bool has_gpu() { + int count = 0; + return cudaGetDeviceCount(&count) == cudaSuccess && count > 0; + } + void SetUp() override { + if (!has_gpu()) + GTEST_SKIP() << "No GPU available"; + } + + /// RAII wrapper for a set of GPU DEM sampling buffers. 
+ struct GpuBuffers { + uint8_t *d_H = nullptr, *d_checks = nullptr, *d_errors = nullptr; + double *d_probs = nullptr; + size_t num_checks, num_errors, num_shots; + + GpuBuffers(const std::vector &H, const std::vector &probs, + size_t checks, size_t errors, size_t shots) + : num_checks(checks), num_errors(errors), num_shots(shots) { + EXPECT_EQ(cudaMalloc(&d_H, checks * errors), cudaSuccess); + EXPECT_EQ(cudaMalloc(&d_probs, errors * sizeof(double)), cudaSuccess); + EXPECT_EQ(cudaMalloc(&d_checks, shots * checks), cudaSuccess); + EXPECT_EQ(cudaMalloc(&d_errors, shots * errors), cudaSuccess); + EXPECT_EQ( + cudaMemcpy(d_H, H.data(), checks * errors, cudaMemcpyHostToDevice), + cudaSuccess); + EXPECT_EQ(cudaMemcpy(d_probs, probs.data(), errors * sizeof(double), + cudaMemcpyHostToDevice), + cudaSuccess); + } + + ~GpuBuffers() { + cudaFree(d_H); + cudaFree(d_probs); + cudaFree(d_checks); + cudaFree(d_errors); + } + + bool run(unsigned seed) { + return cudaq::qec::dem_sampler::gpu::sample_dem( + d_H, num_checks, num_errors, d_probs, num_shots, seed, d_checks, + d_errors); + } + + std::vector get_checks() { + std::vector out(num_shots * num_checks); + EXPECT_EQ( + cudaMemcpy(out.data(), d_checks, out.size(), cudaMemcpyDeviceToHost), + cudaSuccess); + return out; + } + + std::vector get_errors() { + std::vector out(num_shots * num_errors); + EXPECT_EQ( + cudaMemcpy(out.data(), d_errors, out.size(), cudaMemcpyDeviceToHost), + cudaSuccess); + return out; + } + + GpuBuffers(const GpuBuffers &) = delete; + GpuBuffers &operator=(const GpuBuffers &) = delete; + }; +}; + +TEST_F(DemSamplingGPU, AllZeroProbabilities) { + const size_t num_checks = 3, num_errors = 5, num_shots = 10; + std::vector H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}; + std::vector probs(num_errors, 0.0); + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(42)); + + for (auto v : buf.get_checks()) + EXPECT_EQ(v, 0); + for (auto v : buf.get_errors()) + EXPECT_EQ(v, 0); 
+} + +TEST_F(DemSamplingGPU, AllOneProbabilities) { + // H = | 1 0 1 0 | probs = [1,1,1,1] + // | 1 1 0 1 | + // | 0 1 1 0 | + // Expected syndromes: [0, 1, 0] (same as CPU test). + const size_t num_checks = 3, num_errors = 4, num_shots = 5; + std::vector H = {1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0}; + std::vector probs(num_errors, 1.0); + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(42)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + for (size_t e = 0; e < num_errors; e++) + EXPECT_EQ(h_errors[shot * num_errors + e], 1); + EXPECT_EQ(h_checks[shot * num_checks + 0], 0); + EXPECT_EQ(h_checks[shot * num_checks + 1], 1); + EXPECT_EQ(h_checks[shot * num_checks + 2], 0); + } +} + +TEST_F(DemSamplingGPU, MixedDeterministicProbs) { + // H = | 1 0 1 | probs = [1, 0, 1] + // | 0 1 1 | + // Expected: errors = [1,0,1], checks = [0, 1] + const size_t num_checks = 2, num_errors = 3, num_shots = 8; + std::vector H = {1, 0, 1, 0, 1, 1}; + std::vector probs = {1.0, 0.0, 1.0}; + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(99)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + EXPECT_EQ(h_errors[shot * num_errors + 0], 1); + EXPECT_EQ(h_errors[shot * num_errors + 1], 0); + EXPECT_EQ(h_errors[shot * num_errors + 2], 1); + EXPECT_EQ(h_checks[shot * num_checks + 0], 0); + EXPECT_EQ(h_checks[shot * num_checks + 1], 1); + } +} + +TEST_F(DemSamplingGPU, IdentityMatrix) { + const size_t N = 5, num_shots = 3; + std::vector H(N * N, 0); + for (size_t i = 0; i < N; i++) + H[i * N + i] = 1; + std::vector probs(N, 1.0); + + GpuBuffers buf(H, probs, N, N, num_shots); + ASSERT_TRUE(buf.run(7)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) + for (size_t j = 0; j < N; j++) { + 
EXPECT_EQ(h_errors[shot * N + j], 1); + EXPECT_EQ(h_checks[shot * N + j], 1); + } +} + +TEST_F(DemSamplingGPU, AllOnesMatrixEvenColumns) { + // H = 3x4 all-ones, p=1. Even column count => all syndromes = 0. + const size_t num_checks = 3, num_errors = 4, num_shots = 4; + std::vector H(num_checks * num_errors, 1); + std::vector probs(num_errors, 1.0); + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(0)); + + for (auto v : buf.get_checks()) + EXPECT_EQ(v, 0) << "Even column count => syndrome 0"; +} + +TEST_F(DemSamplingGPU, AllOnesMatrixOddColumns) { + // H = 3x3 all-ones, p=1. Odd column count => all syndromes = 1. + const size_t N = 3, num_shots = 4; + std::vector H(N * N, 1); + std::vector probs(N, 1.0); + + GpuBuffers buf(H, probs, N, N, num_shots); + ASSERT_TRUE(buf.run(0)); + + for (auto v : buf.get_checks()) + EXPECT_EQ(v, 1) << "Odd column count => syndrome 1"; +} + +TEST_F(DemSamplingGPU, RepetitionCodeParity) { + // H = [[1,1,0,0],[0,1,1,0],[0,0,1,1]], single error on qubit 1. + const size_t num_checks = 3, num_errors = 4, num_shots = 3; + std::vector H = {1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1}; + std::vector probs = {0.0, 1.0, 0.0, 0.0}; + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(0)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + EXPECT_EQ(h_errors[shot * num_errors + 0], 0); + EXPECT_EQ(h_errors[shot * num_errors + 1], 1); + EXPECT_EQ(h_errors[shot * num_errors + 2], 0); + EXPECT_EQ(h_errors[shot * num_errors + 3], 0); + EXPECT_EQ(h_checks[shot * num_checks + 0], 1); + EXPECT_EQ(h_checks[shot * num_checks + 1], 1); + EXPECT_EQ(h_checks[shot * num_checks + 2], 0); + } +} + +TEST_F(DemSamplingGPU, SyndromeConsistency) { + // Fixed H, verify syndrome = errors * H^T mod 2 for every shot. 
+ const size_t num_checks = 4, num_errors = 8, num_shots = 100; + std::vector H = {1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, + 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1}; + std::vector probs = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}; + + GpuBuffers buf(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf.run(42)); + + auto h_checks = buf.get_checks(); + auto h_errors = buf.get_errors(); + + for (size_t shot = 0; shot < num_shots; shot++) { + std::vector shot_err(h_errors.begin() + shot * num_errors, + h_errors.begin() + (shot + 1) * num_errors); + auto expected = compute_syndrome(H, shot_err, num_checks, num_errors); + for (size_t c = 0; c < num_checks; c++) + EXPECT_EQ(h_checks[shot * num_checks + c], expected[c]) + << "Shot " << shot << " check " << c; + } +} + +TEST_F(DemSamplingGPU, SeedReproducibility) { + const size_t num_checks = 3, num_errors = 5, num_shots = 100; + std::vector H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1}; + std::vector probs = {0.1, 0.3, 0.5, 0.7, 0.9}; + + GpuBuffers buf1(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf1.run(42)); + auto checks1 = buf1.get_checks(); + auto errors1 = buf1.get_errors(); + + GpuBuffers buf2(H, probs, num_checks, num_errors, num_shots); + ASSERT_TRUE(buf2.run(42)); + auto checks2 = buf2.get_checks(); + auto errors2 = buf2.get_errors(); + + EXPECT_EQ(checks1, checks2) << "Same seed must produce identical syndromes"; + EXPECT_EQ(errors1, errors2) << "Same seed must produce identical errors"; +} + +TEST_F(DemSamplingGPU, CpuGpuCrossValidation) { + // Use deterministic probs (0 and 1) so CPU and GPU must match exactly. 
+ const size_t num_checks = 3, num_errors = 6, num_shots = 20;
+ std::vector<uint8_t> H_data = {1, 0, 1, 0, 0, 1, 0, 1, 0,
+ 1, 0, 0, 1, 1, 0, 0, 1, 1};
+ std::vector<double> probs = {1.0, 0.0, 1.0, 0.0, 1.0, 0.0};
+
+ auto H_tensor = make_tensor(H_data, num_checks, num_errors);
+ auto [cpu_checks, cpu_errors] =
+ cudaq::qec::dem_sampler::cpu::sample_dem(H_tensor, num_shots, probs, 42);
+
+ GpuBuffers buf(H_data, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(42));
+ auto gpu_checks = buf.get_checks();
+ auto gpu_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ for (size_t e = 0; e < num_errors; e++)
+ EXPECT_EQ(gpu_errors[shot * num_errors + e], cpu_errors.at({shot, e}))
+ << "Shot " << shot << " error " << e;
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(gpu_checks[shot * num_checks + c], cpu_checks.at({shot, c}))
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, BinaryOutputOnly) {
+ const size_t num_checks = 3, num_errors = 5, num_shots = 200;
+ std::vector<uint8_t> H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1};
+ std::vector<double> probs = {0.2, 0.4, 0.6, 0.8, 0.5};
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(123));
+
+ for (auto v : buf.get_checks())
+ EXPECT_TRUE(v == 0 || v == 1)
+ << "Syndrome value must be 0 or 1, got " << (int)v;
+ for (auto v : buf.get_errors())
+ EXPECT_TRUE(v == 0 || v == 1)
+ << "Error value must be 0 or 1, got " << (int)v;
+}
+
+TEST_F(DemSamplingGPU, BitpackBoundary32Columns) {
+ // 32 columns lands exactly on a uint32 word boundary — no padding bits.
+ const size_t num_checks = 4, num_errors = 32, num_shots = 50;
+ std::vector<uint8_t> H(num_checks * num_errors, 0);
+ for (size_t i = 0; i < num_checks; i++)
+ for (size_t j = 0; j < num_errors; j++)
+ H[i * num_errors + j] = ((i + j) % 3 == 0) ?
1 : 0;
+ std::vector<double> probs(num_errors, 1.0);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(77));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t e = 0; e < num_errors; e++)
+ EXPECT_EQ(h_errors[e], 1) << "p=1 => all errors fire";
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, BitpackBoundary33Columns) {
+ // 33 columns = 1 word + 1 bit, tests partial-word handling.
+ const size_t num_checks = 3, num_errors = 33, num_shots = 50;
+ std::vector<uint8_t> H(num_checks * num_errors, 0);
+ for (size_t i = 0; i < num_checks; i++)
+ for (size_t j = 0; j < num_errors; j++)
+ H[i * num_errors + j] = ((i + j) % 2 == 0) ? 1 : 0;
+ std::vector<double> probs(num_errors, 1.0);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(88));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, BitpackBoundary64Columns) {
+ // 64 columns = exactly 2 uint32 words.
+ const size_t num_checks = 4, num_errors = 64, num_shots = 40;
+ std::vector<uint8_t> H(num_checks * num_errors, 0);
+ for (size_t i = 0; i < num_checks; i++)
+ for (size_t j = 0; j < num_errors; j++)
+ H[i * num_errors + j] = ((i * 7 + j * 3) % 5 == 0) ?
1 : 0;
+ std::vector<double> probs(num_errors, 1.0);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(99));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, LargeScaleSyndromeConsistency) {
+ // Larger problem: 20 checks, 100 errors, 1000 shots.
+ const size_t num_checks = 20, num_errors = 100, num_shots = 1000;
+
+ std::mt19937 gen(12345);
+ std::bernoulli_distribution dist(0.3);
+ std::vector<uint8_t> H(num_checks * num_errors);
+ for (auto &v : H)
+ v = dist(gen) ? 1 : 0;
+
+ std::uniform_real_distribution<double> pdist(0.01, 0.5);
+ std::vector<double> probs(num_errors);
+ for (auto &p : probs)
+ p = pdist(gen);
+
+ GpuBuffers buf(H, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(42));
+
+ auto h_checks = buf.get_checks();
+ auto h_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ std::vector<uint8_t> shot_err(h_errors.begin() + shot * num_errors,
+ h_errors.begin() + (shot + 1) * num_errors);
+ auto expected = compute_syndrome(H, shot_err, num_checks, num_errors);
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(h_checks[shot * num_checks + c], expected[c])
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+TEST_F(DemSamplingGPU, ZeroShots) {
+ const size_t num_checks = 3, num_errors = 5;
+ std::vector<uint8_t> H = {1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1};
+ std::vector<double> probs(num_errors, 0.5);
+
+ // num_shots = 0 should return true immediately with no work
+ ASSERT_TRUE(cudaq::qec::dem_sampler::gpu::sample_dem(
+ nullptr, num_checks, num_errors, nullptr, 0, 42, nullptr, nullptr));
+}
+
+TEST_F(DemSamplingGPU, NonBinaryCheckMatrixCpuGpuMatch) {
+ // H with entries > 1: both paths should agree after binarization.
+ const size_t num_checks = 2, num_errors = 2, num_shots = 4;
+ std::vector<uint8_t> H_data = {2, 3, 1, 0};
+ std::vector<double> probs = {1.0, 1.0};
+
+ auto H_tensor = make_tensor(H_data, num_checks, num_errors);
+ auto [cpu_checks, cpu_errors] =
+ cudaq::qec::dem_sampler::cpu::sample_dem(H_tensor, num_shots, probs, 0);
+
+ GpuBuffers buf(H_data, probs, num_checks, num_errors, num_shots);
+ ASSERT_TRUE(buf.run(0));
+ auto gpu_checks = buf.get_checks();
+ auto gpu_errors = buf.get_errors();
+
+ for (size_t shot = 0; shot < num_shots; shot++) {
+ for (size_t e = 0; e < num_errors; e++)
+ EXPECT_EQ(gpu_errors[shot * num_errors + e], cpu_errors.at({shot, e}))
+ << "Shot " << shot << " error " << e;
+ for (size_t c = 0; c < num_checks; c++)
+ EXPECT_EQ(gpu_checks[shot * num_checks + c], cpu_checks.at({shot, c}))
+ << "Shot " << shot << " check " << c;
+ }
+}
+
+#endif // CUDAQ_QEC_HAS_CUSTABILIZER