feat(ci): add CUDA 11.8 wheel builds (abetlen#2238)

abetlen · web-flow · commit 43c92a7fef5c · 2026-06-01T04:30:02.000-07:00
* feat(ci): add CUDA 11.8 wheel builds

* fix(ci): make CUDA 11.8 wheel builds version-consistent

* fix(ci): allow non-CUDA dependencies for CUDA 11.8 wheels

* fix(ci): omit Hopper targets from CUDA 11.8 wheels

* fix(ci): use GCC 11 for CUDA 11.8 Linux wheels

* fix(ci): use MSVC 14.29 for CUDA 11.8 Windows wheels
diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
@@ -24,7 +24,7 @@ jobs:
               # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic,
               # so one builder per toolkit version is sufficient.
               'pyver' = @("3.9")
-              'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1")
+              'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1")
               'releasetag' = @("basic")
               'exclude' = @(
                 @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' },
@@ -50,8 +50,15 @@ jobs:
       AVXVER: ${{ matrix.releasetag }}
 
     steps:
+      - name: Set up MSVC for CUDA 11.8
+        if: runner.os == 'Windows' && matrix.cuda == '11.8.0'
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+          toolset: 14.29
+
       - name: Set up MSVC
-        if: runner.os == 'Windows'
+        if: runner.os == 'Windows' && matrix.cuda != '11.8.0'
         uses: ilammy/msvc-dev-cmd@v1
         with:
           arch: x64
@@ -81,7 +88,31 @@ jobs:
         run: |
           $cudaVersion = $env:CUDAVER
           $cudaChannel = "nvidia/label/cuda-$cudaVersion"
-          if ($IsLinux) {
+          if ($cudaVersion -eq '11.8.0') {
+            if ($IsLinux) {
+              $cudaPackages = @(
+                "${cudaChannel}::cuda-nvcc_linux-64=11.8.0",
+                "${cudaChannel}::cuda-cccl=11.8.89",
+                "${cudaChannel}::cuda-cudart=11.8.89",
+                "${cudaChannel}::cuda-cudart-dev=11.8.89",
+                "${cudaChannel}::cuda-driver-dev=11.8.89",
+                "${cudaChannel}::libcublas=11.11.3.6",
+                "${cudaChannel}::libcublas-dev=11.11.3.6"
+              )
+            } elseif ($IsWindows) {
+              $cudaPackages = @(
+                "${cudaChannel}::cuda-nvcc_win-64=11.8.0",
+                "${cudaChannel}::cuda-cccl=11.8.89",
+                "${cudaChannel}::cuda-cudart=11.8.89",
+                "${cudaChannel}::cuda-cudart-dev=11.8.89",
+                "${cudaChannel}::libcublas=11.11.3.6",
+                "${cudaChannel}::libcublas-dev=11.11.3.6"
+              )
+            } else {
+              throw 'Unsupported CUDA wheel build platform'
+            }
+            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel $cudaPackages
+          } elseif ($IsLinux) {
             mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
           } elseif ($IsWindows) {
             if ($cudaVersion -like '12.5.*') {
@@ -122,7 +153,12 @@ jobs:
           $cudaRootCmake = $cudaRoot.Replace('\', '/')
           $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$cudaRootCmake -DCUDA_TOOLKIT_ROOT_DIR=$cudaRootCmake"
           if ($IsLinux) {
-            if (Test-Path '/usr/bin/g++-12') {
+            if ([version]$env:CUDAVER -lt [version]"12.0" -and (Test-Path '/usr/bin/g++-11')) {
+              $env:CC = '/usr/bin/gcc-11'
+              $env:CXX = '/usr/bin/g++-11'
+              $env:CUDAHOSTCXX = '/usr/bin/g++-11'
+              $cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX"
+            } elseif (Test-Path '/usr/bin/g++-12') {
               $env:CC = '/usr/bin/gcc-12'
               $env:CXX = '/usr/bin/g++-12'
               $env:CUDAHOSTCXX = '/usr/bin/g++-12'
@@ -169,10 +205,15 @@ jobs:
           }
           $cudaTagVersion = $nvccVersion.Replace('.','')
           $env:VERBOSE = '1'
+          $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real;90-real;90-virtual"
+          if ([version]$nvccVersion -lt [version]"12.0") {
+            # CUDA < 12.0 (i.e. 11.8) cannot compile llama.cpp's Hopper PDL device calls, so drop sm_90 and its PTX (90-virtual) target.
+            $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real"
+          }
           # Build real cubins for the supported GPUs, including Pascal, and keep
           # one forward-compatible PTX target instead of embedding PTX for every
           # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
-          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=60-real;61-real;70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS"
+          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=$cudaArchs -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS"
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
           python -m build --wheel
           # Publish tags that reflect the actual installed toolkit version.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat(ci): add CUDA 11.8 wheel builds by @abetlen in #2238
 - fix(ci): add Pascal compute capability targets to CUDA wheel builds by @abetlen in #2237
 
 ## [0.3.24]
diff --git a/README.md b/README.md
@@ -125,8 +125,8 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
 
 It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements:
 
-- CUDA Version is 12.1, 12.2, 12.3, 12.4 or 12.5
-- NVIDIA GPU compute capability is 6.0 or newer
+- CUDA Version is 11.8, 12.1, 12.2, 12.3, 12.4 or 12.5
+- NVIDIA GPU compute capability is 6.0 through 8.9 for CUDA 11.8 wheels, or 6.0 or newer for CUDA 12 wheels
 - Python Version is 3.10, 3.11 or 3.12
 
 ```bash
@@ -135,6 +135,7 @@ pip install llama-cpp-python \
 ```
 
 Where `<cuda-version>` is one of the following:
+- `cu118`: CUDA 11.8
 - `cu121`: CUDA 12.1
 - `cu122`: CUDA 12.2
 - `cu123`: CUDA 12.3