Build Wheels (CUDA) #102

Workflow file for this run

.github/workflows/build-wheels-cuda.yaml at ddc0d15

	name: Build Wheels (CUDA)

	# Manual runs only. The optional release_tag input is the base of the
	# release tag that the built wheels are uploaded to.
	on:
	  workflow_dispatch:
	    inputs:
	      release_tag:
	        type: string
	        required: false
	        description: 'Release tag to upload wheel assets to'

	# The release upload step authenticates with GITHUB_TOKEN, so the token
	# is granted write access to repository contents.
	permissions:
	  contents: write

	jobs:
	define_matrix:
	  name: Define Build Matrix
	  runs-on: ubuntu-22.04
	  outputs:
	    matrix: ${{ steps.set-matrix.outputs.matrix }}
	  defaults:
	    run:
	      shell: pwsh

	  steps:
	    # Builds the matrix as a PowerShell hashtable and writes it as compact
	    # JSON to the `matrix` step output, which the job re-exports above.
	    - name: Define Job Output
	      id: set-matrix
	      run: |
	        $matrix = @{
	          'os' = @('ubuntu-22.04', 'windows-2022')
	          # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic,
	          # so one builder per toolkit version is sufficient.
	          'pyver' = @("3.9")
	          'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "13.0.2", "13.2.1")
	          'releasetag' = @("basic")
	          # OS/toolkit combinations that are skipped.
	          'exclude' = @(
	            @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' },
	            @{ 'os' = 'windows-2022'; 'cuda' = '12.2.2' },
	            @{ 'os' = 'windows-2022'; 'cuda' = '12.3.2' }
	          )
	        }

	        # -Depth is explicit: the 'exclude' entries already sit at the
	        # default ConvertTo-Json nesting limit, and anything deeper would
	        # be stringified instead of serialized.
	        $matrixOut = ConvertTo-Json $matrix -Compress -Depth 5
	        Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

	build_wheels:
	  name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
	  needs: define_matrix
	  runs-on: ${{ matrix.os }}
	  strategy:
	    # Every matrix entry uploads to its own per-CUDA release tag, so one
	    # failing toolkit must not cancel the remaining builds.
	    fail-fast: false
	    # The matrix is the JSON document produced by the define_matrix job.
	    matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
	  defaults:
	    run:
	      shell: pwsh
	  env:
	    # Toolkit version requested for this matrix entry; the steps read it
	    # as $env:CUDAVER.
	    CUDAVER: ${{ matrix.cuda }}
	    AVXVER: ${{ matrix.releasetag }}

	  steps:
	- name: Set up MSVC for CUDA 11.8
	  # Windows builds of CUDA 11.8.0 pin the 14.29 MSVC toolset.
	  uses: ilammy/msvc-dev-cmd@v1
	  if: runner.os == 'Windows' && matrix.cuda == '11.8.0'
	  with:
	    toolset: '14.29'
	    arch: x64

	- name: Set up MSVC
	  # Every other Windows toolkit version runs without a pinned toolset.
	  uses: ilammy/msvc-dev-cmd@v1
	  if: runner.os == 'Windows' && matrix.cuda != '11.8.0'
	  with:
	    arch: x64

	- uses: actions/checkout@v6
	  with:
	    # Check out submodules recursively along with the repository.
	    submodules: recursive

	- uses: actions/setup-python@v6
	  with:
	    # Interpreter version comes from the build matrix; pip caching is on.
	    cache: pip
	    python-version: ${{ matrix.pyver }}

	- name: Setup Mamba
	  uses: conda-incubator/setup-miniconda@v4.0.1
	  with:
	    # Later steps install the CUDA packages into this "llamacpp"
	    # environment and address it by name.
	    miniforge-version: latest
	    python-version: ${{ matrix.pyver }}
	    activate-environment: llamacpp
	    auto-activate-base: false
	    add-pip-as-python-dependency: true

	# Installs the CUDA packages for the requested toolkit version from the
	# matching nvidia conda channel, then the Python and system build tools.
	- name: Install Dependencies
	  env:
	    MAMBA_DOWNLOAD_FAILFAST: "0"
	    MAMBA_NO_LOW_SPEED_LIMIT: "1"
	  run: |
	    $cudaVersion = $env:CUDAVER
	    $cudaChannel = "nvidia/label/cuda-$cudaVersion"

	    # Stops the step as soon as a native tool fails, so an earlier
	    # failure is not hidden by a later command that succeeds.
	    function Assert-LastExitCode([string]$What) {
	      if ($LASTEXITCODE -ne 0) {
	        Write-Host "::error::$What failed with exit code $LASTEXITCODE"
	        exit $LASTEXITCODE
	      }
	    }

	    # Pick the package list for this toolkit version and platform.
	    if ($cudaVersion -eq '11.8.0') {
	      # 11.8.0 installs individually pinned components instead of the
	      # cuda-toolkit meta-package.
	      if ($IsLinux) {
	        $cudaPackages = @(
	          "${cudaChannel}::cuda-nvcc_linux-64=11.8.0",
	          "${cudaChannel}::cuda-cccl=11.8.89",
	          "${cudaChannel}::cuda-cudart=11.8.89",
	          "${cudaChannel}::cuda-cudart-dev=11.8.89",
	          "${cudaChannel}::cuda-driver-dev=11.8.89",
	          "${cudaChannel}::libcublas=11.11.3.6",
	          "${cudaChannel}::libcublas-dev=11.11.3.6"
	        )
	      } elseif ($IsWindows) {
	        $cudaPackages = @(
	          "${cudaChannel}::cuda-nvcc_win-64=11.8.0",
	          "${cudaChannel}::cuda-cccl=11.8.89",
	          "${cudaChannel}::cuda-cudart=11.8.89",
	          "${cudaChannel}::cuda-cudart-dev=11.8.89",
	          "${cudaChannel}::libcublas=11.11.3.6",
	          "${cudaChannel}::libcublas-dev=11.11.3.6"
	        )
	      } else {
	        throw 'Unsupported CUDA wheel build platform'
	      }
	    } elseif ($IsLinux) {
	      $cudaPackages = @(
	        "${cudaChannel}::cuda-toolkit=$cudaVersion",
	        "${cudaChannel}::cuda-nvcc_linux-64",
	        "${cudaChannel}::cuda-cccl",
	        "${cudaChannel}::cuda-cudart",
	        "${cudaChannel}::cuda-cudart-dev"
	      )
	    } elseif ($IsWindows) {
	      if ($cudaVersion -like '12.5.*' -or [version]$cudaVersion -ge [version]"13.0") {
	        # On Windows the 12.5.x and 13.x toolkit meta-package pulls
	        # compiler activation scripts that overflow cmd.exe after MSVC
	        # is already initialized, so cuda-libraries-dev is used instead.
	        # NOTE(review): 12.6-12.9 do not match this condition - confirm
	        # before adding them to the build matrix.
	        $cudaPackages = @(
	          "${cudaChannel}::cuda-nvcc_win-64",
	          "${cudaChannel}::cuda-cccl",
	          "${cudaChannel}::cuda-libraries-dev=$cudaVersion",
	          "${cudaChannel}::cuda-cudart",
	          "${cudaChannel}::cuda-cudart-dev"
	        )
	      } else {
	        $cudaPackages = @(
	          "${cudaChannel}::cuda-toolkit=$cudaVersion",
	          "${cudaChannel}::cuda-nvcc_win-64",
	          "${cudaChannel}::cuda-cccl",
	          "${cudaChannel}::cuda-cudart",
	          "${cudaChannel}::cuda-cudart-dev"
	        )
	      }
	    } else {
	      throw 'Unsupported CUDA wheel build platform'
	    }

	    mamba install -y --channel-priority flexible --override-channels -c $cudaChannel $cudaPackages
	    Assert-LastExitCode 'mamba install'

	    if ($IsWindows) {
	      python -m pip install build wheel ninja
	      Assert-LastExitCode 'pip install'
	    } else {
	      # A failed index refresh is only reported; the install below
	      # decides whether the step can continue.
	      sudo apt-get update
	      if ($LASTEXITCODE -ne 0) {
	        Write-Host "::warning::apt-get update failed with exit code $LASTEXITCODE"
	      }
	      sudo apt-get install -y patchelf
	      Assert-LastExitCode 'apt-get install patchelf'
	      python -m pip install auditwheel build wheel
	      Assert-LastExitCode 'pip install'
	    }

	# Points the build at the conda-provided CUDA toolkit, assembles
	# CMAKE_ARGS, builds the wheel and exports CUDA_VERSION (and, on Linux,
	# CUDA_LIBRARY_PATHS) to later steps through GITHUB_ENV.
	- name: Build Wheel
	  run: |
	    $pathSeparator = [IO.Path]::PathSeparator

	    # Joins the non-empty entries into a search path. Empty entries are
	    # dropped because the compiler and the dynamic loader read an empty
	    # element as the current directory.
	    function Join-SearchPath([string[]]$Entries) {
	      ($Entries | Where-Object { $_ }) -join [IO.Path]::PathSeparator
	    }

	    # Locate the toolkit root inside the conda environment.
	    if ($IsWindows) {
	      $cudaRoot = Join-Path $env:CONDA_PREFIX 'Library'
	    } elseif (Test-Path (Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/include/cuda_runtime.h')) {
	      $cudaRoot = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux'
	    } else {
	      $cudaRoot = $env:CONDA_PREFIX
	    }

	    $env:CUDA_PATH = $cudaRoot
	    $env:CUDA_HOME = $cudaRoot
	    $env:CUDAToolkit_ROOT = $cudaRoot
	    $env:CUDA_TOOLKIT_ROOT_DIR = $cudaRoot
	    # CMake receives forward slashes; on Linux this equals $cudaRoot.
	    $cudaRootCmake = $cudaRoot.Replace('\', '/')

	    $hostCompilerArgs = @()
	    if ($IsLinux) {
	      # Toolkits older than 12.0 prefer GCC 11 when it is installed;
	      # otherwise GCC 12 is used when present.
	      $gccMajor = $null
	      if ([version]$env:CUDAVER -lt [version]"12.0" -and (Test-Path '/usr/bin/g++-11')) {
	        $gccMajor = '11'
	      } elseif (Test-Path '/usr/bin/g++-12') {
	        $gccMajor = '12'
	      }
	      if ($gccMajor) {
	        $env:CC = "/usr/bin/gcc-$gccMajor"
	        $env:CXX = "/usr/bin/g++-$gccMajor"
	        $env:CUDAHOSTCXX = "/usr/bin/g++-$gccMajor"
	        $hostCompilerArgs = @("-DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX")
	      }

	      $env:CPATH = Join-SearchPath @("$cudaRoot/include", $env:CPATH)
	      $env:CPLUS_INCLUDE_PATH = Join-SearchPath @("$cudaRoot/include", $env:CPLUS_INCLUDE_PATH)
	      $env:LIBRARY_PATH = Join-SearchPath @("$cudaRoot/lib", "$env:CONDA_PREFIX/lib", $env:LIBRARY_PATH)
	      $env:LD_LIBRARY_PATH = Join-SearchPath @("$cudaRoot/lib", "$env:CONDA_PREFIX/lib", $env:LD_LIBRARY_PATH)

	      # Hand the existing CUDA library directories to the repair step.
	      $cudaLibraryPaths = @(
	        (Join-Path $cudaRoot 'lib'),
	        (Join-Path $cudaRoot 'lib64'),
	        (Join-Path $env:CONDA_PREFIX 'lib')
	      ) | Where-Object { Test-Path $_ }
	      Write-Output "CUDA_LIBRARY_PATHS=$($cudaLibraryPaths -join ':')" >> $env:GITHUB_ENV
	    } elseif ($IsWindows) {
	      $ninjaPath = ((Get-Command ninja -ErrorAction Stop).Source).Replace('\', '/')
	      $env:CMAKE_GENERATOR = 'Ninja'
	      $env:CMAKE_MAKE_PROGRAM = $ninjaPath
	      $env:PATH = "$(Join-Path $cudaRoot 'bin')$pathSeparator$env:PATH"
	    }

	    # Find nvcc; the first existing candidate wins.
	    if ($IsWindows) {
	      $nvccCandidates = @(
	        (Join-Path $cudaRoot 'bin\nvcc.exe'),
	        (Join-Path $env:CONDA_PREFIX 'Library\bin\nvcc.exe'),
	        (Join-Path $env:CONDA_PREFIX 'bin\nvcc.exe')
	      )
	    } else {
	      $nvccCandidates = @(
	        (Join-Path $env:CONDA_PREFIX 'bin/nvcc'),
	        (Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc')
	      )
	    }
	    $nvccPath = $nvccCandidates | Where-Object { Test-Path $_ } | Select-Object -First 1
	    if (-not $nvccPath) {
	      throw 'Failed to find nvcc in the conda environment'
	    }
	    $env:PATH = "$(Split-Path $nvccPath)$pathSeparator$env:PATH"

	    $windowsCompilerArgs = @()
	    if ($IsWindows) {
	      $nvccPathCmake = $nvccPath.Replace('\', '/')
	      $env:CUDACXX = $nvccPathCmake
	      $windowsCompilerArgs = @(
	        "-DCMAKE_CUDA_COMPILER=$nvccPathCmake"
	        '-DCMAKE_CUDA_COMPILER_ARG1=-allow-unsupported-compiler'
	        "-DCMAKE_MAKE_PROGRAM=$env:CMAKE_MAKE_PROGRAM"
	      )
	    } else {
	      $env:CUDACXX = $nvccPath
	    }

	    # Read the toolkit version nvcc reports. The match is checked before
	    # it is dereferenced so a missing version raises the message below.
	    $nvccRelease = (& $nvccPath --version) |
	      Select-String 'release ([0-9]+\.[0-9]+)' |
	      Select-Object -First 1
	    if (-not $nvccRelease) {
	      throw 'Failed to detect the installed CUDA toolkit version'
	    }
	    $nvccVersion = $nvccRelease.Matches[0].Groups[1].Value
	    if (-not $nvccVersion) {
	      throw 'Failed to detect the installed CUDA toolkit version'
	    }
	    $cudaTagVersion = $nvccVersion.Replace('.','')
	    $env:VERBOSE = '1'

	    # Build real cubins for the supported GPUs and keep
	    # one forward-compatible PTX target instead of embedding PTX for every
	    # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
	    if ([version]$nvccVersion -lt [version]"12.0") {
	      # CUDA 11.8 cannot compile llama.cpp's Hopper PDL device calls.
	      $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real"
	    } elseif ([version]$nvccVersion -ge [version]"13.0") {
	      # CUDA 13 dropped offline compilation support for pre-Turing targets.
	      $cudaArchs = "75-real;80-real;86-real;89-real;90-real;90-virtual"
	    } else {
	      $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real;90-real;90-virtual"
	    }

	    # Assemble CMAKE_ARGS once, in the order the flags are meant to
	    # appear on the CMake command line.
	    $cmakeArgs = @(
	      '-DGGML_CUDA_FORCE_MMQ=ON'
	      '-DGGML_CUDA=on'
	      "-DCMAKE_CUDA_ARCHITECTURES=$cudaArchs"
	      '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'
	      '-DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler'
	    )
	    $cmakeArgs += $windowsCompilerArgs
	    $cmakeArgs += "-DCUDAToolkit_ROOT=$cudaRootCmake"
	    $cmakeArgs += "-DCUDA_TOOLKIT_ROOT_DIR=$cudaRootCmake"
	    $cmakeArgs += $hostCompilerArgs
	    $cmakeArgs += '-DGGML_AVX2=off', '-DGGML_FMA=off', '-DGGML_F16C=off'
	    if ($IsLinux) {
	      $cmakeArgs += '-DGGML_OPENMP=OFF'
	    }
	    $env:CMAKE_ARGS = $cmakeArgs -join ' '

	    python -m build --wheel
	    if ($LASTEXITCODE -ne 0) {
	      exit $LASTEXITCODE
	    }
	    # Publish tags that reflect the actual installed toolkit version.
	    Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV

	# Runs auditwheel repair on the built wheel and replaces the wheel in
	# dist/ with the repaired one. The listed CUDA driver/runtime/cuBLAS
	# libraries are passed as --exclude arguments.
	- name: Repair Linux wheel
	  if: runner.os == 'Linux'
	  shell: bash
	  run: |
	    set -euxo pipefail
	    mkdir -p wheelhouse

	    # Build the loader search path from non-empty parts only: an empty
	    # element would make the loader search the current directory, and
	    # the :- defaults keep `set -u` from aborting on an unset variable.
	    lib_search_path="$PWD/llama_cpp/lib"
	    for extra_path in "${CUDA_LIBRARY_PATHS:-}" "${LD_LIBRARY_PATH:-}"; do
	      if [[ -n "${extra_path}" ]]; then
	        lib_search_path+=":${extra_path}"
	      fi
	    done
	    export LD_LIBRARY_PATH="${lib_search_path}"

	    # This step uses plain bash, so auditwheel is called by its full
	    # path inside the llamacpp conda environment.
	    auditwheel_bin="${CONDA}/envs/llamacpp/bin/auditwheel"

	    excluded_libs=(
	      libcuda.so
	      libcuda.so.1
	      libcudart.so.11.0
	      libcudart.so.12
	      libcudart.so.13
	      libcublas.so.11
	      libcublas.so.12
	      libcublas.so.13
	      libcublasLt.so.11
	      libcublasLt.so.12
	      libcublasLt.so.13
	    )
	    exclude_args=()
	    for lib in "${excluded_libs[@]}"; do
	      exclude_args+=(--exclude "${lib}")
	    done

	    "${auditwheel_bin}" repair "${exclude_args[@]}" -w wheelhouse dist/*.whl
	    rm dist/*.whl
	    cp wheelhouse/*.whl dist/
	    "${auditwheel_bin}" show dist/*.whl

	# Uploads everything in dist/ to a release. Runs for tag refs, or for
	# manual runs that supplied a release_tag input.
	- uses: softprops/action-gh-release@v3
	  if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '')
	  with:
	    files: dist/*
	    # Fail rather than publish a release without assets when dist/ is
	    # empty.
	    fail_on_unmatched_files: true
	    # Set tag_name to <tag>-cu<cuda_version>.
	    tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-cu${{ env.CUDA_VERSION }}
	  env:
	    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Build Wheels (CUDA) #102

Workflow file

Build Wheels (CUDA) #102

Uh oh!

Workflow file for this run