diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index a1b456edda..4ae37b1745 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -49,6 +49,9 @@ jobs: # Linux needs auditwheel repair so manylinux and musllinux wheels are # published with distinct platform tags instead of generic linux tags. CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" + # cibuildwheel v3 defaults to manylinux_2_28 images whose current + # GCC toolchain emits symbols newer than the policy allows. + CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014" # The release wheel is tagged py3-none, so one build per platform # covers all supported Python versions and avoids duplicate names. CIBW_BUILD_LINUX: "cp38-*" @@ -85,6 +88,8 @@ jobs: CIBW_SKIP: "pp*" CIBW_REPAIR_WHEEL_COMMAND: "LD_LIBRARY_PATH=$PWD/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" CIBW_ARCHS: "aarch64" + # Keep this consistent with the x86_64 Linux release wheels. + CIBW_MANYLINUX_AARCH64_IMAGE: "manylinux2014" # Keep native arm64 builds on a portable CPU baseline instead of # tuning wheels to the hosted runner. CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off" @@ -101,27 +106,8 @@ jobs: path: ./wheelhouse/*.whl build_wheels_riscv64: - name: Build riscv64 wheels (${{ matrix.shard.name }}) + name: Build riscv64 wheel runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - shard: - - name: cp310 - build: "cp310-*" - artifact: wheels_riscv64_cp310 - - name: cp311 - build: "cp311-*" - artifact: wheels_riscv64_cp311 - - name: cp312 - build: "cp312-*" - artifact: wheels_riscv64_cp312 - - name: cp313 - build: "cp313-*" - artifact: wheels_riscv64_cp313 - - name: cp314 - build: "cp314-*" - artifact: wheels_riscv64_cp314 steps: - uses: actions/checkout@v6 with: @@ -141,16 +127,16 @@ jobs: # Build riscv64 wheels against a conservative baseline instead of # enabling RVV-related extensions from the build container. CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off" - # Split the emulated riscv64 build into one Python version per job - # to minimize wall-clock time without changing the release artifacts. - CIBW_BUILD: ${{ matrix.shard.build }} + # The release wheel is tagged py3-none, so one riscv64 build is + # enough and avoids duplicate same-name release artifacts. + CIBW_BUILD: "cp310-*" with: output-dir: wheelhouse - name: Upload wheels as artifacts uses: actions/upload-artifact@v7 with: - name: ${{ matrix.shard.artifact }} + name: wheels_riscv64 path: ./wheelhouse/*.whl build_sdist: diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml index 1902c125ba..6ad0b49541 100644 --- a/.github/workflows/build-wheels-rocm.yaml +++ b/.github/workflows/build-wheels-rocm.yaml @@ -33,7 +33,6 @@ jobs: - uses: actions/setup-python@v6 with: python-version: ${{ matrix.pyver }} - cache: "pip" - name: Install build dependencies run: | diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index a9124fbc09..edf292387d 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -40,12 +40,14 @@ jobs: run: | ./scripts/get-releases.sh ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' + ./scripts/releases-to-pep-503.sh index/whl/cu118 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu118$' ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' - # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' - # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' + ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu125$' + ./scripts/releases-to-pep-503.sh index/whl/cu130 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu130$' + ./scripts/releases-to-pep-503.sh index/whl/cu132 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu132$' ./scripts/releases-to-pep-503.sh index/whl/rocm72 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm72$' ./scripts/releases-to-pep-503.sh index/whl/hip-radeon '^[v]?[0-9]+\.[0-9]+\.[0-9]+-hip-radeon$' ./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan$' diff --git a/CHANGELOG.md b/CHANGELOG.md index 18bcf258a0..28645f13c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.26] + - feat: Generic Multimodal Chat Handler by @abetlen in #2256 -- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c +- feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4 - feat(ci): add ROCm wheel builds by @abetlen in #2252 - feat(ci): add Vulkan wheel builds by @abetlen in #2251 - fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085 diff --git a/README.md b/README.md index 5711d4afbb..8f7b65e835 100644 --- a/README.md +++ b/README.md @@ -538,6 +538,8 @@ Below are the supported multi-modal models and their respective chat handlers (P | [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` | +Try Gemma 4 12B in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb) + Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. ```python diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb new file mode 100644 index 0000000000..8e258b9c03 --- /dev/null +++ b/examples/colab/notebook.ipynb @@ -0,0 +1,131 @@ +{ + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "accelerator": "GPU", + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gemma 4 12B Multimodal Chat\n", + "\n", + "Run Gemma 4 12B locally in Google Colab with the pre-built CUDA wheel for `llama-cpp-python`.\n", + "\n", + "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n", + "\n", + "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --no-cache-dir --upgrade --force-reinstall \\\n", + " \"huggingface-hub>=0.23.0\" \\\n", + " llama-cpp-python \\\n", + " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_cpp import Llama\n", + "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n", + "\n", + "MODEL_REPO = \"ggml-org/gemma-4-12B-it-GGUF\"\n", + "MODEL_FILE = \"gemma-4-12B-it-Q4_K_M.gguf\"\n", + "MMPROJ_FILE = \"mmproj-gemma-4-12B-it-Q8_0.gguf\"\n", + "\n", + "chat_handler = Gemma4ChatHandler.from_pretrained(\n", + " repo_id=MODEL_REPO,\n", + " filename=MMPROJ_FILE,\n", + " verbose=False,\n", + ")\n", + "\n", + "llm = Llama.from_pretrained(\n", + " repo_id=MODEL_REPO,\n", + " filename=MODEL_FILE,\n", + " chat_handler=chat_handler,\n", + " n_gpu_layers=-1,\n", + " n_ctx=8192,\n", + " flash_attn=True,\n", + " verbose=False,\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.create_chat_completion(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"What is the capital of France? Answer in one sentence.\",\n", + " }\n", + " ],\n", + " max_tokens=32,\n", + " temperature=0.0,\n", + ")\n", + "\n", + "print(response[\"choices\"][0][\"message\"][\"content\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Image, display\n", + "\n", + "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n", + "\n", + "display(Image(url=IMAGE_URL, width=320))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.create_chat_completion(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n", + " {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n", + " ],\n", + " }\n", + " ],\n", + " max_tokens=128,\n", + " temperature=0.2,\n", + ")\n", + "\n", + "print(response[\"choices\"][0][\"message\"][\"content\"])\n" + ] + } + ] +} diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 52101c9b7e..bbfb73de3f 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.25" +__version__ = "0.3.26" diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh index 71910efcbf..8359624492 100755 --- a/scripts/releases-to-pep-503.sh +++ b/scripts/releases-to-pep-503.sh @@ -54,8 +54,12 @@ cat << EOF > "$output_dir/llama-cpp-python/index.html"

Links for llama-cpp-python

EOF -# Filter releases by pattern -releases=$(grep -E "$pattern" "$current_dir/all_releases.txt") +# Filter releases by pattern. Some backend indexes are valid even when there +# are no matching releases yet. +releases=$(grep -E "$pattern" "$current_dir/all_releases.txt" || true) +if [ -z "$releases" ]; then + log_info "No releases found matching pattern: $pattern" +fi # Prepare curl headers headers=('--header' 'Accept: application/vnd.github.v3+json') @@ -81,16 +85,16 @@ for release in $releases; do continue fi - # Get release version from release ie v0.1.0-cu121 -> v0.1.0 - release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") - echo "

$release_version

" >> "$output_dir/llama-cpp-python/index.html" - wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url') if [ -z "$wheel_urls" ]; then log_error "No wheel files found for release $release" continue fi + # Get release version from release ie v0.1.0-cu121 -> v0.1.0 + release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") + echo "

$release_version

" >> "$output_dir/llama-cpp-python/index.html" + echo "$wheel_urls" | while read -r asset; do echo " $asset" >> "$output_dir/llama-cpp-python/index.html" echo "
" >> "$output_dir/llama-cpp-python/index.html" diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e3ba22d6cc..7c158fbb4a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e3ba22d6cc4dec84e59a909c7f96e1689c7384a9 +Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae