diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index a1b456edda..4ae37b1745 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -49,6 +49,9 @@ jobs:
           # Linux needs auditwheel repair so manylinux and musllinux wheels are
           # published with distinct platform tags instead of generic linux tags.
           CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
+          # cibuildwheel v3 defaults to manylinux_2_28 images whose current
+          # GCC toolchain emits symbols newer than the policy allows.
+          CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014"
           # The release wheel is tagged py3-none, so one build per platform
           # covers all supported Python versions and avoids duplicate names.
           CIBW_BUILD_LINUX: "cp38-*"
@@ -85,6 +88,8 @@ jobs:
           CIBW_SKIP: "pp*"
           CIBW_REPAIR_WHEEL_COMMAND: "LD_LIBRARY_PATH=$PWD/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
           CIBW_ARCHS: "aarch64"
+          # Keep this consistent with the x86_64 Linux release wheels.
+          CIBW_MANYLINUX_AARCH64_IMAGE: "manylinux2014"
           # Keep native arm64 builds on a portable CPU baseline instead of
           # tuning wheels to the hosted runner.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
@@ -101,27 +106,8 @@ jobs:
           path: ./wheelhouse/*.whl
 
   build_wheels_riscv64:
-    name: Build riscv64 wheels (${{ matrix.shard.name }})
+    name: Build riscv64 wheel
     runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        shard:
-          - name: cp310
-            build: "cp310-*"
-            artifact: wheels_riscv64_cp310
-          - name: cp311
-            build: "cp311-*"
-            artifact: wheels_riscv64_cp311
-          - name: cp312
-            build: "cp312-*"
-            artifact: wheels_riscv64_cp312
-          - name: cp313
-            build: "cp313-*"
-            artifact: wheels_riscv64_cp313
-          - name: cp314
-            build: "cp314-*"
-            artifact: wheels_riscv64_cp314
     steps:
       - uses: actions/checkout@v6
         with:
@@ -141,16 +127,16 @@ jobs:
           # Build riscv64 wheels against a conservative baseline instead of
           # enabling RVV-related extensions from the build container.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off"
-          # Split the emulated riscv64 build into one Python version per job
-          # to minimize wall-clock time without changing the release artifacts.
-          CIBW_BUILD: ${{ matrix.shard.build }}
+          # The release wheel is tagged py3-none, so one riscv64 build is
+          # enough and avoids duplicate same-name release artifacts.
+          CIBW_BUILD: "cp310-*"
         with:
           output-dir: wheelhouse
 
       - name: Upload wheels as artifacts
         uses: actions/upload-artifact@v7
         with:
-          name: ${{ matrix.shard.artifact }}
+          name: wheels_riscv64
           path: ./wheelhouse/*.whl
 
   build_sdist:
diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml
index 1902c125ba..6ad0b49541 100644
--- a/.github/workflows/build-wheels-rocm.yaml
+++ b/.github/workflows/build-wheels-rocm.yaml
@@ -33,7 +33,6 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.pyver }}
-          cache: "pip"
 
       - name: Install build dependencies
         run: |
diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml
index a9124fbc09..edf292387d 100644
--- a/.github/workflows/generate-index-from-release.yaml
+++ b/.github/workflows/generate-index-from-release.yaml
@@ -40,12 +40,14 @@ jobs:
         run: |
           ./scripts/get-releases.sh
           ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu118 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu118$'
           ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
           ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
           ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
           ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
-          # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
-          # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu125$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu130 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu130$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu132 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu132$'
           ./scripts/releases-to-pep-503.sh index/whl/rocm72 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm72$'
           ./scripts/releases-to-pep-503.sh index/whl/hip-radeon '^[v]?[0-9]+\.[0-9]+\.[0-9]+-hip-radeon$'
           ./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan$'
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18bcf258a0..28645f13c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.26]
+
 - feat: Generic Multimodal Chat Handler by @abetlen in #2256
-- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c
+- feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4
 - feat(ci): add ROCm wheel builds by @abetlen in #2252
 - feat(ci): add Vulkan wheel builds by @abetlen in #2251
 - fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085
diff --git a/README.md b/README.md
index 5711d4afbb..8f7b65e835 100644
--- a/README.md
+++ b/README.md
@@ -538,6 +538,8 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
 | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` |
 
+Try Gemma 4 12B in Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb)
+
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
 ```python
diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb
new file mode 100644
index 0000000000..8e258b9c03
--- /dev/null
+++ b/examples/colab/notebook.ipynb
@@ -0,0 +1,131 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 5,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "accelerator": "GPU",
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Gemma 4 12B Multimodal Chat\n",
+        "\n",
+        "Run Gemma 4 12B on a Google Colab GPU runtime with the pre-built CUDA wheel for `llama-cpp-python`.\n",
+        "\n",
+        "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n",
+        "\n",
+        "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install --no-cache-dir --upgrade --force-reinstall \\\n",
+        "  \"huggingface-hub>=0.23.0\" \\\n",
+        "  llama-cpp-python \\\n",
+        "  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from llama_cpp import Llama\n",
+        "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n",
+        "\n",
+        "MODEL_REPO = \"ggml-org/gemma-4-12B-it-GGUF\"\n",
+        "MODEL_FILE = \"gemma-4-12B-it-Q4_K_M.gguf\"\n",
+        "MMPROJ_FILE = \"mmproj-gemma-4-12B-it-Q8_0.gguf\"\n",
+        "\n",
+        "chat_handler = Gemma4ChatHandler.from_pretrained(\n",
+        "    repo_id=MODEL_REPO,\n",
+        "    filename=MMPROJ_FILE,\n",
+        "    verbose=False,\n",
+        ")\n",
+        "\n",
+        "llm = Llama.from_pretrained(\n",
+        "    repo_id=MODEL_REPO,\n",
+        "    filename=MODEL_FILE,\n",
+        "    chat_handler=chat_handler,\n",
+        "    n_gpu_layers=-1,\n",
+        "    n_ctx=8192,\n",
+        "    flash_attn=True,\n",
+        "    verbose=False,\n",
+        ")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "response = llm.create_chat_completion(\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": \"What is the capital of France? Answer in one sentence.\",\n",
+        "        }\n",
+        "    ],\n",
+        "    max_tokens=32,\n",
+        "    temperature=0.0,\n",
+        ")\n",
+        "\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from IPython.display import Image, display\n",
+        "\n",
+        "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n",
+        "\n",
+        "display(Image(url=IMAGE_URL, width=320))\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "response = llm.create_chat_completion(\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": [\n",
+        "                {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n",
+        "                {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
+        "            ],\n",
+        "        }\n",
+        "    ],\n",
+        "    max_tokens=128,\n",
+        "    temperature=0.2,\n",
+        ")\n",
+        "\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    }
+  ]
+}
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 52101c9b7e..bbfb73de3f 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.25"
+__version__ = "0.3.26"
diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh
index 71910efcbf..8359624492 100755
--- a/scripts/releases-to-pep-503.sh
+++ b/scripts/releases-to-pep-503.sh
@@ -54,8 +54,12 @@ cat << EOF > "$output_dir/llama-cpp-python/index.html"
     <h1>Links for llama-cpp-python</h1>
 EOF
 
-# Filter releases by pattern
-releases=$(grep -E "$pattern" "$current_dir/all_releases.txt")
+# Filter releases by pattern. An index with no matching releases yet is valid,
+# so accept grep's "no match" status (1); any other grep error aborts the run.
+releases=$(grep -E "$pattern" "$current_dir/all_releases.txt") || [ "$?" -eq 1 ] || exit 1
+if [ -z "$releases" ]; then
+    log_info "No releases found matching pattern: $pattern"
+fi
 
 # Prepare curl headers
 headers=('--header' 'Accept: application/vnd.github.v3+json')
@@ -81,16 +85,16 @@ for release in $releases; do
         continue
     fi
 
-    # Get release version from release ie v0.1.0-cu121 -> v0.1.0
-    release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
-    echo "    <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"
-    
     wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url')
     if [ -z "$wheel_urls" ]; then
         log_error "No wheel files found for release $release"
         continue
     fi
 
+    # Get release version from release ie v0.1.0-cu121 -> v0.1.0
+    release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
+    echo "    <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"
+
     echo "$wheel_urls" | while read -r asset; do
         echo "    <a href=\"$asset\">$asset</a>" >> "$output_dir/llama-cpp-python/index.html"
         echo "    <br>" >> "$output_dir/llama-cpp-python/index.html"
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e3ba22d6cc..7c158fbb4a 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e3ba22d6cc4dec84e59a909c7f96e1689c7384a9
+Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae