docs: add Gemma 4 QAT Colab notebook (abetlen#2276)

abetlen · web-flow · commit 7f16fe1963bb · 2026-06-06T00:15:04.000-07:00
* docs: add Gemma 4 QAT Colab notebook

* docs: rename Gemma 4 QAT Colab notebook
diff --git a/README.md b/README.md
@@ -540,6 +540,8 @@ Below are the supported multi-modal models and their respective chat handlers (P
 
 Try Gemma 4 12B in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb)
 
+Try Gemma 4 12B QAT in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/Gemma4-12B-QAT.ipynb)
+
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
 ```python
diff --git a/examples/colab/Gemma4-12B-QAT.ipynb b/examples/colab/Gemma4-12B-QAT.ipynb
@@ -0,0 +1,160 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 4,
+ "metadata": {
+  "colab": {
+   "provenance": [],
+   "gpuType": "T4"
+  },
+  "accelerator": "GPU",
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Gemma 4 12B QAT Multimodal Chat\n",
+    "\n",
+    "Run the Gemma 4 12B QAT GGUF model locally in Google Colab with the pre-built CUDA wheel for `llama-cpp-python`.\n",
+    "\n",
+    "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n",
+    "\n",
+    "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip installs into the running kernel's environment; the extra index hosts the pre-built cu125 wheel.\n",
+    "%pip install --no-cache-dir --upgrade --force-reinstall \\\n",
+    "  \"huggingface-hub>=0.23.0\" llama-cpp-python \\\n",
+    "  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_cpp import Llama\n",
+    "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n",
+    "\n",
+    "# Hugging Face repo and the two GGUF files (model, mmproj) requested from it.\n",
+    "MODEL_REPO = \"unsloth/gemma-4-12B-it-qat-GGUF\"\n",
+    "MODEL_FILE = \"gemma-4-12B-it-qat-UD-Q4_K_XL.gguf\"\n",
+    "MMPROJ_FILE = \"mmproj-F16.gguf\"\n",
+    "\n",
+    "# Build the chat handler from the mmproj file first so it can be passed to the model.\n",
+    "chat_handler = Gemma4ChatHandler.from_pretrained(\n",
+    "    repo_id=MODEL_REPO, filename=MMPROJ_FILE, verbose=False\n",
+    ")\n",
+    "\n",
+    "llm = Llama.from_pretrained(\n",
+    "    repo_id=MODEL_REPO, filename=MODEL_FILE,\n",
+    "    chat_handler=chat_handler,\n",
+    "    # Runtime options for this notebook's GPU session.\n",
+    "    n_gpu_layers=-1,\n",
+    "    n_ctx=8192,\n",
+    "    flash_attn=True,\n",
+    "    verbose=False,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import Image, display\n",
+    "\n",
+    "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n",
+    "preview = Image(url=IMAGE_URL, width=320)  # the same URL is sent to the model in the prompts\n",
+    "display(preview)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One user turn pairing the text instruction with the image URL.\n",
+    "describe_prompt = [\n",
+    "    {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n",
+    "    {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
+    "]\n",
+    "\n",
+    "response = llm.create_chat_completion(\n",
+    "    messages=[{\"role\": \"user\", \"content\": describe_prompt}],\n",
+    "    max_tokens=128,\n",
+    "    temperature=0.2,\n",
+    ")\n",
+    "\n",
+    "# Show only the text of the first returned choice.\n",
+    "first_choice = response[\"choices\"][0]\n",
+    "print(first_choice[\"message\"][\"content\"])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One function tool whose parameters describe the structured observation to record.\n",
+    "tools = [\n",
+    "    {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"record_image_observation\",\n",
+    "            \"description\": \"Record a structured observation about an image.\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"main_subject\": {\"type\": \"string\"},\n",
+    "                    \"setting\": {\"type\": \"string\"},\n",
+    "                    \"notable_details\": {\"type\": \"array\", \"items\": {\"type\": \"string\"}},\n",
+    "                    \"confidence\": {\"type\": \"number\", \"minimum\": 0, \"maximum\": 1},\n",
+    "                },\n",
+    "                \"required\": [\"main_subject\", \"setting\", \"notable_details\", \"confidence\"],\n",
+    "            },\n",
+    "        },\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "# The request passes the tool list and names the function explicitly through tool_choice.\n",
+    "response = llm.create_chat_completion(\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": [\n",
+    "                {\"type\": \"text\", \"text\": \"Use the provided tool to record a structured observation for this image.\"},\n",
+    "                {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
+    "            ],\n",
+    "        }\n",
+    "    ],\n",
+    "    tools=tools,\n",
+    "    tool_choice={\"type\": \"function\", \"function\": {\"name\": \"record_image_observation\"}},\n",
+    "    max_tokens=256,\n",
+    "    temperature=0.0,\n",
+    ")\n",
+    "\n",
+    "message = response[\"choices\"][0][\"message\"]\n",
+    "# dict.get(key, default) only falls back when the key is absent; `or` also covers a None/empty tool_calls.\n",
+    "print(message.get(\"tool_calls\") or message.get(\"content\"))\n"
+   ]
+  }
+ ]
+}