Skip to content

Commit 7f16fe1

Browse files
authored
docs: add Gemma 4 QAT Colab notebook (abetlen#2276)
* docs: add Gemma 4 QAT Colab notebook * docs: rename Gemma 4 QAT Colab notebook
1 parent 7a2a36d commit 7f16fe1

2 files changed

Lines changed: 164 additions & 0 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,8 @@ Below are the supported multi-modal models and their respective chat handlers (P
540540

541541
Try Gemma 4 12B in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb)
542542

543+
Try Gemma 4 12B QAT in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/Gemma4-12B-QAT.ipynb)
544+
543545
Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
544546

545547
```python
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
{
  "nbformat": 4,
  "nbformat_minor": 4,
4+
"metadata": {
5+
"colab": {
6+
"provenance": [],
7+
"gpuType": "T4"
8+
},
9+
"accelerator": "GPU",
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
},
14+
"language_info": {
15+
"name": "python"
16+
}
17+
},
18+
"cells": [
19+
{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "# Gemma 4 12B QAT Multimodal Chat\n",
    "\n",
    "Run the Gemma 4 12B QAT GGUF model locally in Google Colab with the pre-built CUDA wheel for `llama-cpp-python`.\n",
    "\n",
    "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n",
    "\n",
    "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n"
  ]
},
34+
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "# %pip installs into the environment of the running kernel.\n",
    "# --prefer-binary keeps pip on the pre-built CUDA wheel from the extra index\n",
    "# even when PyPI lists a newer source-only release of llama-cpp-python.\n",
    "%pip install --no-cache-dir --upgrade --force-reinstall --prefer-binary \\\n",
    "    \"huggingface-hub>=0.23.0\" \\\n",
    "    llama-cpp-python \\\n",
    "    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n"
  ]
},
46+
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "from llama_cpp import Llama\n",
    "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n",
    "\n",
    "# One Hugging Face repo supplies both files: MODEL_FILE is passed to Llama,\n",
    "# MMPROJ_FILE is passed to the chat handler.\n",
    "MODEL_REPO = \"unsloth/gemma-4-12B-it-qat-GGUF\"\n",
    "MODEL_FILE = \"gemma-4-12B-it-qat-UD-Q4_K_XL.gguf\"\n",
    "MMPROJ_FILE = \"mmproj-F16.gguf\"\n",
    "\n",
    "# Built first because the Llama call below receives it as `chat_handler`.\n",
    "chat_handler = Gemma4ChatHandler.from_pretrained(repo_id=MODEL_REPO, filename=MMPROJ_FILE, verbose=False)\n",
    "\n",
    "llm = Llama.from_pretrained(\n",
    "    # What to load\n",
    "    repo_id=MODEL_REPO,\n",
    "    filename=MODEL_FILE,\n",
    "    chat_handler=chat_handler,\n",
    "    verbose=False,\n",
    "    # Runtime settings\n",
    "    n_ctx=8192,\n",
    "    n_gpu_layers=-1,\n",
    "    flash_attn=True,\n",
    ")\n"
  ]
},
76+
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "from IPython.display import Image, display\n",
    "\n",
    "# Sample picture that the chat-completion cells send to the model.\n",
    "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n",
    "\n",
    "# Render the picture inline (width=320) so the reader can compare it with the model's answers.\n",
    "preview = Image(url=IMAGE_URL, width=320)\n",
    "display(preview)\n"
  ]
},
89+
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "# A single user turn made of two content parts: the instruction and the image at IMAGE_URL.\n",
    "prompt_part = {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"}\n",
    "image_part = {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}}\n",
    "\n",
    "response = llm.create_chat_completion(\n",
    "    messages=[{\"role\": \"user\", \"content\": [prompt_part, image_part]}],\n",
    "    max_tokens=128,\n",
    "    temperature=0.2,\n",
    ")\n",
    "\n",
    "# Print only the text of the first choice.\n",
    "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
  ]
},
112+
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "# Schema for the single tool offered to the model: four required fields describing the image.\n",
    "tools = [\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"record_image_observation\",\n",
    "            \"description\": \"Record a structured observation about an image.\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"main_subject\": {\"type\": \"string\"},\n",
    "                    \"setting\": {\"type\": \"string\"},\n",
    "                    \"notable_details\": {\n",
    "                        \"type\": \"array\",\n",
    "                        \"items\": {\"type\": \"string\"},\n",
    "                    },\n",
    "                    \"confidence\": {\"type\": \"number\", \"minimum\": 0, \"maximum\": 1},\n",
    "                },\n",
    "                \"required\": [\"main_subject\", \"setting\", \"notable_details\", \"confidence\"],\n",
    "            },\n",
    "        },\n",
    "    }\n",
    "]\n",
    "\n",
    "# tool_choice names the function explicitly, so the request asks for that tool by name.\n",
    "response = llm.create_chat_completion(\n",
    "    messages=[\n",
    "        {\n",
    "            \"role\": \"user\",\n",
    "            \"content\": [\n",
    "                {\"type\": \"text\", \"text\": \"Use the provided tool to record a structured observation for this image.\"},\n",
    "                {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
    "            ],\n",
    "        }\n",
    "    ],\n",
    "    tools=tools,\n",
    "    tool_choice={\"type\": \"function\", \"function\": {\"name\": \"record_image_observation\"}},\n",
    "    max_tokens=256,\n",
    "    temperature=0.0,\n",
    ")\n",
    "\n",
    "message = response[\"choices\"][0][\"message\"]\n",
    "# `or` (instead of a .get default) also falls back to the text content when\n",
    "# \"tool_calls\" is present but None or empty.\n",
    "print(message.get(\"tool_calls\") or message.get(\"content\"))\n"
  ]
}
161+
]
162+
}

0 commit comments

Comments
 (0)