diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 81c9a961b4..0b974e93c8 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -14,7 +14,7 @@ jobs:
         os: [ubuntu-22.04, windows-2022, macos-14, macos-15]
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -75,7 +75,7 @@ jobs:
     name: Build arm64 wheels
     runs-on: ubuntu-24.04-arm
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -123,7 +123,7 @@ jobs:
             build: "cp314-*"
             artifact: wheels_riscv64_cp314
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -158,7 +158,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -202,7 +202,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@v8
         with:
           merge-multiple: true
           path: dist
diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml
index 3dfef68008..349eb209dd 100644
--- a/.github/workflows/build-docker.yaml
+++ b/.github/workflows/build-docker.yaml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-22.04
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -32,7 +32,7 @@ jobs:
         uses: docker/setup-buildx-action@v4
 
       - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3 
+        uses: docker/login-action@v4 
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index 3e67d14cba..f5b3718f17 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -63,7 +63,7 @@ jobs:
         with:
           arch: x64
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -73,7 +73,7 @@ jobs:
           cache: 'pip'
 
       - name: Setup Mamba
-        uses: conda-incubator/setup-miniconda@v3.1.0
+        uses: conda-incubator/setup-miniconda@v4.0.1
         with:
           activate-environment: "llamacpp"
           python-version: ${{ matrix.pyver }}
diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml
index bf2c8bc124..ca8e322230 100644
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@@ -14,7 +14,7 @@ jobs:
         os: [macos-14, macos-15]
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -54,7 +54,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@v8
         with:
           merge-multiple: true
           path: dist2
diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml
index 0971953886..792e33cf89 100644
--- a/.github/workflows/build-wheels-rocm.yaml
+++ b/.github/workflows/build-wheels-rocm.yaml
@@ -26,7 +26,7 @@ jobs:
           apt-get update
           apt-get install -y --no-install-recommends git cmake lsb-release ninja-build
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -82,7 +82,7 @@ jobs:
             amdgpu_targets: gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -203,7 +203,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@v8
         with:
           merge-multiple: true
           path: dist
@@ -223,7 +223,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@v8
         with:
           merge-multiple: true
           path: dist
diff --git a/.github/workflows/build-wheels-vulkan.yaml b/.github/workflows/build-wheels-vulkan.yaml
index 760205c839..49d38fb80b 100644
--- a/.github/workflows/build-wheels-vulkan.yaml
+++ b/.github/workflows/build-wheels-vulkan.yaml
@@ -31,7 +31,7 @@ jobs:
         with:
           arch: x64
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
 
@@ -109,7 +109,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@v8
         with:
           merge-multiple: true
           path: dist
diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml
index c93e0be351..6b2c8770fa 100644
--- a/.github/workflows/generate-index-from-release.yaml
+++ b/.github/workflows/generate-index-from-release.yaml
@@ -31,7 +31,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
       - name: Setup Pages
         uses: actions/configure-pages@v5
       - name: Build
@@ -57,4 +57,4 @@ jobs:
           path: 'index'
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v4
+        uses: actions/deploy-pages@v5
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index 3c6f5ff45e..ac39181f90 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -12,7 +12,7 @@ jobs:
   ruff:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: Set up Python
         uses: actions/setup-python@v6
diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml
index 54572bdad9..70ae5389af 100644
--- a/.github/workflows/publish-to-test.yaml
+++ b/.github/workflows/publish-to-test.yaml
@@ -16,7 +16,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
       with:
         submodules: "recursive"
         
diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
index 3c2ea56d68..8908ccdd6f 100644
--- a/.github/workflows/publish.yaml
+++ b/.github/workflows/publish.yaml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
       with:
         submodules: "recursive"
 
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index ecba1b40b0..82f7a8a7de 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -49,7 +49,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
           
@@ -80,7 +80,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
           
@@ -114,7 +114,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
           
@@ -151,7 +151,7 @@ jobs:
     needs: download-model
     runs-on: macos-15-intel
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: "recursive"
           
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 243c344c13..18bcf258a0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: update llama.cpp to ggml-org/llama.cpp@94a220cd6
+- feat: add generic multimodal chat handler (`MTMDChatHandler`) by @abetlen in #2256
+- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c
 - feat(ci): add ROCm wheel builds by @abetlen in #2252
 - feat(ci): add Vulkan wheel builds by @abetlen in #2251
 - fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085
diff --git a/README.md b/README.md
index dd0024676a..5711d4afbb 100644
--- a/README.md
+++ b/README.md
@@ -536,6 +536,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |
 | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
 | [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
+| Any GGUF model with an mtmd projector and an embedded chat template (`tokenizer.chat_template` metadata) | `MTMDChatHandler` | `mtmd` |
 
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 44c6c1f76f..4298642f67 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3265,48 +3265,609 @@ def from_pretrained(
         )
 
 
-class Gemma4ChatHandler(Llava15ChatHandler):
-    DEFAULT_SYSTEM_MESSAGE = None
+class MTMDChatHandler:
+    """Generic multimodal chat handler built on the ``mtmd`` bindings.
+
+    The prompt is rendered from the model's ``tokenizer.chat_template``
+    metadata. Each ``image_url`` content part is replaced by the mtmd media
+    marker, and the images found by ``get_image_urls`` are handed to
+    ``mtmd_tokenize`` as bitmaps.
+    """
+
+    def __init__(
+        self, clip_model_path: str, verbose: bool = True, use_gpu: bool = True
+    ):
+        """Store the options; the mtmd context itself is created lazily.
+
+        Args:
+            clip_model_path: Path of the file later passed to
+                ``mtmd_init_from_file``.
+            verbose: Initial value of ``self.verbose``; ``_init_mtmd_context``
+                replaces it with ``llama_model.verbose``.
+            use_gpu: Copied into the mtmd context parameters.
+
+        Raises:
+            ValueError: If ``clip_model_path`` does not exist.
+        """
+        import llama_cpp.mtmd_cpp as mtmd_cpp
 
-    CHAT_FORMAT = (
-        "{% if messages and messages[0]['role'] == 'system' %}"
-        "{% if messages[0]['content'] is string %}"
-        "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}"
-        "{% else %}"
-        "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}"
-        "{% endif %}"
-        "{% set loop_messages = messages[1:] %}"
-        "{% else %}"
-        "{% set first_user_prefix = '' %}"
-        "{% set loop_messages = messages %}"
-        "{% endif %}"
-        "{% for message in loop_messages %}"
-        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
-        "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
-        "{% endif %}"
-        "{% set role = 'model' if message['role'] == 'assistant' else message['role'] %}"
-        "{{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else '') }}"
-        "{% if message['content'] is string %}"
-        "{{ message['content'] | trim }}"
-        "{% elif message['content'] is iterable %}"
-        "{% for item in message['content'] %}"
-        "{% if item['type'] == 'image_url' and item['image_url'] is string %}"
-        "{{ '\n\n' + item['image_url'] + '\n\n' }}"
-        "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}"
-        "{{ '\n\n' + item['image_url']['url'] + '\n\n' }}"
-        "{% elif item['type'] == 'text' %}"
-        "{{ item['text'] | trim }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% else %}"
-        "{{ raise_exception('Invalid content type') }}"
-        "{% endif %}"
-        "{{ '<end_of_turn>\n' }}"
-        "{% endfor %}"
-        "{% if add_generation_prompt %}"
-        "{{ '<start_of_turn>model\n' }}"
-        "{% endif %}"
-    )
+        self.clip_model_path = clip_model_path
+        self.verbose = verbose
+        self.use_gpu = use_gpu
+        self._mtmd_cpp = mtmd_cpp
+        self._exit_stack = ExitStack()
+        self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None
+
+        if not os.path.exists(clip_model_path):
+            raise ValueError(f"Clip model path does not exist: {clip_model_path}")
+
+    def _init_mtmd_context(self, llama_model: llama.Llama):
+        """Create the mtmd context for ``llama_model`` on first use.
+
+        ``self.verbose`` is overwritten with ``llama_model.verbose`` on every
+        call; the context is only built while ``self.mtmd_ctx`` is ``None``.
+
+        Raises:
+            ValueError: If the context cannot be loaded or lacks vision support.
+        """
+        self.verbose = llama_model.verbose
+        if self.mtmd_ctx is not None:
+            return
+
+        with suppress_stdout_stderr(disable=self.verbose):
+            ctx_params = self._mtmd_cpp.mtmd_context_params_default()
+            ctx_params.use_gpu = self.use_gpu
+            ctx_params.print_timings = self.verbose
+            ctx_params.n_threads = llama_model.n_threads
+            # Any llama flash-attention setting other than ENABLED maps to DISABLED.
+            ctx_params.flash_attn_type = (
+                llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+                if (
+                    llama_model.context_params.flash_attn_type
+                    == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+                )
+                else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+            )
+
+            self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
+                self.clip_model_path.encode(), llama_model.model, ctx_params
+            )
+
+            if self.mtmd_ctx is None:
+                raise ValueError(
+                    f"Failed to load mtmd context from: {self.clip_model_path}"
+                )
+
+            if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx):
+                # Free the context before raising: the cleanup callback is not
+                # registered yet, and a non-None ``self.mtmd_ctx`` would make the
+                # next call return early and skip this check.
+                self._mtmd_cpp.mtmd_free(self.mtmd_ctx)
+                self.mtmd_ctx = None
+                raise ValueError("Vision is not supported by this model")
+
+            def mtmd_free():
+                with suppress_stdout_stderr(disable=self.verbose):
+                    if self.mtmd_ctx is not None:
+                        self._mtmd_cpp.mtmd_free(self.mtmd_ctx)
+                        self.mtmd_ctx = None
+
+            # NOTE(review): this class never closes ``_exit_stack`` itself;
+            # confirm what triggers the callback.
+            self._exit_stack.callback(mtmd_free)
+
+    def load_image(self, image_url: str) -> bytes:
+        """Return the bytes for ``image_url`` by delegating to ``_load_image``."""
+        return self._load_image(image_url)
+
+    def _create_bitmap_from_bytes(self, image_bytes: bytes):
+        """Create an mtmd bitmap from ``image_bytes``.
+
+        Raises:
+            ValueError: If the context is not initialized or no bitmap is returned.
+        """
+        if self.mtmd_ctx is None:
+            raise ValueError("mtmd context not initialized")
+
+        with suppress_stdout_stderr(disable=self.verbose):
+            # ``from_buffer`` needs a writable buffer, hence the bytearray copy.
+            bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(
+                self.mtmd_ctx,
+                (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
+                len(image_bytes),
+            )
+
+            if bitmap is None:
+                raise ValueError("Failed to create bitmap from image bytes")
+
+            return bitmap
+
+    def _get_chat_template(self, llama_model: llama.Llama) -> str:
+        """Return the non-empty ``tokenizer.chat_template`` metadata string.
+
+        Raises:
+            ValueError: If the metadata entry is missing, empty or not a string.
+        """
+        chat_template = llama_model.metadata.get("tokenizer.chat_template")
+        if not isinstance(chat_template, str) or chat_template == "":
+            raise ValueError(
+                f"{self.__class__.__name__} requires tokenizer.chat_template metadata"
+            )
+        return chat_template
+
+    def _get_template_messages(
+        self,
+        messages: List[llama_types.ChatCompletionRequestMessage],
+        media_marker: str,
+    ) -> List[Any]:
+        """Convert every message with ``_convert_message_for_template``."""
+        return [
+            self._convert_message_for_template(message, media_marker)
+            for message in messages
+        ]
+
+    @classmethod
+    def _convert_message_for_template(
+        cls,
+        message: llama_types.ChatCompletionRequestMessage,
+        media_marker: str,
+    ) -> Dict[str, Any]:
+        """Return a shallow copy of ``message`` with list content converted.
+
+        Non-list content is left untouched.
+        """
+        message_dict = dict(message)
+        content = message_dict.get("content")
+        if isinstance(content, list):
+            message_dict["content"] = [
+                cls._convert_content_part_for_template(part, media_marker)
+                for part in content
+            ]
+        return message_dict
+
+    @staticmethod
+    def _convert_content_part_for_template(
+        part: Any,
+        media_marker: str,
+    ) -> Any:
+        """Replace an ``image_url`` part with a text part holding the marker."""
+        if isinstance(part, dict) and part.get("type") == "image_url":
+            return {"type": "text", "text": media_marker}
+        return part
+
+    @staticmethod
+    def _decode_token_piece(piece: Any) -> str:
+        """Decode ``bytes`` as UTF-8 (invalid bytes dropped); ``str()`` otherwise."""
+        if isinstance(piece, bytes):
+            return piece.decode("utf-8", errors="ignore")
+        return str(piece)
+
+    def _postprocess_template_text(
+        self,
+        text: str,
+        image_urls: List[str],
+        media_marker: str,
+    ) -> str:
+        """Replace every occurrence of each image URL in ``text`` with the marker."""
+        for image_url in image_urls:
+            text = text.replace(image_url, media_marker)
+        return text
+
+    def __call__(
+        self,
+        *,
+        llama: llama.Llama,
+        messages: List[llama_types.ChatCompletionRequestMessage],
+        functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+        function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+        temperature: float = 0.2,
+        top_p: float = 0.95,
+        top_k: int = 40,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        stream: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        seed: Optional[int] = None,
+        response_format: Optional[
+            llama_types.ChatCompletionRequestResponseFormat
+        ] = None,
+        max_tokens: Optional[int] = None,
+        presence_penalty: float = 0.0,
+        frequency_penalty: float = 0.0,
+        repeat_penalty: float = 1.1,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        logits_processor: Optional[llama.LogitsProcessorList] = None,
+        grammar: Optional[llama.LlamaGrammar] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
+        **kwargs,  # type: ignore
+    ) -> Union[
+        llama_types.CreateChatCompletionResponse,
+        Iterator[llama_types.CreateChatCompletionStreamResponse],
+    ]:
+        """Render the chat template, evaluate text and media, then complete.
+
+        The rendered prompt is tokenized by ``mtmd_tokenize`` and the chunks are
+        evaluated into ``llama``; the resulting token ids are then passed to
+        ``llama.create_completion`` together with the sampling arguments.
+
+        Raises:
+            ValueError: If tokenization or chunk evaluation fails, the prompt
+                does not fit in ``n_ctx``, or ``tool_choice`` names an unknown tool.
+        """
+        self._init_mtmd_context(llama)
+        assert self.mtmd_ctx is not None
+
+        # NOTE(review): ``get_image_urls`` only scans user messages, whereas the
+        # template conversion replaces ``image_url`` parts in every message;
+        # confirm images never appear under other roles.
+        image_urls = self.get_image_urls(messages)
+        media_marker = self._mtmd_cpp.mtmd_default_marker().decode("utf-8")
+        template_env = ImmutableSandboxedEnvironment(
+            trim_blocks=True,
+            lstrip_blocks=True,
+            extensions=[
+                Jinja2ChatFormatter.IgnoreGenerationTags,
+                jinja2.ext.loopcontrols,
+            ],
+        )
+        template_env.filters["tojson"] = Jinja2ChatFormatter.tojson
+        template = template_env.from_string(self._get_chat_template(llama))
+
+        def raise_exception(message: str):
+            raise ValueError(message)
+
+        text = template.render(
+            messages=self._get_template_messages(messages, media_marker),
+            add_generation_prompt=True,
+            eos_token=self._decode_token_piece(llama.detokenize([llama.token_eos()])),
+            bos_token=self._decode_token_piece(llama.detokenize([llama.token_bos()])),
+            raise_exception=raise_exception,
+            functions=functions,
+            function_call=function_call,
+            tools=tools,
+            tool_choice=tool_choice,
+            strftime_now=Jinja2ChatFormatter.strftime_now,
+            **kwargs,
+        )
+        text = self._postprocess_template_text(text, image_urls, media_marker)
+
+        if self.verbose:
+            print(text, file=sys.stderr)
+
+        bitmaps = []
+        try:
+            for image_url in image_urls:
+                image_bytes = self.load_image(image_url)
+                bitmaps.append(self._create_bitmap_from_bytes(image_bytes))
+
+            input_text = self._mtmd_cpp.mtmd_input_text()
+            input_text.text = text.encode("utf-8")
+            input_text.add_special = True
+            input_text.parse_special = True
+
+            chunks = self._mtmd_cpp.mtmd_input_chunks_init()
+            if chunks is None:
+                raise ValueError("Failed to create input chunks")
+
+            try:
+                bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(
+                    *bitmaps
+                )
+                result = self._mtmd_cpp.mtmd_tokenize(
+                    self.mtmd_ctx,
+                    chunks,
+                    ctypes.byref(input_text),
+                    bitmap_array,
+                    len(bitmaps),
+                )
+
+                if result != 0:
+                    raise ValueError(f"Failed to tokenize input: error code {result}")
+
+                # Start from an empty context: the whole prompt is re-evaluated.
+                llama.reset()
+                llama._ctx.kv_cache_clear()
+
+                n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)
+
+                for i in range(n_chunks):
+                    chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
+                    if chunk is None:
+                        continue
+
+                    chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
+
+                    if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT:
+                        n_tokens_out = ctypes.c_size_t()
+                        tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(
+                            chunk, ctypes.byref(n_tokens_out)
+                        )
+
+                        if tokens_ptr and n_tokens_out.value > 0:
+                            tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
+
+                            if llama.n_tokens + len(tokens) > llama.n_ctx():
+                                raise ValueError(
+                                    f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}"
+                                )
+                            llama.eval(tokens)
+
+                    elif chunk_type in [
+                        self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                        self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO,
+                    ]:
+                        chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(
+                            chunk
+                        )
+
+                        if llama.n_tokens + chunk_n_tokens > llama.n_ctx():
+                            raise ValueError(
+                                f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}"
+                            )
+
+                        new_n_past = llama_cpp.llama_pos(0)
+                        result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
+                            self.mtmd_ctx,
+                            llama._ctx.ctx,
+                            chunk,
+                            llama_cpp.llama_pos(llama.n_tokens),
+                            llama_cpp.llama_seq_id(0),
+                            llama.n_batch,
+                            False,  # logits_last
+                            ctypes.byref(new_n_past),
+                        )
+
+                        if result != 0:
+                            raise ValueError(
+                                f"Failed to evaluate chunk: error code {result}"
+                            )
+
+                        # Adopt the position written to ``new_n_past`` by the helper.
+                        llama.n_tokens = new_n_past.value
+
+                prompt = llama.input_ids[: llama.n_tokens].tolist()
+
+            finally:
+                self._mtmd_cpp.mtmd_input_chunks_free(chunks)
+
+        finally:
+            for bitmap in bitmaps:
+                self._mtmd_cpp.mtmd_bitmap_free(bitmap)
+
+        if response_format is not None and response_format["type"] == "json_object":
+            grammar = _grammar_for_response_format(response_format)
+
+        # Translate ``functions`` / ``function_call`` into ``tools`` / ``tool_choice``.
+        if functions is not None:
+            tools = [
+                {
+                    "type": "function",
+                    "function": function,
+                }
+                for function in functions
+            ]
+
+        if function_call is not None:
+            if isinstance(function_call, str) and (
+                function_call == "none" or function_call == "auto"
+            ):
+                tool_choice = function_call
+            if isinstance(function_call, dict) and "name" in function_call:
+                tool_choice = {
+                    "type": "function",
+                    "function": {
+                        "name": function_call["name"],
+                    },
+                }
+
+        tool = None
+        if (
+            tool_choice is not None
+            and isinstance(tool_choice, dict)
+            and tools is not None
+        ):
+            name = tool_choice["function"]["name"]
+            tool = next((t for t in tools if t["function"]["name"] == name), None)
+            if tool is None:
+                raise ValueError(f"Tool choice '{name}' not found in tools.")
+            schema = tool["function"]["parameters"]
+            try:
+                grammar = llama_grammar.LlamaGrammar.from_json_schema(
+                    json.dumps(schema), verbose=llama.verbose
+                )
+            except Exception as e:
+                # Fall back to the generic JSON grammar when the schema is rejected.
+                if llama.verbose:
+                    print(str(e), file=sys.stderr)
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.JSON_GBNF, verbose=llama.verbose
+                )
+
+        completion_or_chunks = llama.create_completion(
+            prompt=prompt,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            typical_p=typical_p,
+            logprobs=top_logprobs if logprobs else None,
+            stream=stream,
+            stop=stop,
+            seed=seed,
+            max_tokens=max_tokens,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            repeat_penalty=repeat_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            model=model,
+            logits_processor=logits_processor,
+            grammar=grammar,
+            logit_bias=logit_bias,
+        )
+
+        if tool is not None:
+            tool_name = tool["function"]["name"]
+            return _convert_completion_to_chat_function(
+                tool_name, completion_or_chunks, stream
+            )
+        return _convert_completion_to_chat(completion_or_chunks, stream=stream)
+
+    @staticmethod
+    def _load_image(image_url: str) -> bytes:
+        """Return image bytes from a ``data:`` URL or by fetching ``image_url``.
+
+        ``data:`` URLs are split on ``","`` and the second field is base64-decoded;
+        any other value is opened with ``urllib.request.urlopen``.
+        """
+        if image_url.startswith("data:"):
+            import base64
+
+            image_bytes = base64.b64decode(image_url.split(",")[1])
+            return image_bytes
+        else:
+            import urllib.request
+
+            # NOTE(review): no timeout or scheme restriction is applied here;
+            # confirm that callers only pass trusted URLs.
+            with urllib.request.urlopen(image_url) as f:
+                image_bytes = f.read()
+                return image_bytes
+
+    @staticmethod
+    def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]):
+        """Collect image URLs from the ``image_url`` parts of user messages.
+
+        Accepts both ``{"image_url": {"url": ...}}`` and ``{"image_url": ...}``
+        parts; messages with other roles or ``None`` content are skipped.
+        """
+        image_urls: List[str] = []
+        for message in messages:
+            if message["role"] == "user":
+                if message["content"] is None:
+                    continue
+                for content in message["content"]:
+                    if isinstance(content, dict) and "type" in content:
+                        if content["type"] == "image_url":
+                            if (
+                                isinstance(content["image_url"], dict)
+                                and "url" in content["image_url"]
+                            ):
+                                image_urls.append(content["image_url"]["url"])
+                            else:
+                                image_urls.append(content["image_url"])
+        return image_urls
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        repo_id: str,
+        filename: Optional[str],
+        local_dir: Optional[Union[str, os.PathLike[str]]] = None,
+        local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+        cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
+        **kwargs: Any,
+    ) -> "MTMDChatHandler":
+        """Download one file from a Hugging Face repo and build a handler from it.
+
+        Args:
+            repo_id: Repository to list and download from.
+            filename: ``fnmatch`` pattern that must match exactly one listed file.
+            local_dir: Forwarded to ``hf_hub_download``; when set, the returned
+                handler points at the file below this directory.
+            local_dir_use_symlinks: Forwarded to ``hf_hub_download``.
+            cache_dir: Forwarded to ``hf_hub_download``.
+            **kwargs: Forwarded to the class constructor.
+
+        Raises:
+            ImportError: If ``huggingface_hub`` is not installed.
+            ValueError: If no file, or more than one file, matches ``filename``.
+        """
+        import fnmatch
+        from pathlib import Path
+
+        try:
+            from huggingface_hub import hf_hub_download, HfFileSystem  # type: ignore
+            from huggingface_hub.utils import validate_repo_id  # type: ignore
+        except ImportError:
+            raise ImportError(
+                f"{cls.__name__}.from_pretrained requires the huggingface-hub package. "
+                "You can install it with `pip install huggingface-hub`."
+            )
+
+        validate_repo_id(repo_id)
+
+        hffs = HfFileSystem()
+
+        files = [
+            file["name"] if isinstance(file, dict) else file
+            for file in hffs.ls(repo_id)  # type: ignore
+        ]
+
+        file_list: List[str] = []
+        for file in files:
+            rel_path = Path(file).relative_to(repo_id)
+            file_list.append(str(rel_path))
+
+        matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]  # type: ignore
+
+        if len(matching_files) == 0:
+            raise ValueError(
+                f"No file found in {repo_id} that match {filename}\n\n"
+                f"Available Files:\n{json.dumps(file_list)}"
+            )
+
+        if len(matching_files) > 1:
+            raise ValueError(
+                f"Multiple files found in {repo_id} matching {filename}\n\n"
+                f"Available Files:\n{json.dumps(file_list)}"
+            )
+
+        (matching_file,) = matching_files
+
+        subfolder = str(Path(matching_file).parent)
+        filename = Path(matching_file).name
+
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            subfolder=subfolder,
+            local_dir=cast(Union[str, Path, None], local_dir),
+            local_dir_use_symlinks=local_dir_use_symlinks,
+            cache_dir=cast(Union[str, Path, None], cache_dir),
+        )
+
+        if local_dir is None:
+            # Second call only resolves the path: ``local_files_only=True``.
+            model_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=filename,
+                subfolder=subfolder,
+                local_dir=local_dir,
+                local_dir_use_symlinks=local_dir_use_symlinks,
+                cache_dir=cast(Union[str, Path, None], cache_dir),
+                local_files_only=True,
+            )
+        else:
+            # Keep the subfolder that was passed to ``hf_hub_download`` above.
+            model_path = os.path.join(local_dir, matching_file)
+
+        return cls(
+            clip_model_path=model_path,
+            **kwargs,
+        )
+
+
+class Gemma4ChatHandler(MTMDChatHandler):
+    """Gemma 4 chat handler; all behavior is inherited from ``MTMDChatHandler``."""
 
 
 class ObsidianChatHandler(Llava15ChatHandler):
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 3222abd631..8aa929202c 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -115,18 +115,21 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
                 chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler(
                     clip_model_path=settings.clip_model_path, verbose=settings.verbose
                 )
-        elif settings.chat_format == "gemma4":
+        elif settings.chat_format in ("mtmd", "gemma4"):
             assert settings.clip_model_path is not None, "clip model not found"
+            chat_handler_cls = (
+                llama_cpp.llama_chat_format.MTMDChatHandler
+                if settings.chat_format == "mtmd"
+                else llama_cpp.llama_chat_format.Gemma4ChatHandler
+            )
             if settings.hf_model_repo_id is not None:
-                chat_handler = (
-                    llama_cpp.llama_chat_format.Gemma4ChatHandler.from_pretrained(
-                        repo_id=settings.hf_model_repo_id,
-                        filename=settings.clip_model_path,
-                        verbose=settings.verbose,
-                    )
+                chat_handler = chat_handler_cls.from_pretrained(
+                    repo_id=settings.hf_model_repo_id,
+                    filename=settings.clip_model_path,
+                    verbose=settings.verbose,
                 )
             else:
-                chat_handler = llama_cpp.llama_chat_format.Gemma4ChatHandler(
+                chat_handler = chat_handler_cls(
                     clip_model_path=settings.clip_model_path, verbose=settings.verbose
                 )
         elif settings.chat_format == "moondream":
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 94a220cd67..e3ba22d6cc 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 94a220cd6745e6e3f8de62870b66fd5b9bc92700
+Subproject commit e3ba22d6cc4dec84e59a909c7f96e1689c7384a9