diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 81c9a961b4..0b974e93c8 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -14,7 +14,7 @@ jobs: os: [ubuntu-22.04, windows-2022, macos-14, macos-15] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -75,7 +75,7 @@ jobs: name: Build arm64 wheels runs-on: ubuntu-24.04-arm steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -123,7 +123,7 @@ jobs: build: "cp314-*" artifact: wheels_riscv64_cp314 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -158,7 +158,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -202,7 +202,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: merge-multiple: true path: dist diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 3dfef68008..349eb209dd 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: "recursive" @@ -32,7 +32,7 @@ jobs: uses: docker/setup-buildx-action@v4 - name: Login to GitHub Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ghcr.io username: ${{ github.repository_owner }} diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 3e67d14cba..f5b3718f17 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -63,7 +63,7 @@ jobs: with: arch: x64 - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -73,7 +73,7 @@ jobs: cache: 'pip' - name: Setup Mamba - uses: conda-incubator/setup-miniconda@v3.1.0 + uses: conda-incubator/setup-miniconda@v4.0.1 with: activate-environment: "llamacpp" python-version: ${{ matrix.pyver }} diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index bf2c8bc124..ca8e322230 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -14,7 +14,7 @@ jobs: os: [macos-14, macos-15] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -54,7 +54,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: merge-multiple: true path: dist2 diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml index 0971953886..792e33cf89 100644 --- a/.github/workflows/build-wheels-rocm.yaml +++ b/.github/workflows/build-wheels-rocm.yaml @@ -26,7 +26,7 @@ jobs: apt-get update apt-get install -y --no-install-recommends git cmake lsb-release ninja-build - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -82,7 +82,7 @@ jobs: amdgpu_targets: gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -203,7 +203,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: merge-multiple: true path: dist @@ -223,7 +223,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: merge-multiple: true path: dist diff --git a/.github/workflows/build-wheels-vulkan.yaml b/.github/workflows/build-wheels-vulkan.yaml index 760205c839..49d38fb80b 100644 --- a/.github/workflows/build-wheels-vulkan.yaml +++ b/.github/workflows/build-wheels-vulkan.yaml @@ -31,7 +31,7 @@ jobs: with: arch: x64 - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -109,7 +109,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: merge-multiple: true path: dist diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index c93e0be351..6b2c8770fa 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -31,7 +31,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Setup Pages uses: actions/configure-pages@v5 - name: Build @@ -57,4 +57,4 @@ jobs: path: 'index' - name: Deploy to GitHub Pages id: deployment - uses: actions/deploy-pages@v4 + uses: actions/deploy-pages@v5 diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 3c6f5ff45e..ac39181f90 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -12,7 +12,7 @@ jobs: ruff: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml index 54572bdad9..70ae5389af 100644 --- a/.github/workflows/publish-to-test.yaml +++ b/.github/workflows/publish-to-test.yaml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 3c2ea56d68..8908ccdd6f 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ecba1b40b0..82f7a8a7de 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,7 +49,7 @@ jobs: matrix: python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -80,7 +80,7 @@ jobs: matrix: python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -114,7 +114,7 @@ jobs: matrix: python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" @@ -151,7 +151,7 @@ jobs: needs: download-model runs-on: macos-15-intel steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: "recursive" diff --git a/CHANGELOG.md b/CHANGELOG.md index 243c344c13..18bcf258a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: update llama.cpp to ggml-org/llama.cpp@94a220cd6 +- feat: Generic Multimodal Chat Handler by @abetlen in #2256 +- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c - feat(ci): add ROCm wheel builds by @abetlen in #2252 - feat(ci): add Vulkan wheel builds by @abetlen in #2251 - fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085 diff --git a/README.md b/README.md index dd0024676a..5711d4afbb 100644 --- a/README.md +++ b/README.md @@ -536,6 +536,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | | [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | +| GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 44c6c1f76f..4298642f67 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3265,48 +3265,512 @@ def from_pretrained( ) -class Gemma4ChatHandler(Llava15ChatHandler): - DEFAULT_SYSTEM_MESSAGE = None +class MTMDChatHandler: + def __init__( + self, clip_model_path: str, verbose: bool = True, use_gpu: bool = True + ): + import llama_cpp.mtmd_cpp as mtmd_cpp - CHAT_FORMAT = ( - "{% if messages and messages[0]['role'] == 'system' %}" - "{% if messages[0]['content'] is string %}" - "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" - "{% else %}" - "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" - "{% endif %}" - "{% set loop_messages = messages[1:] %}" - "{% else %}" - "{% set first_user_prefix = '' %}" - "{% set loop_messages = messages %}" - "{% endif %}" - "{% for message in loop_messages %}" - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% set role = 'model' if message['role'] == 'assistant' else message['role'] %}" - "{{ '' + role + '\n' + (first_user_prefix if loop.first else '') }}" - "{% if message['content'] is string %}" - "{{ message['content'] | trim }}" - "{% elif message['content'] is iterable %}" - "{% for item in message['content'] %}" - "{% if item['type'] == 'image_url' and item['image_url'] is string %}" - "{{ '\n\n' + item['image_url'] + '\n\n' }}" - "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" - "{{ '\n\n' + item['image_url']['url'] + '\n\n' }}" - "{% elif item['type'] == 'text' %}" - "{{ item['text'] | trim }}" - "{% endif %}" - "{% endfor %}" - "{% else %}" - "{{ raise_exception('Invalid content type') }}" - "{% endif %}" - "{{ '\n' }}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ 'model\n' }}" - "{% endif %}" - ) + self.clip_model_path = clip_model_path + self.verbose = verbose + self.use_gpu = use_gpu + self._mtmd_cpp = mtmd_cpp + self._exit_stack = ExitStack() + self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None + + if not os.path.exists(clip_model_path): + raise ValueError(f"Clip model path does not exist: {clip_model_path}") + + def _init_mtmd_context(self, llama_model: llama.Llama): + self.verbose = llama_model.verbose + if self.mtmd_ctx is not None: + return + + with suppress_stdout_stderr(disable=self.verbose): + ctx_params = self._mtmd_cpp.mtmd_context_params_default() + ctx_params.use_gpu = self.use_gpu + ctx_params.print_timings = self.verbose + ctx_params.n_threads = llama_model.n_threads + ctx_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + if ( + llama_model.context_params.flash_attn_type + == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + ) + else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) + + self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( + self.clip_model_path.encode(), llama_model.model, ctx_params + ) + + if self.mtmd_ctx is None: + raise ValueError( + f"Failed to load mtmd context from: {self.clip_model_path}" + ) + + if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): + raise ValueError("Vision is not supported by this model") + + def mtmd_free(): + with suppress_stdout_stderr(disable=self.verbose): + if self.mtmd_ctx is not None: + self._mtmd_cpp.mtmd_free(self.mtmd_ctx) + self.mtmd_ctx = None + + self._exit_stack.callback(mtmd_free) + + def load_image(self, image_url: str) -> bytes: + return self._load_image(image_url) + + def _create_bitmap_from_bytes(self, image_bytes: bytes): + if self.mtmd_ctx is None: + raise ValueError("mtmd context not initialized") + + with suppress_stdout_stderr(disable=self.verbose): + bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( + self.mtmd_ctx, + (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), + len(image_bytes), + ) + + if bitmap is None: + raise ValueError("Failed to create bitmap from image bytes") + + return bitmap + + def _get_chat_template(self, llama_model: llama.Llama) -> str: + chat_template = llama_model.metadata.get("tokenizer.chat_template") + if not isinstance(chat_template, str) or chat_template == "": + raise ValueError( + f"{self.__class__.__name__} requires tokenizer.chat_template metadata" + ) + return chat_template + + def _get_template_messages( + self, + messages: List[llama_types.ChatCompletionRequestMessage], + media_marker: str, + ) -> List[Any]: + return [ + self._convert_message_for_template(message, media_marker) + for message in messages + ] + + @classmethod + def _convert_message_for_template( + cls, + message: llama_types.ChatCompletionRequestMessage, + media_marker: str, + ) -> Dict[str, Any]: + message_dict = dict(message) + content = message_dict.get("content") + if isinstance(content, list): + message_dict["content"] = [ + cls._convert_content_part_for_template(part, media_marker) + for part in content + ] + return message_dict + + @staticmethod + def _convert_content_part_for_template( + part: Any, + media_marker: str, + ) -> Any: + if isinstance(part, dict) and part.get("type") == "image_url": + return {"type": "text", "text": media_marker} + return part + + @staticmethod + def _decode_token_piece(piece: Any) -> str: + if isinstance(piece, bytes): + return piece.decode("utf-8", errors="ignore") + return str(piece) + + def _postprocess_template_text( + self, + text: str, + image_urls: List[str], + media_marker: str, + ) -> str: + for image_url in image_urls: + text = text.replace(image_url, media_marker) + return text + + def __call__( + self, + *, + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + seed: Optional[int] = None, + response_format: Optional[ + llama_types.ChatCompletionRequestResponseFormat + ] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs, # type: ignore + ) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], + ]: + self._init_mtmd_context(llama) + assert self.mtmd_ctx is not None + + image_urls = self.get_image_urls(messages) + media_marker = self._mtmd_cpp.mtmd_default_marker().decode("utf-8") + template_env = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True, + extensions=[ + Jinja2ChatFormatter.IgnoreGenerationTags, + jinja2.ext.loopcontrols, + ], + ) + template_env.filters["tojson"] = Jinja2ChatFormatter.tojson + template = template_env.from_string(self._get_chat_template(llama)) + + def raise_exception(message: str): + raise ValueError(message) + + text = template.render( + messages=self._get_template_messages(messages, media_marker), + add_generation_prompt=True, + eos_token=self._decode_token_piece(llama.detokenize([llama.token_eos()])), + bos_token=self._decode_token_piece(llama.detokenize([llama.token_bos()])), + raise_exception=raise_exception, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, + strftime_now=Jinja2ChatFormatter.strftime_now, + **kwargs, + ) + text = self._postprocess_template_text(text, image_urls, media_marker) + + if self.verbose: + print(text, file=sys.stderr) + + bitmaps = [] + bitmap_cleanup = [] + try: + for image_url in image_urls: + image_bytes = self.load_image(image_url) + bitmap = self._create_bitmap_from_bytes(image_bytes) + bitmaps.append(bitmap) + bitmap_cleanup.append(bitmap) + + input_text = self._mtmd_cpp.mtmd_input_text() + input_text.text = text.encode("utf-8") + input_text.add_special = True + input_text.parse_special = True + + chunks = self._mtmd_cpp.mtmd_input_chunks_init() + if chunks is None: + raise ValueError("Failed to create input chunks") + + try: + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))( + *bitmaps + ) + result = self._mtmd_cpp.mtmd_tokenize( + self.mtmd_ctx, + chunks, + ctypes.byref(input_text), + bitmap_array, + len(bitmaps), + ) + + if result != 0: + raise ValueError(f"Failed to tokenize input: error code {result}") + + llama.reset() + llama._ctx.kv_cache_clear() + + n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) + + for i in range(n_chunks): + chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) + if chunk is None: + continue + + chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) + + if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT: + n_tokens_out = ctypes.c_size_t() + tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( + chunk, ctypes.byref(n_tokens_out) + ) + + if tokens_ptr and n_tokens_out.value > 0: + tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] + + if llama.n_tokens + len(tokens) > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" + ) + llama.eval(tokens) + + elif chunk_type in [ + self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, + self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO, + ]: + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens( + chunk + ) + + if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" + ) + + new_n_past = llama_cpp.llama_pos(0) + result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( + self.mtmd_ctx, + llama._ctx.ctx, + chunk, + llama_cpp.llama_pos(llama.n_tokens), + llama_cpp.llama_seq_id(0), + llama.n_batch, + False, # logits_last + ctypes.byref(new_n_past), + ) + + if result != 0: + raise ValueError( + f"Failed to evaluate chunk: error code {result}" + ) + + llama.n_tokens = new_n_past.value + + prompt = llama.input_ids[: llama.n_tokens].tolist() + + finally: + self._mtmd_cpp.mtmd_input_chunks_free(chunks) + + finally: + for bitmap in bitmap_cleanup: + self._mtmd_cpp.mtmd_bitmap_free(bitmap) + + if response_format is not None and response_format["type"] == "json_object": + grammar = _grammar_for_response_format(response_format) + + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if ( + tool_choice is not None + and isinstance(tool_choice, dict) + and tools is not None + ): + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + if llama.verbose: + print(str(e), file=sys.stderr) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + logprobs=top_logprobs if logprobs else None, + stream=stream, + stop=stop, + seed=seed, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + logit_bias=logit_bias, + ) + + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + return _convert_completion_to_chat(completion_or_chunks, stream=stream) + + @staticmethod + def _load_image(image_url: str) -> bytes: + if image_url.startswith("data:"): + import base64 + + image_bytes = base64.b64decode(image_url.split(",")[1]) + return image_bytes + else: + import urllib.request + + with urllib.request.urlopen(image_url) as f: + image_bytes = f.read() + return image_bytes + + @staticmethod + def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): + image_urls: List[str] = [] + for message in messages: + if message["role"] == "user": + if message["content"] is None: + continue + for content in message["content"]: + if isinstance(content, dict) and "type" in content: + if content["type"] == "image_url": + if ( + isinstance(content["image_url"], dict) + and "url" in content["image_url"] + ): + image_urls.append(content["image_url"]["url"]) + else: + image_urls.append(content["image_url"]) + return image_urls + + @classmethod + def from_pretrained( + cls, + repo_id: str, + filename: Optional[str], + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + cache_dir: Optional[Union[str, os.PathLike[str]]] = None, + **kwargs: Any, + ) -> "MTMDChatHandler": + import fnmatch + from pathlib import Path + + try: + from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore + from huggingface_hub.utils import validate_repo_id # type: ignore + except ImportError: + raise ImportError( + "Llama.from_pretrained requires the huggingface-hub package. " + "You can install it with `pip install huggingface-hub`." + ) + + validate_repo_id(repo_id) + + hffs = HfFileSystem() + + files = [ + file["name"] if isinstance(file, dict) else file + for file in hffs.ls(repo_id) # type: ignore + ] + + file_list: List[str] = [] + for file in files: + rel_path = Path(file).relative_to(repo_id) + file_list.append(str(rel_path)) + + matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore + + if len(matching_files) == 0: + raise ValueError( + f"No file found in {repo_id} that match {filename}\n\n" + f"Available Files:\n{json.dumps(file_list)}" + ) + + if len(matching_files) > 1: + raise ValueError( + f"Multiple files found in {repo_id} matching {filename}\n\n" + f"Available Files:\n{json.dumps(files)}" + ) + + (matching_file,) = matching_files + + subfolder = str(Path(matching_file).parent) + filename = Path(matching_file).name + + hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=cast(Union[str, Path, None], local_dir), + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + ) + + if local_dir is None: + model_path = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cast(Union[str, Path, None], cache_dir), + local_files_only=True, + ) + else: + model_path = os.path.join(local_dir, filename) + + return cls( + clip_model_path=model_path, + **kwargs, + ) + + +class Gemma4ChatHandler(MTMDChatHandler): + pass class ObsidianChatHandler(Llava15ChatHandler): diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 3222abd631..8aa929202c 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -115,18 +115,21 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) - elif settings.chat_format == "gemma4": + elif settings.chat_format in ("mtmd", "gemma4"): assert settings.clip_model_path is not None, "clip model not found" + chat_handler_cls = ( + llama_cpp.llama_chat_format.MTMDChatHandler + if settings.chat_format == "mtmd" + else llama_cpp.llama_chat_format.Gemma4ChatHandler + ) if settings.hf_model_repo_id is not None: - chat_handler = ( - llama_cpp.llama_chat_format.Gemma4ChatHandler.from_pretrained( - repo_id=settings.hf_model_repo_id, - filename=settings.clip_model_path, - verbose=settings.verbose, - ) + chat_handler = chat_handler_cls.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, ) else: - chat_handler = llama_cpp.llama_chat_format.Gemma4ChatHandler( + chat_handler = chat_handler_cls( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "moondream": diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 94a220cd67..e3ba22d6cc 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 94a220cd6745e6e3f8de62870b66fd5b9bc92700 +Subproject commit e3ba22d6cc4dec84e59a909c7f96e1689c7384a9