diff --git a/SECURITY.md b/SECURITY.md index 6a1bb6c32cd..9370fb1a883 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru ### Untrusted environments or networks If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions: -* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value +* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061). +* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value. * Encrypt your data if sending it over the network. ### Multi-Tenant environments diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6d34541a3ce..645bdad9b57 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) + architectures = hparams.get("architectures") if "text_config" in hparams: hparams = {**hparams, **hparams["text_config"]} + if architectures is not None: + # preserve "architectures" from root level config + hparams["architectures"] = architectures return hparams @classmethod @@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab class VisionModel(ModelBase): model_arch = gguf.MODEL_ARCH.CLIP_VISION n_text_embd = 0 + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs): if "vision_config" not in self.hparams: raise ValueError("vision_config not found in hparams") - # move vision config to the top level + # move vision config to the top level, while preserving the original hparams in global_config + self.global_config = self.hparams self.hparams = self.hparams["vision_config"] + # load preprocessor config + with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: + self.preprocessor_config = json.load(f) + def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text) - self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + self.gguf_writer.add_vision_has_vision_encoder(True) # vision config - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"])) + self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"])) + self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"])) + + # preprocessor config + self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_mean"]) def write_vocab(self): raise ValueError("VisionModel does not support vocab writing") @@ -1703,11 +1718,23 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@ModelBase.register( + "LLaMAForCausalLM", + "LlamaForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + "Idefics3ForConditionalGeneration", + "SmolVLMForConditionalGeneration") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if is_vision_tensor: + return [] # skip vision tensors + elif name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM if self.undo_permute: if name.endswith(("q_proj.weight", "q_proj.bias")): @@ -1852,6 +1885,41 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") +class SmolVLMModel(VisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code + if self.hparams["model_type"] == "smolvlm_vision": + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) + self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if is_vision_tensor: + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + @ModelBase.register("Llama4ForConditionalGeneration") class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 @@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3") + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3) # default values below are taken from HF tranformers code - self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5]) - self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5]) - self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) def tensor_force_quant(self, name, new_name, bid, n_dims): del bid, new_name, n_dims # unused @@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # process vision tensors name = name.replace("_weight", ".weight") - if "fc1" in name: - name = name.replace("fc1", "fc2") - else: - name = name.replace("fc2", "fc1") # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector # the other norm values are part of SigLIP model, and they are already correct diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 180ae9880b1..faa8a9d5e9d 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -33,13 +33,13 @@ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" #define KEY_PROJ_DIM "clip.%s.projection_dim" #define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -72,6 +72,7 @@ #define TN_IMAGE_NEWLINE "model.image_newline" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" @@ -99,6 +100,7 @@ enum projector_type { PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_UNKNOWN, }; @@ -110,6 +112,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 759706156d5..ad39e1bd619 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -159,6 +159,7 @@ struct clip_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + int32_t proj_scale_factor = 0; // idefics3 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; @@ -506,6 +507,35 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im embeddings = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), embeddings); + + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + + ggml_tensor * cur = embeddings; + const int scale_factor = model.hparams.proj_scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = std::sqrt(seq); + GGML_ASSERT(scale_factor != 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + embeddings = cur; + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); } // build the graph @@ -1081,12 +1111,20 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return clip_image_build_graph_siglip(ctx, imgs); - } else { - // TODO: we should have one build_* function per model - return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + ggml_cgraph * res; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + { + res = clip_image_build_graph_siglip(ctx, imgs); + } break; + default: + { + // TODO: we should have one build_* function per model + res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + } break; } + return res; } struct clip_model_loader { @@ -1147,6 +1185,8 @@ struct clip_model_loader { } void load_hparams() { + auto & hparams = ctx_clip.vision_model.hparams; + // projector type { std::string proj_type; @@ -1177,7 +1217,6 @@ struct clip_model_loader { get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - auto & hparams = ctx_clip.vision_model.hparams; get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); @@ -1233,6 +1272,16 @@ struct clip_model_loader { LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } + + // model-specific params + switch (ctx_clip.proj_type) { + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + default: + break; + } } void load_tensors() { @@ -1422,6 +1471,10 @@ struct clip_model_loader { vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; + case PROJECTOR_TYPE_IDEFICS3: + { + vision_model.projection = get_tensor(TN_MM_PROJECTOR); + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -2195,10 +2248,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str return true; } - if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + if (ctx->has_glm_projector + || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; int sz = params.image_size; - image_manipulation::bicubic_resize(*img, resized_image, sz, sz); + image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); clip_image_f32_ptr img_f32(clip_image_f32_init()); //clip_image_save_to_bmp(resized_image, "resized.bmp"); normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); @@ -2330,6 +2385,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i n_patches = x_patch * y_patch; } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { n_patches = 256; + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + n_patches /= ctx->vision_model.hparams.proj_scale_factor; } return n_patches; @@ -2597,6 +2654,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { // do nothing } + else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // do nothing + } else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); @@ -2783,37 +2843,34 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - return ctx->vision_model.mm_model_peg_0_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - return 4096; - } - else if (ctx->minicpmv_version == 3) { - return 3584; - } - else if (ctx->minicpmv_version == 4) { - return 3584; - } - } - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - return ctx->vision_model.mm_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return ctx->vision_model.mm_input_proj_w->ne[0]; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + return ctx->vision_model.mm_2_b->ne[0]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->vision_model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_RESAMPLER: + if (ctx->minicpmv_version == 2) { + return 4096; + } else if (ctx->minicpmv_version == 3) { + return 3584; + } else if (ctx->minicpmv_version == 4) { + return 3584; + } + break; // Should not happen if version is valid + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->vision_model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_MERGER: + return ctx->vision_model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_GEMMA3: + return ctx->vision_model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->vision_model.projection->ne[1]; + default: + break; // Fall through to throw } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 8866c12e4dd..c3fb2f18a2b 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -176,6 +176,8 @@ int32_t mtmd_tokenize(mtmd_context * ctx, std::string prompt_modified(text.text); std::string marker_modified(ctx->image_marker); + projector_type proj_type = clip_get_projector_type(ctx->ctx_clip); + // a bit hacky here, but works for now // for some models, we need to add prefix and suffix to the image embeddings if (clip_is_gemma3(ctx->ctx_clip)) { @@ -183,6 +185,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx, // ... (image embeddings) ... marker_modified = "" + ctx->image_marker + ""; string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 + marker_modified = "" + ctx->image_marker + ""; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); } // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index 61ebb3ac18e..8752fc267a8 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -28,6 +28,9 @@ add_test() { arr_tmpl+=("$tmpl") } +add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0" +add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0" add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek" add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M" @@ -39,7 +42,13 @@ add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" -# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" # this model has broken chat template, not usable +# these models always give the wrong answer, not sure why +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M" +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0" +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0" + +# this model has broken chat template, not usable +# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" ############### diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 266d8af4693..d92392edb7e 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,8 +44,8 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; + id mtl_device; + int mtl_device_ref_count; id mtl_library; bool has_simdgroup_reduction; @@ -490,7 +490,259 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; +// +// ggml_metal_heap +// + +struct ggml_metal_heap { + // number of times the heap was unused + int n_unused; + + // total number of buffer allocations in this heap across all computes + int64_t n_alloc; + + // current offset in the heap - we reset this after each node in order to reuse the memory + size_t offs; + + // the currently allocated MTLBuffer objects in this heap + id obj; + + NSMutableArray * bufs; +}; + +static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { + struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypePlacement; + desc.size = size; + + heap->n_unused = 0; + heap->n_alloc = 0; + + heap->obj = [device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + free(heap); + + return false; + } + + [desc release]; + + heap->bufs = [[NSMutableArray alloc] init]; + + return heap; +} + +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->offs = 0; + + // count how many graph computes the heap ended up being unused + if ([heap->bufs count] > 0) { + heap->n_unused = 0; + } else { + heap->n_unused++; + } + + for (id buf in heap->bufs) { + [buf release]; + } + [heap->bufs removeAllObjects]; + + // tell the OS that it can reuse this memory if needed + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; +} + +static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { + if (heap == nil) { + return; + } + + ggml_metal_heap_reset(heap); + + [heap->obj release]; + [heap->bufs release]; + + free(heap); +} + +@interface ggml_metal_heap_ptr : NSObject + +@property (nonatomic, assign) struct ggml_metal_heap * data; + +@end + +@implementation ggml_metal_heap_ptr +@end + +// +// ggml_metal_mem_pool +// + +struct ggml_metal_mem_pool { + id device; + + int n_heaps; // total number of heaps ever created (including those that were removed) + + NSMutableArray * heaps; + NSMutableArray * heaps_to_remove; +}; + +static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { + struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + + mem_pool->n_heaps = 0; + + mem_pool->heaps = [[NSMutableArray alloc] init]; + mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; + + return mem_pool; +} + +static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); + + size_t size_all = 0; + size_t size_cur = 0; + + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); + GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); + GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); + GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); + + if ([ptr.data->bufs count] > 0) { + size_cur += [ptr.data->obj size]; + } + size_all += [ptr.data->obj size]; + + ggml_metal_heap_free(ptr.data); + [ptr release]; + } + [mem_pool->heaps release]; + [mem_pool->heaps_to_remove release]; + + if (size_all > 0) { + GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0); + } + + free(mem_pool); +} + +static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { + for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_reset(heap); + + // if the heap hasn't been used for a while, remove it + if (heap->n_unused >= 128) { + [mem_pool->heaps_to_remove addObject:@(i)]; + } + } + + if (mem_pool->heaps_to_remove.count > 0) { + for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) { + NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_free(heap); + + [mem_pool->heaps removeObjectAtIndex:index]; + [ptr release]; + } + + [mem_pool->heaps_to_remove removeAllObjects]; + } +} + +static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + ptr.data->offs = 0; + } +} + +static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { + const size_t alignment = 32; + + const size_t size_aligned = GGML_PAD(size, alignment); + + // try one of the existing heaps + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + struct ggml_metal_heap * heap = ptr.data; + if (heap->offs + size_aligned <= [heap->obj size]) { + // if this is the first buffer in the heap for the current command buffer, tell the OS that + // it cannot free the memory used by the heap + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + if ([heap->bufs count] == 0) { + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return nil; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + return buf; + } + } + + // create a new heap that can fit this buffer + ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; + + struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); + if (heap == NULL) { + GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); + return NULL; + } + + //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + + heap_ptr.data = heap; + ggml_metal_heap_reset(heap); + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return NULL; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + [mem_pool->heaps addObject:heap_ptr]; + mem_pool->n_heaps++; + + return buf; +} + +struct ggml_metal_command_buffer { + id obj; + + // each command buffer has a memory pool from which it can allocate temporary buffers during the compute + struct ggml_metal_mem_pool * mem_pool; +}; + struct ggml_backend_metal_context { + id device; id queue; dispatch_queue_t d_queue; @@ -515,7 +767,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -705,9 +957,11 @@ @implementation GGMLMetalClass struct ggml_backend_metal_device_context * ctx_dev = dev->context; id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -768,7 +1022,10 @@ @implementation GGMLMetalClass ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1181,6 +1438,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + // ctx->cmd_bufs[i].obj is auto released + + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); + } + dispatch_release(ctx->d_queue); free(ctx); @@ -1486,10 +1749,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1505,7 +1769,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1516,7 +1780,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -1527,6 +1791,8 @@ static void ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + ggml_metal_mem_pool_clear(mem_pool); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -2173,26 +2439,76 @@ static void ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_metal_kargs_soft_max args = { +// use this branch to test the ggml_metal_mem_pool functionality +#if 0 + // cpy to tmp buffer in MTLHeap + + id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); + if (!h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); + return false; + } + + offs_src0 = 0; + + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, - /*.scale =*/ scale, - /*.max_bias =*/ max_bias, - /*.m0 =*/ m0, - /*.m1 =*/ m1, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne0 =*/ ne00, + /*.ne1 =*/ ne01, + /*.ne2 =*/ ne02, + /*.ne3 =*/ ne03, + /*.nb0 =*/ nb00, + /*.nb1 =*/ nb01, + /*.nb2 =*/ nb02, + /*.nb3 =*/ nb03, + }; + + if (src0->type == GGML_TYPE_F16) { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; + } else { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; + } + [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:0 atIndex:2]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; + +#else + id h_src0 = id_src0; +#endif + // softmax + + ggml_metal_kargs_soft_max args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.scale =*/ scale, + /*.max_bias =*/ max_bias, + /*.m0 =*/ m0, + /*.m1 =*/ m1, /*.n_head_log2 =*/ n_head_log2, }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0]; if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; @@ -4601,6 +4917,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4654,25 +4972,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4681,14 +4999,14 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4696,20 +5014,20 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5092,8 +5410,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5105,22 +5424,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); if (should_capture) { [encoder popDebugGroup]; } + + if (!res) { + break; + } } [encoder endEncoding]; if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3f24705201d..59510bd0c2a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -231,11 +231,15 @@ class ClipVision: IMAGE_MEAN = "clip.vision.image_mean" IMAGE_STD = "clip.vision.image_std" USE_GELU = "clip.use_gelu" + USE_SILU = "clip.use_silu" class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + class Projector: + SCALE_FACTOR = "clip.vision.projector.scale_factor" + # # recommended mapping of model tensor names for storage in gguf # @@ -2122,6 +2126,11 @@ def get_type(val: Any) -> GGUFValueType: raise ValueError(f"Unknown type: {type(val)}") +class VisionProjectorType: + GEMMA3 = "gemma3" + IDEFICS3 = "idefics3" + + # Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index aef03db1577..48e9a470b78 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -931,6 +931,53 @@ def add_eot_token_id(self, id: int) -> None: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + # for vision models + + def add_vision_projection_dim(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) + + def add_vision_has_vision_encoder(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value) + + def add_vision_patch_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PATCH_SIZE, value) + + def add_vision_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value) + + def add_vision_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value) + + def add_vision_block_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value) + + def add_vision_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value) + + def add_vision_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) + + def add_vision_attention_layernorm_eps(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + + def add_vision_image_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) + + def add_vision_image_mean(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_MEAN, values) + + def add_vision_image_std(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_STD, values) + + def add_vision_use_gelu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_GELU, value) + + def add_vision_use_silu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_SILU, value) + + def add_vision_projector_scale_factor(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 22066b28680..3ff378c1366 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -961,13 +961,13 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_FFN_UP: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "vpm.encoder.layers.{bid}.mlp.fc1", - "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped) ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "vpm.encoder.layers.{bid}.mlp.fc2", - "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped) ), MODEL_TENSOR.V_PRE_NORM: ( diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index f62850ca574..41f89e3a9d3 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -62,6 +62,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "yandex", LLM_CHAT_TEMPLATE_YANDEX }, { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, + { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { if (tmpl_contains("<|im_start|>")) { return tmpl_contains("<|im_sep|>") ? LLM_CHAT_TEMPLATE_PHI_4 - : LLM_CHAT_TEMPLATE_CHATML; + : tmpl_contains("") + ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml + : LLM_CHAT_TEMPLATE_CHATML; } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { if (tmpl_contains("[SYSTEM_PROMPT]")) { return LLM_CHAT_TEMPLATE_MISTRAL_V7; @@ -622,7 +625,23 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|header_start|>assistant<|header_end|>\n\n"; } - } else { + } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) { + // SmolVLM + ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n"; + } else { + ss << "Assistant: " << message->content << "\n"; + } + } + if (add_ass) { + ss << "Assistant:"; + } + } else { // template not supported return -1; } diff --git a/src/llama-chat.h b/src/llama-chat.h index 34537ca21e4..dc30df711a9 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -41,6 +41,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_YANDEX, LLM_CHAT_TEMPLATE_BAILING, LLM_CHAT_TEMPLATE_LLAMA4, + LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_UNKNOWN, };