From 7bfe60fdf929ae569b81bbbce7ff7be5a1f8e354 Mon Sep 17 00:00:00 2001 From: manayang Date: Wed, 22 Apr 2026 17:58:43 +0800 Subject: [PATCH 1/5] mtmd, llama : Update HunyuanVL vision-language model support (#22037) * mtmd, llama : add HunyuanVL vision-language model support - add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support - add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder - add HunyuanVL-specific M-RoPE position encoding for image tokens - add GGUF conversion for HunyuanVL vision and text models - add smoke test in tools/mtmd/tests.sh * fix: fix HunyuanVL XD-RoPE h/w section order * fix: Remove redundant code * convert : fix HunyuanOCR / HunyuanVL conversion - Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF - successfully and produce correct inference output on Metal (F16 / Q8_0). * clip : fix -Werror=misleading-indentation in bilinear resize * fix CI: convert_hf_to_gguf type check error - convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit `dir_model: Path` parameter so ty can infer the type for load_hparams instead of reporting `Unknown | None`. 
--------- Co-authored-by: wendadawen --- convert_hf_to_gguf.py | 107 +++++++++++++++++++++++++++---- gguf-py/gguf/constants.py | 20 ++++++ gguf-py/gguf/gguf_writer.py | 3 + src/llama-arch.cpp | 2 + src/llama-arch.h | 2 + src/llama-hparams.h | 1 + src/llama-model.cpp | 22 +++++++ src/models/hunyuan-dense.cpp | 41 ++++++++---- tools/mtmd/clip-impl.h | 4 +- tools/mtmd/clip.cpp | 80 +++++++++++++++++++++++ tools/mtmd/models/hunyuanocr.cpp | 16 ++++- tools/mtmd/mtmd.cpp | 64 +++++++++++++++++- tools/mtmd/tests.sh | 1 + 13 files changed, 336 insertions(+), 27 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5b4fb79fc1b..090686b1531 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -11855,7 +11855,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration") +@ModelBase.register("HunYuanDenseV1ForCausalLM") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE @@ -11994,28 +11994,58 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("HunYuanVLForConditionalGeneration") -class HunyuanOCRVisionModel(MmprojModel): +class HunyuanVLVisionModel(MmprojModel): + # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name + # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout. + # Each variant maps to a different projector type in clip.cpp so image + # preprocessing follows the correct code path. 
+ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) assert self.hparams_vision is not None - # HunyuanOCR uses max_image_size instead of image_size + # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size if "image_size" not in self.hparams_vision: self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) + @staticmethod + def is_ocr_variant(hparams: dict) -> bool: + """Return True for HunyuanOCR, False for HunyuanVL. + + The projector's output dim must equal the text model's hidden_size by + construction (that's what "projector" means). HunyuanOCR pairs a 1B text + backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the + ViT -> LLM projection dim is a hard architectural signature, not a + magic number. + """ + vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) + return vision_out == 1024 + def set_gguf_parameters(self): super().set_gguf_parameters() assert self.hparams_vision is not None - hparams = self.hparams_vision - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5)) - self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2)) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + vcfg = self.hparams_vision + + if self.is_ocr_variant(self.global_config): + # --- HunyuanOCR --- + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) + self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) + 
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + return + + # --- HunyuanVL --- + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) + self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu") + self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"])) + self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"])) + self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) + self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if not name.startswith("vit."): - return # skip text tensors + return # strip CLS token (row 0) from position embeddings so resize_position_embeddings works if "position_embedding" in name: data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd] @@ -12023,11 +12053,66 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def tensor_force_quant(self, name, new_name, bid, n_dims): # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal + # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLTextModel(HunYuanModel): + # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR + # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE), + # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from + # the config and pick the matching GGUF architecture. 
+ model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + + @staticmethod + def _is_ocr_config(hparams: dict) -> bool: + # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that + # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with + # HunyuanVLVisionModel.is_ocr_variant. + return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024 + + def __init__(self, dir_model: Path, *args, **kwargs): + raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False) + if self._is_ocr_config(raw_hparams): + self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE + else: + self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + super().__init__(dir_model, *args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses + # the HunYuan-Dense arch which already handles standard rope in super(). + if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL: + return + + if self.rope_parameters.get("rope_type") != "xdrope": + return + + # defaults for HunyuanVL. 
The C++ side later computes: + # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2)) + self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) + self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1))) + + ctx_len = int(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len) + self.gguf_writer.add_context_length(ctx_len) + + self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"])) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Skip vision tensors — they are written by HunyuanVLVisionModel + if name.startswith("vit."): + return + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("SmolLM3ForCausalLM") class SmolLM3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.SMOLLM3 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c5297a2f440..83ae51ce9ce 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -197,6 +197,7 @@ class Rope: FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ALPHA = "{arch}.rope.scaling.alpha" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" @@ -471,6 +472,7 @@ class MODEL_ARCH(IntEnum): ERNIE4_5_MOE = auto() HUNYUAN_MOE = auto() HUNYUAN_DENSE = auto() + HUNYUAN_VL = auto() SMOLLM3 = auto() GPT_OSS = auto() LFM2 = auto() @@ -957,6 +959,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.FALCON_H1: "falcon-h1", MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense", + MODEL_ARCH.HUNYUAN_VL: 
"hunyuan_vl", MODEL_ARCH.SMOLLM3: "smollm3", MODEL_ARCH.GPT_OSS: "gpt-oss", MODEL_ARCH.LFM2: "lfm2", @@ -3489,6 +3492,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.HUNYUAN_VL: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.SMOLLM3: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -4138,6 +4157,7 @@ class VisionProjectorType: YOUTUVL = "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" HUNYUANOCR = "hunyuanocr" + HUNYUANVL = "hunyuanvl" # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 90d500dc771..6a81ca37d8c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -973,6 +973,9 @@ def add_rope_scaling_type(self, value: RopeScalingType) -> None: def add_rope_scaling_factor(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) + def add_rope_scaling_alpha(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value) + def add_rope_scaling_attn_factors(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6904b9c1a64..633a66fc665 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -109,6 +109,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" }, { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" }, + { LLM_ARCH_HUNYUAN_VL, "hunyuan_vl" }, { LLM_ARCH_SMOLLM3, "smollm3" }, { LLM_ARCH_OPENAI_MOE, "gpt-oss" }, { LLM_ARCH_LFM2, "lfm2" }, @@ -250,6 
+251,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ALPHA, "%s.rope.scaling.alpha" }, { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index c4aabab7e0c..8f335f5c7b3 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -113,6 +113,7 @@ enum llm_arch { LLM_ARCH_ERNIE4_5_MOE, LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_HUNYUAN_DENSE, + LLM_ARCH_HUNYUAN_VL, LLM_ARCH_SMOLLM3, LLM_ARCH_OPENAI_MOE, LLM_ARCH_LFM2, @@ -254,6 +255,7 @@ enum llm_kv { LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ALPHA, LLM_KV_ROPE_SCALING_ATTN_FACTOR, LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index c2000c77c37..ac7f9ee8650 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -116,6 +116,7 @@ struct llama_hparams { float rope_freq_base_train_swa = 10000.0f; float rope_freq_scale_train; float rope_freq_scale_train_swa = 1.0f; + float rope_scaling_alpha = 0.0f; // NTK-aware alpha for XDRoPE uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f77b2e9217f..9e2a13cbd43 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -737,6 +737,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) { + if (hparams.n_expert <= 1) { + hparams.n_expert = 0; + hparams.n_expert_used = 0; + } + } + 
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl); @@ -815,6 +822,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); + ml.get_key(LLM_KV_ROPE_SCALING_ALPHA, hparams.rope_scaling_alpha, false); // non-transformer models do not have attention heads if (hparams.n_head() > 0) { @@ -2592,9 +2600,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); + + // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2)) + if (hparams.rope_scaling_alpha > 0.0f) { + const int dim = hparams.n_embd_head_k(); + hparams.rope_freq_base_train = hparams.rope_freq_base_train + * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2)); + } switch (hparams.n_embd) { case 1024: type = LLM_TYPE_0_5B; break; @@ -6947,6 +6964,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0); } } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8967,6 +8985,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { llm = std::make_unique(*this, params); @@ -9316,6 +9335,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GLM4_MOE: return model->hparams.use_mrope() ? 
LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_HUNYUAN_VL: + return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ABORT("unknown architecture"); diff --git a/src/models/hunyuan-dense.cpp b/src/models/hunyuan-dense.cpp index e4e837eb4f1..1cd85d6d9d4 100644 --- a/src/models/hunyuan-dense.cpp +++ b/src/models/hunyuan-dense.cpp @@ -6,6 +6,11 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); GGML_ASSERT(n_embd_head == n_rot); + const bool use_mrope = hparams.use_mrope(); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + ggml_tensor * cur; ggml_tensor * inpL; @@ -37,22 +42,36 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, n_head, n_head_kv, il); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + if (use_mrope) { + Qcur = ggml_rope_multi( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_multi( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } else { + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); 
cb(Vcur, "Vcur", il); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 61fe82439f1..7d6484eea85 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -150,7 +150,7 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" -// hunyuanocr +// hunyuanocr / hunyuanvl (shared GGUF tensor names) #define TN_MM_PRE_NORM "mm.pre_norm.%s" #define TN_TOK_IMG_BEGIN "mm.image_begin" #define TN_TOK_IMG_END "mm.image_end" @@ -303,6 +303,7 @@ enum projector_type { PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, PROJECTOR_TYPE_HUNYUANOCR, + PROJECTOR_TYPE_HUNYUANVL, PROJECTOR_TYPE_UNKNOWN, }; @@ -349,6 +350,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, + { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 540b0ea4143..45e39898d82 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { builder = std::make_unique(ctx, img); } break; @@ -1473,6 +1474,16 @@ struct clip_model_loader { get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); hparams.set_warmup_n_tokens(28*28); } break; + case PROJECTOR_TYPE_HUNYUANVL: + { + hparams.n_merge = 2; + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; + hparams.image_resize_pad = false; + hparams.ffn_op = FFN_GELU; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + 
hparams.set_limit_image_tokens(256, 16384); + hparams.set_warmup_n_tokens(32*32); + } break; case PROJECTOR_TYPE_LFM2A: { // audio preprocessing params @@ -2222,6 +2233,7 @@ struct clip_model_loader { model.mm_eoi = get_tensor(TN_TOK_EOI); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear) model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); @@ -2860,6 +2872,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: @@ -2879,6 +2892,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: + case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->ny / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: @@ -3070,6 +3084,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches = h * (h + 1) + 1; } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; int ow = (img->nx / patch_size) / merge; @@ -3534,6 +3549,70 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { // do nothing } break; + case PROJECTOR_TYPE_HUNYUANVL: + { + // Compute the HunyuanVL 2D position embedding on CPU (with the + // custom sf=(target+0.1)/n_grid bilinear sampling that the + // reference implementation uses) and upload it to the graph + // input declared in clip_graph_hunyuanocr::build(). 
+ GGML_ASSERT(model.position_embeddings != nullptr); + ggml_tensor * src_t = model.position_embeddings; + const int64_t n_embd = src_t->ne[0]; + const int64_t n_pos = src_t->ne[1]; // = n_grid * n_grid + const int n_grid = (int)std::lround(std::sqrt((double)n_pos)); + GGML_ASSERT((int64_t)n_grid * n_grid == n_pos); + const int out_w = pos_w; // pw + const int out_h = pos_h; // ph + + // Pull weight to host. + std::vector src(n_embd * n_pos); + ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t)); + + // Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w): + // ne[0] = n_embd (fastest), ne[1] = out_h*out_w + // dst[(y*out_w + x) * n_embd + c] + std::vector dst((size_t)n_embd * out_h * out_w); + + const float sx = (float)(out_w + 0.1f) / (float)n_grid; + const float sy = (float)(out_h + 0.1f) / (float)n_grid; + + for (int y = 0; y < out_h; ++y) { + // Match ggml_compute_forward_upscale_f32 pixel-center + // convention (align_corners=False): src_y = (y+0.5)/sy - 0.5. 
+ const float fy = ((float)y + 0.5f) / sy - 0.5f; + int y0 = (int)std::floor(fy); + int y1 = y0 + 1; + y0 = std::clamp(y0, 0, n_grid - 1); + y1 = std::clamp(y1, 0, n_grid - 1); + float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f); + const float wy0 = 1.0f - wy1; + for (int x = 0; x < out_w; ++x) { + const float fx = ((float)x + 0.5f) / sx - 0.5f; + int x0 = (int)std::floor(fx); + int x1 = x0 + 1; + x0 = std::clamp(x0, 0, n_grid - 1); + x1 = std::clamp(x1, 0, n_grid - 1); + float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f); + const float wx0 = 1.0f - wx1; + + const float w00 = wy0 * wx0; + const float w01 = wy0 * wx1; + const float w10 = wy1 * wx0; + const float w11 = wy1 * wx1; + + const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd]; + const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd]; + const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd]; + const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd]; + float * d = &dst[((size_t)y * out_w + x) * n_embd]; + for (int c = 0; c < n_embd; ++c) { + d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c]; + } + } + } + + set_input_f32("hunyuanvl_pos_embd", dst); + } break; case PROJECTOR_TYPE_LLAMA4: { // set the 2D positions @@ -3760,6 +3839,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_YASA2: return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanocr.cpp index 37d1e2b86a9..45ed684f70d 100644 --- a/tools/mtmd/models/hunyuanocr.cpp +++ b/tools/mtmd/models/hunyuanocr.cpp @@ -5,7 +5,21 @@ ggml_cgraph * clip_graph_hunyuanocr::build() { const int pw = n_patches_x; const int ph = n_patches_y; - ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); + // Position embedding interpolation. 
+ // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard + // ggml_interpolate cannot express. To avoid adding a new ggml op, the + // resize is computed on CPU in clip_image_batch_encode and uploaded here + // as a graph input (named "hunyuanvl_pos_embd"). + // HunyuanOCR uses the same square layout and the standard ratio-based + // interpolation provided by resize_position_embeddings(). + ggml_tensor * pos_embd = nullptr; + if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) { + pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); + ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); + ggml_set_input(pos_embd); + } else { + pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); + } ggml_tensor * inp = build_inp(); ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index cc3de6a858c..626361b9244 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -35,15 +35,23 @@ struct mtmd_bitmap { // position indexing for decoder model enum mtmd_pos_type { - MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens - MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes + MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens + MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes + MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3 }; struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; - uint32_t n_tokens() const { return nx * ny; } + uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL) + uint32_t n_tokens() const { + if (pos == MTMD_POS_TYPE_HUNYUANVL) { + // [BOI] [row0 tokens 
+ newline] ... [row(ny-1) tokens + newline] [EOI] + return (nx + 1) * ny + 2; + } + return nx * ny; + } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -52,6 +60,7 @@ struct mtmd_image_tokens { nx, ny, pos, + image_idx, batch_f32.clone(), id }; @@ -466,6 +475,7 @@ struct mtmd_context { image_preproc = std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary img_beg = "<|hy_place▁holder▁no▁100|>"; @@ -611,6 +621,7 @@ struct mtmd_tokenizer { const llama_vocab * vocab; mtmd_input_chunks cur; + uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk mtmd_tokenizer(mtmd_context * ctx, const mtmd_input_text * text, @@ -819,6 +830,14 @@ struct mtmd_tokenizer { image_tokens->ny = 1; } image_tokens->pos = ctx->pos_type; + // HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row, + // and uses XD-RoPE (dim-3 = image index). Override the position type so that + // n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout. + if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) { + image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL; + image_tokens->image_idx = n_images_added; + GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens()); + } image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmap->id; // optional @@ -839,6 +858,9 @@ struct mtmd_tokenizer { add_text(ctx->img_end, true); // add image end token } + // advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot + n_images_added++; + } else { // handle audio @@ -1286,6 +1308,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima pos.y = pos_0 + i; pos.z = pos_0 + i; } break; + case MTMD_POS_TYPE_HUNYUANVL: + { + // HunyuanVL layout: [BOI] [row0 tokens + newline] ... 
[row(ny-1) tokens + newline] [EOI] + // Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim; + // content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx. + const uint32_t nx = image_tokens->nx; + const uint32_t n_total = image_tokens->n_tokens(); + if (i == 0) { + // BOI + pos.t = pos_0 + i; + pos.x = pos_0 + i; + pos.y = pos_0 + i; + pos.z = pos_0 + i; + } else if (i == n_total - 1) { + // EOI + pos.t = pos_0 + i; + pos.x = pos_0 + i; + pos.y = pos_0 + i; + pos.z = pos_0 + i; + } else { + // content token at (row, col), or the trailing newline of a row (col == nx) + // section 0 = sequential, section 1 = w(col), section 2 = h(row), section 3 = image_count. + // set_position_mrope_2d writes .y -> section 1 and .x -> section 2 + const uint32_t offset = (uint32_t)i - 1; + const uint32_t row = offset / (nx + 1); + const uint32_t col = offset % (nx + 1); + pos.t = pos_0 + i; + pos.x = row; + pos.y = col; + pos.z = image_tokens->image_idx; + } + } break; default: GGML_ABORT("invalid position type"); } @@ -1302,6 +1356,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { return std::max(image_tokens->nx, image_tokens->ny); case MTMD_POS_TYPE_NORMAL: return image_tokens->n_tokens(); + case MTMD_POS_TYPE_HUNYUANVL: + // HunyuanVL: the sequential (dim-0) position advances by the full token count + // (includes BOI/EOI and row newline tokens), not by max(nx, ny) + return image_tokens->n_tokens(); default: GGML_ABORT("invalid position type"); } diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 5da48d61bfd..83416fb272b 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0" add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." 
--chat-template deepseek-ocr add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR" add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR" +add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0" add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0" From 17f624516858b1a95b59076b0367b1f26f37ecd5 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 22 Apr 2026 12:10:50 +0200 Subject: [PATCH 2/5] server: ignore reasoning content from transcription api (#21905) --- tools/server/server-task.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index ac1e77615da..9380792c06d 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1111,7 +1111,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { json server_task_result_cmpl_final::to_json_oaicompat_asr() { json event = json { {"type", "transcript.text.done"}, - {"text", content}, + {"text", oaicompat_msg.content}, {"usage", json { {"type", "tokens"}, {"input_tokens", n_prompt_tokens}, From 82d3f4d3b2fcff84d61f8a4f660f8aee71a4ea39 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 22 Apr 2026 12:16:29 +0200 Subject: [PATCH 3/5] mtmd: also support LLAMA_ROPE_TYPE_NONE (#22242) --- tools/mtmd/mtmd.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 626361b9244..59907786786 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -195,6 +195,7 @@ struct mtmd_context { auto decoder_rope_type = llama_model_rope_type(text_model); switch (decoder_rope_type) { + case LLAMA_ROPE_TYPE_NONE: case LLAMA_ROPE_TYPE_NORM: case LLAMA_ROPE_TYPE_NEOX: { From 225088ea76687c005e282d3a48d73e9c0c8c5091 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Wed, 22 Apr 2026 18:02:56 +0530 Subject: [PATCH 4/5] sycl: Improve mul_mat_id memory efficiency and add BF16 fast path (#22119) * sycl: size 
mul_mat_id staging buffers by routed rows Previously src1_contiguous/dst_contiguous in ggml_sycl_mul_mat_id were sized to ggml_nelements(src1/dst), which over-allocates when ne12 > 1 and can fail with UR_RESULT_ERROR_OUT_OF_HOST_MEMORY on Level Zero for MoE models (notably with --cpu-moe). Size them by the actual number of routed rows (ids->ne[1] * n_ids) instead. * sycl: add bf16 mul_mat fast path via DNNL When src0 is BF16 (commonly the case for lm_head / output.weight), the existing f16 path is skipped because bf16 isn't covered, and the f32 fallback dequantizes the entire src0 slab to f32 in a single pool alloc (row_diff*ne00 floats). For large-vocab models this can reach several GB and fail with UR_RESULT_ERROR_OUT_OF_HOST_MEMORY on Level Zero. Add a bf16xbf16 -> f32 DNNL matmul fast path that uses the bf16 storage in place and only materializes a small src1 bf16 conversion buffer. bf16 matmul accumulates in f32, so it's correct even when the op requests GGML_PREC_F32 (as lm_head does). - gemm.hpp: map bfloat16 to dnnl::memory::data_type::bf16. - convert.{hpp,cpp}: expose ggml_get_to_bf16_sycl for f32/f16/bf16 -> bf16. - ggml-sycl.cpp: take the bf16 path early in ggml_sycl_op_mul_mat_sycl when DNNL and GGML_SYCL_HAS_BF16 are both available. 
--- ggml/src/ggml-sycl/common.hpp | 7 +++++++ ggml/src/ggml-sycl/convert.cpp | 23 ++++++++++++++++------- ggml/src/ggml-sycl/convert.hpp | 9 +++++++++ ggml/src/ggml-sycl/gemm.hpp | 3 +++ ggml/src/ggml-sycl/ggml-sycl.cpp | 30 ++++++++++++++++++++++++++++-- ggml/src/ggml-sycl/set_rows.cpp | 8 +++++++- 6 files changed, 70 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index fd84c917853..0101b27640a 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -28,6 +28,13 @@ namespace syclexp = sycl::ext::oneapi::experimental; +#if defined(__INTEL_LLVM_COMPILER) && __has_include() + #include + #ifndef GGML_SYCL_HAS_BF16 + #define GGML_SYCL_HAS_BF16 + #endif +#endif + #if GGML_SYCL_DNNL #include "dnnl.hpp" #include "dnnl_sycl.hpp" diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index f3c521b45f6..67b9c06f3e4 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -2,13 +2,6 @@ #include "dequantize.hpp" #include "presets.hpp" -#if defined(__INTEL_LLVM_COMPILER) - #if __has_include() - #include - #define GGML_SYCL_HAS_BF16 - #endif -#endif - template static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, const sycl::nd_item<3> &item_ct1) { @@ -767,6 +760,22 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { } +#ifdef GGML_SYCL_HAS_BF16 +to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * /*dst*/) { + switch (type) { + case GGML_TYPE_F32: + return convert_unary_sycl; + case GGML_TYPE_F16: + return convert_unary_sycl; + case GGML_TYPE_BF16: + return convert_unary_sycl; + default: + GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type)); + return nullptr; + } +} +#endif + to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) { switch (type) { case GGML_TYPE_F32: diff --git a/ggml/src/ggml-sycl/convert.hpp 
b/ggml/src/ggml-sycl/convert.hpp index 6e621f2154d..8de79d10ff6 100644 --- a/ggml/src/ggml-sycl/convert.hpp +++ b/ggml/src/ggml-sycl/convert.hpp @@ -23,6 +23,11 @@ typedef to_t_sycl_t to_fp16_sycl_t; to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst); to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst); +#ifdef GGML_SYCL_HAS_BF16 +typedef to_t_sycl_t to_bf16_sycl_t; +to_bf16_sycl_t ggml_get_to_bf16_sycl(ggml_type type, ggml_tensor * dst); +#endif + // Nc = Non-contiguous template using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, @@ -35,15 +40,19 @@ template inline dst_t ggml_sycl_cast(src_t x) { if constexpr (std::is_same_v) { return x; +#ifdef GGML_SYCL_HAS_BF16 } else if constexpr (std::is_same_v) { return sycl::ext::oneapi::bfloat16(float(x)); } else if constexpr (std::is_same_v) { return static_cast(x); +#endif } else if constexpr (std::is_same_v && std::is_same_v) { return x.template convert(); +#ifdef GGML_SYCL_HAS_BF16 } else if constexpr (std::is_same_v && std::is_same_v>) { return {x.x, x.y}; +#endif } else if constexpr(std::is_same_v) { return int32_t(x); } else { diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index dcf6c7aeeb4..c202da110be 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -29,6 +29,9 @@ class DnnlGemmWrapper { static constexpr dt to_dt() { if constexpr (std::is_same_v) return dt::f32; else if constexpr (std::is_same_v) return dt::f16; +#ifdef GGML_SYCL_HAS_BF16 + else if constexpr (std::is_same_v) return dt::bf16; +#endif else static_assert(0); } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index c02a41ad862..3829da87903 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2176,6 +2176,31 @@ inline void ggml_sycl_op_mul_mat_sycl( #else bool use_fp16 = false; #endif + +#if GGML_SYCL_DNNL && 
defined(GGML_SYCL_HAS_BF16) + // Fast path for bf16 src0 + if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) && + row_diff == src0->ne[1]) { + using bf16_t = sycl::ext::oneapi::bfloat16; + ggml_sycl_pool_alloc src1_as_bf16(ctx.pool(), src1_ncols*ne10); + if (src1->type != GGML_TYPE_BF16) { + const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst); + GGML_ASSERT(to_bf16_sycl != nullptr); + to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream); + } else { + stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t)); + } + DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, + src0_dd_i, DnnlGemmWrapper::to_dt(), + src1_as_bf16.get(), DnnlGemmWrapper::to_dt(), + dst_dd_i, DnnlGemmWrapper::to_dt(), stream); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddq_i); + GGML_UNUSED(src1_padded_row_size); + return; + } +#endif + if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { ggml_sycl_pool_alloc src0_as_f16(ctx.pool()); @@ -3848,8 +3873,9 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, } } } else { - ggml_sycl_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); - ggml_sycl_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); + const int64_t n_routed_rows = ids->ne[1] * n_ids; + ggml_sycl_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10); + ggml_sycl_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0); src1_row.data = src1_contiguous.get(); dst_row.data = dst_contiguous.get(); diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index a641c100913..8fb41943525 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -4,7 +4,11 @@ namespace utils { template static constexpr bool is_arithmetic_v() { - 
return std::is_arithmetic_v || std::is_same_v || std::is_same_v; + return std::is_arithmetic_v || std::is_same_v +#ifdef GGML_SYCL_HAS_BF16 + || std::is_same_v +#endif + ; } } @@ -181,6 +185,7 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s stream ); break; +#ifdef GGML_SYCL_HAS_BF16 case GGML_TYPE_BF16: set_rows_sycl( src0_d, src1_d, (char *)dst->data, @@ -193,6 +198,7 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s stream ); break; +#endif case GGML_TYPE_Q8_0: set_rows_sycl_q(src0_d, src1_d, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); break; From bcb5eeb64529806b8d1cb80eccbc22c7d0897cb2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 22 Apr 2026 15:44:45 +0300 Subject: [PATCH 5/5] speculative-simple : add checkpoint support (#22227) * speculative-simple : add checkpoint support * cont : fix build --- .../speculative-simple/speculative-simple.cpp | 112 ++++++++++++++++-- tools/server/server-context.cpp | 10 +- 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index a03dbce887f..73394b74ee9 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -8,8 +8,24 @@ #include #include #include +#include #include #include +#include + +struct spec_checkpoint { + int64_t n_tokens = 0; + + std::vector data; + + size_t size() const { + return data.size(); + } + + bool empty() const { + return data.empty(); + } +}; int main(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); @@ -46,6 +62,14 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt->model(); ctx_tgt = llama_init_tgt->context(); + // check if the context supports partial sequence removal + const auto ctx_seq_rm = 
common_context_can_seq_rm(ctx_tgt); + const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL); + + if (use_ckpt) { + LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n"); + } + const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -119,7 +143,7 @@ int main(int argc, char ** argv) { const auto t_enc_start = ggml_time_us(); // target model sampling context - struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling); + common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling)); // eval the prompt llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1)); @@ -142,21 +166,61 @@ int main(int argc, char ** argv) { llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1); + size_t n_draft = 0; + + llama_tokens draft; + spec_checkpoint spec_ckpt; + const auto t_enc_end = ggml_time_us(); const auto t_dec_start = ggml_time_us(); while (true) { - // optionally, generate draft tokens that can be appended to the target batch + // generate or reuse draft tokens // // this is the most important part of the speculation. the more probable tokens that are provided here // the better the performance will be. in theory, this computation can be performed asynchronously and even // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens // from a cache or lookup tables. 
// - llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last); + if (draft.empty()) { + // generate a new draft + draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last); + + if ((int) draft.size() > params_spec.n_max) { + LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max); + draft.resize(params_spec.n_max); + } + + if ((int) draft.size() < params_spec.n_min) { + LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min); + draft.clear(); + } + + // save the original draft size + n_draft = draft.size(); + + // save a checkpoint of the target context before evaluating the draft + // this allows us to restore the state if partial draft acceptance occurs + if (!draft.empty() && use_ckpt) { + const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + spec_ckpt.data.resize(ckpt_size); - //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str()); + const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + GGML_ASSERT(n == ckpt_size); + + spec_ckpt.n_tokens = (int64_t) prompt_tgt.size(); + LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n", + spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024); + } + } else { + // we have a previous (partial) draft to reuse from checkpoint restoration + if (use_ckpt) { + GGML_ASSERT(!spec_ckpt.empty()); + } + } + + GGML_ASSERT(n_draft > 0); // always have a token to evaluate from before - id_last common_batch_clear(batch_tgt); @@ -178,6 +242,12 @@ int main(int argc, char ** argv) { llama_decode(ctx_tgt, batch_tgt); } + // only save the sampler state if we use checkpoints + common_sampler_ptr smpl_save; + if (use_ckpt) { + smpl_save.reset(common_sampler_clone(smpl.get())); + } + // sample from the full target batch and return the accepted tokens based on the target
sampler // // for each token to be accepted, the sampler would have to sample that same token @@ -185,14 +255,38 @@ int main(int argc, char ** argv) { // available logits from the batch and sample the next token until we run out of logits or the sampler // disagrees with the draft // - const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft); + auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft); //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str()); GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token + // check for partial draft acceptance: + // if the context doesn't support partial sequence removal, restore the checkpoint + // and make the accepted tokens the new partial draft for the next iteration + if (use_ckpt && ids.size() - 1 < draft.size()) { + LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size()); + + draft = std::move(ids); + + const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + GGML_ASSERT(n == spec_ckpt.size()); + + llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1); + + prompt_tgt.resize(spec_ckpt.n_tokens); + smpl = std::move(smpl_save); + + n_past = (int) prompt_tgt.size(); + + continue; + } + + common_speculative_accept(spec, ids.size() - 1); + + // full acceptance: consume the draft and commit accepted tokens n_past += ids.size() - 1; - n_drafted += draft.size(); // note: we ignore the discarded small drafts + n_drafted += n_draft; // note: we ignore the discarded small drafts n_accept += ids.size() - 1; n_predict += ids.size(); @@ -222,6 +316,9 @@ int main(int argc, char ** argv) { LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last); + // clear the draft since it has been consumed + draft.clear(); + { LOG_DBG("clear kv cache from any extra tokens, n_past = 
%d\n", n_past); @@ -254,11 +351,10 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("target:\n\n"); - common_perf_print(ctx_tgt, smpl); + common_perf_print(ctx_tgt, smpl.get()); llama_batch_free(batch_tgt); - common_sampler_free(smpl); common_speculative_free(spec); llama_backend_free(); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 53f61b5a9b8..b8c05cd80e7 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2961,7 +2961,13 @@ struct server_context_impl { // verify and try to accept the draft { - common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get())); + const bool use_ckpt = slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; + + // only save the sampler sampler state if we use checkpoints + common_sampler_ptr smpl_save; + if (use_ckpt) { + smpl_save.reset(common_sampler_clone(slot.smpl.get())); + } GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1); auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft); @@ -2973,7 +2979,7 @@ struct server_context_impl { // check for partial draft acceptance if (accepted.size() < slot.spec_draft.size() + 1) { - if (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) { + if (use_ckpt) { // partial acceptance is not supported by the context -> truncate the draft and restore the state slot.spec_draft = std::move(accepted);