@@ -4074,6 +4074,87 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
40744074 yield from super().modify_tensors(data_torch, name, bid)
40754075
40764076
@ModelBase.register(
    "NemotronH_Nano_VL_V2",
    "RADIOModel",
)
class NemotronNanoV2VLModel(MmprojModel):
    """mmproj converter for Nemotron Nano v2 VL's RADIO v2.5-h vision tower.

    The RADIO HF config lacks the standard ViT hyperparameters, so the
    ViT-Huge values are hard-coded here and merged into the vision config.
    """

    # ViT-Huge architecture parameters for RADIO v2.5-h
    _vit_hidden_size = 1280
    _vit_intermediate_size = 5120
    _vit_num_layers = 32
    _vit_num_heads = 16

    def get_vision_config(self) -> dict[str, Any] | None:
        """Return the vision config augmented with the ViT-H parameters, or None."""
        cfg = self.global_config.get("vision_config")
        if cfg is None:
            return None
        # RADIO's config omits the usual ViT fields; splice them in manually.
        return {
            **cfg,
            "hidden_size": self._vit_hidden_size,
            "intermediate_size": self._vit_intermediate_size,
            "num_hidden_layers": self._vit_num_layers,
            "num_attention_heads": self._vit_num_heads,
            "image_size": self.global_config.get("force_image_size", 512),
        }

    def set_gguf_parameters(self):
        """Write projector/vision metadata, defaulting to ImageNet normalization."""
        # Fall back to ImageNet mean/std when the preprocessor config omits them.
        self.preprocessor_config.setdefault("image_mean", [0.485, 0.456, 0.406])
        self.preprocessor_config.setdefault("image_std", [0.229, 0.224, 0.225])

        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
        self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
        self.gguf_writer.add_vision_use_gelu(True)
        ratio = self.global_config.get("downsample_ratio", 0.5)
        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / ratio))

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # Position embeddings must stay full precision.
        if ".position_embd." in new_name or "pos_embed" in new_name:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def _resize_pos_embed(self, pos_embed: Tensor) -> Tensor:
        """Bilinearly resample the position-embedding grid to the forced image size."""
        import torch.nn.functional as F
        n_embd = self.hparams["hidden_size"]
        patch_size = self.hparams["patch_size"]
        image_size = self.global_config.get("force_image_size", 512)
        target_side = image_size // patch_size
        # assumes pos_embed is [1, n_patches, n_embd] over a square grid — TODO confirm
        source_side = int(pos_embed.shape[1] ** 0.5)
        if target_side == source_side:
            return pos_embed
        # [1, N, C] -> [1, C, side, side], interpolate, then flatten back.
        grid = pos_embed.reshape(1, source_side, source_side, n_embd)
        grid = grid.permute(0, 3, 1, 2).float()
        grid = F.interpolate(grid, size=(target_side, target_side),
                             mode='bilinear', align_corners=True)
        grid = grid.permute(0, 2, 3, 1)
        return grid.reshape(1, target_side * target_side, n_embd)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Input normalization is handled at preprocessing time; drop these tensors.
        if "input_conditioner" in name:
            return

        if "patch_generator.pos_embed" in name:
            # RADIO's pos_embed lacks the .weight suffix that clip.cpp expects.
            if not name.endswith(".weight"):
                name += ".weight"
            # Downsample position embeddings to the fixed forced image size.
            data_torch = self._resize_pos_embed(data_torch)

        if "patch_generator.embedder" in name:
            # Reshape the linear patch embedding [n_embd, P*P*3] into the
            # conv2d layout [n_embd, 3, P, P] required by ggml_conv_2d.
            patch_size = self.hparams["patch_size"]
            data_torch = data_torch.reshape(self.hparams["hidden_size"], 3, patch_size, patch_size)

        # Only vision-tower and projector tensors are emitted for the mmproj file.
        if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
            yield from super().modify_tensors(data_torch, name, bid)
4157+
40774158@ModelBase.register("WavTokenizerDec")
40784159class WavTokenizerDecModel(TextModel):
40794160 model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -7055,6 +7136,8 @@ def __init__(self, dir_model: Path, *args, **kwargs):
70557136 if hparams is None:
70567137 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
70577138 hparams = json.load(f)
7139+ if "llm_config" in hparams:
7140+ hparams["text_config"] = hparams["llm_config"]
70587141 super().__init__(dir_model, *args, hparams=hparams, **kwargs)
70597142 self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
70607143 self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
@@ -9542,6 +9625,14 @@ def set_vocab(self):
95429625 self.gguf_writer.add_add_bos_token(True)
95439626
95449627 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9628+ # Skip vision model and projector tensors for VLM models (handled by mmproj) (e.g., Nemotron Nano 12B v2 VL)
9629+ if name.startswith(("vision_model.", "mlp1.")):
9630+ return
9631+
9632+ # Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
9633+ if name.startswith("language_model."):
9634+ name = name[len("language_model."):]
9635+
95459636 if self.is_moe and bid is not None:
95469637 if name.endswith("mixer.gate.e_score_correction_bias"):
95479638 new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
0 commit comments