
Commit 01d8eaa

mtmd : Add Nemotron Nano 12B v2 VL support (ggml-org#19547)
* nemotron nano v2 vlm support added
* simplified code; addressed reviews
* pre-downsample position embeddings during GGUF conversion for fixed input size
1 parent 1725e31 commit 01d8eaa

9 files changed

Lines changed: 167 additions & 1 deletion
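
The heart of the change is the last bullet of the commit message: rather than interpolating RADIO's learned position embeddings at inference time, the converter resizes the embedding grid once, because the input resolution is fixed at 512x512. A standalone sketch of that step, using random placeholder weights with the shapes that appear in the diff below (128x128 source grid, 1280-dim ViT-H embeddings, patch size 16):

import torch
import torch.nn.functional as F

# RADIO v2.5-h ships pos_embed for a 128x128 patch grid; a fixed 512x512
# input with patch size 16 only needs 32x32, so downsample once at convert time.
pos_embed = torch.randn(1, 128 * 128, 1280)      # placeholder weights
n_embd = pos_embed.shape[2]
src = int(pos_embed.shape[1] ** 0.5)             # 128 patches per side
dst = 512 // 16                                  # 32 patches per side

grid = pos_embed.reshape(1, src, src, n_embd).permute(0, 3, 1, 2).float()
grid = F.interpolate(grid, size=(dst, dst), mode='bilinear', align_corners=True)
pos_embed = grid.permute(0, 2, 3, 1).reshape(1, dst * dst, n_embd)
print(pos_embed.shape)                           # torch.Size([1, 1024, 1280])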


convert_hf_to_gguf.py

Lines changed: 91 additions & 0 deletions
@@ -4074,6 +4074,87 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register(
+    "NemotronH_Nano_VL_V2",
+    "RADIOModel",
+)
+class NemotronNanoV2VLModel(MmprojModel):
+    # ViT-Huge architecture parameters for RADIO v2.5-h
+    _vit_hidden_size = 1280
+    _vit_intermediate_size = 5120
+    _vit_num_layers = 32
+    _vit_num_heads = 16
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        # RADIO config doesn't have standard ViT parameters, so they need to be constructed manually
+        vision_config = self.global_config.get("vision_config")
+        if vision_config is None:
+            return None
+        # Add ViT-H parameters
+        vision_config = {
+            **vision_config,
+            "hidden_size": self._vit_hidden_size,
+            "intermediate_size": self._vit_intermediate_size,
+            "num_hidden_layers": self._vit_num_layers,
+            "num_attention_heads": self._vit_num_heads,
+            "image_size": self.global_config.get("force_image_size", 512),
+        }
+        return vision_config
+
+    def set_gguf_parameters(self):
+        if "image_mean" not in self.preprocessor_config:
+            self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
+        if "image_std" not in self.preprocessor_config:
+            self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]
+
+        super().set_gguf_parameters()
+        hparams = self.global_config
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
+        self.gguf_writer.add_vision_use_gelu(True)
+        downsample_ratio = hparams.get("downsample_ratio", 0.5)
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".position_embd." in new_name or "pos_embed" in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "input_conditioner" in name:
+            return
+
+        # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
+        if "patch_generator.pos_embed" in name:
+            if not name.endswith(".weight"):
+                name += ".weight"
+            # Downsample position embeddings for fixed 512x512 image size
+            import torch.nn.functional as F
+            n_embd = self.hparams["hidden_size"]
+            image_size = self.global_config.get("force_image_size", 512)
+            patch_size = self.hparams["patch_size"]
+            target_patches_per_side = image_size // patch_size  # 32
+            max_patches_per_side = int((data_torch.shape[1]) ** 0.5)  # 128
+            if target_patches_per_side != max_patches_per_side:
+                # Reshape to grid, interpolate, flatten back
+                data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd)
+                data_torch = data_torch.permute(0, 3, 1, 2).float()  # [1, n_embd, 128, 128]
+                data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side),
+                                           mode='bilinear', align_corners=True)
+                data_torch = data_torch.permute(0, 2, 3, 1)  # [1, 32, 32, n_embd]
+                data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd)
+
+        # Reshape linear patch embedding to conv2d format for ggml_conv_2d
+        # From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size]
+        if "patch_generator.embedder" in name:
+            patch_size = self.hparams["patch_size"]
+            n_embd = self.hparams["hidden_size"]
+            data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)
+
+        if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -7055,6 +7136,8 @@ def __init__(self, dir_model: Path, *args, **kwargs):
         if hparams is None:
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 hparams = json.load(f)
+        if "llm_config" in hparams:
+            hparams["text_config"] = hparams["llm_config"]
         super().__init__(dir_model, *args, hparams=hparams, **kwargs)
         self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
         self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
@@ -9542,6 +9625,14 @@ def set_vocab(self):
         self.gguf_writer.add_add_bos_token(True)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision model and projector tensors for VLM models (handled by mmproj) (e.g., Nemotron Nano 12B v2 VL)
+        if name.startswith(("vision_model.", "mlp1.")):
+            return
+
+        # Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
+        if name.startswith("language_model."):
+            name = name[len("language_model."):]
+
         if self.is_moe and bid is not None:
             if name.endswith("mixer.gate.e_score_correction_bias"):
                 new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
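
Taken together, the defaults above fix the per-image token budget. With a 512x512 input and patch size 16 the encoder sees a 32x32 patch grid, and the projector's merge step (scale factor int(1.0 / 0.5) = 2) halves both axes, so each image costs 256 embedding tokens. A quick sanity check using only the constants visible in this diff:

# Image-token arithmetic implied by the conversion defaults above.
image_size       = 512                               # force_image_size default
patch_size       = 16                                # inferred from the "# 32" comment (512 // 16 == 32)
downsample_ratio = 0.5                               # hparams default
scale_factor     = int(1.0 / downsample_ratio)       # stored in the GGUF -> 2

patches_per_side = image_size // patch_size          # 32
n_patches        = patches_per_side ** 2             # 1024 ViT tokens
n_output_tokens  = n_patches // scale_factor ** 2    # 256 tokens handed to the LLM
print(patches_per_side, n_patches, n_output_tokens)  # 32 1024 256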

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -3830,6 +3830,7 @@ class VisionProjectorType:
     MUSIC_FLAMINGO = "musicflamingo" # audio
     GLM4V = "glm4v"
     YOUTUVL = "youtuvl"
+    NEMOTRON_V2_VL = "nemotron_v2_vl"
 
 
 # Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 10 additions & 1 deletion
@@ -1346,6 +1346,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.cls_token", # Intern-S1
             "vision_model.class_embedding", # llama 4
             "model.vision.patch_embedding.cls_embedding", # cogvlm
+            "vision_model.radio_model.model.patch_generator.cls_token.token", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
@@ -1360,6 +1361,7 @@ class TensorNameMap:
             "vision_tower.patch_embed.proj", # kimi-vl
             "model.vision.patch_embedding.proj", # cogvlm
             "siglip2.vision_model.embeddings.patch_embedding",
+            "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1376,12 +1378,14 @@ class TensorNameMap:
             "visual.pos_embed", # qwen3vl
             "model.vision.patch_embedding.position_embedding", # cogvlm
             "visual.embeddings.position_embedding", # glm4v
+            "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_QKV: (
             "visual.blocks.{bid}.attn.qkv", # qwen3vl
             "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
-            "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
+            "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
+            "vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1446,6 +1450,7 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
             "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
+            "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1462,6 +1467,7 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
             "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
+            "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1477,6 +1483,7 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
             "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
+            "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1493,6 +1500,7 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
+            "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1515,6 +1523,7 @@ class TensorNameMap:
             "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
+            "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_LAYER_SCALE_1: (
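
Each string above is a per-architecture HF tensor name that TensorNameMap resolves to a common GGUF tensor, with {bid} standing in for the block index. A toy illustration of the template matching (a hypothetical helper written for exposition, not the actual gguf-py implementation):

import re

def matches(template: str, name: str) -> bool:
    # turn "...blocks.{bid}.attn.qkv" into a regex with a digit group for {bid}
    pattern = re.escape(template).replace(re.escape("{bid}"), r"(\d+)")
    return re.fullmatch(pattern, name) is not None

tmpl = "vision_model.radio_model.model.blocks.{bid}.attn.qkv"
print(matches(tmpl, "vision_model.radio_model.model.blocks.7.attn.qkv"))  # True
print(matches(tmpl, "vision_model.radio_model.model.block.7.attn.qkv"))   # False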

tools/mtmd/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ add_library(mtmd
     models/internvl.cpp
     models/kimivl.cpp
     models/kimik25.cpp
+    models/nemotron-v2-vl.cpp
     models/llama4.cpp
     models/llava.cpp
     models/minicpmv.cpp

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
@@ -236,6 +236,7 @@ enum projector_type {
     PROJECTOR_TYPE_GLM4V,
     PROJECTOR_TYPE_YOUTUVL,
     PROJECTOR_TYPE_KIMIK25,
+    PROJECTOR_TYPE_NEMOTRON_V2_VL,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -270,6 +271,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_GLM4V,           "glm4v"},
     { PROJECTOR_TYPE_YOUTUVL,         "youtuvl"},
     { PROJECTOR_TYPE_KIMIK25,         "kimik25"},
+    { PROJECTOR_TYPE_NEMOTRON_V2_VL,  "nemotron_v2_vl"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {

tools/mtmd/clip-model.h

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ enum ffn_op_type {
     FFN_GELU_ERF,
     FFN_SILU,
     FFN_GELU_QUICK,
+    FFN_RELU_SQR,
 };
 
 enum norm_type {
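
FFN_RELU_SQR is the squared-ReLU activation, relu(x)^2, which the Nemotron v2 VL projector uses in place of GELU/SiLU (see models/nemotron-v2-vl.cpp below). In PyTorch terms, the op added to build_ffn in clip.cpp amounts to:

import torch

def relu_sqr(x: torch.Tensor) -> torch.Tensor:
    # same as ggml_relu followed by ggml_sqr in clip.cpp's build_ffn
    return torch.relu(x) ** 2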

tools/mtmd/clip.cpp

Lines changed: 21 additions & 0 deletions
@@ -559,6 +559,12 @@ ggml_tensor * clip_graph::build_ffn(
                 cur = ggml_gelu_quick(ctx0, cur);
                 cb(cur, "ffn_gelu_quick", il);
             } break;
+        case FFN_RELU_SQR:
+            {
+                cur = ggml_relu(ctx0, cur);
+                cur = ggml_sqr(ctx0, cur);
+                cb(cur, "ffn_relu_sqr", il);
+            } break;
     }
 
     if (down) {
@@ -810,6 +816,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_internvl>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
+            {
+                builder = std::make_unique<clip_graph_nemotron_v2_vl>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_LLAMA4:
             {
                 builder = std::make_unique<clip_graph_llama4>(ctx, img);
@@ -1110,6 +1120,7 @@ struct clip_model_loader {
                     }
                 } break;
             case PROJECTOR_TYPE_INTERNVL:
+            case PROJECTOR_TYPE_NEMOTRON_V2_VL:
                 {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
@@ -1767,6 +1778,12 @@ struct clip_model_loader {
                     model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
                     model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
                 } break;
+            case PROJECTOR_TYPE_NEMOTRON_V2_VL:
+                {
+                    model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
+                    model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                    model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
+                } break;
             case PROJECTOR_TYPE_GLMA:
                 {
                     model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -3088,6 +3105,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_GLM_EDGE:
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
+        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
             {
                 clip_image_u8 resized_image;
                 int sz = params.image_size;
@@ -3397,6 +3415,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // both X and Y are downscaled by the scale factor
@@ -3805,6 +3824,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_GEMMA3NV:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_ULTRAVOX:
@@ -3968,6 +3988,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
            return ctx->model.mm_3_w->ne[1];
         case PROJECTOR_TYPE_LLAMA4:
             return ctx->model.mm_model_proj->ne[1];
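
NEMOTRON_V2_VL joins the fixed-resolution preprocessing path shared with INTERNVL: a plain square resize to params.image_size, followed by the ImageNet mean/std normalization that the converter writes as defaults. A rough host-side equivalent, assuming torchvision is available and ignoring the exact resize filter clip.cpp uses (this is an illustration, not the mtmd code path):

import torch
from PIL import Image
import torchvision.transforms.functional as TF

def preprocess(img: Image.Image, image_size: int = 512) -> torch.Tensor:
    # square resize, then ImageNet normalization (converter defaults above)
    img = img.convert("RGB").resize((image_size, image_size), Image.BILINEAR)
    x = TF.to_tensor(img)                                 # [3, H, W] in [0, 1]
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    std  = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    return (x - mean) / std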

tools/mtmd/models/models.h

Lines changed: 5 additions & 0 deletions
@@ -42,6 +42,11 @@ struct clip_graph_internvl : clip_graph {
     ggml_cgraph * build() override;
 };
 
+struct clip_graph_nemotron_v2_vl : clip_graph {
+    clip_graph_nemotron_v2_vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_llama4 : clip_graph {
     clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
tools/mtmd/models/nemotron-v2-vl.cpp

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
+    GGML_ASSERT(model.class_embedding != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+
+    const int n_registers = model.class_embedding->ne[1];
+    const int n_pos = n_patches + n_registers;
+
+    ggml_tensor * inp = build_inp();
+
+    // add position embeddings (pre-downsampled during GGUF conversion for fixed 512x512 input)
+    inp = ggml_add(ctx0, inp, model.position_embeddings);
+    cb(inp, "inp_pos", -1);
+
+    inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
+
+    ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, nullptr);
+
+    cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_patches,
+            ggml_row_size(cur->type, n_embd),
+            n_registers * ggml_row_size(cur->type, n_embd));
+
+    cur = build_patch_merge_permute(cur, model.hparams.n_merge);
+
+    {
+        cur = build_norm(cur, model.mm_0_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
+        cur = build_ffn(cur, model.mm_1_w, nullptr, nullptr, nullptr, model.mm_3_w, nullptr, FFN_RELU_SQR, -1);
+    }
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
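
End to end, the graph drops RADIO's register tokens (the ggml_view_2d skips the first n_registers rows), folds each 2x2 patch neighborhood into the channel dimension via build_patch_merge_permute, then applies RMSNorm followed by a two-matrix squared-ReLU MLP (mm_0_w, mm_1_w, mm_3_w). A rough PyTorch equivalent for the fixed 512x512 case, with hypothetical weight arguments standing in for the GGUF tensors; the ordering inside each merged 2x2 block may differ from ggml's permute, so treat this as an illustration only (F.rms_norm requires PyTorch >= 2.4):

import torch
import torch.nn.functional as F

def project(vit_out, n_registers, n_merge, w0, w1, w3, eps=1e-6):
    # vit_out: [n_registers + n_patches, n_embd]; register tokens come first
    x = vit_out[n_registers:]                            # [1024, 1280]

    side = int(x.shape[0] ** 0.5)                        # 32
    n_embd = x.shape[1]
    # fold each n_merge x n_merge block of patches into the channel dim
    x = x.reshape(side // n_merge, n_merge, side // n_merge, n_merge, n_embd)
    x = x.permute(0, 2, 1, 3, 4).reshape(-1, n_merge * n_merge * n_embd)  # [256, 5120]

    x = F.rms_norm(x, (x.shape[-1],), weight=w0, eps=eps)   # mm_0_w
    x = (torch.relu(x @ w1.T) ** 2) @ w3.T                  # mm_1_w, mm_3_w
    return x                                                # [256, n_embd_llm]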
