From 0c509239445088f21265580b86733c5b184261d9 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Fri, 11 Apr 2025 12:09:39 +0200
Subject: [PATCH 1/4] =?UTF-8?q?clip=20:=20use=20smart=20pointer=20(?=
 =?UTF-8?q?=E2=9A=A0=EF=B8=8F=20breaking=20change)=20(#12869)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* clip : use smart pointers

* fix warmup

* add forward declaration

* misisng include

* fix include (2)

* composite

* simplify batch ptr

* fix conflict
---
 examples/llava/clip-impl.h |  23 ++-
 examples/llava/clip.cpp    | 343 ++++++++++++++++++-------------------
 examples/llava/clip.h      |  30 ++--
 examples/llava/llava.cpp   | 103 ++++++-----
 examples/llava/mtmd.cpp    |  10 +-
 5 files changed, 255 insertions(+), 254 deletions(-)

diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index 4c035298749..4d7340a56bd 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "gguf.h"
+#include "clip.h"
 
 #include "clip.h"
 
@@ -202,23 +203,31 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
 // cpp wrappers
 //
 
+// wrapper for clip_image_size
+struct clip_image_size_deleter {
+    void operator()(clip_image_size * val) { clip_image_size_free(val); }
+};
+typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
+
+// wrapper for clip_image_u8
 struct clip_image_u8_deleter {
     void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
 };
+typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
 
+// wrapper for clip_image_f32
 struct clip_image_f32_deleter {
     void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
 };
+typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
 
-struct clip_image_f32_batch_deleter {
-    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+struct clip_image_u8_batch {
+    std::vector<clip_image_u8_ptr> entries;
 };
 
-typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
-typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
-typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
-
-// TODO @ngxson : we're currently having a naming clash between struct clip_image_size and function clip_image_size()
+struct clip_image_f32_batch {
+    std::vector<clip_image_f32_ptr> entries;
+};
 
 //
 // common utils
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 710309edaec..a55b3f38351 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -315,58 +315,47 @@ struct clip_ctx {
     bool use_gelu = false;
     bool use_silu = false;
 
-    struct gguf_context * ctx_gguf = nullptr;
-    struct ggml_context * ctx_data = nullptr;
+    gguf_context_ptr ctx_gguf;
+    ggml_context_ptr ctx_data;
 
     std::vector<uint8_t> buf_compute_meta;
 
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
 
-    ggml_backend_t backend     = nullptr;
-    ggml_backend_t backend_cpu = nullptr;
-    ggml_backend_buffer_t buf  = nullptr;
+    ggml_backend_ptr backend;
+    ggml_backend_ptr backend_cpu;
+    ggml_backend_buffer_ptr buf;
 
     ggml_backend_sched_ptr sched;
 
-    struct clip_image_size * load_image_size = nullptr;
+    clip_image_size load_image_size;
 
     clip_ctx(clip_context_params & ctx_params) {
-        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
-        backend     = ctx_params.use_gpu
+        backend_cpu = ggml_backend_ptr(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
+        backend     = ggml_backend_ptr(ctx_params.use_gpu
                         ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
-                        : nullptr;
+                        : nullptr);
 
         if (backend) {
-            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
-            backend_ptrs.push_back(backend);
-            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend.get()));
+            backend_ptrs.push_back(backend.get());
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend.get()));
         } else {
-            backend = backend_cpu;
+            backend = std::move(backend_cpu);
             LOG_INF("%s: CLIP using CPU backend\n", __func__);
         }
 
-        backend_ptrs.push_back(backend_cpu);
-        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
+        backend_ptrs.push_back(backend_cpu.get());
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu.get()));
 
         sched.reset(
             ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
         );
     }
-
-    ~clip_ctx() {
-        ggml_free(ctx_data);
-        gguf_free(ctx_gguf);
-        ggml_backend_buffer_free(buf);
-        ggml_backend_free(backend);
-        if (backend_cpu != backend) {
-            ggml_backend_free(backend_cpu);
-        }
-        clip_image_size_free(load_image_size);
-    }
 };
 
-static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
@@ -382,7 +371,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
     const int n_layer              = hparams.n_layer;
     const float eps                = hparams.eps;
 
-    GGML_ASSERT(imgs->size == 1); // batch_size == 1
+    GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ ctx->buf_compute_meta.size(),
@@ -390,7 +379,9 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
         /*.no_alloc   =*/ true,
     };
 
-    struct ggml_context * ctx0 = ggml_init(params);
+    ggml_context_ptr ctx0_ptr(ggml_init(params));
+    auto ctx0 = ctx0_ptr.get();
+
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     // input raw
@@ -512,12 +503,10 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
 
-    ggml_free(ctx0);
-
     return gf;
 }
 
-static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
         LOG_ERR("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -530,23 +519,20 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     int image_size_width  = image_size;
     int image_size_height = image_size;
     if (ctx->has_minicpmv_projector) {
-        if (load_image_size == nullptr) {
-            load_image_size = clip_image_size_init();
-        }
-        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
-        image_size_width  = load_image_size->width;
-        image_size_height = load_image_size->height;
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
+        image_size_width  = load_image_size.width;
+        image_size_height = load_image_size.height;
         if (is_inf) {
-            image_size_width  = imgs->data->nx;
-            image_size_height = imgs->data->ny;
+            image_size_width  = imgs.entries[0]->nx;
+            image_size_height = imgs.entries[0]->ny;
         }
     }
     else if (ctx->has_qwen2vl_merger) {
         // use the image's native resolution when image is avaible
         if (is_inf) {
         // if (imgs->data->nx && imgs->data->ny) {
-            image_size_width  = imgs->data->nx;
-            image_size_height = imgs->data->ny;
+            image_size_width  = imgs.entries[0]->nx;
+            image_size_height = imgs.entries[0]->ny;
         }
     }
     const int patch_size           = hparams.patch_size;
@@ -561,7 +547,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     const float eps                = hparams.eps;
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
-    const int batch_size = imgs->size;
+    const int batch_size = imgs.entries.size();
 
     if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
@@ -573,7 +559,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         /*.no_alloc   =*/ true,
     };
 
-    struct ggml_context * ctx0 = ggml_init(params);
+    ggml_context_ptr ctx0_ptr(ggml_init(params));
+    auto ctx0 = ctx0_ptr.get();
+
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
@@ -1061,7 +1049,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
                 embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
             }
         } else {
-            GGML_ABORT("fatel error");
+            GGML_ABORT("fatal error");
         }
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
@@ -1081,12 +1069,10 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
 
-    ggml_free(ctx0);
-
     return gf;
 }
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
     if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
         return clip_image_build_graph_siglip(ctx, imgs);
     } else {
@@ -1257,7 +1243,7 @@ struct clip_model_loader {
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true,
         };
-        ctx_clip.ctx_data = ggml_init(params);
+        ctx_clip.ctx_data.reset(ggml_init(params));
         if (!ctx_clip.ctx_data) {
             throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
         }
@@ -1271,7 +1257,7 @@ struct clip_model_loader {
             if (cur) {
                 tensors_to_load.push_back(cur);
                 // add tensors to context
-                struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data, cur);
+                struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
                 ggml_set_name(data_tensor, cur->name);
                 cur = data_tensor;
             }
@@ -1442,11 +1428,11 @@ struct clip_model_loader {
             }
 
             // alloc memory and offload data
-            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
-            ctx_clip.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data, buft);
-            ggml_backend_buffer_set_usage(ctx_clip.buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend.get());
+            ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
+            ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
             for (auto & t : tensors_to_load) {
-                struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data, t->name);
+                struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
                 const size_t offset = tensor_offset[t->name];
                 fin.seekg(offset, std::ios::beg);
                 if (!fin) {
@@ -1471,10 +1457,20 @@ struct clip_model_loader {
 
     void alloc_compute_meta() {
         ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
+
+        // create a fake batch
         clip_image_f32_batch batch;
-        batch.size = 1;
-        batch.data = nullptr;
-        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, &batch, nullptr, false);
+        clip_image_f32_ptr img(clip_image_f32_init());
+        clip_image_size image_size;
+        image_size.width  = clip_get_image_size(&ctx_clip);
+        image_size.height = clip_get_image_size(&ctx_clip);
+        int n_patches = clip_get_image_size(&ctx_clip) / image_size.width;
+        img->nx = n_patches;
+        img->ny = n_patches;
+        img->buf.resize(n_patches * image_size.width * image_size.height * 3);
+        batch.entries.push_back(std::move(img));
+
+        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
         ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
         for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
             ggml_backend_t backend = ctx_clip.backend_ptrs[i];
@@ -1575,11 +1571,11 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
 }
 
 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
-    ctx_clip->load_image_size = load_image_size;
+    ctx_clip->load_image_size = *load_image_size; // copy
 }
 
 struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return ctx_clip->load_image_size;
+    return &ctx_clip->load_image_size;
 }
 
 struct clip_image_size * clip_image_size_init() {
@@ -1597,6 +1593,10 @@ struct clip_image_f32 * clip_image_f32_init() {
     return new clip_image_f32();
 }
 
+struct clip_image_f32_batch * clip_image_f32_batch_init() {
+    return new clip_image_f32_batch();
+}
+
 unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
     if (nx) *nx = img->nx;
     if (ny) *ny = img->ny;
@@ -1609,19 +1609,37 @@ void clip_image_size_free(struct clip_image_size * load_image_size) {
     }
     delete load_image_size;
 }
-void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
-void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch) {
-    if (batch->size > 0) {
-        delete[] batch->data;
-        batch->size = 0;
+void clip_image_u8_free(struct clip_image_u8  * img) { if (img) delete img; }
+void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
+void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
+
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
+    return batch->entries.size();
+}
+
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return 0;
+    }
+    return batch->entries[idx]->nx;
+}
+
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return 0;
     }
+    return batch->entries[idx]->ny;
 }
-void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
-    if (batch->size > 0) {
-        delete[] batch->data;
-        batch->size = 0;
+
+clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return nullptr;
     }
+    return batch->entries[idx].get();
 }
 
 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
@@ -1695,14 +1713,15 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
 }
 
 // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
-static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
-    dst->nx = src->nx;
-    dst->ny = src->ny;
-    dst->buf.resize(src->buf.size());
+static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(src.buf.size());
 
-    for (size_t i = 0; i < src->buf.size(); ++i) {
+    // TODO @ngxson : seems like this could be done more efficiently on cgraph
+    for (size_t i = 0; i < src.buf.size(); ++i) {
         int c = i % 3; // rgb
-        dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
+        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
     }
 }
 
@@ -1710,7 +1729,7 @@ inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
 
-static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
+static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
     const int nx = img.nx;
     const int ny = img.ny;
 
@@ -1848,13 +1867,13 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
     return best_fit;
 }
 
-static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
-    std::vector<clip_image_u8*> patches;
+static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
+    std::vector<clip_image_u8_ptr> patches;
     int width = image.nx;
     int height = image.ny;
     for (int i = 0; i < height; i += patch_size) {
         for (int j = 0; j < width; j += patch_size) {
-            clip_image_u8 *patch = clip_image_u8_init();
+            clip_image_u8_ptr patch(clip_image_u8_init());
             patch->nx = std::min(patch_size, width - j);
             patch->ny = std::min(patch_size, height - i);
             patch->buf.resize(3 * patch->nx * patch->ny);
@@ -1865,7 +1884,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
                     }
                 }
             }
-            patches.push_back(patch);
+            patches.push_back(std::move(patch));
         }
     }
     return patches;
@@ -1946,7 +1965,7 @@ static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int mul
 //    -> https://arxiv.org/pdf/2403.11703
 //    -> https://github.com/thunlp/LLaVA-UHD
 //    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
-static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
     const std::pair<int, int> original_size={img->nx,img->ny};
     const int original_width = img->nx;
     const int original_height = img->ny;
@@ -1954,30 +1973,30 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
     const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
     const int multiple = fmin(ceil(ratio), max_slice_nums);
 
-    std::vector<std::vector<clip_image_u8 *>> images;
+    std::vector<std::vector<clip_image_u8_ptr>> images;
     LOG_DBG("%s: multiple %d\n", __func__, multiple);
-    images.push_back(std::vector<clip_image_u8 *>());
+    images.push_back(std::vector<clip_image_u8_ptr>());
 
     if (multiple <= 1) {
         auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
-        clip_image_u8 * source_image = clip_image_u8_init();
+        clip_image_u8_ptr source_image(clip_image_u8_init());
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
-        images[images.size()-1].push_back(source_image);
+        images.back().push_back(std::move(source_image));
     }
     else if (multiple > 1) {
         auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
-        clip_image_u8 * source_image = clip_image_u8_init();
+        clip_image_u8_ptr source_image(clip_image_u8_init());
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
         LOG_DBG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
-        images[images.size()-1].push_back(source_image);
+        images.back().push_back(std::move(source_image));
 
         std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
         LOG_DBG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
 
         auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
-        clip_image_u8 * refine_image = clip_image_u8_init();
+        clip_image_u8_ptr refine_image(clip_image_u8_init());
         bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
 
         LOG_DBG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
@@ -1988,9 +2007,9 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
         int grid_x = int(width / best_grid.first);
         int grid_y = int(height / best_grid.second);
         for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
-            images.push_back(std::vector<clip_image_u8 *>());
+            images.push_back(std::vector<clip_image_u8_ptr>());
             for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
-                clip_image_u8 * patch = clip_image_u8_init();
+                clip_image_u8_ptr patch(clip_image_u8_init());
                 patch->nx = grid_x;
                 patch->ny = grid_y;
                 patch->buf.resize(3 * patch->nx * patch->ny);
@@ -2003,10 +2022,9 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
                         patch->buf[j+2] = refine_image->buf[i+2];
                     }
                 }
-                images[images.size()-1].push_back(patch);
+                images.back().push_back(std::move(patch));
             }
         }
-        clip_image_u8_free(refine_image);
     }
     return images;
 }
@@ -2014,8 +2032,8 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
 int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
     const int max_slice_nums=9;
     const int scale_resolution=448;
-    const int original_width = ctx_clip->load_image_size->width;
-    const int original_height = ctx_clip->load_image_size->height;
+    const int original_width = ctx_clip->load_image_size.width;
+    const int original_height = ctx_clip->load_image_size.height;
     const float log_ratio = log(1.0*original_width/original_height);
     const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
     const int multiple = fmin(ceil(ratio), max_slice_nums);
@@ -2025,64 +2043,44 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
 
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
 
-    if(clip_is_minicpmv(ctx)){
+    if (clip_is_minicpmv(ctx)) {
         int max_slice_nums = 9;
-        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
-        res_imgs->size = 0;
-        for (size_t i = 0; i < imgs.size(); ++i){
-            res_imgs->size += imgs[i].size();
-        }
-        res_imgs->data = new clip_image_f32[res_imgs->size];
-        int idx = 0;
+        std::vector<std::vector<clip_image_u8_ptr>> imgs = uhd_slice_image(img, max_slice_nums);
         for (size_t i = 0; i < imgs.size(); ++i) {
             for (size_t j = 0; j < imgs[i].size(); ++j) {
                 LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
-                clip_image_f32 * res = clip_image_f32_init();
-                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
-                res_imgs->data[idx++] = *res;
-                clip_image_f32_free(res);
-            }
-        }
-        for (size_t i = 0; i < imgs.size(); ++i) {
-            for (size_t j = 0; j < imgs[i].size(); ++j) {
-                if (imgs[i][j] != nullptr) {
-                    clip_image_u8_free(imgs[i][j]);
-                }
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(*imgs[i][j], *res, ctx->image_mean, ctx->image_std);
+                res_imgs->entries.push_back(std::move(res));
             }
         }
         return true;
     }
     else if (ctx->has_qwen2vl_merger) {
-        clip_image_u8 * resized = clip_image_u8_init();
-        auto patch_size = clip_patch_size(ctx) * 2;
+        clip_image_u8 resized;
+        auto patch_size = clip_get_patch_size(ctx) * 2;
         int nx = ceil((float)img->nx / patch_size) * patch_size;
         int ny = ceil((float)img->ny / patch_size) * patch_size;
-        bicubic_resize(*img, *resized, nx, ny);
+        bicubic_resize(*img, resized, nx, ny);
 
-        res_imgs->data = new clip_image_f32[1];
-        // clip_image_f32 * res = clip_image_f32_init();
-        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
+        // clip_image_f32_ptr res(clip_image_f32_init());
+        normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
         // res_imgs->data[0] = *res;
-        res_imgs->size = 1;
-
-        // clip_image_f32_free(res);
-        clip_image_u8_free(resized);
+        res_imgs->entries.push_back(std::move(img_f32));
         return true;
     }
 
     if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-        res_imgs->size = 1;
-        res_imgs->data = new clip_image_f32[res_imgs->size];
         clip_image_u8 resized_image;
         int32_t sz=ctx->vision_model.hparams.image_size;
         bicubic_resize(*img, resized_image,sz,sz);
-        clip_image_f32 * res = clip_image_f32_init();
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
         //clip_image_save_to_bmp(resized_image, "resized.bmp");
-        normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
-        res_imgs->data[0] = *res;
-        clip_image_f32_free(res);
+        normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
+        res_imgs->entries.push_back(std::move(img_f32));
         return true;
     }
 
@@ -2097,16 +2095,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         pad_to_square = false;
     }
     // free the previous res_imgs if any set
-    if (res_imgs->size > 0) {
-        clip_image_f32_batch_free(res_imgs);
-    }
-    res_imgs->data = nullptr;
-    res_imgs->size = 0;
+    res_imgs->entries.clear();
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
 
-    clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
+    clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
     if (pad_to_square && img->nx != img->ny) {
         int longer_side = std::max(img->nx, img->ny);
         temp->nx = longer_side;
@@ -2149,28 +2143,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             //     clip_image_u8_free(temp2);
             // }
 
-            std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
+            std::vector<clip_image_u8_ptr> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
 
-            clip_image_u8 *image_original_resize = clip_image_u8_init();
+            clip_image_u8_ptr image_original_resize(clip_image_u8_init());
             // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
             bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
-            patches.insert(patches.begin(), image_original_resize);
-            // clip_image_f32_batch_init(patches.size());
-            res_imgs->size = patches.size();
-            res_imgs->data = new clip_image_f32[res_imgs->size];
-            int num=0;
-            for (auto& patch : patches) {
-                normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
-                num++;
-            }
-
-            for (size_t i = 0; i < patches.size(); i++) {
-                // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
-                clip_image_u8_free(patches[i]);
+            patches.insert(patches.begin(), std::move(image_original_resize));
+            for (auto & patch : patches) {
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(*patch, *res, ctx->image_mean, ctx->image_std);
+                res_imgs->entries.push_back(std::move(res));
             }
 
-            clip_image_u8_free(temp);
-
             return true;
         } else {
             temp->nx = img->nx;
@@ -2186,7 +2170,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
 
     const int nx2 = ctx->vision_model.hparams.image_size;
     const int ny2 = ctx->vision_model.hparams.image_size;
-    clip_image_f32 * res = clip_image_f32_init();
+    clip_image_f32_ptr res(clip_image_f32_init());
     res->nx = nx2;
     res->ny = ny2;
     res->buf.resize(3 * nx2 * ny2);
@@ -2238,7 +2222,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             }
         }
     }
-    clip_image_u8_free(temp);
 
     // {
     //     clip_image_u8 * temp2 = clip_image_u8_init();
@@ -2248,10 +2231,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     // }
     // res_imgs.push_back(res);
 
-    res_imgs->size = 1;
-    res_imgs->data = new clip_image_f32[res_imgs->size];
-    res_imgs->data[0] = *res;
-    clip_image_f32_free(res);
+    res_imgs->entries.push_back(std::move(res));
 
     return true;
 }
@@ -2279,15 +2259,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w
     return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
-int32_t clip_image_size(const struct clip_ctx * ctx) {
+int32_t clip_get_image_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_size;
 }
 
-int32_t clip_patch_size(const struct clip_ctx * ctx) {
+int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.patch_size;
 }
 
-int32_t clip_hidden_size(const struct clip_ctx * ctx) {
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.hidden_size;
 }
 
@@ -2434,19 +2414,23 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
         return false;
     }
 
-    clip_image_f32_batch imgs{};
-    imgs.size = 1;
-    imgs.data = img;
+    clip_image_f32_batch imgs;
+    clip_image_f32_ptr img_copy(clip_image_f32_init());
+    *img_copy = *img;
+    imgs.entries.push_back(std::move(img_copy));
+
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
 
-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
+    const clip_image_f32_batch & imgs = *imgs_c_ptr;
+
     if (!ctx->has_vision_encoder) {
         LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
         return false;
     }
 
-    int batch_size = imgs->size;
+    int batch_size = imgs.entries.size();
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
@@ -2473,25 +2457,22 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     int image_size_width  = image_size;
     int image_size_height = image_size;
     if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
-        image_size_width  = imgs->data[0].nx;
-        image_size_height = imgs->data[0].ny;
+        image_size_width  = imgs.entries[0]->nx;
+        image_size_height = imgs.entries[0]->ny;
     }
     const int patch_size    = hparams.patch_size;
     const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
-    if(ctx->load_image_size==nullptr){
-        ctx->load_image_size= clip_image_size_init();
-    }
-    const int pos_w = ctx->load_image_size->width/patch_size;
-    const int pos_h = ctx->load_image_size->height/patch_size;
+    const int pos_w = ctx->load_image_size.width / patch_size;
+    const int pos_h = ctx->load_image_size.height / patch_size;
 
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
         float * data = (float *)malloc(ggml_nbytes(inp_raw));
 
-        for (size_t i = 0; i < imgs->size; i++) {
-            const int nx = imgs->data[i].nx;
-            const int ny = imgs->data[i].ny;
+        for (size_t i = 0; i < imgs.entries.size(); i++) {
+            const int nx = imgs.entries[i]->nx;
+            const int ny = imgs.entries[i]->ny;
             if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
                 GGML_ASSERT(nx == image_size && ny == image_size);
             }
@@ -2502,7 +2483,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 for (int k = 0; k < 3; k++) {
                     for (int y = 0; y < ny; y++) {
                         for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
+                            data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
                         }
                     }
                 }
@@ -2629,7 +2610,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }
 
-    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu.get(), n_threads);
 
     auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
     if (status != GGML_STATUS_SUCCESS) {
@@ -2662,8 +2643,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         /* verbosity */ GGML_LOG_LEVEL_ERROR,
     });
 
-    const auto & ctx_src = ctx_clip->ctx_gguf;
-    const auto & ctx_data = ctx_clip->ctx_data;
+    const auto & ctx_src = ctx_clip->ctx_gguf.get();
+    const auto & ctx_data = ctx_clip->ctx_data.get();
 
     auto * ctx_out = gguf_init_empty();
     gguf_set_kv(ctx_out, ctx_src);
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index f61e0c0b2b3..cc133a58de3 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -30,15 +30,8 @@ struct clip_image_size {
     int height;
 };
 
-struct clip_image_u8_batch {
-    struct clip_image_u8 * data;
-    size_t size;
-};
-
-struct clip_image_f32_batch {
-    struct clip_image_f32 * data;
-    size_t size;
-};
+struct clip_image_u8_batch;
+struct clip_image_f32_batch;
 
 struct clip_context_params {
     bool use_gpu;
@@ -55,9 +48,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
 
-CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 
 // TODO: should be enum, not string
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
@@ -73,9 +66,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
 
-CLIP_API struct clip_image_size * clip_image_size_init();
-CLIP_API struct clip_image_u8  * clip_image_u8_init ();
-CLIP_API struct clip_image_f32 * clip_image_f32_init();
+CLIP_API struct clip_image_size      * clip_image_size_init();
+CLIP_API struct clip_image_u8        * clip_image_u8_init ();
+CLIP_API struct clip_image_f32       * clip_image_f32_init();
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
 
 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
@@ -86,6 +80,12 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
+// use for accessing underlay data of clip_image_f32_batch
+CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
 /**
  * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
  * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 518aad3f1f7..03a22cbb4c2 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -10,6 +10,7 @@
 #include <cstring>
 #include <limits>
 #include <vector>
+#include <memory>
 
 #if defined(LLAVA_LOG_OFF)
 #   define LOG_INF(...)
@@ -45,6 +46,17 @@ struct clip_image_grid_shape {
     int second;
 };
 
+// convenience cpp wrapper
+struct clip_image_f32_batch_deleter {
+    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+};
+typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
+
+struct clip_image_size_deleter {
+    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+};
+typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
+
 /**
  * Selects the best resolution from a list of possible resolutions based on the original size.
  *
@@ -105,8 +117,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
         struct ggml_context * ctx;
     } model;
 
-    const int32_t image_size = clip_image_size(ctx_clip);
-    const int32_t patch_size = clip_patch_size(ctx_clip);
+    const int32_t image_size = clip_get_image_size(ctx_clip);
+    const int32_t patch_size = clip_get_patch_size(ctx_clip);
 
     int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
 
@@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
 
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
-    clip_image_f32_batch img_res_v;
-    img_res_v.size = 0;
-    img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
+    clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
+    if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
         LOG_ERR("%s: unable to preprocess image\n", __func__);
-        delete[] img_res_v.data;
         return false;
     }
 
@@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
+    const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
+
     if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
         std::vector<float *> image_embd_v;
-        image_embd_v.resize(img_res_v.size);
-        struct clip_image_size * load_image_size = clip_image_size_init();
+        image_embd_v.resize(n_imgs);
+        clip_image_size load_image_size;
 
-        for (size_t i = 0; i < img_res_v.size; i++) {
+        for (size_t i = 0; i < n_imgs; i++) {
             const int64_t t_img_enc_step_start_us = ggml_time_us();
-            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
-            int patch_size=14;
-            load_image_size->width = img_res_v.data[i].nx;
-            load_image_size->height = img_res_v.data[i].ny;
-            clip_add_load_image_size(ctx_clip, load_image_size);
+            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
+            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
+            int patch_size = 14;
+            load_image_size.width = nx;
+            load_image_size.height = ny;
+            clip_add_load_image_size(ctx_clip, &load_image_size);
 
             bool encoded = false;
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
             if (clip_is_qwen2vl(ctx_clip)) {
-                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+                encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
             }
             else {
-                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
             }
 
             if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
                 return false;
             }
             const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
         int n_img_pos_out = 0;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
+            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
+            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
             std::memcpy(
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
-                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
+                clip_embd_nbytes_by_img(ctx_clip, nx, ny));
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
             free(image_embd_v[i]);
         }
         image_embd_v.clear();
-        load_image_size->width = img->nx;
-        load_image_size->height = img->ny;
-        clip_add_load_image_size(ctx_clip, load_image_size);
-        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
-        delete[] img_res_v.data;
-        img_res_v.size = 0;
-        img_res_v.data = nullptr;
+        load_image_size.width = img->nx;
+        load_image_size.height = img->ny;
+        clip_add_load_image_size(ctx_clip, &load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
     }
     else if (clip_is_glm(ctx_clip)){
         struct clip_image_size * load_image_size = clip_image_size_init();
-        load_image_size->width = img_res_v.data[0].nx;
-        load_image_size->height = img_res_v.data[0].ny;
+        load_image_size->width  = clip_image_f32_batch_nx(img_res_v.get(), 0);
+        load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
         clip_add_load_image_size(ctx_clip, load_image_size);
 
-        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
-        int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
+        int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
         *n_img_pos = (pos * pos + 2);
         if (!encoded){
             LOG_ERR("Unable to encode image \n");
@@ -328,8 +343,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
-        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
-        delete[] img_res_v.data;
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");
 
@@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
         std::vector<float *> image_embd_v;
-        image_embd_v.resize(img_res_v.size);
-        for (size_t i = 0; i < img_res_v.size; i++) {
+        image_embd_v.resize(n_imgs);
+        for (size_t i = 0; i < n_imgs; i++) {
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+            const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
             if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
                 return false;
             }
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
         const int32_t * image_grid = clip_image_grid(ctx_clip);
         const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
@@ -360,12 +376,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
         }
 
-        // free all img_res_v - not needed anymore
-        delete[] img_res_v.data;
-        img_res_v.size = 0;
-        img_res_v.data = nullptr;
-
-        const int32_t image_size = clip_image_size(ctx_clip);
+        const int32_t image_size = clip_get_image_size(ctx_clip);
 
         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
 
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 58503d0b22c..114c274bc12 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -41,14 +41,14 @@ struct mtmd_context {
 };
 
 struct mtmd_image_tokens_data {
-    clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
+    clip_image_f32_batch batch_f32; // preprocessed image patches
 };
 
 struct mtmd_image_tokens {
     uint32_t nx; // number of tokens in x direction
     uint32_t ny; // number of tokens in y direction
     uint32_t n_tokens() const { return nx * ny; }
-    clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
+    clip_image_f32_batch batch_f32; // preprocessed image patches
 };
 
 mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -141,8 +141,8 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
 
             // preprocess image
-            clip_image_f32_batch_ptr batch_f32(new clip_image_f32_batch);
-            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
+            clip_image_f32_batch batch_f32;
+            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
                 return nullptr;
@@ -181,7 +181,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     bool ok = clip_image_batch_encode(
         ctx->ctx_clip,
         ctx->n_threads,
-        image_tokens->batch_f32.get(),
+        &image_tokens->batch_f32,
         ctx->image_embd_v.data());
     return ok ? 0 : 1;
 }

From 06bb53ad9b6e6d92b6ab6979927530080b1c990c Mon Sep 17 00:00:00 2001
From: Yuxuan Zhang <2448370773@qq.com>
Date: Fri, 11 Apr 2025 18:10:10 +0800
Subject: [PATCH 2/4] llama-model : add Glm4Model implementation for GLM-4-0414
 (#12867)

* GLM-4-0414

* use original one

* Using with tensor map

* fix bug

* change order

* change order

* format with flask8
---
 README.md                      |   1 +
 convert_hf_to_gguf.py          |  20 +++-
 convert_hf_to_gguf_update.py   |   1 +
 gguf-py/gguf/constants.py      |  19 +++
 gguf-py/gguf/tensor_mapping.py |   8 +-
 src/llama-arch.cpp             |  20 ++++
 src/llama-arch.h               |   3 +
 src/llama-model.cpp            | 204 +++++++++++++++++++++++++++++++++
 src/llama-vocab.cpp            |   1 +
 9 files changed, 273 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index e56042f1cb2..a129d27d54c 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
+- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4aff03a34bb..2bf97475f78 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -735,6 +735,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
 
         if res is None:
             logger.warning("\n")
@@ -4897,6 +4900,22 @@ def prepare_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
+@Model.register("Glm4ForCausalLM")
+class Glm4Model(Model):
+    model_arch = gguf.MODEL_ARCH.GLM4
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
 @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
     model_arch = gguf.MODEL_ARCH.CHATGLM
@@ -5588,7 +5607,6 @@ def main() -> None:
     with torch.inference_mode():
         output_type = ftype_map[args.outtype]
         model_architecture = hparams["architectures"][0]
-
         try:
             model_class = Model.from_model_architecture(model_architecture)
         except NotImplementedError:
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index ce6104da4a0..160c9fe0e61 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -114,6 +114,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "trillion",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
     {"name": "bailingmoe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
     {"name": "llama4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
+    {"name": "glm4",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
 ]
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0410654dde2..162070e6e19 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -280,6 +280,7 @@ class MODEL_ARCH(IntEnum):
     DEEPSEEK         = auto()
     DEEPSEEK2        = auto()
     CHATGLM          = auto()
+    GLM4             = auto()
     BITNET           = auto()
     T5               = auto()
     T5ENCODER        = auto()
@@ -487,6 +488,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DEEPSEEK:         "deepseek",
     MODEL_ARCH.DEEPSEEK2:        "deepseek2",
     MODEL_ARCH.CHATGLM:          "chatglm",
+    MODEL_ARCH.GLM4:             "glm4",
     MODEL_ARCH.BITNET:           "bitnet",
     MODEL_ARCH.T5:               "t5",
     MODEL_ARCH.T5ENCODER:        "t5encoder",
@@ -1561,6 +1563,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GLM4 : [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
     MODEL_ARCH.BITNET: [
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index a9e681f8e67..35154e9b5da 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2
+            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -241,7 +241,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm",     # gemma2 olmo2
+            "model.layers.{bid}.post_attention_layernorm",     # gemma2 olmo2    # ge
+            "model.layers.{bid}.post_self_attn_layernorm",     # glm-4-0414
         ),
 
         # Rotary embeddings
@@ -278,6 +279,7 @@ class TensorNameMap:
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
             "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
+            "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -316,7 +318,7 @@ class TensorNameMap:
             "h.{bid}.mlp.c_fc",                                       # gpt2
             "transformer.h.{bid}.mlp.fc1",                            # phi2
             "model.layers.{bid}.mlp.fc1",                             # phi2
-            "model.layers.{bid}.mlp.gate_up_proj",                    # phi3
+            "model.layers.{bid}.mlp.gate_up_proj",                    # phi3 glm-4-0414
             "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
             "model.layers.{bid}.feed_forward.w3",                     # internlm2
             "encoder.layers.{bid}.mlp.fc11",                          # nomic-bert
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 264f8c5b98f..a6fddc7fd2e 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -54,6 +54,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"         },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
+    { LLM_ARCH_GLM4,             "glm4"             },
     { LLM_ARCH_BITNET,           "bitnet"           },
     { LLM_ARCH_T5,               "t5"               },
     { LLM_ARCH_T5ENCODER,        "t5encoder"        },
@@ -1152,6 +1153,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2019352812d..2c2099b3c38 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -58,6 +58,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -256,6 +257,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ff847701e99..b74dd72cfbf 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1205,6 +1205,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GLM4:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_9B; break;
+                    case 61: type = LLM_TYPE_32B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_BITNET:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3476,6 +3485,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_GLM4:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        }
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
+
+                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_NEMOTRON:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -10854,6 +10902,157 @@ struct llm_build_chatglm : public llm_graph_context {
     }
 };
 
+struct llm_build_glm4 : public llm_graph_context {
+    llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // Pre-attention norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm,
+                    NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = nullptr;
+                ggml_tensor * Kcur = nullptr;
+                ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv == nullptr) {
+                    Qcur = build_lora_mm(model.layers[il].wq, cur);
+                    if (model.layers[il].bq) {
+                        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    }
+                    Kcur = build_lora_mm(model.layers[il].wk, cur);
+                    if (model.layers[il].bk) {
+                        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    }
+                    Vcur = build_lora_mm(model.layers[il].wv, cur);
+                    if (model.layers[il].bv) {
+                        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    }
+                } else {
+                    cur = build_lora_mm(model.layers[il].wqkv, cur);
+                    cb(cur, "wqkv", il);
+                    if (model.layers[il].bqkv) {
+                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                        cb(cur, "bqkv", il);
+                    }
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // Post-attention norm (new!)
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm,
+                    NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
+            // Add the input (residual connection after post-attention norm)
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                // Pre-MLP norm
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm,
+                        NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                // MLP
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        NULL,                      NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+                cb(cur, "ffn_out", il);
+
+                // Post-MLP norm
+                cur = build_norm(cur,
+                        model.layers[il].ffn_post_norm,
+                        NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "post_mlp_norm", il);
+            }
+
+            // Add residual connection after post-MLP norm
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        // Final norm
+        cur = build_norm(inpL,
+                model.output_norm,
+                NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // Output projection
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12735,6 +12934,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
             } break;
+        case LLM_ARCH_GLM4:
+            {
+                llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -12932,6 +13135,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0feabd95aaf..464ff01e06f 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1572,6 +1572,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
                 clean_spaces = false;
             } else if (
+                tokenizer_pre == "glm4" ||
                 tokenizer_pre == "chatglm-bpe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                 special_bos_id = LLAMA_TOKEN_NULL;

From b2034c2b55b36b2192bdefb3b295db2a911370f5 Mon Sep 17 00:00:00 2001
From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Date: Fri, 11 Apr 2025 20:01:56 +0800
Subject: [PATCH 3/4] contrib: support modelscope community (#12664)

* support download from modelscope

* support login

* remove comments

* add arguments

* fix code

* fix win32

* test passed

* fix readme

* revert readme

* change to MODEL_ENDPOINT

* revert tail line

* fix readme

* refactor model endpoint

* remove blank line

* fix header

* fix as comments

* update comment

* update readme

---------

Co-authored-by: tastelikefeet <yuze.zyz@alibaba-inc/com>
---
 README.md            |  4 +++-
 common/arg.cpp       | 17 ++++++++---------
 common/common.cpp    | 13 +++++++++++++
 common/common.h      |  2 ++
 examples/run/run.cpp |  6 ++++--
 5 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index a129d27d54c..cf45f23cf44 100644
--- a/README.md
+++ b/README.md
@@ -260,7 +260,9 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
 
-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+
+By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
 
 After downloading a model, use the CLI tools to run it locally - see below.
 
diff --git a/common/arg.cpp b/common/arg.cpp
index 642fefb5754..0b57f9da1ee 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -228,12 +228,13 @@ static bool common_download_file_single(const std::string & url, const std::stri
     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
     // Check if hf-token or bearer-token was specified
     if (!bearer_token.empty()) {
         std::string auth_header = "Authorization: Bearer " + bearer_token;
         http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
     }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
 
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
@@ -544,7 +545,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
     curl_slist_ptr http_headers;
     std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+
+    std::string model_endpoint = get_model_endpoint();
+
+    std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
     curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
     typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
@@ -659,13 +663,8 @@ static void common_params_handle_model(
                 }
             }
 
-            std::string hf_endpoint = "https://huggingface.co/";
-            const char * hf_endpoint_env = getenv("HF_ENDPOINT");
-            if (hf_endpoint_env) {
-                hf_endpoint = hf_endpoint_env;
-                if (hf_endpoint.back() != '/') hf_endpoint += '/';
-            }
-            model.url = hf_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
+            std::string model_endpoint = get_model_endpoint();
+            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
             // make sure model path is present (for caching purposes)
             if (model.path.empty()) {
                 // this is to avoid different repo having same file name, or same file name in different subdirs
diff --git a/common/common.cpp b/common/common.cpp
index d4882c5123c..4e1a020d017 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1027,6 +1027,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
+std::string get_model_endpoint() {
+    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+    std::string model_endpoint = "https://huggingface.co/";
+    if (endpoint_env) {
+        model_endpoint = endpoint_env;
+        if (model_endpoint.back() != '/') model_endpoint += '/';
+    }
+    return model_endpoint;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
diff --git a/common/common.h b/common/common.h
index 725b5123d24..e6eaa8e80cf 100644
--- a/common/common.h
+++ b/common/common.h
@@ -543,6 +543,8 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
+std::string                   get_model_endpoint();
+
 //
 // Batch utils
 //
diff --git a/examples/run/run.cpp b/examples/run/run.cpp
index 68e94b0b3c3..e63c2aac331 100644
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -697,8 +697,10 @@ class LlamaData {
         std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
         std::string              url;
 
+        std::string model_endpoint = get_model_endpoint();
+
         if (pos == std::string::npos) {
-            auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
+            auto [model_name, manifest_url] = extract_model_and_tag(model, model_endpoint + "v2/");
             hfr                             = model_name;
 
             nlohmann::json manifest;
@@ -713,7 +715,7 @@ class LlamaData {
             hff = model.substr(pos + 1);
         }
 
-        url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+        url = model_endpoint + hfr + "/resolve/main/" + hff;
 
         return download(url, bn, true, headers);
     }

From 578754b3157d662c2fdd51eaa62b6c1f43d3172c Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan.cr@gmail.com>
Date: Fri, 11 Apr 2025 15:32:14 +0200
Subject: [PATCH 4/4] sycl: Support sycl_ext_oneapi_limited_graph (#12873)

The current usage of the SYCL-Graph extension checks for
the `sycl_ext_oneapi_graph` device aspect. However, it is also
possible to support `sycl_ext_oneapi_limied_graph` devices that
don't support update
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 236b039e618..3e48a9244d3 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3653,7 +3653,8 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
 
 #ifdef GGML_SYCL_GRAPH
     if (!g_ggml_sycl_disable_graph) {
-        if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
+        const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph);
+        if (!graph_support) {
             GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
             ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
             return GGML_STATUS_SUCCESS;
@@ -3664,8 +3665,10 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
         ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
         model_sycl_graph.end_recording();
 
-        if (!sycl_ctx->exec_graph) {
-            auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+        const bool graph_update_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph);
+        if (!sycl_ctx->exec_graph || !graph_update_support) {
+            auto exec_graph = graph_update_support ? model_sycl_graph.finalize(sycl_ex::property::graph::updatable{}) :
+                                                     model_sycl_graph.finalize();
             sycl_ctx->exec_graph = std::make_unique<
                 sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
         } else {