From 0c509239445088f21265580b86733c5b184261d9 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 11 Apr 2025 12:09:39 +0200 Subject: [PATCH 1/4] =?UTF-8?q?clip=20:=20use=20smart=20pointer=20(?= =?UTF-8?q?=E2=9A=A0=EF=B8=8F=20breaking=20change)=20(#12869)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * clip : use smart pointers * fix warmup * add forward declaration * misisng include * fix include (2) * composite * simplify batch ptr * fix conflict --- examples/llava/clip-impl.h | 23 ++- examples/llava/clip.cpp | 343 ++++++++++++++++++------------------- examples/llava/clip.h | 30 ++-- examples/llava/llava.cpp | 103 ++++++----- examples/llava/mtmd.cpp | 10 +- 5 files changed, 255 insertions(+), 254 deletions(-) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 4c035298749..4d7340a56bd 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -1,5 +1,6 @@ #include "ggml.h" #include "gguf.h" +#include "clip.h" #include "clip.h" @@ -202,23 +203,31 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, .. // cpp wrappers // +// wrapper for clip_image_size +struct clip_image_size_deleter { + void operator()(clip_image_size * val) { clip_image_size_free(val); } +}; +typedef std::unique_ptr clip_image_size_ptr; + +// wrapper for clip_image_u8 struct clip_image_u8_deleter { void operator()(clip_image_u8 * val) { clip_image_u8_free(val); } }; +typedef std::unique_ptr clip_image_u8_ptr; +// wrapper for clip_image_f32 struct clip_image_f32_deleter { void operator()(clip_image_f32 * val) { clip_image_f32_free(val); } }; +typedef std::unique_ptr clip_image_f32_ptr; -struct clip_image_f32_batch_deleter { - void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); } +struct clip_image_u8_batch { + std::vector entries; }; -typedef std::unique_ptr clip_image_u8_ptr; -typedef std::unique_ptr clip_image_f32_ptr; -typedef std::unique_ptr clip_image_f32_batch_ptr; - -// TODO @ngxson : we're currently having a naming clash between struct clip_image_size and function clip_image_size() +struct clip_image_f32_batch { + std::vector entries; +}; // // common utils diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 710309edaec..a55b3f38351 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -315,58 +315,47 @@ struct clip_ctx { bool use_gelu = false; bool use_silu = false; - struct gguf_context * ctx_gguf = nullptr; - struct ggml_context * ctx_data = nullptr; + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; std::vector buf_compute_meta; std::vector backend_ptrs; std::vector backend_buft; - ggml_backend_t backend = nullptr; - ggml_backend_t backend_cpu = nullptr; - ggml_backend_buffer_t buf = nullptr; + ggml_backend_ptr backend; + ggml_backend_ptr backend_cpu; + ggml_backend_buffer_ptr buf; ggml_backend_sched_ptr sched; - struct clip_image_size * load_image_size = nullptr; + clip_image_size load_image_size; clip_ctx(clip_context_params & ctx_params) { - backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - backend = ctx_params.use_gpu + backend_cpu = ggml_backend_ptr(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr)); + backend = ggml_backend_ptr(ctx_params.use_gpu ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) - : nullptr; + : nullptr); if (backend) { - LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); - backend_ptrs.push_back(backend); - backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend.get())); + backend_ptrs.push_back(backend.get()); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend.get())); } else { - backend = backend_cpu; + backend = std::move(backend_cpu); LOG_INF("%s: CLIP using CPU backend\n", __func__); } - backend_ptrs.push_back(backend_cpu); - backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); + backend_ptrs.push_back(backend_cpu.get()); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu.get())); sched.reset( ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false) ); } - - ~clip_ctx() { - ggml_free(ctx_data); - gguf_free(ctx_gguf); - ggml_backend_buffer_free(buf); - ggml_backend_free(backend); - if (backend_cpu != backend) { - ggml_backend_free(backend_cpu); - } - clip_image_size_free(load_image_size); - } }; -static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) { +static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; @@ -382,7 +371,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im const int n_layer = hparams.n_layer; const float eps = hparams.eps; - GGML_ASSERT(imgs->size == 1); // batch_size == 1 + GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), @@ -390,7 +379,9 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im /*.no_alloc =*/ true, }; - struct ggml_context * ctx0 = ggml_init(params); + ggml_context_ptr ctx0_ptr(ggml_init(params)); + auto ctx0 = ctx0_ptr.get(); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); // input raw @@ -512,12 +503,10 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im // build the graph ggml_build_forward_expand(gf, embeddings); - ggml_free(ctx0); - return gf; } -static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { +static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_ERR("This gguf file seems to have no vision encoder\n"); return nullptr; @@ -530,23 +519,20 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im int image_size_width = image_size; int image_size_height = image_size; if (ctx->has_minicpmv_projector) { - if (load_image_size == nullptr) { - load_image_size = clip_image_size_init(); - } - LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); - image_size_width = load_image_size->width; - image_size_height = load_image_size->height; + LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); + image_size_width = load_image_size.width; + image_size_height = load_image_size.height; if (is_inf) { - image_size_width = imgs->data->nx; - image_size_height = imgs->data->ny; + image_size_width = imgs.entries[0]->nx; + image_size_height = imgs.entries[0]->ny; } } else if (ctx->has_qwen2vl_merger) { // use the image's native resolution when image is avaible if (is_inf) { // if (imgs->data->nx && imgs->data->ny) { - image_size_width = imgs->data->nx; - image_size_height = imgs->data->ny; + image_size_width = imgs.entries[0]->nx; + image_size_height = imgs.entries[0]->ny; } } const int patch_size = hparams.patch_size; @@ -561,7 +547,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im const float eps = hparams.eps; int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - const int batch_size = imgs->size; + const int batch_size = imgs.entries.size(); if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { GGML_ASSERT(batch_size == 1); @@ -573,7 +559,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im /*.no_alloc =*/ true, }; - struct ggml_context * ctx0 = ggml_init(params); + ggml_context_ptr ctx0_ptr(ggml_init(params)); + auto ctx0 = ctx0_ptr.get(); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); @@ -1061,7 +1049,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); } } else { - GGML_ABORT("fatel error"); + GGML_ABORT("fatal error"); } } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { @@ -1081,12 +1069,10 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im // build the graph ggml_build_forward_expand(gf, embeddings); - ggml_free(ctx0); - return gf; } -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { return clip_image_build_graph_siglip(ctx, imgs); } else { @@ -1257,7 +1243,7 @@ struct clip_model_loader { /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ctx_clip.ctx_data = ggml_init(params); + ctx_clip.ctx_data.reset(ggml_init(params)); if (!ctx_clip.ctx_data) { throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__)); } @@ -1271,7 +1257,7 @@ struct clip_model_loader { if (cur) { tensors_to_load.push_back(cur); // add tensors to context - struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data, cur); + struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); ggml_set_name(data_tensor, cur->name); cur = data_tensor; } @@ -1442,11 +1428,11 @@ struct clip_model_loader { } // alloc memory and offload data - ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); - ctx_clip.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data, buft); - ggml_backend_buffer_set_usage(ctx_clip.buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend.get()); + ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); + ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); for (auto & t : tensors_to_load) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data, t->name); + struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); const size_t offset = tensor_offset[t->name]; fin.seekg(offset, std::ios::beg); if (!fin) { @@ -1471,10 +1457,20 @@ struct clip_model_loader { void alloc_compute_meta() { ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); + + // create a fake batch clip_image_f32_batch batch; - batch.size = 1; - batch.data = nullptr; - ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, &batch, nullptr, false); + clip_image_f32_ptr img(clip_image_f32_init()); + clip_image_size image_size; + image_size.width = clip_get_image_size(&ctx_clip); + image_size.height = clip_get_image_size(&ctx_clip); + int n_patches = clip_get_image_size(&ctx_clip) / image_size.width; + img->nx = n_patches; + img->ny = n_patches; + img->buf.resize(n_patches * image_size.width * image_size.height * 3); + batch.entries.push_back(std::move(img)); + + ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false); ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { ggml_backend_t backend = ctx_clip.backend_ptrs[i]; @@ -1575,11 +1571,11 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p } void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { - ctx_clip->load_image_size = load_image_size; + ctx_clip->load_image_size = *load_image_size; // copy } struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { - return ctx_clip->load_image_size; + return &ctx_clip->load_image_size; } struct clip_image_size * clip_image_size_init() { @@ -1597,6 +1593,10 @@ struct clip_image_f32 * clip_image_f32_init() { return new clip_image_f32(); } +struct clip_image_f32_batch * clip_image_f32_batch_init() { + return new clip_image_f32_batch(); +} + unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { if (nx) *nx = img->nx; if (ny) *ny = img->ny; @@ -1609,19 +1609,37 @@ void clip_image_size_free(struct clip_image_size * load_image_size) { } delete load_image_size; } -void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } -void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; +void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; } +void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; } +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; } + +size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) { + return batch->entries.size(); +} + +size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return 0; + } + return batch->entries[idx]->nx; +} + +size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return 0; } + return batch->entries[idx]->ny; } -void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; + +clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return nullptr; } + return batch->entries[idx].get(); } void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { @@ -1695,14 +1713,15 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta } // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not -static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { - dst->nx = src->nx; - dst->ny = src->ny; - dst->buf.resize(src->buf.size()); +static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); - for (size_t i = 0; i < src->buf.size(); ++i) { + // TODO @ngxson : seems like this could be done more efficiently on cgraph + for (size_t i = 0; i < src.buf.size(); ++i) { int c = i % 3; // rgb - dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; + dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; } } @@ -1710,7 +1729,7 @@ inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); } -static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { +static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { const int nx = img.nx; const int ny = img.ny; @@ -1848,13 +1867,13 @@ static std::pair select_best_resolution(const std::pair & or return best_fit; } -static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { - std::vector patches; +static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { + std::vector patches; int width = image.nx; int height = image.ny; for (int i = 0; i < height; i += patch_size) { for (int j = 0; j < width; j += patch_size) { - clip_image_u8 *patch = clip_image_u8_init(); + clip_image_u8_ptr patch(clip_image_u8_init()); patch->nx = std::min(patch_size, width - j); patch->ny = std::min(patch_size, height - i); patch->buf.resize(3 * patch->nx * patch->ny); @@ -1865,7 +1884,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im } } } - patches.push_back(patch); + patches.push_back(std::move(patch)); } } return patches; @@ -1946,7 +1965,7 @@ static std::pair uhd_best_grid(const int max_slice_nums, const int mul // -> https://arxiv.org/pdf/2403.11703 // -> https://github.com/thunlp/LLaVA-UHD // -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 -static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { +static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { const std::pair original_size={img->nx,img->ny}; const int original_width = img->nx; const int original_height = img->ny; @@ -1954,30 +1973,30 @@ static std::vector> uhd_slice_image(const clip_imag const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); const int multiple = fmin(ceil(ratio), max_slice_nums); - std::vector> images; + std::vector> images; LOG_DBG("%s: multiple %d\n", __func__, multiple); - images.push_back(std::vector()); + images.push_back(std::vector()); if (multiple <= 1) { auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); - clip_image_u8 * source_image = clip_image_u8_init(); + clip_image_u8_ptr source_image(clip_image_u8_init()); bicubic_resize(*img, *source_image, best_size.first, best_size.second); // source_image = image.resize(best_size, Image.Resampling.BICUBIC) - images[images.size()-1].push_back(source_image); + images.back().push_back(std::move(source_image)); } else if (multiple > 1) { auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); - clip_image_u8 * source_image = clip_image_u8_init(); + clip_image_u8_ptr source_image(clip_image_u8_init()); bicubic_resize(*img, *source_image, best_size.first, best_size.second); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) LOG_DBG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); - images[images.size()-1].push_back(source_image); + images.back().push_back(std::move(source_image)); std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); LOG_DBG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); - clip_image_u8 * refine_image = clip_image_u8_init(); + clip_image_u8_ptr refine_image(clip_image_u8_init()); bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); LOG_DBG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); @@ -1988,9 +2007,9 @@ static std::vector> uhd_slice_image(const clip_imag int grid_x = int(width / best_grid.first); int grid_y = int(height / best_grid.second); for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ - images.push_back(std::vector()); + images.push_back(std::vector()); for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ - clip_image_u8 * patch = clip_image_u8_init(); + clip_image_u8_ptr patch(clip_image_u8_init()); patch->nx = grid_x; patch->ny = grid_y; patch->buf.resize(3 * patch->nx * patch->ny); @@ -2003,10 +2022,9 @@ static std::vector> uhd_slice_image(const clip_imag patch->buf[j+2] = refine_image->buf[i+2]; } } - images[images.size()-1].push_back(patch); + images.back().push_back(std::move(patch)); } } - clip_image_u8_free(refine_image); } return images; } @@ -2014,8 +2032,8 @@ static std::vector> uhd_slice_image(const clip_imag int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { const int max_slice_nums=9; const int scale_resolution=448; - const int original_width = ctx_clip->load_image_size->width; - const int original_height = ctx_clip->load_image_size->height; + const int original_width = ctx_clip->load_image_size.width; + const int original_height = ctx_clip->load_image_size.height; const float log_ratio = log(1.0*original_width/original_height); const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); const int multiple = fmin(ceil(ratio), max_slice_nums); @@ -2025,64 +2043,44 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { - if(clip_is_minicpmv(ctx)){ + if (clip_is_minicpmv(ctx)) { int max_slice_nums = 9; - std::vector> imgs = uhd_slice_image(img, max_slice_nums); - res_imgs->size = 0; - for (size_t i = 0; i < imgs.size(); ++i){ - res_imgs->size += imgs[i].size(); - } - res_imgs->data = new clip_image_f32[res_imgs->size]; - int idx = 0; + std::vector> imgs = uhd_slice_image(img, max_slice_nums); for (size_t i = 0; i < imgs.size(); ++i) { for (size_t j = 0; j < imgs[i].size(); ++j) { LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); - clip_image_f32 * res = clip_image_f32_init(); - normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); - res_imgs->data[idx++] = *res; - clip_image_f32_free(res); - } - } - for (size_t i = 0; i < imgs.size(); ++i) { - for (size_t j = 0; j < imgs[i].size(); ++j) { - if (imgs[i][j] != nullptr) { - clip_image_u8_free(imgs[i][j]); - } + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i][j], *res, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(res)); } } return true; } else if (ctx->has_qwen2vl_merger) { - clip_image_u8 * resized = clip_image_u8_init(); - auto patch_size = clip_patch_size(ctx) * 2; + clip_image_u8 resized; + auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; int ny = ceil((float)img->ny / patch_size) * patch_size; - bicubic_resize(*img, *resized, nx, ny); + bicubic_resize(*img, resized, nx, ny); - res_imgs->data = new clip_image_f32[1]; - // clip_image_f32 * res = clip_image_f32_init(); - normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + // clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std); // res_imgs->data[0] = *res; - res_imgs->size = 1; - - // clip_image_f32_free(res); - clip_image_u8_free(resized); + res_imgs->entries.push_back(std::move(img_f32)); return true; } if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - res_imgs->size = 1; - res_imgs->data = new clip_image_f32[res_imgs->size]; clip_image_u8 resized_image; int32_t sz=ctx->vision_model.hparams.image_size; bicubic_resize(*img, resized_image,sz,sz); - clip_image_f32 * res = clip_image_f32_init(); + clip_image_f32_ptr img_f32(clip_image_f32_init()); //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std); - res_imgs->data[0] = *res; - clip_image_f32_free(res); + normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(img_f32)); return true; } @@ -2097,16 +2095,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli pad_to_square = false; } // free the previous res_imgs if any set - if (res_imgs->size > 0) { - clip_image_f32_batch_free(res_imgs); - } - res_imgs->data = nullptr; - res_imgs->size = 0; + res_imgs->entries.clear(); // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily + clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily if (pad_to_square && img->nx != img->ny) { int longer_side = std::max(img->nx, img->ny); temp->nx = longer_side; @@ -2149,28 +2143,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // clip_image_u8_free(temp2); // } - std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) + std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) - clip_image_u8 *image_original_resize = clip_image_u8_init(); + clip_image_u8_ptr image_original_resize(clip_image_u8_init()); // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square - patches.insert(patches.begin(), image_original_resize); - // clip_image_f32_batch_init(patches.size()); - res_imgs->size = patches.size(); - res_imgs->data = new clip_image_f32[res_imgs->size]; - int num=0; - for (auto& patch : patches) { - normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std); - num++; - } - - for (size_t i = 0; i < patches.size(); i++) { - // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); - clip_image_u8_free(patches[i]); + patches.insert(patches.begin(), std::move(image_original_resize)); + for (auto & patch : patches) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*patch, *res, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(res)); } - clip_image_u8_free(temp); - return true; } else { temp->nx = img->nx; @@ -2186,7 +2170,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli const int nx2 = ctx->vision_model.hparams.image_size; const int ny2 = ctx->vision_model.hparams.image_size; - clip_image_f32 * res = clip_image_f32_init(); + clip_image_f32_ptr res(clip_image_f32_init()); res->nx = nx2; res->ny = ny2; res->buf.resize(3 * nx2 * ny2); @@ -2238,7 +2222,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } - clip_image_u8_free(temp); // { // clip_image_u8 * temp2 = clip_image_u8_init(); @@ -2248,10 +2231,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // } // res_imgs.push_back(res); - res_imgs->size = 1; - res_imgs->data = new clip_image_f32[res_imgs->size]; - res_imgs->data[0] = *res; - clip_image_f32_free(res); + res_imgs->entries.push_back(std::move(res)); return true; } @@ -2279,15 +2259,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } -int32_t clip_image_size(const struct clip_ctx * ctx) { +int32_t clip_get_image_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_size; } -int32_t clip_patch_size(const struct clip_ctx * ctx) { +int32_t clip_get_patch_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.patch_size; } -int32_t clip_hidden_size(const struct clip_ctx * ctx) { +int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.hidden_size; } @@ -2434,19 +2414,23 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 return false; } - clip_image_f32_batch imgs{}; - imgs.size = 1; - imgs.data = img; + clip_image_f32_batch imgs; + clip_image_f32_ptr img_copy(clip_image_f32_init()); + *img_copy = *img; + imgs.entries.push_back(std::move(img_copy)); + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); } -bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + if (!ctx->has_vision_encoder) { LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); return false; } - int batch_size = imgs->size; + int batch_size = imgs.entries.size(); if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } @@ -2473,25 +2457,22 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima int image_size_width = image_size; int image_size_height = image_size; if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { - image_size_width = imgs->data[0].nx; - image_size_height = imgs->data[0].ny; + image_size_width = imgs.entries[0]->nx; + image_size_height = imgs.entries[0]->ny; } const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - if(ctx->load_image_size==nullptr){ - ctx->load_image_size= clip_image_size_init(); - } - const int pos_w = ctx->load_image_size->width/patch_size; - const int pos_h = ctx->load_image_size->height/patch_size; + const int pos_w = ctx->load_image_size.width / patch_size; + const int pos_h = ctx->load_image_size.height / patch_size; { struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); float * data = (float *)malloc(ggml_nbytes(inp_raw)); - for (size_t i = 0; i < imgs->size; i++) { - const int nx = imgs->data[i].nx; - const int ny = imgs->data[i].ny; + for (size_t i = 0; i < imgs.entries.size(); i++) { + const int nx = imgs.entries[i]->nx; + const int ny = imgs.entries[i]->ny; if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { GGML_ASSERT(nx == image_size && ny == image_size); } @@ -2502,7 +2483,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (int k = 0; k < 3; k++) { for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; + data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k]; } } } @@ -2629,7 +2610,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } - ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + ggml_backend_cpu_set_n_threads(ctx->backend_cpu.get(), n_threads); auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); if (status != GGML_STATUS_SUCCESS) { @@ -2662,8 +2643,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i /* verbosity */ GGML_LOG_LEVEL_ERROR, }); - const auto & ctx_src = ctx_clip->ctx_gguf; - const auto & ctx_data = ctx_clip->ctx_data; + const auto & ctx_src = ctx_clip->ctx_gguf.get(); + const auto & ctx_data = ctx_clip->ctx_data.get(); auto * ctx_out = gguf_init_empty(); gguf_set_kv(ctx_out, ctx_src); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index f61e0c0b2b3..cc133a58de3 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -30,15 +30,8 @@ struct clip_image_size { int height; }; -struct clip_image_u8_batch { - struct clip_image_u8 * data; - size_t size; -}; - -struct clip_image_f32_batch { - struct clip_image_f32 * data; - size_t size; -}; +struct clip_image_u8_batch; +struct clip_image_f32_batch; struct clip_context_params { bool use_gpu; @@ -55,9 +48,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); -CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx); // TODO: should be enum, not string CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); @@ -73,9 +66,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); -CLIP_API struct clip_image_size * clip_image_size_init(); -CLIP_API struct clip_image_u8 * clip_image_u8_init (); -CLIP_API struct clip_image_f32 * clip_image_f32_init(); +CLIP_API struct clip_image_size * clip_image_size_init(); +CLIP_API struct clip_image_u8 * clip_image_u8_init (); +CLIP_API struct clip_image_f32 * clip_image_f32_init(); +CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava // nx, ny are the output image dimensions CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); @@ -86,6 +80,12 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); +// use for accessing underlay data of clip_image_f32_batch +CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size() +CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx +CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny +CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data + /** * Build image from pixels decoded by other libraries instead of stb_image.h for better performance. * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 518aad3f1f7..03a22cbb4c2 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #if defined(LLAVA_LOG_OFF) # define LOG_INF(...) @@ -45,6 +46,17 @@ struct clip_image_grid_shape { int second; }; +// convenience cpp wrapper +struct clip_image_f32_batch_deleter { + void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); } +}; +typedef std::unique_ptr clip_image_f32_batch_ptr; + +struct clip_image_size_deleter { + void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); } +}; +typedef std::unique_ptr clip_image_size_ptr; + /** * Selects the best resolution from a list of possible resolutions based on the original size. * @@ -105,8 +117,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_context * ctx; } model; - const int32_t image_size = clip_image_size(ctx_clip); - const int32_t patch_size = clip_patch_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); + const int32_t patch_size = clip_get_patch_size(ctx_clip); int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) @@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 - clip_image_f32_batch img_res_v; - img_res_v.size = 0; - img_res_v.data = nullptr; - if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { + clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init()); + if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) { LOG_ERR("%s: unable to preprocess image\n", __func__); - delete[] img_res_v.data; return false; } @@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); + const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get()); + if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) { std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - struct clip_image_size * load_image_size = clip_image_size_init(); + image_embd_v.resize(n_imgs); + clip_image_size load_image_size; - for (size_t i = 0; i < img_res_v.size; i++) { + for (size_t i = 0; i < n_imgs; i++) { const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); - int patch_size=14; - load_image_size->width = img_res_v.data[i].nx; - load_image_size->height = img_res_v.data[i].ny; - clip_add_load_image_size(ctx_clip, load_image_size); + int nx = clip_image_f32_batch_nx(img_res_v.get(), i); + int ny = clip_image_f32_batch_ny(img_res_v.get(), i); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny)); + int patch_size = 14; + load_image_size.width = nx; + load_image_size.height = ny; + clip_add_load_image_size(ctx_clip, &load_image_size); bool encoded = false; + clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i); if (clip_is_qwen2vl(ctx_clip)) { - encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); + encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); } else { - encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]); } if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs); return false; } const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { + int nx = clip_image_f32_batch_nx(img_res_v.get(), i); + int ny = clip_image_f32_batch_ny(img_res_v.get(), i); + clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i); std::memcpy( image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], - clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); - n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]); + clip_embd_nbytes_by_img(ctx_clip, nx, ny)); + n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { free(image_embd_v[i]); } image_embd_v.clear(); - load_image_size->width = img->nx; - load_image_size->height = img->ny; - clip_add_load_image_size(ctx_clip, load_image_size); - LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); - delete[] img_res_v.data; - img_res_v.size = 0; - img_res_v.data = nullptr; + load_image_size.width = img->nx; + load_image_size.height = img->ny; + clip_add_load_image_size(ctx_clip, &load_image_size); + LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height); } else if (clip_is_glm(ctx_clip)){ struct clip_image_size * load_image_size = clip_image_size_init(); - load_image_size->width = img_res_v.data[0].nx; - load_image_size->height = img_res_v.data[0].ny; + load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0); + load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0); clip_add_load_image_size(ctx_clip, load_image_size); - bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); - int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2); + clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); + int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2); *n_img_pos = (pos * pos + 2); if (!encoded){ LOG_ERR("Unable to encode image \n"); @@ -328,8 +343,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); - bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 - delete[] img_res_v.data; + clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 if (!encoded) { LOG_ERR("Unable to encode image\n"); @@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - for (size_t i = 0; i < img_res_v.size; i++) { + image_embd_v.resize(n_imgs); + for (size_t i = 0; i < n_imgs; i++) { + clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i); image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside + const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip); @@ -360,12 +376,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); } - // free all img_res_v - not needed anymore - delete[] img_res_v.data; - img_res_v.size = 0; - img_res_v.data = nullptr; - - const int32_t image_size = clip_image_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 58503d0b22c..114c274bc12 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -41,14 +41,14 @@ struct mtmd_context { }; struct mtmd_image_tokens_data { - clip_image_f32_batch_ptr batch_f32; // preprocessed image patches + clip_image_f32_batch batch_f32; // preprocessed image patches }; struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction uint32_t n_tokens() const { return nx * ny; } - clip_image_f32_batch_ptr batch_f32; // preprocessed image patches + clip_image_f32_batch batch_f32; // preprocessed image patches }; mtmd_context * mtmd_init_from_file(const char * mmproj_fname, @@ -141,8 +141,8 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx, std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3); // preprocess image - clip_image_f32_batch_ptr batch_f32(new clip_image_f32_batch); - bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get()); + clip_image_f32_batch batch_f32; + bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32); if (!ok) { LOG_ERR("Unable to preprocess image\n"); return nullptr; @@ -181,7 +181,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) bool ok = clip_image_batch_encode( ctx->ctx_clip, ctx->n_threads, - image_tokens->batch_f32.get(), + &image_tokens->batch_f32, ctx->image_embd_v.data()); return ok ? 0 : 1; } From 06bb53ad9b6e6d92b6ab6979927530080b1c990c Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Fri, 11 Apr 2025 18:10:10 +0800 Subject: [PATCH 2/4] llama-model : add Glm4Model implementation for GLM-4-0414 (#12867) * GLM-4-0414 * use original one * Using with tensor map * fix bug * change order * change order * format with flask8 --- README.md | 1 + convert_hf_to_gguf.py | 20 +++- convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/constants.py | 19 +++ gguf-py/gguf/tensor_mapping.py | 8 +- src/llama-arch.cpp | 20 ++++ src/llama-arch.h | 3 + src/llama-model.cpp | 204 +++++++++++++++++++++++++++++++++ src/llama-vocab.cpp | 1 + 9 files changed, 273 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e56042f1cb2..a129d27d54c 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Flan T5](https://huggingface.co/models?search=flan-t5) - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat) +- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e) - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4aff03a34bb..2bf97475f78 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -735,6 +735,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct res = "llama4" + if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": + # ref: https://huggingface.co/THUDM/glm-4-9b-hf + res = "glm4" if res is None: logger.warning("\n") @@ -4897,6 +4900,22 @@ def prepare_tensors(self): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) +@Model.register("Glm4ForCausalLM") +class Glm4Model(Model): + model_arch = gguf.MODEL_ARCH.GLM4 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + + @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") class ChatGLMModel(Model): model_arch = gguf.MODEL_ARCH.CHATGLM @@ -5588,7 +5607,6 @@ def main() -> None: with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] - try: model_class = Model.from_model_architecture(model_architecture) except NotImplementedError: diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index ce6104da4a0..160c9fe0e61 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -114,6 +114,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", }, {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", }, {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, + {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", }, ] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0410654dde2..162070e6e19 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -280,6 +280,7 @@ class MODEL_ARCH(IntEnum): DEEPSEEK = auto() DEEPSEEK2 = auto() CHATGLM = auto() + GLM4 = auto() BITNET = auto() T5 = auto() T5ENCODER = auto() @@ -487,6 +488,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", @@ -1561,6 +1563,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GLM4 : [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.BITNET: [ MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a9e681f8e67..35154e9b5da 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -241,7 +241,8 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge + "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 ), # Rotary embeddings @@ -278,6 +279,7 @@ class TensorNameMap: # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 + "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414 ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -316,7 +318,7 @@ class TensorNameMap: "h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.gate_up_proj", # phi3 + "model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414 "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 264f8c5b98f..a6fddc7fd2e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -54,6 +54,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK, "deepseek" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_CHATGLM, "chatglm" }, + { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, @@ -1152,6 +1153,25 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, + { + LLM_ARCH_GLM4, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_BITNET, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 2019352812d..2c2099b3c38 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -58,6 +58,7 @@ enum llm_arch { LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK2, LLM_ARCH_CHATGLM, + LLM_ARCH_GLM4, LLM_ARCH_BITNET, LLM_ARCH_T5, LLM_ARCH_T5ENCODER, @@ -256,6 +257,8 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_POST_ATTN_NORM, + LLM_TENSOR_POST_MLP_NORM, LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_X, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ff847701e99..b74dd72cfbf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1205,6 +1205,15 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GLM4: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 40: type = LLM_TYPE_9B; break; + case 61: type = LLM_TYPE_32B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_BITNET: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -3476,6 +3485,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); } } break; + case LLM_ARCH_GLM4: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); + + if (layer.wqkv == nullptr) { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + } + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0); + + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + } + } break; case LLM_ARCH_NEMOTRON: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -10854,6 +10902,157 @@ struct llm_build_chatglm : public llm_graph_context { } }; +struct llm_build_glm4 : public llm_graph_context { + llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // Pre-attention norm + cur = build_norm(inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv == nullptr) { + Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + } + Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + } + Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + } + } else { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + if (model.layers[il].bqkv) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Post-attention norm (new!) + cur = build_norm(cur, + model.layers[il].attn_post_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + // Add the input (residual connection after post-attention norm) + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + // Pre-MLP norm + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // MLP + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + // Post-MLP norm + cur = build_norm(cur, + model.layers[il].ffn_post_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "post_mlp_norm", il); + } + + // Add residual connection after post-MLP norm + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + // Final norm + cur = build_norm(inpL, + model.output_norm, + NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // Output projection + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_nemotron : public llm_graph_context { llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -12735,6 +12934,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_GLM4: + { + llm = std::make_unique(*this, params, gf); + } break; case LLM_ARCH_BITNET: { llm = std::make_unique(*this, params, gf); @@ -12932,6 +13135,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: + case LLM_ARCH_GLM4: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_CHAMELEON: diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 0feabd95aaf..464ff01e06f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1572,6 +1572,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_PORO; clean_spaces = false; } else if ( + tokenizer_pre == "glm4" || tokenizer_pre == "chatglm-bpe") { pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; special_bos_id = LLAMA_TOKEN_NULL; From b2034c2b55b36b2192bdefb3b295db2a911370f5 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Fri, 11 Apr 2025 20:01:56 +0800 Subject: [PATCH 3/4] contrib: support modelscope community (#12664) * support download from modelscope * support login * remove comments * add arguments * fix code * fix win32 * test passed * fix readme * revert readme * change to MODEL_ENDPOINT * revert tail line * fix readme * refactor model endpoint * remove blank line * fix header * fix as comments * update comment * update readme --------- Co-authored-by: tastelikefeet --- README.md | 4 +++- common/arg.cpp | 17 ++++++++--------- common/common.cpp | 13 +++++++++++++ common/common.h | 2 ++ examples/run/run.cpp | 6 ++++-- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a129d27d54c..cf45f23cf44 100644 --- a/README.md +++ b/README.md @@ -260,7 +260,9 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt - [Trending](https://huggingface.co/models?library=gguf&sort=trending) - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf) -You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf /[:quant]` +You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf /[:quant]`. + +By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`. After downloading a model, use the CLI tools to run it locally - see below. diff --git a/common/arg.cpp b/common/arg.cpp index 642fefb5754..0b57f9da1ee 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -228,12 +228,13 @@ static bool common_download_file_single(const std::string & url, const std::stri curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); // Check if hf-token or bearer-token was specified if (!bearer_token.empty()) { std::string auth_header = "Authorization: Bearer " + bearer_token; http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); } + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); #if defined(_WIN32) // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of @@ -544,7 +545,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); curl_slist_ptr http_headers; std::string res_str; - std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; + + std::string model_endpoint = get_model_endpoint(); + + std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag; curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); @@ -659,13 +663,8 @@ static void common_params_handle_model( } } - std::string hf_endpoint = "https://huggingface.co/"; - const char * hf_endpoint_env = getenv("HF_ENDPOINT"); - if (hf_endpoint_env) { - hf_endpoint = hf_endpoint_env; - if (hf_endpoint.back() != '/') hf_endpoint += '/'; - } - model.url = hf_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file; + std::string model_endpoint = get_model_endpoint(); + model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file; // make sure model path is present (for caching purposes) if (model.path.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs diff --git a/common/common.cpp b/common/common.cpp index d4882c5123c..4e1a020d017 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1027,6 +1027,19 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } +std::string get_model_endpoint() { + const char * model_endpoint_env = getenv("MODEL_ENDPOINT"); + // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility. + const char * hf_endpoint_env = getenv("HF_ENDPOINT"); + const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env; + std::string model_endpoint = "https://huggingface.co/"; + if (endpoint_env) { + model_endpoint = endpoint_env; + if (model_endpoint.back() != '/') model_endpoint += '/'; + } + return model_endpoint; +} + void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora) { llama_clear_adapter_lora(ctx); for (auto & la : lora) { diff --git a/common/common.h b/common/common.h index 725b5123d24..e6eaa8e80cf 100644 --- a/common/common.h +++ b/common/common.h @@ -543,6 +543,8 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); +std::string get_model_endpoint(); + // // Batch utils // diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 68e94b0b3c3..e63c2aac331 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -697,8 +697,10 @@ class LlamaData { std::vector headers = { "User-Agent: llama-cpp", "Accept: application/json" }; std::string url; + std::string model_endpoint = get_model_endpoint(); + if (pos == std::string::npos) { - auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/"); + auto [model_name, manifest_url] = extract_model_and_tag(model, model_endpoint + "v2/"); hfr = model_name; nlohmann::json manifest; @@ -713,7 +715,7 @@ class LlamaData { hff = model.substr(pos + 1); } - url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff; + url = model_endpoint + hfr + "/resolve/main/" + hff; return download(url, bn, true, headers); } From 578754b3157d662c2fdd51eaa62b6c1f43d3172c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 11 Apr 2025 15:32:14 +0200 Subject: [PATCH 4/4] sycl: Support sycl_ext_oneapi_limited_graph (#12873) The current usage of the SYCL-Graph extension checks for the `sycl_ext_oneapi_graph` device aspect. However, it is also possible to support `sycl_ext_oneapi_limied_graph` devices that don't support update --- ggml/src/ggml-sycl/ggml-sycl.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 236b039e618..3e48a9244d3 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3653,7 +3653,8 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ #ifdef GGML_SYCL_GRAPH if (!g_ggml_sycl_disable_graph) { - if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) { + const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph); + if (!graph_support) { GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); return GGML_STATUS_SUCCESS; @@ -3664,8 +3665,10 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); model_sycl_graph.end_recording(); - if (!sycl_ctx->exec_graph) { - auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); + const bool graph_update_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph); + if (!sycl_ctx->exec_graph || !graph_update_support) { + auto exec_graph = graph_update_support ? model_sycl_graph.finalize(sycl_ex::property::graph::updatable{}) : + model_sycl_graph.finalize(); sycl_ctx->exec_graph = std::make_unique< sycl_ex::command_graph>(exec_graph); } else {