diff --git a/common/arg.cpp b/common/arg.cpp
index 7744fd6c488..cd853119131 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2095,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -2373,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
diff --git a/common/common.cpp b/common/common.cpp
index d8c4d988b6f..c6962d1d19b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index f5acf37ff9f..b8b01a7e997 100644
--- a/common/common.h
+++ b/common/common.h
@@ -225,7 +225,7 @@ struct common_params_diffusion {
     bool    visual_mode = false;
 
     float   eps          = 0;  // epsilon for timesteps
-    int32_t block_length = 32; // block length for generation
+    int32_t block_length = 0;  // block length for generation
 
     int32_t algorithm = 4;     // default algorithm: low-confidence
     float   alg_temp  = 0.0f;  // algorithm temperature
@@ -359,6 +359,7 @@ struct common_params {
     bool warmup         = true;  // warmup run
     bool check_tensors  = false; // validate tensor data
    bool no_op_offload  = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
 
     bool single_turn = false; // single turn chat conversation
diff --git a/docs/multimodal/minicpmo2.6.md b/docs/multimodal/minicpmo2.6.md
index 8c6db8efe5b..d4edc97182c 100644
--- a/docs/multimodal/minicpmo2.6.md
+++ b/docs/multimodal/minicpmo2.6.md
@@ -29,8 +29,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
 
 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-2_6
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --minicpmv_version 4
 python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
 
 # quantize int4 version
diff --git a/docs/multimodal/minicpmo4.0.md b/docs/multimodal/minicpmo4.0.md
new file mode 100644
index 00000000000..49125ea05e0
--- /dev/null
+++ b/docs/multimodal/minicpmo4.0.md
@@ -0,0 +1,47 @@
+## MiniCPM-o 4
+
+### Prepare models and code
+
+Download [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from huggingface to "MiniCPM-o-4" folder.
+
+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-o 4
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf) by us)
+
+```bash
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
+python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
+```
diff --git a/docs/multimodal/minicpmv2.5.md b/docs/multimodal/minicpmv2.5.md
index 19b439607d4..5eb87bc9693 100644
--- a/docs/multimodal/minicpmv2.5.md
+++ b/docs/multimodal/minicpmv2.5.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
 
 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
 
 # quantize int4 version
diff --git a/docs/multimodal/minicpmv2.6.md b/docs/multimodal/minicpmv2.6.md
index 15c1bbd12eb..1b0ff596918 100644
--- a/docs/multimodal/minicpmv2.6.md
+++ b/docs/multimodal/minicpmv2.6.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
 
 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --minicpmv_version 3
 python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
 
 # quantize int4 version
diff --git a/docs/multimodal/minicpmv4.0.md b/docs/multimodal/minicpmv4.0.md
new file mode 100644
index 00000000000..65887d96019
--- /dev/null
+++ b/docs/multimodal/minicpmv4.0.md
@@ -0,0 +1,47 @@
+## MiniCPM-V 4
+
+### Prepare models and code
+
+Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from huggingface to "MiniCPM-V-4" folder.
+
+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-V 4
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf) by us)
+
+```bash
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
+```
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b1a1cff904b..75b58c26fc1 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5225,9 +5225,9 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
     std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -11168,7 +11168,7 @@ size_t comp_nb[GGML_MAX_DIMS];
 size_t check_counter = 0;
 
 static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
     ggml_tensor * tensor = cgraph->nodes[tensor_idx];
-    if (tensor->op == GGML_OP_TRANSPOSE) {
+    if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
         return;
     }
@@ -11288,7 +11288,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
         tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
     } else if (tensor->op == GGML_OP_SCALE) {
         const float * params = (const float *)tensor->op_params;
-        tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
+        tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
     } else if (tensor->op == GGML_OP_SQR) {
         tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_SIN) {
@@ -11399,8 +11399,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
         } else {
             tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
         }
-    } else if (tensor->op == GGML_OP_SET_ROWS) {
-        tensor_clone = ggml_set_rows(ggml_ctx, src_clone[0], src_clone[1]);
     } else if (tensor->op == GGML_OP_CONT) {
         tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
     } else if (tensor->op == GGML_OP_RESHAPE) {
@@ -11508,7 +11506,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
 
 static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
     ggml_tensor * tensor = cgraph->nodes[tensor_idx];
-    if (tensor->op == GGML_OP_TRANSPOSE) {
+    if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
         return;
     }
     bool fused_rms_norm_mul = false;
@@ -11568,6 +11566,9 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph *
                 } else if (tensor->type == GGML_TYPE_F16) {
                     correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
                     result  = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
+                } else if (tensor->type == GGML_TYPE_BF16) {
+                    correct = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
+                    result  = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
                 } else if (tensor->type == GGML_TYPE_I32) {
                     correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
                     result  = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 15adbfa7818..df490fc80e9 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -616,6 +616,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.SSM_DT_NORM: (
+            "model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
             "model.layers.{bid}.mamba.dt_layernorm", # jamba
         ),
 
@@ -645,10 +646,6 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mixer.D", # plamo2
         ),
 
-        MODEL_TENSOR.SSM_DT_NORM: (
-            "model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
-        ),
-
         MODEL_TENSOR.SSM_NORM: (
             "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
             "backbone.layers.{bid}.mixer.norm", # mamba2
diff --git a/include/llama.h b/include/llama.h
index 1a51e74a8d6..2cbe18d8cfb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -284,10 +284,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 56c2ecd4cab..3983a693205 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector & devices, bool use_extra_bufts) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -319,21 +319,22 @@ static buft_list_t make_cpu_buft_list(const std::vector & de
         }
     }
 
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
 
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }
 
@@ -1839,7 +1840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -2044,7 +2045,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
             if (std::regex_search(tensor_name, pattern)) {
-                buft = overrides->buft;
+                if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                    // when overriding to a CPU buffer, consider the extra buffer types
+                    buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                } else {
+                    buft = overrides->buft;
+                }
+
                 LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                     tensor_name.c_str(),
                     ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -17839,6 +17846,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap        =*/ true,
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
+        /*.use_extra_bufts =*/ true,
     };
 
 #ifdef GGML_USE_METAL
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a00af7a1d17..0756bf09b88 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -875,9 +875,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // get more optimal quantization type based on the tensor shape, layer, etc.
        if (!params->pure && ggml_is_quantized(default_type)) {
+            int fallback = qs.n_fallback;
            new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-            // unless the user specifies a type
-            if (params->tensor_types) {
+            // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+            if (params->tensor_types && qs.n_fallback - fallback == 0) {
                const std::vector & tensor_types = *static_cast *>(params->tensor_types);
                const std::string tensor_name(tensor->name);
                for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +891,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                    }
                }
            }
-
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index a4b62f9afe3..20c2173314a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -868,10 +868,16 @@ struct clip_graph {
        int n_head = n_embd/d_head;
        int num_query = 96;
        if (ctx->model.hparams.minicpmv_version == 2) {
+            // MiniCPM-V 2.5
            num_query = 96;
        } else if (ctx->model.hparams.minicpmv_version == 3) {
+            // MiniCPM-V 2.6
            num_query = 64;
        } else if (ctx->model.hparams.minicpmv_version == 4) {
+            // MiniCPM-o 2.6
+            num_query = 64;
+        } else if (ctx->model.hparams.minicpmv_version == 5) {
+            // MiniCPM-V 4.0
            num_query = 64;
        }
@@ -3551,10 +3557,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_MINICPMV:
            {
                if (params.minicpmv_version == 2) {
+                    // MiniCPM-V 2.5
                    n_patches_sq = 96;
                } else if (params.minicpmv_version == 3) {
+                    // MiniCPM-V 2.6
                    n_patches_sq = 64;
                } else if (params.minicpmv_version == 4) {
+                    // MiniCPM-o 2.6
+                    n_patches_sq = 64;
+                } else if (params.minicpmv_version == 5) {
+                    // MiniCPM-V 4.0
                    n_patches_sq = 64;
                } else {
                    GGML_ABORT("Unknown minicpmv version");
                }
@@ -4103,11 +4115,17 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
            if (hparams.minicpmv_version == 2) {
+                // MiniCPM-V 2.5
                return 4096;
            } else if (hparams.minicpmv_version == 3) {
+                // MiniCPM-V 2.6
                return 3584;
            } else if (hparams.minicpmv_version == 4) {
+                // MiniCPM-o 2.6
                return 3584;
+            } else if (hparams.minicpmv_version == 5) {
+                // MiniCPM-V 4.0
+                return 2560;
            }
            GGML_ABORT("Unknown minicpmv version");
        case PROJECTOR_TYPE_GLM_EDGE:
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
index cfe0961f989..3c60209544a 100644
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -497,11 +497,11 @@ def bytes_to_unicode():
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
-default_image_mean = [0.48145466, 0.4578275, 0.40821073]
-default_image_std = [0.26862954, 0.26130258, 0.27577711]
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
 ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
-ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6', default=2)
 
 # with proper
 args = ap.parse_args()
@@ -517,6 +517,17 @@ def bytes_to_unicode():
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
+# If minicpmv_projector is not specified but the default path exists, use the default path
+if args.minicpmv_projector is None:
+    default_projector_path = os.path.join(dir_model, "minicpmv.projector")
+    if os.path.isfile(default_projector_path):
+        args.minicpmv_projector = default_projector_path
+        print(f"Found default projector file: {default_projector_path}")
+
+# If output_dir is not specified, use model_dir as the default value
+if args.output_dir is None:
+    args.output_dir = dir_model
+
 if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
     vocab = None
     tokens = None
@@ -546,18 +557,21 @@ def bytes_to_unicode():
 minicpmv_version = args.minicpmv_version
 emb_dim = 4096
 block_count = 26
-if minicpmv_version == 1:
+if minicpmv_version == 1: # MiniCPM-V 2.0
     emb_dim = 2304
     block_count = 26
-elif minicpmv_version == 2:
+elif minicpmv_version == 2: # MiniCPM-V 2.5
     emb_dim = 4096
     block_count = 27
-elif minicpmv_version == 3:
+elif minicpmv_version == 3: # MiniCPM-V 2.6
     emb_dim = 3584
     block_count = 27
-elif minicpmv_version == 4:
+elif minicpmv_version == 4: # MiniCPM-o 2.6
     emb_dim = 3584
     block_count = 27
+elif minicpmv_version == 5: # MiniCPM-V 4.0
+    emb_dim = 2560
+    block_count = 27
 
 default_vision_config = {
     "hidden_size": 1152,
@@ -577,6 +591,10 @@ def bytes_to_unicode():
 elif minicpmv_version == 4:
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 5:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
 
 processor = None
 # if model.attn_pool is not None:
@@ -603,7 +621,7 @@ def bytes_to_unicode():
 else:
     fname_middle = ""
 
-output_dir = args.output_dir if args.output_dir is not None else dir_model
+output_dir = args.output_dir
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
 fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 45b2f1f2517..a05373d5b3c 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -207,7 +207,7 @@ struct mtmd_context {
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first = true;
 
-        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
            // minicpmv 2.6 format:
            // (overview) (slice) (slice) \n ...
            slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
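
Note (not part of the patch): a minimal sketch of how the new `use_extra_bufts` field added to `llama_model_params` in the `llama.h` hunk above could be exercised from the C API. The model path is a placeholder and error handling is kept to a minimum.

```cpp
#include "llama.h"

int main() {
    // use_extra_bufts defaults to true (see llama_model_default_params above),
    // so repacked "extra" CPU buffer types are used unless explicitly disabled
    llama_model_params mparams = llama_model_default_params();

    // programmatic equivalent of the new --no-repack / LLAMA_ARG_NO_REPACK option:
    // keep weights in plain CPU buffers and skip weight repacking
    mparams.use_extra_bufts = false;

    // "model.gguf" is a placeholder path
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}
```

The new `--cpu-moe` flag, by contrast, works through `tensor_buft_overrides`: it pins tensors matching `\.ffn_up_exps\.weight$`, `\.ffn_down_exps\.weight$`, and `\.ffn_gate_exps\.weight$` to the CPU buffer type, and the `llama-model.cpp` change above routes such CPU overrides through `select_weight_buft`, so repacked buffer types can still be picked for them.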