diff --git a/CODEOWNERS b/CODEOWNERS index 3186f8eb1c5..4c0dd4b725d 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -9,3 +9,4 @@ /ggml/src/ggml-cuda/mmvq.* @JohannesGaessler /ggml/src/ggml-opt.cpp @JohannesGaessler /ggml/src/gguf.cpp @JohannesGaessler +/ggml/src/ggml-vulkan/ @0cc4m diff --git a/common/common.cpp b/common/common.cpp index 466271be61c..d8c4d988b6f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std:: bool string_ends_with(const std::string_view & str, const std::string_view & suffix) { return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; } + +bool string_remove_suffix(std::string & str, const std::string_view & suffix) { + bool has_suffix = string_ends_with(str, suffix); + if (has_suffix) { + str = str.substr(0, str.size() - suffix.size()); + } + return has_suffix; +} + size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) { if (!str.empty() && !stop.empty()) { const char text_last_char = str.back(); diff --git a/common/common.h b/common/common.h index 27adf552465..11427c51f69 100644 --- a/common/common.h +++ b/common/common.h @@ -534,6 +534,7 @@ static bool string_starts_with(const std::string & str, // While we wait for C++20's std::string::ends_with... bool string_ends_with(const std::string_view & str, const std::string_view & suffix); +bool string_remove_suffix(std::string & str, const std::string_view & suffix); size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop); bool string_parse_kv_override(const char * data, std::vector & overrides); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 3019a545d58..c3f1369b663 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -328,6 +328,7 @@ struct vk_device_struct { uint64_t max_memory_allocation_size; uint64_t suballocation_block_size; bool fp16; + bool bf16; bool pipeline_robustness; vk::Device device; uint32_t vendor_id; @@ -482,6 +483,7 @@ struct vk_device_struct { vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; vk_pipeline pipeline_opt_step_adamw_f32; + vk_pipeline pipeline_conv2d_f32; vk_pipeline pipeline_conv2d_dw_whcn_f32; vk_pipeline pipeline_conv2d_dw_cwhn_f32; @@ -875,6 +877,38 @@ struct vk_op_rwkv_wkv7_push_constants { uint32_t H; }; +struct vk_op_conv2d_push_constants { + uint32_t Cout; + uint32_t Cin; + uint32_t N; + + uint32_t KW; + uint32_t KH; + uint32_t W; + uint32_t H; + uint32_t OW; + uint32_t OH; + + uint32_t s0; + uint32_t s1; + uint32_t p0; + uint32_t p1; + uint32_t d0; + uint32_t d1; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + + uint32_t nb1; + uint32_t nb2; + uint32_t nb3; +}; + struct vk_op_conv2d_dw_push_constants { uint32_t ne; uint32_t batches; @@ -974,18 +1008,45 @@ class vk_memory_logger { #endif // GGML_VULKAN_MEMORY_DEBUG class vk_perf_logger { -public: + public: void print_timings() { + if (timings.empty()) { + return; + } + uint64_t total_all_op_times = 0; std::cerr << "----------------\nVulkan Timings:" << std::endl; - for (const auto& t : timings) { - uint64_t total = 0; - for (const auto& time : t.second) { - total += time; + for (const auto & t : timings) { + uint64_t total_op_times = 0; + for (const auto & time : t.second) { + total_op_times += time; } - std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl; + std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) + << " us"; + + // If we have as many flops entries as timing entries for the op, then compute and log the flops/S. + auto it = flops.find(t.first); + if (it != flops.end() && (it->second).size() == t.second.size()) { + uint64_t total_op_flops = 0; + for (const auto & elem : it->second) { + total_op_flops += elem; + } + std::cerr << " (" + << (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) / + (double(total_op_times) / (1000.0 * 1000.0 * 1000.0)) + << " GFLOPS/s)"; + } + + total_all_op_times += total_op_times; + + std::cerr << std::endl; + } + + if (timings.size() > 0) { + std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl; } timings.clear(); + flops.clear(); } void log_timing(const ggml_tensor * node, uint64_t time) { @@ -994,22 +1055,45 @@ class vk_perf_logger { return; } if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { - const uint64_t m = node->src[0]->ne[1]; - const uint64_t n = node->src[1]->ne[1]; - const uint64_t k = node->src[1]->ne[0]; - std::string name = ggml_op_name(node->op); + const uint64_t m = node->src[0]->ne[1]; + const uint64_t n = node->src[1]->ne[1]; + const uint64_t k = node->src[1]->ne[0]; + std::string name = ggml_op_name(node->op); if (n == 1) { name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k); } else { name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k); } timings[name].push_back(time); + flops[name].push_back(m * n * (k + (k - 1))); + return; + } + if (node->op == GGML_OP_CONV_2D) { + std::string name = ggml_op_name(node->op); + ggml_tensor * knl = node->src[0]; + uint64_t OW = node->ne[0]; + uint64_t OH = node->ne[1]; + uint64_t N = node->ne[3]; + uint64_t Cout = node->ne[2]; + uint64_t KW = knl->ne[0]; + uint64_t KH = knl->ne[1]; + uint64_t Cin = knl->ne[2]; + // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ + uint64_t size_M = Cout; + uint64_t size_K = Cin * KW * KH; + uint64_t size_N = N * OW * OH; + uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1)); + name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + + ", N=N*OW*OH=" + std::to_string(size_N); + flops[name].push_back(n_flops); + timings[name].push_back(time); return; } timings[ggml_op_name(node->op)].push_back(time); } -private: + private: std::map> timings; + std::map> flops; }; struct ggml_backend_vk_context { @@ -2112,6 +2196,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } compile_count++; } + compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint, parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); }; @@ -2961,6 +3046,42 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + // conv2d + uint32_t conv2d_WG_SIZE = 256; + uint32_t conv2d_BS_K = 128; + uint32_t conv2d_BS_CRS = 16; + uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices. + if (device->subgroup_shuffle && + device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316 + use_collectives = 1; + conv2d_BS_CRS = std::min( + device->subgroup_size, + conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used. + } + uint32_t conv2d_BS_NPQ = 128; + uint32_t conv2d_TS_K = 8; + uint32_t conv2d_shmem_req = + (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float); + if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) { + conv2d_BS_CRS = 8; + if (use_collectives) { + conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS); + } + } + + if (use_collectives) { + ggml_vk_create_pipeline( + device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, + sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, + { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true); + } else { + ggml_vk_create_pipeline( + device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, + sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, + { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, + false); + } + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); @@ -3273,6 +3394,12 @@ static vk_device ggml_vk_get_device(size_t idx) { device->fp16 = device->fp16 && vk12_features.shaderFloat16; +#if defined(VK_KHR_shader_bfloat16) + device->bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; +#else + device->bf16 = false; +#endif + device->pipeline_robustness = pl_robustness_features.pipelineRobustness; if (device->subgroup_size_control) { @@ -3615,6 +3742,7 @@ static void ggml_vk_print_gpu_info(size_t idx) { bool coopmat_support = false; bool coopmat2_support = false; bool integer_dot_product = false; + bool bfloat16_support = false; for (auto properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { @@ -3635,6 +3763,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { integer_dot_product = true; +#endif +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_BFLOAT16")) { + bfloat16_support = true; #endif } } @@ -3701,10 +3834,25 @@ static void ggml_vk_print_gpu_info(size_t idx) { last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; } +#if defined(VK_KHR_shader_bfloat16) + VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; + bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; + if (bfloat16_support) { + last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; + last_struct = (VkBaseOutStructure *)&bfloat16_features; + } +#endif + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); fp16 = fp16 && vk12_features.shaderFloat16; +#if defined(VK_KHR_shader_bfloat16) + bool bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; +#else + bool bf16 = false; +#endif + uint32_t default_subgroup_size = get_subgroup_size("", device_architecture); const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; @@ -3722,8 +3870,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none"; std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, bf16, subgroup_size, props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str()); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { @@ -6809,6 +6957,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_leaky_relu_f32; } return nullptr; + case GGML_OP_CONV_2D: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && + ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { + return ctx->device->pipeline_conv2d_f32; + } + return nullptr; case GGML_OP_CONV_2D_DW: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(src1)) { @@ -7131,6 +7285,31 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint32_t OW = dst->ne[0]; elements = { N * OC * OH * OW, 1, 1}; } break; + case GGML_OP_CONV_2D: + { + // src0 - kernel: [KW, KH, Cin, Cout] + // src1 - input: [W, H, Cin, N] + // dst - result: [OW, OH, Cout, N] + + // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) + auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + // parallelize in {OW/BS_K, OH/BS_NPQ, 1} + int64_t W = src1->ne[0]; + int64_t H = src1->ne[1]; + int64_t KW = src0->ne[0]; + int64_t KH = src0->ne[1]; + int64_t Cout = src0->ne[3]; + int64_t N = src1->ne[3]; + int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]); + int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]); + int64_t NPQ = N * OW * OH; + + // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups + elements = { static_cast(Cout), static_cast(NPQ), 1 }; + } + break; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_DIV: @@ -7997,6 +8176,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c }, dryrun); } +static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + vk_op_conv2d_push_constants p{}; + p.Cout = static_cast(ne03); + p.Cin = static_cast(ne02); + p.N = static_cast(ne13); + + p.KW = static_cast(ne00); + p.KH = static_cast(ne01); + p.W = static_cast(ne10); + p.H = static_cast(ne11); + p.OW = static_cast(ne0); + p.OH = static_cast(ne1); + + p.s0 = static_cast(dst->op_params[0]); + p.s1 = static_cast(dst->op_params[1]); + p.p0 = static_cast(dst->op_params[2]); + p.p1 = static_cast(dst->op_params[3]); + p.d0 = static_cast(dst->op_params[4]); + p.d1 = static_cast(dst->op_params[5]); + + p.nb01 = static_cast(nb01 / nb00); + p.nb02 = static_cast(nb02 / nb00); + p.nb03 = static_cast(nb03 / nb00); + + p.nb11 = static_cast(nb11 / nb10); + p.nb12 = static_cast(nb12 / nb10); + p.nb13 = static_cast(nb13 / nb10); + + p.nb1 = static_cast(nb1 / nb0); + p.nb2 = static_cast(nb2 / nb0); + p.nb3 = static_cast(nb3 / nb0); + + GGML_ASSERT(ne03 == ne2); + GGML_ASSERT(ne02 == ne12); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun); +} + static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { vk_op_conv2d_dw_push_constants p{}; p.ne = ggml_nelements(dst); @@ -9059,6 +9287,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: @@ -9126,6 +9355,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_LEAKY_RELU: { @@ -9332,6 +9562,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_POOL_2D: ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_CONV_2D: + ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun); + break; case GGML_OP_CONV_2D_DW: ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun); @@ -9462,6 +9696,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: @@ -10043,6 +10278,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); + } else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D) { + // Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode. + auto CRS_size = + cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[0]->ne[2]; + auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3]; + total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type); } i += ctx->num_additional_fused_ops; ctx->num_additional_fused_ops = 0; @@ -10619,6 +10860,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return true; case GGML_OP_CONV_TRANSPOSE_1D: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_CONV_2D: + { + // Op is disabled for Apple because it segfaults at pipeline create time on MoltenVK + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + const vk_device& device = ggml_vk_get_device(ctx->device); + bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE; + // Channel-contiguous format is not supported yet. + return (op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]) && + ggml_is_contiguous(op)) && !is_Apple; + } default: return false; } @@ -11177,6 +11432,14 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * const int32_t p1 = tensor->op_params[6]; tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1); + } else if (tensor->op == GGML_OP_CONV_2D) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); } else if (tensor->op == GGML_OP_LEAKY_RELU) { const float * op_params = (const float *)tensor->op_params; tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp new file mode 100644 index 00000000000..481940a52b3 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -0,0 +1,265 @@ +#version 450 + +#ifdef USE_COLLECTIVES +# extension GL_KHR_shader_subgroup_shuffle : enable +#endif + +#include "types.comp" + +// Make spec constant +#define SHMEM_PAD 0 + +// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j +layout(binding = 0) readonly buffer A { + A_TYPE knl_data[]; +}; // src0 - kernel: [KW, KH, Cin, Cout] + +layout(binding = 1) readonly buffer B { + B_TYPE src_data[]; +}; // src1 - input: [W, H, Cin, N] -- channel_first format + +layout(binding = 2) writeonly buffer D { + D_TYPE dst_data[]; +}; // dst - result: [OW, OH, Cout, N] + +layout(push_constant) uniform parameter { + // I/O channels, batch size + uint32_t Cout; + uint32_t Cin; + uint32_t N; + + // Tensor spatial sizes: kernel, input, output + uint32_t KW; + uint32_t KH; + uint32_t W; + uint32_t H; + uint32_t OW; + uint32_t OH; + + // Parameters: stride, padding, dilation - 0=y, 1=x + uint32_t s0; + uint32_t s1; + uint32_t p0; + uint32_t p1; + uint32_t d0; + uint32_t d1; + + // Strides in elements + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + + uint32_t nb1; + uint32_t nb2; + uint32_t nb3; +} + +p; + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +// Blocktile sizes +layout(constant_id = 1) const uint BS_K = 128; +layout(constant_id = 2) const uint BS_CRS = 16; +layout(constant_id = 3) const uint BS_NPQ = 128; +// Thread-tile sizes +layout(constant_id = 4) const uint TS_K = 8; +layout(constant_id = 5) const uint use_collectives = 1; + +uint32_t tid = gl_LocalInvocationID.x; +const uint32_t WG_SIZE = gl_WorkGroupSize.x; + +uint splitWork(uint work_size, uint block_size) { + return (block_size + work_size - 1) / block_size; +} + +uint32_t K = p.Cout; +uint32_t CRS = p.Cin * p.KH * p.KW; +uint32_t NPQ = p.N * p.OH * p.OW; + +uint32_t n_elems_out = K * NPQ; + +// Number of blocktiles per input +uint32_t NB_CRS = splitWork(CRS, BS_CRS); + +const uint32_t Ash_stride = BS_CRS + SHMEM_PAD; +const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD; + +const uint32_t Ash_numel = BS_K * BS_CRS; +const uint32_t Bsh_numel = BS_CRS * BS_NPQ; + +const uint32_t Ash_len = BS_K * Ash_stride; +const uint32_t Bsh_len = BS_CRS * Bsh_stride; + +shared float Ash[Ash_len]; // K x CRS +shared float Bsh[Bsh_len]; // CRS x NPQ + +// Threadtile sizes +const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K; + +// Number of threadtiles per blocktile +const uint32_t NT_K = BS_K / TS_K; +const uint32_t NT_NPQ = BS_NPQ / TS_NPQ; + +float regA[TS_K]; +float regB[TS_NPQ]; +float regC[TS_K][TS_NPQ]; + +/* +Compute +KxCRS @ CRSxNPQ = K x NPQ +K=Cout +C=Cin +R,S=KH,KW +P,Q=OH,OW +*/ + +uint32_t B_idx_K = gl_WorkGroupID.x; +uint32_t B_idx_NPQ = gl_WorkGroupID.y; + +uint32_t T_y = tid / NT_NPQ; +uint32_t T_x = tid % NT_NPQ; + +uint32_t Ar = tid / BS_CRS; +uint32_t Ac = tid % BS_CRS; +const uint32_t ArpWg = WG_SIZE / BS_CRS; + +uint32_t Br = tid / BS_NPQ; +uint32_t Bc = tid % BS_NPQ; +const uint32_t BrpWg = WG_SIZE / BS_NPQ; + +void main() { + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regC[T_ly][T_lx] = 0.0; + } + } + /* Advance block in CRS dim */ + for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) { + uint32_t CRS_idx_a; + uint32_t Cin_idx_a; + uint32_t KH_idx_a; + uint32_t KW_idx_a; + +#ifdef USE_COLLECTIVES + uint32_t cached_CRS_idx; + uint32_t cached_Cin_idx; + uint32_t cached_KH_idx; + uint32_t cached_KW_idx; + if (use_collectives == 1) { + cached_CRS_idx = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID; + cached_Cin_idx = cached_CRS_idx / (p.KW * p.KH); + uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH); + cached_KH_idx = cached_CRS_remainder / p.KW; + cached_KW_idx = cached_CRS_remainder - cached_KH_idx * p.KW; + + CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac); + Cin_idx_a = subgroupShuffle(cached_Cin_idx, Ac); + KH_idx_a = subgroupShuffle(cached_KH_idx, Ac); + KW_idx_a = subgroupShuffle(cached_KW_idx, Ac); + } else { + CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A) + Cin_idx_a = CRS_idx_a / (p.KW * p.KH); + uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH; + KH_idx_a = CRS_remainder / p.KW; + KW_idx_a = CRS_remainder - KH_idx_a * p.KW; + } +#else + CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A) + Cin_idx_a = CRS_idx_a / (p.KW * p.KH); + CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH; + KH_idx_a = CRS_remainder / p.KW; + KW_idx_a = CRS_remainder - KH_idx_a * p.KW; +#endif + + /* Load kernel to A_block: (BS_K x BS_CRS)*/ + for (uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg) { + uint32_t B_ly = r_offset + Ar; + uint32_t B_lx = Ac; + uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/ + uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1); + float val = knl_data[knl_idx]; + if (K_idx >= K || CRS_idx_a >= CRS) { + val = 0.0; + } + Ash[B_ly * Ash_stride + B_lx] = val; + } + /* Load input to B_block: (BS_CRS x BS_NPQ) */ + for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) { + uint32_t B_ly = r_offset + Br; /* Row index of B block */ + uint32_t B_lx = Bc; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */ + uint32_t N_idx = NPQ_idx / (p.OH * p.OW); + uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW; + uint32_t OH_idx = NPQ_remainder / p.OW; + uint32_t OW_idx = NPQ_remainder - OH_idx * p.OW; + + uint32_t CRS_idx_b; + uint32_t Cin_idx_b; + uint32_t KH_idx_b; + uint32_t KW_idx_b; +#ifdef USE_COLLECTIVES + if (use_collectives == 1) { + CRS_idx_b = subgroupShuffle(cached_CRS_idx, r_offset + Br); + Cin_idx_b = subgroupShuffle(cached_Cin_idx, r_offset + Br); + KH_idx_b = subgroupShuffle(cached_KH_idx, r_offset + Br); + KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br); + } else { + CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */ + Cin_idx_b = CRS_idx_b / (p.KW * p.KH); + uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH; + KH_idx_b = CRS_remainder / p.KW; + KW_idx_b = CRS_remainder - KH_idx_b * p.KW; + } +#else + CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */ + Cin_idx_b = CRS_idx_b / (p.KW * p.KH); + uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH; + KH_idx_b = CRS_remainder / p.KW; + KW_idx_b = CRS_remainder - KH_idx_b * p.KW; +#endif + + uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; + uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; + uint32_t src_idx = + min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + float val = src_data[src_idx]; + if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) { + val = 0.0; + } + Bsh[B_ly * Bsh_stride + B_lx] = val; + } + barrier(); + for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) { + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx]; + } + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx]; + } + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]); + } + } + } + barrier(); + } + /* Save C* */ + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx; + uint32_t N_idx = NPQ_idx / (p.OH * p.OW); + uint32_t OH_idx = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW; + uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW; + uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3; + if (K_idx < K && NPQ_idx < NPQ) { + dst_data[dst_idx] = regC[T_ly][T_lx]; + } + } + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 809c0bd9bd3..598f0370fb8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -655,6 +655,8 @@ void process_shaders() { string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); + string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}}); + string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); @@ -765,8 +767,8 @@ void write_output_files() { len += "};\n"; } } - fprintf(src, data.c_str()); - fprintf(src, len.c_str()); + fputs(data.c_str(), src); + fputs(len.c_str(), src); } fclose(hdr); fclose(src); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 40e809f1ac8..680210db7e9 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -233,6 +233,11 @@ class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class IMatrix: + CHUNK_COUNT = "imatrix.chunk_count" + CHUNK_SIZE = "imatrix.chunk_size" + DATASETS = "imatrix.datasets" + class Clip: PROJECTOR_TYPE = "clip.projector_type" HAS_VISION_ENCODER = "clip.has_vision_encoder" @@ -282,6 +287,7 @@ class Projector: class GGUFType: MODEL = "model" ADAPTER = "adapter" + IMATRIX = "imatrix" MMPROJ = "mmproj" # dummy, unused for now diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index bc732df5bb4..731b4980af9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3699,6 +3699,93 @@ struct test_im2col : public test_case { } }; +// CONV_2D +struct test_conv_2d : public test_case { + const std::array ne_input; + const std::array ne_kernel; + const int stride0; + const int stride1; + const int padding0; + const int padding1; + const int dilation0; + const int dilation1; + // Whether the inputs are contiguous in the channel dim or the width dim + const bool cwhn; + + // If true, the direct CONV_2D will be used in the graph, otherwise it + // uses ggml_conv_2d: + // * if the program is called with -o CONV_2D_DIRECT_IMPL, the + // CONV_2D graph will be built, while + // * if the program is called with -o CONV_2D_INDIRECT_IMPL, the + // IM2COL -> MUL_MM graph will be built. + + std::string vars() override { + return VARS_TO_STR9(ne_input, ne_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn); + } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + // Just counting matmul costs: + // KxCRS @ CRSxNPQ = KxNPQ --> KxNPQx(CRS+CRS-1) flops + + // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) + auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + + int64_t W = ne_input[0]; + int64_t H = ne_input[1]; + int64_t KW = ne_kernel[0]; + int64_t KH = ne_kernel[1]; + int64_t Cin = ne_kernel[2]; + int64_t Cout = ne_kernel[3]; + int64_t N = ne_input[3]; + int64_t OH = calc_conv_output_size(H, KH, stride0, padding0, dilation0); + int64_t OW = calc_conv_output_size(W, KW, stride0, padding0, dilation0); + + int64_t K = Cout; + int64_t CRS = Cin * KH * KW; + int64_t NPQ = N * OH * OW; + + return K * NPQ * (2 * CRS - 1); + } + + test_conv_2d(std::array ne_input = { 64, 64, 16, 1 }, + std::array ne_kernel = { 3, 3, 1, 16 }, int stride0 = 1, int stride1 = 1, int padding0 = 0, + int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) : + ne_input(ne_input), + ne_kernel(ne_kernel), + stride0(stride0), + stride1(stride1), + padding0(padding0), + padding1(padding1), + dilation0(dilation0), + dilation1(dilation1), + cwhn(cwhn) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + ggml_set_name(input, "input"); + + ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(kernel, "kernel"); + + if (cwhn) { + // change memory layout to channel-most-contiguous (CWHN), + // then permute it back so NE matches the original input + input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3)); + input = ggml_permute(ctx, input, 2, 0, 1, 3); + kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0)); + kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); + } + + ggml_tensor * out = + ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(out, "out"); + return out; + } +}; + // GGML_OP_CONV_2D_DW struct test_conv_2d_dw : public test_case { const std::array ne_input; @@ -5007,6 +5094,80 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true)); +// Conv_2D test cases +#ifdef DETAILED_TESTS + // Probably we do not have enough time to execute these in the pipeline. + uint32_t iwh_idx = 0; + uint32_t kwh_idx = 1; + uint32_t Cout_idx = 2; + uint32_t Cin_idx = 3; + uint32_t B_idx = 4; + + std::vector> cases = { + //{IWH, KWH, Cout, Cin, B} + // K=CRS=NPQ=4096 conv_2d matmul performance + {19, 4, 4096, 256, 16}, + // K=128, CRS=128, NPQ=4096 + { 19, 4, 128, 8, 16}, + // K=130, CRS=128, NPQ=4096 + { 19, 4, 130, 8, 16}, + // Edge case: K x CRS is small + { 19, 2, 4, 4, 16}, + // A ConvNet's first layer + { 224, 3, 8, 3, 1 }, + // A ConvNet's first layer with 2x2 convolution, and 1 channel + { 224, 2, 8, 1, 1 }, + // A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch + { 224, 2, 8, 1, 8 }, + // A middle layer of a ConvNet + { 58, 3, 64, 32, 1 }, + // A middle layer of a ConvNet, several images in the batch + { 58, 3, 64, 32, 8 }, + // A deep layer of a ConvNet, several images in the batch + { 16, 3, 256, 128, 8 } + }; + + for (auto act_case : cases) { + test_cases.emplace_back(new test_conv_2d( + { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] }, + { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false)); + } +#endif + + // CONV_2D: + auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + + //uint32_t s0 = 3; + uint32_t s1 = 5; + uint32_t p0 = 5; + //uint32_t p1 = 2; + uint32_t d0 = 2; + uint32_t d1 = 4; + + for (uint32_t s0 : { 1, 3 }) { + for (uint32_t p1 : { 2, 5 }) { + for (uint32_t Cin : { 1, 25 }) { + for (uint32_t Cout : { 1, 12 }) { + for (uint32_t KH : { 1, 2, 3, 11 }) { + for (uint32_t KW : { 1, 2, 3, 11 }) { + for (uint32_t H : { 1, 133 }) { + for (uint32_t W : { 1, 141 }) { + if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 && + calc_conv_output_size(H, KH, s1, p1, d1) > 0) { + test_cases.emplace_back(new test_conv_2d( + { W, H, Cin, 2 }, { KW, KH, Cin, Cout }, s0, s1, p0, p1, d0, d1, false)); + } + } + } + } + } + } + } + } + } + // sycl backend will limit task global_range < MAX_INT // test cases for 2D im2col with large input W and H (occurs in stable-diffusion) // however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.) @@ -5610,6 +5771,43 @@ static std::vector> make_test_cases_eval() { static std::vector> make_test_cases_perf() { std::vector> test_cases; + // Conv2d: K=CRS=NPQ=4096 matmul performance + uint32_t iwh_idx = 0; + uint32_t kwh_idx = 1; + uint32_t Cout_idx = 2; + uint32_t Cin_idx = 3; + uint32_t B_idx = 4; + std::vector> cases = { + //{IWH, KWH, Cout, Cin, B} + // K=CRS=NPQ=4096 conv2d matmul performance + {19, 4, 4096, 256, 16}, + // K=128, CRS=128, NPQ=4096 + { 19, 4, 128, 8, 16}, + // K=130, CRS=128, NPQ=4096 + { 19, 4, 130, 8, 16}, + // Edge case: K x CRS is small + { 19, 2, 4, 4, 16}, + // A ConvNet's first layer + { 224, 3, 8, 3, 1 }, + // A ConvNet's first layer with 2x2 convolution, and 1 channel + { 224, 2, 8, 1, 1 }, + // A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch + { 224, 2, 8, 1, 8 }, + // A middle layer of a ConvNet + { 58, 3, 64, 32, 1 }, + // A middle layer of a ConvNet, several images in the batch + { 58, 3, 64, 32, 8 }, + // A deep layer of a ConvNet, several images in the batch + { 16, 3, 512, 128, 8 }, + }; + + for (auto act_case : cases) { + // Direct CONV_2D + test_cases.emplace_back(new test_conv_2d( + { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] }, + { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false)); + } + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1})); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md index 6d8897d98bb..4ce5ca0ca42 100644 --- a/tools/imatrix/README.md +++ b/tools/imatrix/README.md @@ -7,14 +7,15 @@ More information is available here: https://github.com/ggml-org/llama.cpp/pull/4 ``` ./llama-imatrix \ - -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ + -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ - [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] + [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] \ + [--parse-special] ``` Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory. The parameters in square brackets are optional and have the following meaning: -* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used. +* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used. * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. * `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never) @@ -25,9 +26,9 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash -# generate importance matrix (imatrix.dat) +# generate importance matrix (imatrix.gguf) ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 # use the imatrix to perform a Q4_K_M quantization -./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +./llama-quantize --imatrix imatrix.gguf ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m ``` diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index daad44e5957..a1f21d7ee56 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -2,7 +2,9 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "gguf.h" +#include #include #include #include @@ -13,7 +15,7 @@ #include #include #include -#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -22,17 +24,20 @@ static void print_usage(int, char ** argv) { LOG("\nexample usage:\n"); LOG("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \\\n" " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" - " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] \\\n" + " [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] \\\n" " [--parse-special]\n" , argv[0]); LOG("\n"); } +static const char * const LLM_KV_IMATRIX_DATASETS = "imatrix.datasets"; +static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; +static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; + struct Stats { - std::vector values; - std::vector counts; - int ncall = 0; + std::vector values; + std::vector counts; }; class IMatrixCollector { @@ -40,13 +45,16 @@ class IMatrixCollector { IMatrixCollector() = default; void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); - void save_imatrix(int ncall = -1) const; - bool load_imatrix(const char * fname); + void save_imatrix_legacy(int32_t ncall = -1) const; + void save_imatrix(int32_t n_chunk = -1) const; + bool load_imatrix_legacy(const char * fname); + bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; common_params m_params; std::mutex m_mutex; - int m_last_call = 0; + std::vector m_datasets; + int32_t m_last_chunk = 0; std::vector m_src1_data; std::vector m_ids; // the expert ids from ggml_mul_mat_id }; @@ -77,6 +85,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const struct ggml_tensor * src1 = t->src[1]; std::string wname = filter_tensor_name(src0->name); + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { @@ -102,14 +112,21 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const char * data = is_host ? (const char *) src1->data : m_src1_data.data(); GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + // TODO: 4d? (is that even used in practice?) + // the extra dimension would need to be stored somewhere to be reflected in the imatrix file + if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { + LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); + GGML_ASSERT(false); + } + // this has been adapted to the new format of storing merged experts in a single 3d tensor // ref: https://github.com/ggml-org/llama.cpp/pull/6387 if (t->op == GGML_OP_MUL_MAT_ID) { // ids -> [n_experts_used, n_tokens] // src1 -> [cols, n_expert_used, n_tokens] const ggml_tensor * ids = t->src[2]; - const int n_as = src0->ne[2]; - const int n_ids = ids->ne[0]; + const int64_t n_as = src0->ne[2]; + const int64_t n_ids = ids->ne[0]; // the top-k selected expert ids are stored in the ids tensor // for simplicity, always copy ids to host, because it is small @@ -122,23 +139,29 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * auto & e = m_stats[wname]; - ++e.ncall; - + if (e.counts.size() == 1 && n_as > 1) { + // broadcast, when loading an old imatrix + e.counts.resize(n_as, e.counts[0]); + } if (e.values.empty()) { e.values.resize(src1->ne[0]*n_as, 0); - e.counts.resize(src1->ne[0]*n_as, 0); + e.counts.resize(n_as, 0); } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { - LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as)); + exit(1); //GGML_ABORT("fatal error"); + } + else if (e.counts.size() != (size_t)n_as) { + LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as); exit(1); //GGML_ABORT("fatal error"); } - LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); // loop over all possible experts, regardless if they are used or not in the batch - for (int ex = 0; ex < n_as; ++ex) { + for (int64_t ex = 0; ex < n_as; ++ex) { size_t e_start = ex*src1->ne[0]; - for (int idx = 0; idx < n_ids; ++idx) { - for (int row = 0; row < (int)src1->ne[2]; ++row) { + for (int64_t idx = 0; idx < n_ids; ++idx) { + for (int64_t row = 0; row < src1->ne[2]; ++row) { const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]); GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check @@ -149,57 +172,73 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const int64_t i12 = row; const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]); - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[e_start + j] += x[j]*x[j]; - e.counts[e_start + j]++; - if (!std::isfinite(e.values[e_start + j])) { - LOG("\n"); - LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); + e.counts[ex]++; + + for (int64_t j = 0; j < src1->ne[0]; ++j) { + e.values[e_start + j] += x[j] * x[j]; + if (!std::isfinite((float)e.values[e_start + j])) { + LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str()); exit(1); } } } } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_out_freq == 0) { + const int32_t n_chunk = e.counts[ex] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { save_imatrix(); } - if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { - save_imatrix(m_last_call); + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); } } } } else { auto & e = m_stats[wname]; + const int64_t n_mat = src1->ne[2] * src1->ne[3]; + if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); - e.counts.resize(src1->ne[0], 0); + e.values.resize(src1->ne[0] * n_mat, 0); + e.counts.resize(n_mat, 0); } - else if (e.values.size() != (size_t)src1->ne[0]) { - LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) { + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat)); exit(1); //GGML_ABORT("fatal error"); } - ++e.ncall; - LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = (const float *) (data + row * src1->nb[1]); - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j]*x[j]; - e.counts[j]++; - if (!std::isfinite(e.values[j])) { - LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str()); - exit(1); - } - } + else if (e.counts.size() != (size_t)n_mat) { + LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat); + exit(1); //GGML_ABORT("fatal error"); } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_out_freq == 0) { - save_imatrix(); - } - if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { - save_imatrix(m_last_call); + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type); + for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) { + for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) { + const int64_t mat_id = i3 * src1->ne[2] + i2; + const int64_t mat_start = mat_id * src1->ne[0]; + + for (int64_t row = 0; row < src1->ne[1]; ++row) { + const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]); + e.counts[mat_id]++; + for (int64_t j = 0; j < src1->ne[0]; ++j) { + e.values[mat_start + j] += x[j] * x[j]; + if (!std::isfinite((float)e.values[j])) { + LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str()); + exit(1); + } + } + } + const int32_t n_chunk = e.counts[mat_id] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { + save_imatrix(); + } + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); + } + } } } } @@ -207,7 +246,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * return true; } -void IMatrixCollector::save_imatrix(int ncall) const { +void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const { auto fname = m_params.out_file; if (ncall > 0) { @@ -215,7 +254,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { fname += std::to_string(ncall); } - // avoid writing imatrix entries that do not have full data + // warn when writing imatrix entries that do not have full data // this can happen with MoE models where some of the experts end up not being exercised by the provided training data int n_entries = 0; @@ -247,8 +286,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (n_zeros > 0) { - LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); - continue; + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); } n_entries++; @@ -259,93 +297,378 @@ void IMatrixCollector::save_imatrix(int ncall) const { LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); } + // deterministic tensor name order + std::sort(to_store.begin(), to_store.end()); + + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + std::ofstream out(fname, std::ios::binary); out.write((const char *) &n_entries, sizeof(n_entries)); for (const auto & name : to_store) { const auto & stat = m_stats.at(name); - int len = name.size(); + const int32_t len = name.size(); out.write((const char *) &len, sizeof(len)); out.write(name.c_str(), len); - out.write((const char *) &stat.ncall, sizeof(stat.ncall)); - int nval = stat.values.size(); + // ceiling division to avoid accidental zeros + const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size; + out.write((const char *) &ncall, sizeof(ncall)); + const int32_t nval = stat.values.size(); + const int32_t nmat = stat.counts.size(); out.write((const char *) &nval, sizeof(nval)); - if (nval > 0) { + if (nval > 0 && nmat > 0) { std::vector tmp(nval); - for (int i = 0; i < nval; i++) { - tmp[i] = (stat.values[i] / static_cast(stat.counts[i])) * static_cast(stat.ncall); + for (int32_t i = 0; i < nval; i++) { + float count = static_cast(stat.counts[i / (nval / nmat)]); + float value = stat.values[i]; + if (count == 0.0f) { + // store 1 for partial data + value = 1.0f; + count = 1.0f; + } + tmp[i] = (value / count) * static_cast(ncall); } - out.write((const char*)tmp.data(), nval*sizeof(float)); + out.write((const char *) tmp.data(), nval * sizeof(float)); } } // Write the number of call the matrix was computed with - out.write((const char *) &m_last_call, sizeof(m_last_call)); + out.write((const char *) &m_last_chunk, sizeof(m_last_chunk)); // Write the input filename at the end of the file to later on specify it in quantize { - int len = m_params.prompt_file.size(); + const char * dataset_file = m_params.prompt_file.c_str(); + int32_t len = m_params.prompt_file.size(); + // When there is no prompt but there were other imatrix files loaded, use the last dataset + if (m_params.prompt_file.empty() && !m_datasets.empty()) { + const std::string & dataset_str = m_datasets[m_datasets.size() - 1]; + dataset_file = dataset_str.c_str(); + len = dataset_str.size(); + } out.write((const char *) &len, sizeof(len)); - out.write(m_params.prompt_file.c_str(), len); + out.write(dataset_file, len); + } + + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str()); +} + +void IMatrixCollector::save_imatrix(int32_t n_chunk) const { + auto fname = m_params.out_file; + + // TODO: use the new format in more cases + if (!string_ends_with(fname, ".gguf")) { + LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__); + this->save_imatrix_legacy(n_chunk); + return; + } + + if (n_chunk > 0) { + fname += ".at_"; + fname += std::to_string(n_chunk); + } + + // write imatrix entries even if they don't have full data. (can be corrected when reading) + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + std::vector to_store; + size_t data_size = 0; + + bool is_first = true; // for printing + for (const auto & kv : m_stats) { + const int n_all = kv.second.counts.size(); + + int n_zeros = 0; + for (const auto c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + LOG_INF("\n"); + is_first = false; + } + + if (n_zeros > 0) { + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + } + + to_store.push_back(kv.first); + data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN); + data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN); } + // deterministic tensor name order + std::sort(to_store.begin(), to_store.end()); + + struct ggml_init_params params = { + /* .mem_size = */ data_size, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(params); + struct gguf_context * ctx_gguf = gguf_init_empty(); + + { + std::vector datasets; + datasets.reserve(m_datasets.size() + 1); + for (size_t i = 0; i < m_datasets.size(); ++i) { + datasets.push_back(m_datasets[i].c_str()); + } + if (!m_params.prompt_file.empty()) { + datasets.push_back(m_params.prompt_file.c_str()); + } + + gguf_set_val_str(ctx_gguf, "general.type", "imatrix"); + // Write the dataset paths + gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size()); + // Write the number of chunks the matrix was computed with + gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk); + gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel); + } + + for (const auto & name : to_store) { + const auto & stat = m_stats.at(name); + const int32_t nval = (int32_t) stat.values.size(); + const int32_t nmat = (int32_t) stat.counts.size(); + if (nval > 0 && nmat > 0) { + struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat); + struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat); + ggml_format_name(in_sum2, "%s.in_sum2", name.c_str()); + ggml_format_name(counts, "%s.counts", name.c_str()); + + for (int32_t j = 0; j < nval; ++j) { + ((float *) in_sum2->data)[j] = (float) stat.values[j]; + } + for (int32_t j = 0; j < nmat; ++j) { + ((float *) counts->data)[j] = (float) stat.counts[j]; + } + + gguf_add_tensor(ctx_gguf, in_sum2); + gguf_add_tensor(ctx_gguf, counts); + } + } + + gguf_write_to_file(ctx_gguf, fname.c_str(), false); + LOGV(1, "\n"); - LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str()); + + gguf_free(ctx_gguf); + ggml_free(ctx); } -bool IMatrixCollector::load_imatrix(const char * fname) { +bool IMatrixCollector::load_imatrix_legacy(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - LOG_ERR("%s: failed to open %s\n",__func__, fname); + LOG_ERR("%s: failed to open %s\n", __func__, fname); return false; } int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); + in.read((char *) &n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { LOG_ERR("%s: no data in file %s\n", __func__, fname); return false; } + // Guess the chunk size because it's not stored in the file + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + for (int i = 0; i < n_entries; ++i) { - int len; in.read((char *)&len, sizeof(len)); - std::vector name_as_vec(len+1); - in.read((char *)name_as_vec.data(), len); + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + std::vector name_as_vec(len + 1); + in.read((char *) name_as_vec.data(), len); if (in.fail()) { - LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); + LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname); return false; } name_as_vec[len] = 0; - std::string name{name_as_vec.data()}; + std::string name{ name_as_vec.data() }; auto & e = m_stats[std::move(name)]; - int ncall; - in.read((char*)&ncall, sizeof(ncall)); - int nval; - in.read((char *)&nval, sizeof(nval)); + int32_t ncall = 0; + in.read((char *) &ncall, sizeof(ncall)); + int32_t nval = 0; + in.read((char *) &nval, sizeof(nval)); if (in.fail() || nval < 1) { - LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i); m_stats = {}; return false; } if (e.values.empty()) { - e.values.resize(nval, 0); - e.counts.resize(nval, 0); + e.values.resize(nval, 0.0f); + e.counts.resize(1, 0); } std::vector tmp(nval); - in.read((char*)tmp.data(), nval*sizeof(float)); + in.read((char *) tmp.data(), nval * sizeof(float)); if (in.fail()) { - LOG_ERR("%s: failed reading data for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading data for entry %d\n", __func__, i); m_stats = {}; return false; } - // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. + // Recreate the state as expected by save_imatrix(), and correct for weighted sum. for (int i = 0; i < nval; i++) { - e.values[i] += tmp[i]; - e.counts[i] += ncall; + e.values[i] += tmp[i] * chunk_size; + } + // The legacy format doesn't distinguish the counts for different experts + for (size_t j = 0; j < e.counts.size(); ++j) { + e.counts[j] += ncall * chunk_size; + } + } + + { + // TODO: extract into its own method; this is also used by the GGUF-based format + // Calculate the last chunk count + int64_t max_count = 0; + for (const auto & stats : m_stats) { + for (int64_t count : stats.second.counts) { + if (count > max_count) { + max_count = count; + } + } + } + m_last_chunk = max_count / (chunk_size); + } + + { + // Read the number of calls the matrix was computed with + int32_t n_calls; + in.read((char *) &n_calls, sizeof(n_calls)); + // ignore it because it's not important + } + + // Read the dataset path to include it when writing to GGUF + if (!in.fail()){ + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + if (!in.fail()) { + std::vector dataset; + dataset.resize(len + 1, 0); + in.read(dataset.data(), len); + if (!in.fail()) { + m_datasets.push_back(dataset.data()); + } + } + } + + return true; +} + +// Using GGUF as the file format, for greater extensibility +bool IMatrixCollector::load_imatrix(const char * file_name) { + struct ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, // the data is needed + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params); + if (!ctx_gguf) { + return this->load_imatrix_legacy(file_name); + } + const int32_t n_entries = gguf_get_n_tensors(ctx_gguf); + if (n_entries < 1) { + LOG_ERR("%s: no data in file %s\n", __func__, file_name); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS); + if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) { + const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key); + m_datasets.reserve(m_datasets.size() + n); + for (int64_t i = 0; i < n; ++i) { + m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i)); } - e.ncall += ncall; + } + + const std::string in_sum2_suffix{ ".in_sum2" }; + const std::string counts_suffix{ ".counts" }; + + // Could re-use m_stats instead, but this allows + // checking for completeness of *each* loaded imatrix file + // and also makes it easier to re-use a similar implementation in quantize.cpp + // Using an ordered map to get a deterministic iteration order. + std::map> sums_counts_for; + + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name = cur->name; + if (name.empty()) { continue; } + + if (string_remove_suffix(name, in_sum2_suffix)) { + // in_sum2 + sums_counts_for[std::move(name)].first = cur; + } else if (string_remove_suffix(name, counts_suffix)) { + // counts + sums_counts_for[std::move(name)].second = cur; + } else { + // ignore other tensors + } } + + for (const auto & sc : sums_counts_for) { + const std::string & name = sc.first; + const struct ggml_tensor * in_sum2 = sc.second.first; + const struct ggml_tensor * counts = sc.second.second; + + if (!in_sum2 || !counts) { + LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + auto & e = m_stats[name]; + + int64_t nval = ggml_nelements(in_sum2); + if (e.values.empty()) { + e.values.resize(nval, 0.0f); + } else if ((size_t) nval != e.values.size()) { + LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + int64_t ncounts = ggml_nelements(counts); + if (e.counts.empty()) { + e.counts.resize(ncounts, 0); + } else if (e.counts.size() == 1 && ncounts > 1) { + // broadcast, when loading an old imatrix + e.counts.resize(ncounts, e.counts[0]); + } else if ((size_t) ncounts != e.counts.size()) { + LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + // Recreate the state as expected by save_imatrix() + for (int64_t j = 0; j < nval; j++) { + e.values[j] += ((const float *) in_sum2->data)[j]; + } + for (int64_t j = 0; j < ncounts; j++) { + e.counts[j] += std::lround(((const float *) counts->data)[j]); + } + } + + // TODO: extract into its own method; this is also used by the legacy format + // Calculate the last chunk count + int64_t max_count = 0; + for (const auto & stats : m_stats) { + for (int64_t count : stats.second.counts) { + if (count > max_count) { + max_count = count; + } + } + } + m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel); + + gguf_free(ctx_gguf); + ggml_free(ctx); return true; } @@ -428,12 +751,11 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const common_params & params) { +static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); const bool add_bos = llama_vocab_get_add_bos(vocab); - const int n_ctx = llama_n_ctx(ctx); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -478,45 +800,61 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { double nll = 0.0; double nll2 = 0.0; - LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + const int n_seq = std::max(1, n_batch / n_ctx); - std::vector workers(std::thread::hardware_concurrency() - 1); + GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0); + GGML_ASSERT(params.n_ctx == n_seq * n_ctx); - const int num_batches = (n_ctx + n_batch - 1) / n_batch; + llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1); std::vector logits; if (params.compute_ppl && num_batches > 1) { logits.reserve((size_t)n_ctx * n_vocab); } - for (int i = 0; i < n_chunk; ++i) { + LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + + std::vector workers(std::thread::hardware_concurrency() - 1); + + for (int i = 0; i < n_chunk; i += n_seq) { const int start = i * n_ctx; const int end = start + n_ctx; - std::vector logits; + const int n_seq_batch = std::min(n_seq, n_chunk - i); const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache llama_memory_clear(llama_get_memory(ctx), true); - llama_batch batch = llama_batch_init(n_batch, 0, 1); - for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); - // save original token and restore it after eval - const auto token_org = tokens[batch_start]; + // clear the batch + common_batch_clear(batch); + + for (int seq = 0; seq < n_seq_batch; seq++) { + int seq_start = batch_start + seq*n_ctx; - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[batch_start] = llama_vocab_bos(vocab); - } + // save original token and restore it after eval + const auto token_org = tokens[seq_start]; - common_batch_clear(batch); - for (int i = 0; i < batch_size; i++) { - common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[seq_start] = llama_vocab_bos(vocab); + } + for (int k = 0; k < batch_size; ++k) { + // NOTE: specifying all logits to get activations for the output.weight tensor + // and also for the perplexity calculation. + // TODO: only get outputs when (params.process_output || params.compute_ppl) + // (not possible when this skips FFN computation of the last layer) + common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true); + } + + // restore the original token in case it was set to BOS + tokens[seq_start] = token_org; } if (llama_decode(ctx, batch)) { @@ -525,23 +863,19 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { return false; } - // restore the original token in case it was set to BOS - tokens[batch_start] = token_org; - if (params.compute_ppl && num_batches > 1) { const auto * batch_logits = llama_get_logits(ctx); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } } - llama_batch_free(batch); - - const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { + llama_synchronize(ctx); + const auto t_end = std::chrono::high_resolution_clock::now(); const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); + int total_seconds = (int)(t_total * n_chunk / n_seq); if (total_seconds >= 60*60) { LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); @@ -551,17 +885,27 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { if (params.compute_ppl) { const int first = n_ctx/2; - const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - count += n_ctx - first - 1; + for (int seq = 0; seq < n_seq_batch; seq++) { + const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx); + + llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first; - LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); + process_logits(n_vocab, all_logits + first*n_vocab, + tokens_data, n_ctx - 1 - first, + workers, nll, nll2, + logit_history.data() + start + seq*n_ctx + first, + prob_history.data() + start + seq*n_ctx + first); + + count += n_ctx - first - 1; + + LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + } fflush(stdout); logits.clear(); } } + LOG("\n"); if (params.compute_ppl) { @@ -577,13 +921,15 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { } } + llama_batch_free(batch); + return true; } int main(int argc, char ** argv) { common_params params; - params.out_file = "imatrix.dat" ; + params.out_file = "imatrix.gguf"; params.n_ctx = 512; params.escape = false; @@ -594,7 +940,22 @@ int main(int argc, char ** argv) { common_init(); - params.n_batch = std::min(params.n_batch, params.n_ctx); + const int32_t n_ctx = params.n_ctx; + + if (n_ctx <= 0) { + LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__); + return 1; + } + + { + const int32_t n_seq = std::max(1, params.n_batch / n_ctx); + const int32_t n_kv = n_seq * n_ctx; + + params.n_parallel = n_seq; + params.n_ctx = n_kv; + + params.n_batch = std::min(params.n_batch, n_kv); + } g_collector.set_params(params); @@ -606,9 +967,23 @@ int main(int argc, char ** argv) { } } - if (params.in_files.size() > 1) { - LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + if (params.prompt.empty()) { + LOG_INF("No prompt provided; combining precomputed matrices only.\n"); + + if (params.in_files.empty()) { + LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); + return 1; + } + + if (params.in_files.size() == 1) { + LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str()); + } else if (params.in_files.size() > 1) { + LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + } + g_collector.save_imatrix(); + + return 0; } llama_backend_init(); @@ -643,19 +1018,10 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - if (params.prompt.empty()) { - if (params.in_files.empty()) { - LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); - return 1; - } - LOG_INF("No prompt provided; combining precomputed matrices only.\n"); - } else { - if (!compute_imatrix(ctx, params)) { - return 1; - } + if (!compute_imatrix(ctx, params, n_ctx)) { + return 1; } - g_collector.save_imatrix(); LOG("\n"); diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 8acc7651788..45c59ecb6ff 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -1,11 +1,13 @@ #include "common.h" #include "llama.h" +#include "gguf.h" #include #include #include #include #include +#include #include #include #include @@ -68,6 +70,11 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; +// TODO: share with imatrix.cpp +static const char * const LLM_KV_IMATRIX_DATASETS = "imatrix.datasets"; +static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; +static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; + static bool striequals(const char * a, const char * b) { while (*a && *b) { if (std::tolower(*a) != std::tolower(*b)) { @@ -84,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp for (auto ch : ftype_str_in) { ftype_str.push_back(std::toupper(ch)); } - for (auto & it : QUANT_OPTIONS) { + for (const auto & it : QUANT_OPTIONS) { if (striequals(it.name.c_str(), ftype_str.c_str())) { ftype = it.ftype; ftype_str_out = it.name; @@ -93,7 +100,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp } try { int ftype_int = std::stoi(ftype_str); - for (auto & it : QUANT_OPTIONS) { + for (const auto & it : QUANT_OPTIONS) { if (it.ftype == ftype_int) { ftype = it.ftype; ftype_str_out = it.name; @@ -129,7 +136,7 @@ static void usage(const char * executable) { printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("\nAllowed quantization types:\n"); - for (auto & it : QUANT_OPTIONS) { + for (const auto & it : QUANT_OPTIONS) { if (it.name != "COPY") { printf(" %2d or ", it.ftype); } else { @@ -140,7 +147,7 @@ static void usage(const char * executable) { exit(1); } -static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map> & imatrix_data) { +static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); @@ -180,7 +187,9 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_ exit(1); } if (ncall > 0) { - for (auto& v : e) v /= ncall; + for (auto & v : e) { + v /= ncall; + } } if (getenv("LLAMA_TRACE")) { @@ -188,7 +197,7 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_ } } - // latest imatrix version contains the dataset filename at the end of the file + // latest legacy imatrix version contains the dataset filename at the end of the file int m_last_call = 0; if (in.peek() != EOF) { in.read((char *)&m_last_call, sizeof(m_last_call)); @@ -196,15 +205,130 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_ in.read((char *)&dataset_len, sizeof(dataset_len)); std::vector dataset_as_vec(dataset_len); in.read(dataset_as_vec.data(), dataset_len); - imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end()); - printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str()); + imatrix_datasets.resize(1); + imatrix_datasets[0].assign(dataset_as_vec.begin(), dataset_as_vec.end()); + printf("%s: imatrix dataset='%s'\n", __func__, imatrix_datasets[0].c_str()); } printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call); return m_last_call; } +static int load_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & imatrix_data) { + + struct ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, // the data is needed + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str()); + return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data); + } + const int32_t n_entries = gguf_get_n_tensors(ctx_gguf); + if (n_entries < 1) { + fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + exit(1); + } + + const int dataset_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS); + const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT); + const int chunk_size_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE); + if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) { + fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + exit(1); + } + + const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx); + + const std::string sums_suffix{ ".in_sum2" }; + const std::string counts_suffix{ ".counts" }; + + // Using an ordered map to get a deterministic iteration order. + std::map> sums_counts_for; + + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name = cur->name; + + if (name.empty()) { continue; } + + if (string_remove_suffix(name, sums_suffix)) { + // in_sum2 + sums_counts_for[std::move(name)].first = cur; + } else if (string_remove_suffix(name, counts_suffix)) { + // counts + sums_counts_for[std::move(name)].second = cur; + } else { + // ignore other tensors + } + } + + for (const auto & sc : sums_counts_for) { + const std::string & name = sc.first; + const struct ggml_tensor * sums = sc.second.first; + const struct ggml_tensor * counts = sc.second.second; + + if (!sums || !counts) { + fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + exit(1); + } + + const int64_t ne0 = sums->ne[0]; + const int64_t ne1 = sums->ne[1]; + + auto & e = imatrix_data[name]; + e.resize(ggml_nelements(sums)); + float max_count = 0.0f; + for (int64_t j = 0; j < ne1; ++j) { + const float count = ((const float *) counts->data)[j]; + if (count > 0.0f) { + for (int64_t i = 0; i < ne0; ++i) { + e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; + } + } else { + // Partial imatrix data, this tensor never got any input during calibration + for (int64_t i = 0; i < ne0; ++i) { + e[j*ne0 + i] = 1; + } + } + if (count > max_count) { + max_count = count; + } + } + if (getenv("LLAMA_TRACE")) { + printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str()); + } + } + + int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx); + + int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx); + imatrix_datasets.reserve(n_datasets); + for (int64_t i = 0; i < n_datasets; ++i) { + imatrix_datasets.push_back(gguf_get_val_str(ctx_gguf, dataset_idx)); + } + printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str()); + for (size_t i = 1; i < imatrix_datasets.size(); ++i) { + printf(", '%s'", imatrix_datasets[i].c_str()); + } + printf("]\n"); + + printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk); + + gguf_free(ctx_gguf); + ggml_free(ctx); + + return m_last_chunk; +} + static int prepare_imatrix(const std::string & imatrix_file, - std::string & imatrix_dataset, + std::vector & imatrix_dataset, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & imatrix_data) { @@ -216,18 +340,21 @@ static int prepare_imatrix(const std::string & imatrix_file, return m_last_call; } if (!excluded_weights.empty()) { - for (auto& name : excluded_weights) { - for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) { + for (const auto & name : excluded_weights) { + for (auto it = imatrix_data.begin(); it != imatrix_data.end();) { auto pos = it->first.find(name); - if (pos != std::string::npos) it = imatrix_data.erase(it); - else ++it; + if (pos != std::string::npos) { + it = imatrix_data.erase(it); + } else { + ++it; + } } } } if (!included_weights.empty()) { std::unordered_map> tmp; - for (auto& name : included_weights) { - for (auto& e : imatrix_data) { + for (const auto & name : included_weights) { + for (auto & e : imatrix_data) { auto pos = e.first.find(name); if (pos != std::string::npos) { tmp.emplace(std::move(e)); @@ -396,9 +523,9 @@ int main(int argc, char ** argv) { usage(argv[0]); } - std::string imatrix_dataset; + std::vector imatrix_datasets; std::unordered_map> imatrix_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data); if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; { @@ -409,11 +536,12 @@ int main(int argc, char ** argv) { kvo.val_str[127] = '\0'; kv_overrides.emplace_back(std::move(kvo)); } - if (!imatrix_dataset.empty()) { + if (!imatrix_datasets.empty()) { llama_model_kv_override kvo; + // TODO: list multiple datasets when there are more than one std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - strncpy(kvo.val_str, imatrix_dataset.c_str(), 127); + strncpy(kvo.val_str, imatrix_datasets[0].c_str(), 127); kvo.val_str[127] = '\0'; kv_overrides.emplace_back(std::move(kvo)); }