From 606fa42f5de58ef74e0eb9a0a55bb6c0a689e2eb Mon Sep 17 00:00:00 2001 From: "Alessandro de Oliveira Faria (A.K.A.CABELO)" Date: Tue, 21 Apr 2026 11:45:48 -0300 Subject: [PATCH 1/5] vendor : update cpp-httplib to 0.43.1 (#22143) * vendor : update cpp-httplib to 0.43.0 * vendor : update cpp-httplib to 0.43.0 --- scripts/sync_vendor.py | 2 +- vendor/cpp-httplib/httplib.cpp | 595 ++++++++------------------------- vendor/cpp-httplib/httplib.h | 139 ++------ 3 files changed, 179 insertions(+), 557 deletions(-) diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index 7ce09332360..ff1dd075303 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -5,7 +5,7 @@ import sys import subprocess -HTTPLIB_VERSION = "refs/tags/v0.42.0" +HTTPLIB_VERSION = "refs/tags/v0.43.1" vendor = { "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp index 1cd71c2ecc4..95bf0eb1bb5 100644 --- a/vendor/cpp-httplib/httplib.cpp +++ b/vendor/cpp-httplib/httplib.cpp @@ -872,7 +872,8 @@ bool write_websocket_frame(Stream &strm, ws::Opcode opcode, if (strm.write(reinterpret_cast(header), 2) < 0) { return false; } uint8_t ext[8]; for (int i = 7; i >= 0; i--) { - ext[7 - i] = static_cast((len >> (i * 8)) & 0xFF); + ext[7 - i] = + static_cast((static_cast(len) >> (i * 8)) & 0xFF); } if (strm.write(reinterpret_cast(ext), 8) < 0) { return false; } } @@ -1034,10 +1035,15 @@ bool canonicalize_path(const char *path, std::string &resolved) { char buf[_MAX_PATH]; if (_fullpath(buf, path, _MAX_PATH) == nullptr) { return false; } resolved = buf; -#else +#elif defined(PATH_MAX) char buf[PATH_MAX]; if (realpath(path, buf) == nullptr) { return false; } resolved = buf; +#else + auto buf = realpath(path, nullptr); + auto guard = scope_exit([&]() { std::free(buf); }); + if (buf == nullptr) { return false; } + resolved = buf; #endif return true; } @@ -2765,6 +2771,35 @@ EncodingType encoding_type(const Request &req, const Response &res) { return best; } +std::unique_ptr make_compressor(EncodingType type) { +#ifdef CPPHTTPLIB_ZLIB_SUPPORT + if (type == EncodingType::Gzip) { + return detail::make_unique(); + } +#endif +#ifdef CPPHTTPLIB_BROTLI_SUPPORT + if (type == EncodingType::Brotli) { + return detail::make_unique(); + } +#endif +#ifdef CPPHTTPLIB_ZSTD_SUPPORT + if (type == EncodingType::Zstd) { + return detail::make_unique(); + } +#endif + (void)type; + return nullptr; +} + +const char *encoding_name(EncodingType type) { + switch (type) { + case EncodingType::Gzip: return "gzip"; + case EncodingType::Brotli: return "br"; + case EncodingType::Zstd: return "zstd"; + default: return ""; + } +} + bool nocompressor::compress(const char *data, size_t data_length, bool /*last*/, Callback callback) { if (!data_length) { return true; } @@ -3097,6 +3132,29 @@ const char *get_header_value(const Headers &headers, return def; } +size_t get_header_value_count(const Headers &headers, + const std::string &key) { + auto r = headers.equal_range(key); + return static_cast(std::distance(r.first, r.second)); +} + +template +typename Map::mapped_type +get_multimap_value(const Map &m, const std::string &key, size_t id) { + auto rng = m.equal_range(key); + auto it = rng.first; + std::advance(it, static_cast(id)); + if (it != rng.second) { return it->second; } + return typename Map::mapped_type(); +} + +void set_header(Headers &headers, const std::string &key, + const std::string &val) { + if (fields::is_field_name(key) && fields::is_field_value(val)) { + headers.emplace(key, val); + } +} + bool read_headers(Stream &strm, Headers &headers) { const auto bufsiz = 2048; char buf[bufsiz]; @@ -5791,16 +5849,12 @@ std::string Request::get_header_value(const std::string &key, } size_t Request::get_header_value_count(const std::string &key) const { - auto r = headers.equal_range(key); - return static_cast(std::distance(r.first, r.second)); + return detail::get_header_value_count(headers, key); } void Request::set_header(const std::string &key, const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { - headers.emplace(key, val); - } + detail::set_header(headers, key, val); } bool Request::has_trailer(const std::string &key) const { @@ -5809,11 +5863,7 @@ bool Request::has_trailer(const std::string &key) const { std::string Request::get_trailer_value(const std::string &key, size_t id) const { - auto rng = trailers.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second; } - return std::string(); + return detail::get_multimap_value(trailers, key, id); } size_t Request::get_trailer_value_count(const std::string &key) const { @@ -5827,11 +5877,7 @@ bool Request::has_param(const std::string &key) const { std::string Request::get_param_value(const std::string &key, size_t id) const { - auto rng = params.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second; } - return std::string(); + return detail::get_multimap_value(params, key, id); } std::vector @@ -5886,11 +5932,7 @@ size_t MultipartFormData::get_field_count(const std::string &key) const { FormData MultipartFormData::get_file(const std::string &key, size_t id) const { - auto rng = files.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second; } - return FormData(); + return detail::get_multimap_value(files, key, id); } std::vector @@ -5929,16 +5971,12 @@ std::string Response::get_header_value(const std::string &key, } size_t Response::get_header_value_count(const std::string &key) const { - auto r = headers.equal_range(key); - return static_cast(std::distance(r.first, r.second)); + return detail::get_header_value_count(headers, key); } void Response::set_header(const std::string &key, const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { - headers.emplace(key, val); - } + detail::set_header(headers, key, val); } bool Response::has_trailer(const std::string &key) const { return trailers.find(key) != trailers.end(); @@ -5946,11 +5984,7 @@ bool Response::has_trailer(const std::string &key) const { std::string Response::get_trailer_value(const std::string &key, size_t id) const { - auto rng = trailers.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second; } - return std::string(); + return detail::get_multimap_value(trailers, key, id); } size_t Response::get_trailer_value_count(const std::string &key) const { @@ -6253,15 +6287,6 @@ void ThreadPool::worker(bool is_dynamic) { assert(true == static_cast(fn)); fn(); - - // Dynamic thread: exit if queue is empty after task completion - if (is_dynamic) { - std::unique_lock lock(mutex_); - if (jobs_.empty()) { - move_to_finished(std::this_thread::get_id()); - break; - } - } } #if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) && \ @@ -6791,61 +6816,51 @@ Server::make_matcher(const std::string &pattern) { } Server &Server::Get(const std::string &pattern, Handler handler) { - get_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); - return *this; + return add_handler(get_handlers_, pattern, std::move(handler)); } Server &Server::Post(const std::string &pattern, Handler handler) { - post_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); - return *this; + return add_handler(post_handlers_, pattern, std::move(handler)); } Server &Server::Post(const std::string &pattern, HandlerWithContentReader handler) { - post_handlers_for_content_reader_.emplace_back(make_matcher(pattern), - std::move(handler)); - return *this; + return add_handler(post_handlers_for_content_reader_, pattern, + std::move(handler)); } Server &Server::Put(const std::string &pattern, Handler handler) { - put_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); - return *this; + return add_handler(put_handlers_, pattern, std::move(handler)); } Server &Server::Put(const std::string &pattern, HandlerWithContentReader handler) { - put_handlers_for_content_reader_.emplace_back(make_matcher(pattern), - std::move(handler)); - return *this; + return add_handler(put_handlers_for_content_reader_, pattern, + std::move(handler)); } Server &Server::Patch(const std::string &pattern, Handler handler) { - patch_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); - return *this; + return add_handler(patch_handlers_, pattern, std::move(handler)); } Server &Server::Patch(const std::string &pattern, HandlerWithContentReader handler) { - patch_handlers_for_content_reader_.emplace_back(make_matcher(pattern), - std::move(handler)); - return *this; + return add_handler(patch_handlers_for_content_reader_, pattern, + std::move(handler)); } Server &Server::Delete(const std::string &pattern, Handler handler) { - delete_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); - return *this; + return add_handler(delete_handlers_, pattern, std::move(handler)); } Server &Server::Delete(const std::string &pattern, HandlerWithContentReader handler) { - delete_handlers_for_content_reader_.emplace_back(make_matcher(pattern), - std::move(handler)); - return *this; + return add_handler(delete_handlers_for_content_reader_, pattern, + std::move(handler)); } Server &Server::Options(const std::string &pattern, Handler handler) { - options_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); - return *this; + return add_handler(options_handlers_, pattern, std::move(handler)); } Server &Server::WebSocket(const std::string &pattern, @@ -7054,6 +7069,11 @@ Server &Server::set_payload_max_length(size_t length) { return *this; } +Server &Server::set_websocket_max_missed_pongs(int count) { + websocket_max_missed_pongs_ = count; + return *this; +} + Server &Server::set_websocket_ping_interval(time_t sec) { websocket_ping_interval_sec_ = sec; return *this; @@ -7279,23 +7299,10 @@ Server::write_content_with_provider(Stream &strm, const Request &req, if (res.is_chunked_content_provider_) { auto type = detail::encoding_type(req, res); - std::unique_ptr compressor; - if (type == detail::EncodingType::Gzip) { -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - compressor = detail::make_unique(); -#endif - } else if (type == detail::EncodingType::Brotli) { -#ifdef CPPHTTPLIB_BROTLI_SUPPORT - compressor = detail::make_unique(); -#endif - } else if (type == detail::EncodingType::Zstd) { -#ifdef CPPHTTPLIB_ZSTD_SUPPORT - compressor = detail::make_unique(); -#endif - } else { + auto compressor = detail::make_compressor(type); + if (!compressor) { compressor = detail::make_unique(); } - assert(compressor != nullptr); return detail::write_content_chunked(strm, res.content_provider_, is_shutting_down, *compressor); @@ -7917,14 +7924,8 @@ void Server::apply_ranges(const Request &req, Response &res, if (res.content_provider_) { if (res.is_chunked_content_provider_) { res.set_header("Transfer-Encoding", "chunked"); - if (type == detail::EncodingType::Gzip) { - res.set_header("Content-Encoding", "gzip"); - res.set_header("Vary", "Accept-Encoding"); - } else if (type == detail::EncodingType::Brotli) { - res.set_header("Content-Encoding", "br"); - res.set_header("Vary", "Accept-Encoding"); - } else if (type == detail::EncodingType::Zstd) { - res.set_header("Content-Encoding", "zstd"); + if (type != detail::EncodingType::None) { + res.set_header("Content-Encoding", detail::encoding_name(type)); res.set_header("Vary", "Accept-Encoding"); } } @@ -7955,27 +7956,7 @@ void Server::apply_ranges(const Request &req, Response &res, if (type != detail::EncodingType::None) { output_pre_compression_log(req, res); - std::unique_ptr compressor; - std::string content_encoding; - - if (type == detail::EncodingType::Gzip) { -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - compressor = detail::make_unique(); - content_encoding = "gzip"; -#endif - } else if (type == detail::EncodingType::Brotli) { -#ifdef CPPHTTPLIB_BROTLI_SUPPORT - compressor = detail::make_unique(); - content_encoding = "br"; -#endif - } else if (type == detail::EncodingType::Zstd) { -#ifdef CPPHTTPLIB_ZSTD_SUPPORT - compressor = detail::make_unique(); - content_encoding = "zstd"; -#endif - } - - if (compressor) { + if (auto compressor = detail::make_compressor(type)) { std::string compressed; if (compressor->compress(res.body.data(), res.body.size(), true, [&](const char *data, size_t data_len) { @@ -7983,7 +7964,7 @@ void Server::apply_ranges(const Request &req, Response &res, return true; })) { res.body.swap(compressed); - res.set_header("Content-Encoding", content_encoding); + res.set_header("Content-Encoding", detail::encoding_name(type)); res.set_header("Vary", "Accept-Encoding"); } } @@ -8231,7 +8212,8 @@ Server::process_request(Stream &strm, const std::string &remote_addr, { // Use WebSocket-specific read timeout instead of HTTP timeout strm.set_read_timeout(CPPHTTPLIB_WEBSOCKET_READ_TIMEOUT_SECOND, 0); - ws::WebSocket ws(strm, req, true, websocket_ping_interval_sec_); + ws::WebSocket ws(strm, req, true, websocket_ping_interval_sec_, + websocket_max_missed_pongs_); entry.handler(req, ws); } return true; @@ -10822,38 +10804,6 @@ void ClientImpl::enable_server_hostname_verification(bool enabled) { } #endif -// ClientImpl::set_ca_cert_store is defined after TLS namespace (uses helpers) -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert, - std::size_t size) const { - auto mem = BIO_new_mem_buf(ca_cert, static_cast(size)); - auto se = detail::scope_exit([&] { BIO_free_all(mem); }); - if (!mem) { return nullptr; } - - auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr); - if (!inf) { return nullptr; } - - auto cts = X509_STORE_new(); - if (cts) { - for (auto i = 0; i < static_cast(sk_X509_INFO_num(inf)); i++) { - auto itmp = sk_X509_INFO_value(inf, i); - if (!itmp) { continue; } - - if (itmp->x509) { X509_STORE_add_cert(cts, itmp->x509); } - if (itmp->crl) { X509_STORE_add_crl(cts, itmp->crl); } - } - } - - sk_X509_INFO_pop_free(inf, X509_INFO_free); - return cts; -} - -void ClientImpl::set_server_certificate_verifier( - std::function /*verifier*/) { - // Base implementation does nothing - SSLClient overrides this -} -#endif - void ClientImpl::set_logger(Logger logger) { logger_ = std::move(logger); } @@ -10927,10 +10877,10 @@ Client::Client(const std::string &scheme_host_port, cli_ = detail::make_unique(scheme_host_port, 80, client_cert_path, client_key_path); } -} // namespace detail +} Client::Client(const std::string &host, int port) - : cli_(detail::make_unique(host, port)) {} + : Client(host, port, std::string(), std::string()) {} Client::Client(const std::string &host, int port, const std::string &client_cert_path, @@ -11505,12 +11455,6 @@ void Client::set_follow_location(bool on) { void Client::set_path_encode(bool on) { cli_->set_path_encode(on); } -[[deprecated("Use set_path_encode() instead. " - "This function will be removed by v1.0.0.")]] -void Client::set_url_encode(bool on) { - cli_->set_path_encode(on); -} - void Client::set_compress(bool on) { cli_->set_compress(on); } void Client::set_decompress(bool on) { cli_->set_decompress(on); } @@ -11893,24 +11837,31 @@ SSLClient::SSLClient(const std::string &host) SSLClient::SSLClient(const std::string &host, int port) : SSLClient(host, port, std::string(), std::string()) {} +void SSLClient::init_ctx() { + ctx_ = tls::create_client_context(); + if (ctx_) { tls::set_min_version(ctx_, tls::Version::TLS1_2); } +} + +void SSLClient::reset_ctx_on_error() { + last_backend_error_ = tls::get_error(); + tls::free_context(ctx_); + ctx_ = nullptr; +} + SSLClient::SSLClient(const std::string &host, int port, const std::string &client_cert_path, const std::string &client_key_path, const std::string &private_key_password) : ClientImpl(host, port, client_cert_path, client_key_path) { - ctx_ = tls::create_client_context(); + init_ctx(); if (!ctx_) { return; } - tls::set_min_version(ctx_, tls::Version::TLS1_2); - if (!client_cert_path.empty() && !client_key_path.empty()) { const char *password = private_key_password.empty() ? nullptr : private_key_password.c_str(); if (!tls::set_client_cert_file(ctx_, client_cert_path.c_str(), client_key_path.c_str(), password)) { - last_backend_error_ = tls::get_error(); - tls::free_context(ctx_); - ctx_ = nullptr; + reset_ctx_on_error(); } } } @@ -11918,17 +11869,13 @@ SSLClient::SSLClient(const std::string &host, int port, SSLClient::SSLClient(const std::string &host, int port, const PemMemory &pem) : ClientImpl(host, port) { - ctx_ = tls::create_client_context(); + init_ctx(); if (!ctx_) { return; } - tls::set_min_version(ctx_, tls::Version::TLS1_2); - if (pem.cert_pem && pem.key_pem) { if (!tls::set_client_cert_pem(ctx_, pem.cert_pem, pem.key_pem, pem.private_key_password)) { - last_backend_error_ = tls::get_error(); - tls::free_context(ctx_); - ctx_ = nullptr; + reset_ctx_on_error(); } } } @@ -12479,41 +12426,6 @@ std::string Request::sni() const { * Group 8: TLS abstraction layer - OpenSSL backend */ -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -// These wrappers forward to deprecated APIs that will be removed by v1.0.0. -// Suppress C4996 / -Wdeprecated-declarations so that MSVC /sdl builds (which -// promote C4996 to an error) compile cleanly even though the wrappers -// themselves are also marked [[deprecated]]. -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) -#elif defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - -SSL_CTX *Client::ssl_context() const { - if (is_ssl_) { return static_cast(*cli_).ssl_context(); } - return nullptr; -} - -void Client::set_server_certificate_verifier( - std::function verifier) { - cli_->set_server_certificate_verifier(verifier); -} - -long Client::get_verify_result() const { - if (is_ssl_) { return static_cast(*cli_).get_verify_result(); } - return -1; // NOTE: -1 doesn't match any of X509_V_ERR_??? -} - -#if defined(_MSC_VER) -#pragma warning(pop) -#elif defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#endif -#endif // CPPHTTPLIB_OPENSSL_SUPPORT - /* * OpenSSL Backend Implementation */ @@ -12523,54 +12435,6 @@ namespace tls { namespace impl { -// OpenSSL-specific helpers for converting native types to PEM -std::string x509_to_pem(X509 *cert) { - if (!cert) return {}; - BIO *bio = BIO_new(BIO_s_mem()); - if (!bio) return {}; - if (PEM_write_bio_X509(bio, cert) != 1) { - BIO_free(bio); - return {}; - } - char *data = nullptr; - long len = BIO_get_mem_data(bio, &data); - std::string pem(data, static_cast(len)); - BIO_free(bio); - return pem; -} - -std::string evp_pkey_to_pem(EVP_PKEY *key) { - if (!key) return {}; - BIO *bio = BIO_new(BIO_s_mem()); - if (!bio) return {}; - if (PEM_write_bio_PrivateKey(bio, key, nullptr, nullptr, 0, nullptr, - nullptr) != 1) { - BIO_free(bio); - return {}; - } - char *data = nullptr; - long len = BIO_get_mem_data(bio, &data); - std::string pem(data, static_cast(len)); - BIO_free(bio); - return pem; -} - -std::string x509_store_to_pem(X509_STORE *store) { - if (!store) return {}; - std::string pem; - auto objs = X509_STORE_get0_objects(store); - if (!objs) return {}; - auto count = sk_X509_OBJECT_num(objs); - for (decltype(count) i = 0; i < count; i++) { - auto obj = sk_X509_OBJECT_value(objs, i); - if (X509_OBJECT_get_type(obj) == X509_LU_X509) { - auto cert = X509_OBJECT_get0_X509(obj); - if (cert) { pem += x509_to_pem(cert); } - } - } - return pem; -} - // Helper to map OpenSSL SSL_get_error to ErrorCode ErrorCode map_ssl_error(int ssl_error, int &out_errno) { switch (ssl_error) { @@ -12603,8 +12467,10 @@ STACK_OF(X509_NAME) * X509 *cert = nullptr; while ((cert = PEM_read_bio_X509(bio, nullptr, nullptr, nullptr)) != nullptr) { - X509_NAME *name = X509_get_subject_name(cert); - if (name) { sk_X509_NAME_push(ca_list, X509_NAME_dup(name)); } + const X509_NAME *name = X509_get_subject_name(cert); + if (name) { + sk_X509_NAME_push(ca_list, X509_NAME_dup(const_cast(name))); + } X509_free(cert); } BIO_free(bio); @@ -12612,45 +12478,6 @@ STACK_OF(X509_NAME) * return ca_list; } -// Helper: Extract CA names from X509_STORE -// Returns a new STACK_OF(X509_NAME)* or nullptr on failure -// Caller takes ownership of returned list -STACK_OF(X509_NAME) * - extract_client_ca_list_from_store(X509_STORE *store) { - if (!store) { return nullptr; } - - auto ca_list = sk_X509_NAME_new_null(); - if (!ca_list) { return nullptr; } - - auto objs = X509_STORE_get0_objects(store); - if (!objs) { - sk_X509_NAME_free(ca_list); - return nullptr; - } - - auto count = sk_X509_OBJECT_num(objs); - for (decltype(count) i = 0; i < count; i++) { - auto obj = sk_X509_OBJECT_value(objs, i); - if (X509_OBJECT_get_type(obj) == X509_LU_X509) { - auto cert = X509_OBJECT_get0_X509(obj); - if (cert) { - auto subject = X509_get_subject_name(cert); - if (subject) { - auto name_dup = X509_NAME_dup(subject); - if (name_dup) { sk_X509_NAME_push(ca_list, name_dup); } - } - } - } - } - - if (sk_X509_NAME_num(ca_list) == 0) { - sk_X509_NAME_free(ca_list); - return nullptr; - } - - return ca_list; -} - // OpenSSL verify callback wrapper int openssl_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) { auto &callback = get_verify_callback(); @@ -13086,6 +12913,9 @@ ssize_t read(session_t session, void *buf, size_t len, TlsError &err) { auto ssl_err = SSL_get_error(ssl, ret); err.code = impl::map_ssl_error(ssl_err, err.sys_errno); + if (err.code == ErrorCode::PeerClosed) { + return 0; + } // Gracefully handle the peer closed state. if (err.code == ErrorCode::Fatal) { err.backend_code = ERR_get_error(); } return -1; } @@ -13523,164 +13353,8 @@ std::string verify_error_string(long error_code) { return str ? str : "unknown error"; } -namespace impl { - -// OpenSSL-specific helpers for public API wrappers -ctx_t create_server_context_from_x509(X509 *cert, EVP_PKEY *key, - X509_STORE *client_ca_store, - int &out_error) { - out_error = 0; - auto cert_pem = x509_to_pem(cert); - auto key_pem = evp_pkey_to_pem(key); - if (cert_pem.empty() || key_pem.empty()) { - out_error = static_cast(ERR_get_error()); - return nullptr; - } - - auto ctx = create_server_context(); - if (!ctx) { - out_error = static_cast(get_error()); - return nullptr; - } - - if (!set_server_cert_pem(ctx, cert_pem.c_str(), key_pem.c_str(), nullptr)) { - out_error = static_cast(get_error()); - free_context(ctx); - return nullptr; - } - - if (client_ca_store) { - // Set cert store for verification (SSL_CTX_set_cert_store takes ownership) - SSL_CTX_set_cert_store(static_cast(ctx), client_ca_store); - - // Extract and set client CA list directly from store (more efficient than - // PEM conversion) - auto ca_list = extract_client_ca_list_from_store(client_ca_store); - if (ca_list) { - SSL_CTX_set_client_CA_list(static_cast(ctx), ca_list); - } - - set_verify_client(ctx, true); - } - - return ctx; -} - -void update_server_certs_from_x509(ctx_t ctx, X509 *cert, EVP_PKEY *key, - X509_STORE *client_ca_store) { - auto cert_pem = x509_to_pem(cert); - auto key_pem = evp_pkey_to_pem(key); - - if (!cert_pem.empty() && !key_pem.empty()) { - update_server_cert(ctx, cert_pem.c_str(), key_pem.c_str(), nullptr); - } - - if (client_ca_store) { - auto ca_pem = x509_store_to_pem(client_ca_store); - if (!ca_pem.empty()) { update_server_client_ca(ctx, ca_pem.c_str()); } - X509_STORE_free(client_ca_store); - } -} - -ctx_t create_client_context_from_x509(X509 *cert, EVP_PKEY *key, - const char *password, - uint64_t &out_error) { - out_error = 0; - auto ctx = create_client_context(); - if (!ctx) { - out_error = get_error(); - return nullptr; - } - - if (cert && key) { - auto cert_pem = x509_to_pem(cert); - auto key_pem = evp_pkey_to_pem(key); - if (cert_pem.empty() || key_pem.empty()) { - out_error = ERR_get_error(); - free_context(ctx); - return nullptr; - } - if (!set_client_cert_pem(ctx, cert_pem.c_str(), key_pem.c_str(), - password)) { - out_error = get_error(); - free_context(ctx); - return nullptr; - } - } - - return ctx; -} - -} // namespace impl - } // namespace tls -// ClientImpl::set_ca_cert_store - defined here to use -// tls::impl::x509_store_to_pem Deprecated: converts X509_STORE to PEM and -// stores for redirect transfer -void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) { - if (ca_cert_store) { - ca_cert_pem_ = tls::impl::x509_store_to_pem(ca_cert_store); - } -} - -SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key, - X509_STORE *client_ca_cert_store) { - ctx_ = tls::impl::create_server_context_from_x509( - cert, private_key, client_ca_cert_store, last_ssl_error_); -} - -SSLServer::SSLServer( - const std::function &setup_ssl_ctx_callback) { - // Use abstract API to create context - ctx_ = tls::create_server_context(); - if (ctx_) { - // Pass to OpenSSL-specific callback (ctx_ is SSL_CTX* internally) - auto ssl_ctx = static_cast(ctx_); - if (!setup_ssl_ctx_callback(*ssl_ctx)) { - tls::free_context(ctx_); - ctx_ = nullptr; - } - } -} - -SSL_CTX *SSLServer::ssl_context() const { - return static_cast(ctx_); -} - -void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key, - X509_STORE *client_ca_cert_store) { - std::lock_guard guard(ctx_mutex_); - tls::impl::update_server_certs_from_x509(ctx_, cert, private_key, - client_ca_cert_store); -} - -SSLClient::SSLClient(const std::string &host, int port, - X509 *client_cert, EVP_PKEY *client_key, - const std::string &private_key_password) - : ClientImpl(host, port) { - const char *password = - private_key_password.empty() ? nullptr : private_key_password.c_str(); - ctx_ = tls::impl::create_client_context_from_x509( - client_cert, client_key, password, last_backend_error_); -} - -long SSLClient::get_verify_result() const { return verify_result_; } - -void SSLClient::set_server_certificate_verifier( - std::function verifier) { - // Wrap SSL* callback into backend-independent session_verifier_ - auto v = std::make_shared>( - std::move(verifier)); - session_verifier_ = [v](tls::session_t session) { - return (*v)(static_cast(session)); - }; -} - -SSL_CTX *SSLClient::ssl_context() const { - return static_cast(ctx_); -} - bool SSLClient::verify_host(X509 *server_cert) const { /* Quote from RFC2818 section 3.1 "Server Identity" @@ -16194,7 +15868,11 @@ ReadResult WebSocket::read(std::string &msg) { payload.size(), true, !is_server_); continue; } - case Opcode::Pong: continue; + case Opcode::Pong: { + std::lock_guard lock(ping_mutex_); + unacked_pings_ = 0; + continue; + } case Opcode::Close: { if (!closed_.exchange(true)) { // Echo close frame back @@ -16228,7 +15906,11 @@ ReadResult WebSocket::read(std::string &msg) { true, !is_server_); continue; } - if (cont_opcode == Opcode::Pong) { continue; } + if (cont_opcode == Opcode::Pong) { + std::lock_guard lock(ping_mutex_); + unacked_pings_ = 0; + continue; + } if (cont_opcode == Opcode::Close) { if (!closed_.exchange(true)) { std::lock_guard lock(write_mutex_); @@ -16316,12 +15998,22 @@ void WebSocket::start_heartbeat() { while (!closed_) { ping_cv_.wait_for(lock, std::chrono::seconds(ping_interval_sec_)); if (closed_) { break; } + // If the peer has failed to respond to the previous pings, give up. + // RFC 6455 does not define a pong-timeout mechanism; this is an + // opt-in liveness check controlled by max_missed_pongs_. + if (max_missed_pongs_ > 0 && unacked_pings_ >= max_missed_pongs_) { + lock.unlock(); + close(CloseStatus::GoingAway, "pong timeout"); + return; + } lock.unlock(); if (!send_frame(Opcode::Ping, nullptr, 0)) { + lock.lock(); closed_ = true; break; } lock.lock(); + unacked_pings_++; } }); } @@ -16449,8 +16141,9 @@ bool WebSocketClient::connect() { Request req; req.method = "GET"; req.path = path_; - ws_ = std::unique_ptr( - new WebSocket(std::move(strm), req, false, websocket_ping_interval_sec_)); + ws_ = std::unique_ptr(new WebSocket(std::move(strm), req, false, + websocket_ping_interval_sec_, + websocket_max_missed_pongs_)); return true; } @@ -16494,6 +16187,10 @@ void WebSocketClient::set_websocket_ping_interval(time_t sec) { websocket_ping_interval_sec_ = sec; } +void WebSocketClient::set_websocket_max_missed_pongs(int count) { + websocket_max_missed_pongs_ = count; +} + void WebSocketClient::set_tcp_nodelay(bool on) { tcp_nodelay_ = on; } void WebSocketClient::set_address_family(int family) { diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h index 1d12f2d2b59..8581d1695a8 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -8,8 +8,8 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.42.0" -#define CPPHTTPLIB_VERSION_NUM "0x002a00" +#define CPPHTTPLIB_VERSION "0.43.1" +#define CPPHTTPLIB_VERSION_NUM "0x002b01" #ifdef _WIN32 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00 @@ -205,6 +205,10 @@ #define CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND 30 #endif +#ifndef CPPHTTPLIB_WEBSOCKET_MAX_MISSED_PONGS +#define CPPHTTPLIB_WEBSOCKET_MAX_MISSED_PONGS 0 +#endif + /* * Headers */ @@ -1720,6 +1724,8 @@ class Server { Server &set_websocket_ping_interval( const std::chrono::duration &duration); + Server &set_websocket_max_missed_pongs(int count); + bool bind_to_port(const std::string &host, int port, int socket_flags = 0); int bind_to_any_port(const std::string &host, int socket_flags = 0); bool listen_after_bind(); @@ -1756,6 +1762,7 @@ class Server { size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH; time_t websocket_ping_interval_sec_ = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND; + int websocket_max_missed_pongs_ = CPPHTTPLIB_WEBSOCKET_MAX_MISSED_PONGS; private: using Handlers = @@ -1767,6 +1774,14 @@ class Server { static std::unique_ptr make_matcher(const std::string &pattern); + template + Server &add_handler( + std::vector, H>> &handlers, + const std::string &pattern, H handler) { + handlers.emplace_back(make_matcher(pattern), std::move(handler)); + return *this; + } + Server &set_error_handler_core(HandlerWithResponse handler, std::true_type); Server &set_error_handler_core(Handler handler, std::false_type); @@ -1928,15 +1943,6 @@ class Result { int ssl_error_ = 0; uint64_t ssl_backend_error_ = 0; #endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -public: - [[deprecated("Use ssl_backend_error() instead. " - "This function will be removed by v1.0.0.")]] - uint64_t ssl_openssl_error() const { - return ssl_backend_error_; - } -#endif }; struct ClientConnection { @@ -2409,22 +2415,6 @@ class ClientImpl { int last_ssl_error_ = 0; uint64_t last_backend_error_ = 0; #endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -public: - [[deprecated("Use load_ca_cert_store() instead. " - "This function will be removed by v1.0.0.")]] - void set_ca_cert_store(X509_STORE *ca_cert_store); - - [[deprecated("Use tls::create_ca_store() instead. " - "This function will be removed by v1.0.0.")]] - X509_STORE *create_ca_cert_store(const char *ca_cert, std::size_t size) const; - - [[deprecated("Use set_server_certificate_verifier(VerifyCallback) instead. " - "This function will be removed by v1.0.0.")]] - virtual void set_server_certificate_verifier( - std::function verifier); -#endif }; class Client { @@ -2599,7 +2589,6 @@ class Client { void set_follow_location(bool on); void set_path_encode(bool on); - void set_url_encode(bool on); void set_compress(bool on); @@ -2647,22 +2636,6 @@ class Client { private: bool is_ssl_ = false; #endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -public: - [[deprecated("Use tls_context() instead. " - "This function will be removed by v1.0.0.")]] - SSL_CTX *ssl_context() const; - - [[deprecated("Use set_session_verifier(session_t) instead. " - "This function will be removed by v1.0.0.")]] - void set_server_certificate_verifier( - std::function verifier); - - [[deprecated("Use Result::ssl_backend_error() instead. " - "This function will be removed by v1.0.0.")]] - long get_verify_result() const; -#endif }; #ifdef CPPHTTPLIB_SSL_ENABLED @@ -2708,29 +2681,6 @@ class SSLServer : public Server { std::mutex ctx_mutex_; int last_ssl_error_ = 0; - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -public: - [[deprecated("Use SSLServer(PemMemory) or " - "SSLServer(ContextSetupCallback) instead. " - "This constructor will be removed by v1.0.0.")]] - SSLServer(X509 *cert, EVP_PKEY *private_key, - X509_STORE *client_ca_cert_store = nullptr); - - [[deprecated("Use SSLServer(ContextSetupCallback) instead. " - "This constructor will be removed by v1.0.0.")]] - SSLServer( - const std::function &setup_ssl_ctx_callback); - - [[deprecated("Use tls_context() instead. " - "This function will be removed by v1.0.0.")]] - SSL_CTX *ssl_context() const; - - [[deprecated("Use update_certs_pem() instead. " - "This function will be removed by v1.0.0.")]] - void update_certs(X509 *cert, EVP_PKEY *private_key, - X509_STORE *client_ca_cert_store = nullptr); -#endif }; class SSLClient final : public ClientImpl { @@ -2794,6 +2744,9 @@ class SSLClient final : public ClientImpl { Response &res, bool &success, Error &error); bool initialize_ssl(Socket &socket, Error &error); + void init_ctx(); + void reset_ctx_on_error(); + bool load_certs(); tls::ctx_t ctx_ = nullptr; @@ -2811,42 +2764,6 @@ class SSLClient final : public ClientImpl { friend class ClientImpl; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT -public: - [[deprecated("Use SSLClient(host, port, PemMemory) instead. " - "This constructor will be removed by v1.0.0.")]] - explicit SSLClient(const std::string &host, int port, X509 *client_cert, - EVP_PKEY *client_key, - const std::string &private_key_password = std::string()); - - [[deprecated("Use Result::ssl_backend_error() instead. " - "This function will be removed by v1.0.0.")]] - long get_verify_result() const; - - [[deprecated("Use tls_context() instead. " - "This function will be removed by v1.0.0.")]] - SSL_CTX *ssl_context() const; - - // Override of a deprecated virtual in ClientImpl. Suppress C4996 / - // -Wdeprecated-declarations on the override declaration itself so that - // MSVC /sdl builds compile cleanly. Will be removed together with the - // base virtual by v1.0.0. -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) -#elif defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - [[deprecated("Use set_session_verifier(session_t) instead. " - "This function will be removed by v1.0.0.")]] - void set_server_certificate_verifier( - std::function verifier) override; -#if defined(_MSC_VER) -#pragma warning(pop) -#elif defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#endif - private: bool verify_host(X509 *server_cert) const; bool verify_host_with_subject_alt_name(X509 *server_cert) const; @@ -3818,17 +3735,21 @@ class WebSocket { WebSocket( Stream &strm, const Request &req, bool is_server, - time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND) + time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND, + int max_missed_pongs = CPPHTTPLIB_WEBSOCKET_MAX_MISSED_PONGS) : strm_(strm), req_(req), is_server_(is_server), - ping_interval_sec_(ping_interval_sec) { + ping_interval_sec_(ping_interval_sec), + max_missed_pongs_(max_missed_pongs) { start_heartbeat(); } WebSocket( std::unique_ptr &&owned_strm, const Request &req, bool is_server, - time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND) + time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND, + int max_missed_pongs = CPPHTTPLIB_WEBSOCKET_MAX_MISSED_PONGS) : strm_(*owned_strm), owned_strm_(std::move(owned_strm)), req_(req), - is_server_(is_server), ping_interval_sec_(ping_interval_sec) { + is_server_(is_server), ping_interval_sec_(ping_interval_sec), + max_missed_pongs_(max_missed_pongs) { start_heartbeat(); } @@ -3840,6 +3761,8 @@ class WebSocket { Request req_; bool is_server_; time_t ping_interval_sec_; + int max_missed_pongs_; + int unacked_pings_ = 0; std::atomic closed_{false}; std::mutex write_mutex_; std::thread ping_thread_; @@ -3869,6 +3792,7 @@ class WebSocketClient { void set_read_timeout(time_t sec, time_t usec = 0); void set_write_timeout(time_t sec, time_t usec = 0); void set_websocket_ping_interval(time_t sec); + void set_websocket_max_missed_pongs(int count); void set_tcp_nodelay(bool on); void set_address_family(int family); void set_ipv6_v6only(bool on); @@ -3900,6 +3824,7 @@ class WebSocketClient { time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND; time_t websocket_ping_interval_sec_ = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND; + int websocket_max_missed_pongs_ = CPPHTTPLIB_WEBSOCKET_MAX_MISSED_PONGS; int address_family_ = AF_UNSPEC; bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; From 52f1096f21b0e12e4532f34224991e06cf21a69a Mon Sep 17 00:00:00 2001 From: Zijun Yu Date: Tue, 21 Apr 2026 23:58:34 +0800 Subject: [PATCH 2/5] openvino: driver setup, CI split, thread safety, and NPU optimizations (#21944) * Thread safety per request only * Fix ROPE yarn case * Fix sticky stateful config * Use i4/i8 directly for symmetric quant * Use weightless caching * Add WeightlessCacheAttribute to reduce NPU memory usage * Gelu tanh support (#125) * Imrope support (#126) * fix(openvino): explicit ov::Tensor frees in ggml_backend_openvino_free * add GPU,NPU support in OV Dockerfile * add build-openvino.yml ci * Fix sticky stateful config * add concurrency to ov-gpu ci runs. Move OV CI to build-openvino.yml * fix thread-safety of shared runtime context * rope type abstraction for frontend translations * fix editorconfig --------- Co-authored-by: Mustafa Cavus Co-authored-by: Dan Hoffman Co-authored-by: Ravi Panchumarthy --- .devops/openvino.Dockerfile | 50 +- .github/workflows/build-openvino.yml | 120 +++++ .github/workflows/build-self-hosted.yml | 4 + .github/workflows/build.yml | 80 --- docs/backend/OPENVINO.md | 3 - ggml/src/ggml-openvino/ggml-decoder.cpp | 20 +- .../src/ggml-openvino/ggml-openvino-extra.cpp | 29 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 42 +- ggml/src/ggml-openvino/ggml-quants.cpp | 456 ++++++++++-------- ggml/src/ggml-openvino/openvino/op/rope.cpp | 40 +- .../ggml-openvino/openvino/op/unary_gelu.cpp | 25 + ggml/src/ggml-openvino/openvino/op_table.cpp | 1 + ggml/src/ggml-openvino/openvino/op_table.h | 1 + .../openvino/pass/eliminate_zp.cpp | 123 ----- .../openvino/pass/eliminate_zp.h | 17 - .../rt_info/weightless_caching_attributes.hpp | 41 ++ .../openvino/translate_session.cpp | 30 +- ggml/src/ggml-openvino/openvino/utils.cpp | 103 ++-- ggml/src/ggml-openvino/openvino/utils.h | 1 + ggml/src/ggml-openvino/utils.cpp | 145 ++++-- ggml/src/ggml-openvino/utils.h | 26 +- 21 files changed, 818 insertions(+), 539 deletions(-) create mode 100644 .github/workflows/build-openvino.yml create mode 100644 ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp delete mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp delete mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h create mode 100644 ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index 3ee4dd20180..31b58736d7e 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0 ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886 ARG UBUNTU_VERSION=24.04 -# Optional proxy build arguments - empty by default +# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases +ARG IGC_VERSION=v2.30.1 +ARG IGC_VERSION_FULL=2_2.30.1+20950 +ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1 +ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0 +ARG IGDGMM_VERSION=22.9.0 + +# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases +ARG NPU_DRIVER_VERSION=v1.32.0 +ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947 +ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2 + +# Optional proxy build arguments ARG http_proxy= ARG https_proxy= @@ -78,13 +90,47 @@ ARG http_proxy ARG https_proxy RUN apt-get update \ - && apt-get install -y libgomp1 libtbb12 curl \ + && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ && find /var/cache -type f -delete +# Install GPU drivers +ARG IGC_VERSION +ARG IGC_VERSION_FULL +ARG COMPUTE_RUNTIME_VERSION +ARG COMPUTE_RUNTIME_VERSION_FULL +ARG IGDGMM_VERSION +RUN mkdir /tmp/neo/ && cd /tmp/neo/ \ + && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \ + && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \ + && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \ + && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ + && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \ + && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ + && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \ + && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \ + && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ + && dpkg --install *.deb \ + && rm -rf /tmp/neo/ + +# Install NPU drivers +ARG NPU_DRIVER_VERSION +ARG NPU_DRIVER_FULL +ARG LIBZE1_VERSION +RUN mkdir /tmp/npu/ && cd /tmp/npu/ \ + && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \ + && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \ + && dpkg --install *.deb \ + && rm -rf /tmp/npu/ + +RUN cd /tmp \ + && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \ + && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \ + && rm libze1_${LIBZE1_VERSION}_amd64.deb + COPY --from=build /app/lib/ /app/ ### Full (all binaries) diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml new file mode 100644 index 00000000000..f7177f6be37 --- /dev/null +++ b/.github/workflows/build-openvino.yml @@ -0,0 +1,120 @@ +name: CI (openvino) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-openvino.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-openvino.yml', + 'ggml/src/ggml-openvino/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + ubuntu-24-openvino: + name: ubuntu-24-openvino-${{ matrix.openvino_device }} + + concurrency: + group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }} + cancel-in-progress: false + + strategy: + matrix: + include: + - variant: cpu + runner: '"ubuntu-24.04"' + openvino_device: "CPU" + - variant: gpu + runner: '["self-hosted","Linux","Intel","OpenVINO"]' + openvino_device: "GPU" + + runs-on: ${{ fromJSON(matrix.runner) }} + + env: + # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + OPENVINO_VERSION_MAJOR: "2026.0" + OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + if: runner.environment == 'github-hosted' + uses: ggml-org/ccache-action@v1.2.21 + with: + key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1 + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip + sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd + + - name: Use OpenVINO Toolkit Cache + if: runner.environment == 'github-hosted' + uses: actions/cache@v5 + id: cache-openvino + with: + path: ./openvino_toolkit + key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + + - name: Setup OpenVINO Toolkit + if: steps.cache-openvino.outputs.cache-hit != 'true' + uses: ./.github/actions/linux-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} + + - name: Install OpenVINO dependencies + run: | + cd ./openvino_toolkit + chmod +x ./install_dependencies/install_openvino_dependencies.sh + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh + + - name: Build + id: cmake_build + run: | + source ./openvino_toolkit/setupvars.sh + cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON + time cmake --build build/ReleaseOV --config Release -j $(nproc) + + - name: Test + id: cmake_test + # TODO: fix and re-enable the `test-llama-archs` test below + run: | + cd ${{ github.workspace }} + if [ "${{ matrix.openvino_device }}" = "GPU" ]; then + export GGML_OPENVINO_DEVICE=GPU + fi + ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000 diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index 52624a46d77..e9148dd7399 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -265,6 +265,10 @@ jobs: ggml-ci-intel-openvino-gpu-low-perf: runs-on: [self-hosted, Linux, Intel, OpenVINO] + concurrency: + group: openvino-gpu-${{ github.head_ref || github.ref }} + cancel-in-progress: false + env: # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile OPENVINO_VERSION_MAJOR: "2026.0" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 28c8665bd8b..c7f00e3592b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -656,86 +656,6 @@ jobs: -DGGML_SYCL_F16=ON time cmake --build build --config Release -j $(nproc) - ubuntu-24-openvino: - name: ubuntu-24-openvino-${{ matrix.openvino_device }} - strategy: - matrix: - include: - - variant: cpu - runner: '"ubuntu-24.04"' - openvino_device: "CPU" - - variant: gpu - runner: '["self-hosted","Linux","X64","Intel"]' - openvino_device: "GPU" - - runs-on: ${{ fromJSON(matrix.runner) }} - - env: - # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.0" - OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - if: runner.environment == 'github-hosted' - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1 - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip - sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd - - - name: Use OpenVINO Toolkit Cache - if: runner.environment == 'github-hosted' - uses: actions/cache@v5 - id: cache-openvino - with: - path: ./openvino_toolkit - key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} - - - name: Setup OpenVINO Toolkit - if: steps.cache-openvino.outputs.cache-hit != 'true' - uses: ./.github/actions/linux-setup-openvino - with: - path: ./openvino_toolkit - version_major: ${{ env.OPENVINO_VERSION_MAJOR }} - version_full: ${{ env.OPENVINO_VERSION_FULL }} - - - name: Install OpenVINO dependencies - run: | - cd ./openvino_toolkit - chmod +x ./install_dependencies/install_openvino_dependencies.sh - echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh - - - name: Build - id: cmake_build - run: | - source ./openvino_toolkit/setupvars.sh - cmake -B build/ReleaseOV -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENVINO=ON - time cmake --build build/ReleaseOV --config Release -j $(nproc) - - - name: Test - id: cmake_test - # TODO: fix and re-enable the `test-llama-archs` test below - run: | - cd ${{ github.workspace }} - if [ "${{ matrix.openvino_device }}" = "GPU" ]; then - export GGML_OPENVINO_DEVICE=GPU - fi - ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000 - windows-latest: runs-on: windows-2025 diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md index 96d0f672e30..c9c005a9981 100644 --- a/docs/backend/OPENVINO.md +++ b/docs/backend/OPENVINO.md @@ -244,7 +244,6 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" - `-fa 1` is required when running llama-bench with the OpenVINO backend. - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1` - `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled. -- For Intel GPU, NPU detection in containers, GPU, NPU user-space drivers/libraries must be present inside the image. We will include in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile) > [!NOTE] > The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved. @@ -274,8 +273,6 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p Run llama.cpp with OpenVINO backend Docker container. Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below. -> [!NOTE] -> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile). ```bash # Run Docker container diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0938d2273e9..5095e799849 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -207,8 +206,22 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { break; } case GGML_OP_ROPE: { + const int mode = node->op_params[2]; + switch (mode) { + case GGML_ROPE_TYPE_NEOX: { + op_case = 0x00010000; + break; + } + case GGML_ROPE_TYPE_IMROPE: { + op_case = 0x00020000; + break; + } + default: + op_case = 0x00000000; + break; + } if (node->src[0]->op == GGML_OP_VIEW) { - op_case = 2; + op_case = (op_case | 0x00000002); } break; } @@ -573,9 +586,6 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const } std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) { - static std::mutex weights_mutex; - std::lock_guard lock(weights_mutex); - std::map> model_weights; auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index cc3cb4583cd..4140136aca2 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include ov::Core & ov_singleton_core() { @@ -42,11 +43,13 @@ void ggml_openvino_device_config::init() { {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, }; - if (cache_dir) { + if (cache_dir && strlen(cache_dir) > 0) { compile_config["NPUW_CACHE_DIR"] = cache_dir; + compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); } - } else if (cache_dir) { - ov_singleton_core().set_property(ov::cache_dir(cache_dir)); + } else if (cache_dir && strlen(cache_dir) > 0) { + compile_config.insert(ov::cache_dir(cache_dir)); + compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); } // Initialize remote context with queue sharing for GPU @@ -259,10 +262,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); - // For symmetric quantization, we only need one zp value (not one per block) - // Zero points are stored in U4 or U8 format matching the weight type - size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks; - layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements; + // For symmetric quantization, no zp needed (weights stored as signed) + if (layout.is_symmetric) { + layout.zp_size = 0; + } else { + layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks; + } layout.weights_offset = 0; layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; @@ -313,10 +318,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten // Scales: F16 per block int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - // Zero points: U4 or U8 matching weight type - // For symmetric quantization, we only need one zp value (not one per block) - size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks; - layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements; + // For symmetric quantization, no zp needed (weights stored as signed) + if (layout.is_symmetric) { + layout.zp_size = 0; + } else { + layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks; + } // Layout in buffer: [weights | scales | zp] with alignment layout.weights_offset = 0; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 0c8d3508e87..4f3ebf2536b 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -145,13 +145,18 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer return ctx->data; } +static bool is_stateful_enabled() { + static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION"); + return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0; +} + static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache) if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" && - !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) { + !is_stateful_enabled()) { GGML_ASSERT(ctx->tensor_extras.empty()); auto device = ctx->device; auto size = ctx->size; @@ -600,6 +605,14 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) { static void ggml_backend_openvino_free(ggml_backend_t backend) { ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; + + if (ctx->runtime_context) { + auto r_ctx = std::static_pointer_cast(ctx->runtime_context); + if (--r_ctx->backend_count == 0) { + r_ctx->clear_caches(); + } + } + delete ctx; delete backend; } @@ -644,7 +657,12 @@ static ggml_guid_t ggml_backend_openvino_guid(void) { } static std::shared_ptr get_ov_runtime_context_ptr() { - static std::shared_ptr r_ctx = std::make_shared(); + static std::shared_ptr r_ctx = [] { + auto ctx = std::make_shared(); + ctx->device = ggml_openvino_get_device_name(); + ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu(); + return ctx; + }(); return r_ctx; } @@ -669,8 +687,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { } std::shared_ptr r_ctx = std::static_pointer_cast(ctx->runtime_context); - r_ctx->device = ggml_openvino_get_device_name(); - r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu(); + r_ctx->backend_count++; ggml_backend_t openvino_backend = new ggml_backend{ /* .guid = */ ggml_backend_openvino_guid(), @@ -883,7 +900,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { const int32_t * op_params = op->op_params; const int n_dims = op_params[1]; const int mode = op_params[2]; - if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) { + if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) { // GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); return true; } @@ -896,14 +913,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); return true; } - float freq_scale; - float ext_factor; - memcpy(&freq_scale, op_params + 6, sizeof(float)); - memcpy(&ext_factor, op_params + 7, sizeof(float)); - if (ext_factor != 0.0f) { - // GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); - return true; - } if (op->src[0]->op == GGML_OP_VIEW) { if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { // GGML_LOG_WARN( @@ -913,6 +922,12 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { return true; } } + if (mode == GGML_ROPE_TYPE_IMROPE && + (op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 || + ((const float *) op_params)[8] != 1)) { + // GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n"); + return true; + } break; } default: @@ -942,6 +957,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; static const std::set supported_unary_ops{ + GGML_UNARY_OP_GELU, GGML_UNARY_OP_SILU, }; static const std::set supported_glu_ops{ diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index dbf38646ddd..57d66df4f01 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -46,6 +46,7 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) { // Extracts (weight, scales, zp) from Q4_0 tensors. // Data layout is: |16 bit scale|32 x 4bit weights|. +// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8). void extract_q4_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, @@ -55,28 +56,32 @@ void extract_q4_0_data(const ggml_tensor * tensor, auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - // For Q4_0, zero point is always 8 - if (is_scalar_zp) { - zp[0] = 8 | (8 << 4); // Pack two 4-bit values - } + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path - ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); - // For asymmetric quantization, compute per-block zero points - if (!is_scalar_zp) { + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); // Pack two 4-bit zero points per byte if (i % 2 == 0) { zp[i / 2] = 8; // Lower nibble } else { zp[i / 2] |= (8 << 4); // Upper nibble } - } - unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); - }); + unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); + }); + } else { + // Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble) + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); + unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); + // Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8. + for (int j = 0; j < 16; ++j) { + weights[i * 16 + j] ^= 0x88; + } + }); + } } // Extracts (weight, scales, zp) from Q4_1 tensors. @@ -123,6 +128,7 @@ void extract_q4_1_data(const ggml_tensor * tensor, // Extracts (weight, scales, zp) from Q8_0 tensors. // Data layout is: |16 bit scale|32 x 8bit weights|. +// When zp_arr is empty (symmetric), weights are stored as signed i8 directly. void extract_q8_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, @@ -133,29 +139,30 @@ void extract_q8_0_data(const ggml_tensor * tensor, auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - // For Q8_0, zero point is always 128 - if (is_scalar_zp) { - zp[0] = 128; - } + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path - ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - uint8_t * block_data = data + i * bytes_per_block; - scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); zp[i] = 128; - } - for (size_t j = 0; j < weights_per_block; ++j) { - uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. - // Original data is in int8_t, so we add a bias of -128 and invert the first bit. - x ^= 1 << 7; - weights[i * weights_per_block + j] = x; - } - }); + for (size_t j = 0; j < weights_per_block; ++j) { + uint8_t x = block_data[j + 2]; + x ^= 1 << 7; // Convert int8 to uint8 by flipping sign bit + weights[i * weights_per_block + j] = x; + } + }); + } else { + // Symmetric: store original int8 values directly (no unsigned bias) + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); + // Copy int8 weights as-is (the tensor element type is i8) + memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block); + }); + } } void unpack_256_4(const uint8_t * data, uint8_t * dst) { @@ -256,44 +263,62 @@ void extract_q6_k_data(const ggml_tensor * tensor, auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - - // For Q6_K, zero point is always 32 - if (is_scalar_zp) { - zp[0] = 32; - } - - ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t * block_data = data + i * bytes_per_block; - float scale_factor = - static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2 + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path - for (size_t j = 0; j < 16; j++) { - scales[j + i * 16] = - ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + float scale_factor = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); + for (size_t j = 0; j < 16; j++) { + scales[j + i * 16] = + ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); zp[j + i * 16] = 32; } - } - - uint8_t * ql = block_data; - uint8_t * qh = block_data + 128; - - for (int64_t j = 0; j < 32; ++j) { - weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); - weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); - weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4); - weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4); - weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4); - weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4); - weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); - weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); - } - }); + uint8_t * ql = block_data; + uint8_t * qh = block_data + 128; + for (int64_t j = 0; j < 32; ++j) { + weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); + weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); + weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4); + weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4); + weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4); + weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4); + weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); + weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); + } + }); + } else { + // Symmetric: subtract 32 from each weight to store as signed i8 + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t * block_data = data + i * bytes_per_block; + float scale_factor = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); + for (size_t j = 0; j < 16; j++) { + scales[j + i * 16] = + ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); + } + uint8_t * ql = block_data; + uint8_t * qh = block_data + 128; + auto * signed_weights = reinterpret_cast(weights); + for (int64_t j = 0; j < 32; ++j) { + signed_weights[i * 256 + j] = static_cast((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 32] = + static_cast((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 64] = static_cast((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 96] = + static_cast((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 128] = + static_cast((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 160] = + static_cast((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 192] = + static_cast((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32; + signed_weights[i * 256 + j + 224] = + static_cast((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32; + } + }); + } } static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { @@ -389,11 +414,10 @@ ov::Output make_int8_weights(ov::Tensor & weight, size_t group_size, bool use_bias) { ov::Shape orig_shape = weight.get_shape(); + bool is_signed = (weight.get_element_type() == ov::element::i8); // Symmetric: signed weights, no ZP // Expand dimensions for scales and zp/bias auto scale_shape = scales.get_shape(); - auto zp_shape = zp.get_shape(); - bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size}; @@ -403,37 +427,48 @@ ov::Output make_int8_weights(ov::Tensor & weight, } else { scale_shape.push_back(1); scales.set_shape(scale_shape); - // For symmetric quantization, zp remains scalar (don't resize) - if (!is_scalar_zp) { + if (!is_signed && zp.get_size() > 0) { + auto zp_shape = zp.get_shape(); zp_shape.push_back(1); zp.set_shape(zp_shape); } } - // Create graph nodes - auto weights_node = std::make_shared(ov::element::u8, packed_shape, - static_cast(weight.data()), nullptr); - weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); - auto weights_f16 = std::make_shared(weights_node, ov::element::f16); ov::Output result; - if (use_bias && !is_scalar_zp) { - // Bias path: w * s + b (zp tensor holds f16 bias values) - auto bias_f16 = std::make_shared(zp); - auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + if (is_signed) { + // Signed path: q * s (no zero point subtraction needed) + auto weights_node = std::make_shared(ov::element::i8, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + result = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); } else { - // Zero point path: (w - zp) * s - auto zero_point = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_point, zp_value)) { - zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + // Unsigned path + auto weights_node = std::make_shared(ov::element::u8, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + + if (use_bias && zp.get_size() > 0) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = + std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_point = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_point, zp_value)) { + zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + } + auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } - auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); - auto w_zp = - std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } if (packed_shape.size() != 2) { @@ -452,11 +487,10 @@ ov::Output make_int4_weights(ov::Tensor & weight, size_t group_size, bool use_bias) { ov::Shape orig_weight_shape = weight.get_shape(); + bool is_signed = (weight.get_element_type() == ov::element::i4); // Symmetric: signed weights, no ZP // Expand dimensions for scales and zp/bias ov::Shape scale_shape = scales.get_shape(); - auto zp_shape = zp.get_shape(); - bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization // Create INT4 weight tensor ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size}; @@ -467,36 +501,48 @@ ov::Output make_int4_weights(ov::Tensor & weight, } else { scale_shape.push_back(1); scales.set_shape(scale_shape); - // For symmetric quantization, zp remains scalar (don't resize) - if (!is_scalar_zp) { + if (!is_signed && zp.get_size() > 0) { + auto zp_shape = zp.get_shape(); zp_shape.push_back(1); zp.set_shape(zp_shape); } } - auto weights_node = std::make_shared(ov::element::u4, packed_shape, - static_cast(weight.data()), nullptr); - weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; - auto weights_f16 = std::make_shared(weights_node, ov::element::f16); auto scales_f16 = std::make_shared(scales); ov::Output result; - if (use_bias && !is_scalar_zp) { - // Bias path: w * s + b (zp tensor holds f16 bias values) - auto bias_f16 = std::make_shared(zp); - auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + if (is_signed) { + // Signed path: q * s (no zero point subtraction needed) + auto weights_node = std::make_shared(ov::element::i4, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + result = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); } else { - // Zero point path: (w - zp) * s - auto zero_points_node = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_points_node, zp_value)) { - zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + // Unsigned path + auto weights_node = std::make_shared(ov::element::u4, packed_shape, + static_cast(weight.data()), nullptr); + weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; + auto weights_f16 = std::make_shared(weights_node, ov::element::f16); + + if (use_bias && zp.get_size() > 0) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = + std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_points_node = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_points_node, zp_value)) { + zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + } + auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } - auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); - auto w_zp = - std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); - result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); } if (packed_shape.size() != 2) { @@ -699,24 +745,32 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo // Quantized path (normal extraction or quantized requant) // Create weight/scale/zp tensors - shared between both paths - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + // For symmetric quantization, use signed types (i4/i8) and no ZP tensor + ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) : + (layout.is_u4 ? ov::element::u4 : ov::element::u8); ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; if (output_base_ptr) { uint8_t * buf_base = static_cast(output_base_ptr); result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset); + if (!layout.is_symmetric) { + ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset); + } + // else: result.zp remains default-constructed (empty) for symmetric } else { result.weights = ov::Tensor(weight_type, node_shape); result.scales = ov::Tensor(ov::element::f16, scale_shape); - if (use_bias && !layout.is_symmetric) { - // bias only has effect for asymmetric quant - result.zp = ov::Tensor(ov::element::f16, zp_shape); - } else { - result.zp = ov::Tensor(weight_type, zp_shape); + if (!layout.is_symmetric) { + if (use_bias) { + result.zp = ov::Tensor(ov::element::f16, scale_shape); + } else { + ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + result.zp = ov::Tensor(zp_type, scale_shape); + } } + // else: result.zp remains default-constructed (empty) for symmetric } if (layout.is_requant && layout.requant_type.has_value()) { @@ -741,59 +795,75 @@ void quantize_q4_0(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - - // For Q4_0, zero point is always 8 - if (is_scalar_zp) { - zp[0] = 8 | (8 << 4); // Pack two 4-bit values - } + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i * qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + float max = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } } - } - - const float d = max / -8; - - if (d == 0) { - scales[i] = ov::float16(1.0f); - // zp is already set to 8 for symmetric, or set per-block for asymmetric - if (!is_scalar_zp) { + const float d = max / -8; + if (d == 0) { + scales[i] = ov::float16(1.0f); if (i % 2 == 0) { zp[i / 2] = 8; } else { zp[i / 2] |= (8 << 4); } + memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2); + continue; } - memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2); - continue; - } - - const float id = 1.0f / d; - scales[i] = ov::float16(d); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + const float id = 1.0f / d; + scales[i] = ov::float16(d); if (i % 2 == 0) { zp[i / 2] = 8; } else { zp[i / 2] |= (8 << 4); } + for (int j = 0; j < qk / 2; ++j) { + const float x0 = x[i * qk + 2 * j] * id; + const float x1 = x[i * qk + 2 * j + 1] * id; + const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f)); + weights[i * qk / 2 + j] = xi0 | (xi1 << 4); + } } - - for (int j = 0; j < qk / 2; ++j) { - const float x0 = x[i * qk + 2 * j] * id; - const float x1 = x[i * qk + 2 * j + 1] * id; - const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f)); - weights[i * qk / 2 + j] = xi0 | (xi1 << 4); + } else { + // Symmetric: produce signed i4 values in [-8, 7] + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + float max = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + const float d = max / -8; + if (d == 0) { + scales[i] = ov::float16(1.0f); + // i4 value 0 packed: 0x00 + memset(weights + i * qk / 2, 0, qk / 2); + continue; + } + const float id = 1.0f / d; + scales[i] = ov::float16(d); + for (int j = 0; j < qk / 2; ++j) { + const float x0 = x[i * qk + 2 * j] * id; + const float x1 = x[i * qk + 2 * j + 1] * id; + // Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement. + int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0))); + int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1))); + weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4); + } } } } @@ -809,36 +879,42 @@ void quantize_q8_0(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization - - // For Q8_0, zero point is always 128 - if (is_scalar_zp) { - zp[0] = 128; - } - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max + bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path - for (int j = 0; j < qk; j++) { - const float v = x[i * qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); + if (!is_symmetric) { + auto * zp = static_cast(zp_arr.data()); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + amax = std::max(amax, fabsf(v)); } - } - - const float d = amax / 127.0f; - const float id = d ? 1.0f / d : 0.0f; - scales[i] = ov::float16(d); - // For asymmetric quantization, store per-block zero points - if (!is_scalar_zp) { + const float d = amax / 127.0f; + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); zp[i] = 128; + for (int j = 0; j < qk; ++j) { + const float x0 = x[i * qk + j] * id; + const int8_t xi0 = roundf(x0); + weights[i * qk + j] = (uint8_t) (xi0 + 128); + } } - - for (int j = 0; j < qk; ++j) { - const float x0 = x[i * qk + j] * id; - const int8_t xi0 = roundf(x0); - weights[i * qk + j] = (uint8_t) (xi0 + 128); + } else { + // Symmetric: store signed int8 values directly + auto * signed_weights = reinterpret_cast(weights); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + amax = std::max(amax, fabsf(v)); + } + const float d = amax / 127.0f; + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); + for (int j = 0; j < qk; ++j) { + const float x0 = x[i * qk + j] * id; + signed_weights[i * qk + j] = (int8_t) roundf(x0); + } } } } @@ -861,12 +937,8 @@ void quantize_q8_1(const float * x, for (int j = 0; j < qk; j++) { const float v = x[i * qk + j]; - if (v < min) { - min = v; - } - if (v > max) { - max = v; - } + min = std::min(v, min); + max = std::max(v, max); } const float d = (max - min) / ((1 << 8) - 1); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 26dc2d24f82..a8db9b38930 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -9,12 +9,17 @@ #include #include #include +#include +#include +#include #include #include #include +#include #include #include #include +#include #include #include @@ -33,6 +38,12 @@ OutputVector translate_rope(const NodeContext & context) { auto data_node = context.get_input(0).get_node_shared_ptr(); auto output_shape = context.get_output_shape().to_shape(); int32_t * op_params = context.get_output_op_params(); + const int mode = (op_case & 0xFFFF0000) >> 16; + op_case = (op_case & 0x0000FFFF); + + constexpr int TYPE_NORMAL = 0; + constexpr int TYPE_NEOX = 1; + constexpr int TYPE_IMROPE = 2; Output cos_theta_node; Output sin_theta_node; @@ -45,7 +56,7 @@ OutputVector translate_rope(const NodeContext & context) { if (context.get_input_size() == 3) { rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); } - auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight); + auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE); sin_theta_node = sin_cos.first; cos_theta_node = sin_cos.second; } @@ -65,11 +76,7 @@ OutputVector translate_rope(const NodeContext & context) { } } - const int mode = op_params[2]; - constexpr int ROPE_TYPE_NORMAL = 0; - constexpr int ROPE_TYPE_NEOX = 2; - - if (mode == ROPE_TYPE_NORMAL) { + if (mode == TYPE_NORMAL) { auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); @@ -97,7 +104,7 @@ OutputVector translate_rope(const NodeContext & context) { auto data_shape = ov::op::v0::Constant::create( ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); res = std::make_shared(stack, data_shape, false); - } else if (mode == ROPE_TYPE_NEOX) { + } else if (mode == TYPE_NEOX) { auto data_split = std::make_shared( data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2); Output slice_data_node_0 = data_split->outputs()[0]; @@ -112,6 +119,25 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_1, cos_theta_node)); res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); + } else if (mode == TYPE_IMROPE) { + int64_t n_dims = data_node->get_shape()[3]; + auto cos_sin_shape = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1,-1,1,(n_dims >> 1)}); + auto cos_reshaped = std::make_shared(cos_theta_node, cos_sin_shape, true); + auto sin_reshaped = std::make_shared(sin_theta_node, cos_sin_shape, true); + + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}); + auto split_a = std::make_shared(data_node, split_axis, 2); + auto x0 = split_a->output(0); + auto x1 = split_a->output(1); + auto mul_a = std::make_shared(x0, cos_reshaped); + auto mul_b = std::make_shared(x1, sin_reshaped); + auto sub = std::make_shared(mul_a, mul_b); + + auto mul_c = std::make_shared(x0, sin_reshaped); + auto mul_d = std::make_shared(x1, cos_reshaped); + auto add = std::make_shared(mul_c, mul_d); + + res = std::make_shared(ov::OutputVector{sub, add}, 3); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp new file mode 100644 index 00000000000..d1e9efc33a5 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp @@ -0,0 +1,25 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary_gelu(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = context.get_input(0); + auto res = std::make_shared(input); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index beadafe8103..1385539279c 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -31,6 +31,7 @@ std::unordered_map get_supported_ops() { {"GGML_OP_SOFT_MAX", op::translate_soft_max }, {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, {"GGML_OP_TRANSPOSE", op::translate_transpose }, + {"GGML_UNARY_OP_GELU", op::translate_unary_gelu }, {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, diff --git a/ggml/src/ggml-openvino/openvino/op_table.h b/ggml/src/ggml-openvino/openvino/op_table.h index 37f763117aa..f546796d2ee 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.h +++ b/ggml/src/ggml-openvino/openvino/op_table.h @@ -21,6 +21,7 @@ GGML_OP_CONVERTER(translate_rms_norm); GGML_OP_CONVERTER(translate_rope); GGML_OP_CONVERTER(translate_scale); GGML_OP_CONVERTER(translate_unary_silu); +GGML_OP_CONVERTER(translate_unary_gelu); GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp deleted file mode 100644 index ed2a3ab6d1b..00000000000 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "eliminate_zp.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ov { -namespace frontend { -namespace ggml { -namespace pass { - -EliminateZeroPoints::EliminateZeroPoints() { - // Find pattern: - // (Multiply Any(scale) - // (Subtract (Convert Constant(data))) - // (Convert Constant(zero_point))) - // where zero_point is a scalar - // If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val - // If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant - - auto m_data_constant = ov::pass::pattern::wrap_type(); - auto m_data_convert = ov::pass::pattern::wrap_type({m_data_constant}); - - auto m_zp_constant = ov::pass::pattern::wrap_type(); - auto m_zp_convert = ov::pass::pattern::wrap_type({m_zp_constant}); - - auto m_subtract = ov::pass::pattern::wrap_type({m_data_convert, m_zp_convert}); - auto m_scale = ov::pass::pattern::any_input(); - auto m_multiply = ov::pass::pattern::wrap_type({m_scale, m_subtract}); - - const auto callback = [=](ov::pass::pattern::Matcher & m) { - const auto & pattern_map = m.get_pattern_value_map(); - - auto multiply_node = - std::dynamic_pointer_cast(pattern_map.at(m_multiply).get_node_shared_ptr()); - auto subtract_node = - std::dynamic_pointer_cast(pattern_map.at(m_subtract).get_node_shared_ptr()); - auto data_constant = - std::dynamic_pointer_cast(pattern_map.at(m_data_constant).get_node_shared_ptr()); - auto zp_constant = - std::dynamic_pointer_cast(pattern_map.at(m_zp_constant).get_node_shared_ptr()); - - if (!multiply_node || !subtract_node || !data_constant || !zp_constant) { - return false; - } - - if (ov::shape_size(zp_constant->get_shape()) != 1) { - return false; - } - - auto data_type = data_constant->get_element_type(); - auto zp_data = zp_constant->cast_vector(); - - if (zp_data.empty()) { - return false; - } - - int zp_value = zp_data[0]; - - bool should_eliminate = false; - ov::element::Type target_type; - - if (data_type == ov::element::u4 && zp_value == 8) { - should_eliminate = true; - target_type = ov::element::i4; - } else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) { - should_eliminate = true; - target_type = ov::element::i8; - } - - if (!should_eliminate) { - return false; - } - - auto data_shape = data_constant->get_shape(); - size_t total_elements = ov::shape_size(data_shape); - - std::shared_ptr new_constant; - - // TODO improve performance - if (data_type == ov::element::u4) { - auto data_values = data_constant->cast_vector(); - std::vector adjusted_values(total_elements); - - ov::parallel_for(total_elements, [&](size_t i) { - adjusted_values[i] = static_cast(static_cast(data_values[i]) - 8); - }); - - new_constant = std::make_shared(target_type, data_shape, adjusted_values); - } else if (data_type == ov::element::u8) { - auto data_values = data_constant->cast_vector(); - std::vector adjusted_values(total_elements); - - ov::parallel_for(total_elements, [&, zp_value](size_t i) { - adjusted_values[i] = static_cast(static_cast(data_values[i]) - zp_value); - }); - - new_constant = std::make_shared(target_type, data_shape, adjusted_values); - } - - auto new_convert = - std::make_shared(new_constant, subtract_node->get_output_element_type(0)); - ov::replace_node(subtract_node, new_convert); - - return true; - }; - - register_matcher( - std::make_shared(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"), - callback); -} - -} // namespace pass -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h deleted file mode 100644 index edd3cd718d9..00000000000 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +++ /dev/null @@ -1,17 +0,0 @@ -#include "openvino/pass/matcher_pass.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace pass { - -class EliminateZeroPoints : public ov::pass::MatcherPass { -public: - OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints") - EliminateZeroPoints(); -}; - -} // namespace pass -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp b/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp new file mode 100644 index 00000000000..f051891c481 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +namespace ov { + +/** + * @brief Holds weightless caching attributes of a single constant. + * + * WeightlessCacheAttribute class represents runtime info attribute that holds + * the values of original size of the constant in bytes and the binary offset of the + * constant's data in the weights file used by the weightless caching mechanism. It's + * not copyable in case the data was changed (the original node was replaced by a new + * one produced during the tranformation pipeline) - in that case weightless caching + * can't be used for that constant. + */ +class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute { +public: + OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute) + + WeightlessCacheAttribute() = delete; + + WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype) + : original_size(original_size), + bin_offset(bin_offset), + original_dtype(original_dtype) {} + + bool is_copyable() const override; + + size_t original_size; + size_t bin_offset; + ov::element::Type original_dtype; +}; + +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 23a1dea2496..0f68a1f5062 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -3,15 +3,16 @@ #include "ggml-openvino/openvino/node_context.h" #include "ggml-openvino/openvino/utils.h" #include "input_model.h" -#include "pass/eliminate_zp.h" #include "pass/mark_decompression_convert_constant_folding.h" #include "pass/squeeze_matmul.h" +#include "rt_info/weightless_caching_attributes.hpp" #include #include #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include #include #include -#include namespace ov { namespace frontend { @@ -240,6 +240,31 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo resulting_model = std::make_shared(results, used_params); apply_transformations(resulting_model); + + // Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies + // in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor + // (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export + // occurs", doubling memory usage per compile_model call. + // + // The bin_offset field serves as a unique key (not a real file offset) — this is + // the same convention the GPU plugin uses for non-IR models (see + // Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp). + // Each constant must have a distinct bin_offset, otherwise GPU's weightless cache + // import will map multiple constants to the same data. + // + // Small constants (< 16 elements) are excluded since they may be introduced by + // optimization patterns and the overhead is negligible. + size_t offset = 0; + for (auto & node : resulting_model->get_ordered_ops()) { + if (auto cnst = ov::as_type_ptr(node); + cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) { + auto & rt_info = cnst->get_rt_info(); + if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) { + rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] = + ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type()); + } + } + } return resulting_model; } @@ -257,7 +282,6 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptris_static()) { - manager.register_pass(); manager.register_pass(); } manager.run_passes(model); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 65356a51b51..0baaf88e17a 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -2,6 +2,7 @@ #include "ggml-impl.h" +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -87,8 +89,11 @@ ov::Output rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl auto ramp_y = std::make_shared(std::make_shared(dim_ids, corr_low), denom); auto ramp_clamped = std::make_shared(ramp_y, 0.0f, 1.0f); + // rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling + auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + auto ramp_inverted = std::make_shared(one, ramp_clamped); auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor}); - auto ramp_mix = std::make_shared(ramp_clamped, ext_factor_node); + auto ramp_mix = std::make_shared(ramp_inverted, ext_factor_node); return ramp_mix; } @@ -115,6 +120,7 @@ void ggml_rope_yarn_corr_dims(int n_dims, std::pair, ov::Output> make_sin_cos(int32_t * rope_params, std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight, + bool imrope, bool stateful) { if (stateful) { inp_pos = std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); @@ -122,6 +128,13 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params auto pos_perm = std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); inp_pos = std::make_shared(inp_pos, pos_perm); + } else if (imrope) { + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1}); + inp_pos = std::make_shared(inp_pos, pos_shape, true); + auto pos_transpose_shape = + std::make_shared(ov::element::i64, ov::Shape{5}, std::vector{0, 1, 2, 4, 3}); + inp_pos = std::make_shared(inp_pos, pos_transpose_shape); } else { inp_pos = std::make_shared(inp_pos, ov::element::f32); auto pos_perm = @@ -136,6 +149,7 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params float beta_fast; float beta_slow; const int n_dims = rope_params[1]; + const size_t n_dims_half = n_dims >> 1; const int n_ctx_orig = rope_params[4]; memcpy(&freq_base, rope_params + 5, sizeof(float)); memcpy(&freq_scale, rope_params + 6, sizeof(float)); @@ -146,57 +160,74 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params const float theta_scale = powf(freq_base, -2.0f / n_dims); - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - std::vector factor(n_dims / 2); - factor[0] = 1.0f; - for (size_t i = 1; i < factor.size(); i++) { - factor[i] = theta_scale * factor[i - 1]; - } + std::vector factor(n_dims_half); Output freq_factors; - if (stateful) { - freq_factors = - std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); - } else { - freq_factors = - std::make_shared(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor); - } - if (rope_freqs_weight) { - freq_factors = std::make_shared(freq_factors, rope_freqs_weight); - } - - auto theta_extrap = std::make_shared(freq_factors, inp_pos); - auto theta_interp = std::make_shared( - theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale})); Output theta; float mscale = attn_factor; - if (ext_factor == 0.0f) { - theta = theta_interp; + if (imrope) { + std::vector gather_indices(n_dims_half); + for (size_t j = 0; j < n_dims_half; j++) { + gather_indices[j] = j % 3; + factor[j] = std::pow(theta_scale, j); + } + auto gather_indices_const = + std::make_shared(ov::element::i64, ov::Shape{n_dims_half}, gather_indices); + auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4}); + inp_pos = std::make_shared(inp_pos, gather_indices_const, gather_axis); + auto factor_const = std::make_shared(ov::element::f32, ov::Shape{n_dims_half}, factor); + theta = std::make_shared(inp_pos, factor_const); } else { - auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); - Output one; + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + factor[0] = 1.0f; + for (size_t i = 1; i < factor.size(); i++) { + factor[i] = theta_scale * factor[i - 1]; + } if (stateful) { - one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); } else { - one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor); + } + if (rope_freqs_weight) { + freq_factors = std::make_shared(freq_factors, rope_freqs_weight); } - auto one_minus_ramp = std::make_shared(one, ramp_mix); - theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), - std::make_shared(theta_extrap, ramp_mix)); - mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); + auto theta_extrap = std::make_shared(freq_factors, inp_pos); + auto theta_interp = std::make_shared( + theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale})); + + if (ext_factor == 0.0f) { + theta = theta_interp; + } else { + auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); + Output one; + if (stateful) { + one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + } else { + one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + } + auto one_minus_ramp = std::make_shared(one, ramp_mix); + + theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), + std::make_shared(theta_extrap, ramp_mix)); + mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); + } } Output cos_theta = std::make_shared(theta); Output sin_theta = std::make_shared(theta); - auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale}); + if (!imrope) { + auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale}); + + cos_theta = std::make_shared(cos_theta, mscale_node); + sin_theta = std::make_shared(sin_theta, mscale_node); + } - cos_theta = std::make_shared(cos_theta, mscale_node); - sin_theta = std::make_shared(sin_theta, mscale_node); return std::make_pair(sin_theta, cos_theta); } diff --git a/ggml/src/ggml-openvino/openvino/utils.h b/ggml/src/ggml-openvino/openvino/utils.h index 88dcad4c906..767dd4c53ea 100644 --- a/ggml/src/ggml-openvino/openvino/utils.h +++ b/ggml/src/ggml-openvino/openvino/utils.h @@ -67,6 +67,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: std::pair, ov::Output> make_sin_cos(int32_t* rope_params, std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight = nullptr, + bool imrope = false, bool stateful = false); ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 1b553a0de00..998ef7c9eb4 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr ggml_decoder, enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr r_ctx) { auto & core = ov_singleton_core(); const auto & config = ggml_openvino_get_compile_config(); - auto device = r_ctx->device; - bool stateful = r_ctx->stateful; + const auto & device = r_ctx->device; + const auto & stateful = r_ctx->stateful; static auto is_static = false; if (is_naive(cgraph)) { @@ -106,14 +106,26 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< int64_t infer_end_time; { - std::lock_guard lock(r_ctx->ov_compute_mutex); + std::shared_ptr entry; + ModelParams old_m_params; - auto it = r_ctx->decoder_cache.find(key); + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + auto it = r_ctx->decoder_cache.find(key); + cache_hit = it != r_ctx->decoder_cache.end(); + if (cache_hit) { + entry = it->second; + } else { + auto mutex = std::make_shared(); + entry = std::make_shared(mutex); + r_ctx->decoder_cache[key] = entry; + } + } + + std::lock_guard lock(*(entry->mutex)); - cache_hit = it != r_ctx->decoder_cache.end(); - ModelParams old_m_params; if (cache_hit) { - ggml_decoder = it->second; + ggml_decoder = entry->ptr; old_m_params = ggml_decoder->get_model_params(); cache_hit = old_m_params.can_reuse_dynamically(m_params); } @@ -126,7 +138,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< ggml_decoder->update_io(cgraph); } ggml_decoder->add_extra_inputs(); - infer_request = r_ctx->infer_request_cache.at(key); + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + infer_request = r_ctx->infer_request_cache.at(key); + } if (stateful) { const auto * inp_pos = get_inp_pos_tensor(cgraph); @@ -170,7 +185,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - r_ctx->infer_request_cache.erase(key); + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + r_ctx->infer_request_cache.erase(key); + } std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); @@ -199,8 +217,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< } compile_end_time = ggml_time_us(); infer_request = std::make_shared(compiled_model.create_infer_request()); - r_ctx->infer_request_cache[key] = infer_request; - r_ctx->decoder_cache[key] = ggml_decoder; + entry->ptr = ggml_decoder; std::vector ov_input_names; std::vector ov_output_names; @@ -210,8 +227,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); - r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + r_ctx->infer_request_cache[key] = infer_request; + r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); + r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + } if (stateful) { const auto * inp_pos = get_inp_pos_tensor(cgraph); @@ -224,8 +246,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< } } - auto ov_input_names = r_ctx->ov_input_names_cache[key]; - auto ov_output_names = r_ctx->ov_output_names_cache[key]; + std::vector ov_input_names; + std::vector ov_output_names; + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + ov_input_names = r_ctx->ov_input_names_cache[key]; + ov_output_names = r_ctx->ov_output_names_cache[key]; + } for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; @@ -306,12 +333,26 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrdecoder_cache.find(key); - - cache_hit = it != r_ctx->decoder_cache.end(); + std::shared_ptr entry; ModelParams old_m_params; + + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + auto it = r_ctx->decoder_cache.find(key); + cache_hit = it != r_ctx->decoder_cache.end(); + if (cache_hit) { + entry = it->second; + } else { + auto mutex = std::make_shared(); + entry = std::make_shared(mutex); + r_ctx->decoder_cache[key] = entry; + } + } + + std::lock_guard lock(*(entry->mutex)); + if (cache_hit) { - ggml_decoder = it->second; + ggml_decoder = entry->ptr; old_m_params = ggml_decoder->get_model_params(); cache_hit = old_m_params.can_reuse_statically(m_params); } @@ -325,14 +366,21 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrupdate_io(cgraph); } ggml_decoder->add_extra_inputs(); - infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key); + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + infer_request = + is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key); + } decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - r_ctx->infer_request_cache.erase(key); - r_ctx->infer_request_cache_prefill.erase(key); + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + r_ctx->infer_request_cache.erase(key); + r_ctx->infer_request_cache_prefill.erase(key); + } std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); @@ -372,16 +420,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrinfer_request_cache_prefill[key] = - std::make_shared(compiled_model_prefill.create_infer_request()); - r_ctx->infer_request_cache[key] = - std::make_shared(compiled_model_decode.create_infer_request()); + auto infer_request_prefill = std::make_shared(compiled_model_prefill.create_infer_request()); + auto infer_request_decode = std::make_shared(compiled_model_decode.create_infer_request()); compile_end_time = ggml_time_us(); model = is_prefill ? model_prefill : model_decode; ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; - infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key]; - r_ctx->decoder_cache[key] = ggml_decoder; + infer_request = is_prefill ? infer_request_prefill : infer_request_decode; + entry->ptr = ggml_decoder; std::vector ov_input_names; std::vector ov_output_names; @@ -391,18 +437,29 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrget_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); - r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + r_ctx->infer_request_cache_prefill[key] = infer_request_prefill; + r_ctx->infer_request_cache[key] = infer_request_decode; + r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); + r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + } } - auto ov_input_names = r_ctx->ov_input_names_cache[key]; - auto ov_output_names = r_ctx->ov_output_names_cache[key]; + std::vector ov_input_names_local; + std::vector ov_output_names_local; + { + std::lock_guard map_lock(r_ctx->ctx_mutex); + ov_input_names_local = r_ctx->ov_input_names_cache[key]; + ov_output_names_local = r_ctx->ov_output_names_cache[key]; + } if (is_prefill) { auto inp_len = inp_pos->ne[0]; for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) { - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; + for (size_t i = 0; i < ov_input_names_local.size(); i++) { + auto param_name = ov_input_names_local[i]; auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index); infer_request->set_input_tensor(i, input_tensor); @@ -412,8 +469,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrget_model_outputs().at(ov_output_names[i]); + for (size_t i = 0; i < ov_output_names_local.size(); i++) { + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]); auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor); infer_request->set_output_tensor(i, output_tensor); } @@ -421,16 +478,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrinfer(); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - for (size_t i = 0; i < ov_output_names.size(); i++) { + for (size_t i = 0; i < ov_output_names_local.size(); i++) { const auto output_tensor = infer_request->get_output_tensor(i); - print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data()); } } } infer_end_time = ggml_time_us(); } else { - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; + for (size_t i = 0; i < ov_input_names_local.size(); i++) { + auto param_name = ov_input_names_local[i]; auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); @@ -440,8 +497,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrget_model_outputs().at(ov_output_names[i]); + for (size_t i = 0; i < ov_output_names_local.size(); i++) { + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]); auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor); infer_request->set_output_tensor(i, output_tensor); } @@ -450,9 +507,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrget_output_tensor(i); - print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data()); } } } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 656573d1389..2c72e33c352 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -3,12 +3,15 @@ #include "ggml-impl.h" #include +#include #include #include +#include #include #include #include #include +#include #include struct graph_key { @@ -40,11 +43,17 @@ struct graph_key_hash { } }; +struct decoder_runtime_ctx { + decoder_runtime_ctx(std::shared_ptr mutex) : mutex(std::move(mutex)) {} + std::shared_ptr mutex; + std::shared_ptr ptr; +}; + struct ov_runtime_context { - std::mutex ov_compute_mutex; + mutable std::mutex ctx_mutex; std::string device; bool stateful; - std::unordered_map, graph_key_hash> decoder_cache; + std::unordered_map, graph_key_hash> decoder_cache; std::unordered_map, graph_key_hash> infer_request_cache; std::unordered_map, graph_key_hash> infer_request_cache_prefill; std::unordered_map, graph_key_hash> ov_input_names_cache; @@ -53,11 +62,22 @@ struct ov_runtime_context { // Simultanous stateful inference request support to be added. size_t stateful_kv_size; std::map kv_state_input_name_map; + std::atomic backend_count; ov_runtime_context() : device("CPU"), stateful(false), - stateful_kv_size(0) {} + stateful_kv_size(0), + backend_count(0) {} + + void clear_caches() { + std::lock_guard lock(ctx_mutex); + decoder_cache.clear(); + infer_request_cache.clear(); + infer_request_cache_prefill.clear(); + ov_input_names_cache.clear(); + ov_output_names_cache.clear(); + } }; enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend); From 84652b80cfd165c1b6f8c0541803e1967e5c4c34 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 21 Apr 2026 19:52:02 +0300 Subject: [PATCH 3/5] arg : add --spec-default (#22223) --- common/arg.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 6751a55ab0c..5c0bbe38e09 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3902,6 +3902,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + {"--spec-default"}, + string_format("enable default speculative decoding config"), + [](common_params & params) { + params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD; + params.speculative.ngram_size_n = 24; + params.speculative.n_min = 48; + params.speculative.n_max = 64; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + return ctx_arg; } From 98d2d2884e28ca8718774caad691ddd525f2f7b2 Mon Sep 17 00:00:00 2001 From: Kwa Jie Hao <31984694+kwajiehao@users.noreply.github.com> Date: Wed, 22 Apr 2026 02:02:49 +0800 Subject: [PATCH 4/5] mtmd: Add support for Reka Edge 2603 (#21616) * feat: (vocab) fix stray text appended in llama_decode_text Remove accidental concatenation of the full `text` string when formatting UNK_BYTE hex escapes. Only the closing "]" should be appended. * feat(mtmd): add Yasa2 vision encoder support Add a Yasa2 (ConvNeXtV2-based) vision encoder for reka-edge: - Register PROJECTOR_TYPE_YASA2 and tensor name definitions - Add yasa2_block/yasa2_stage model structs - Implement graph builder with ConvNeXt stages, GRN, adaptive pooling - Wire into clip.cpp switch statements and mtmd.cpp init_vision - Use mtmd_image_preprocessor_fixed_size for image preprocessing * feat(chat): add reka-edge template handler (tools, thinking) - Add chat-reka.cpp/h implementing PEG-based parser for reka-edge format - Add Reka-Edge.jinja chat template - Detect reka-edge template in try_specialized_template() - Add LLAMA_EXAMPLE_MTMD to chat-template-file arg * feat: add reka vlm to gguf conversion script Converts Reka Yasa2 hf checkpoints to GGUF format: - Text decoder: Llama-arch with tiktoken/BPE vocab - Mmproj (--mmproj): ConvNeXt vision backbone + language_projection - Generates 2D sincos positional embeddings for vision encoder * test: add Reka Edge chat template and parser tests - test-chat-template: oracle tests comparing Jinja engine output vs common_chat_templates_apply for text, tools, thinking, images, video - test-chat: PEG parser tests for Reka Edge format, round-trip tests for image/video content parts, common path integration tests * scripts: add Reka Edge mixed quantization helper Q4_0 base quantization with Q8_0 override for the last 8 transformer blocks (layers 24-31) via --tensor-type regex. * fix: adapt chat-reka and tests to upstream API - Use autoparser::generation_params (not templates_params) - Add p.prefix(generation_prompt) to PEG parser - Simplify reasoning parser to match LFM2 pattern - Remove image/video oracle tests (unsupported by oaicompat parser; no other multimodal models test this path) * fix: avoid duplicate tensor loading in yasa2 vision encoder TN_YASA_PATCH_W and TN_PATCH_EMBD both resolve to "v.patch_embd.weight", causing the same tensor to be loaded twice into ctx_data and overflowing the memory pool. Reuse the tensors already loaded by the common section. * chore: update image pre-processing settings The reka-edge model depends on the following settings in an older fork of llama.cpp: 1. Fixed square resize 2. BICUBIC 3. add_padding=false In current llama.cpp, this means setting: - image_resize_algo = RESIZE_ALGO_BICUBIC - image_resize_pad = false * chore: remove reka gguf conversion script * chore: remove reka quantization script * chore: remove unnecessary changes from PR scope This commit removes a couple of unnecessary changes for the PR scope: 1. BPE decoder bug fix - this affects reka edge because there's a bug in our tokenization that doesn't represent tokens as special tokens. However this isn't meant to be a thinking model so when run with --reasoning off the edge case does not affect us 2. --chat-template-file support from llama-mtmd-cli - the focus is on llama-server and the reka edge gguf contains the necessary metadata to detect the chat template 3. reka edge oracle test cases - no other model has similar test cases, so I removed it for standardization * chore: remove unnecessary ggml_cast This commit removes unnecessary ggml_cast after updating the reka vlm -> gguf conversion script on hugging face. * chore: remove redundant code * chore: remove unnecessary ggml_cont calls This commit removes all ggml_cont calls except the four that precede ggml_reshape_3d/ggml_reshape_4d. Those are necessary because ggml_reshape recomputes strides assuming contiguous layout and asserts ggml_is_contiguous. Other operations (ggml_mean, ggml_add, ggml_mul etc.) use stride-based indexing and handle non-contiguous inputs correctly and so we are ok to remove ggml_cont for those. * chore: remove unnecessary ggml_repeat calls This commit removes unnecessary ggml_repeat calls because the underlying ops already broadcast automatically. Every ggml_repeat in yasa2.cpp was expanding a smaller tensor to match a larger one's shape before passing both to an elementwise op (ggml_add, ggml_sub, ggml_mul, or ggml_div). This is unnecessary because all four of these ops already support broadcasting internally. * chore: restore ggml_cont needed for cpu operations * refactor: locate reka chat template handler in chat.cpp * chore: remove unnecessary warmup tokens * chore: add code comments on image_resize_pad * chore: remove custom reka parsing code * chore: revert common/chat.cpp * Uncomment debug logging for PEG input parsing --------- Co-authored-by: Piotr Wilkin (ilintar) --- tests/test-chat.cpp | 95 ++++++++++++++++++ tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 11 +++ tools/mtmd/clip-model.h | 30 ++++++ tools/mtmd/clip.cpp | 69 +++++++++++++ tools/mtmd/models/models.h | 8 ++ tools/mtmd/models/yasa2.cpp | 191 ++++++++++++++++++++++++++++++++++++ tools/mtmd/mtmd.cpp | 13 +++ 8 files changed, 418 insertions(+) create mode 100644 tools/mtmd/models/yasa2.cpp diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index eee28271be2..79e1891b8ea 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -3595,6 +3595,51 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); } + // Reka Edge + { + auto tst = peg_tester("models/templates/Reka-Edge.jinja", detailed_debug); + tst.test("Hello, world!\nWhat's up?") + .enable_thinking(false) + .expect(message_assist) + .run(); + tst.test("I'm\nthinking\n\nHello, world!\nWhat's up?") + .enable_thinking(true) + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .expect(message_assist_thoughts) + .run(); + tst.test("\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n") + .enable_thinking(false) + .tools({ special_function_tool }) + .expect(message_assist_call) + .run(); + tst.test("Hello, world!\nWhat's up?\n\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n") + .enable_thinking(false) + .tools({ special_function_tool }) + .expect(message_assist_call_content) + .run(); + tst.test("I'm\nthinking\n\n\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n") + .enable_thinking(true) + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .tools({ special_function_tool }) + .expect(message_assist_call_thoughts) + .run(); + tst.test("\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n\n\n{\"name\": \"special_function_with_opt\", \"arguments\": {\"arg1\": 1, \"arg2\": 2}}\n") + .enable_thinking(false) + .parallel_tool_calls(true) + .tools({ special_function_tool, special_function_tool_with_optional_param }) + .expect_tool_calls({ + { "special_function", R"({"arg1": 1})", {} }, + { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, + }) + .run(); + tst.test("\n{\"name\": \"special_function\", \"arguments\": {\"arg") + .enable_thinking(false) + .tools({ special_function_tool }) + .is_partial(true) + .expect(message_assist_call_cutoff_args) + .run(); + } + // Apriel 1.5 { auto tst = peg_tester("models/templates/unsloth-Apriel-1.5.jinja", detailed_debug); @@ -4077,6 +4122,55 @@ static void test_template_output_peg_parsers(bool detailed_debug) { } } +static void test_reka_edge_common_path() { + auto tmpls = read_templates("models/templates/Reka-Edge.jinja"); + + { + common_chat_templates_inputs inputs; + common_chat_msg system_msg; + system_msg.role = "system"; + system_msg.content = "Use tools when needed."; + + common_chat_msg tool_call_msg = simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"); + + common_chat_msg tool_msg; + tool_msg.role = "tool"; + tool_msg.tool_name = "special_function"; + tool_msg.tool_call_id = "call0"; + tool_msg.content = "Sunny"; + + inputs.messages = { system_msg, message_user, tool_call_msg, tool_msg, message_user }; + inputs.tools = { special_function_tool }; + inputs.enable_thinking = true; + inputs.add_generation_prompt = true; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + + if (params.prompt.find("\nSunny\n") == std::string::npos) { + throw std::runtime_error("Reka Edge prompt did not render tool response history"); + } + if (params.prompt.rfind("assistant: \n") == std::string::npos) { + throw std::runtime_error("Reka Edge prompt did not render thinking generation prompt"); + } + } + + { + common_chat_templates_inputs inputs; + inputs.messages = { + message_user, + simple_assist_msg("The first point is") + }; + inputs.add_generation_prompt = false; + inputs.enable_thinking = false; + inputs.chat_template_kwargs["continue_final_message"] = "true"; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + if (string_ends_with(params.prompt, "")) { + throw std::runtime_error("Reka Edge continue_final_message unexpectedly closed the assistant turn"); + } + } +} + // Test the developer role to system workaround with a simple mock template static void test_developer_role_to_system_workaround() { LOG_DBG("%s\n", __func__); @@ -4256,6 +4350,7 @@ int main(int argc, char ** argv) { test_msgs_oaicompat_json_conversion(); test_tools_oaicompat_json_conversion(); test_developer_role_to_system_workaround(); + test_reka_edge_common_path(); test_template_output_peg_parsers(detailed_debug); std::cout << "\n[chat] All tests passed!" << '\n'; } diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 399876128ef..35d721d5a4c 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -40,6 +40,7 @@ add_library(mtmd models/deepseekocr.cpp models/mobilenetv5.cpp models/youtuvl.cpp + models/yasa2.cpp ) set_target_properties(mtmd PROPERTIES diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 17cb703f7fb..61fe82439f1 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -242,6 +242,15 @@ #define TN_STD_BIAS "v.std_bias" #define TN_STD_SCALE "v.std_scale" +// yasa2 +#define TN_YASA_PATCH_LN_W "v.patch_ln.weight" +#define TN_YASA_PATCH_LN_B "v.patch_ln.bias" +#define TN_YASA_BACKBONE_LN_W "v.backbone_ln.weight" +#define TN_YASA_BACKBONE_LN_B "v.backbone_ln.bias" +#define TN_YASA_POS_EMBD "v.vision_pos_embed" +#define TN_YASA_STAGE_DOWN_LN "v.stage.%d.down.ln.%s" +#define TN_YASA_STAGE_DOWN_CONV "v.stage.%d.down.conv.%s" +#define TN_YASA_STAGE_BLK "v.stage.%d.blk.%d.%s.%s" // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -290,6 +299,7 @@ enum projector_type { PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_YOUTUVL, + PROJECTOR_TYPE_YASA2, PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, PROJECTOR_TYPE_HUNYUANOCR, @@ -335,6 +345,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, + { PROJECTOR_TYPE_YASA2, "yasa2"}, { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 9a93584d9be..bf8031b55b2 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -268,6 +268,27 @@ struct mobilenetv5_block { ggml_tensor * attn_norm_w = nullptr; }; +struct yasa2_block { + ggml_tensor * dw_w = nullptr; + ggml_tensor * dw_b = nullptr; + ggml_tensor * ln_w = nullptr; + ggml_tensor * ln_b = nullptr; + ggml_tensor * pw1_w = nullptr; + ggml_tensor * pw1_b = nullptr; + ggml_tensor * grn_w = nullptr; + ggml_tensor * grn_b = nullptr; + ggml_tensor * pw2_w = nullptr; + ggml_tensor * pw2_b = nullptr; +}; + +struct yasa2_stage { + ggml_tensor * down_ln_w = nullptr; + ggml_tensor * down_ln_b = nullptr; + ggml_tensor * down_conv_w = nullptr; + ggml_tensor * down_conv_b = nullptr; + std::vector blocks; +}; + struct clip_model { clip_modality modality = CLIP_MODALITY_VISION; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -402,6 +423,15 @@ struct clip_model { ggml_tensor * msfa_ffn_expand_bn = nullptr; ggml_tensor * msfa_ffn_project_bn = nullptr; + // yasa2 + ggml_tensor * yasa_patch_w = nullptr; + ggml_tensor * yasa_patch_b = nullptr; + ggml_tensor * yasa_patch_ln_w = nullptr; + ggml_tensor * yasa_patch_ln_b = nullptr; + ggml_tensor * yasa_backbone_ln_w = nullptr; + ggml_tensor * yasa_backbone_ln_b = nullptr; + ggml_tensor * yasa_vision_pos_embed = nullptr; + std::vector yasa_stages; // pixtral, glm4v ggml_tensor * token_embd_img_break = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f0e8786b660..540b0ea4143 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -947,6 +947,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_YASA2: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } @@ -1389,6 +1393,16 @@ struct clip_model_loader { hparams.set_limit_image_tokens(1, 62500); hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup } break; + case PROJECTOR_TYPE_YASA2: + { + hparams.ffn_op = FFN_GELU_ERF; + log_ffn_op = "gelu_erf"; + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC; + + // reka model performs better when using resize_bicubic, which stretches + // the image to fit fixed square size + hparams.image_resize_pad = false; + } break; case PROJECTOR_TYPE_GLM4V: { hparams.rope_theta = 10000.0f; @@ -1839,6 +1853,55 @@ struct clip_model_loader { model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; + case PROJECTOR_TYPE_YASA2: + { + // reuse tensors already loaded by the common section + // (TN_PATCH_EMBD and TN_PATCH_BIAS have the same tensor names) + GGML_ASSERT(model.patch_embeddings_0 && "yasa2 requires v.patch_embd.weight"); + model.yasa_patch_w = model.patch_embeddings_0; + model.yasa_patch_b = model.patch_bias; + model.yasa_patch_ln_w = get_tensor(TN_YASA_PATCH_LN_W, false); + model.yasa_patch_ln_b = get_tensor(TN_YASA_PATCH_LN_B, false); + model.yasa_backbone_ln_w = get_tensor(TN_YASA_BACKBONE_LN_W, false); + model.yasa_backbone_ln_b = get_tensor(TN_YASA_BACKBONE_LN_B, false); + model.yasa_vision_pos_embed = get_tensor(TN_YASA_POS_EMBD, false); + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + + model.yasa_stages.clear(); + for (int s = 0; ; ++s) { + yasa2_stage stage; + stage.down_ln_w = get_tensor(string_format(TN_YASA_STAGE_DOWN_LN, s, "weight"), false); + stage.down_ln_b = get_tensor(string_format(TN_YASA_STAGE_DOWN_LN, s, "bias"), false); + stage.down_conv_w = get_tensor(string_format(TN_YASA_STAGE_DOWN_CONV, s, "weight"), false); + stage.down_conv_b = get_tensor(string_format(TN_YASA_STAGE_DOWN_CONV, s, "bias"), false); + + for (int bi = 0; ; ++bi) { + yasa2_block blk; + blk.dw_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "dw", "weight"), false); + if (!blk.dw_w) { + break; + } + blk.dw_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "dw", "bias"), false); + blk.ln_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "ln", "weight"), false); + blk.ln_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "ln", "bias"), false); + blk.pw1_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw1", "weight"), false); + blk.pw1_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw1", "bias"), false); + blk.grn_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "grn", "weight"), false); + blk.grn_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "grn", "bias"), false); + blk.pw2_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw2", "weight"), false); + blk.pw2_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw2", "bias"), false); + stage.blocks.push_back(blk); + } + + if (!stage.down_conv_w && stage.blocks.empty()) { + break; + } + model.yasa_stages.push_back(std::move(stage)); + } + } break; case PROJECTOR_TYPE_GLM4V: { model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); @@ -2843,6 +2906,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // do nothing } break; + case PROJECTOR_TYPE_YASA2: + { + n_patches = 64; // adaptive average pooling to 8x8 tokens + } break; case PROJECTOR_TYPE_LDP: case PROJECTOR_TYPE_LDPV2: case PROJECTOR_TYPE_GLM_EDGE: @@ -3463,6 +3530,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_YASA2: { // do nothing } break; @@ -3689,6 +3757,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_KIMIVL: case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_KIMIK25: + case PROJECTOR_TYPE_YASA2: return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_HUNYUANOCR: return ctx->model.mm_model_proj->ne[1]; diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 03d99e15b05..c30d79133ef 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -43,6 +43,14 @@ struct clip_graph_youtuvl : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_yasa2 : clip_graph { + clip_graph_yasa2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + + ggml_tensor * layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps = 1e-6f); + ggml_tensor * convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b); +}; + struct clip_graph_minicpmv : clip_graph { clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/models/yasa2.cpp b/tools/mtmd/models/yasa2.cpp new file mode 100644 index 00000000000..e8cd3dacbf5 --- /dev/null +++ b/tools/mtmd/models/yasa2.cpp @@ -0,0 +1,191 @@ +// ABOUTME: Yasa2 vision encoder graph builder for ConvNeXt-based architecture. +// ABOUTME: Implements patch embedding, ConvNeXt stages with GRN, and adaptive pooling. + +#include "models.h" + +static ggml_tensor * add_channel_bias( + ggml_context * ctx0, + ggml_tensor * x_whcb, + ggml_tensor * b_c) { + if (!b_c) { + return x_whcb; + } + ggml_tensor * b4 = ggml_reshape_4d(ctx0, b_c, 1, 1, b_c->ne[0], 1); + return ggml_add(ctx0, x_whcb, b4); +} + +static ggml_tensor * mul_channel_weight( + ggml_context * ctx0, + ggml_tensor * x_whcb, + ggml_tensor * w_c) { + if (!w_c) { + return x_whcb; + } + ggml_tensor * w4 = ggml_reshape_4d(ctx0, w_c, 1, 1, w_c->ne[0], 1); + return ggml_mul(ctx0, x_whcb, w4); +} + +ggml_tensor * clip_graph_yasa2::layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps) { + // Match HF ConvNextLayerNorm(channels_first): + // u = mean_c(x), s = mean_c((x-u)^2), x = (x-u)/sqrt(s+eps) + // cast back to input dtype before affine. + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); // [W,H,C,B] -> [C,H,W,B] + cur = ggml_cont(ctx0, cur); + + ggml_tensor * u = ggml_mean(ctx0, cur); // [1,H,W,B] + ggml_tensor * xm = ggml_sub(ctx0, cur, u); // [C,H,W,B] + + ggml_tensor * s = ggml_mul(ctx0, xm, xm); // [C,H,W,B] + s = ggml_mean(ctx0, s); // [1,H,W,B] + s = ggml_clamp(ctx0, s, eps, 1e30f); // avoid div-by-zero in no-alloc warmup + s = ggml_sqrt(ctx0, s); // [1,H,W,B] + + ggml_tensor * xhat = ggml_div(ctx0, xm, s); // [C,H,W,B] + xhat = ggml_permute(ctx0, xhat, 2, 1, 0, 3); // [W,H,C,B] + xhat = ggml_cont(ctx0, xhat); + xhat = mul_channel_weight(ctx0, xhat, w); + xhat = add_channel_bias(ctx0, xhat, b); + return xhat; +} + +ggml_tensor * clip_graph_yasa2::convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b) { + // Exact ConvNeXtV2 GRN: + // Gx = ||x||_2 over spatial dims (W,H), Nx = Gx / (mean_c(Gx) + eps) + // y = w * (x * Nx) + b + x + const int64_t wdim = inp->ne[0]; + const int64_t hdim = inp->ne[1]; + const int64_t cdim = inp->ne[2]; + const int64_t bdim = inp->ne[3]; + + // Keep GRN math in fp32 for stability; fp16/bf16 accumulation can drift. + ggml_tensor * sq = ggml_mul(ctx0, inp, inp); + ggml_tensor * sq_flat = ggml_reshape_4d(ctx0, sq, wdim * hdim, cdim, 1, bdim); // [WH,C,1,B] + ggml_tensor * gx = ggml_sum_rows(ctx0, sq_flat); // [1,C,1,B] + gx = ggml_sqrt(ctx0, gx); // [1,C,1,B] + + ggml_tensor * gx_ch_first = ggml_permute(ctx0, gx, 1, 0, 2, 3); // [C,1,1,B] + gx_ch_first = ggml_cont(ctx0, gx_ch_first); + ggml_tensor * gx_mean = ggml_mean(ctx0, gx_ch_first); // [1,1,1,B] + + gx_mean = ggml_clamp(ctx0, gx_mean, 1e-6f, 1e30f); // approx +eps, warmup-safe + ggml_tensor * nx = ggml_div(ctx0, gx, gx_mean); // [1,C,1,B] + nx = ggml_permute(ctx0, nx, 0, 2, 1, 3); // [1,1,C,B] + nx = ggml_cont(ctx0, nx); + + ggml_tensor * xnx = ggml_mul(ctx0, inp, nx); + xnx = mul_channel_weight(ctx0, xnx, w); + xnx = add_channel_bias(ctx0, xnx, b); + return ggml_add(ctx0, inp, xnx); +} + +ggml_cgraph * clip_graph_yasa2::build() { + ggml_tensor * cur = build_inp_raw(); + + // Patch embedding Conv2d(kernel=4, stride=4) + cur = ggml_conv_2d(ctx0, model.yasa_patch_w, cur, patch_size, patch_size, 0, 0, 1, 1); + cur = add_channel_bias(ctx0, cur, model.yasa_patch_b); + ggml_set_name(cur, "yasa2_patch_conv_out"); + cb(cur, "yasa2_patch_conv_out", -1); + cur = layer_norm_channels(cur, model.yasa_patch_ln_w, model.yasa_patch_ln_b, eps); + ggml_set_name(cur, "yasa2_patch_ln_out"); + cb(cur, "yasa2_patch_ln_out", -1); + + // ConvNeXt stages + for (size_t s = 0; s < model.yasa_stages.size(); ++s) { + const auto & stage = model.yasa_stages[s]; + + if (stage.down_conv_w) { + cur = layer_norm_channels(cur, stage.down_ln_w, stage.down_ln_b, eps); + cur = ggml_conv_2d(ctx0, stage.down_conv_w, cur, 2, 2, 0, 0, 1, 1); + cur = add_channel_bias(ctx0, cur, stage.down_conv_b); + ggml_format_name(cur, "yasa2_stage%zu_down_out", s); + } + + for (size_t bi = 0; bi < stage.blocks.size(); ++bi) { + const auto & blk = stage.blocks[bi]; + ggml_tensor * res = cur; + + ggml_tensor * x = ggml_conv_2d_dw(ctx0, blk.dw_w, cur, 1, 1, 3, 3, 1, 1); + x = add_channel_bias(ctx0, x, blk.dw_b); + x = layer_norm_channels(x, blk.ln_w, blk.ln_b, eps); + + // pwconv1/pwconv2 are HF Linear layers over channels; implement via matmul on tokens. + const int64_t w = x->ne[0]; + const int64_t h = x->ne[1]; + const int64_t b = x->ne[3]; + + ggml_tensor * tok = ggml_reshape_3d(ctx0, x, w * h, x->ne[2], b); // [T,C,B] + tok = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [C,T,B] + tok = ggml_cont(ctx0, tok); + + tok = ggml_mul_mat(ctx0, blk.pw1_w, tok); // [4C,T,B] + if (blk.pw1_b) { + ggml_tensor * b1 = ggml_reshape_3d(ctx0, blk.pw1_b, blk.pw1_b->ne[0], 1, 1); // [4C,1,1] + tok = ggml_add(ctx0, tok, b1); + } + x = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [T,4C,B] + x = ggml_cont(ctx0, x); + x = ggml_reshape_4d(ctx0, x, w, h, tok->ne[0], b); // [W,H,4C,B] + x = ggml_gelu_erf(ctx0, x); + x = convnext_grn(x, blk.grn_w, blk.grn_b); + + tok = ggml_reshape_3d(ctx0, x, w * h, x->ne[2], b); // [T,4C,B] + tok = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [4C,T,B] + tok = ggml_cont(ctx0, tok); + + tok = ggml_mul_mat(ctx0, blk.pw2_w, tok); // [C,T,B] + if (blk.pw2_b) { + ggml_tensor * b2 = ggml_reshape_3d(ctx0, blk.pw2_b, blk.pw2_b->ne[0], 1, 1); // [C,1,1] + tok = ggml_add(ctx0, tok, b2); + } + x = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [T,C,B] + x = ggml_cont(ctx0, x); + x = ggml_reshape_4d(ctx0, x, w, h, tok->ne[0], b); // [W,H,C,B] + + cur = ggml_add(ctx0, res, x); + ggml_format_name(cur, "yasa2_stage%zu_blk%zu_out", s, bi); + } + } + + // HF path adds vision position embeddings BEFORE adaptive pooling. + const int64_t pre_w = cur->ne[0]; + const int64_t pre_h = cur->ne[1]; + ggml_tensor * tokens_pre = ggml_reshape_3d(ctx0, cur, pre_w * pre_h, cur->ne[2], cur->ne[3]); // [T,C,B] + tokens_pre = ggml_permute(ctx0, tokens_pre, 1, 0, 2, 3); // [C,T,B] + tokens_pre = ggml_cont(ctx0, tokens_pre); + if (model.yasa_vision_pos_embed && tokens_pre->ne[1] == model.yasa_vision_pos_embed->ne[1]) { + const int64_t n_ch = model.yasa_vision_pos_embed->ne[0]; + const int64_t n_tokens = model.yasa_vision_pos_embed->ne[1]; + ggml_tensor * pos = ggml_reshape_3d(ctx0, model.yasa_vision_pos_embed, (int) n_ch, (int) n_tokens, 1); + tokens_pre = ggml_add(ctx0, tokens_pre, pos); + } + cur = ggml_permute(ctx0, tokens_pre, 1, 0, 2, 3); // [T,C,B] + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_4d(ctx0, cur, pre_w, pre_h, cur->ne[1], cur->ne[2]); // [W,H,C,B] + + // AdaptiveAvgPool2d target is 8x8 for real inputs, but warmup can use tiny images. + const int pooled_w = std::min(8, (int) cur->ne[0]); + const int pooled_h = std::min(8, (int) cur->ne[1]); + const int kw = std::max(1, (int) cur->ne[0] / pooled_w); + const int kh = std::max(1, (int) cur->ne[1] / pooled_h); + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kw, kh, kw, kh, 0, 0); + + // [W,H,C,B] -> [C,T,B] + ggml_tensor * tokens = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2], cur->ne[3]); + tokens = ggml_permute(ctx0, tokens, 1, 0, 2, 3); + tokens = ggml_cont(ctx0, tokens); + cb(tokens, "yasa2_tokens", -1); + + GGML_ASSERT(model.mm_0_w && model.mm_2_w); + ggml_tensor * embeddings = build_ffn( + tokens, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + cb(embeddings, "yasa2_emb", -1); + + ggml_build_forward_expand(gf, embeddings); + return gf; +} diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 854ac81e0a5..cc3de6a858c 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -316,6 +316,19 @@ struct mtmd_context { img_end = "<|vision_end|>"; image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_YASA2: + { + img_beg = ""; + img_end = ""; + // Currently only supprots single-tile preprocessing: any input is downscaled + // to one image_size x image_size tile (64 output tokens via 8x8 adaptive avg + // pool). + // However, the model itself supports llava-uhd multi-tile tiling for high-res + // images. This will be implemented in a future PR (dispatch on has_pinpoints + // - see LDP/COGVLM branch above) and emit image_grid_pinpoints in the conversion + // script. + image_preproc = std::make_unique(ctx_v); + } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3NV: { From 72d693e4fbf0a188172951fe667d5e2f354ee438 Mon Sep 17 00:00:00 2001 From: Paul Dubs Date: Tue, 21 Apr 2026 20:29:07 +0200 Subject: [PATCH 5/5] spec : reset i_last when low acceptance streak occurs (#22168) By resetting i_last to zero, we will include the current context when rebuilding the speculative map. --- common/speculative.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/speculative.cpp b/common/speculative.cpp index daa2b5a8ac9..20e38bec46c 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -749,6 +749,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { mod.reset(); n_low = 0; + i_last = 0; } } else { n_low = 0;