From d87bf08871e2c2995e83f551aa61443e35fd865c Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 21:41:32 -0700 Subject: [PATCH 1/5] feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 (#2189) * feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 * docs: Update changelog for llama.cpp f53577432 * docs: Keep one unreleased llama.cpp changelog entry --- CHANGELOG.md | 2 +- llama_cpp/llama_cpp.py | 55 ------------------------------- llama_cpp/mtmd_cpp.py | 73 ++++++++++++++++++++++++++++++++++++------ vendor/llama.cpp | 2 +- 4 files changed, 65 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fbe5b6b6fd..ea7beaaa79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: Update llama.cpp to ggerganov/llama.cpp@3bd9aa1f9 and sync Python bindings +- feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings ## [0.3.20] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e445ed66ae..d032371402 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1516,54 +1516,6 @@ def llama_free(ctx: llama_context_p, /): ... -# enum llama_params_fit_status { -# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, -# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, -# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, -# }; -LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 -LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 -LLAMA_PARAMS_FIT_STATUS_ERROR = 2 - - -# LLAMA_API enum llama_params_fit_status llama_params_fit( -# const char * path_model, -# struct llama_model_params * mparams, -# struct llama_context_params * cparams, -# float * tensor_split, -# struct llama_model_tensor_buft_override * tensor_buft_overrides, -# size_t * margins, -# uint32_t n_ctx_min, -# enum ggml_log_level log_level); -@ctypes_function( - "llama_params_fit", - [ - ctypes.c_char_p, - ctypes.POINTER(llama_model_params), - ctypes.POINTER(llama_context_params), - ctypes.POINTER(ctypes.c_float), - ctypes.c_void_p, - ctypes.POINTER(ctypes.c_size_t), - ctypes.c_uint32, - ctypes.c_int, - ], - ctypes.c_int, -) -def llama_params_fit( - path_model: bytes, - mparams: CtypesPointerOrRef[llama_model_params], - cparams: CtypesPointerOrRef[llama_context_params], - tensor_split: Optional[CtypesPointer[ctypes.c_float]], - tensor_buft_overrides: ctypes.c_void_p, - margins: Optional[CtypesPointer[ctypes.c_size_t]], - n_ctx_min: int, - log_level: int, - /, -) -> int: - """Fit model and context parameters for a model path.""" - ... - - # LLAMA_API int64_t llama_time_us(void); @ctypes_function( "llama_time_us", @@ -4869,13 +4821,6 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): ... def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... -# // print a breakdown of per-device memory use via LLAMA_LOG: -@ctypes_function("llama_memory_breakdown_print", [llama_context_p_ctypes], None) -def llama_memory_breakdown_print(ctx: llama_context_p, /): - """Print a breakdown of per-device memory use.""" - ... - - # // # // training # // diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 550c9bd59f..485dc5d8c6 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -8,9 +8,9 @@ c_int, c_uint8, c_uint32, + c_size_t, c_float, c_void_p, - c_size_t, POINTER, _Pointer, # type: ignore Structure, @@ -123,6 +123,17 @@ class mtmd_input_text(Structure): ] +class mtmd_decoder_pos(Structure): + """Decoder attention position for M-RoPE models.""" + + _fields_ = [ + ("t", c_uint32), + ("x", c_uint32), + ("y", c_uint32), + ("z", c_uint32), + ] + + ################################################ # mtmd.h functions ################################################ @@ -165,35 +176,41 @@ def mtmd_init_from_file( def mtmd_free(ctx: mtmd_context_p, /): ... -# MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); -@ctypes_function("mtmd_decode_use_non_causal", [mtmd_context_p_ctypes], c_bool) -def mtmd_decode_use_non_causal(ctx: mtmd_context_p, /) -> bool: +# MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk); +@ctypes_function( + "mtmd_decode_use_non_causal", + [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes], + c_bool, +) +def mtmd_decode_use_non_causal( + ctx: mtmd_context_p, chunk: Optional[mtmd_input_chunk_p], / +) -> bool: """Check whether MTMD decoding uses non-causal attention.""" ... -# MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); +# MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx); @ctypes_function("mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool) def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool: """Check whether MTMD decoding uses mRoPE.""" ... -# MTMD_API bool mtmd_support_vision(mtmd_context * ctx); +# MTMD_API bool mtmd_support_vision(const mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: """Check whether the current model supports vision input.""" ... -# MTMD_API bool mtmd_support_audio(mtmd_context * ctx); +# MTMD_API bool mtmd_support_audio(const mtmd_context * ctx); @ctypes_function("mtmd_support_audio", [mtmd_context_p_ctypes], c_bool) def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool: """Check whether MTMD supports audio.""" ... -# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); +# MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); @ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int) def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int: """Get the audio sample rate in Hz. Returns -1 if audio is not supported.""" @@ -418,14 +435,16 @@ def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int: ... -# MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); +# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens), +# "use mtmd_image_tokens_get_decoder_pos() instead"); @ctypes_function("mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t) def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the image token grid width.""" ... -# MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); +# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens), +# "use mtmd_image_tokens_get_decoder_pos() instead"); @ctypes_function("mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t) def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p, /) -> int: """Get the image token grid height.""" @@ -450,6 +469,23 @@ def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int: ... +# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos( +# const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i); +@ctypes_function( + "mtmd_image_tokens_get_decoder_pos", + [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, c_size_t], + mtmd_decoder_pos, +) +def mtmd_image_tokens_get_decoder_pos( + image_tokens: mtmd_image_tokens_p, + pos_0: llama_cpp.llama_pos, + i: Union[c_size_t, int], + /, +) -> mtmd_decoder_pos: + """Get decoder attention position for an image embedding token.""" + ... + + # MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens); @ctypes_function( "mtmd_encode", @@ -534,6 +570,23 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int: ... +# MTMD_API void mtmd_helper_image_get_decoder_pos( +# const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos); +@ctypes_function( + "mtmd_helper_image_get_decoder_pos", + [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, POINTER(mtmd_decoder_pos)], + None, +) +def mtmd_helper_image_get_decoder_pos( + image: mtmd_image_tokens_p, + pos_0: llama_cpp.llama_pos, + out_pos: "_Pointer[mtmd_decoder_pos]", + /, +): + """Fill decoder attention positions for all image embedding tokens.""" + ... + + # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, # struct llama_context * lctx, # const mtmd_input_chunks * chunks, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 227ed28e12..f535774325 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 227ed28e128e93b4d63ae5108560c550c9ab16c8 +Subproject commit f53577432541bb9edc1588c4ef45c66bf07e4468 From 511b3f414359e8d98e9123d007bdd935cd1f7c3f Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:02:24 -0700 Subject: [PATCH 2/5] fix(ci): Build one arm64 py3 release wheel (#2191) * fix(ci): Build one arm64 py3 release wheel * docs: Update changelog for arm64 release wheel fix --- .github/workflows/build-and-release.yaml | 4 +++- CHANGELOG.md | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 6cbac0cb1c..039e376b69 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -82,7 +82,9 @@ jobs: # Keep native arm64 builds on a portable CPU baseline instead of # tuning wheels to the hosted runner. CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off" - CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" + # The release wheel is tagged py3-none, so one build covers all + # supported Python versions and avoids duplicate wheel names. + CIBW_BUILD: "cp38-*" with: output-dir: wheelhouse diff --git a/CHANGELOG.md b/CHANGELOG.md index ea7beaaa79..fe376ebd33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings +- fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing ## [0.3.20] From c8075d1dfe2019a0390af613419ecfaea292c9d5 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:13:13 -0700 Subject: [PATCH 3/5] chore: bump version to 0.3.21 (#2192) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe376ebd33..eeb42b6449 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.21] + - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings - fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 83177c065d..fbad5c28b2 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.20" +__version__ = "0.3.21" From 195cc59a187687ca64c8e0939e5e549d456aa2fb Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:39:59 -0700 Subject: [PATCH 4/5] fix(ci): Repair py3 CPU release wheels (#2193) --- .github/workflows/build-and-release.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 039e376b69..f67fb558dc 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -48,7 +48,10 @@ jobs: CIBW_REPAIR_WHEEL_COMMAND: "" # Linux needs auditwheel repair so manylinux and musllinux wheels are # published with distinct platform tags instead of generic linux tags. - CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair -w {dest_dir} {wheel}" + CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" + # The release wheel is tagged py3-none, so one build per platform + # covers all supported Python versions and avoids duplicate names. + CIBW_BUILD: "cp38-*" # Skip cibuildwheel's default i686 sidecar and keep Linux release # wheels on a portable x86_64 CPU baseline. CIBW_ARCHS_LINUX: "auto64" From d2bcbac46605f11d382426dd88d67e8b5c124cd7 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 26 Apr 2026 22:55:04 -0700 Subject: [PATCH 5/5] fix(ci): Scope CPU release wheel selectors by OS (#2194) --- .github/workflows/build-and-release.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index f67fb558dc..df6201ee75 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -51,10 +51,13 @@ jobs: CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" # The release wheel is tagged py3-none, so one build per platform # covers all supported Python versions and avoids duplicate names. - CIBW_BUILD: "cp38-*" + CIBW_BUILD_LINUX: "cp38-*" + CIBW_BUILD_MACOS: "cp39-*" + CIBW_BUILD_WINDOWS: "cp39-*" # Skip cibuildwheel's default i686 sidecar and keep Linux release # wheels on a portable x86_64 CPU baseline. CIBW_ARCHS_LINUX: "auto64" + CIBW_ARCHS_WINDOWS: "AMD64" CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off" # Keep macOS release wheels on a portable CPU baseline instead of # inheriting the hosted runner's native flags.