From d87bf08871e2c2995e83f551aa61443e35fd865c Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 21:41:32 -0700
Subject: [PATCH 1/5] feat: Update llama.cpp to ggerganov/llama.cpp@f53577432
 (#2189)

* feat: Update llama.cpp to ggerganov/llama.cpp@f53577432

* docs: Update changelog for llama.cpp f53577432

* docs: Keep one unreleased llama.cpp changelog entry
---
 CHANGELOG.md           |  2 +-
 llama_cpp/llama_cpp.py | 55 -------------------------------
 llama_cpp/mtmd_cpp.py  | 73 ++++++++++++++++++++++++++++++++++++------
 vendor/llama.cpp       |  2 +-
 4 files changed, 65 insertions(+), 67 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fbe5b6b6fd..ea7beaaa79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@3bd9aa1f9 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings
 
 ## [0.3.20]
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index e445ed66ae..d032371402 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1516,54 +1516,6 @@ def llama_free(ctx: llama_context_p, /):
     ...
 
 
-# enum llama_params_fit_status {
-#     LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0,
-#     LLAMA_PARAMS_FIT_STATUS_FAILURE = 1,
-#     LLAMA_PARAMS_FIT_STATUS_ERROR   = 2,
-# };
-LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0
-LLAMA_PARAMS_FIT_STATUS_FAILURE = 1
-LLAMA_PARAMS_FIT_STATUS_ERROR = 2
-
-
-# LLAMA_API enum llama_params_fit_status llama_params_fit(
-#                                const char   * path_model,
-#                 struct llama_model_params   * mparams,
-#                 struct llama_context_params * cparams,
-#                                       float * tensor_split,
-#     struct llama_model_tensor_buft_override * tensor_buft_overrides,
-#                                      size_t * margins,
-#                                    uint32_t   n_ctx_min,
-#                         enum ggml_log_level   log_level);
-@ctypes_function(
-    "llama_params_fit",
-    [
-        ctypes.c_char_p,
-        ctypes.POINTER(llama_model_params),
-        ctypes.POINTER(llama_context_params),
-        ctypes.POINTER(ctypes.c_float),
-        ctypes.c_void_p,
-        ctypes.POINTER(ctypes.c_size_t),
-        ctypes.c_uint32,
-        ctypes.c_int,
-    ],
-    ctypes.c_int,
-)
-def llama_params_fit(
-    path_model: bytes,
-    mparams: CtypesPointerOrRef[llama_model_params],
-    cparams: CtypesPointerOrRef[llama_context_params],
-    tensor_split: Optional[CtypesPointer[ctypes.c_float]],
-    tensor_buft_overrides: ctypes.c_void_p,
-    margins: Optional[CtypesPointer[ctypes.c_size_t]],
-    n_ctx_min: int,
-    log_level: int,
-    /,
-) -> int:
-    """Fit model and context parameters for a model path."""
-    ...
-
-
 # LLAMA_API int64_t llama_time_us(void);
 @ctypes_function(
     "llama_time_us",
@@ -4869,13 +4821,6 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): ...
 def llama_perf_sampler_reset(chain: llama_sampler_p, /): ...
 
 
-# // print a breakdown of per-device memory use via LLAMA_LOG:
-@ctypes_function("llama_memory_breakdown_print", [llama_context_p_ctypes], None)
-def llama_memory_breakdown_print(ctx: llama_context_p, /):
-    """Print a breakdown of per-device memory use."""
-    ...
-
-
 # //
 # // training
 # //
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 550c9bd59f..485dc5d8c6 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -8,9 +8,9 @@
     c_int,
     c_uint8,
     c_uint32,
+    c_size_t,
     c_float,
     c_void_p,
-    c_size_t,
     POINTER,
     _Pointer,  # type: ignore
     Structure,
@@ -123,6 +123,17 @@ class mtmd_input_text(Structure):
     ]
 
 
+class mtmd_decoder_pos(Structure):
+    """Decoder attention position for M-RoPE models."""
+
+    _fields_ = [
+        ("t", c_uint32),
+        ("x", c_uint32),
+        ("y", c_uint32),
+        ("z", c_uint32),
+    ]
+
+
 ################################################
 # mtmd.h functions
 ################################################
@@ -165,35 +176,41 @@ def mtmd_init_from_file(
 def mtmd_free(ctx: mtmd_context_p, /): ...
 
 
-# MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
-@ctypes_function("mtmd_decode_use_non_causal", [mtmd_context_p_ctypes], c_bool)
-def mtmd_decode_use_non_causal(ctx: mtmd_context_p, /) -> bool:
+# MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk);
+@ctypes_function(
+    "mtmd_decode_use_non_causal",
+    [mtmd_context_p_ctypes, mtmd_input_chunk_p_ctypes],
+    c_bool,
+)
+def mtmd_decode_use_non_causal(
+    ctx: mtmd_context_p, chunk: Optional[mtmd_input_chunk_p], /
+) -> bool:
     """Check whether MTMD decoding uses non-causal attention."""
     ...
 
 
-# MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+# MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx);
 @ctypes_function("mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool)
 def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool:
     """Check whether MTMD decoding uses mRoPE."""
     ...
 
 
-# MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+# MTMD_API bool mtmd_support_vision(const mtmd_context * ctx);
 @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool)
 def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:
     """Check whether the current model supports vision input."""
     ...
 
 
-# MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
+# MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
 @ctypes_function("mtmd_support_audio", [mtmd_context_p_ctypes], c_bool)
 def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool:
     """Check whether MTMD supports audio."""
     ...
 
 
-# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
+# MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 @ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int)
 def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int:
     """Get the audio sample rate in Hz. Returns -1 if audio is not supported."""
@@ -418,14 +435,16 @@ def mtmd_image_tokens_get_n_tokens(image_tokens: mtmd_image_tokens_p, /) -> int:
     ...
 
 
-# MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens),
+#            "use mtmd_image_tokens_get_decoder_pos() instead");
 @ctypes_function("mtmd_image_tokens_get_nx", [mtmd_image_tokens_p_ctypes], c_size_t)
 def mtmd_image_tokens_get_nx(image_tokens: mtmd_image_tokens_p, /) -> int:
     """Get the image token grid width."""
     ...
 
 
-# MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+# DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens),
+#            "use mtmd_image_tokens_get_decoder_pos() instead");
 @ctypes_function("mtmd_image_tokens_get_ny", [mtmd_image_tokens_p_ctypes], c_size_t)
 def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p, /) -> int:
     """Get the image token grid height."""
@@ -450,6 +469,23 @@ def mtmd_image_tokens_get_n_pos(image_tokens: mtmd_image_tokens_p, /) -> int:
     ...
 
 
+# MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(
+#     const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
+@ctypes_function(
+    "mtmd_image_tokens_get_decoder_pos",
+    [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, c_size_t],
+    mtmd_decoder_pos,
+)
+def mtmd_image_tokens_get_decoder_pos(
+    image_tokens: mtmd_image_tokens_p,
+    pos_0: llama_cpp.llama_pos,
+    i: Union[c_size_t, int],
+    /,
+) -> mtmd_decoder_pos:
+    """Get decoder attention position for an image embedding token."""
+    ...
+
+
 # MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens);
 @ctypes_function(
     "mtmd_encode",
@@ -534,6 +570,23 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunks_p, /) -> int:
     ...
 
 
+# MTMD_API void mtmd_helper_image_get_decoder_pos(
+#     const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
+@ctypes_function(
+    "mtmd_helper_image_get_decoder_pos",
+    [mtmd_image_tokens_p_ctypes, llama_cpp.llama_pos, POINTER(mtmd_decoder_pos)],
+    None,
+)
+def mtmd_helper_image_get_decoder_pos(
+    image: mtmd_image_tokens_p,
+    pos_0: llama_cpp.llama_pos,
+    out_pos: "_Pointer[mtmd_decoder_pos]",
+    /,
+):
+    """Fill decoder attention positions for all image embedding tokens."""
+    ...
+
+
 # MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
 #                                          struct llama_context * lctx,
 #                                          const mtmd_input_chunks * chunks,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 227ed28e12..f535774325 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 227ed28e128e93b4d63ae5108560c550c9ab16c8
+Subproject commit f53577432541bb9edc1588c4ef45c66bf07e4468

From 511b3f414359e8d98e9123d007bdd935cd1f7c3f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:02:24 -0700
Subject: [PATCH 2/5] fix(ci): Build one arm64 py3 release wheel (#2191)

* fix(ci): Build one arm64 py3 release wheel

* docs: Update changelog for arm64 release wheel fix
---
 .github/workflows/build-and-release.yaml | 4 +++-
 CHANGELOG.md                             | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 6cbac0cb1c..039e376b69 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -82,7 +82,9 @@ jobs:
           # Keep native arm64 builds on a portable CPU baseline instead of
           # tuning wheels to the hosted runner.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
-          CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
+          # The release wheel is tagged py3-none, so one build covers all
+          # supported Python versions and avoids duplicate wheel names.
+          CIBW_BUILD: "cp38-*"
         with:
           output-dir: wheelhouse
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ea7beaaa79..fe376ebd33 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings
+- fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing
 
 ## [0.3.20]
 

From c8075d1dfe2019a0390af613419ecfaea292c9d5 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:13:13 -0700
Subject: [PATCH 3/5] chore: bump version to 0.3.21 (#2192)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe376ebd33..eeb42b6449 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.21]
+
 - feat: Update llama.cpp to ggerganov/llama.cpp@f53577432 and sync Python bindings
 - fix(ci): Build one arm64 release wheel for `py3-none` wheel publishing
 
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 83177c065d..fbad5c28b2 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.20"
+__version__ = "0.3.21"

From 195cc59a187687ca64c8e0939e5e549d456aa2fb Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:39:59 -0700
Subject: [PATCH 4/5] fix(ci): Repair py3 CPU release wheels (#2193)

---
 .github/workflows/build-and-release.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 039e376b69..f67fb558dc 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -48,7 +48,10 @@ jobs:
           CIBW_REPAIR_WHEEL_COMMAND: ""
           # Linux needs auditwheel repair so manylinux and musllinux wheels are
           # published with distinct platform tags instead of generic linux tags.
-          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair -w {dest_dir} {wheel}"
+          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
+          # The release wheel is tagged py3-none, so one build per platform
+          # covers all supported Python versions and avoids duplicate names.
+          CIBW_BUILD: "cp38-*"
           # Skip cibuildwheel's default i686 sidecar and keep Linux release
           # wheels on a portable x86_64 CPU baseline.
           CIBW_ARCHS_LINUX: "auto64"

From d2bcbac46605f11d382426dd88d67e8b5c124cd7 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 26 Apr 2026 22:55:04 -0700
Subject: [PATCH 5/5] fix(ci): Scope CPU release wheel selectors by OS (#2194)

---
 .github/workflows/build-and-release.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index f67fb558dc..df6201ee75 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -51,10 +51,13 @@ jobs:
           CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
           # The release wheel is tagged py3-none, so one build per platform
           # covers all supported Python versions and avoids duplicate names.
-          CIBW_BUILD: "cp38-*"
+          # cibuildwheel ignores CIBW_BUILD_<PLATFORM> variables, so scope the
+          # selectors by the platform part of each build identifier instead.
+          CIBW_BUILD: "cp38-manylinux_* cp38-musllinux_* cp39-macosx_* cp39-win_*"
           # Skip cibuildwheel's default i686 sidecar and keep Linux release
           # wheels on a portable x86_64 CPU baseline.
           CIBW_ARCHS_LINUX: "auto64"
+          CIBW_ARCHS_WINDOWS: "AMD64"
           CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off"
           # Keep macOS release wheels on a portable CPU baseline instead of
           # inheriting the hosted runner's native flags.