From f8c1f36be8116b1213e0e77df7fa9403ba3acd59 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Sun, 10 May 2026 22:53:57 -0700
Subject: [PATCH 1/2] fix(embed): mark all tokens as output to suppress
 llama.cpp 'overriding' INFO (#2208) (#2212)

---
 CHANGELOG.md       | 1 +
 llama_cpp/llama.py | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5031e58080..808a3647d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
+- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
 
 ## [0.3.22]
 
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 752c25dd3d..2afa4c8e97 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. Otherwise llama.cpp overrides the per-token `logits[i]` values
+        # itself and logs "embeddings required but some input tokens were not marked as
+        # outputs -> overriding" once per input. The pooling type (NONE vs. MEAN/CLS)
+        # only changes how the per-token outputs are read back (see decode_batch below),
+        # not whether they are produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(

From 568411233f5f326f80c41c6e026bc80f27c00e69 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 10 May 2026 23:27:25 -0700
Subject: [PATCH 2/2] feat: update llama.cpp to 7d442abf (#2214)

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 808a3647d6..a783fab424 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf5
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
 - fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5d6f18a638..7d442abf5c 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb
+Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491