feat: update llama.cpp to 210a6570c (abetlen#2242)

abetlen · web-flow · commit 4b66c45ebc20 · 2026-06-01T19:32:51.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Update llama.cpp to ggml-org/llama.cpp@210a6570c by @abetlen in #2242
 - feat: add Gemma 4 multimodal chat support by @abetlen in #2241
 - feat(ci): add CUDA 13.0 and 13.2 wheel builds by @abetlen in #2239
 - feat(ci): add CUDA 11.8 wheel builds by @abetlen in #2238
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -903,6 +903,7 @@ class llama_sampler_seq_config(ctypes.Structure):
 #     uint32_t n_ubatch;          // physical maximum batch size
 #     uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
 #     uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
+#     uint32_t n_outputs_max;     // max outputs in a ubatch (0 = n_batch)
 #     int32_t  n_threads;         // number of threads to use for generation
 #     int32_t  n_threads_batch;   // number of threads to use for batch processing
 
@@ -958,6 +959,7 @@ class llama_context_params(ctypes.Structure):
         n_ubatch (int): physical maximum batch size
         n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
         n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback
+        n_outputs_max (int): max outputs in a ubatch, 0 = n_batch
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         ctx_type (int): context type, from `enum llama_context_type`
@@ -995,6 +997,7 @@ class llama_context_params(ctypes.Structure):
         n_ubatch: int
         n_seq_max: int
         n_rs_seq: int
+        n_outputs_max: int
         n_threads: int
         n_threads_batch: int
         ctx_type: int
@@ -1031,6 +1034,7 @@ class llama_context_params(ctypes.Structure):
         ("n_ubatch", ctypes.c_uint32),
         ("n_seq_max", ctypes.c_uint32),
         ("n_rs_seq", ctypes.c_uint32),
+        ("n_outputs_max", ctypes.c_uint32),
         ("n_threads", ctypes.c_int32),
         ("n_threads_batch", ctypes.c_int32),
         ("ctx_type", ctypes.c_int),
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit af6528e6df5d798f7f1363ec1141699be0f638e2
+Subproject commit 210a6570ceda20c5d6439172c09ada08c3754cc9