From 9d49acb2a7102db476c8e09e9322d4a9aaea3b4b Mon Sep 17 00:00:00 2001
From: Yes You Can Have Your Own <188969017+yychyo@users.noreply.github.com>
Date: Mon, 20 Apr 2026 08:30:24 +0300
Subject: [PATCH 1/4] server: rename --clear-idle to --cache-idle-slots
(#21741)
---
common/arg.cpp | 8 ++++----
common/common.h | 2 +-
tools/server/README.md | 2 +-
tools/server/server-context.cpp | 12 ++++++------
tools/server/tests/unit/test_kv_keep_only_active.py | 2 +-
tools/server/tests/utils.py | 6 +++---
6 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/common/arg.cpp b/common/arg.cpp
index 6f22f781915..43fe5a25d5d 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1316,13 +1316,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
- {"--clear-idle"},
- {"--no-clear-idle"},
+ {"--cache-idle-slots"},
+ {"--no-cache-idle-slots"},
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
[](common_params & params, bool value) {
- params.clear_idle = value;
+ params.cache_idle_slots = value;
}
- ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
diff --git a/common/common.h b/common/common.h
index 4c36e85e0bc..027339294e0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -567,7 +567,7 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
- bool clear_idle = true; // save and clear idle slots upon starting a new task
+ bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
diff --git a/tools/server/README.md b/tools/server/README.md
index 84a29cba0b6..db1f2703904 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -167,7 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
-| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
+| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 70ebcc225e3..7ffe6a30310 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -987,13 +987,13 @@ struct server_context_impl {
metrics.init();
- if (params_base.clear_idle) {
+ if (params_base.cache_idle_slots) {
if (!params_base.kv_unified) {
- SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
- params_base.clear_idle = false;
+ SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
+ params_base.cache_idle_slots = false;
} else if (params_base.cache_ram_mib == 0) {
- SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
- params_base.clear_idle = false;
+ SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
+ params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
@@ -1886,7 +1886,7 @@ struct server_context_impl {
break; // drop the task
}
- if (params_base.clear_idle) {
+ if (params_base.cache_idle_slots) {
for (auto & s : slots) {
if (!s.is_processing()) {
slot_save_and_clear(s);
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index da93d50011e..f4b08b5dd0e 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -91,7 +91,7 @@ def test_clear_and_restore():
def test_disabled_with_flag():
global server
- server.no_clear_idle = True
+ server.no_cache_idle_slots = True
server.start()
log = LogReader(server.log_path)
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index 5ddac5be496..ddbb76c9adb 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -103,7 +103,7 @@ class ServerProcess:
media_path: str | None = None
sleep_idle_seconds: int | None = None
cache_ram: int | None = None
- no_clear_idle: bool = False
+ no_cache_idle_slots: bool = False
log_path: str | None = None
webui_mcp_proxy: bool = False
@@ -242,8 +242,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.cache_ram is not None:
server_args.extend(["--cache-ram", self.cache_ram])
- if self.no_clear_idle:
- server_args.append("--no-clear-idle")
+ if self.no_cache_idle_slots:
+ server_args.append("--no-cache-idle-slots")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")
From 788fcbc5ddb16101e6388170bf84c0d10bd8c006 Mon Sep 17 00:00:00 2001
From: Katostrofik
Date: Mon, 20 Apr 2026 01:39:45 -0400
Subject: [PATCH 2/4] [SYCL] Fix reorder MMVQ assert on unaligned vocab sizes
(#22035)
* [SYCL] Fix reorder MMVQ assert on unaligned vocab sizes
The reorder mul_mat_vec_q dispatchers for Q4_0, Q8_0, Q4_K, and Q6_K
asserted that block_num_y was a multiple of 16 subgroups. Models with
a vocab size not divisible by 16 (for example HY-MT at 120818) aborted
on model load when the output projection tripped the assert.
I replaced the assert with padding: block_num_y now rounds up to a
whole number of subgroup-sized workgroups. The kernel already has the
row bounds check (`if (row >= nrows) return;`), so the extra padded
threads early-exit cleanly. Row values are uniform across a subgroup,
so the collective reduce stays safe.
For aligned vocab sizes the padded block_num_y equals the old value,
so the kernel launch is identical and there is no regression.
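A rough worked example (not part of the change itself; it assumes the
default GGML_SYCL_MMV_Y = 1 and a 16-wide subgroup, i.e. WARP_SIZE = 16)
for the HY-MT case with nrows = 120818:
    old: block_num_y = ceil_div(120818, 1)           = 120818      (120818 % 16 != 0 -> assert)
    new: block_num_y = ceil_div(120818, 1 * 16) * 16 = 7552 * 16   = 120832
The 14 padded rows (120832 - 120818) land on the bounds check above and
return without doing any work.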
Thanks to @arthw for flagging the relationship to #21527.
Fixes #22020.
AI assisted coding, tested on Intel B70 hardware.
* sycl: use WARP_SIZE for num_subgroups in reorder MMVQ launches
Replaces the hardcoded 16 with WARP_SIZE in the four reorder_mul_mat_vec
launch helpers (Q4_0, Q8_0, Q4_K, Q6_K). This is a compile-time no-op on
the Intel target, where WARP_SIZE is 16, but it makes the relationship to
the subgroup size explicit. Per review by @NeoZhangJianyu on #22035.
Assisted by Claude.
---
ggml/src/ggml-sycl/mmvq.cpp | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index af22b98dddb..3a4577ecbbc 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -537,9 +537,9 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK4_0 == 0);
- const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
- constexpr size_t num_subgroups = 16;
- GGML_ASSERT(block_num_y % num_subgroups == 0);
+ // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
+ constexpr size_t num_subgroups = WARP_SIZE;
+ const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
@@ -682,9 +682,9 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK8_0 == 0);
- const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
- constexpr size_t num_subgroups = 16;
- GGML_ASSERT(block_num_y % num_subgroups == 0);
+ // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
+ constexpr size_t num_subgroups = WARP_SIZE;
+ const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
@@ -798,9 +798,9 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
- constexpr size_t num_subgroups = 16;
- GGML_ASSERT(block_num_y % num_subgroups == 0);
+ // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
+ constexpr size_t num_subgroups = WARP_SIZE;
+ const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
@@ -842,9 +842,9 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
- constexpr size_t num_subgroups = 16;
- GGML_ASSERT(block_num_y % num_subgroups == 0);
+ // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
+ constexpr size_t num_subgroups = WARP_SIZE;
+ const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
From de71b5f81c3b6b9f8bdaf1b2a21198e1eede3fda Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 20 Apr 2026 08:42:37 +0300
Subject: [PATCH 3/4] server : refactor "use checkpoint" logic (#22114)
---
common/arg.cpp | 2 +-
common/common.cpp | 38 +++++++++++++++++++-
common/common.h | 25 ++++++++++---
common/hf-cache.cpp | 4 +--
common/speculative.cpp | 62 ++++++++-------------------------
common/speculative.h | 10 ------
tools/server/server-context.cpp | 44 ++++++++++-------------
7 files changed, 93 insertions(+), 92 deletions(-)
diff --git a/common/arg.cpp b/common/arg.cpp
index 43fe5a25d5d..099f0aeab24 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -292,7 +292,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
hf_tag = "default";
}
- std::string model_endpoint = get_model_endpoint();
+ std::string model_endpoint = common_get_model_endpoint();
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
// prepare local path for caching
diff --git a/common/common.cpp b/common/common.cpp
index d3f1cee394c..6cde71d819a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1382,7 +1382,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result::~common_init_result() = default;
-std::string get_model_endpoint() {
+std::string common_get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
@@ -1397,6 +1397,42 @@ std::string get_model_endpoint() {
return model_endpoint;
}
+common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
+ auto * mem = llama_get_memory(ctx);
+ if (mem == nullptr) {
+ return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
+ }
+
+ common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
+
+ llama_memory_clear(mem, true);
+
+ // eval 2 tokens to check if the context is compatible
+ std::vector<llama_token> tmp;
+ tmp.push_back(0);
+ tmp.push_back(0);
+
+ int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
+ if (ret != 0) {
+ LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
+ res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
+ goto done;
+ }
+
+ // try to remove the last tokens
+ if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
+ LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
+ res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
+ goto done;
+ }
+
+done:
+ llama_memory_clear(mem, true);
+ llama_synchronize(ctx);
+
+ return res;
+}
+
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
std::vector loras;
std::vector scales;
diff --git a/common/common.h b/common/common.h
index 027339294e0..cbcb3bb6586 100644
--- a/common/common.h
+++ b/common/common.h
@@ -308,10 +308,9 @@ struct common_params_speculative {
// ngram-based speculative decoding
- uint16_t ngram_size_n = 12; // ngram size for lookup
- uint16_t ngram_size_m = 48; // mgram size for speculative tokens
- uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
- bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models
+ uint16_t ngram_size_n = 12; // ngram size for lookup
+ uint16_t ngram_size_m = 48; // mgram size for speculative tokens
+ uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr ngram_mod;
@@ -847,7 +846,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
-std::string get_model_endpoint();
+// model endpoint from env
+std::string common_get_model_endpoint();
+
+//
+// Context utils
+//
+
+enum common_context_seq_rm_type {
+ COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module)
+ COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
+ COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
+};
+
+// check if the llama_context can remove sequences
+// note: clears the memory of the context
+common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
+
//
// Batch utils
diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp
index 38a4c17a98e..ea5b2150de4 100644
--- a/common/hf-cache.cpp
+++ b/common/hf-cache.cpp
@@ -230,7 +230,7 @@ static nl::json api_get(const std::string & url,
static std::string get_repo_commit(const std::string & repo_id,
const std::string & token) {
try {
- auto endpoint = get_model_endpoint();
+ auto endpoint = common_get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
if (!json.is_object() ||
@@ -308,7 +308,7 @@ hf_files get_repo_files(const std::string & repo_id,
hf_files files;
try {
- auto endpoint = get_model_endpoint();
+ auto endpoint = common_get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
if (!json.is_array()) {
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 1789560eeaa..daa2b5a8ac9 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -164,8 +164,8 @@ struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
llama_context * ctx_dft;
+ bool use_ckpt = false;
struct common_speculative_checkpoint ckpt;
- bool use_checkpoint;
common_sampler * smpl;
@@ -180,11 +180,11 @@ struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_tgt,
llama_context * ctx_dft,
const std::vector<std::pair<std::string, std::string>> & replacements,
- bool use_checkpoint)
+ bool use_ckpt)
: common_speculative_state(type)
, ctx_tgt(ctx_tgt)
, ctx_dft(ctx_dft)
- , use_checkpoint(use_checkpoint)
+ , use_ckpt(use_ckpt)
{
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
smpl = nullptr;
@@ -239,7 +239,7 @@ struct common_speculative_state_draft : public common_speculative_state {
}
void begin(const llama_tokens & prompt) override {
- if (use_checkpoint && ckpt.size() > 0) {
+ if (use_ckpt && ckpt.size() > 0) {
// delete checkpoint
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
@@ -351,7 +351,7 @@ struct common_speculative_state_draft : public common_speculative_state {
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
- if (use_checkpoint && ckpt.ckpt_size == 0 && reuse_n > 0) {
+ if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
__func__, reuse_i, reuse_n);
reuse_i = 0;
@@ -361,8 +361,8 @@ struct common_speculative_state_draft : public common_speculative_state {
result.clear();
result.reserve(params.n_max);
- bool needs_ckpt = use_checkpoint && prompt_dft.size() > 0;
- if (reuse_n == 0 || (use_checkpoint && reuse_i > 0)) {
+ bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
+ if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
} else {
@@ -400,7 +400,7 @@ struct common_speculative_state_draft : public common_speculative_state {
}
if (reuse_n < (int) prompt_dft.size() || do_restore) {
- if (use_checkpoint) {
+ if (use_ckpt) {
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
@@ -912,42 +912,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}
-common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt) {
- auto * mem = llama_get_memory(ctx_tgt);
- if (mem == nullptr) {
- return COMMON_SPECULATIVE_COMPAT_TYPE_NO;
- }
-
- common_speculative_compat_type res = COMMON_SPECULATIVE_COMPAT_TYPE_FULL;
-
- llama_memory_clear(mem, true);
-
- // eval 2 tokens to check if the context is compatible
- std::vector<llama_token> tmp;
- tmp.push_back(0);
- tmp.push_back(0);
-
- int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
- if (ret != 0) {
- LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
- res = COMMON_SPECULATIVE_COMPAT_TYPE_NO;
- goto done;
- }
-
- // try to remove the last tokens
- if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
- LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
- res = COMMON_SPECULATIVE_COMPAT_TYPE_CKPT;
- goto done;
- }
-
-done:
- llama_memory_clear(mem, true);
- llama_synchronize(ctx_tgt);
-
- return res;
-}
-
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
@@ -1022,11 +986,13 @@ common_speculative * common_speculative_init(
case COMMON_SPECULATIVE_TYPE_NONE:
break;
case COMMON_SPECULATIVE_TYPE_DRAFT: {
+ const bool use_ckpt = common_context_can_seq_rm(ctx_dft) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
+
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
- /* .ctx_tgt = */ ctx_tgt,
- /* .ctx_dft = */ ctx_dft,
- /* .replacements = */ params.replacements,
- /* .use_checkpoint= */ params.use_checkpoints // TODO: this should be based on the draft model!
+ /* .ctx_tgt = */ ctx_tgt,
+ /* .ctx_dft = */ ctx_dft,
+ /* .replacements = */ params.replacements,
+ /* .use_ckpt = */ use_ckpt
));
break;
}
diff --git a/common/speculative.h b/common/speculative.h
index cbe6e5bdb73..bca78d32b5b 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -14,16 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
-enum common_speculative_compat_type {
- COMMON_SPECULATIVE_COMPAT_TYPE_NO = 0,
- COMMON_SPECULATIVE_COMPAT_TYPE_FULL = 1,
- COMMON_SPECULATIVE_COMPAT_TYPE_CKPT = 2,
-};
-
-// check if the llama_context is compatible for speculative decoding
-// note: clears the memory of the context
-common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt);
-
common_speculative * common_speculative_init(
common_params_speculative & params,
llama_context * ctx_tgt);
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 7ffe6a30310..99856e6c3ac 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -78,9 +78,10 @@ enum server_state {
struct server_slot {
int id;
- // TODO: change to unique_ptrs for consistency:
llama_context * ctx = nullptr;
+ common_context_seq_rm_type ctx_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
+
// multimodal
mtmd_context * mctx = nullptr;
@@ -90,7 +91,6 @@ struct server_slot {
server_prompt_checkpoint spec_ckpt;
common_speculative_ptr spec;
-
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
std::unique_ptr task;
@@ -343,7 +343,7 @@ struct server_slot {
if (!spec_draft.empty()) {
// we have a previous (partial) draft to reuse
- if (task->params.speculative.use_checkpoints) {
+ if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
GGML_ASSERT(!spec_ckpt.empty());
}
} else {
@@ -362,15 +362,13 @@ struct server_slot {
spec_draft.clear();
}
- if (!spec_draft.empty() && params_spec.use_checkpoints) {
+ if (!spec_draft.empty() && ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
const auto n_tokens = prompt.tokens.size();
- auto & ckpt = spec_ckpt;
-
- ckpt = server_get_checkpoint(ctx, this->id, n_tokens);
+ spec_ckpt = server_get_checkpoint(ctx, this->id, n_tokens);
SLT_DBG(*this, "created speculative checkpoint (pos_min = %d, pos_max = %d, n_tokens = %zu, size = %.3f MiB)\n",
- ckpt.pos_min, ckpt.pos_max, n_tokens, (float) ckpt.data.size() / 1024 / 1024);
+ spec_ckpt.pos_min, spec_ckpt.pos_max, n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
}
}
@@ -871,14 +869,13 @@ struct server_context_impl {
slots.clear();
- const auto spec_type = common_speculative_is_compat(ctx);
- if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
+ const auto ctx_seq_rm_type = common_context_can_seq_rm(ctx);
+ if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
SRV_WRN("%s", "speculative decoding not supported by this context\n");
}
- if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_CKPT) {
+ if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
SRV_WRN("%s", "speculative decoding will use checkpoints\n");
- params_base.speculative.use_checkpoints = true;
}
// initialize slots
@@ -893,11 +890,13 @@ struct server_context_impl {
slot.ctx = ctx;
slot.n_ctx = n_ctx_slot;
+ slot.ctx_seq_rm_type = ctx_seq_rm_type;
+
slot.mctx = mctx;
slot.prompt.tokens.has_mtmd = mctx != nullptr;
// try speculative decoding
- if (spec_type != COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
+ if (ctx_seq_rm_type != COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
slot.spec.reset(common_speculative_init(params_base.speculative, slot.ctx));
if (slot.spec) {
@@ -2588,15 +2587,11 @@ struct server_context_impl {
// make a checkpoint of the parts of the memory that cannot be rolled back.
// checkpoints are created only if:
+ // - the model does not support partial sequence removal
// - the model uses SWA and we are not using `swa_full`
- // - the model architecture is marked as recurrent or hybrid
- //
- // TODO: try to make this conditional on the context or the memory module, instead of the model type
do_checkpoint = do_checkpoint && (
- llama_model_is_recurrent(model) ||
- llama_model_is_hybrid(model) ||
- (llama_model_n_swa(model) > 0 && !params_base.swa_full)
- );
+ (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
+ (llama_model_n_swa(model) > 0 && !params_base.swa_full));
bool has_mtmd = false;
@@ -2965,8 +2960,6 @@ struct server_context_impl {
// verify and try to accept the draft
{
- const auto & params_spec = slot.task->params.speculative;
-
common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get()));
GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1);
@@ -2979,13 +2972,14 @@ struct server_context_impl {
// check for partial draft acceptance
if (accepted.size() < slot.spec_draft.size() + 1) {
- if (params_spec.use_checkpoints) {
+ if (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
// partial acceptance is not supported by the context -> truncate the draft and restore the state
slot.spec_draft = std::move(accepted);
- auto & ckpt = slot.spec_ckpt;
+ const auto & ckpt = slot.spec_ckpt;
- SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size());
+ SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n",
+ ckpt.pos_min, ckpt.pos_max, ckpt.size());
const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt.size()) {
From 81df3f7cfaa6f99de14e792b38d5771bf427383e Mon Sep 17 00:00:00 2001
From: SamareshSingh <97642706+ssam18@users.noreply.github.com>
Date: Mon, 20 Apr 2026 02:32:46 -0500
Subject: [PATCH 4/4] fix: GLM-DSA crash in llama-tokenize when using
vocab_only (#22102)
* llama: fix crash in print_info for GLM-DSA when vocab_only is set
* addressed code review comments
* cont : simplify
---------
Co-authored-by: Georgi Gerganov
---
src/llama-model.cpp | 186 ++++++++++++++++++++++----------------------
1 file changed, 93 insertions(+), 93 deletions(-)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4ded484dd1c..5f543e762c3 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -8180,114 +8180,114 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
size_t i = 0;
- for (auto label : classifier_labels) {
+ for (const auto & label : classifier_labels) {
LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
}
}
- }
- if (arch == LLM_ARCH_MAMBA ||
- arch == LLM_ARCH_MAMBA2 ||
- arch == LLM_ARCH_JAMBA ||
- arch == LLM_ARCH_FALCON_H1 ||
- arch == LLM_ARCH_PLAMO2 ||
- arch == LLM_ARCH_GRANITE_HYBRID ||
- arch == LLM_ARCH_QWEN3NEXT ||
- arch == LLM_ARCH_QWEN35 ||
- arch == LLM_ARCH_QWEN35MOE ||
- arch == LLM_ARCH_NEMOTRON_H ||
- arch == LLM_ARCH_NEMOTRON_H_MOE) {
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
- LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
- LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
- }
+ if (arch == LLM_ARCH_MAMBA ||
+ arch == LLM_ARCH_MAMBA2 ||
+ arch == LLM_ARCH_JAMBA ||
+ arch == LLM_ARCH_FALCON_H1 ||
+ arch == LLM_ARCH_PLAMO2 ||
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_QWEN3NEXT ||
+ arch == LLM_ARCH_QWEN35 ||
+ arch == LLM_ARCH_QWEN35MOE ||
+ arch == LLM_ARCH_NEMOTRON_H ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
+ }
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
- if (pimpl->n_elements >= 1e12) {
- LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
- } else if (pimpl->n_elements >= 1e9) {
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
- } else if (pimpl->n_elements >= 1e6) {
- LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
- } else {
- LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
- }
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
+ if (pimpl->n_elements >= 1e12) {
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+ } else if (pimpl->n_elements >= 1e9) {
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+ } else if (pimpl->n_elements >= 1e6) {
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
+ } else {
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
+ }
- // general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
+ // general kv
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
- if (arch == LLM_ARCH_DEEPSEEK) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
- }
+ if (arch == LLM_ARCH_DEEPSEEK) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ }
- if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
- LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
- LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
- LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
- }
+ if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ }
- if (arch == LLM_ARCH_QWEN2MOE) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
- }
+ if (arch == LLM_ARCH_QWEN2MOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ }
- if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- }
+ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ }
- if (arch == LLM_ARCH_MINICPM ||
- arch == LLM_ARCH_GRANITE ||
- arch == LLM_ARCH_GRANITE_MOE ||
- arch == LLM_ARCH_GRANITE_HYBRID ||
- arch == LLM_ARCH_NEMOTRON_H_MOE) {
- LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
- LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
- LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
- }
+ if (arch == LLM_ARCH_MINICPM ||
+ arch == LLM_ARCH_GRANITE ||
+ arch == LLM_ARCH_GRANITE_MOE ||
+ arch == LLM_ARCH_GRANITE_HYBRID ||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ }
- if (arch == LLM_ARCH_BAILINGMOE) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
- }
+ if (arch == LLM_ARCH_BAILINGMOE) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ }
- if (arch == LLM_ARCH_BAILINGMOE2) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
- LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
- }
+ if (arch == LLM_ARCH_BAILINGMOE2) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
+ }
- if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
- }
+ if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ }
- if (arch == LLM_ARCH_GROVEMOE) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
- LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
- LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+ if (arch == LLM_ARCH_GROVEMOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+ }
}
vocab.print_info();