From 9d49acb2a7102db476c8e09e9322d4a9aaea3b4b Mon Sep 17 00:00:00 2001 From: Yes You Can Have Your Own <188969017+yychyo@users.noreply.github.com> Date: Mon, 20 Apr 2026 08:30:24 +0300 Subject: [PATCH 1/4] server: rename --clear-idle to --cache-idle-slots (#21741) --- common/arg.cpp | 8 ++++---- common/common.h | 2 +- tools/server/README.md | 2 +- tools/server/server-context.cpp | 12 ++++++------ tools/server/tests/unit/test_kv_keep_only_active.py | 2 +- tools/server/tests/utils.py | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 6f22f781915..43fe5a25d5d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1316,13 +1316,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( - {"--clear-idle"}, - {"--no-clear-idle"}, + {"--cache-idle-slots"}, + {"--no-cache-idle-slots"}, "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)", [](common_params & params, bool value) { - params.clear_idle = value; + params.cache_idle_slots = value; } - ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--context-shift"}, {"--no-context-shift"}, diff --git a/common/common.h b/common/common.h index 4c36e85e0bc..027339294e0 100644 --- a/common/common.h +++ b/common/common.h @@ -567,7 +567,7 @@ struct common_params { int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting bool cache_prompt = true; // whether to enable prompt caching - bool clear_idle = true; // save and clear idle slots upon starting a new task + bool cache_idle_slots = true; // save and clear idle slots upon starting a new task int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc. diff --git a/tools/server/README.md b/tools/server/README.md index 84a29cba0b6..db1f2703904 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -167,7 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | -| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CLEAR_IDLE) | +| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CACHE_IDLE_SLOTS) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 70ebcc225e3..7ffe6a30310 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -987,13 +987,13 @@ struct server_context_impl { metrics.init(); - if (params_base.clear_idle) { + if (params_base.cache_idle_slots) { if (!params_base.kv_unified) { - SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__); - params_base.clear_idle = false; + SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__); + params_base.cache_idle_slots = false; } else if (params_base.cache_ram_mib == 0) { - SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__); - params_base.clear_idle = false; + SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__); + params_base.cache_idle_slots = false; } else { SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__); SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n"); @@ -1886,7 +1886,7 @@ struct server_context_impl { break; // drop the task } - if (params_base.clear_idle) { + if (params_base.cache_idle_slots) { for (auto & s : slots) { if (!s.is_processing()) { slot_save_and_clear(s); diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index da93d50011e..f4b08b5dd0e 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -91,7 +91,7 @@ def test_clear_and_restore(): def test_disabled_with_flag(): global server - server.no_clear_idle = True + server.no_cache_idle_slots = True server.start() log = LogReader(server.log_path) diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 5ddac5be496..ddbb76c9adb 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -103,7 +103,7 @@ class ServerProcess: media_path: str | None = None sleep_idle_seconds: int | None = None cache_ram: int | None = None - no_clear_idle: bool = False + no_cache_idle_slots: bool = False log_path: str | None = None webui_mcp_proxy: bool = False @@ -242,8 +242,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds]) if self.cache_ram is not None: server_args.extend(["--cache-ram", self.cache_ram]) - if self.no_clear_idle: - server_args.append("--no-clear-idle") + if self.no_cache_idle_slots: + server_args.append("--no-cache-idle-slots") if self.webui_mcp_proxy: server_args.append("--webui-mcp-proxy") From 788fcbc5ddb16101e6388170bf84c0d10bd8c006 Mon Sep 17 00:00:00 2001 From: Katostrofik Date: Mon, 20 Apr 2026 01:39:45 -0400 Subject: [PATCH 2/4] [SYCL] Fix reorder MMVQ assert on unaligned vocab sizes (#22035) * [SYCL] Fix reorder MMVQ assert on unaligned vocab sizes The reorder mul_mat_vec_q dispatchers for Q4_0, Q8_0, Q4_K, and Q6_K asserted that block_num_y was a multiple of 16 subgroups. Models with a vocab size not divisible by 16 (for example HY-MT at 120818) aborted on model load when the output projection tripped the assert. I replaced the assert with padding: block_num_y now rounds up to a whole number of subgroup-sized workgroups. 
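As a rough sketch of the new computation (illustrative only; it assumes `GGML_SYCL_MMV_Y` is 1 on this path and uses the Intel subgroup size of 16, matching the launch helpers below):

    // old: block_num_y = ceil_div(120818, 1) = 120818, and 120818 % 16 != 0 tripped the assert
    // new: pad up to the next multiple of the subgroup count:
    //      ceil_div(120818, 1 * 16) * 16 = 7552 * 16 = 120832, i.e. 14 padded rows
    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
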
The kernel already has the row bounds check (`if (row >= nrows) return;`) so the extra padded threads early-exit cleanly. Row values are uniform across a subgroup so the collective reduce stays safe. For aligned vocab sizes the padded block_num_y equals the old value, so the kernel launch is identical and there is no regression. Thanks to @arthw for flagging the relationship to #21527. Fixes #22020. AI assisted coding, tested on Intel B70 hardware. * sycl: use WARP_SIZE for num_subgroups in reorder MMVQ launches Replaces the hardcoded 16 with WARP_SIZE in the four reorder_mul_mat_vec launch helpers (Q4_0, Q8_0, Q4_K, Q6_K). Compile-time no-op on the Intel target where WARP_SIZE is 16, but makes the relationship to subgroup size explicit. Per review by @NeoZhangJianyu on #22035. Assisted by Claude. --- ggml/src/ggml-sycl/mmvq.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index af22b98dddb..3a4577ecbbc 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -537,9 +537,9 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); + // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); @@ -682,9 +682,9 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); + // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); @@ -798,9 +798,9 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); + // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. 
+ constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); @@ -842,9 +842,9 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); + // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); From de71b5f81c3b6b9f8bdaf1b2a21198e1eede3fda Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 20 Apr 2026 08:42:37 +0300 Subject: [PATCH 3/4] server : refactor "use checkpoint" logic (#22114) --- common/arg.cpp | 2 +- common/common.cpp | 38 +++++++++++++++++++- common/common.h | 25 ++++++++++--- common/hf-cache.cpp | 4 +-- common/speculative.cpp | 62 ++++++++------------------------- common/speculative.h | 10 ------ tools/server/server-context.cpp | 44 ++++++++++------------- 7 files changed, 93 insertions(+), 92 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 43fe5a25d5d..099f0aeab24 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -292,7 +292,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa hf_tag = "default"; } - std::string model_endpoint = get_model_endpoint(); + std::string model_endpoint = common_get_model_endpoint(); auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini"; // prepare local path for caching diff --git a/common/common.cpp b/common/common.cpp index d3f1cee394c..6cde71d819a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1382,7 +1382,7 @@ common_init_result_ptr common_init_from_params(common_params & params) { common_init_result::~common_init_result() = default; -std::string get_model_endpoint() { +std::string common_get_model_endpoint() { const char * model_endpoint_env = getenv("MODEL_ENDPOINT"); // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility. 
const char * hf_endpoint_env = getenv("HF_ENDPOINT"); @@ -1397,6 +1397,42 @@ std::string get_model_endpoint() { return model_endpoint; } +common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) { + auto * mem = llama_get_memory(ctx); + if (mem == nullptr) { + return COMMON_CONTEXT_SEQ_RM_TYPE_NO; + } + + common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART; + + llama_memory_clear(mem, true); + + // eval 2 tokens to check if the context is compatible + std::vector tmp; + tmp.push_back(0); + tmp.push_back(0); + + int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size())); + if (ret != 0) { + LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret); + res = COMMON_CONTEXT_SEQ_RM_TYPE_NO; + goto done; + } + + // try to remove the last tokens + if (!llama_memory_seq_rm(mem, 0, 1, -1)) { + LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__); + res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL; + goto done; + } + +done: + llama_memory_clear(mem, true); + llama_synchronize(ctx); + + return res; +} + void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora) { std::vector loras; std::vector scales; diff --git a/common/common.h b/common/common.h index 027339294e0..cbcb3bb6586 100644 --- a/common/common.h +++ b/common/common.h @@ -308,10 +308,9 @@ struct common_params_speculative { // ngram-based speculative decoding - uint16_t ngram_size_n = 12; // ngram size for lookup - uint16_t ngram_size_m = 48; // mgram size for speculative tokens - uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed - bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models + uint16_t ngram_size_n = 12; // ngram size for lookup + uint16_t ngram_size_m = 48; // mgram size for speculative tokens + uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::shared_ptr ngram_mod; @@ -847,7 +846,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); -std::string get_model_endpoint(); +// model endpoint from env +std::string common_get_model_endpoint(); + +// +// Context utils +// + +enum common_context_seq_rm_type { + COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. 
no memory module) + COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences + COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only +}; + +// check if the llama_context can remove sequences +// note: clears the memory of the context +common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx); + // // Batch utils diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index 38a4c17a98e..ea5b2150de4 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -230,7 +230,7 @@ static nl::json api_get(const std::string & url, static std::string get_repo_commit(const std::string & repo_id, const std::string & token) { try { - auto endpoint = get_model_endpoint(); + auto endpoint = common_get_model_endpoint(); auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token); if (!json.is_object() || @@ -308,7 +308,7 @@ hf_files get_repo_files(const std::string & repo_id, hf_files files; try { - auto endpoint = get_model_endpoint(); + auto endpoint = common_get_model_endpoint(); auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token); if (!json.is_array()) { diff --git a/common/speculative.cpp b/common/speculative.cpp index 1789560eeaa..daa2b5a8ac9 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -164,8 +164,8 @@ struct common_speculative_state_draft : public common_speculative_state { llama_context * ctx_tgt; // only used for retokenizing from ctx_dft llama_context * ctx_dft; + bool use_ckpt = false; struct common_speculative_checkpoint ckpt; - bool use_checkpoint; common_sampler * smpl; @@ -180,11 +180,11 @@ struct common_speculative_state_draft : public common_speculative_state { llama_context * ctx_tgt, llama_context * ctx_dft, const std::vector> & replacements, - bool use_checkpoint) + bool use_ckpt) : common_speculative_state(type) , ctx_tgt(ctx_tgt) , ctx_dft(ctx_dft) - , use_checkpoint(use_checkpoint) + , use_ckpt(use_ckpt) { batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); smpl = nullptr; @@ -239,7 +239,7 @@ struct common_speculative_state_draft : public common_speculative_state { } void begin(const llama_tokens & prompt) override { - if (use_checkpoint && ckpt.size() > 0) { + if (use_ckpt && ckpt.size() > 0) { // delete checkpoint LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n", __func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024); @@ -351,7 +351,7 @@ struct common_speculative_state_draft : public common_speculative_state { LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n", __func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size()); - if (use_checkpoint && ckpt.ckpt_size == 0 && reuse_n > 0) { + if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) { LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n", __func__, reuse_i, reuse_n); reuse_i = 0; @@ -361,8 +361,8 @@ struct common_speculative_state_draft : public common_speculative_state { result.clear(); result.reserve(params.n_max); - bool needs_ckpt = use_checkpoint && prompt_dft.size() > 0; - if (reuse_n == 0 || (use_checkpoint && reuse_i > 0)) { + bool needs_ckpt = use_ckpt && prompt_dft.size() > 0; + if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) { llama_memory_clear(mem_dft, false); prompt_dft.clear(); } else { @@ -400,7 +400,7 @@ struct common_speculative_state_draft : public common_speculative_state { } if (reuse_n < (int) 
prompt_dft.size() || do_restore) { - if (use_checkpoint) { + if (use_ckpt) { if (ckpt.n_tokens > (int64_t) prompt_dft.size()) { LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n", __func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size()); @@ -912,42 +912,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string return it->second; } -common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt) { - auto * mem = llama_get_memory(ctx_tgt); - if (mem == nullptr) { - return COMMON_SPECULATIVE_COMPAT_TYPE_NO; - } - - common_speculative_compat_type res = COMMON_SPECULATIVE_COMPAT_TYPE_FULL; - - llama_memory_clear(mem, true); - - // eval 2 tokens to check if the context is compatible - std::vector tmp; - tmp.push_back(0); - tmp.push_back(0); - - int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size())); - if (ret != 0) { - LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret); - res = COMMON_SPECULATIVE_COMPAT_TYPE_NO; - goto done; - } - - // try to remove the last tokens - if (!llama_memory_seq_rm(mem, 0, 1, -1)) { - LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__); - res = COMMON_SPECULATIVE_COMPAT_TYPE_CKPT; - goto done; - } - -done: - llama_memory_clear(mem, true); - llama_synchronize(ctx_tgt); - - return res; -} - // initialization of the speculative decoding system // common_speculative * common_speculative_init( @@ -1022,11 +986,13 @@ common_speculative * common_speculative_init( case COMMON_SPECULATIVE_TYPE_NONE: break; case COMMON_SPECULATIVE_TYPE_DRAFT: { + const bool use_ckpt = common_context_can_seq_rm(ctx_dft) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; + impls.push_back(std::make_unique(config.type, - /* .ctx_tgt = */ ctx_tgt, - /* .ctx_dft = */ ctx_dft, - /* .replacements = */ params.replacements, - /* .use_checkpoint= */ params.use_checkpoints // TODO: this should be based on the draft model! 
+ /* .ctx_tgt = */ ctx_tgt, + /* .ctx_dft = */ ctx_dft, + /* .replacements = */ params.replacements, + /* .use_ckpt = */ use_ckpt )); break; } diff --git a/common/speculative.h b/common/speculative.h index cbe6e5bdb73..bca78d32b5b 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -14,16 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string // convert type to string std::string common_speculative_type_to_str(enum common_speculative_type type); -enum common_speculative_compat_type { - COMMON_SPECULATIVE_COMPAT_TYPE_NO = 0, - COMMON_SPECULATIVE_COMPAT_TYPE_FULL = 1, - COMMON_SPECULATIVE_COMPAT_TYPE_CKPT = 2, -}; - -// check if the llama_context is compatible for speculative decoding -// note: clears the memory of the context -common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt); - common_speculative * common_speculative_init( common_params_speculative & params, llama_context * ctx_tgt); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 7ffe6a30310..99856e6c3ac 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -78,9 +78,10 @@ enum server_state { struct server_slot { int id; - // TODO: change to unique_ptrs for consistency: llama_context * ctx = nullptr; + common_context_seq_rm_type ctx_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO; + // multimodal mtmd_context * mctx = nullptr; @@ -90,7 +91,6 @@ struct server_slot { server_prompt_checkpoint spec_ckpt; common_speculative_ptr spec; - // TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state // see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837 std::unique_ptr task; @@ -343,7 +343,7 @@ struct server_slot { if (!spec_draft.empty()) { // we have a previous (partial) draft to reuse - if (task->params.speculative.use_checkpoints) { + if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) { GGML_ASSERT(!spec_ckpt.empty()); } } else { @@ -362,15 +362,13 @@ struct server_slot { spec_draft.clear(); } - if (!spec_draft.empty() && params_spec.use_checkpoints) { + if (!spec_draft.empty() && ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) { const auto n_tokens = prompt.tokens.size(); - auto & ckpt = spec_ckpt; - - ckpt = server_get_checkpoint(ctx, this->id, n_tokens); + spec_ckpt = server_get_checkpoint(ctx, this->id, n_tokens); SLT_DBG(*this, "created speculative checkpoint (pos_min = %d, pos_max = %d, n_tokens = %zu, size = %.3f MiB)\n", - ckpt.pos_min, ckpt.pos_max, n_tokens, (float) ckpt.data.size() / 1024 / 1024); + spec_ckpt.pos_min, spec_ckpt.pos_max, n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024); } } @@ -871,14 +869,13 @@ struct server_context_impl { slots.clear(); - const auto spec_type = common_speculative_is_compat(ctx); - if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_NO) { + const auto ctx_seq_rm_type = common_context_can_seq_rm(ctx); + if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_NO) { SRV_WRN("%s", "speculative decoding not supported by this context\n"); } - if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_CKPT) { + if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) { SRV_WRN("%s", "speculative decoding will use checkpoints\n"); - params_base.speculative.use_checkpoints = true; } // initialize slots @@ -893,11 +890,13 @@ struct server_context_impl { slot.ctx = ctx; slot.n_ctx = n_ctx_slot; + slot.ctx_seq_rm_type = ctx_seq_rm_type; + slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != 
nullptr; // try speculative decoding - if (spec_type != COMMON_SPECULATIVE_COMPAT_TYPE_NO) { + if (ctx_seq_rm_type != COMMON_CONTEXT_SEQ_RM_TYPE_NO) { slot.spec.reset(common_speculative_init(params_base.speculative, slot.ctx)); if (slot.spec) { @@ -2588,15 +2587,11 @@ struct server_context_impl { // make a checkpoint of the parts of the memory that cannot be rolled back. // checkpoints are created only if: + // - the model does not support partial sequence removal // - the model uses SWA and we are not using `swa_full` - // - the model architecture is marked as recurrent or hybrid - // - // TODO: try to make this conditional on the context or the memory module, instead of the model type do_checkpoint = do_checkpoint && ( - llama_model_is_recurrent(model) || - llama_model_is_hybrid(model) || - (llama_model_n_swa(model) > 0 && !params_base.swa_full) - ); + (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) || + (llama_model_n_swa(model) > 0 && !params_base.swa_full)); bool has_mtmd = false; @@ -2965,8 +2960,6 @@ struct server_context_impl { // verify and try to accept the draft { - const auto & params_spec = slot.task->params.speculative; - common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get())); GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1); @@ -2979,13 +2972,14 @@ struct server_context_impl { // check for partial draft acceptance if (accepted.size() < slot.spec_draft.size() + 1) { - if (params_spec.use_checkpoints) { + if (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) { // partial acceptance is not supported by the context -> truncate the draft and restore the state slot.spec_draft = std::move(accepted); - auto & ckpt = slot.spec_ckpt; + const auto & ckpt = slot.spec_ckpt; - SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size()); + SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", + ckpt.pos_min, ckpt.pos_max, ckpt.size()); const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); if (n != ckpt.size()) { From 81df3f7cfaa6f99de14e792b38d5771bf427383e Mon Sep 17 00:00:00 2001 From: SamareshSingh <97642706+ssam18@users.noreply.github.com> Date: Mon, 20 Apr 2026 02:32:46 -0500 Subject: [PATCH 4/4] fix: GLM-DSA crash in llama-tokenize when using vocab_only (#22102) * llama: fix crash in print_info for GLM-DSA when vocab_only is set * addressed code review comments * cont : simplify --------- Co-authored-by: Georgi Gerganov --- src/llama-model.cpp | 186 ++++++++++++++++++++++---------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4ded484dd1c..5f543e762c3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -8180,114 +8180,114 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out); size_t i = 0; - for (auto label : classifier_labels) { + for (const auto & label : classifier_labels) { LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str()); } } - } - if (arch == LLM_ARCH_MAMBA || - arch == LLM_ARCH_MAMBA2 || - arch == LLM_ARCH_JAMBA || - arch == LLM_ARCH_FALCON_H1 || - arch == LLM_ARCH_PLAMO2 || - arch == LLM_ARCH_GRANITE_HYBRID || - arch == LLM_ARCH_QWEN3NEXT || - arch == LLM_ARCH_QWEN35 || - arch == LLM_ARCH_QWEN35MOE || - arch == LLM_ARCH_NEMOTRON_H || - arch == LLM_ARCH_NEMOTRON_H_MOE) { - 
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); - LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); - LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); - LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); - LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group); - LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); - } + if (arch == LLM_ARCH_MAMBA || + arch == LLM_ARCH_MAMBA2 || + arch == LLM_ARCH_JAMBA || + arch == LLM_ARCH_FALCON_H1 || + arch == LLM_ARCH_PLAMO2 || + arch == LLM_ARCH_GRANITE_HYBRID || + arch == LLM_ARCH_QWEN3NEXT || + arch == LLM_ARCH_QWEN35 || + arch == LLM_ARCH_QWEN35MOE || + arch == LLM_ARCH_NEMOTRON_H || + arch == LLM_ARCH_NEMOTRON_H_MOE) { + LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); + LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); + LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); + LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); + LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group); + LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); + } - LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str()); - if (pimpl->n_elements >= 1e12) { - LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12); - } else if (pimpl->n_elements >= 1e9) { - LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9); - } else if (pimpl->n_elements >= 1e6) { - LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6); - } else { - LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3); - } + LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str()); + if (pimpl->n_elements >= 1e12) { + LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12); + } else if (pimpl->n_elements >= 1e9) { + LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9); + } else if (pimpl->n_elements >= 1e6) { + LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6); + } else { + LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3); + } - // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str()); + // general kv + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str()); - if (arch == LLM_ARCH_DEEPSEEK) { - LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); - LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); - } + if (arch == LLM_ARCH_DEEPSEEK) { + LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + } - if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { - LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); - LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); - 
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); - LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla()); - LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla()); - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); - LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); - LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); - LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - } + if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { + LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); + LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); + LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); + LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla()); + LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla()); + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); + LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); + } - if (arch == LLM_ARCH_QWEN2MOE) { - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); - } + if (arch == LLM_ARCH_QWEN2MOE) { + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); + } - if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) { - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - } + if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) { + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + } - if (arch == LLM_ARCH_MINICPM || - arch == LLM_ARCH_GRANITE || - arch == LLM_ARCH_GRANITE_MOE || - arch == LLM_ARCH_GRANITE_HYBRID || - arch == LLM_ARCH_NEMOTRON_H_MOE) { - LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); - LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); - LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); - LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); - } + if (arch == LLM_ARCH_MINICPM || + arch == LLM_ARCH_GRANITE || + arch == LLM_ARCH_GRANITE_MOE || + arch == LLM_ARCH_GRANITE_HYBRID || + arch == LLM_ARCH_NEMOTRON_H_MOE) { + LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); + LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); + LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); + LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); + } - if (arch == 
LLM_ARCH_BAILINGMOE) { - LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); - LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); - LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); - } + if (arch == LLM_ARCH_BAILINGMOE) { + LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); + } - if (arch == LLM_ARCH_BAILINGMOE2) { - LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); - LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); - LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); - LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); - LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); - } + if (arch == LLM_ARCH_BAILINGMOE2) { + LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); + LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); + LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); + } - if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) { - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - } + if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) { + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); + } - if (arch == LLM_ARCH_GROVEMOE) { - LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); - LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp); - LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts); - LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale); + if (arch == LLM_ARCH_GROVEMOE) { + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + 
LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp); + LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts); + LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale); + } } vocab.print_info();