From fd1c0ec3f037f47f8ed76d99d0fd3cacdcc0baab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 18 Apr 2026 08:16:04 +0200 Subject: [PATCH 1/4] llama: fit ctx size for CPU only (#21568) --- src/llama.cpp | 157 +++++++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 58 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 484372d8d10..7d3a34e98aa 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -91,12 +91,16 @@ static std::vector llama_get_device_memory_data( throw std::runtime_error("failed to create llama_context from model"); } - std::vector ret(model->devices.size()); + const size_t nd = model->n_devices(); + std::vector ret(nd + 1); std::map memory_breakdown = ctx->memory_breakdown(); for (const auto & [buft, mb] : memory_breakdown) { if (ggml_backend_buft_is_host(buft)) { + ret.back().mb.model += mb.model; + ret.back().mb.context += mb.context; + ret.back().mb.compute += mb.compute; continue; } @@ -104,7 +108,7 @@ static std::vector llama_get_device_memory_data( if (!dev) { continue; } - for (size_t i = 0; i < ret.size(); i++) { + for (size_t i = 0; i < nd; i++) { if (model->devices[i].dev == dev) { ret[i].mb.model += mb.model; ret[i].mb.context += mb.context; @@ -113,7 +117,19 @@ static std::vector llama_get_device_memory_data( } } } - for (size_t i = 0; i < ret.size(); i++) { + + { + ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (cpu_dev == nullptr) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + size_t free; + size_t total; + ggml_backend_dev_memory(cpu_dev, &free, &total); + ret.back().free = free; + ret.back().total = total; + } + for (size_t i = 0; i < nd; i++) { size_t free; size_t total; ggml_backend_dev_memory(model->devices[i].dev, &free, &total); @@ -122,11 +138,8 @@ static std::vector llama_get_device_memory_data( // have any to report. in this case, we will use the host memory as a fallback // fixes: https://github.com/ggml-org/llama.cpp/issues/18577 if (free == 0 && total == 0) { - ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (cpu_dev == nullptr) { - throw std::runtime_error(format("%s: no CPU backend found", __func__)); - } - ggml_backend_dev_memory(cpu_dev, &free, &total); + free = ret.back().free; + total = ret.back().total; } ret[i].free = free; ret[i].total = total; @@ -180,15 +193,15 @@ static void llama_params_fit_impl( LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__); const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); const size_t nd = devs.size(); // number of devices - if (nd == 0) { - LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__); - return; - } std::vector margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits margins.reserve(nd); - for (size_t id = 0; id < nd; id++) { - margins.push_back(margins_s[id]); + if (nd == 0) { + margins.push_back(margins_s[0]); + } else { + for (size_t id = 0; id < nd; id++) { + margins.push_back(margins_s[id]); + } } std::vector dev_names; @@ -215,46 +228,59 @@ static void llama_params_fit_impl( std::vector projected_free_per_device; projected_free_per_device.reserve(nd); - if (nd > 1) { - LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); - } - for (size_t id = 0; id < nd; id++) { - const llama_device_memory_data & dmd = dmds_full[id]; - - const int64_t projected_used = dmd.mb.total(); - const int64_t projected_free = dmd.free - projected_used; - projected_free_per_device.push_back(projected_free); - - sum_free += dmd.free; - sum_projected_used += projected_used; - sum_projected_free += projected_free; - sum_projected_model += dmd.mb.model; - - if (nd > 1) { - LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", - __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); - } - } - assert(sum_free >= 0 && sum_projected_used >= 0); - LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", - __func__, sum_projected_used/MiB, sum_free/MiB); - if (nd == 1) { - if (projected_free_per_device[0] >= margins[0]) { - LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", - __func__, projected_free_per_device[0]/MiB, margins[0]/MiB); + if (nd == 0) { + sum_projected_used = dmds_full.back().mb.total(); + sum_free = dmds_full.back().total; + sum_projected_free = sum_free - sum_projected_used; + LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n", + __func__, sum_projected_used/MiB, sum_free/MiB); + if (sum_projected_free >= margins[0]) { + LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n", + __func__, sum_projected_free/MiB, margins[0]/MiB); return; } } else { - bool changes_needed = false; + if (nd > 1) { + LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); + } for (size_t id = 0; id < nd; id++) { - if (projected_free_per_device[id] < margins[id]) { - changes_needed = true; - break; + const llama_device_memory_data & dmd = dmds_full[id]; + + const int64_t projected_used = dmd.mb.total(); + const int64_t projected_free = dmd.free - projected_used; + projected_free_per_device.push_back(projected_free); + + sum_free += dmd.free; + sum_projected_used += projected_used; + sum_projected_free += projected_free; + sum_projected_model += dmd.mb.model; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", + __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); } } - if (!changes_needed) { - LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); - return; + assert(sum_free >= 0 && sum_projected_used >= 0); + LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", + __func__, sum_projected_used/MiB, sum_free/MiB); + if (nd == 1) { + if (projected_free_per_device[0] >= margins[0]) { + LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", + __func__, projected_free_per_device[0]/MiB, margins[0]/MiB); + return; + } + } else { + bool changes_needed = false; + for (size_t id = 0; id < nd; id++) { + if (projected_free_per_device[id] < margins[id]) { + changes_needed = true; + break; + } + } + if (!changes_needed) { + LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); + return; + } } } @@ -262,11 +288,15 @@ static void llama_params_fit_impl( { int64_t global_surplus = sum_projected_free; - for (size_t id = 0; id < nd; id++) { - global_surplus -= margins[id]; + if (nd == 0) { + global_surplus -= margins[0]; + } else { + for (size_t id = 0; id < nd; id++) { + global_surplus -= margins[id]; + } } if (global_surplus < 0) { - if (nd == 1) { + if (nd <= 1) { LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", __func__, margins[0]/MiB, -global_surplus/MiB); } else { @@ -277,8 +307,12 @@ static void llama_params_fit_impl( if (cparams->n_ctx == 0) { if (hp_nct > n_ctx_min) { int64_t sum_used_target = sum_free; - for (size_t id = 0; id < nd; id++) { - sum_used_target -= margins[id]; + if (nd == 0) { + sum_used_target -= margins[0]; + } else { + for (size_t id = 0; id < nd; id++) { + sum_used_target -= margins[id]; + } } if (nd > 1) { // for multiple devices we need to be more conservative in terms of how much context we think can fit: @@ -293,8 +327,12 @@ static void llama_params_fit_impl( int64_t sum_projected_used_min_ctx = 0; cparams->n_ctx = n_ctx_min; const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); - for (const auto & dmd : dmds_min_ctx) { - sum_projected_used_min_ctx += dmd.mb.total(); + if (nd == 0) { + sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total(); + } else { + for (size_t id = 0; id < nd; id++) { + sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total(); + } } if (sum_used_target > sum_projected_used_min_ctx) { // linear interpolation between minimum and maximum context size: @@ -306,7 +344,7 @@ static void llama_params_fit_impl( const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx; LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); - if (nd == 1) { + if (nd <= 1) { LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__); return; } @@ -329,6 +367,9 @@ static void llama_params_fit_impl( } } } + if (nd == 0) { + throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort"); + } if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); @@ -476,8 +517,8 @@ static void llama_params_fit_impl( std::vector ret; ret.reserve(nd); - for (const llama_device_memory_data & dmd : dmd_nl) { - ret.push_back(dmd.mb.total()); + for (size_t id = 0; id < nd; id++) { + ret.push_back(dmd_nl[id].mb.total()); } return ret; }; From 89a5474f0e7d96450adf1764a2c79f4e2d55fa8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 18 Apr 2026 09:36:41 +0200 Subject: [PATCH 2/4] convert : fix (ignore for now) typings errors (#22002) --- convert_hf_to_gguf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 42d559dfecf..2df5e94fe23 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -10912,14 +10912,14 @@ def set_vocab(self): vocab_size = -(vocab_size // -pad_vocab) * pad_vocab self.hparams["vocab_size"] = vocab_size - assert max(tokenizer.vocab.values()) < vocab_size + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] tokpre = self.get_vocab_base_pre(tokenizer) - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - added_tokens_decoder = tokenizer.added_tokens_decoder + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] for i in range(vocab_size): if i not in reverse_vocab: @@ -10930,7 +10930,7 @@ def set_vocab(self): if token in added_vocab: if not added_tokens_decoder[i].normalized: previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] if previous_token != token: logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") From 83d58e02fcf7131fa601e366bf2460ecd9db5a77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 18 Apr 2026 09:37:30 +0200 Subject: [PATCH 3/4] ci : free disk space for rocm release (#22012) --- .github/workflows/release.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8a49715b395..f1cc12cd452 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -687,6 +687,11 @@ jobs: with: fetch-depth: 0 + - name: Free up disk space + uses: ggml-org/free-disk-space@v1.3.1 + with: + tool-cache: true + - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: From 59accc8863f0e715442496300540692da54ab0c5 Mon Sep 17 00:00:00 2001 From: SamareshSingh <97642706+ssam18@users.noreply.github.com> Date: Sat, 18 Apr 2026 03:04:51 -0500 Subject: [PATCH 4/4] ggml-backend-meta: add multi-segment read support in get_tensor (#22063) --- ggml/src/ggml-backend-meta.cpp | 40 +++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index 1ee3eeb4d96..24f6bc0639d 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -1270,7 +1270,45 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co GGML_ASSERT(ggml_is_contiguous(tensor)); const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false); - GGML_ASSERT(split_state.n_segments == 1); + + if (split_state.n_segments != 1) { + GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS); + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(tensor->ne[3] == 1); + size_t offset_data = 0; + std::vector simple_offsets(n_bufs, 0); + if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) { + GGML_ASSERT(tensor->ne[2] == 1); + const int64_t blck_size = ggml_blck_size(tensor->type); + for (size_t s = 0; s < split_state.n_segments; s++) { + for (size_t j = 0; j < n_bufs; j++) { + const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); + GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0); + const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0]; + ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes, + tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]); + offset_data += nbytes; + simple_offsets[j] += nbytes; + } + } + GGML_ASSERT(offset_data*tensor->ne[1] == size); + return; + } + GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1); + for (size_t s = 0; s < split_state.n_segments; s++) { + for (size_t j = 0; j < n_bufs; j++) { + const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); + const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1]; + ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes, + tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]); + offset_data += nbytes; + simple_offsets[j] += nbytes; + } + } + GGML_ASSERT(offset_data*tensor->ne[2] == size); + return; + } switch (split_state.axis) { case GGML_BACKEND_SPLIT_AXIS_0: