5 changes: 5 additions & 0 deletions .github/workflows/release.yml
@@ -687,6 +687,11 @@ jobs:
      with:
        fetch-depth: 0

+     - name: Free up disk space
+       uses: ggml-org/free-disk-space@v1.3.1
+       with:
+         tool-cache: true
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
10 changes: 5 additions & 5 deletions convert_hf_to_gguf.py
@@ -10912,14 +10912,14 @@ def set_vocab(self):
        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
        self.hparams["vocab_size"] = vocab_size

-       assert max(tokenizer.vocab.values()) < vocab_size
+       assert max(tokenizer.vocab.values()) < vocab_size  # ty: ignore[unresolved-attribute]

        tokpre = self.get_vocab_base_pre(tokenizer)

-       reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-       added_vocab = tokenizer.get_added_vocab()
+       reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
+       added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]

-       added_tokens_decoder = tokenizer.added_tokens_decoder
+       added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]

        for i in range(vocab_size):
            if i not in reverse_vocab:
@@ -10930,7 +10930,7 @@ def set_vocab(self):
                if token in added_vocab:
                    if not added_tokens_decoder[i].normalized:
                        previous_token = token
-                       token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                       token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))  # ty: ignore[unresolved-attribute, invalid-assignment]
                        if previous_token != token:
                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
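Context for the suppressions above (a sketch, not part of the PR): AutoTokenizer instances acquire attributes such as vocab, get_added_vocab, and added_tokens_decoder dynamically from the backing tokenizer class, so a static checker like ty cannot resolve them and reports unresolved-attribute even though the accesses succeed at runtime. A minimal Python illustration of the pattern, using a hypothetical stand-in class rather than the real transformers API:

class DynamicTokenizer:
    def __init__(self) -> None:
        # Attribute attached dynamically, invisible to a static type checker,
        # so the access below is flagged despite working at runtime.
        setattr(self, "vocab", {"<s>": 0, "hello": 1})

tok = DynamicTokenizer()
print(max(tok.vocab.values()))  # ty: ignore[unresolved-attribute]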
40 changes: 39 additions & 1 deletion ggml/src/ggml-backend-meta.cpp
@@ -1270,7 +1270,45 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
    GGML_ASSERT(ggml_is_contiguous(tensor));

    const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
-   GGML_ASSERT(split_state.n_segments == 1);
+
+   if (split_state.n_segments != 1) {
+       GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
+       GGML_ASSERT(offset == 0);
+       GGML_ASSERT(size == ggml_nbytes(tensor));
+       GGML_ASSERT(tensor->ne[3] == 1);
+       size_t offset_data = 0;
+       std::vector<size_t> simple_offsets(n_bufs, 0);
+       if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
+           GGML_ASSERT(tensor->ne[2] == 1);
+           const int64_t blck_size = ggml_blck_size(tensor->type);
+           for (size_t s = 0; s < split_state.n_segments; s++) {
+               for (size_t j = 0; j < n_bufs; j++) {
+                   const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
+                   GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
+                   const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
+                   ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
+                       tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
+                   offset_data += nbytes;
+                   simple_offsets[j] += nbytes;
+               }
+           }
+           GGML_ASSERT(offset_data*tensor->ne[1] == size);
+           return;
+       }
+       GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
+       for (size_t s = 0; s < split_state.n_segments; s++) {
+           for (size_t j = 0; j < n_bufs; j++) {
+               const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
+               const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
+               ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
+                   tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
+               offset_data += nbytes;
+               simple_offsets[j] += nbytes;
+           }
+       }
+       GGML_ASSERT(offset_data*tensor->ne[2] == size);
+       return;
+   }

    switch (split_state.axis) {
        case GGML_BACKEND_SPLIT_AXIS_0:
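What the new branch above does (a sketch, not part of the PR): when a tensor is split across n_bufs backend buffers, the gather walks segment by segment, copying each buffer's chunk into the contiguous destination while advancing one shared destination offset and one source offset per buffer; for an axis-0 split each buffer holds a slice of every row, so the chunks must be re-interleaved into full rows. A simplified pure-Python model of that bookkeeping, with a hypothetical byte-chunk data layout rather than the ggml API:

def gather_axis0(buffers, n_rows):
    # buffers: one list per backend buffer, each holding that buffer's
    # per-row byte chunk; the gather interleaves them back into full rows.
    rows = [b"".join(buf[r] for buf in buffers) for r in range(n_rows)]
    return b"".join(rows)

bufs = [[b"AA", b"aa"], [b"BB", b"bb"]]  # 2 buffers, 2 rows, 2-byte chunks
assert gather_axis0(bufs, 2) == b"AABBaabb"  # rows reassembled contiguously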
157 changes: 99 additions & 58 deletions src/llama.cpp
@@ -91,20 +91,24 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
        throw std::runtime_error("failed to create llama_context from model");
    }

-   std::vector<llama_device_memory_data> ret(model->devices.size());
+   const size_t nd = model->n_devices();
+   std::vector<llama_device_memory_data> ret(nd + 1);

    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();

    for (const auto & [buft, mb] : memory_breakdown) {
+       if (ggml_backend_buft_is_host(buft)) {
+           ret.back().mb.model += mb.model;
+           ret.back().mb.context += mb.context;
+           ret.back().mb.compute += mb.compute;
+           continue;
+       }
+
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
        if (!dev) {
            continue;
        }
-       for (size_t i = 0; i < ret.size(); i++) {
+       for (size_t i = 0; i < nd; i++) {
            if (model->devices[i].dev == dev) {
                ret[i].mb.model += mb.model;
                ret[i].mb.context += mb.context;
@@ -113,7 +117,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
            }
        }
    }
-   for (size_t i = 0; i < ret.size(); i++) {
+
+   {
+       ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+       if (cpu_dev == nullptr) {
+           throw std::runtime_error(format("%s: no CPU backend found", __func__));
+       }
+       size_t free;
+       size_t total;
+       ggml_backend_dev_memory(cpu_dev, &free, &total);
+       ret.back().free = free;
+       ret.back().total = total;
+   }
+   for (size_t i = 0; i < nd; i++) {
        size_t free;
        size_t total;
        ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
@@ -122,11 +138,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
        // have any to report. in this case, we will use the host memory as a fallback
        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
        if (free == 0 && total == 0) {
-           ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-           if (cpu_dev == nullptr) {
-               throw std::runtime_error(format("%s: no CPU backend found", __func__));
-           }
-           ggml_backend_dev_memory(cpu_dev, &free, &total);
+           free = ret.back().free;
+           total = ret.back().total;
        }
        ret[i].free = free;
        ret[i].total = total;
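Shape of the new bookkeeping above (a sketch, not part of the PR): the result vector now has one entry per device plus a trailing host entry; host buffer types accumulate into the trailing slot, which also supplies the fallback free/total numbers for devices (e.g. integrated GPUs) that report 0/0. A simplified Python model with hypothetical structures rather than the llama.cpp API:

def device_memory_data(devices, host_free, host_total):
    # One dict per device plus a trailing host entry, mirroring ret(nd + 1) above.
    ret = [{"free": 0, "total": 0} for _ in range(len(devices) + 1)]
    ret[-1]["free"], ret[-1]["total"] = host_free, host_total  # host slot
    for i, (free, total) in enumerate(devices):
        if free == 0 and total == 0:  # device reports nothing -> host fallback
            free, total = host_free, host_total
        ret[i]["free"], ret[i]["total"] = free, total
    return ret

print(device_memory_data([(0, 0), (8 << 30, 16 << 30)], 4 << 30, 32 << 30))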
@@ -180,15 +193,15 @@ static void llama_params_fit_impl(
    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
    const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices
-   if (nd == 0) {
-       LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
-       return;
-   }

    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
    margins.reserve(nd);
-   for (size_t id = 0; id < nd; id++) {
-       margins.push_back(margins_s[id]);
+   if (nd == 0) {
+       margins.push_back(margins_s[0]);
+   } else {
+       for (size_t id = 0; id < nd; id++) {
+           margins.push_back(margins_s[id]);
+       }
    }

    std::vector<std::string> dev_names;
@@ -215,58 +228,75 @@ static void llama_params_fit_impl(
    std::vector<int64_t> projected_free_per_device;
    projected_free_per_device.reserve(nd);

-   if (nd > 1) {
-       LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-   }
-   for (size_t id = 0; id < nd; id++) {
-       const llama_device_memory_data & dmd = dmds_full[id];
-
-       const int64_t projected_used = dmd.mb.total();
-       const int64_t projected_free = dmd.free - projected_used;
-       projected_free_per_device.push_back(projected_free);
-
-       sum_free += dmd.free;
-       sum_projected_used += projected_used;
-       sum_projected_free += projected_free;
-       sum_projected_model += dmd.mb.model;
-
-       if (nd > 1) {
-           LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-               __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-       }
-   }
-   assert(sum_free >= 0 && sum_projected_used >= 0);
-   LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-       __func__, sum_projected_used/MiB, sum_free/MiB);
-   if (nd == 1) {
-       if (projected_free_per_device[0] >= margins[0]) {
-           LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-               __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+   if (nd == 0) {
+       sum_projected_used = dmds_full.back().mb.total();
+       sum_free = dmds_full.back().total;
+       sum_projected_free = sum_free - sum_projected_used;
+       LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
+           __func__, sum_projected_used/MiB, sum_free/MiB);
+       if (sum_projected_free >= margins[0]) {
+           LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+               __func__, sum_projected_free/MiB, margins[0]/MiB);
            return;
        }
    } else {
-       bool changes_needed = false;
+       if (nd > 1) {
+           LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+       }
        for (size_t id = 0; id < nd; id++) {
-           if (projected_free_per_device[id] < margins[id]) {
-               changes_needed = true;
-               break;
+           const llama_device_memory_data & dmd = dmds_full[id];
+
+           const int64_t projected_used = dmd.mb.total();
+           const int64_t projected_free = dmd.free - projected_used;
+           projected_free_per_device.push_back(projected_free);
+
+           sum_free += dmd.free;
+           sum_projected_used += projected_used;
+           sum_projected_free += projected_free;
+           sum_projected_model += dmd.mb.model;
+
+           if (nd > 1) {
+               LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                   __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
            }
        }
-       if (!changes_needed) {
-           LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-           return;
+       assert(sum_free >= 0 && sum_projected_used >= 0);
+       LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+           __func__, sum_projected_used/MiB, sum_free/MiB);
+       if (nd == 1) {
+           if (projected_free_per_device[0] >= margins[0]) {
+               LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                   __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+               return;
+           }
+       } else {
+           bool changes_needed = false;
+           for (size_t id = 0; id < nd; id++) {
+               if (projected_free_per_device[id] < margins[id]) {
+                   changes_needed = true;
+                   break;
+               }
+           }
+           if (!changes_needed) {
+               LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+               return;
+           }
        }
    }

    // step 2: try reducing memory use by reducing the context size

    {
        int64_t global_surplus = sum_projected_free;
-       for (size_t id = 0; id < nd; id++) {
-           global_surplus -= margins[id];
+       if (nd == 0) {
+           global_surplus -= margins[0];
+       } else {
+           for (size_t id = 0; id < nd; id++) {
+               global_surplus -= margins[id];
+           }
        }
        if (global_surplus < 0) {
-           if (nd == 1) {
+           if (nd <= 1) {
                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                    __func__, margins[0]/MiB, -global_surplus/MiB);
            } else {
@@ -277,8 +307,12 @@ static void llama_params_fit_impl(
        if (cparams->n_ctx == 0) {
            if (hp_nct > n_ctx_min) {
                int64_t sum_used_target = sum_free;
-               for (size_t id = 0; id < nd; id++) {
-                   sum_used_target -= margins[id];
+               if (nd == 0) {
+                   sum_used_target -= margins[0];
+               } else {
+                   for (size_t id = 0; id < nd; id++) {
+                       sum_used_target -= margins[id];
+                   }
                }
                if (nd > 1) {
                    // for multiple devices we need to be more conservative in terms of how much context we think can fit:
@@ -293,8 +327,12 @@ static void llama_params_fit_impl(
                int64_t sum_projected_used_min_ctx = 0;
                cparams->n_ctx = n_ctx_min;
                const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-               for (const auto & dmd : dmds_min_ctx) {
-                   sum_projected_used_min_ctx += dmd.mb.total();
+               if (nd == 0) {
+                   sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
+               } else {
+                   for (size_t id = 0; id < nd; id++) {
+                       sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
+                   }
                }
                if (sum_used_target > sum_projected_used_min_ctx) {
                    // linear interpolation between minimum and maximum context size:
@@ -306,7 +344,7 @@ static void llama_params_fit_impl(
                    const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
                    LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                        __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                   if (nd == 1) {
+                   if (nd <= 1) {
                        LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                        return;
                    }
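The interpolation above in isolation (a sketch with hypothetical names): memory use is measured at the minimum and the full context size, the slope between the two gives the per-token cost, and the largest context that still meets the target is read off linearly:

def fit_n_ctx(n_ctx_min, n_ctx_max, used_min, used_max, used_target):
    bytes_per_ctx = (used_max - used_min) / (n_ctx_max - n_ctx_min)  # slope
    if used_target <= used_min:
        return n_ctx_min  # even the minimum context does not fit the target
    n_ctx = n_ctx_min + int((used_target - used_min) / bytes_per_ctx)
    return min(n_ctx, n_ctx_max)

# 4 GiB at 4k context, 12 GiB at 32k -> ~0.29 MiB/token; target 8 GiB -> ~18k
print(fit_n_ctx(4096, 32768, 4 << 30, 12 << 30, 8 << 30))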
@@ -329,6 +367,9 @@ static void llama_params_fit_impl(
            }
        }
    }
+   if (nd == 0) {
+       throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
+   }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
@@ -476,8 +517,8 @@ static void llama_params_fit_impl(

        std::vector<int64_t> ret;
        ret.reserve(nd);
-       for (const llama_device_memory_data & dmd : dmd_nl) {
-           ret.push_back(dmd.mb.total());
+       for (size_t id = 0; id < nd; id++) {
+           ret.push_back(dmd_nl[id].mb.total());
        }
        return ret;
    };