diff --git a/common/arg.cpp b/common/arg.cpp index 060053595db..7744fd6c488 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { string_process_escapes(seq_breaker); } + for (auto & pair : params.speculative.replacements) { + string_process_escapes(pair.first); + string_process_escapes(pair.second); + } } if (!params.kv_overrides.empty()) { @@ -3249,6 +3253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.model.path = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + add_opt(common_arg( + {"--spec-replace"}, "TARGET", "DRAFT", + "translate the string in TARGET into DRAFT if the draft model and main model are not compatible", + [](common_params & params, const std::string & tgt, const std::string & dft) { + params.speculative.replacements.push_back({ tgt, dft }); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-ctkd", "--cache-type-k-draft"}, "TYPE", string_format( @@ -3438,12 +3449,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); - // diffusion parameters add_opt(common_arg( { "--diffusion-steps" }, "N", string_format("number of diffusion steps (default: %d)", params.diffusion.steps), [](common_params & params, int value) { params.diffusion.steps = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-visual" }, + string_format("enable visual diffusion mode (show progressive generation) (default: %s)", + params.diffusion.visual_mode ? "true" : "false"), + [](common_params & params) { params.diffusion.visual_mode = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( { "--diffusion-eps" }, "F", string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps), @@ -3451,21 +3468,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( { "--diffusion-algorithm" }, "N", - string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)", + string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm), [](common_params & params, int value) { params.diffusion.algorithm = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( { "--diffusion-alg-temp" }, "F", - string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp), + string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp), [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( - { "--diffusion-visual" }, - string_format("enable visual diffusion mode (show progressive generation) (default: %s)", - params.diffusion.visual_mode ? 
"true" : "false"), - [](common_params & params) { params.diffusion.visual_mode = true; } + { "--diffusion-block-length" }, "N", + string_format("llada block length for generation (default: %d)", params.diffusion.block_length), + [](common_params & params, int value) { params.diffusion.block_length = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-cfg-scale" }, "F", + string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale), + [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-add-gumbel-noise" }, "F", + string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"), + [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + return ctx_arg; } diff --git a/common/common.h b/common/common.h index 00f42694eaf..f5acf37ff9f 100644 --- a/common/common.h +++ b/common/common.h @@ -201,6 +201,7 @@ struct common_params_speculative { int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) float p_split = 0.1f; // speculative decoding split probability float p_min = 0.75f; // minimum speculative decoding probability (greedy) + std::vector> replacements; // main to speculative model replacements ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V @@ -220,11 +221,17 @@ struct common_params_vocoder { }; struct common_params_diffusion { - int32_t steps = 64; // number of diffusion steps - float eps = 1e-3f; // epsilon for timesteps - int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY) - float alg_temp = 0.0f; // algorithm temperature - bool visual_mode = false; // show progressive diffusion on screen + int32_t steps = 128; + bool visual_mode = false; + + float eps = 0; // epsilon for timesteps + int32_t block_length = 32; // block length for generation + + int32_t algorithm = 4; // default algorithm: low-confidence + float alg_temp = 0.0f; // algorithm temperature + + float cfg_scale = 0; // classifier-free guidance scale + bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0 }; enum common_reasoning_format { diff --git a/common/speculative.cpp b/common/speculative.cpp index 843bd1ddbdb..262b2c23e72 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1,30 +1,39 @@ #include "speculative.h" +#include "ggml.h" +#include "llama.h" #include "log.h" #include "common.h" #include "sampling.h" #include #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 struct common_speculative { - struct llama_context * ctx; + struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft + struct llama_context * ctx_dft; struct common_sampler * smpl; llama_batch batch; - llama_tokens prompt; + llama_tokens prompt_dft; + bool vocab_dft_compatible = true; // whether retokenization is needed + std::map tgt_dft_replacements = {}; }; struct common_speculative * common_speculative_init( + struct llama_context * ctx_tgt, struct llama_context * ctx_dft) { auto * result = new common_speculative { - /* .ctx = */ ctx_dft, - /* .smpl = */ nullptr, - /* 
.batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), - /* .prompt = */ {}, + /* .ctx_tgt = */ ctx_tgt, + /* .ctx_dft = */ ctx_dft, + /* .smpl = */ nullptr, + /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), + /* .prompt_dft = */ {}, + /* .vocab_dft_compatible = */ false, }; // TODO: optimize or pass from outside? @@ -59,6 +68,9 @@ struct common_speculative * common_speculative_init( } #endif + result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft); + LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible); + return result; } @@ -75,8 +87,8 @@ void common_speculative_free(struct common_speculative * spec) { } bool common_speculative_are_compatible( - const struct llama_context * ctx_tgt, - const struct llama_context * ctx_dft) { + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft) { const struct llama_model * model_tgt = llama_get_model(ctx_tgt); const struct llama_model * model_dft = llama_get_model(ctx_dft); @@ -90,31 +102,32 @@ bool common_speculative_are_compatible( LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { - LOG_ERR("%s: draft model vocab type must match target model to use speculation but " - "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt); + LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__); + LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); return false; } - if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) || + if ( + llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) || llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) || llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) || - llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) { - LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__); - LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt)); - LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft)); + llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft) + ) { + LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__); return false; } { const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); - - const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); + const int vocab_diff = n_vocab_tgt > n_vocab_dft + ? 
n_vocab_tgt - n_vocab_dft + : n_vocab_dft - n_vocab_tgt; if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " - "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__); + LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return false; } @@ -122,8 +135,8 @@ bool common_speculative_are_compatible( const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i); const char * token_text_dft = llama_vocab_get_text(vocab_dft, i); if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but " - "token %d content differs - target '%s', draft '%s'\n", __func__, i, + LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__); + LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i, common_token_to_piece(ctx_tgt, i).c_str(), common_token_to_piece(ctx_dft, i).c_str()); return false; @@ -134,32 +147,93 @@ bool common_speculative_are_compatible( return true; } +void common_speculative_add_replacement_tgt_dft( + struct common_speculative * spec, + const char *source, const char *dest) { + spec->tgt_dft_replacements[source] = dest; +} + +static std::string replace_to_dft( + struct common_speculative * spec, + const std::string& input) { + std::string result = input; + for (const auto & pair : spec->tgt_dft_replacements) { + size_t pos = result.find(pair.first); + while (pos != std::string::npos) { + result.replace(pos, pair.first.length(), pair.second); + pos = result.find(pair.first, pos + pair.second.length()); + } + } + return result; +} + +static std::string replace_to_tgt( + struct common_speculative * spec, + const std::string& input) { + std::string result = input; + for (const auto& pair : spec->tgt_dft_replacements) { + size_t pos = result.find(pair.second); + while (pos != std::string::npos) { + result.replace(pos, pair.second.length(), pair.first); + pos = result.find(pair.second, pos + pair.first.length()); + } + } + return result; +} + + llama_tokens common_speculative_gen_draft( struct common_speculative * spec, struct common_speculative_params params, - const llama_tokens & prompt_tgt, + const llama_tokens & prompt_tgt_main_model, // specified in target model vocab llama_token id_last) { auto & batch = spec->batch; - auto & ctx = spec->ctx; + auto & ctx_tgt = spec->ctx_tgt; + auto & ctx_dft = spec->ctx_dft; auto & smpl = spec->smpl; - auto & prompt = spec->prompt; + auto & prompt_dft = spec->prompt_dft; - auto * mem = llama_get_memory(ctx); + auto * mem_dft = llama_get_memory(ctx_dft); int reuse_i = 0; int reuse_n = 0; - const int n_ctx = llama_n_ctx(ctx) - params.n_draft; + const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft; + + llama_tokens prompt_tgt_draft_model; + if (!spec->vocab_dft_compatible) { + std::string text; + text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true); + text = replace_to_dft(spec, text); + LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str()); + prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true); + + // convert id_last to draft 
vocab. llama_detokenize is called directly to avoid an allocation + const auto * model_tgt = llama_get_model(ctx_tgt); + const auto * vocab_tgt = llama_model_get_vocab(model_tgt); + + int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false); + GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last"); + text.resize(-n_chars); + llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false); + text = replace_to_dft(spec, text); + + LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str()); + id_last = common_tokenize(ctx_dft, text, false, true)[0]; + } + // prompt_tgt's tokens will always be compatible with ctx_dft + const llama_tokens &prompt_tgt = + spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model; const int i_start = std::max(0, (int) prompt_tgt.size() - n_ctx); // reuse as much as possible from the old draft context // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt - for (int i = 0; i < (int) prompt.size(); ++i) { + for (int i = 0; i < (int) prompt_dft.size(); ++i) { int cur = 0; while (i_start + cur < (int) prompt_tgt.size() && - i + cur < (int) prompt.size() && - prompt_tgt[i_start + cur] == prompt[i + cur]) { + i + cur < (int) prompt_dft.size() && + prompt_tgt[i_start + cur] == prompt_dft[i + cur]) { cur++; } @@ -169,21 +243,20 @@ llama_tokens common_speculative_gen_draft( } } - LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); + LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size()); llama_tokens result; result.reserve(params.n_draft); if (reuse_n == 0) { - llama_memory_clear(mem, false); - - prompt.clear(); + llama_memory_clear(mem_dft, false); + prompt_dft.clear(); } else { // this happens when a previous draft has been discarded (for example, due to being too small), but the // target model agreed with it. 
in this case, we simply pass back the previous results to save compute - if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { - for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { - result.push_back(prompt[i]); + if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) { + for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) { + result.push_back(prompt_dft[i]); if (params.n_draft <= (int) result.size()) { break; @@ -194,16 +267,15 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_memory_seq_rm (mem, 0, 0, reuse_i); - llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i); + llama_memory_seq_rm (mem_dft, 0, 0, reuse_i); + llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i); - prompt.erase(prompt.begin(), prompt.begin() + reuse_i); + prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i); } - if (reuse_n < (int) prompt.size()) { - llama_memory_seq_rm (mem, 0, reuse_n, -1); - - prompt.erase(prompt.begin() + reuse_n, prompt.end()); + if (reuse_n < (int) prompt_dft.size()) { + llama_memory_seq_rm (mem_dft, 0, reuse_n, -1); + prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end()); } } @@ -214,28 +286,28 @@ llama_tokens common_speculative_gen_draft( //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); - prompt.push_back(prompt_tgt[i]); + prompt_dft.push_back(prompt_tgt[i]); } // we should rarely end-up here during normal decoding if (batch.n_tokens > 0) { //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str()); - llama_decode(ctx, batch); + llama_decode(ctx_dft, batch); } - const llama_pos n_past = prompt.size(); + const llama_pos n_past = prompt_dft.size(); LOG_DBG("%s: n_past = %d\n", __func__, n_past); common_batch_clear(batch); common_batch_add (batch, id_last, n_past, { 0 }, true); - prompt.push_back(id_last); + prompt_dft.push_back(id_last); - //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str()); + LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str()); - llama_decode(ctx, batch); + llama_decode(ctx_dft, batch); common_sampler_reset(smpl); @@ -243,13 +315,13 @@ llama_tokens common_speculative_gen_draft( for (int i = 0; i < params.n_draft; ++i) { common_batch_clear(batch); - common_sampler_sample(smpl, ctx, 0, true); + common_sampler_sample(smpl, ctx_dft, 0, true); const auto * cur_p = common_sampler_get_candidates(smpl); for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str()); + k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } // add drafted token for each sequence @@ -271,10 +343,19 @@ llama_tokens common_speculative_gen_draft( common_batch_add(batch, id, n_past + i + 1, { 0 }, true); // evaluate the drafted tokens on the draft model - llama_decode(ctx, batch); + llama_decode(ctx_dft, batch); - prompt.push_back(id); + prompt_dft.push_back(id); } + if (!spec->vocab_dft_compatible) { + std::string detokenized = common_detokenize(ctx_dft, result, true); + detokenized = replace_to_tgt(spec, detokenized); + LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str()); + result = 
common_tokenize(ctx_tgt, detokenized, false, true); + if (result.size() > (size_t)params.n_draft) { + result.resize(params.n_draft); + } + } return result; } diff --git a/common/speculative.h b/common/speculative.h index 2b51a70ca1f..e69d7aaa1eb 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -12,7 +12,10 @@ struct common_speculative_params { float p_min = 0.75f; // min probability required to accept a token in the draft }; -struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); +struct common_speculative * common_speculative_init( + struct llama_context * ctx_tgt, + struct llama_context * ctx_dft +); void common_speculative_free(struct common_speculative * spec); @@ -20,6 +23,10 @@ bool common_speculative_are_compatible( const struct llama_context * ctx_tgt, const struct llama_context * ctx_dft); +void common_speculative_add_replacement_tgt_dft( + struct common_speculative * spec, + const char *source, const char *dest); + // sample up to n_draft tokens and add them to the batch using the draft model llama_tokens common_speculative_gen_draft( struct common_speculative * spec, diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3f5cefe007c..db4112318d4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2904,6 +2904,107 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("LLaDAModelLM") +class LLaDAModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA + undo_permute = True + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + vocab_dict = tokenizer.get_vocab() + vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) + assert max(vocab_dict.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + # Check if it's a special token - treat special tokens as CONTROL tokens + if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + def set_vocab(self): + self._set_vocab_gpt2() + + # LLaDA specific parameters + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + # Add parameters similar to LlamaModel + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + n_heads = hparams.get("num_attention_heads", hparams.get("n_heads")) + rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads + self.gguf_writer.add_rope_dimension_count(rope_dim) + + # Set context length for 
LLaDA + context_length = self.hparams.get("max_sequence_length", 4096) + self.gguf_writer.add_context_length(context_length) + + # Set embedding length (dimension size) + embedding_length = self.hparams.get("d_model", 4096) + self.gguf_writer.add_embedding_length(embedding_length) + + # Set feed forward length (MLP hidden size) + feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) + self.gguf_writer.add_feed_forward_length(feed_forward_length) + + # LLaDA models use non-causal attention for diffusion, similar to Dream + self.gguf_writer.add_causal_attention(False) + + # LLaDA models don't shift their logits + self.gguf_writer.add_diffusion_shift_logits(False) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads")) + n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads")) + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head) + + # LLaDA model tensors should be mapped directly since it's the base model + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Ernie4_5_ForCausalLM") class Ernie4_5Model(TextModel): model_arch = gguf.MODEL_ARCH.ERNIE4_5 diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 2b001f09abe..325e09bd380 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -310,5 +310,7 @@ Specifies the memory pool management strategy: Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies. -## TODO -- Support more models and data types. +### GGML_CANN_WEIGHT_NZ + +Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU. 
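+
+For example, assuming a 310I DUO device, the conversion can be enabled for a run by setting the variable at launch time, e.g. `GGML_CANN_WEIGHT_NZ=1 ./llama-cli -m model.gguf` (where `model.gguf` stands in for your model path). The variable is only read once at startup, so it cannot be toggled after the process has loaded its weights.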
+ diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md new file mode 100644 index 00000000000..26de5668aa8 --- /dev/null +++ b/examples/diffusion/README.md @@ -0,0 +1,13 @@ +# Diffusion Text Generation + +This directory contains implementations for Diffusion LLMs (DLLMs) + +More Info: +- https://github.com/ggml-org/llama.cpp/pull/14644 +- https://github.com/ggml-org/llama.cpp/pull/14771 + + +Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual` + +Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual` + diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 3e11ce1160b..8431dcea8fe 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -5,67 +5,212 @@ #include "log.h" #include -#include -#include + #include #include +#include #include #include +#include +#include -typedef bool (*diffusion_step_callback_t)(int32_t step, - int32_t total_steps, - const llama_token * tokens, - int32_t n_tokens, - void * user_data); - -enum diffusion_alg { - DIFFUSION_ALG_ORIGIN = 0, - DIFFUSION_ALG_MASKGIT_PLUS = 1, - DIFFUSION_ALG_TOPK_MARGIN = 2, - DIFFUSION_ALG_ENTROPY = 3, +enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 }; + +// Unified transfer scheduling methods +enum transfer_schedule { + TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining + BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens }; +typedef bool (*diffusion_step_callback_t)(int32_t step, + int32_t total_steps, + const llama_token * tokens, + int32_t n_tokens, + void * user_data); + struct diffusion_params { - int32_t steps; - float eps; - float temperature; - float top_p; - int32_t top_k; - llama_token mask_token_id; - enum diffusion_alg algorithm; - float alg_temp; - diffusion_step_callback_t step_callback; - void * step_callback_user_data; - int32_t seed; + int32_t steps = 0; + float temperature = 0; + llama_token mask_token_id = LLAMA_TOKEN_NULL; + diffusion_step_callback_t step_callback = nullptr; + void * step_callback_user_data = nullptr; + int32_t seed = 0; + bool visual_mode = false; + bool shift_logits = false; // Shift logits by -1 after decode + + float top_p = 0.; + int32_t top_k = 0.; + + diffusion_algorithm algorithm = CONFIDENCE_BASED; + transfer_schedule schedule = TIMESTEP_BASED; + + float cfg_scale = 0.; // Config scale for classifier-free guidance + float eps = 0.; // Timestep scheduling + int32_t block_length = 0; // Block size (for block scheduling) + float alg_temp = 0; // algorithm temperature (0.0 = deterministic) + bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0 + + int32_t max_length = 0; // Maximum sequence length }; +struct callback_data { + diffusion_params * diff_params; + const llama_vocab * vocab; + int32_t n_input; +}; + +static float calculate_confidence(const llama_token_data_array & cur_p, + diffusion_algorithm algorithm, + std::mt19937 & rng) { + switch (algorithm) { + case CONFIDENCE_BASED: + return cur_p.data[cur_p.selected].p; // Selected token probability + + case ENTROPY_BASED: + { + float entropy = 0.0f; + const float epsilon = 1e-10f; + for (size_t i = 0; i < cur_p.size; i++) 
{ + float prob = cur_p.data[i].p; + entropy += prob * logf(prob + epsilon); + } + return -entropy; // Higher entropy = lower confidence + } + + case MARGIN_BASED: + return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p; + + case RANDOM: + { + std::uniform_real_distribution uniform(0.0f, 1.0f); + return uniform(rng); // Random confidence + } + + case ORIGIN: + return cur_p.data[cur_p.selected].p; + + default: + return 0.0f; + } +} + +// Unified transfer count calculation function +static int32_t calculate_transfer_count(int32_t step, + int32_t total_steps, + int32_t remaining_masked, + transfer_schedule schedule, + float eps, + const std::vector & num_transfer_tokens = {}) { + switch (schedule) { + case TIMESTEP_BASED: + { + float t = 1.0f - (float) step / total_steps * (1.0f - eps); + float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps); + float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f; + return (int32_t) (remaining_masked * p_transfer); + } + + case BLOCK_BASED: + if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) { + return num_transfer_tokens[step]; + } + return remaining_masked / (total_steps - step); // Fallback + + default: + return remaining_masked / (total_steps - step); + } +} + +static bool diffusion_step_callback(int32_t step, + int32_t total_steps, + const llama_token * tokens, + int32_t n_tokens, + void * user_data) { + (void) user_data; + + callback_data * data = static_cast(user_data); + + auto print_progress_bar = [](int32_t step, int32_t total_steps) { + int progress_percent = (step * 100) / total_steps; + int progress_bars = (step * 50) / total_steps; + LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", + step, + total_steps, + std::string(progress_bars, '=').c_str(), + std::string(50 - progress_bars, ' ').c_str(), + progress_percent); + }; + + if (data->diff_params->visual_mode) { + // Visual mode: clear + LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left + + print_progress_bar(step, total_steps); + + LOG_INF("\n"); + + std::string current_text = " "; + + for (int32_t i = data->n_input; i < n_tokens; i++) { + std::string token_str; + if (tokens[i] != llama_vocab_mask(data->vocab)) { + char piece[256]; + int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); + if (n_chars > 0) { + piece[n_chars] = '\0'; + token_str = piece; + } + } else { + token_str = " "; + } + + current_text += token_str; + } -static diffusion_params diffusion_default_params() { - diffusion_params params = {}; - params.steps = 64; - params.eps = 1e-3f; - params.temperature = 0.2f; - params.top_p = 0.95f; - params.top_k = 0; - params.mask_token_id = LLAMA_TOKEN_NULL; - params.algorithm = DIFFUSION_ALG_ORIGIN; - params.alg_temp = 0.0f; - params.step_callback = nullptr; - params.step_callback_user_data = nullptr; - params.seed = 0; - return params; + LOG_INF("%s\n", current_text.c_str()); + } else { + print_progress_bar(step, total_steps); + } + + return true; } -static void diffusion_generate(llama_context * ctx, - const llama_token * input_tokens, - llama_token * output_tokens, - int32_t n_input, - int32_t max_length, - struct diffusion_params params, - int32_t & n_generated) { +static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) { + if (temperature == 0.0f) { + return; + } + + std::uniform_real_distribution uniform(0.0, 1.0); + for (int32_t i = 0; i < n_vocab; i++) { + double noise = uniform(rng); + // Prevent log(0) 
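+        // Gumbel-max view: for temperature T, taking the argmax of exp(logit_i) / (-log u_i)^T with
+        // u_i ~ Uniform(0, 1) is distributed like a draw from softmax(logits / T), so this noise
+        // injection stands in for an explicit categorical sample; T < 1 sharpens and T > 1 flattens
+        // the effective distribution.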
+ noise = std::max(noise, 1e-20); + double gumbel_noise = std::pow(-std::log(noise), temperature); + logits[i] = std::exp(logits[i]) / gumbel_noise; + } +} + +static std::vector get_num_transfer_tokens(int32_t mask_count, int32_t steps) { + std::vector num_transfer_tokens(steps); + + int32_t base = mask_count / steps; + int32_t remainder = mask_count % steps; + + for (int32_t i = 0; i < steps; i++) { + num_transfer_tokens[i] = base + (i < remainder ? 1 : 0); + } + + return num_transfer_tokens; +} +static void diffusion_generate(llama_context * ctx, + const llama_token * input_tokens, + llama_token * output_tokens, + int32_t n_input, + const diffusion_params & params, + int32_t & n_generated) { n_generated = 0; - if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) { + if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) { return; } @@ -73,27 +218,21 @@ static void diffusion_generate(llama_context * ctx, // Initialize with input and pad with mask tokens std::copy(input_tokens, input_tokens + n_input, output_tokens); - std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id); + std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id); std::mt19937 rng(params.seed); - std::vector timesteps(params.steps + 1); - for (int32_t i = 0; i <= params.steps; i++) { - timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps); - } - llama_set_causal_attn(ctx, false); int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); std::vector candidates(n_vocab); - std::vector conf_candidates; - conf_candidates.reserve(max_length); - + conf_candidates.reserve(params.max_length); std::vector mask_positions; - mask_positions.reserve(max_length); + mask_positions.reserve(params.max_length); + // Setup sampler chain struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params()); if (params.top_k > 0) { llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k)); @@ -108,210 +247,269 @@ static void diffusion_generate(llama_context * ctx, struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed); - llama_batch batch = llama_batch_init(max_length, 0, 1); - batch.n_tokens = max_length; + llama_batch batch = llama_batch_init(params.max_length, 0, 1); + batch.n_tokens = params.max_length; - int64_t total_sampling_time = 0; - int64_t total_time = 0; + // Pre-allocate buffers for CFG if needed + int32_t logits_size = n_vocab * params.max_length; + std::vector cond_logits_buffer; + std::vector un_x_buffer; + if (params.cfg_scale > 0.0f) { + cond_logits_buffer.resize(logits_size); + un_x_buffer.resize(params.max_length); + } - int64_t time_start = ggml_time_us(); - for (int32_t step = 0; step < params.steps; step++) { - if (params.step_callback) { - if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) { - break; - } - } + // For block-based processing + std::vector num_transfer_tokens; + int32_t num_blocks = 1; + int32_t steps_per_block = params.steps; - for (int32_t i = 0; i < max_length; i++) { - batch.token[i] = output_tokens[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } + if (params.schedule == BLOCK_BASED) { + GGML_ASSERT(params.max_length % params.block_length == 0); + num_blocks = params.max_length / params.block_length; + GGML_ASSERT(params.steps % num_blocks == 0); + steps_per_block = 
params.steps / num_blocks; + } - int ret = llama_decode(ctx, batch); - if (ret != 0) { - LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret); - break; - } + std::vector confidence(params.max_length); - float * raw_logits = llama_get_logits(ctx); - if (!raw_logits) { - LOG_ERR("%s: failed to get logits at step %d\n", __func__, step); - break; + int64_t total_sampling_time = 0; + int64_t total_time = 0; + int64_t time_start = ggml_time_us(); + + for (int block_num = 0; block_num < num_blocks; block_num++) { + int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; + int32_t block_end = (params.schedule == BLOCK_BASED) ? + std::min(n_input + (block_num + 1) * params.block_length, params.max_length) : + params.max_length; + + // Count masked tokens in current block for block-based processing + if (params.schedule == BLOCK_BASED) { + int32_t block_mask_count = 0; + for (int i = block_start; i < block_end; i++) { + if (output_tokens[i] == params.mask_token_id) { + block_mask_count++; + } + } + num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block); } - auto get_logits_for_pos = [&](int32_t pos) -> const float * { - return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab; - }; - - int64_t time_start_sampling = ggml_time_us(); + for (int32_t step = 0; step < steps_per_block; step++) { + int32_t global_step = block_num * steps_per_block + step; - mask_positions.clear(); - for (int32_t i = 0; i < max_length; i++) { - if (output_tokens[i] == params.mask_token_id) { - mask_positions.push_back(i); + if (params.step_callback) { + if (!params.step_callback( + global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) { + break; + } } - } - if (mask_positions.empty()) { - break; - } + // Setup batch + for (int32_t i = 0; i < params.max_length; i++) { + batch.token[i] = output_tokens[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } - float t = timesteps[step]; - float s = timesteps[step + 1]; + float * logits = nullptr; - if (params.algorithm == DIFFUSION_ALG_ORIGIN) { - float p_transfer = (step < params.steps - 1) ? 
(1.0f - s / t) : 1.0f; + if (params.cfg_scale > 0.0f) { + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate conditional"); + break; + } + float * cond_logits_ptr = llama_get_logits(ctx); + std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float)); - for (int32_t pos : mask_positions) { - if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { - const float * pos_logits = get_logits_for_pos(pos); - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].id = token_id; - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - } + // Unconditional generation (mask input) + std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin()); + for (int32_t i = 0; i < n_input; i++) { + un_x_buffer[i] = params.mask_token_id; + } - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ (size_t) n_vocab, // Reset size to full vocab - /* .selected = */ -1, - /* .sorted = */ false, - }; + for (int32_t i = 0; i < params.max_length; i++) { + batch.token[i] = un_x_buffer[i]; + } + ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate unconditional"); + break; + } + float * uncond_logits = llama_get_logits(ctx); - llama_sampler_apply(sampler, &cur_p); - output_tokens[pos] = cur_p.data[cur_p.selected].id; + // Apply CFG + for (int32_t i = 0; i < logits_size; i++) { + cond_logits_buffer[i] = + uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]); } - } - } else { - std::vector> confidences; - std::vector sampled_tokens(mask_positions.size()); - - for (size_t i = 0; i < mask_positions.size(); i++) { - int32_t pos = mask_positions[i]; - const float * pos_logits = get_logits_for_pos(pos); - - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - candidates[token_id].id = token_id; + logits = cond_logits_buffer.data(); + } else { + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret); + break; } + logits = llama_get_logits(ctx); + } - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; + if (!logits) { + LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step); + break; + } - llama_sampler_apply(sampler, &cur_p); + auto get_logits_for_pos = [&](int32_t pos) -> const float * { + if (params.shift_logits) { + return pos == 0 ? 
logits : logits + (pos - 1) * n_vocab; + } + return logits + (pos) *n_vocab; + }; - llama_token sampled_token = cur_p.data[cur_p.selected].id; + int64_t time_start_sampling = ggml_time_us(); - float confidence = 0.0f; - if (params.algorithm == DIFFUSION_ALG_ENTROPY) { - const float epsilon = 1e-10f; - for (size_t j = 0; j < cur_p.size; j++) { - float prob = cur_p.data[j].p; - confidence += prob * logf(prob + epsilon); + mask_positions.clear(); + for (int32_t i = 0; i < params.max_length; i++) { + if (output_tokens[i] == params.mask_token_id) { + // For block-based, only consider current block + if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) { + mask_positions.push_back(i); } - } else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) { - confidence = cur_p.data[0].p - cur_p.data[1].p; - } else { - confidence = cur_p.data[cur_p.selected].p; } + } - sampled_tokens[i] = sampled_token; - confidences.emplace_back(confidence, i); + if (mask_positions.empty()) { + break; } - int32_t num_transfer = - (step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size(); - - if (num_transfer > 0) { - if (params.alg_temp == 0.0f) { - std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(), - [](const std::pair & a, const std::pair & b) { - if (a.first != b.first) { - return a.first > b.first; - } - return a.second < b.second; - }); - } else { - conf_candidates.clear(); - - for (int32_t pos = 0; pos < max_length; pos++) { - float conf_logit = -std::numeric_limits::infinity(); - - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - size_t mask_idx = std::distance(mask_positions.begin(), it); - conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling + if (params.add_gumbel_noise && params.temperature > 0.0f) { + add_gumbel_noise(logits, n_vocab, params.temperature, rng); + } + + if (params.algorithm == ORIGIN) { + int32_t transfer_count = calculate_transfer_count( + step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); + float p_transfer = (float) transfer_count / mask_positions.size(); + + for (int32_t pos : mask_positions) { + if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { + const float * pos_logits = get_logits_for_pos(pos); + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].id = token_id; + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; } - conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f }); + llama_token_data_array cur_p = { + candidates.data(), + (size_t) n_vocab, + -1, + false, + }; + + llama_sampler_apply(sampler, &cur_p); + output_tokens[pos] = cur_p.data[cur_p.selected].id; + } + } + } else { + std::vector> confidences; + std::vector sampled_tokens(mask_positions.size()); + + for (size_t i = 0; i < mask_positions.size(); i++) { + int32_t pos = mask_positions[i]; + const float * pos_logits = get_logits_for_pos(pos); + + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; + candidates[token_id].id = token_id; } - llama_token_data_array conf_array = { - /* .data = */ conf_candidates.data(), - /* .size = */ conf_candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, + llama_token_data_array cur_p = { + candidates.data(), + candidates.size(), + -1, + 
false, }; - for (int32_t i = 0; i < num_transfer; i++) { - // Apply distribution sampler to get selected index - llama_sampler_apply(dist_sampler, &conf_array); - int selected_idx = conf_array.selected; - confidences[i].second = conf_candidates[selected_idx].id; + llama_sampler_apply(sampler, &cur_p); + llama_token sampled_token = cur_p.data[cur_p.selected].id; + + float conf = calculate_confidence(cur_p, params.algorithm, rng); - conf_candidates[selected_idx].p = 0.0f; - conf_array.selected = -1; - } + sampled_tokens[i] = sampled_token; + confidences.emplace_back(conf, i); } - if (params.alg_temp == 0.0f) { - // Deterministic - use confidence order - for (int32_t i = 0; i < num_transfer; i++) { - int32_t mask_idx = confidences[i].second; - int32_t pos = mask_positions[mask_idx]; - llama_token token = sampled_tokens[mask_idx]; - output_tokens[pos] = token; - } - } else { - for (int32_t i = 0; i < num_transfer; i++) { - int32_t pos = confidences[i].second; - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - int32_t mask_idx = std::distance(mask_positions.begin(), it); + int32_t transfer_count = calculate_transfer_count( + step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); + + if (transfer_count > 0) { + if (params.alg_temp == 0.0f) { + std::partial_sort(confidences.begin(), + confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()), + confidences.end(), + [](const std::pair & a, const std::pair & b) { + if (a.first != b.first) { + return a.first > b.first; + } + return a.second < b.second; + }); + + for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { + int32_t mask_idx = confidences[i].second; + int32_t pos = mask_positions[mask_idx]; output_tokens[pos] = sampled_tokens[mask_idx]; } + } else { + conf_candidates.clear(); + for (size_t i = 0; i < confidences.size(); i++) { + float conf_logit = confidences[i].first / params.alg_temp; + conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f }); + } + + llama_token_data_array conf_array = { + conf_candidates.data(), + conf_candidates.size(), + -1, + false, + }; + + for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { + llama_sampler_apply(dist_sampler, &conf_array); + int32_t selected_idx = conf_array.selected; + int32_t mask_idx = selected_idx; + int32_t pos = mask_positions[mask_idx]; + output_tokens[pos] = sampled_tokens[mask_idx]; + + conf_candidates[selected_idx].p = 0.0f; + conf_array.selected = -1; + } } } } + + int64_t time_end_sampling = ggml_time_us(); + total_sampling_time += time_end_sampling - time_start_sampling; } - int64_t time_end_sampling = ggml_time_us(); - total_sampling_time += time_end_sampling - time_start_sampling; } + int64_t time_end = ggml_time_us(); total_time += time_end - time_start; LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", - total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps); - + total_time / 1000.0, + total_time / 1000.0 / params.steps, + total_sampling_time / 1000.0 / params.steps); llama_batch_free(batch); llama_sampler_free(sampler); llama_sampler_free(dist_sampler); - n_generated = max_length; + n_generated = params.max_length; } - - - static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { if (!use_chat_template) { return prompt; @@ -331,66 
+529,6 @@ static std::string format_input_text(const std::string & prompt, bool use_chat_t return result.prompt; } -struct callback_data { - const common_params_diffusion * diff_params; - const llama_vocab * vocab; - int32_t n_input; -}; - -static bool diffusion_step_callback(int32_t step, - int32_t total_steps, - const llama_token * tokens, - int32_t n_tokens, - void * user_data) { - (void)user_data; - - callback_data * data = static_cast(user_data); - - auto print_progress_bar = [](int32_t step, int32_t total_steps) { - int progress_percent = (step * 100) / total_steps; - int progress_bars = (step * 50) / total_steps; - LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", - step, - total_steps, - std::string(progress_bars, '=').c_str(), - std::string(50 - progress_bars, ' ').c_str(), - progress_percent); - }; - - if (data->diff_params->visual_mode) { - // Visual mode: clear - LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left - - print_progress_bar(step, total_steps); - - LOG_INF("\n"); - - std::string current_text = " "; - - for (int32_t i = data->n_input; i < n_tokens; i++) { - std::string token_str; - if (tokens[i] != llama_vocab_mask(data->vocab)) { - char piece[256]; - int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); - if (n_chars > 0) { - piece[n_chars] = '\0'; - token_str = piece; - } - } else { - token_str = " "; - } - - current_text += token_str; - } - - LOG_INF("%s\n", current_text.c_str()); - } else { - print_progress_bar(step, total_steps); - } - - return true; -} - int main(int argc, char ** argv) { ggml_time_init(); @@ -400,11 +538,6 @@ int main(int argc, char ** argv) { return 1; } - const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" }; - const char * alg_name = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ? 
- alg_names[params.diffusion.algorithm] : - "UNKNOWN"; - common_init(); llama_backend_init(); @@ -421,6 +554,12 @@ int main(int argc, char ** argv) { return 1; } + if (!llama_model_is_diffusion(model)) { + LOG_ERR("error: unsupported model for diffusion"); + llama_model_free(model); + return 1; + } + llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = params.n_ctx; ctx_params.n_batch = params.n_batch; @@ -442,10 +581,12 @@ int main(int argc, char ** argv) { const llama_vocab * vocab = llama_model_get_vocab(model); std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); - std::vector input_tokens = common_tokenize(vocab, formatted_prompt, + std::vector input_tokens = common_tokenize(vocab, + formatted_prompt, /*add special tokens*/ true, /*parse special*/ true); - int n_input = input_tokens.size(); + + int n_input = input_tokens.size(); if (n_input >= params.n_ctx) { LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx); @@ -454,44 +595,79 @@ int main(int argc, char ** argv) { return 1; } - struct diffusion_params ldiff_params = diffusion_default_params(); - ldiff_params.steps = params.diffusion.steps; - ldiff_params.eps = params.diffusion.eps; - ldiff_params.temperature = params.sampling.temp; - ldiff_params.top_p = params.sampling.top_p; - ldiff_params.top_k = params.sampling.top_k; - ldiff_params.algorithm = static_cast(params.diffusion.algorithm); - ldiff_params.alg_temp = params.diffusion.alg_temp; - ldiff_params.seed = params.sampling.seed; - llama_token mask_token_id = llama_vocab_mask(vocab); GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); - LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); - LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps); - LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps); - LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm, - alg_name); - LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp); + bool visual_mode = params.diffusion.visual_mode; - ldiff_params.mask_token_id = mask_token_id; + int32_t n_generated = 0; + std::vector output_tokens(params.n_ubatch); - callback_data cb_data = { ¶ms.diffusion, vocab, n_input }; + struct diffusion_params diff_params; - ldiff_params.step_callback = diffusion_step_callback; - ldiff_params.step_callback_user_data = &cb_data; + char shift_logits_str[8]; + if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) { + diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0); + } else { + diff_params.shift_logits = true; + } - int32_t n_generated = 0; + //Use either eps or block length, but not both + GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0)); - std::vector output_tokens(params.n_ubatch); - diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch, - ldiff_params, n_generated); + if (params.diffusion.eps) { + diff_params.schedule = TIMESTEP_BASED; + diff_params.eps = params.diffusion.eps; + } else if (params.diffusion.block_length) { + diff_params.schedule = BLOCK_BASED; + diff_params.block_length = params.diffusion.block_length; + } + + diff_params.mask_token_id = mask_token_id; + diff_params.seed = params.sampling.seed; + diff_params.temperature = params.sampling.temp; + diff_params.steps = 
params.diffusion.steps; + diff_params.algorithm = static_cast(params.diffusion.algorithm); + diff_params.max_length = params.n_ubatch; + diff_params.top_p = params.sampling.top_p; + diff_params.top_k = params.sampling.top_k; + diff_params.visual_mode = params.diffusion.visual_mode; + diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise; + + diff_params.step_callback = diffusion_step_callback; + callback_data cb_data = { &diff_params, vocab, n_input }; + diff_params.step_callback_user_data = &cb_data; + + const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" }; + const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" }; + const char * alg_name = + (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN"; + const char * sched_name = + (diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN"; + + LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps); + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length); + LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name); + LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature); + if (diff_params.schedule == TIMESTEP_BASED) { + LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp); + } + if (diff_params.schedule == BLOCK_BASED) { + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale); + } + + diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated); if (n_generated > 0) { - if (params.diffusion.visual_mode) { + if (visual_mode) { //clear screen and move cursor to top-left LOG_INF("\033[2J\033[H"); } + output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input); std::string output_data = common_detokenize(vocab, output_tokens, false); LOG_INF("\n%s\n", output_data.c_str()); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 99196c9d047..722cd7f40f0 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -65,7 +65,7 @@ int main(int argc, char ** argv) { ctx_dft = llama_init_dft.context.get(); if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { - return 1; + LOG_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str()); } // Tokenize the prompt @@ -130,7 +130,10 @@ int main(int argc, char ** argv) { params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft; params_spec.p_min = p_min; - struct common_speculative * spec = common_speculative_init(ctx_dft); + struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft); + for (auto &pair : params.speculative.replacements) { + common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str()); + } llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1); diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 316055193d8..07d6b8b67d4 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -1913,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, bcast_weight_nb[4], bcast_weight_nb[5]}; aclTensor* acl_weight_tensor; - bool weightToNZ = false; -#ifdef ASCEND_310P - weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr); -#endif - if (weightToNZ && is_matmul_weight(weight)) { + // Only check env once. + static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("")); + if (weight_to_nz && is_matmul_weight(weight)) { int64_t acl_stride[2] = {1, transpose_ne[1]}; // Reverse ne. diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 4dac2e8b789..49f55891d85 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( return GGML_STATUS_SUCCESS; } -static int CreateAclTensorWeight(const void *hostData, const std::vector &shape, void **deviceAddr, - aclDataType dataType, aclTensor **tensor) -{ - uint64_t size = 1; - for (auto i : shape) { - size *= i; +// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed +namespace { + void* g_nz_workspace = nullptr; + size_t g_nz_workspace_allocated = 0; + + void release_nz_workspace() { + if (g_nz_workspace) { + aclrtFree(g_nz_workspace); + g_nz_workspace = nullptr; + g_nz_workspace_allocated = 0; + } } - const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size()); - ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size)); - - size *= sizeof(int16_t); - - ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE); - - std::vector strides(shape.size(), 1); - for (int64_t i = shape.size() - 2; i >= 0; i--) { - strides[i] = shape[i + 1] * strides[i + 1]; + void relloc_nz_workspace(size_t new_size) { + if (new_size > g_nz_workspace_allocated) { + if (g_nz_workspace) { + aclrtFree(g_nz_workspace); + g_nz_workspace = nullptr; + } + ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); + g_nz_workspace_allocated = new_size; + } } - - *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND, - shape.data(), shape.size(), *deviceAddr); - return 0; } +/** + * @brief Convert tensor weights to NZ format using Ascend CANN API. + * + * This function creates a transposed tensor descriptor and performs the + * TransMatmulWeight operation. Converting tensor formats can significantly + * improve performance on certain hardware. + * + * @param tensor Pointer to the input ggml_tensor containing the weights. 
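+ *               Only 2-D weights are expected here; the caller asserts ne[2] == 1 and ne[3] == 1
+ *               before invoking the conversion.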
+ * @param data Pointer to the raw data buffer for the tensor weights. + * @param offset Byte offset within the tensor data buffer where weights start. + * + * @note The workspace buffer used in this function is managed globally and reused + * across calls. This reduces overhead from repeated memory allocation and deallocation. + */ static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) { - aclrtStream stream; - ACL_CHECK(aclrtCreateStream(&stream)); - - std::vector weightTransposedShape = {tensor->ne[1], tensor->ne[0]}; - void *weightTransposedDeviceAddr = nullptr; - aclTensor *weightTransposed = nullptr; - CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr, - ggml_cann_type_mapping(tensor->type), &weightTransposed); - + aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, + tensor->nb, 2, ACL_FORMAT_ND, offset); uint64_t workspaceSize = 0; aclOpExecutor *executor; - void *workspaceAddr = nullptr; // TransMatmulWeight - ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); - std::unique_ptr workspaceAddrPtrTrans(nullptr, aclrtFree); - if (workspaceSize > 0) { - ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - workspaceAddrPtrTrans.reset(workspaceAddr); - } - ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, + &workspaceSize, &executor)); + // Avoid frequent malloc/free of the workspace. + relloc_nz_workspace(workspaceSize); - size_t size = ggml_nelements(tensor) * ggml_element_size(tensor); - - aclrtMemcpy((char *)tensor->data + offset, size, - weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE); + ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr)); ACL_CHECK(aclDestroyTensor(weightTransposed)); - aclrtFree(weightTransposedDeviceAddr); } // TODO: need handle tensor which has paddings. @@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor( // For acl, synchronous functions use this default stream. // Why aclrtSynchronizeDevice? - bool weightToNZ = false; -#ifdef ASCEND_310P - weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr); -#endif + // Only check env once. + static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("")); if (!need_transform(tensor->type)) { ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE)); - if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) { + if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) { + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); weight_format_to_nz(tensor, data, offset); } } else { @@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size( size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; + // Only check env once. + static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("")); + // last line must bigger than 32, because every single op deal at // least 32 bytes. // TODO: quantized type? // int64_t line_size = ne0 * ggml_element_size(tensor); // int64_t line_size_align_32 = (line_size + 31) & ~31; // size += (line_size_align_32 - line_size); - - // TODO: not support quantized yet. - // TODO: consider un-continue tensor. 
if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { size += ggml_row_size( tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } + } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) { + // NZ format weight are not support quantized yet. + // If ND tensor transform to NZ, size may changed. + int64_t shape[] = {tensor->ne[1], tensor->ne[0]}; + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); + const aclIntArray *acl_shape = aclCreateIntArray(shape, 2); + size_t new_size; + ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, + ggml_cann_type_mapping(tensor->type), &new_size)); + ACL_CHECK(aclDestroyIntArray(acl_shape)); + size = std::max(size, new_size); } return size; @@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute( (ggml_backend_cann_context*)backend->context; ggml_cann_set_device(cann_ctx->device); + //release temp buffer create by set tensor. + release_nz_workspace(); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor* node = cgraph->nodes[i]; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c97b61d09c7..ef47ea7359e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -279,6 +279,9 @@ class Attention: class Projector: STACK_FACTOR = "clip.audio.projector.stack_factor" + class Diffusion: + SHIFT_LOGITS = "diffusion.shift_logits" + # # recommended mapping of model tensor names for storage in gguf # @@ -377,6 +380,7 @@ class MODEL_ARCH(IntEnum): LFM2 = auto() DREAM = auto() SMALLTHINKER = auto() + LLADA = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -697,6 +701,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.LFM2: "lfm2", MODEL_ARCH.DREAM: "dream", MODEL_ARCH.SMALLTHINKER: "smallthinker", + MODEL_ARCH.LLADA: "llada", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1318,6 +1323,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.LLADA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.QWEN2VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4f23f9b0246..f4fd64ad822 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1047,6 +1047,11 @@ def add_audio_num_mel_bins(self, value: int) -> None: def add_audio_stack_factor(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value) + # diffusion models + + def add_diffusion_shift_logits(self, value: bool) -> None: + self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bfd4fd37a3f..15adbfa7818 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -32,6 +32,7 @@ class TensorNameMap: "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 "encoder", # neobert + "model.transformer.wte", # llada ), # Token type embeddings @@ -71,6 +72,7 @@ class TensorNameMap: "head", # rwkv "head.out", # wavtokenizer "lm_head", # llama4 + "model.transformer.ff_out", # llada 
), # Output norm @@ -94,6 +96,7 @@ class TensorNameMap: "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 + "model.transformer.ln_f", # llada ), # Rope frequencies @@ -139,6 +142,7 @@ class TensorNameMap: "model.layers.{bid}.input_layernorm", # llama4 "transformer_encoder.{bid}.attention_norm", # neobert "model.layers.{bid}.operator_norm", # lfm2 + "model.transformer.blocks.{bid}.attn_norm", # llada ), # Attention norm 2 @@ -183,6 +187,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone "model.layers.{bid}.self_attn.q_proj", # llama4 + "model.transformer.blocks.{bid}.q_proj", # llada ), # Attention key @@ -199,6 +204,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone "model.layers.{bid}.self_attn.k_proj", # llama4 + "model.transformer.blocks.{bid}.k_proj", # llada ), # Attention value @@ -214,6 +220,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone "model.layers.{bid}.self_attn.v_proj", # llama4 + "model.transformer.blocks.{bid}.v_proj", # llada ), # Attention output @@ -246,6 +253,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 "transformer_encoder.{bid}.wo", # neobert + "model.transformer.blocks.{bid}.attn_out", # llada ), # Attention output norm @@ -291,6 +299,7 @@ class TensorNameMap: "model.layers.{bid}.post_attention_layernorm", # llama4 "transformer_encoder.{bid}.ffn_norm", # neobert "model.layers.layers.{bid}.pre_mlp_norm", # plamo2 + "model.transformer.blocks.{bid}.ff_norm", # llada ), # Post feed-forward norm @@ -364,6 +373,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w12", # neobert "model.layers.{bid}.block_sparse_moe.up", # smallthinker + "model.transformer.blocks.{bid}.up_proj", # llada ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -405,6 +415,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc_0", # exaone "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid "model.layers.{bid}.block_sparse_moe.gate", # smallthinker + "model.transformer.blocks.{bid}.ff_proj", # llada ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -454,6 +465,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w3", # neobert "model.layers.{bid}.block_sparse_moe.down", # smallthinker + "model.transformer.blocks.{bid}.ff_out", # llada ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/include/llama.h b/include/llama.h index 6f454a508a0..1a51e74a8d6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -537,6 +537,9 @@ extern "C" { // Returns true if the model is recurrent (like Mamba, RWKV, etc.) LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model); + // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) 
+ LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); + // Returns 0 on success LLAMA_API uint32_t llama_model_quantize( const char * fname_inp, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index dbf977443ae..15fb9d0b508 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -89,6 +89,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LFM2, "lfm2" }, { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" }, + { LLM_ARCH_LLADA, "llada" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1972,6 +1973,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_LLADA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2224,6 +2242,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { bool llm_arch_is_diffusion(const llm_arch & arch) { switch (arch) { case LLM_ARCH_DREAM: + case LLM_ARCH_LLADA: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index 8267a8d3aa4..8ea80806c9c 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -93,6 +93,7 @@ enum llm_arch { LLM_ARCH_LFM2, LLM_ARCH_DREAM, LLM_ARCH_SMALLTHINKER, + LLM_ARCH_LLADA, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ee861bd7ec1..491a26b6346 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -785,13 +785,20 @@ ggml_tensor * llm_graph_context::build_moe_ffn( bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, - int il) const { + int il, + ggml_tensor * probs_in) const { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN - ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); + ggml_tensor * logits = nullptr; + + if (probs_in == nullptr) { + logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + } else { + logits = probs_in; + } ggml_tensor * probs = nullptr; switch (gating_op) { @@ -884,6 +891,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cur = ggml_gelu(ctx0, cur); cb(cur, "ffn_moe_gelu", il); } break; + case LLM_FFN_RELU: + if (gate_exps) { + cur = ggml_reglu_split(ctx0, cur, up); + cb(cur, "ffn_moe_reglu", il); + } else { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_moe_relu", il); + } break; default: GGML_ABORT("fatal error"); } @@ -927,100 +942,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( return moe_out; } -ggml_tensor * llm_graph_context::build_moe_ffn_from_probs( - ggml_tensor * cur, - ggml_tensor * probs, - ggml_tensor * up_exps, - ggml_tensor * gate_exps, - ggml_tensor * down_exps, - ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llama_expert_gating_func_type gating_op, - int il) const { - const int64_t n_embd = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as 
it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx0, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); - if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) { - weights = ggml_soft_max(ctx0, weights); - } else { - weights = ggml_sigmoid(ctx0, weights); - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - } - - weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); - - cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * experts = nullptr; - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(cur, "ffn_moe_gate", il); - - cur = ggml_reglu_split(ctx0, cur, up); - cb(cur, "ffn_moe_reglu", il); - - experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx0, experts, weights); - cb(cur, "ffn_moe_weighted", il); - - ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr }; - - assert(n_expert_used > 0); - - // order the views before the adds - for (uint32_t i = 0; i < hparams.n_expert_used; ++i) { - cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]); - - ggml_build_forward_expand(gf, cur_experts[i]); - } - - // aggregate experts - // note: here we explicitly use hparams.n_expert_used instead of n_expert_used - // to avoid potentially a large number of add nodes during warmup - // ref: https://github.com/ggml-org/llama.cpp/pull/14753 - ggml_tensor * moe_out = cur_experts[0]; - - for (uint32_t i = 1; i < hparams.n_expert_used; ++i) { - moe_out = ggml_add(ctx0, moe_out, cur_experts[i]); - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx0, moe_out); - } - - cb(moe_out, "ffn_moe_out", il); - - return moe_out; -} - // input embeddings with optional lora ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { const int64_t n_embd = hparams.n_embd; diff --git a/src/llama-graph.h b/src/llama-graph.h index 55a6b6f3e05..94d778f3847 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -631,19 +631,8 @@ struct llm_graph_context { bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, - int il) const; - - ggml_tensor * build_moe_ffn_from_probs( - ggml_tensor * cur, - ggml_tensor * probs, - ggml_tensor * up_exps, - ggml_tensor * gate_exps, - ggml_tensor * down_exps, - ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llama_expert_gating_func_type gating_op, - int il) const; + int il, + ggml_tensor * probs_in = nullptr) const; // 
// inputs diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e3aa9e6f91a..56c2ecd4cab 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -869,6 +869,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.causal_attn = false; } break; + case LLM_ARCH_LLADA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion + switch (hparams.n_layer) { + case 32: + type = LLM_TYPE_8B; + break; + default: + type = LLM_TYPE_UNKNOWN; + } + // Set non-causal attention for diffusion models + hparams.causal_attn = false; + } + break; case LLM_ARCH_QWEN2MOE: { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); @@ -2149,6 +2164,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; + case LLM_ARCH_LLADA: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = + create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + + // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock + layer.wq = + create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false + layer.wo = + create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 }, + TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + + // optional MLP bias + layer.ffn_gate_b = + create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = + create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED); + } + } + break; case LLM_ARCH_LLAMA4: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8042,6 +8104,106 @@ struct llm_build_dream : public llm_graph_context { } }; +struct llm_build_llada : public llm_graph_context { + llm_build_llada(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + // LLaDA is similar to LLaMA but uses non-causal attention for diffusion + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // Non-causal attention for diffusion + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, + 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + 
inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_qwen2vl : public llm_graph_context { llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17158,10 +17320,18 @@ struct llm_build_smallthinker : public llm_graph_context{ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - ggml_tensor * ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, - nullptr, n_expert, n_expert_used, - static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il); + ggml_tensor * ffn_out = + build_moe_ffn(cur, + nullptr, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_RELU, true, + false, 0.0, + static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), + il, probs); cb(ffn_out, "ffn_out", il); cur = ffn_out;
@@ -17201,6 +17371,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NEO_BERT: case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_DREAM: + case LLM_ARCH_LLADA: { res = nullptr; } break;
@@ -17367,6 +17538,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique<llm_build_dream>(*this, params); } break; + case LLM_ARCH_LLADA: + { + llm = std::make_unique<llm_build_llada>(*this, params); + } + break; case LLM_ARCH_QWEN2VL: { llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17765,6 +17941,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: + case LLM_ARCH_LLADA: case LLM_ARCH_LLAMA4: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN:
@@ -17943,6 +18120,10 @@ bool llama_model_is_recurrent(const llama_model * model) { return llm_arch_is_recurrent(model->arch); } +bool llama_model_is_diffusion(const llama_model * model) { + return llm_arch_is_diffusion(model->arch); +} + const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) { return model->tensors_by_name; }
diff --git a/tools/server/README.md b/tools/server/README.md index f3f4caed85c..87cef75730a 100644 --- a/tools/server/README.md +++ b/tools/server/README.md
@@ -469,7 +469,7 @@ These words will not be included in the completion, so make sure to add them to `ignore_eos`: Ignore end of stream token and continue generating. Default: `false` -`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]` +`logit_bias`: Modify the likelihood of a token appearing in the generated text completion.
For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. For compatibility with the OpenAI API, a JSON object `{"<token id or string>": bias, ...}` can also be passed. Default: `[]` `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 2e4c40af783..35d6610428e 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp
@@ -473,6 +473,33 @@ struct server_task { } } } + } else if (logit_bias != data.end() && logit_bias->is_object()) { + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & el : logit_bias->items()) { + float bias; + const auto & key = el.key(); + const auto & value = el.value(); + if (value.is_number()) { + bias = value.get<float>(); + } else if (value.is_boolean() && !value.get<bool>()) { + bias = -INFINITY; + } else { + continue; + } + + char *end; + llama_token tok = strtol(key.c_str(), &end, 10); + if (*end == 0) { + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else { + auto toks = common_tokenize(vocab, key, false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } } params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
@@ -1902,6 +1929,7 @@ struct server_context { mtmd_context * mctx = nullptr; const llama_vocab * vocab = nullptr; + bool vocab_dft_compatible = true; llama_model * model_dft = nullptr;
@@ -1992,10 +2020,9 @@ struct server_context { return false; } - if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) { - SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); - - return false; + vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); + if (!vocab_dft_compatible) { + SRV_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); } const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); @@ -2085,11 +2112,14 @@ struct server_context { return; } - slot.spec = common_speculative_init(slot.ctx_dft); + slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft); if (slot.spec == nullptr) { SRV_ERR("%s", "failed to create speculator\n"); return; } + for (auto &pair : params_base.speculative.replacements) { + common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); + } } SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 7ee9a165140..6c6f64f5e2e 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -351,3 +351,32 @@ def test_logprobs_stream(): assert token.top_logprobs is not None assert len(token.top_logprobs) > 0 assert aggregated_text == output_text + + +def test_logit_bias(): + global server + server.start() + + exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"] + + res = server.make_request("POST", "/tokenize", data={ + "content": " " + " ".join(exclude) + " ", + }) + assert res.status_code == 200 + tokens = res.body["tokens"] + logit_bias = {tok: -100 for tok in tokens} + + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + res = client.chat.completions.create( + model="gpt-3.5-turbo-instruct", + temperature=0.0, + messages=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_tokens=64, + logit_bias=logit_bias + ) + output_text = res.choices[0].message.content + assert output_text + assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude) diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index f6909e9ae78..be3a0052c64 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -444,6 +444,39 @@ def test_n_probs_post_sampling(): assert any(prob["prob"] == 1.0 for prob in tok["top_probs"]) +@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)]) +def test_logit_bias(tokenize, openai_style): + global server + server.start() + + exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"] + + logit_bias = [] + if tokenize: + res = server.make_request("POST", "/tokenize", data={ + "content": " " + " ".join(exclude) + " ", + }) + assert res.status_code == 200 + tokens = res.body["tokens"] + logit_bias = [[tok, -100] for tok in tokens] + + else: + logit_bias = [[" " + tok + " ", -100] for tok in exclude] + + if openai_style: + logit_bias = {el[0]: -100 for el in logit_bias} + + res = server.make_request("POST", "/completion", data={ + "n_predict": 64, + "prompt": "What is the best book", + "logit_bias": logit_bias, + "temperature": 0.0 + }) + assert res.status_code == 200 + output_text = res.body["content"] + assert 
all(output_text.find(" " + tok + " ") == -1 for tok in exclude) + + def test_cancel_request(): global server server.n_ctx = 4096