diff --git a/common/arg.cpp b/common/arg.cpp index 060053595db..7744fd6c488 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { string_process_escapes(seq_breaker); } + for (auto & pair : params.speculative.replacements) { + string_process_escapes(pair.first); + string_process_escapes(pair.second); + } } if (!params.kv_overrides.empty()) { @@ -3249,6 +3253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.model.path = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + add_opt(common_arg( + {"--spec-replace"}, "TARGET", "DRAFT", + "translate the string in TARGET into DRAFT if the draft model and main model are not compatible", + [](common_params & params, const std::string & tgt, const std::string & dft) { + params.speculative.replacements.push_back({ tgt, dft }); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-ctkd", "--cache-type-k-draft"}, "TYPE", string_format( @@ -3438,12 +3449,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); - // diffusion parameters add_opt(common_arg( { "--diffusion-steps" }, "N", string_format("number of diffusion steps (default: %d)", params.diffusion.steps), [](common_params & params, int value) { params.diffusion.steps = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-visual" }, + string_format("enable visual diffusion mode (show progressive generation) (default: %s)", + params.diffusion.visual_mode ? "true" : "false"), + [](common_params & params) { params.diffusion.visual_mode = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( { "--diffusion-eps" }, "F", string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps), @@ -3451,21 +3468,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( { "--diffusion-algorithm" }, "N", - string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)", + string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm), [](common_params & params, int value) { params.diffusion.algorithm = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( { "--diffusion-alg-temp" }, "F", - string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp), + string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp), [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( - { "--diffusion-visual" }, - string_format("enable visual diffusion mode (show progressive generation) (default: %s)", - params.diffusion.visual_mode ? 
"true" : "false"), - [](common_params & params) { params.diffusion.visual_mode = true; } + { "--diffusion-block-length" }, "N", + string_format("llada block length for generation (default: %d)", params.diffusion.block_length), + [](common_params & params, int value) { params.diffusion.block_length = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-cfg-scale" }, "F", + string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale), + [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-add-gumbel-noise" }, "F", + string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"), + [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + return ctx_arg; } diff --git a/common/common.h b/common/common.h index 00f42694eaf..f5acf37ff9f 100644 --- a/common/common.h +++ b/common/common.h @@ -201,6 +201,7 @@ struct common_params_speculative { int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) float p_split = 0.1f; // speculative decoding split probability float p_min = 0.75f; // minimum speculative decoding probability (greedy) + std::vector> replacements; // main to speculative model replacements ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V @@ -220,11 +221,17 @@ struct common_params_vocoder { }; struct common_params_diffusion { - int32_t steps = 64; // number of diffusion steps - float eps = 1e-3f; // epsilon for timesteps - int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY) - float alg_temp = 0.0f; // algorithm temperature - bool visual_mode = false; // show progressive diffusion on screen + int32_t steps = 128; + bool visual_mode = false; + + float eps = 0; // epsilon for timesteps + int32_t block_length = 32; // block length for generation + + int32_t algorithm = 4; // default algorithm: low-confidence + float alg_temp = 0.0f; // algorithm temperature + + float cfg_scale = 0; // classifier-free guidance scale + bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0 }; enum common_reasoning_format { diff --git a/common/speculative.cpp b/common/speculative.cpp index 843bd1ddbdb..262b2c23e72 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1,30 +1,39 @@ #include "speculative.h" +#include "ggml.h" +#include "llama.h" #include "log.h" #include "common.h" #include "sampling.h" #include #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 struct common_speculative { - struct llama_context * ctx; + struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft + struct llama_context * ctx_dft; struct common_sampler * smpl; llama_batch batch; - llama_tokens prompt; + llama_tokens prompt_dft; + bool vocab_dft_compatible = true; // whether retokenization is needed + std::map tgt_dft_replacements = {}; }; struct common_speculative * common_speculative_init( + struct llama_context * ctx_tgt, struct llama_context * ctx_dft) { auto * result = new common_speculative { - /* .ctx = */ ctx_dft, - /* .smpl = */ nullptr, - /* 
.batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), - /* .prompt = */ {}, + /* .ctx_tgt = */ ctx_tgt, + /* .ctx_dft = */ ctx_dft, + /* .smpl = */ nullptr, + /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), + /* .prompt_dft = */ {}, + /* .vocab_dft_compatible = */ false, }; // TODO: optimize or pass from outside? @@ -59,6 +68,9 @@ struct common_speculative * common_speculative_init( } #endif + result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft); + LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible); + return result; } @@ -75,8 +87,8 @@ void common_speculative_free(struct common_speculative * spec) { } bool common_speculative_are_compatible( - const struct llama_context * ctx_tgt, - const struct llama_context * ctx_dft) { + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft) { const struct llama_model * model_tgt = llama_get_model(ctx_tgt); const struct llama_model * model_dft = llama_get_model(ctx_dft); @@ -90,31 +102,32 @@ bool common_speculative_are_compatible( LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { - LOG_ERR("%s: draft model vocab type must match target model to use speculation but " - "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt); + LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__); + LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); return false; } - if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) || + if ( + llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) || llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) || llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) || - llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) { - LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__); - LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt)); - LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft)); + llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft) + ) { + LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__); return false; } { const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); - - const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); + const int vocab_diff = n_vocab_tgt > n_vocab_dft + ? 
n_vocab_tgt - n_vocab_dft + : n_vocab_dft - n_vocab_tgt; if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " - "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__); + LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return false; } @@ -122,8 +135,8 @@ bool common_speculative_are_compatible( const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i); const char * token_text_dft = llama_vocab_get_text(vocab_dft, i); if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but " - "token %d content differs - target '%s', draft '%s'\n", __func__, i, + LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__); + LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i, common_token_to_piece(ctx_tgt, i).c_str(), common_token_to_piece(ctx_dft, i).c_str()); return false; @@ -134,32 +147,93 @@ bool common_speculative_are_compatible( return true; } +void common_speculative_add_replacement_tgt_dft( + struct common_speculative * spec, + const char *source, const char *dest) { + spec->tgt_dft_replacements[source] = dest; +} + +static std::string replace_to_dft( + struct common_speculative * spec, + const std::string& input) { + std::string result = input; + for (const auto & pair : spec->tgt_dft_replacements) { + size_t pos = result.find(pair.first); + while (pos != std::string::npos) { + result.replace(pos, pair.first.length(), pair.second); + pos = result.find(pair.first, pos + pair.second.length()); + } + } + return result; +} + +static std::string replace_to_tgt( + struct common_speculative * spec, + const std::string& input) { + std::string result = input; + for (const auto& pair : spec->tgt_dft_replacements) { + size_t pos = result.find(pair.second); + while (pos != std::string::npos) { + result.replace(pos, pair.second.length(), pair.first); + pos = result.find(pair.second, pos + pair.first.length()); + } + } + return result; +} + + llama_tokens common_speculative_gen_draft( struct common_speculative * spec, struct common_speculative_params params, - const llama_tokens & prompt_tgt, + const llama_tokens & prompt_tgt_main_model, // specified in target model vocab llama_token id_last) { auto & batch = spec->batch; - auto & ctx = spec->ctx; + auto & ctx_tgt = spec->ctx_tgt; + auto & ctx_dft = spec->ctx_dft; auto & smpl = spec->smpl; - auto & prompt = spec->prompt; + auto & prompt_dft = spec->prompt_dft; - auto * mem = llama_get_memory(ctx); + auto * mem_dft = llama_get_memory(ctx_dft); int reuse_i = 0; int reuse_n = 0; - const int n_ctx = llama_n_ctx(ctx) - params.n_draft; + const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft; + + llama_tokens prompt_tgt_draft_model; + if (!spec->vocab_dft_compatible) { + std::string text; + text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true); + text = replace_to_dft(spec, text); + LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str()); + prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true); + + // convert id_last to draft 
vocab. llama_detokenize is called directly to avoid an allocation + const auto * model_tgt = llama_get_model(ctx_tgt); + const auto * vocab_tgt = llama_model_get_vocab(model_tgt); + + int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false); + GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last"); + text.resize(-n_chars); + llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false); + text = replace_to_dft(spec, text); + + LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str()); + id_last = common_tokenize(ctx_dft, text, false, true)[0]; + } + // prompt_tgt's tokens will always be compatible with ctx_dft + const llama_tokens &prompt_tgt = + spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model; const int i_start = std::max(0, (int) prompt_tgt.size() - n_ctx); // reuse as much as possible from the old draft context // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt - for (int i = 0; i < (int) prompt.size(); ++i) { + for (int i = 0; i < (int) prompt_dft.size(); ++i) { int cur = 0; while (i_start + cur < (int) prompt_tgt.size() && - i + cur < (int) prompt.size() && - prompt_tgt[i_start + cur] == prompt[i + cur]) { + i + cur < (int) prompt_dft.size() && + prompt_tgt[i_start + cur] == prompt_dft[i + cur]) { cur++; } @@ -169,21 +243,20 @@ llama_tokens common_speculative_gen_draft( } } - LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); + LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size()); llama_tokens result; result.reserve(params.n_draft); if (reuse_n == 0) { - llama_memory_clear(mem, false); - - prompt.clear(); + llama_memory_clear(mem_dft, false); + prompt_dft.clear(); } else { // this happens when a previous draft has been discarded (for example, due to being too small), but the // target model agreed with it. 
in this case, we simply pass back the previous results to save compute - if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { - for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { - result.push_back(prompt[i]); + if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) { + for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) { + result.push_back(prompt_dft[i]); if (params.n_draft <= (int) result.size()) { break; @@ -194,16 +267,15 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_memory_seq_rm (mem, 0, 0, reuse_i); - llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i); + llama_memory_seq_rm (mem_dft, 0, 0, reuse_i); + llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i); - prompt.erase(prompt.begin(), prompt.begin() + reuse_i); + prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i); } - if (reuse_n < (int) prompt.size()) { - llama_memory_seq_rm (mem, 0, reuse_n, -1); - - prompt.erase(prompt.begin() + reuse_n, prompt.end()); + if (reuse_n < (int) prompt_dft.size()) { + llama_memory_seq_rm (mem_dft, 0, reuse_n, -1); + prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end()); } } @@ -214,28 +286,28 @@ llama_tokens common_speculative_gen_draft( //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); - prompt.push_back(prompt_tgt[i]); + prompt_dft.push_back(prompt_tgt[i]); } // we should rarely end-up here during normal decoding if (batch.n_tokens > 0) { //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str()); - llama_decode(ctx, batch); + llama_decode(ctx_dft, batch); } - const llama_pos n_past = prompt.size(); + const llama_pos n_past = prompt_dft.size(); LOG_DBG("%s: n_past = %d\n", __func__, n_past); common_batch_clear(batch); common_batch_add (batch, id_last, n_past, { 0 }, true); - prompt.push_back(id_last); + prompt_dft.push_back(id_last); - //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str()); + LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str()); - llama_decode(ctx, batch); + llama_decode(ctx_dft, batch); common_sampler_reset(smpl); @@ -243,13 +315,13 @@ llama_tokens common_speculative_gen_draft( for (int i = 0; i < params.n_draft; ++i) { common_batch_clear(batch); - common_sampler_sample(smpl, ctx, 0, true); + common_sampler_sample(smpl, ctx_dft, 0, true); const auto * cur_p = common_sampler_get_candidates(smpl); for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str()); + k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } // add drafted token for each sequence @@ -271,10 +343,19 @@ llama_tokens common_speculative_gen_draft( common_batch_add(batch, id, n_past + i + 1, { 0 }, true); // evaluate the drafted tokens on the draft model - llama_decode(ctx, batch); + llama_decode(ctx_dft, batch); - prompt.push_back(id); + prompt_dft.push_back(id); } + if (!spec->vocab_dft_compatible) { + std::string detokenized = common_detokenize(ctx_dft, result, true); + detokenized = replace_to_tgt(spec, detokenized); + LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str()); + result = 
common_tokenize(ctx_tgt, detokenized, false, true); + if (result.size() > (size_t)params.n_draft) { + result.resize(params.n_draft); + } + } return result; } diff --git a/common/speculative.h b/common/speculative.h index 2b51a70ca1f..e69d7aaa1eb 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -12,7 +12,10 @@ struct common_speculative_params { float p_min = 0.75f; // min probability required to accept a token in the draft }; -struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); +struct common_speculative * common_speculative_init( + struct llama_context * ctx_tgt, + struct llama_context * ctx_dft +); void common_speculative_free(struct common_speculative * spec); @@ -20,6 +23,10 @@ bool common_speculative_are_compatible( const struct llama_context * ctx_tgt, const struct llama_context * ctx_dft); +void common_speculative_add_replacement_tgt_dft( + struct common_speculative * spec, + const char *source, const char *dest); + // sample up to n_draft tokens and add them to the batch using the draft model llama_tokens common_speculative_gen_draft( struct common_speculative * spec, diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3f5cefe007c..db4112318d4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2904,6 +2904,107 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("LLaDAModelLM") +class LLaDAModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA + undo_permute = True + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + vocab_dict = tokenizer.get_vocab() + vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) + assert max(vocab_dict.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + # Check if it's a special token - treat special tokens as CONTROL tokens + if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + def set_vocab(self): + self._set_vocab_gpt2() + + # LLaDA specific parameters + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + # Add parameters similar to LlamaModel + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + n_heads = hparams.get("num_attention_heads", hparams.get("n_heads")) + rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads + self.gguf_writer.add_rope_dimension_count(rope_dim) + + # Set context length for 
LLaDA + context_length = self.hparams.get("max_sequence_length", 4096) + self.gguf_writer.add_context_length(context_length) + + # Set embedding length (dimension size) + embedding_length = self.hparams.get("d_model", 4096) + self.gguf_writer.add_embedding_length(embedding_length) + + # Set feed forward length (MLP hidden size) + feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) + self.gguf_writer.add_feed_forward_length(feed_forward_length) + + # LLaDA models use non-causal attention for diffusion, similar to Dream + self.gguf_writer.add_causal_attention(False) + + # LLaDA models don't shift their logits + self.gguf_writer.add_diffusion_shift_logits(False) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads")) + n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads")) + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head) + + # LLaDA model tensors should be mapped directly since it's the base model + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Ernie4_5_ForCausalLM") class Ernie4_5Model(TextModel): model_arch = gguf.MODEL_ARCH.ERNIE4_5 diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 2b001f09abe..325e09bd380 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -310,5 +310,7 @@ Specifies the memory pool management strategy: Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies. -## TODO -- Support more models and data types. +### GGML_CANN_WEIGHT_NZ + +Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU. 
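+
+For example, assuming a 310I DUO device, the conversion can be enabled for a run by setting the variable at launch time, e.g. `GGML_CANN_WEIGHT_NZ=1 ./llama-cli -m model.gguf` (where `model.gguf` stands in for your model path). The variable is only read once at startup, so it cannot be toggled after the process has loaded its weights.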
+ diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md new file mode 100644 index 00000000000..26de5668aa8 --- /dev/null +++ b/examples/diffusion/README.md @@ -0,0 +1,13 @@ +# Diffusion Text Generation + +This directory contains implementations for Diffusion LLMs (DLLMs) + +More Info: +- https://github.com/ggml-org/llama.cpp/pull/14644 +- https://github.com/ggml-org/llama.cpp/pull/14771 + + +Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual` + +Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual` + diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 3e11ce1160b..8431dcea8fe 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -5,67 +5,212 @@ #include "log.h" #include -#include -#include + #include #include +#include #include #include +#include +#include -typedef bool (*diffusion_step_callback_t)(int32_t step, - int32_t total_steps, - const llama_token * tokens, - int32_t n_tokens, - void * user_data); - -enum diffusion_alg { - DIFFUSION_ALG_ORIGIN = 0, - DIFFUSION_ALG_MASKGIT_PLUS = 1, - DIFFUSION_ALG_TOPK_MARGIN = 2, - DIFFUSION_ALG_ENTROPY = 3, +enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 }; + +// Unified transfer scheduling methods +enum transfer_schedule { + TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining + BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens }; +typedef bool (*diffusion_step_callback_t)(int32_t step, + int32_t total_steps, + const llama_token * tokens, + int32_t n_tokens, + void * user_data); + struct diffusion_params { - int32_t steps; - float eps; - float temperature; - float top_p; - int32_t top_k; - llama_token mask_token_id; - enum diffusion_alg algorithm; - float alg_temp; - diffusion_step_callback_t step_callback; - void * step_callback_user_data; - int32_t seed; + int32_t steps = 0; + float temperature = 0; + llama_token mask_token_id = LLAMA_TOKEN_NULL; + diffusion_step_callback_t step_callback = nullptr; + void * step_callback_user_data = nullptr; + int32_t seed = 0; + bool visual_mode = false; + bool shift_logits = false; // Shift logits by -1 after decode + + float top_p = 0.; + int32_t top_k = 0.; + + diffusion_algorithm algorithm = CONFIDENCE_BASED; + transfer_schedule schedule = TIMESTEP_BASED; + + float cfg_scale = 0.; // Config scale for classifier-free guidance + float eps = 0.; // Timestep scheduling + int32_t block_length = 0; // Block size (for block scheduling) + float alg_temp = 0; // algorithm temperature (0.0 = deterministic) + bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0 + + int32_t max_length = 0; // Maximum sequence length }; +struct callback_data { + diffusion_params * diff_params; + const llama_vocab * vocab; + int32_t n_input; +}; + +static float calculate_confidence(const llama_token_data_array & cur_p, + diffusion_algorithm algorithm, + std::mt19937 & rng) { + switch (algorithm) { + case CONFIDENCE_BASED: + return cur_p.data[cur_p.selected].p; // Selected token probability + + case ENTROPY_BASED: + { + float entropy = 0.0f; + const float epsilon = 1e-10f; + for (size_t i = 0; i < cur_p.size; i++) 
{ + float prob = cur_p.data[i].p; + entropy += prob * logf(prob + epsilon); + } + return -entropy; // Higher entropy = lower confidence + } + + case MARGIN_BASED: + return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p; + + case RANDOM: + { + std::uniform_real_distribution uniform(0.0f, 1.0f); + return uniform(rng); // Random confidence + } + + case ORIGIN: + return cur_p.data[cur_p.selected].p; + + default: + return 0.0f; + } +} + +// Unified transfer count calculation function +static int32_t calculate_transfer_count(int32_t step, + int32_t total_steps, + int32_t remaining_masked, + transfer_schedule schedule, + float eps, + const std::vector & num_transfer_tokens = {}) { + switch (schedule) { + case TIMESTEP_BASED: + { + float t = 1.0f - (float) step / total_steps * (1.0f - eps); + float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps); + float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f; + return (int32_t) (remaining_masked * p_transfer); + } + + case BLOCK_BASED: + if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) { + return num_transfer_tokens[step]; + } + return remaining_masked / (total_steps - step); // Fallback + + default: + return remaining_masked / (total_steps - step); + } +} + +static bool diffusion_step_callback(int32_t step, + int32_t total_steps, + const llama_token * tokens, + int32_t n_tokens, + void * user_data) { + (void) user_data; + + callback_data * data = static_cast(user_data); + + auto print_progress_bar = [](int32_t step, int32_t total_steps) { + int progress_percent = (step * 100) / total_steps; + int progress_bars = (step * 50) / total_steps; + LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", + step, + total_steps, + std::string(progress_bars, '=').c_str(), + std::string(50 - progress_bars, ' ').c_str(), + progress_percent); + }; + + if (data->diff_params->visual_mode) { + // Visual mode: clear + LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left + + print_progress_bar(step, total_steps); + + LOG_INF("\n"); + + std::string current_text = " "; + + for (int32_t i = data->n_input; i < n_tokens; i++) { + std::string token_str; + if (tokens[i] != llama_vocab_mask(data->vocab)) { + char piece[256]; + int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); + if (n_chars > 0) { + piece[n_chars] = '\0'; + token_str = piece; + } + } else { + token_str = " "; + } + + current_text += token_str; + } -static diffusion_params diffusion_default_params() { - diffusion_params params = {}; - params.steps = 64; - params.eps = 1e-3f; - params.temperature = 0.2f; - params.top_p = 0.95f; - params.top_k = 0; - params.mask_token_id = LLAMA_TOKEN_NULL; - params.algorithm = DIFFUSION_ALG_ORIGIN; - params.alg_temp = 0.0f; - params.step_callback = nullptr; - params.step_callback_user_data = nullptr; - params.seed = 0; - return params; + LOG_INF("%s\n", current_text.c_str()); + } else { + print_progress_bar(step, total_steps); + } + + return true; } -static void diffusion_generate(llama_context * ctx, - const llama_token * input_tokens, - llama_token * output_tokens, - int32_t n_input, - int32_t max_length, - struct diffusion_params params, - int32_t & n_generated) { +static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) { + if (temperature == 0.0f) { + return; + } + + std::uniform_real_distribution uniform(0.0, 1.0); + for (int32_t i = 0; i < n_vocab; i++) { + double noise = uniform(rng); + // Prevent log(0) 
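+        // Gumbel-max view: for temperature T, taking the argmax of exp(logit_i) / (-log u_i)^T with
+        // u_i ~ Uniform(0, 1) is distributed like a draw from softmax(logits / T), so this noise
+        // injection stands in for an explicit categorical sample; T < 1 sharpens and T > 1 flattens
+        // the effective distribution.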
+ noise = std::max(noise, 1e-20); + double gumbel_noise = std::pow(-std::log(noise), temperature); + logits[i] = std::exp(logits[i]) / gumbel_noise; + } +} + +static std::vector get_num_transfer_tokens(int32_t mask_count, int32_t steps) { + std::vector num_transfer_tokens(steps); + + int32_t base = mask_count / steps; + int32_t remainder = mask_count % steps; + + for (int32_t i = 0; i < steps; i++) { + num_transfer_tokens[i] = base + (i < remainder ? 1 : 0); + } + + return num_transfer_tokens; +} +static void diffusion_generate(llama_context * ctx, + const llama_token * input_tokens, + llama_token * output_tokens, + int32_t n_input, + const diffusion_params & params, + int32_t & n_generated) { n_generated = 0; - if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) { + if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) { return; } @@ -73,27 +218,21 @@ static void diffusion_generate(llama_context * ctx, // Initialize with input and pad with mask tokens std::copy(input_tokens, input_tokens + n_input, output_tokens); - std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id); + std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id); std::mt19937 rng(params.seed); - std::vector timesteps(params.steps + 1); - for (int32_t i = 0; i <= params.steps; i++) { - timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps); - } - llama_set_causal_attn(ctx, false); int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); std::vector candidates(n_vocab); - std::vector conf_candidates; - conf_candidates.reserve(max_length); - + conf_candidates.reserve(params.max_length); std::vector mask_positions; - mask_positions.reserve(max_length); + mask_positions.reserve(params.max_length); + // Setup sampler chain struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params()); if (params.top_k > 0) { llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k)); @@ -108,210 +247,269 @@ static void diffusion_generate(llama_context * ctx, struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed); - llama_batch batch = llama_batch_init(max_length, 0, 1); - batch.n_tokens = max_length; + llama_batch batch = llama_batch_init(params.max_length, 0, 1); + batch.n_tokens = params.max_length; - int64_t total_sampling_time = 0; - int64_t total_time = 0; + // Pre-allocate buffers for CFG if needed + int32_t logits_size = n_vocab * params.max_length; + std::vector cond_logits_buffer; + std::vector un_x_buffer; + if (params.cfg_scale > 0.0f) { + cond_logits_buffer.resize(logits_size); + un_x_buffer.resize(params.max_length); + } - int64_t time_start = ggml_time_us(); - for (int32_t step = 0; step < params.steps; step++) { - if (params.step_callback) { - if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) { - break; - } - } + // For block-based processing + std::vector num_transfer_tokens; + int32_t num_blocks = 1; + int32_t steps_per_block = params.steps; - for (int32_t i = 0; i < max_length; i++) { - batch.token[i] = output_tokens[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } + if (params.schedule == BLOCK_BASED) { + GGML_ASSERT(params.max_length % params.block_length == 0); + num_blocks = params.max_length / params.block_length; + GGML_ASSERT(params.steps % num_blocks == 0); + steps_per_block = 
params.steps / num_blocks; + } - int ret = llama_decode(ctx, batch); - if (ret != 0) { - LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret); - break; - } + std::vector confidence(params.max_length); - float * raw_logits = llama_get_logits(ctx); - if (!raw_logits) { - LOG_ERR("%s: failed to get logits at step %d\n", __func__, step); - break; + int64_t total_sampling_time = 0; + int64_t total_time = 0; + int64_t time_start = ggml_time_us(); + + for (int block_num = 0; block_num < num_blocks; block_num++) { + int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; + int32_t block_end = (params.schedule == BLOCK_BASED) ? + std::min(n_input + (block_num + 1) * params.block_length, params.max_length) : + params.max_length; + + // Count masked tokens in current block for block-based processing + if (params.schedule == BLOCK_BASED) { + int32_t block_mask_count = 0; + for (int i = block_start; i < block_end; i++) { + if (output_tokens[i] == params.mask_token_id) { + block_mask_count++; + } + } + num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block); } - auto get_logits_for_pos = [&](int32_t pos) -> const float * { - return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab; - }; - - int64_t time_start_sampling = ggml_time_us(); + for (int32_t step = 0; step < steps_per_block; step++) { + int32_t global_step = block_num * steps_per_block + step; - mask_positions.clear(); - for (int32_t i = 0; i < max_length; i++) { - if (output_tokens[i] == params.mask_token_id) { - mask_positions.push_back(i); + if (params.step_callback) { + if (!params.step_callback( + global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) { + break; + } } - } - if (mask_positions.empty()) { - break; - } + // Setup batch + for (int32_t i = 0; i < params.max_length; i++) { + batch.token[i] = output_tokens[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } - float t = timesteps[step]; - float s = timesteps[step + 1]; + float * logits = nullptr; - if (params.algorithm == DIFFUSION_ALG_ORIGIN) { - float p_transfer = (step < params.steps - 1) ? 
(1.0f - s / t) : 1.0f; + if (params.cfg_scale > 0.0f) { + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate conditional"); + break; + } + float * cond_logits_ptr = llama_get_logits(ctx); + std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float)); - for (int32_t pos : mask_positions) { - if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { - const float * pos_logits = get_logits_for_pos(pos); - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].id = token_id; - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - } + // Unconditional generation (mask input) + std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin()); + for (int32_t i = 0; i < n_input; i++) { + un_x_buffer[i] = params.mask_token_id; + } - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ (size_t) n_vocab, // Reset size to full vocab - /* .selected = */ -1, - /* .sorted = */ false, - }; + for (int32_t i = 0; i < params.max_length; i++) { + batch.token[i] = un_x_buffer[i]; + } + ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate unconditional"); + break; + } + float * uncond_logits = llama_get_logits(ctx); - llama_sampler_apply(sampler, &cur_p); - output_tokens[pos] = cur_p.data[cur_p.selected].id; + // Apply CFG + for (int32_t i = 0; i < logits_size; i++) { + cond_logits_buffer[i] = + uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]); } - } - } else { - std::vector> confidences; - std::vector sampled_tokens(mask_positions.size()); - - for (size_t i = 0; i < mask_positions.size(); i++) { - int32_t pos = mask_positions[i]; - const float * pos_logits = get_logits_for_pos(pos); - - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - candidates[token_id].id = token_id; + logits = cond_logits_buffer.data(); + } else { + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret); + break; } + logits = llama_get_logits(ctx); + } - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; + if (!logits) { + LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step); + break; + } - llama_sampler_apply(sampler, &cur_p); + auto get_logits_for_pos = [&](int32_t pos) -> const float * { + if (params.shift_logits) { + return pos == 0 ? 
logits : logits + (pos - 1) * n_vocab; + } + return logits + (pos) *n_vocab; + }; - llama_token sampled_token = cur_p.data[cur_p.selected].id; + int64_t time_start_sampling = ggml_time_us(); - float confidence = 0.0f; - if (params.algorithm == DIFFUSION_ALG_ENTROPY) { - const float epsilon = 1e-10f; - for (size_t j = 0; j < cur_p.size; j++) { - float prob = cur_p.data[j].p; - confidence += prob * logf(prob + epsilon); + mask_positions.clear(); + for (int32_t i = 0; i < params.max_length; i++) { + if (output_tokens[i] == params.mask_token_id) { + // For block-based, only consider current block + if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) { + mask_positions.push_back(i); } - } else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) { - confidence = cur_p.data[0].p - cur_p.data[1].p; - } else { - confidence = cur_p.data[cur_p.selected].p; } + } - sampled_tokens[i] = sampled_token; - confidences.emplace_back(confidence, i); + if (mask_positions.empty()) { + break; } - int32_t num_transfer = - (step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size(); - - if (num_transfer > 0) { - if (params.alg_temp == 0.0f) { - std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(), - [](const std::pair & a, const std::pair & b) { - if (a.first != b.first) { - return a.first > b.first; - } - return a.second < b.second; - }); - } else { - conf_candidates.clear(); - - for (int32_t pos = 0; pos < max_length; pos++) { - float conf_logit = -std::numeric_limits::infinity(); - - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - size_t mask_idx = std::distance(mask_positions.begin(), it); - conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling + if (params.add_gumbel_noise && params.temperature > 0.0f) { + add_gumbel_noise(logits, n_vocab, params.temperature, rng); + } + + if (params.algorithm == ORIGIN) { + int32_t transfer_count = calculate_transfer_count( + step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); + float p_transfer = (float) transfer_count / mask_positions.size(); + + for (int32_t pos : mask_positions) { + if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { + const float * pos_logits = get_logits_for_pos(pos); + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].id = token_id; + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; } - conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f }); + llama_token_data_array cur_p = { + candidates.data(), + (size_t) n_vocab, + -1, + false, + }; + + llama_sampler_apply(sampler, &cur_p); + output_tokens[pos] = cur_p.data[cur_p.selected].id; + } + } + } else { + std::vector> confidences; + std::vector sampled_tokens(mask_positions.size()); + + for (size_t i = 0; i < mask_positions.size(); i++) { + int32_t pos = mask_positions[i]; + const float * pos_logits = get_logits_for_pos(pos); + + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; + candidates[token_id].id = token_id; } - llama_token_data_array conf_array = { - /* .data = */ conf_candidates.data(), - /* .size = */ conf_candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, + llama_token_data_array cur_p = { + candidates.data(), + candidates.size(), + -1, + 
false, }; - for (int32_t i = 0; i < num_transfer; i++) { - // Apply distribution sampler to get selected index - llama_sampler_apply(dist_sampler, &conf_array); - int selected_idx = conf_array.selected; - confidences[i].second = conf_candidates[selected_idx].id; + llama_sampler_apply(sampler, &cur_p); + llama_token sampled_token = cur_p.data[cur_p.selected].id; + + float conf = calculate_confidence(cur_p, params.algorithm, rng); - conf_candidates[selected_idx].p = 0.0f; - conf_array.selected = -1; - } + sampled_tokens[i] = sampled_token; + confidences.emplace_back(conf, i); } - if (params.alg_temp == 0.0f) { - // Deterministic - use confidence order - for (int32_t i = 0; i < num_transfer; i++) { - int32_t mask_idx = confidences[i].second; - int32_t pos = mask_positions[mask_idx]; - llama_token token = sampled_tokens[mask_idx]; - output_tokens[pos] = token; - } - } else { - for (int32_t i = 0; i < num_transfer; i++) { - int32_t pos = confidences[i].second; - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - int32_t mask_idx = std::distance(mask_positions.begin(), it); + int32_t transfer_count = calculate_transfer_count( + step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); + + if (transfer_count > 0) { + if (params.alg_temp == 0.0f) { + std::partial_sort(confidences.begin(), + confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()), + confidences.end(), + [](const std::pair & a, const std::pair & b) { + if (a.first != b.first) { + return a.first > b.first; + } + return a.second < b.second; + }); + + for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { + int32_t mask_idx = confidences[i].second; + int32_t pos = mask_positions[mask_idx]; output_tokens[pos] = sampled_tokens[mask_idx]; } + } else { + conf_candidates.clear(); + for (size_t i = 0; i < confidences.size(); i++) { + float conf_logit = confidences[i].first / params.alg_temp; + conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f }); + } + + llama_token_data_array conf_array = { + conf_candidates.data(), + conf_candidates.size(), + -1, + false, + }; + + for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { + llama_sampler_apply(dist_sampler, &conf_array); + int32_t selected_idx = conf_array.selected; + int32_t mask_idx = selected_idx; + int32_t pos = mask_positions[mask_idx]; + output_tokens[pos] = sampled_tokens[mask_idx]; + + conf_candidates[selected_idx].p = 0.0f; + conf_array.selected = -1; + } } } } + + int64_t time_end_sampling = ggml_time_us(); + total_sampling_time += time_end_sampling - time_start_sampling; } - int64_t time_end_sampling = ggml_time_us(); - total_sampling_time += time_end_sampling - time_start_sampling; } + int64_t time_end = ggml_time_us(); total_time += time_end - time_start; LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", - total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps); - + total_time / 1000.0, + total_time / 1000.0 / params.steps, + total_sampling_time / 1000.0 / params.steps); llama_batch_free(batch); llama_sampler_free(sampler); llama_sampler_free(dist_sampler); - n_generated = max_length; + n_generated = params.max_length; } - - - static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { if (!use_chat_template) { return prompt; @@ -331,66 
+529,6 @@ static std::string format_input_text(const std::string & prompt, bool use_chat_t return result.prompt; } -struct callback_data { - const common_params_diffusion * diff_params; - const llama_vocab * vocab; - int32_t n_input; -}; - -static bool diffusion_step_callback(int32_t step, - int32_t total_steps, - const llama_token * tokens, - int32_t n_tokens, - void * user_data) { - (void)user_data; - - callback_data * data = static_cast(user_data); - - auto print_progress_bar = [](int32_t step, int32_t total_steps) { - int progress_percent = (step * 100) / total_steps; - int progress_bars = (step * 50) / total_steps; - LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", - step, - total_steps, - std::string(progress_bars, '=').c_str(), - std::string(50 - progress_bars, ' ').c_str(), - progress_percent); - }; - - if (data->diff_params->visual_mode) { - // Visual mode: clear - LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left - - print_progress_bar(step, total_steps); - - LOG_INF("\n"); - - std::string current_text = " "; - - for (int32_t i = data->n_input; i < n_tokens; i++) { - std::string token_str; - if (tokens[i] != llama_vocab_mask(data->vocab)) { - char piece[256]; - int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); - if (n_chars > 0) { - piece[n_chars] = '\0'; - token_str = piece; - } - } else { - token_str = " "; - } - - current_text += token_str; - } - - LOG_INF("%s\n", current_text.c_str()); - } else { - print_progress_bar(step, total_steps); - } - - return true; -} - int main(int argc, char ** argv) { ggml_time_init(); @@ -400,11 +538,6 @@ int main(int argc, char ** argv) { return 1; } - const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" }; - const char * alg_name = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ? 
- alg_names[params.diffusion.algorithm] : - "UNKNOWN"; - common_init(); llama_backend_init(); @@ -421,6 +554,12 @@ int main(int argc, char ** argv) { return 1; } + if (!llama_model_is_diffusion(model)) { + LOG_ERR("error: unsupported model for diffusion"); + llama_model_free(model); + return 1; + } + llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = params.n_ctx; ctx_params.n_batch = params.n_batch; @@ -442,10 +581,12 @@ int main(int argc, char ** argv) { const llama_vocab * vocab = llama_model_get_vocab(model); std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); - std::vector input_tokens = common_tokenize(vocab, formatted_prompt, + std::vector input_tokens = common_tokenize(vocab, + formatted_prompt, /*add special tokens*/ true, /*parse special*/ true); - int n_input = input_tokens.size(); + + int n_input = input_tokens.size(); if (n_input >= params.n_ctx) { LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx); @@ -454,44 +595,79 @@ int main(int argc, char ** argv) { return 1; } - struct diffusion_params ldiff_params = diffusion_default_params(); - ldiff_params.steps = params.diffusion.steps; - ldiff_params.eps = params.diffusion.eps; - ldiff_params.temperature = params.sampling.temp; - ldiff_params.top_p = params.sampling.top_p; - ldiff_params.top_k = params.sampling.top_k; - ldiff_params.algorithm = static_cast(params.diffusion.algorithm); - ldiff_params.alg_temp = params.diffusion.alg_temp; - ldiff_params.seed = params.sampling.seed; - llama_token mask_token_id = llama_vocab_mask(vocab); GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); - LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); - LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps); - LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps); - LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm, - alg_name); - LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp); + bool visual_mode = params.diffusion.visual_mode; - ldiff_params.mask_token_id = mask_token_id; + int32_t n_generated = 0; + std::vector output_tokens(params.n_ubatch); - callback_data cb_data = { ¶ms.diffusion, vocab, n_input }; + struct diffusion_params diff_params; - ldiff_params.step_callback = diffusion_step_callback; - ldiff_params.step_callback_user_data = &cb_data; + char shift_logits_str[8]; + if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) { + diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0); + } else { + diff_params.shift_logits = true; + } - int32_t n_generated = 0; + //Use either eps or block length, but not both + GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0)); - std::vector output_tokens(params.n_ubatch); - diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch, - ldiff_params, n_generated); + if (params.diffusion.eps) { + diff_params.schedule = TIMESTEP_BASED; + diff_params.eps = params.diffusion.eps; + } else if (params.diffusion.block_length) { + diff_params.schedule = BLOCK_BASED; + diff_params.block_length = params.diffusion.block_length; + } + + diff_params.mask_token_id = mask_token_id; + diff_params.seed = params.sampling.seed; + diff_params.temperature = params.sampling.temp; + diff_params.steps = 
params.diffusion.steps; + diff_params.algorithm = static_cast(params.diffusion.algorithm); + diff_params.max_length = params.n_ubatch; + diff_params.top_p = params.sampling.top_p; + diff_params.top_k = params.sampling.top_k; + diff_params.visual_mode = params.diffusion.visual_mode; + diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise; + + diff_params.step_callback = diffusion_step_callback; + callback_data cb_data = { &diff_params, vocab, n_input }; + diff_params.step_callback_user_data = &cb_data; + + const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" }; + const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" }; + const char * alg_name = + (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN"; + const char * sched_name = + (diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN"; + + LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps); + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length); + LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name); + LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature); + if (diff_params.schedule == TIMESTEP_BASED) { + LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp); + } + if (diff_params.schedule == BLOCK_BASED) { + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale); + } + + diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated); if (n_generated > 0) { - if (params.diffusion.visual_mode) { + if (visual_mode) { //clear screen and move cursor to top-left LOG_INF("\033[2J\033[H"); } + output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input); std::string output_data = common_detokenize(vocab, output_tokens, false); LOG_INF("\n%s\n", output_data.c_str()); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 99196c9d047..722cd7f40f0 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -65,7 +65,7 @@ int main(int argc, char ** argv) { ctx_dft = llama_init_dft.context.get(); if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { - return 1; + LOG_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str()); } // Tokenize the prompt @@ -130,7 +130,10 @@ int main(int argc, char ** argv) { params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft; params_spec.p_min = p_min; - struct common_speculative * spec = common_speculative_init(ctx_dft); + struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft); + for (auto &pair : params.speculative.replacements) { + common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str()); + } llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1); diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 316055193d8..07d6b8b67d4 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -1913,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, bcast_weight_nb[4], bcast_weight_nb[5]}; aclTensor* acl_weight_tensor; - bool weightToNZ = false; -#ifdef ASCEND_310P - weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr); -#endif - if (weightToNZ && is_matmul_weight(weight)) { + // Only check env once. + static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("")); + if (weight_to_nz && is_matmul_weight(weight)) { int64_t acl_stride[2] = {1, transpose_ne[1]}; // Reverse ne. diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 4dac2e8b789..49f55891d85 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( return GGML_STATUS_SUCCESS; } -static int CreateAclTensorWeight(const void *hostData, const std::vector &shape, void **deviceAddr, - aclDataType dataType, aclTensor **tensor) -{ - uint64_t size = 1; - for (auto i : shape) { - size *= i; +// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed +namespace { + void* g_nz_workspace = nullptr; + size_t g_nz_workspace_allocated = 0; + + void release_nz_workspace() { + if (g_nz_workspace) { + aclrtFree(g_nz_workspace); + g_nz_workspace = nullptr; + g_nz_workspace_allocated = 0; + } } - const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size()); - ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size)); - - size *= sizeof(int16_t); - - ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE); - - std::vector strides(shape.size(), 1); - for (int64_t i = shape.size() - 2; i >= 0; i--) { - strides[i] = shape[i + 1] * strides[i + 1]; + void relloc_nz_workspace(size_t new_size) { + if (new_size > g_nz_workspace_allocated) { + if (g_nz_workspace) { + aclrtFree(g_nz_workspace); + g_nz_workspace = nullptr; + } + ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); + g_nz_workspace_allocated = new_size; + } } - - *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND, - shape.data(), shape.size(), *deviceAddr); - return 0; } +/** + * @brief Convert tensor weights to NZ format using Ascend CANN API. + * + * This function creates a transposed tensor descriptor and performs the + * TransMatmulWeight operation. Converting tensor formats can significantly + * improve performance on certain hardware. + * + * @param tensor Pointer to the input ggml_tensor containing the weights. 
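+ *               Only 2-D weights are expected here; the caller asserts ne[2] == 1 and ne[3] == 1
+ *               before invoking the conversion.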
+ * @param data Pointer to the raw data buffer for the tensor weights. + * @param offset Byte offset within the tensor data buffer where weights start. + * + * @note The workspace buffer used in this function is managed globally and reused + * across calls. This reduces overhead from repeated memory allocation and deallocation. + */ static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) { - aclrtStream stream; - ACL_CHECK(aclrtCreateStream(&stream)); - - std::vector weightTransposedShape = {tensor->ne[1], tensor->ne[0]}; - void *weightTransposedDeviceAddr = nullptr; - aclTensor *weightTransposed = nullptr; - CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr, - ggml_cann_type_mapping(tensor->type), &weightTransposed); - + aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, + tensor->nb, 2, ACL_FORMAT_ND, offset); uint64_t workspaceSize = 0; aclOpExecutor *executor; - void *workspaceAddr = nullptr; // TransMatmulWeight - ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); - std::unique_ptr workspaceAddrPtrTrans(nullptr, aclrtFree); - if (workspaceSize > 0) { - ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - workspaceAddrPtrTrans.reset(workspaceAddr); - } - ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, + &workspaceSize, &executor)); + // Avoid frequent malloc/free of the workspace. + relloc_nz_workspace(workspaceSize); - size_t size = ggml_nelements(tensor) * ggml_element_size(tensor); - - aclrtMemcpy((char *)tensor->data + offset, size, - weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE); + ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr)); ACL_CHECK(aclDestroyTensor(weightTransposed)); - aclrtFree(weightTransposedDeviceAddr); } // TODO: need handle tensor which has paddings. @@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor( // For acl, synchronous functions use this default stream. // Why aclrtSynchronizeDevice? - bool weightToNZ = false; -#ifdef ASCEND_310P - weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr); -#endif + // Only check env once. + static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("")); if (!need_transform(tensor->type)) { ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE)); - if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) { + if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) { + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); weight_format_to_nz(tensor, data, offset); } } else { @@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size( size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; + // Only check env once. + static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("")); + // last line must bigger than 32, because every single op deal at // least 32 bytes. // TODO: quantized type? // int64_t line_size = ne0 * ggml_element_size(tensor); // int64_t line_size_align_32 = (line_size + 31) & ~31; // size += (line_size_align_32 - line_size); - - // TODO: not support quantized yet. - // TODO: consider un-continue tensor. 
if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { size += ggml_row_size( tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } + } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) { + // NZ format weight are not support quantized yet. + // If ND tensor transform to NZ, size may changed. + int64_t shape[] = {tensor->ne[1], tensor->ne[0]}; + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); + const aclIntArray *acl_shape = aclCreateIntArray(shape, 2); + size_t new_size; + ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, + ggml_cann_type_mapping(tensor->type), &new_size)); + ACL_CHECK(aclDestroyIntArray(acl_shape)); + size = std::max(size, new_size); } return size; @@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute( (ggml_backend_cann_context*)backend->context; ggml_cann_set_device(cann_ctx->device); + //release temp buffer create by set tensor. + release_nz_workspace(); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor* node = cgraph->nodes[i]; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c97b61d09c7..ef47ea7359e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -279,6 +279,9 @@ class Attention: class Projector: STACK_FACTOR = "clip.audio.projector.stack_factor" + class Diffusion: + SHIFT_LOGITS = "diffusion.shift_logits" + # # recommended mapping of model tensor names for storage in gguf # @@ -377,6 +380,7 @@ class MODEL_ARCH(IntEnum): LFM2 = auto() DREAM = auto() SMALLTHINKER = auto() + LLADA = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -697,6 +701,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.LFM2: "lfm2", MODEL_ARCH.DREAM: "dream", MODEL_ARCH.SMALLTHINKER: "smallthinker", + MODEL_ARCH.LLADA: "llada", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1318,6 +1323,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.LLADA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.QWEN2VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4f23f9b0246..f4fd64ad822 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1047,6 +1047,11 @@ def add_audio_num_mel_bins(self, value: int) -> None: def add_audio_stack_factor(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value) + # diffusion models + + def add_diffusion_shift_logits(self, value: bool) -> None: + self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bfd4fd37a3f..15adbfa7818 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -32,6 +32,7 @@ class TensorNameMap: "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 "encoder", # neobert + "model.transformer.wte", # llada ), # Token type embeddings @@ -71,6 +72,7 @@ class TensorNameMap: "head", # rwkv "head.out", # wavtokenizer "lm_head", # llama4 + "model.transformer.ff_out", # llada 
), # Output norm @@ -94,6 +96,7 @@ class TensorNameMap: "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 + "model.transformer.ln_f", # llada ), # Rope frequencies @@ -139,6 +142,7 @@ class TensorNameMap: "model.layers.{bid}.input_layernorm", # llama4 "transformer_encoder.{bid}.attention_norm", # neobert "model.layers.{bid}.operator_norm", # lfm2 + "model.transformer.blocks.{bid}.attn_norm", # llada ), # Attention norm 2 @@ -183,6 +187,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone "model.layers.{bid}.self_attn.q_proj", # llama4 + "model.transformer.blocks.{bid}.q_proj", # llada ), # Attention key @@ -199,6 +204,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone "model.layers.{bid}.self_attn.k_proj", # llama4 + "model.transformer.blocks.{bid}.k_proj", # llada ), # Attention value @@ -214,6 +220,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone "model.layers.{bid}.self_attn.v_proj", # llama4 + "model.transformer.blocks.{bid}.v_proj", # llada ), # Attention output @@ -246,6 +253,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 "transformer_encoder.{bid}.wo", # neobert + "model.transformer.blocks.{bid}.attn_out", # llada ), # Attention output norm @@ -291,6 +299,7 @@ class TensorNameMap: "model.layers.{bid}.post_attention_layernorm", # llama4 "transformer_encoder.{bid}.ffn_norm", # neobert "model.layers.layers.{bid}.pre_mlp_norm", # plamo2 + "model.transformer.blocks.{bid}.ff_norm", # llada ), # Post feed-forward norm @@ -364,6 +373,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w12", # neobert "model.layers.{bid}.block_sparse_moe.up", # smallthinker + "model.transformer.blocks.{bid}.up_proj", # llada ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -405,6 +415,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc_0", # exaone "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid "model.layers.{bid}.block_sparse_moe.gate", # smallthinker + "model.transformer.blocks.{bid}.ff_proj", # llada ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -454,6 +465,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w3", # neobert "model.layers.{bid}.block_sparse_moe.down", # smallthinker + "model.transformer.blocks.{bid}.ff_out", # llada ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/include/llama.h b/include/llama.h index 6f454a508a0..1a51e74a8d6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -537,6 +537,9 @@ extern "C" { // Returns true if the model is recurrent (like Mamba, RWKV, etc.) LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model); + // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) 
+ LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); + // Returns 0 on success LLAMA_API uint32_t llama_model_quantize( const char * fname_inp, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index dbf977443ae..15fb9d0b508 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -89,6 +89,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LFM2, "lfm2" }, { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" }, + { LLM_ARCH_LLADA, "llada" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1972,6 +1973,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_LLADA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2224,6 +2242,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { bool llm_arch_is_diffusion(const llm_arch & arch) { switch (arch) { case LLM_ARCH_DREAM: + case LLM_ARCH_LLADA: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index 8267a8d3aa4..8ea80806c9c 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -93,6 +93,7 @@ enum llm_arch { LLM_ARCH_LFM2, LLM_ARCH_DREAM, LLM_ARCH_SMALLTHINKER, + LLM_ARCH_LLADA, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ee861bd7ec1..491a26b6346 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -785,13 +785,20 @@ ggml_tensor * llm_graph_context::build_moe_ffn( bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, - int il) const { + int il, + ggml_tensor * probs_in) const { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN - ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); + ggml_tensor * logits = nullptr; + + if (probs_in == nullptr) { + logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + } else { + logits = probs_in; + } ggml_tensor * probs = nullptr; switch (gating_op) { @@ -884,6 +891,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cur = ggml_gelu(ctx0, cur); cb(cur, "ffn_moe_gelu", il); } break; + case LLM_FFN_RELU: + if (gate_exps) { + cur = ggml_reglu_split(ctx0, cur, up); + cb(cur, "ffn_moe_reglu", il); + } else { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_moe_relu", il); + } break; default: GGML_ABORT("fatal error"); } @@ -927,100 +942,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( return moe_out; } -ggml_tensor * llm_graph_context::build_moe_ffn_from_probs( - ggml_tensor * cur, - ggml_tensor * probs, - ggml_tensor * up_exps, - ggml_tensor * gate_exps, - ggml_tensor * down_exps, - ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llama_expert_gating_func_type gating_op, - int il) const { - const int64_t n_embd = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as 
it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx0, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); - if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) { - weights = ggml_soft_max(ctx0, weights); - } else { - weights = ggml_sigmoid(ctx0, weights); - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - } - - weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); - - cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * experts = nullptr; - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(cur, "ffn_moe_gate", il); - - cur = ggml_reglu_split(ctx0, cur, up); - cb(cur, "ffn_moe_reglu", il); - - experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx0, experts, weights); - cb(cur, "ffn_moe_weighted", il); - - ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr }; - - assert(n_expert_used > 0); - - // order the views before the adds - for (uint32_t i = 0; i < hparams.n_expert_used; ++i) { - cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]); - - ggml_build_forward_expand(gf, cur_experts[i]); - } - - // aggregate experts - // note: here we explicitly use hparams.n_expert_used instead of n_expert_used - // to avoid potentially a large number of add nodes during warmup - // ref: https://github.com/ggml-org/llama.cpp/pull/14753 - ggml_tensor * moe_out = cur_experts[0]; - - for (uint32_t i = 1; i < hparams.n_expert_used; ++i) { - moe_out = ggml_add(ctx0, moe_out, cur_experts[i]); - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx0, moe_out); - } - - cb(moe_out, "ffn_moe_out", il); - - return moe_out; -} - // input embeddings with optional lora ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { const int64_t n_embd = hparams.n_embd; diff --git a/src/llama-graph.h b/src/llama-graph.h index 55a6b6f3e05..94d778f3847 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -631,19 +631,8 @@ struct llm_graph_context { bool scale_w, float w_scale, llama_expert_gating_func_type gating_op, - int il) const; - - ggml_tensor * build_moe_ffn_from_probs( - ggml_tensor * cur, - ggml_tensor * probs, - ggml_tensor * up_exps, - ggml_tensor * gate_exps, - ggml_tensor * down_exps, - ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llama_expert_gating_func_type gating_op, - int il) const; + int il, + ggml_tensor * probs_in = nullptr) const; // 
// inputs diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e3aa9e6f91a..56c2ecd4cab 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -869,6 +869,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.causal_attn = false; } break; + case LLM_ARCH_LLADA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion + switch (hparams.n_layer) { + case 32: + type = LLM_TYPE_8B; + break; + default: + type = LLM_TYPE_UNKNOWN; + } + // Set non-causal attention for diffusion models + hparams.causal_attn = false; + } + break; case LLM_ARCH_QWEN2MOE: { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); @@ -2149,6 +2164,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; + case LLM_ARCH_LLADA: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = + create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + + // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock + layer.wq = + create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false + layer.wo = + create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 }, + TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + + // optional MLP bias + layer.ffn_gate_b = + create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = + create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED); + } + } + break; case LLM_ARCH_LLAMA4: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8042,6 +8104,106 @@ struct llm_build_dream : public llm_graph_context { } }; +struct llm_build_llada : public llm_graph_context { + llm_build_llada(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + // LLaDA is similar to LLaMA but uses non-causal attention for diffusion + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // Non-causal attention for diffusion + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, + 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + 
inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_qwen2vl : public llm_graph_context { llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17158,10 +17320,18 @@ struct llm_build_smallthinker : public llm_graph_context{ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - ggml_tensor * ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, - nullptr, n_expert, n_expert_used, - static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il); + ggml_tensor * ffn_out = + build_moe_ffn(cur, + nullptr, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_RELU, true, + false, 0.0, + static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), + il, probs); cb(ffn_out, "ffn_out", il); cur = ffn_out;
@@ -17201,6 +17371,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NEO_BERT: case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_DREAM: + case LLM_ARCH_LLADA: { res = nullptr; } break;
@@ -17367,6 +17538,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique<llm_build_dream>(*this, params); } break; + case LLM_ARCH_LLADA: + { + llm = std::make_unique<llm_build_llada>(*this, params); + } + break; case LLM_ARCH_QWEN2VL: { llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17765,6 +17941,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: + case LLM_ARCH_LLADA: case LLM_ARCH_LLAMA4: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN:
@@ -17943,6 +18120,10 @@ bool llama_model_is_recurrent(const llama_model * model) { return llm_arch_is_recurrent(model->arch); } +bool llama_model_is_diffusion(const llama_model * model) { + return llm_arch_is_diffusion(model->arch); +} + const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) { return model->tensors_by_name; }
diff --git a/tools/server/README.md b/tools/server/README.md index f3f4caed85c..87cef75730a 100644 --- a/tools/server/README.md +++ b/tools/server/README.md
@@ -469,7 +469,7 @@ These words will not be included in the completion, so make sure to add them to `ignore_eos`: Ignore end of stream token and continue generating. Default: `false` -`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]` +`logit_bias`: Modify the likelihood of a token appearing in the generated text completion.
For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. For compatibility with the OpenAI API, a JSON object `{"<token id or string>": bias, ...}` can also be passed. Default: `[]` `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 2e4c40af783..35d6610428e 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp
@@ -473,6 +473,33 @@ struct server_task { } } } + } else if (logit_bias != data.end() && logit_bias->is_object()) { + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & el : logit_bias->items()) { + float bias; + const auto & key = el.key(); + const auto & value = el.value(); + if (value.is_number()) { + bias = value.get<float>(); + } else if (value.is_boolean() && !value.get<bool>()) { + bias = -INFINITY; + } else { + continue; + } + + char *end; + llama_token tok = strtol(key.c_str(), &end, 10); + if (*end == 0) { + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else { + auto toks = common_tokenize(vocab, key, false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } } params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
@@ -1902,6 +1929,7 @@ struct server_context { mtmd_context * mctx = nullptr; const llama_vocab * vocab = nullptr; + bool vocab_dft_compatible = true; llama_model * model_dft = nullptr;
@@ -1992,10 +2020,9 @@ struct server_context { return false; } - if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) { - SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); - - return false; + vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); + if (!vocab_dft_compatible) { + SRV_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); } const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); @@ -2085,11 +2112,14 @@ struct server_context { return; } - slot.spec = common_speculative_init(slot.ctx_dft); + slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft); if (slot.spec == nullptr) { SRV_ERR("%s", "failed to create speculator\n"); return; } + for (auto &pair : params_base.speculative.replacements) { + common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); + } } SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 7ee9a165140..6c6f64f5e2e 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -351,3 +351,32 @@ def test_logprobs_stream(): assert token.top_logprobs is not None assert len(token.top_logprobs) > 0 assert aggregated_text == output_text + + +def test_logit_bias(): + global server + server.start() + + exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"] + + res = server.make_request("POST", "/tokenize", data={ + "content": " " + " ".join(exclude) + " ", + }) + assert res.status_code == 200 + tokens = res.body["tokens"] + logit_bias = {tok: -100 for tok in tokens} + + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + res = client.chat.completions.create( + model="gpt-3.5-turbo-instruct", + temperature=0.0, + messages=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_tokens=64, + logit_bias=logit_bias + ) + output_text = res.choices[0].message.content + assert output_text + assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude) diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index f6909e9ae78..be3a0052c64 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -444,6 +444,39 @@ def test_n_probs_post_sampling(): assert any(prob["prob"] == 1.0 for prob in tok["top_probs"]) +@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)]) +def test_logit_bias(tokenize, openai_style): + global server + server.start() + + exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"] + + logit_bias = [] + if tokenize: + res = server.make_request("POST", "/tokenize", data={ + "content": " " + " ".join(exclude) + " ", + }) + assert res.status_code == 200 + tokens = res.body["tokens"] + logit_bias = [[tok, -100] for tok in tokens] + + else: + logit_bias = [[" " + tok + " ", -100] for tok in exclude] + + if openai_style: + logit_bias = {el[0]: -100 for el in logit_bias} + + res = server.make_request("POST", "/completion", data={ + "n_predict": 64, + "prompt": "What is the best book", + "logit_bias": logit_bias, + "temperature": 0.0 + }) + assert res.status_code == 200 + output_text = res.body["content"] + assert 
all(output_text.find(" " + tok + " ") == -1 for tok in exclude) + + def test_cancel_request(): global server server.n_ctx = 4096