lnigam · lnigam · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -57,6 +57,7 @@ static std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CLI,
+    LLAMA_EXAMPLE_DIFFUSION,
 };
 
 static std::string read_file(const std::string & fname) {
@@ -2228,7 +2229,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.image.emplace_back(item);
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--image-min-tokens"}, "N",
         "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -3864,6 +3865,116 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
         [](common_params & params) { params.diffusion.visual_mode = true; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-gpu-sampling"},
+        "disable CUDA block-diffusion sampling fast path",
+        [](common_params & params) { params.diffusion.gpu_sampling = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-device-selfcond"},
+        "disable device-resident block-diffusion self-conditioning",
+        [](common_params & params) { params.diffusion.device_self_cond = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-device-denoise-loop"},
+        "disable device-side block-diffusion canvas and stop-state updates",
+        [](common_params & params) { params.diffusion.device_denoise_loop = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-pin-host-outputs"},
+        "register compact diffusion output buffers as pinned host memory",
+        [](common_params & params) { params.diffusion.pin_host_outputs = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-self-cond-top-k"}, "N",
+        string_format("block-diffusion sparse self-conditioning width (default: %d)", params.diffusion.self_cond_top_k),
+        [](common_params & params, int value) { params.diffusion.self_cond_top_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-input-gpu-groups"}, "N",
+        string_format("bitmask of block-diffusion decoder input groups assigned to GPU backend (default: %u)", params.diffusion.input_gpu_groups),
+        [](common_params & params, int value) { params.diffusion.input_gpu_groups = static_cast<uint32_t>(std::max(value, 0)); } // negative input is clamped to 0 (no group bits set)
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-default-top-k"}, "N",
+        string_format("block-diffusion top-k used when --top-k is not explicitly provided (default: %d)", params.diffusion.default_top_k), // print the default like the sibling numeric diffusion options
+        [](common_params & params, int value) { params.diffusion.default_top_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-force-top-k"}, "N",
+        string_format("block-diffusion server: override per-request top_k when N > 0 (default: %d)", params.diffusion.force_top_k), // print the default like the sibling numeric diffusion options
+        [](common_params & params, int value) { params.diffusion.force_top_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-fused-self-cond-embd"},
+        "use fused device self-conditioning embedding input for block diffusion",
+        [](common_params & params) { params.diffusion.fused_self_cond_embd = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-fuse-final-softcap"},
+        "move final logit softcap into the CUDA diffusion sampling kernel",
+        [](common_params & params) { params.diffusion.fuse_final_logit_softcap = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-separate-encoder-decoder"},
+        "build separate block-diffusion encoder and decoder graph variants",
+        [](common_params & params) { params.diffusion.separate_encoder_decoder = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-direct-self-cond"},
+        "write CUDA diffusion self-conditioning directly into decoder graph inputs",
+        [](common_params & params) { params.diffusion.cuda_direct_self_cond = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-final-tokens-on-stop"},
+        "copy final diffusion tokens only when the device stop condition is reached",
+        [](common_params & params) { params.diffusion.cuda_final_tokens_on_stop = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-fused-top-k-sample"},
+        "fuse CUDA diffusion top-k selection and sampling",
+        [](common_params & params) { params.diffusion.cuda_fused_top_k_sample = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-tight-top-k"},
+        "avoid extra CUDA diffusion top-k scratch width when possible",
+        [](common_params & params) { params.diffusion.cuda_tight_top_k = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-parallel-full-softmax"},
+        "parallelize CUDA diffusion full-vocab sampling when top-k is 0",
+        [](common_params & params) { params.diffusion.cuda_parallel_full_softmax = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-fused-full-softmax"},
+        "fuse CUDA diffusion full-vocab softmax sampling and self-conditioning",
+        [](common_params & params) { params.diffusion.cuda_fused_full_softmax = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-top-k-local-k"}, "N",
+        string_format("CUDA diffusion local top-k candidates per thread (default: %d, 0 = backend default)", params.diffusion.cuda_top_k_local_k), // print the default like the sibling numeric diffusion options
+        [](common_params & params, int value) { params.diffusion.cuda_top_k_local_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-cuda-fast-top-k"},
+        "disable CUDA diffusion CUB/fast top-k selection path",
+        [](common_params & params) { params.diffusion.cuda_fast_top_k = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--top-k-start"}, "N",
+        string_format("block-diffusion: anneal top-k from N at the first (high-entropy) denoising step (with --top-k-end; both must be > 0) (default: %d)", params.diffusion.top_k_start), // annealing is only enabled when both start and end are > 0
+        [](common_params & params, int value) { params.diffusion.top_k_start = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--top-k-end"}, "N",
+        string_format("block-diffusion: anneal top-k to N at the last denoising step (with --top-k-start; both must be > 0) (default: %d)", params.diffusion.top_k_end), // annealing is only enabled when both start and end are > 0
+        [](common_params & params, int value) { params.diffusion.top_k_end = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--top-k-tail-correction"},
+        "block-diffusion: use the exact full-vocab entropy (logsumexp) for the accept/stop signal under top-k",
+        [](common_params & params) { params.diffusion.top_k_tail_correction = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         {"--diffusion-eps"}, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),

diff --git a/common/common.cpp b/common/common.cpp
@@ -1585,6 +1585,11 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload        = !params.no_op_offload;
     cparams.swa_full          = params.swa_full;
     cparams.kv_unified        = params.kv_unified;
+    cparams.diffusion_self_cond_top_k = params.diffusion.self_cond_top_k;
+    cparams.diffusion_input_gpu_groups = params.diffusion.input_gpu_groups;
+    cparams.diffusion_fused_self_cond_embd = params.diffusion.fused_self_cond_embd;
+    cparams.diffusion_fuse_final_logit_softcap = params.diffusion.fuse_final_logit_softcap;
+    cparams.diffusion_separate_encoder_decoder = params.diffusion.separate_encoder_decoder;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

diff --git a/common/common.h b/common/common.h
@@ -381,6 +381,10 @@ struct common_params_vocoder {
 struct common_params_diffusion {
     int32_t steps         = 128;
     bool    visual_mode   = false;
+    bool    gpu_sampling  = true;     // use CUDA diffusion sampling fast path when available
+    bool    device_self_cond = true;  // keep diffusion self-conditioning state on device
+    bool    device_denoise_loop = true; // update diffusion canvas/stop state on device
+    bool    pin_host_outputs = false; // register compact D2H output buffers as pinned host memory
 
     float   eps           = 0;        // epsilon for timesteps
     int32_t block_length  = 0;        // block length for generation
@@ -390,6 +394,30 @@ struct common_params_diffusion {
 
     float   cfg_scale     = 0;        // classifier-free guidance scale
     bool    add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
+
+    // block-diffusion (diffusion-gemma) top-k host sampling knobs
+    int32_t top_k_start        = 0;     // anneal top-k from this (first/high-entropy step) ...
+    int32_t top_k_end          = 0;     // ... to this (last step); both > 0 enables annealing
+    bool    top_k_tail_correction = false; // use exact full-vocab entropy for accept/stop
+    int32_t default_top_k      = 0;     // top-k used when --top-k is not explicitly provided
+    int32_t force_top_k        = 0;     // server: override per-request top_k when > 0
+    int32_t self_cond_top_k    = 256;   // sparse self-conditioning gather width
+    uint32_t input_gpu_groups  = 63;    // bitmask of decoder input tensor groups assigned to GPU backend
+
+    // CUDA diffusion sampling fast-path knobs. Defaults preserve behavior when no tuning flags are passed.
+    bool    cuda_fast_top_k                = true;  // CUB/fast top-k selection path (--no-diffusion-cuda-fast-top-k disables it)
+    bool    cuda_direct_self_cond          = false; // write self-conditioning directly into decoder graph inputs
+    bool    cuda_final_tokens_on_stop      = false; // copy final tokens only when the device stop condition is reached
+    bool    cuda_fused_top_k_sample        = false; // fuse top-k selection and sampling
+    bool    cuda_tight_top_k               = false; // avoid extra top-k scratch width when possible
+    bool    cuda_parallel_full_softmax     = false; // parallelize full-vocab sampling when top-k is 0
+    bool    cuda_fused_full_softmax        = false; // fuse full-vocab softmax sampling and self-conditioning
+    int32_t cuda_top_k_local_k             = 0;     // local top-k candidates per thread; 0 = backend default
+
+    // Diffusion graph-shape knobs (forwarded to llama_context_params in common_context_params_to_llama).
+    bool    fused_self_cond_embd       = false; // use fused device self-conditioning embedding input
+    bool    fuse_final_logit_softcap   = false; // move final logit softcap into the CUDA sampling kernel
+    bool    separate_encoder_decoder   = false; // build separate encoder and decoder graph variants
 };
 
 // reasoning API response format (not to be confused as chat template's reasoning format)

diff --git a/conversion/__init__.py b/conversion/__init__.py
@@ -78,6 +78,7 @@
     "Gemma4AssistantForCausalLM": "gemma",
     "Gemma4ForConditionalGeneration": "gemma",
     "Gemma4ForCausalLM": "gemma",
+    "DiffusionGemmaForBlockDiffusion": "gemma",
     "Gemma4UnifiedForConditionalGeneration": "gemma",
     "Gemma4UnifiedAssistantForCausalLM": "gemma",
     "GemmaForCausalLM": "gemma",
@@ -245,6 +246,7 @@
     "CogVLMForCausalLM": "cogvlm",
     "DeepseekOCR2ForCausalLM": "deepseek",
     "DeepseekOCRForCausalLM": "deepseek",
+    "DiffusionGemmaForBlockDiffusion": "gemma",
     "DotsOCRForCausalLM": "dotsocr",
     "Exaone4_5_ForConditionalGeneration": "exaone",
     "Gemma3ForConditionalGeneration": "gemma",

diff --git a/conversion/gemma.py b/conversion/gemma.py
@@ -655,7 +655,7 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
-        num_kv_shared_layers = self.hparams["num_kv_shared_layers"]
+        num_kv_shared_layers = self.hparams.get("num_kv_shared_layers", 0)
         self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers)
 
         # per-layer embedding is optional
@@ -764,7 +764,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         yield from super().modify_tensors(data_torch, name, bid)
 
-
 @ModelBase.register("Gemma4UnifiedForConditionalGeneration")
 class Gemma4UnifiedModel(Gemma4Model):
     model_arch = gguf.MODEL_ARCH.GEMMA4
@@ -805,6 +804,32 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_nextn_predict_layers(self.block_count)
 
 
+@ModelBase.register("DiffusionGemmaForBlockDiffusion")
+class DiffusionGemmaModel(Gemma4Model):
+    # Block-diffusion variant of Gemma 4. Reuses the gemma4 decoder block; adds the
+    # self-conditioning MLP and nests the language model under `model.decoder.`.
+    model_arch = gguf.MODEL_ARCH.DIFFUSION_GEMMA
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, _ = item
+        # The text encoder shares every weight with the decoder except its own
+        # per-layer `layer_scalar`. The single-stack graph uses the decoder scalars,
+        # so the encoder-only tensors are dropped here.
+        if name.startswith("model.encoder."):
+            return None
+        return super().filter_tensors(item)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # diffusion_gemma nests the language model under `model.decoder.`; strip it so
+        # the shared gemma4 tensor mappings apply. `model.decoder.self_conditioning.*`
+        # then maps to the SELF_COND_* tensors.
+        if name.startswith("model.decoder."):
+            name = "model." + name[len("model.decoder."):]
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Gemma4ForConditionalGeneration")
 class Gemma4VisionAudioModel(MmprojModel):
     has_audio_encoder = True
@@ -882,7 +907,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
             yield (mapped_name, data_torch)
 
-
 @ModelBase.register("Gemma4UnifiedForConditionalGeneration")
 class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
     has_audio_encoder = True
@@ -945,3 +969,36 @@ def modify_tensors(self, data_torch, name, bid):
             perm = row * p * 3 + col * 3 + ch
             data_torch = data_torch[perm]
         return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("DiffusionGemmaForBlockDiffusion")
+class DiffusionGemmaVisionModel(Gemma4VisionAudioModel):
+    # mmproj (vision) export for the v7 diffusion_gemma multimodal model. Reuses the gemma4
+    # vision tower (GEMMA4V); the v7 checkpoint nests it under `model.encoder.*` and has no
+    # audio encoder, so only the vision tower + vision projector are exported.
+    has_audio_encoder = False
+    has_vision_encoder = True
+
+    def set_gguf_parameters(self):
+        # MmprojModel base writes the generic vision params; do NOT call the gemma4
+        # vision+audio set_gguf_parameters (it asserts an audio config, which v7 lacks).
+        MmprojModel.set_gguf_parameters(self)
+        assert self.hparams_vision is not None
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA4V)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, _ = item
+        # keep only the vision tower + vision projector; drop the diffusion decoder
+        # (the text-encoder language_model.* tensors are dropped by MmprojModel.filter_tensors)
+        if name.startswith("model.decoder."):
+            return None
+        return super().filter_tensors(item)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # v7 nests the vision tower / projector under `model.encoder.`; strip it so the gemma4
+        # vision tensor mappings (model.vision_tower.* / model.embed_vision.*) apply.
+        if name.startswith("model.encoder."):
+            name = "model." + name[len("model.encoder."):]
+        yield from super().modify_tensors(data_torch, name, bid)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -34,6 +34,7 @@ else()
     add_subdirectory(gen-docs)
     add_subdirectory(training)
     add_subdirectory(diffusion)
+    add_subdirectory(diffusion-gemma)
     if (NOT GGML_BACKEND_DL)
         add_subdirectory(convert-llama2c-to-ggml)
         # these examples use the backends directly and cannot be built with dynamic loading

diff --git a/examples/diffusion-gemma/CMakeLists.txt b/examples/diffusion-gemma/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(TARGET llama-diffusion-gemma-cli)
+add_executable(${TARGET} diffusion-gemma-cli.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd ${CMAKE_THREAD_LIBS_INIT})
+# mtmd (tools/) is added after examples/, so add its include dir explicitly for the headers
+target_include_directories(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+# OpenAI-compatible HTTP server for the block-diffusion models (llama-server analogue)
+set(TARGET llama-diffusion-gemma-server)
+add_executable(${TARGET} diffusion-gemma-server.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../vendor)   # cpp-httplib/httplib.h, nlohmann/json.hpp; relative to this dir so it also resolves when built as a subproject
+target_compile_features(${TARGET} PRIVATE cxx_std_17)