Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
5bddf43
convert: add diffusion_gemma4 (block-diffusion Gemma 4) conversion su…
lnigam Jun 1, 2026
e814e44
model: register diffusion-gemma4 arch (reuses gemma4 graph)
lnigam Jun 1, 2026
aac0349
model: diffusion-gemma4 bidirectional graph + self-conditioning input
lnigam Jun 1, 2026
08e6dfa
model: wire self-conditioning MLP into the diffusion-gemma4 graph
lnigam Jun 1, 2026
27a4fc5
examples: block-diffusion generation CLI for diffusion-gemma4
lnigam Jun 2, 2026
0804f74
diffusion-gemma4: self-conditioning input channel (feedback)
lnigam Jun 3, 2026
2c40f4a
diffusion-gemma4: prompt conditioning via prefix attention
lnigam Jun 3, 2026
d7dc3ea
diffusion-gemma4: apply the chat template to the prompt
lnigam Jun 3, 2026
724ba77
diffusion-gemma4: fix prompt conditioning (keep prompt tokens active)
lnigam Jun 3, 2026
212a698
diffusion-gemma4: greedy read-out of the final canvas
lnigam Jun 3, 2026
0daa231
diffusion-gemma4: extract the clean final answer
lnigam Jun 3, 2026
27c35eb
diffusion-gemma4: offload to GPU by default
lnigam Jun 3, 2026
03b9c2b
diffusion_gemma4: KV-cache reuse for block-diffusion generation
lnigam Jun 8, 2026
868ef51
diffusion-gemma (v7): rename arch + multimodal (gemma4 vision) support
lnigam Jun 8, 2026
9c53a9d
diffusion-gemma cli: -n controls block count, generation timing, inli…
lnigam Jun 8, 2026
5ffc5f4
diffusion-gemma: place precomputed transposed embedding on the offloa…
lnigam Jun 8, 2026
ea2e26a
diffusion-gemma cli: report encoder-phase prefill timing
lnigam Jun 8, 2026
459e35e
diffusion-gemma: top-k host sampling (top-k softmax/entropy/self-cond…
lnigam Jun 8, 2026
7e33c60
diffusion-gemma: sparse top-k self-conditioning via on-device F16 gather
lnigam Jun 8, 2026
2033bc8
diffusion-gemma: chat-completions-compatible HTTP server (llama-serve…
lnigam Jun 8, 2026
ed011ec
diffusion-gemma: add CUDA graph GPU sampling server path
lnigam Jun 9, 2026
0c69257
diffusion-gemma: keep self-cond sampling on device
lnigam Jun 9, 2026
a1b0874
diffusion-gemma: keep denoising loop on device
lnigam Jun 9, 2026
ba31e9a
diffusion-gemma: use output flag for persistent inputs
lnigam Jun 9, 2026
7a7c0a8
diffusion-gemma: checkpoint device-loop early stop
lnigam Jun 9, 2026
4823ee0
diffusion-gemma: make device early stop every step
lnigam Jun 9, 2026
2a3dcf5
examples: update diffusion gemma mtmd bitmap loading
lnigam Jun 9, 2026
1347c3d
diffusion: add gated kernel optimization experiments
lnigam Jun 9, 2026
a83d05d
diffusion: poll device stop state every denoise step
lnigam Jun 9, 2026
1f1e91c
diffusion: add gated topk and mmq kernel experiments
lnigam Jun 9, 2026
027b998
diffusion: add kernel optimization controls
lnigam Jun 10, 2026
89856ff
diffusion: expose tuning switches as options
lnigam Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 112 additions & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ static std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CLI,
LLAMA_EXAMPLE_DIFFUSION,
};

static std::string read_file(const std::string & fname) {
Expand Down Expand Up @@ -2228,7 +2229,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image.emplace_back(item);
}
}
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
Expand Down Expand Up @@ -3864,6 +3865,116 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
[](common_params & params) { params.diffusion.visual_mode = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-gpu-sampling"},
"disable CUDA block-diffusion sampling fast path",
[](common_params & params) { params.diffusion.gpu_sampling = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-device-selfcond"},
"disable device-resident block-diffusion self-conditioning",
[](common_params & params) { params.diffusion.device_self_cond = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-device-denoise-loop"},
"disable device-side block-diffusion canvas and stop-state updates",
[](common_params & params) { params.diffusion.device_denoise_loop = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-pin-host-outputs"},
"register compact diffusion output buffers as pinned host memory",
[](common_params & params) { params.diffusion.pin_host_outputs = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-self-cond-top-k"}, "N",
string_format("block-diffusion sparse self-conditioning width (default: %d)", params.diffusion.self_cond_top_k),
[](common_params & params, int value) { params.diffusion.self_cond_top_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-input-gpu-groups"}, "N",
string_format("bitmask of block-diffusion decoder input groups assigned to GPU backend (default: %u)", params.diffusion.input_gpu_groups),
[](common_params & params, int value) { params.diffusion.input_gpu_groups = (uint32_t) std::max(value, 0); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-default-top-k"}, "N",
"block-diffusion top-k used when --top-k is not explicitly provided",
[](common_params & params, int value) { params.diffusion.default_top_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-force-top-k"}, "N",
"block-diffusion server: override per-request top_k when N > 0",
[](common_params & params, int value) { params.diffusion.force_top_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-fused-self-cond-embd"},
"use fused device self-conditioning embedding input for block diffusion",
[](common_params & params) { params.diffusion.fused_self_cond_embd = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-fuse-final-softcap"},
"move final logit softcap into the CUDA diffusion sampling kernel",
[](common_params & params) { params.diffusion.fuse_final_logit_softcap = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-separate-encoder-decoder"},
"build separate block-diffusion encoder and decoder graph variants",
[](common_params & params) { params.diffusion.separate_encoder_decoder = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-direct-self-cond"},
"write CUDA diffusion self-conditioning directly into decoder graph inputs",
[](common_params & params) { params.diffusion.cuda_direct_self_cond = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-final-tokens-on-stop"},
"copy final diffusion tokens only when the device stop condition is reached",
[](common_params & params) { params.diffusion.cuda_final_tokens_on_stop = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-fused-top-k-sample"},
"fuse CUDA diffusion top-k selection and sampling",
[](common_params & params) { params.diffusion.cuda_fused_top_k_sample = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-tight-top-k"},
"avoid extra CUDA diffusion top-k scratch width when possible",
[](common_params & params) { params.diffusion.cuda_tight_top_k = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-parallel-full-softmax"},
"parallelize CUDA diffusion full-vocab sampling when top-k is 0",
[](common_params & params) { params.diffusion.cuda_parallel_full_softmax = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-fused-full-softmax"},
"fuse CUDA diffusion full-vocab softmax sampling and self-conditioning",
[](common_params & params) { params.diffusion.cuda_fused_full_softmax = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-top-k-local-k"}, "N",
"CUDA diffusion local top-k candidates per thread (0 = backend default)",
[](common_params & params, int value) { params.diffusion.cuda_top_k_local_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-cuda-fast-top-k"},
"disable CUDA diffusion CUB/fast top-k selection path",
[](common_params & params) { params.diffusion.cuda_fast_top_k = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--top-k-start"}, "N",
"block-diffusion: anneal top-k from N at the first (high-entropy) denoising step (with --top-k-end)",
[](common_params & params, int value) { params.diffusion.top_k_start = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--top-k-end"}, "N",
"block-diffusion: anneal top-k to N at the last denoising step (with --top-k-start)",
[](common_params & params, int value) { params.diffusion.top_k_end = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--top-k-tail-correction"},
"block-diffusion: use the exact full-vocab entropy (logsumexp) for the accept/stop signal under top-k",
[](common_params & params) { params.diffusion.top_k_tail_correction = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-eps"}, "F",
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
Expand Down
5 changes: 5 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1585,6 +1585,11 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
cparams.diffusion_self_cond_top_k = params.diffusion.self_cond_top_k;
cparams.diffusion_input_gpu_groups = params.diffusion.input_gpu_groups;
cparams.diffusion_fused_self_cond_embd = params.diffusion.fused_self_cond_embd;
cparams.diffusion_fuse_final_logit_softcap = params.diffusion.fuse_final_logit_softcap;
cparams.diffusion_separate_encoder_decoder = params.diffusion.separate_encoder_decoder;

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
Expand Down
28 changes: 28 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,10 @@ struct common_params_vocoder {
struct common_params_diffusion {
int32_t steps = 128;
bool visual_mode = false;
bool gpu_sampling = true; // use CUDA diffusion sampling fast path when available
bool device_self_cond = true; // keep diffusion self-conditioning state on device
bool device_denoise_loop = true; // update diffusion canvas/stop state on device
bool pin_host_outputs = false; // register compact D2H output buffers as pinned host memory

float eps = 0; // epsilon for timesteps
int32_t block_length = 0; // block length for generation
Expand All @@ -390,6 +394,30 @@ struct common_params_diffusion {

float cfg_scale = 0; // classifier-free guidance scale
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0

// block-diffusion (diffusion-gemma) top-k host sampling knobs
int32_t top_k_start = 0; // anneal top-k from this (first/high-entropy step) ...
int32_t top_k_end = 0; // ... to this (last step); both > 0 enables annealing
bool top_k_tail_correction = false; // use exact full-vocab entropy for accept/stop
int32_t default_top_k = 0; // top-k used when --top-k is not explicitly provided
int32_t force_top_k = 0; // server: override per-request top_k when > 0
int32_t self_cond_top_k = 256; // sparse self-conditioning gather width
uint32_t input_gpu_groups = 63; // decoder input tensor groups assigned to GPU backend

// CUDA diffusion sampling fast-path knobs. Defaults preserve behavior when no tuning flags are passed.
bool cuda_fast_top_k = true;
bool cuda_direct_self_cond = false;
bool cuda_final_tokens_on_stop = false;
bool cuda_fused_top_k_sample = false;
bool cuda_tight_top_k = false;
bool cuda_parallel_full_softmax = false;
bool cuda_fused_full_softmax = false;
int32_t cuda_top_k_local_k = 0; // 0 = backend default

// Diffusion graph-shape knobs.
bool fused_self_cond_embd = false;
bool fuse_final_logit_softcap = false;
bool separate_encoder_decoder = false;
};

// reasoning API response format (not to be confused as chat template's reasoning format)
Expand Down
2 changes: 2 additions & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
"Gemma4AssistantForCausalLM": "gemma",
"Gemma4ForConditionalGeneration": "gemma",
"Gemma4ForCausalLM": "gemma",
"DiffusionGemmaForBlockDiffusion": "gemma",
"Gemma4UnifiedForConditionalGeneration": "gemma",
"Gemma4UnifiedAssistantForCausalLM": "gemma",
"GemmaForCausalLM": "gemma",
Expand Down Expand Up @@ -245,6 +246,7 @@
"CogVLMForCausalLM": "cogvlm",
"DeepseekOCR2ForCausalLM": "deepseek",
"DeepseekOCRForCausalLM": "deepseek",
"DiffusionGemmaForBlockDiffusion": "gemma",
"DotsOCRForCausalLM": "dotsocr",
"Exaone4_5_ForConditionalGeneration": "exaone",
"Gemma3ForConditionalGeneration": "gemma",
Expand Down
63 changes: 60 additions & 3 deletions conversion/gemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ def set_vocab(self):
def set_gguf_parameters(self):
super().set_gguf_parameters()

num_kv_shared_layers = self.hparams["num_kv_shared_layers"]
num_kv_shared_layers = self.hparams.get("num_kv_shared_layers", 0)
self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers)

# per-layer embedding is optional
Expand Down Expand Up @@ -764,7 +764,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
class Gemma4UnifiedModel(Gemma4Model):
model_arch = gguf.MODEL_ARCH.GEMMA4
Expand Down Expand Up @@ -805,6 +804,32 @@ def set_gguf_parameters(self):
self.gguf_writer.add_nextn_predict_layers(self.block_count)


@ModelBase.register("DiffusionGemmaForBlockDiffusion")
class DiffusionGemmaModel(Gemma4Model):
    # Converter for the block-diffusion flavour of Gemma 4. The gemma4 decoder block is
    # reused as-is; on top of it the checkpoint carries a self-conditioning MLP and keeps
    # the language model under the `model.decoder.` namespace.
    model_arch = gguf.MODEL_ARCH.DIFFUSION_GEMMA

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        tensor_name, _loader = item
        # Only encoder-namespaced tensors are rejected here; everything else is handed to
        # the parent filter unchanged. The text encoder shares all of its weights with the
        # decoder apart from its per-layer `layer_scalar`, and the single-stack graph reads
        # the decoder scalars, so nothing under `model.encoder.` is needed.
        if not tensor_name.startswith("model.encoder."):
            return super().filter_tensors(item)
        return None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Rewrite `model.decoder.<rest>` to `model.<rest>` so that the shared gemma4 tensor
        # mappings match. After the rewrite, `model.decoder.self_conditioning.*` resolves
        # to the SELF_COND_* tensors.
        decoder_prefix = "model.decoder."
        if name.startswith(decoder_prefix):
            remainder = name[len(decoder_prefix):]
            name = "model." + remainder

        yield from super().modify_tensors(data_torch, name, bid)




@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4VisionAudioModel(MmprojModel):
has_audio_encoder = True
Expand Down Expand Up @@ -882,7 +907,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
yield (mapped_name, data_torch)


@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
has_audio_encoder = True
Expand Down Expand Up @@ -945,3 +969,36 @@ def modify_tensors(self, data_torch, name, bid):
perm = row * p * 3 + col * 3 + ch
data_torch = data_torch[perm]
return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("DiffusionGemmaForBlockDiffusion")
class DiffusionGemmaVisionModel(Gemma4VisionAudioModel):
    # mmproj (vision) exporter for the v7 diffusion_gemma multimodal checkpoint. The gemma4
    # vision tower (GEMMA4V) is reused; in v7 it lives under `model.encoder.*` and there is
    # no audio encoder, so the export contains just the vision tower and vision projector.
    has_audio_encoder = False
    has_vision_encoder = True

    def set_gguf_parameters(self):
        # Deliberately bypass the gemma4 vision+audio set_gguf_parameters: it asserts an
        # audio config, which v7 does not have. The MmprojModel base implementation writes
        # the generic vision parameters instead.
        MmprojModel.set_gguf_parameters(self)
        vision_hparams = self.hparams_vision
        assert vision_hparams is not None
        writer = self.gguf_writer
        writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA4V)
        layer_norm_eps = vision_hparams.get("layer_norm_eps", 1e-6)
        writer.add_vision_attention_layernorm_eps(layer_norm_eps)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        tensor_name, _loader = item
        # The diffusion decoder is not part of the mmproj export; only the vision tower and
        # vision projector are kept. (The text-encoder language_model.* tensors are dropped
        # by MmprojModel.filter_tensors.)
        if not tensor_name.startswith("model.decoder."):
            return super().filter_tensors(item)
        return None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Rewrite `model.encoder.<rest>` to `model.<rest>` so the gemma4 vision tensor
        # mappings (model.vision_tower.* / model.embed_vision.*) match the v7 layout.
        encoder_prefix = "model.encoder."
        if name.startswith(encoder_prefix):
            remainder = name[len(encoder_prefix):]
            name = "model." + remainder
        yield from super().modify_tensors(data_torch, name, bid)
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ else()
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
add_subdirectory(diffusion-gemma)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)
# these examples use the backends directly and cannot be built with dynamic loading
Expand Down
17 changes: 17 additions & 0 deletions examples/diffusion-gemma/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Block-diffusion generation CLI built from diffusion-gemma-cli.cpp
set(TARGET llama-diffusion-gemma-cli)
add_executable(${TARGET} diffusion-gemma-cli.cpp)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
# tools/ (which provides mtmd) is processed after examples/, so the mtmd header
# directory has to be added by hand here
target_include_directories(${TARGET} PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd)
target_link_libraries(${TARGET} PRIVATE
    llama
    llama-common
    mtmd
    ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS ${TARGET} RUNTIME)

# OpenAI-compatible HTTP server for the block-diffusion models (llama-server analogue)
set(TARGET llama-diffusion-gemma-server)
add_executable(${TARGET} diffusion-gemma-server.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
# Both extra include dirs are resolved relative to this directory rather than
# CMAKE_SOURCE_DIR: CMAKE_SOURCE_DIR is the root of the outermost build, so it points
# at the wrong tree when this project is consumed via add_subdirectory()/FetchContent.
# mtmd (tools/) is added after examples/, so its include dir is listed explicitly.
target_include_directories(${TARGET} PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd
    ${CMAKE_CURRENT_SOURCE_DIR}/../../vendor) # cpp-httplib/httplib.h, nlohmann/json.hpp
target_compile_features(${TARGET} PRIVATE cxx_std_17)
Loading