From 9d9cda895100bd9d863f59290fb7806cd53b385d Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 21 Jun 2026 18:38:35 -0300 Subject: [PATCH 1/6] sd: sync with master-714-b12098f --- Makefile | 2 +- otherarch/sdcpp/examples/cli/main.cpp | 4 + otherarch/sdcpp/examples/common/common.cpp | 55 +- otherarch/sdcpp/examples/common/common.h | 1 + .../sdcpp/src/conditioning/conditioner.hpp | 98 +- otherarch/sdcpp/src/convert.cpp | 2 +- otherarch/sdcpp/src/core/util.cpp | 9 + otherarch/sdcpp/src/core/util.h | 2 + otherarch/sdcpp/src/model.h | 9 + otherarch/sdcpp/src/model/common/rope.hpp | 4 +- otherarch/sdcpp/src/model/diffusion/anima.hpp | 7 +- otherarch/sdcpp/src/model/diffusion/boogu.hpp | 835 ++++++++++++++++++ .../sdcpp/src/model/diffusion/ernie_image.hpp | 4 +- otherarch/sdcpp/src/model/te/llm.hpp | 72 +- .../sdcpp/src/model/vae/auto_encoder_kl.hpp | 2 +- otherarch/sdcpp/src/model_loader.cpp | 3 + otherarch/sdcpp/src/name_conversion.cpp | 25 + otherarch/sdcpp/src/stable-diffusion.cpp | 29 +- .../sdcpp/src/tokenizers/bpe_tokenizer.cpp | 5 +- .../sdcpp/src/tokenizers/clip_tokenizer.cpp | 7 +- otherarch/sdcpp/src/tokenizers/tokenizer.h | 7 +- 21 files changed, 1129 insertions(+), 53 deletions(-) create mode 100644 otherarch/sdcpp/src/model/diffusion/boogu.hpp diff --git a/Makefile b/Makefile index cd5678538bcc..79a37a71889f 100644 --- a/Makefile +++ b/Makefile @@ -699,7 +699,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h budget.o: common/reasoning-budget.cpp common/reasoning-budget.h $(CXX) $(CXXFLAGS) -c $< -o $@ -SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h 
src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp src/model/diffusion/anima.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp src/name_conversion.cpp src/name_conversion.h src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h 
src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h +SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp src/model/diffusion/anima.hpp src/model/diffusion/boogu.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h 
src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp src/name_conversion.cpp src/name_conversion.h src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h SDCPP_MAIN_BASENAMES := examples/cli/image_metadata.cpp examples/cli/image_metadata.h examples/cli/main.cpp examples/cli/msf_gif.h examples/common/common.cpp examples/common/common.h examples/common/log.cpp examples/common/log.h examples/common/media_io.cpp examples/common/media_io.h 
examples/common/resource_owners.hpp src/tokenizers/vocab/clip_merges.hpp src/tokenizers/vocab/gemma2_merges.hpp src/tokenizers/vocab/gemma2_vocab.hpp src/tokenizers/vocab/gemma_merges.hpp src/tokenizers/vocab/gemma_vocab.hpp src/tokenizers/vocab/gpt_oss_merges.hpp src/tokenizers/vocab/gpt_oss_vocab.hpp src/tokenizers/vocab/mistral_merges.hpp src/tokenizers/vocab/mistral_vocab.hpp src/tokenizers/vocab/qwen_merges.hpp src/tokenizers/vocab/t5.hpp src/tokenizers/vocab/umt5.hpp src/tokenizers/vocab/vocab.cpp src/convert.cpp src/version.cpp diff --git a/otherarch/sdcpp/examples/cli/main.cpp b/otherarch/sdcpp/examples/cli/main.cpp index bb5d6862c2da..84e9e7853324 100644 --- a/otherarch/sdcpp/examples/cli/main.cpp +++ b/otherarch/sdcpp/examples/cli/main.cpp @@ -62,18 +62,22 @@ struct SDCliParams { {"-o", "--output", "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp", + 0, &output_path}, {"", "--image", "path to the image to inspect (for metadata mode)", + 0, &image_path}, {"", "--metadata-format", "metadata output format, one of [text, json] (default: text)", + 0, &metadata_format}, {"", "--preview-path", "path to write preview image to (default: ./preview.png). 
Multi-frame previews support .avi, .webm, and animated .webp", + 0, &preview_path}, }; diff --git a/otherarch/sdcpp/examples/common/common.cpp b/otherarch/sdcpp/examples/common/common.cpp index dd5d35055f29..ad3f97a08080 100644 --- a/otherarch/sdcpp/examples/common/common.cpp +++ b/otherarch/sdcpp/examples/common/common.cpp @@ -260,7 +260,14 @@ bool parse_options(int argc, const char** argv, const std::vector& o invalid_arg = true; return; } - *option.target = argv_to_utf8(i, argv); + if(option.concat && !option.target->empty()){ + if(option.concat > 0 && option.concat <= 0xff){ + *option.target += static_cast(option.concat); + } + *option.target += argv_to_utf8(i, argv); + } else { + *option.target = argv_to_utf8(i, argv); + } found_arg = true; })) break; @@ -324,120 +331,151 @@ ArgOptions SDContextParams::get_options() { {"-m", "--model", "path to full model", + 0, &model_path}, {"", "--clip_l", - "path to the clip-l text encoder", &clip_l_path}, + "path to the clip-l text encoder", + 0, + &clip_l_path}, {"", "--clip_g", "path to the clip-g text encoder", + 0, &clip_g_path}, {"", "--clip_vision", "path to the clip-vision encoder", + 0, &clip_vision_path}, {"", "--t5xxl", "path to the t5xxl text encoder", + 0, &t5xxl_path}, {"", "--llm", "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)", + 0, &llm_path}, {"", "--llm_vision", "path to the llm vit", + 0, &llm_vision_path}, {"", "--qwen2vl", "alias of --llm. Deprecated.", + 0, &llm_path}, {"", "--qwen2vl_vision", "alias of --llm_vision. 
Deprecated.", + 0, &llm_vision_path}, {"", "--diffusion-model", "path to the standalone diffusion model", + 0, &diffusion_model_path}, {"", "--high-noise-diffusion-model", "path to the standalone high noise diffusion model", + 0, &high_noise_diffusion_model_path}, {"", "--uncond-diffusion-model", "path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG", + 0, &uncond_diffusion_model_path}, {"", "--embeddings-connectors", "path to LTXAV embeddings connectors", + 0, &embeddings_connectors_path}, {"", "--vae", "path to standalone vae model", + 0, &vae_path}, {"", "--vae-format", "VAE latent format override: auto, flux, sd3, or flux2 (default: auto)", + 0, &vae_format}, {"", "--audio-vae", "path to standalone LTX audio vae model", + 0, &audio_vae_path}, {"", "--taesd", "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)", + 0, &taesd_path}, {"", "--tae", "alias of --taesd", + 0, &taesd_path}, {"", "--control-net", "path to control net model", + 0, &control_net_path}, {"", "--embd-dir", "embeddings directory", + 0, &embedding_dir}, {"", "--lora-model-dir", "lora model directory", + 0, &lora_model_dir}, {"", "--hires-upscalers-dir", "highres fix upscaler model directory", + 0, &hires_upscalers_dir}, {"", "--tensor-type-rules", "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", + (int)',', &tensor_type_rules}, {"", "--photo-maker", "path to PHOTOMAKER model", + 0, &photo_maker_path}, {"", "--pulid-weights", "path to PuLID Flux weights", + 0, &pulid_weights_path}, {"", "--upscale-model", "path to esrgan model.", + 0, &esrgan_path}, {"", "--backend", "runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0", + (int)',', &backend}, {"", "--params-backend", "parameter backend assignment, e.g. disk, cpu, or diffusion=disk,clip=cpu", + (int)',', &params_backend}, {"", "--rpc-servers", "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. 
localhost:50052,192.168.1.3:50052", + (int)',', &rpc_servers}, {"", "--max-vram", "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value", + 0, &max_vram}, }; @@ -857,58 +895,71 @@ ArgOptions SDGenerationParams::get_options() { {"-p", "--prompt", "the prompt to render", + 0, &prompt}, {"-n", "--negative-prompt", "the negative prompt (default: \"\")", + 0, &negative_prompt}, {"-i", "--init-img", "path to the init image", + 0, &init_image_path}, {"", "--end-img", "path to the end image, required by flf2v", + 0, &end_image_path}, {"", "--mask", "path to the mask image", + 0, &mask_image_path}, {"", "--control-image", "path to control image, control net", + 0, &control_image_path}, {"", "--control-video", "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " "such as 00.png, 01.png, ... etc.", + 0, &control_video_path}, {"", "--pm-id-images-dir", "path to PHOTOMAKER input id images dir", + 0, &pm_id_images_dir}, {"", "--pm-id-embed-path", "path to PHOTOMAKER v2 id embed", + 0, &pm_id_embed_path}, {"", "--pulid-id-embedding", "path to PuLID id embedding", + 0, &pulid_id_embedding_path}, {"", "--hires-upscaler", "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), " "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name " "under --hires-upscalers-dir (default: Latent)", + 0, &hires_upscaler}, {"", "--extra-sample-args", "extra sampler/scheduler/guidance args, key=value list. 
APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma", + (int)',', &extra_sample_args}, {"", "--extra-tiling-args", "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)", + (int)',', &extra_tiling_args}, }; diff --git a/otherarch/sdcpp/examples/common/common.h b/otherarch/sdcpp/examples/common/common.h index fcf9840db692..587cad29f699 100644 --- a/otherarch/sdcpp/examples/common/common.h +++ b/otherarch/sdcpp/examples/common/common.h @@ -31,6 +31,7 @@ struct StringOption { std::string short_name; std::string long_name; std::string desc; + int concat; std::string* target; }; diff --git a/otherarch/sdcpp/src/conditioning/conditioner.hpp b/otherarch/sdcpp/src/conditioning/conditioner.hpp index b5dda4c0e435..ae1a5b5b387e 100644 --- a/otherarch/sdcpp/src/conditioning/conditioner.hpp +++ b/otherarch/sdcpp/src/conditioning/conditioner.hpp @@ -142,8 +142,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::shared_ptr weight_manager = nullptr) : version(version), tokenizer(sd_version_is_sd2(version) ? 
0 : 49407) { for (const auto& kv : orig_embedding_map) { - std::string name = kv.first; - std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); }); + std::string name = normalize_embedding_name(kv.first); embedding_map[name] = kv.second; tokenizer.add_special_token(name); } @@ -278,17 +277,23 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return true; } + static std::string normalize_embedding_name(std::string name) { + std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); }); + return name; + } + + bool append_embedding_tokens(std::string str, std::vector& bpe_tokens) { + std::string name = normalize_embedding_name(std::move(str)); + auto iter = embedding_map.find(name); + if (iter == embedding_map.end()) { + return false; + } + return load_embedding(name, iter->second, bpe_tokens); + } + std::vector convert_token_to_id(std::string text) { auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - auto iter = embedding_map.find(str); - if (iter == embedding_map.end()) { - return false; - } - std::string embedding_path = iter->second; - if (load_embedding(str, embedding_path, bpe_tokens)) { - return true; - } - return false; + return append_embedding_tokens(str, bpe_tokens); }; std::vector curr_tokens = tokenizer.encode(text, on_new_token_cb); return curr_tokens; @@ -315,15 +320,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - auto iter = embedding_map.find(str); - if (iter == embedding_map.end()) { - return false; - } - std::string embedding_path = iter->second; - if (load_embedding(str, embedding_path, bpe_tokens)) { - return true; - } - return false; + return append_embedding_tokens(str, bpe_tokens); }; std::vector tokens; @@ -1521,7 +1518,7 @@ struct LLMEmbedder : public Conditioner { arch = LLM::LLMArch::GPT_OSS_20B; } else 
if (sd_version_is_pid(version)) { arch = LLM::LLMArch::GEMMA2_2B; - } else if (sd_version_is_ideogram4(version)) { + } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) { arch = LLM::LLMArch::QWEN3_VL; } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) { arch = LLM::LLMArch::QWEN3; @@ -1781,6 +1778,65 @@ struct LLMEmbedder : public Conditioner { prompt += "<|im_end|>\n<|im_start|>assistant\n"; } + } else if (sd_version_is_boogu_image(version)) { + prompt_template_encode_start_idx = 0; + + const std::string t2i_system_prompt = + "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows."; + const std::string edit_system_prompt = + "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."; + const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty(); + const bool text_empty = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos; + + if (has_ref_images) { + LOG_INFO("BooguImageEditPipeline"); + const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n"; + std::string img_prompt; + const std::string placeholder = "<|image_pad|>"; + + for (int i = 0; i < conditioner_params.ref_images->size(); i++) { + const auto& image = (*conditioner_params.ref_images)[i]; + double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size; + int height = static_cast(image.shape()[1]); + int width = static_cast(image.shape()[0]); + double beta = std::sqrt((384.0 * 384.0) / (static_cast(height) * static_cast(width))); + 
int h_bar = std::max(static_cast(factor), + static_cast(std::round(height * beta / factor)) * static_cast(factor)); + int w_bar = std::max(static_cast(factor), + static_cast(std::round(width * beta / factor)) * static_cast(factor)); + + LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar); + + auto resized_image = clip_preprocess(image, w_bar, h_bar); + auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true); + GGML_ASSERT(!image_embed.empty()); + + std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>"; + int image_embed_idx = static_cast(tokenizer->encode(image_prefix, nullptr).size()); + image_embeds.emplace_back(image_embed_idx, image_embed); + + img_prompt += "<|vision_start|>"; + int64_t num_image_tokens = image_embed.shape()[1]; + img_prompt.reserve(img_prompt.size() + static_cast(num_image_tokens) * placeholder.size() + 32); + for (int j = 0; j < num_image_tokens; j++) { + img_prompt += placeholder; + } + img_prompt += "<|vision_end|>"; + } + + prompt = prompt_prefix + img_prompt; + prompt_attn_range.first = static_cast(prompt.size()); + prompt += conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); + prompt += "<|im_end|>\n"; + } else { + const std::string& system_prompt = text_empty ? 
edit_system_prompt : t2i_system_prompt; + prompt = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n"; + prompt_attn_range.first = static_cast(prompt.size()); + prompt += conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); + prompt += "<|im_end|>\n"; + } } else if (sd_version_is_longcat(version)) { spell_quotes = true; diff --git a/otherarch/sdcpp/src/convert.cpp b/otherarch/sdcpp/src/convert.cpp index 5ad066c105c7..27d377ec09c4 100644 --- a/otherarch/sdcpp/src/convert.cpp +++ b/otherarch/sdcpp/src/convert.cpp @@ -99,7 +99,7 @@ bool convert(const char* input_path, model_loader.convert_tensors_name(); } - ggml_type type = (ggml_type)output_type; + ggml_type type = sd_type_to_ggml_type(output_type); bool output_is_safetensors = ends_with(output_path, ".safetensors"); TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules); diff --git a/otherarch/sdcpp/src/core/util.cpp b/otherarch/sdcpp/src/core/util.cpp index a70722af201f..05c308d9b93a 100644 --- a/otherarch/sdcpp/src/core/util.cpp +++ b/otherarch/sdcpp/src/core/util.cpp @@ -420,6 +420,15 @@ std::vector split_string(const std::string& str, char delimiter) { return result; } +ggml_type sd_type_to_ggml_type(sd_type_t sdtype) { + const int type_value = static_cast(sdtype); + if (type_value < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT)) { + return static_cast(type_value); + } else { + return GGML_TYPE_COUNT; + } +} + KeyValueArgs parse_key_value_args(const char* args, const char* context) { KeyValueArgs pairs; diff --git a/otherarch/sdcpp/src/core/util.h b/otherarch/sdcpp/src/core/util.h index a271c1f71869..ec4c4559524f 100644 --- a/otherarch/sdcpp/src/core/util.h +++ b/otherarch/sdcpp/src/core/util.h @@ -83,6 +83,8 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...); +ggml_type sd_type_to_ggml_type(sd_type_t sdtype); + std::string 
trim(const std::string& s); std::vector> parse_prompt_attention(const std::string& text); diff --git a/otherarch/sdcpp/src/model.h b/otherarch/sdcpp/src/model.h index a62c4d1bf89e..17272f7d69d9 100644 --- a/otherarch/sdcpp/src/model.h +++ b/otherarch/sdcpp/src/model.h @@ -42,6 +42,7 @@ enum SDVersion { VERSION_LTXAV, VERSION_HIDREAM_O1, VERSION_Z_IMAGE, + VERSION_BOOGU_IMAGE, VERSION_OVIS_IMAGE, VERSION_ERNIE_IMAGE, VERSION_LENS, @@ -143,6 +144,13 @@ static inline bool sd_version_is_z_image(SDVersion version) { return false; } +static inline bool sd_version_is_boogu_image(SDVersion version) { + if (version == VERSION_BOOGU_IMAGE) { + return true; + } + return false; +} + static inline bool sd_version_is_longcat(SDVersion version) { if (version == VERSION_LONGCAT) { return true; @@ -206,6 +214,7 @@ static inline bool sd_version_is_dit(SDVersion version) { version == VERSION_HIDREAM_O1 || sd_version_is_anima(version) || sd_version_is_z_image(version) || + sd_version_is_boogu_image(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_longcat(version) || diff --git a/otherarch/sdcpp/src/model/common/rope.hpp b/otherarch/sdcpp/src/model/common/rope.hpp index c0077de33bc5..2e21ef7c2b76 100644 --- a/otherarch/sdcpp/src/model/common/rope.hpp +++ b/otherarch/sdcpp/src/model/common/rope.hpp @@ -899,10 +899,12 @@ namespace Rope { // q,k,v: [N, L, n_head, d_head] // pe: [L, d_head/2, 2, 2] // return: [N, L, n_head*d_head] + int64_t n_head = q->ne[1]; + q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head] k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head] - auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head] + auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head] return x; } }; // 
namespace Rope diff --git a/otherarch/sdcpp/src/model/diffusion/anima.hpp b/otherarch/sdcpp/src/model/diffusion/anima.hpp index 6042516a90c6..504904d41f8c 100644 --- a/otherarch/sdcpp/src/model/diffusion/anima.hpp +++ b/otherarch/sdcpp/src/model/diffusion/anima.hpp @@ -227,6 +227,7 @@ namespace Anima { k4 = k_norm->forward(ctx, k4); ggml_tensor* attn_out = nullptr; + float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f; if (pe_q != nullptr || pe_k != nullptr) { if (pe_q == nullptr) { pe_q = pe_k; @@ -244,7 +245,8 @@ namespace Anima { num_heads, nullptr, true, - ctx->flash_attn_enabled); + ctx->flash_attn_enabled, + scale); } else { auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N); auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N); @@ -256,7 +258,8 @@ namespace Anima { num_heads, nullptr, false, - ctx->flash_attn_enabled); + ctx->flash_attn_enabled, + scale); } return out_proj->forward(ctx, attn_out); diff --git a/otherarch/sdcpp/src/model/diffusion/boogu.hpp b/otherarch/sdcpp/src/model/diffusion/boogu.hpp new file mode 100644 index 000000000000..27e13aebd40c --- /dev/null +++ b/otherarch/sdcpp/src/model/diffusion/boogu.hpp @@ -0,0 +1,835 @@ +#ifndef __SD_MODEL_DIFFUSION_BOOGU_HPP__ +#define __SD_MODEL_DIFFUSION_BOOGU_HPP__ + +#include +#include +#include +#include + +#include "core/ggml_extend.hpp" +#include "model/common/rope.hpp" +#include "model/diffusion/dit.hpp" +#include "model/diffusion/model.hpp" +#include "model/diffusion/qwen_image.hpp" +#include "model_loader.h" + +namespace Boogu { + constexpr int BOOGU_GRAPH_SIZE = 65536; + + struct BooguConfig { + int patch_size = 2; + int64_t in_channels = 16; + int64_t out_channels = 16; + int64_t hidden_size = 3360; + int64_t num_layers = 32; + int64_t num_double_stream_layers = 8; + int64_t num_refiner_layers = 2; + int64_t num_attention_heads = 28; + int64_t num_kv_heads = 7; + int64_t head_dim = 120; + 
int64_t multiple_of = 256; + int64_t instruction_feat_dim = 4096; + int64_t timestep_embed_dim = 1024; + int theta = 10000; + float timestep_scale = 1000.0f; + float norm_eps = 1e-5f; + std::vector axes_dim = {40, 40, 40}; + int64_t axes_dim_sum = 120; + + static int64_t count_blocks(const String2TensorStorage& tensor_storage_map, + const std::string& prefix, + const std::string& block_prefix) { + int64_t count = 0; + for (const auto& [name, _] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + size_t pos = name.find(block_prefix); + if (pos == std::string::npos) { + continue; + } + auto items = split_string(name.substr(pos), '.'); + if (items.size() > 1) { + count = std::max(count, atoi(items[1].c_str()) + 1); + } + } + return count; + } + + static BooguConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) { + BooguConfig config; + int64_t detected_head_dim = 0; + int64_t detected_kv_dim = 0; + + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "x_embedder.weight") && tensor_storage.n_dims == 2) { + int64_t patch_area = config.patch_size * config.patch_size; + config.in_channels = tensor_storage.ne[0] / patch_area; + config.hidden_size = tensor_storage.ne[1]; + } else if (ends_with(name, "time_caption_embed.caption_embedder.1.weight") && tensor_storage.n_dims == 2) { + config.instruction_feat_dim = tensor_storage.ne[0]; + config.hidden_size = tensor_storage.ne[1]; + } else if (ends_with(name, "single_stream_layers.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) { + detected_head_dim = tensor_storage.ne[0]; + } else if (ends_with(name, "double_stream_layers.0.img_self_attn.norm_q.weight") && tensor_storage.n_dims == 1) { + detected_head_dim = tensor_storage.ne[0]; + } else if (ends_with(name, "single_stream_layers.0.attn.to_k.weight") && tensor_storage.n_dims == 2) { + detected_kv_dim = 
tensor_storage.ne[1]; + } else if (ends_with(name, "double_stream_layers.0.img_instruct_attn.processor.img_to_k.weight") && tensor_storage.n_dims == 2) { + detected_kv_dim = tensor_storage.ne[1]; + } else if (ends_with(name, "norm_out.linear_2.weight") && tensor_storage.n_dims == 2) { + int64_t patch_area = config.patch_size * config.patch_size; + config.out_channels = tensor_storage.ne[1] / patch_area; + } + } + + config.num_layers = std::max(1, count_blocks(tensor_storage_map, prefix, "single_stream_layers.")); + config.num_double_stream_layers = std::max(0, count_blocks(tensor_storage_map, prefix, "double_stream_layers.")); + int64_t noise_refiner_layers = count_blocks(tensor_storage_map, prefix, "noise_refiner."); + int64_t ref_refiner_layers = count_blocks(tensor_storage_map, prefix, "ref_image_refiner."); + int64_t context_refiner_layers = count_blocks(tensor_storage_map, prefix, "context_refiner."); + config.num_refiner_layers = std::max(1, std::max(noise_refiner_layers, std::max(ref_refiner_layers, context_refiner_layers))); + + if (detected_head_dim > 0) { + config.head_dim = detected_head_dim; + config.num_attention_heads = config.hidden_size / config.head_dim; + config.axes_dim_sum = config.head_dim; + if (detected_kv_dim > 0) { + config.num_kv_heads = detected_kv_dim / config.head_dim; + } + if (config.axes_dim_sum == 120) { + config.axes_dim = {40, 40, 40}; + } else if (config.axes_dim_sum % 3 == 0) { + int axis = static_cast(config.axes_dim_sum / 3); + config.axes_dim = {axis, axis, axis}; + } + } + config.timestep_embed_dim = std::min(config.hidden_size, 1024); + + LOG_DEBUG("boogu_image: layers=%" PRId64 ", double_stream_layers=%" PRId64 ", refiner_layers=%" PRId64 ", hidden=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", head_dim=%" PRId64 ", in_channels=%" PRId64 ", out_channels=%" PRId64, + config.num_layers, + config.num_double_stream_layers, + config.num_refiner_layers, + config.hidden_size, + config.num_attention_heads, + 
config.num_kv_heads, + config.head_dim, + config.in_channels, + config.out_channels); + return config; + } + }; + + __STATIC_INLINE__ ggml_tensor* scale_modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) { + scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]); + return ggml_add(ctx, x, ggml_mul(ctx, x, scale)); + } + + __STATIC_INLINE__ ggml_tensor* gate_residual(ggml_context* ctx, ggml_tensor* residual, ggml_tensor* x, ggml_tensor* gate) { + gate = ggml_tanh(ctx, gate); + gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); + x = ggml_mul(ctx, x, gate); + return ggml_add(ctx, residual, x); + } + + struct LuminaCombinedTimestepCaptionEmbedding : public GGMLBlock { + int64_t frequency_embedding_size; + float timestep_scale; + + LuminaCombinedTimestepCaptionEmbedding(int64_t hidden_size, + int64_t instruction_feat_dim, + int64_t frequency_embedding_size, + float norm_eps, + float timestep_scale) + : frequency_embedding_size(frequency_embedding_size), + timestep_scale(timestep_scale) { + blocks["timestep_embedder"] = std::make_shared(frequency_embedding_size, std::min(hidden_size, 1024)); + blocks["caption_embedder.0"] = std::make_shared(instruction_feat_dim, norm_eps); + blocks["caption_embedder.1"] = std::make_shared(instruction_feat_dim, hidden_size, true); + } + + std::pair forward(GGMLRunnerContext* ctx, ggml_tensor* timestep, ggml_tensor* text_hidden_states) { + auto timestep_embedder = std::dynamic_pointer_cast(blocks["timestep_embedder"]); + auto caption_embedder_0 = std::dynamic_pointer_cast(blocks["caption_embedder.0"]); + auto caption_embedder_1 = std::dynamic_pointer_cast(blocks["caption_embedder.1"]); + + auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast(frequency_embedding_size), 10000, timestep_scale); + auto time_embed = timestep_embedder->forward(ctx, timestep_proj); + auto caption_embed = caption_embedder_1->forward(ctx, caption_embedder_0->forward(ctx, text_hidden_states)); + 
return {time_embed, caption_embed}; + } + }; + + struct LuminaRMSNormZero : public GGMLBlock { + LuminaRMSNormZero(int64_t embedding_dim, int64_t conditioning_embedding_dim, float norm_eps) { + blocks["linear"] = std::make_shared(conditioning_embedding_dim, 4 * embedding_dim, true); + blocks["norm"] = std::make_shared(embedding_dim, norm_eps); + } + + std::tuple forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb) { + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + + emb = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, emb)); + auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 4, 0); + + auto scale_msa = mods[0]; + auto gate_msa = mods[1]; + auto scale_mlp = mods[2]; + auto gate_mlp = mods[3]; + + x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), scale_msa); + return {x, gate_msa, scale_mlp, gate_mlp}; + } + }; + + struct LuminaFeedForward : public GGMLBlock { + LuminaFeedForward(int64_t dim, int64_t inner_dim, int64_t multiple_of) { + inner_dim = multiple_of * ((inner_dim + multiple_of - 1) / multiple_of); + blocks["linear_1"] = std::make_shared(dim, inner_dim, false); + blocks["linear_2"] = std::make_shared(inner_dim, dim, false); + blocks["linear_3"] = std::make_shared(dim, inner_dim, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto linear_1 = std::dynamic_pointer_cast(blocks["linear_1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["linear_2"]); + auto linear_3 = std::dynamic_pointer_cast(blocks["linear_3"]); + + if (sd_backend_is(ctx->backend, "Vulkan")) { + linear_2->set_force_prec_f32(true); + } + + auto h1 = linear_1->forward(ctx, x); + auto h2 = linear_3->forward(ctx, x); + x = ggml_swiglu_split(ctx->ggml_ctx, h1, h2); + x = linear_2->forward(ctx, x); + return x; + } + }; + + struct LuminaLayerNormContinuous : public GGMLBlock { + LuminaLayerNormContinuous(int64_t embedding_dim, + int64_t conditioning_embedding_dim, + int64_t 
out_dim) { + blocks["linear_1"] = std::make_shared(conditioning_embedding_dim, embedding_dim, true); + blocks["norm"] = std::make_shared(embedding_dim, 1e-6f, false); + blocks["linear_2"] = std::make_shared(embedding_dim, out_dim, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning_embedding) { + auto linear_1 = std::dynamic_pointer_cast(blocks["linear_1"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["linear_2"]); + + auto emb = linear_1->forward(ctx, ggml_silu(ctx->ggml_ctx, conditioning_embedding)); + x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), emb); + x = linear_2->forward(ctx, x); + return x; + } + }; + + struct Attention : public GGMLBlock { + int64_t dim_head; + int64_t heads; + int64_t kv_heads; + + Attention(int64_t query_dim, int64_t dim_head, int64_t heads, int64_t kv_heads, float eps = 1e-5f) + : dim_head(dim_head), heads(heads), kv_heads(kv_heads) { + blocks["to_q"] = std::make_shared(query_dim, heads * dim_head, false); + blocks["to_k"] = std::make_shared(query_dim, kv_heads * dim_head, false); + blocks["to_v"] = std::make_shared(query_dim, kv_heads * dim_head, false); + blocks["norm_q"] = std::make_shared(dim_head, eps); + blocks["norm_k"] = std::make_shared(dim_head, eps); + blocks["to_out.0"] = std::make_shared(heads * dim_head, query_dim, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* encoder_hidden_states, + ggml_tensor* rotary_emb, + ggml_tensor* attention_mask = nullptr) { + auto to_q = std::dynamic_pointer_cast(blocks["to_q"]); + auto to_k = std::dynamic_pointer_cast(blocks["to_k"]); + auto to_v = std::dynamic_pointer_cast(blocks["to_v"]); + auto norm_q = std::dynamic_pointer_cast(blocks["norm_q"]); + auto norm_k = std::dynamic_pointer_cast(blocks["norm_k"]); + auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + + if (sd_backend_is(ctx->backend, 
"Vulkan")) { + to_out_0->set_force_prec_f32(true); + } + + int64_t N = hidden_states->ne[2]; + int64_t Lq = hidden_states->ne[1]; + int64_t Lk = encoder_hidden_states->ne[1]; + + auto q = to_q->forward(ctx, hidden_states); + q = ggml_reshape_4d(ctx->ggml_ctx, q, dim_head, heads, Lq, N); + auto k = to_k->forward(ctx, encoder_hidden_states); + k = ggml_reshape_4d(ctx->ggml_ctx, k, dim_head, kv_heads, Lk, N); + auto v = to_v->forward(ctx, encoder_hidden_states); + v = ggml_reshape_4d(ctx->ggml_ctx, v, dim_head, kv_heads, Lk, N); + + q = norm_q->forward(ctx, q); + k = norm_k->forward(ctx, k); + + auto out = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask); + out = to_out_0->forward(ctx, out); + return out; + } + }; + + struct BooguImageTransformerBlock : public GGMLBlock { + bool modulation; + + BooguImageTransformerBlock(int64_t dim, + int64_t num_attention_heads, + int64_t num_kv_heads, + int64_t multiple_of, + float norm_eps, + bool modulation) + : modulation(modulation) { + int64_t head_dim = dim / num_attention_heads; + blocks["attn"] = std::make_shared(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f); + blocks["feed_forward"] = std::make_shared(dim, 4 * dim, multiple_of); + if (modulation) { + blocks["norm1"] = std::make_shared(dim, std::min(dim, 1024), norm_eps); + } else { + blocks["norm1"] = std::make_shared(dim, norm_eps); + } + blocks["ffn_norm1"] = std::make_shared(dim, norm_eps); + blocks["norm2"] = std::make_shared(dim, norm_eps); + blocks["ffn_norm2"] = std::make_shared(dim, norm_eps); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* rotary_emb, + ggml_tensor* temb = nullptr, + ggml_tensor* attention_mask = nullptr) { + auto attn = std::dynamic_pointer_cast(blocks["attn"]); + auto feed_forward = std::dynamic_pointer_cast(blocks["feed_forward"]); + auto ffn_norm1 = std::dynamic_pointer_cast(blocks["ffn_norm1"]); + auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + auto ffn_norm2 = 
std::dynamic_pointer_cast(blocks["ffn_norm2"]); + + if (modulation) { + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + auto mods = norm1->forward(ctx, hidden_states, temb); + + auto norm_hidden_states = std::get<0>(mods); + auto gate_msa = std::get<1>(mods); + auto scale_mlp = std::get<2>(mods); + auto gate_mlp = std::get<3>(mods); + + auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask); + hidden_states = gate_residual(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output), gate_msa); + + auto mlp_input = scale_modulate(ctx->ggml_ctx, ffn_norm1->forward(ctx, hidden_states), scale_mlp); + auto mlp_output = feed_forward->forward(ctx, mlp_input); + hidden_states = gate_residual(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output), gate_mlp); + } else { + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + + auto norm_hidden_states = norm1->forward(ctx, hidden_states); + auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output)); + + auto mlp_output = feed_forward->forward(ctx, ffn_norm1->forward(ctx, hidden_states)); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output)); + } + return hidden_states; + } + }; + + struct BooguImageJointAttention : public GGMLBlock { + int64_t dim_head; + int64_t heads; + int64_t kv_heads; + + BooguImageJointAttention(int64_t dim, int64_t dim_head, int64_t heads, int64_t kv_heads) + : dim_head(dim_head), heads(heads), kv_heads(kv_heads) { + blocks["norm_q"] = std::make_shared(dim_head, 1e-5f); + blocks["norm_k"] = std::make_shared(dim_head, 1e-5f); + blocks["to_out.0"] = std::make_shared(heads * dim_head, dim, false); + blocks["processor.img_to_q"] = std::make_shared(dim, heads * dim_head, false); + blocks["processor.img_to_k"] = std::make_shared(dim, kv_heads * 
dim_head, false); + blocks["processor.img_to_v"] = std::make_shared(dim, kv_heads * dim_head, false); + blocks["processor.instruct_to_q"] = std::make_shared(dim, heads * dim_head, false); + blocks["processor.instruct_to_k"] = std::make_shared(dim, kv_heads * dim_head, false); + blocks["processor.instruct_to_v"] = std::make_shared(dim, kv_heads * dim_head, false); + blocks["processor.instruct_out"] = std::make_shared(heads * dim_head, dim, false); + blocks["processor.img_out"] = std::make_shared(heads * dim_head, dim, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* img_hidden_states, + ggml_tensor* instruct_hidden_states, + ggml_tensor* rotary_emb, + ggml_tensor* attention_mask = nullptr) { + auto norm_q = std::dynamic_pointer_cast(blocks["norm_q"]); + auto norm_k = std::dynamic_pointer_cast(blocks["norm_k"]); + auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + auto img_to_q = std::dynamic_pointer_cast(blocks["processor.img_to_q"]); + auto img_to_k = std::dynamic_pointer_cast(blocks["processor.img_to_k"]); + auto img_to_v = std::dynamic_pointer_cast(blocks["processor.img_to_v"]); + auto instruct_to_q = std::dynamic_pointer_cast(blocks["processor.instruct_to_q"]); + auto instruct_to_k = std::dynamic_pointer_cast(blocks["processor.instruct_to_k"]); + auto instruct_to_v = std::dynamic_pointer_cast(blocks["processor.instruct_to_v"]); + auto instruct_out = std::dynamic_pointer_cast(blocks["processor.instruct_out"]); + auto img_out = std::dynamic_pointer_cast(blocks["processor.img_out"]); + + if (sd_backend_is(ctx->backend, "Vulkan")) { + to_out_0->set_force_prec_f32(true); + } + + int64_t N = img_hidden_states->ne[2]; + int64_t L_img = img_hidden_states->ne[1]; + int64_t L_instruct = instruct_hidden_states->ne[1]; + + auto img_q = img_to_q->forward(ctx, img_hidden_states); + img_q = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, heads, L_img, N); + auto img_k = img_to_k->forward(ctx, img_hidden_states); + img_k = 
ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, kv_heads, L_img, N); + auto img_v = img_to_v->forward(ctx, img_hidden_states); + img_v = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, kv_heads, L_img, N); + + auto instruct_q = instruct_to_q->forward(ctx, instruct_hidden_states); + instruct_q = ggml_reshape_4d(ctx->ggml_ctx, instruct_q, dim_head, heads, L_instruct, N); + auto instruct_k = instruct_to_k->forward(ctx, instruct_hidden_states); + instruct_k = ggml_reshape_4d(ctx->ggml_ctx, instruct_k, dim_head, kv_heads, L_instruct, N); + auto instruct_v = instruct_to_v->forward(ctx, instruct_hidden_states); + instruct_v = ggml_reshape_4d(ctx->ggml_ctx, instruct_v, dim_head, kv_heads, L_instruct, N); + + auto q = ggml_concat(ctx->ggml_ctx, instruct_q, img_q, 2); + auto k = ggml_concat(ctx->ggml_ctx, instruct_k, img_k, 2); + auto v = ggml_concat(ctx->ggml_ctx, instruct_v, img_v, 2); + q = norm_q->forward(ctx, q); + k = norm_k->forward(ctx, k); + + auto hidden_states = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask); + auto instruct_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, L_instruct); + auto img_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, L_instruct, L_instruct + L_img); + + instruct_attn = instruct_out->forward(ctx, instruct_attn); + img_attn = img_out->forward(ctx, img_attn); + hidden_states = ggml_concat(ctx->ggml_ctx, instruct_attn, img_attn, 1); + hidden_states = to_out_0->forward(ctx, hidden_states); + return hidden_states; + } + }; + + struct BooguImageDoubleStreamBlock : public GGMLBlock { + BooguImageDoubleStreamBlock(int64_t dim, + int64_t num_attention_heads, + int64_t num_kv_heads, + int64_t multiple_of, + float norm_eps) { + int64_t head_dim = dim / num_attention_heads; + blocks["img_instruct_attn"] = std::make_shared(dim, head_dim, num_attention_heads, num_kv_heads); + blocks["img_self_attn"] = std::make_shared(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f); + blocks["img_feed_forward"] = 
std::make_shared(dim, 4 * dim, multiple_of); + blocks["instruct_feed_forward"] = std::make_shared(dim, 4 * dim, multiple_of); + blocks["img_norm1"] = std::make_shared(dim, std::min(dim, 1024), norm_eps); + blocks["img_norm2"] = std::make_shared(dim, std::min(dim, 1024), norm_eps); + blocks["img_norm3"] = std::make_shared(dim, std::min(dim, 1024), norm_eps); + blocks["instruct_norm1"] = std::make_shared(dim, std::min(dim, 1024), norm_eps); + blocks["instruct_norm2"] = std::make_shared(dim, std::min(dim, 1024), norm_eps); + blocks["img_attn_norm"] = std::make_shared(dim, norm_eps); + blocks["img_self_attn_norm"] = std::make_shared(dim, norm_eps); + blocks["img_ffn_norm1"] = std::make_shared(dim, norm_eps); + blocks["img_ffn_norm2"] = std::make_shared(dim, norm_eps); + blocks["instruct_attn_norm"] = std::make_shared(dim, norm_eps); + blocks["instruct_ffn_norm1"] = std::make_shared(dim, norm_eps); + blocks["instruct_ffn_norm2"] = std::make_shared(dim, norm_eps); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* img_hidden_states, + ggml_tensor* instruct_hidden_states, + ggml_tensor* joint_rotary_emb, + ggml_tensor* img_rotary_emb, + ggml_tensor* temb) { + auto img_instruct_attn = std::dynamic_pointer_cast(blocks["img_instruct_attn"]); + auto img_self_attn = std::dynamic_pointer_cast(blocks["img_self_attn"]); + auto img_feed_forward = std::dynamic_pointer_cast(blocks["img_feed_forward"]); + auto instruct_feed_forward = std::dynamic_pointer_cast(blocks["instruct_feed_forward"]); + auto img_norm1 = std::dynamic_pointer_cast(blocks["img_norm1"]); + auto img_norm2 = std::dynamic_pointer_cast(blocks["img_norm2"]); + auto img_norm3 = std::dynamic_pointer_cast(blocks["img_norm3"]); + auto instruct_norm1 = std::dynamic_pointer_cast(blocks["instruct_norm1"]); + auto instruct_norm2 = std::dynamic_pointer_cast(blocks["instruct_norm2"]); + auto img_attn_norm = std::dynamic_pointer_cast(blocks["img_attn_norm"]); + auto img_self_attn_norm = 
std::dynamic_pointer_cast(blocks["img_self_attn_norm"]); + auto img_ffn_norm1 = std::dynamic_pointer_cast(blocks["img_ffn_norm1"]); + auto img_ffn_norm2 = std::dynamic_pointer_cast(blocks["img_ffn_norm2"]); + auto instruct_attn_norm = std::dynamic_pointer_cast(blocks["instruct_attn_norm"]); + auto instruct_ffn_norm1 = std::dynamic_pointer_cast(blocks["instruct_ffn_norm1"]); + auto instruct_ffn_norm2 = std::dynamic_pointer_cast(blocks["instruct_ffn_norm2"]); + + int64_t L_instruct = instruct_hidden_states->ne[1]; + + auto img_norm1_out_vec = img_norm1->forward(ctx, img_hidden_states, temb); + auto img_norm2_out_vec = img_norm2->forward(ctx, img_hidden_states, temb); + auto img_norm3_out_vec = img_norm3->forward(ctx, img_hidden_states, temb); + auto instruct_norm1_out_vec = instruct_norm1->forward(ctx, instruct_hidden_states, temb); + auto instruct_norm2_out_vec = instruct_norm2->forward(ctx, instruct_hidden_states, temb); + + auto img_norm1_out = std::get<0>(img_norm1_out_vec); + auto img_gate_msa = std::get<1>(img_norm1_out_vec); + auto img_scale_mlp = std::get<2>(img_norm1_out_vec); + auto img_gate_mlp = std::get<3>(img_norm1_out_vec); + + auto img_norm2_out = std::get<0>(img_norm2_out_vec); + auto img_shift_mlp = std::get<1>(img_norm2_out_vec); + + auto img_norm3_out = std::get<0>(img_norm3_out_vec); + auto img_gate_self = std::get<1>(img_norm3_out_vec); + + auto instruct_norm1_out = std::get<0>(instruct_norm1_out_vec); + auto instruct_gate_msa = std::get<1>(instruct_norm1_out_vec); + auto instruct_scale_mlp = std::get<2>(instruct_norm1_out_vec); + auto instruct_gate_mlp = std::get<3>(instruct_norm1_out_vec); + + auto instruct_norm2_out = std::get<0>(instruct_norm2_out_vec); + auto instruct_shift_mlp = std::get<1>(instruct_norm2_out_vec); + + auto joint_attn_out = img_instruct_attn->forward(ctx, img_norm1_out, instruct_norm1_out, joint_rotary_emb); + auto instruct_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, 0, L_instruct); + auto img_attn_out = 
ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, L_instruct, joint_attn_out->ne[1]); + + auto img_self_attn_out = img_self_attn->forward(ctx, img_norm3_out, img_norm3_out, img_rotary_emb); + + img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_attn_norm->forward(ctx, img_attn_out), img_gate_msa); + img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_self_attn_norm->forward(ctx, img_self_attn_out), img_gate_self); + + auto img_mlp_input = scale_modulate(ctx->ggml_ctx, img_norm2_out, img_scale_mlp); + img_shift_mlp = ggml_reshape_3d(ctx->ggml_ctx, img_shift_mlp, img_shift_mlp->ne[0], 1, img_shift_mlp->ne[1]); + img_mlp_input = ggml_add(ctx->ggml_ctx, img_mlp_input, img_shift_mlp); + auto img_mlp_out = img_feed_forward->forward(ctx, img_ffn_norm1->forward(ctx, img_mlp_input)); + img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_ffn_norm2->forward(ctx, img_mlp_out), img_gate_mlp); + + instruct_hidden_states = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_attn_norm->forward(ctx, instruct_attn_out), instruct_gate_msa); + auto instruct_mlp_input = scale_modulate(ctx->ggml_ctx, instruct_norm2_out, instruct_scale_mlp); + instruct_shift_mlp = ggml_reshape_3d(ctx->ggml_ctx, instruct_shift_mlp, instruct_shift_mlp->ne[0], 1, instruct_shift_mlp->ne[1]); + instruct_mlp_input = ggml_add(ctx->ggml_ctx, instruct_mlp_input, instruct_shift_mlp); + auto instruct_mlp_out = instruct_feed_forward->forward(ctx, instruct_ffn_norm1->forward(ctx, instruct_mlp_input)); + instruct_hidden_states = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_ffn_norm2->forward(ctx, instruct_mlp_out), instruct_gate_mlp); + + return {img_hidden_states, instruct_hidden_states}; + } + }; + + struct BooguImageModel : public GGMLBlock { + BooguConfig config; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + 
GGML_UNUSED(tensor_storage_map); + GGML_UNUSED(prefix); + params["image_index_embedding"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, config.hidden_size, 5); + } + + BooguImageModel() = default; + BooguImageModel(BooguConfig config) + : config(std::move(config)) { + blocks["x_embedder"] = std::make_shared(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true); + blocks["ref_image_patch_embedder"] = std::make_shared(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true); + blocks["time_caption_embed"] = std::make_shared(this->config.hidden_size, + this->config.instruction_feat_dim, + 256, + this->config.norm_eps, + this->config.timestep_scale); + + for (int i = 0; i < this->config.num_refiner_layers; i++) { + blocks["noise_refiner." + std::to_string(i)] = std::make_shared(this->config.hidden_size, + this->config.num_attention_heads, + this->config.num_kv_heads, + this->config.multiple_of, + this->config.norm_eps, + true); + blocks["ref_image_refiner." + std::to_string(i)] = std::make_shared(this->config.hidden_size, + this->config.num_attention_heads, + this->config.num_kv_heads, + this->config.multiple_of, + this->config.norm_eps, + true); + blocks["context_refiner." + std::to_string(i)] = std::make_shared(this->config.hidden_size, + this->config.num_attention_heads, + this->config.num_kv_heads, + this->config.multiple_of, + this->config.norm_eps, + false); + } + + for (int i = 0; i < this->config.num_double_stream_layers; i++) { + blocks["double_stream_layers." + std::to_string(i)] = std::make_shared(this->config.hidden_size, + this->config.num_attention_heads, + this->config.num_kv_heads, + this->config.multiple_of, + this->config.norm_eps); + } + + for (int i = 0; i < this->config.num_layers; i++) { + blocks["single_stream_layers." 
+ std::to_string(i)] = std::make_shared(this->config.hidden_size, + this->config.num_attention_heads, + this->config.num_kv_heads, + this->config.multiple_of, + this->config.norm_eps, + true); + } + + blocks["norm_out"] = std::make_shared(this->config.hidden_size, + this->config.timestep_embed_dim, + this->config.patch_size * this->config.patch_size * this->config.out_channels); + } + + ggml_tensor* image_index_embedding(GGMLRunnerContext* ctx, int index) { + GGML_ASSERT(index >= 0 && index < 5); + auto embedding = params["image_index_embedding"]; + auto out = ggml_view_1d(ctx->ggml_ctx, + embedding, + config.hidden_size, + index * config.hidden_size * ggml_element_size(embedding)); + out = ggml_reshape_3d(ctx->ggml_ctx, out, config.hidden_size, 1, 1); + return out; + } + + ggml_tensor* embed_refs(GGMLRunnerContext* ctx, const std::vector& ref_latents) { + if (ref_latents.empty()) { + return nullptr; + } + auto ref_image_patch_embedder = std::dynamic_pointer_cast(blocks["ref_image_patch_embedder"]); + + ggml_tensor* ref_img = nullptr; + for (int i = 0; i < static_cast(ref_latents.size()); i++) { + auto ref = DiT::pad_and_patchify(ctx, ref_latents[i], config.patch_size, config.patch_size, false); + ref = ref_image_patch_embedder->forward(ctx, ref); + ref = ggml_add(ctx->ggml_ctx, ref, image_index_embedding(ctx, std::min(i, 4))); + ref_img = ref_img == nullptr ? 
ref : ggml_concat(ctx->ggml_ctx, ref_img, ref, 1); + } + return ref_img; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* pe, + std::vector ref_latents = {}) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t N = x->ne[3]; + GGML_ASSERT(N == 1); + + auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]); + auto time_caption_embed = std::dynamic_pointer_cast(blocks["time_caption_embed"]); + auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); + + auto timestep = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, timesteps), timesteps); + auto embeds = time_caption_embed->forward(ctx, timestep, context); + auto temb = embeds.first; + auto txt = embeds.second; + + auto img = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, false); + int64_t img_len = img->ne[1]; + img = x_embedder->forward(ctx, img); + auto ref_img = embed_refs(ctx, ref_latents); + int64_t ref_len = ref_img != nullptr ? ref_img->ne[1] : 0; + int64_t txt_len = txt->ne[1]; + + GGML_ASSERT(pe->ne[3] == txt_len + ref_len + img_len); + auto txt_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt_len); + auto noise_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len + ref_len, txt_len + ref_len + img_len); + + for (int i = 0; i < config.num_refiner_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["context_refiner." + std::to_string(i)]); + txt = block->forward(ctx, txt, txt_pe); + sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.context_refiner." + std::to_string(i), "txt"); + } + + for (int i = 0; i < config.num_refiner_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["noise_refiner." + std::to_string(i)]); + img = block->forward(ctx, img, noise_pe, temb); + sd::ggml_graph_cut::mark_graph_cut(img, "boogu.noise_refiner." 
+ std::to_string(i), "img"); + } + + ggml_tensor* combined_img = img; + if (ref_img != nullptr) { + auto ref_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + ref_len); + for (int i = 0; i < config.num_refiner_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["ref_image_refiner." + std::to_string(i)]); + ref_img = block->forward(ctx, ref_img, ref_pe, temb); + sd::ggml_graph_cut::mark_graph_cut(ref_img, "boogu.ref_image_refiner." + std::to_string(i), "ref_img"); + } + combined_img = ggml_concat(ctx->ggml_ctx, ref_img, img, 1); + } + + auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + combined_img->ne[1]); + for (int i = 0; i < config.num_double_stream_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["double_stream_layers." + std::to_string(i)]); + auto result = block->forward(ctx, combined_img, txt, pe, img_pe, temb); + combined_img = result.first; + txt = result.second; + sd::ggml_graph_cut::mark_graph_cut(combined_img, "boogu.double_stream_layers." + std::to_string(i), "img"); + sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.double_stream_layers." + std::to_string(i), "txt"); + } + + auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, combined_img, 1); + for (int i = 0; i < config.num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["single_stream_layers." + std::to_string(i)]); + hidden_states = block->forward(ctx, hidden_states, pe, temb); + sd::ggml_graph_cut::mark_graph_cut(hidden_states, "boogu.single_stream_layers." 
+ std::to_string(i), "hidden_states"); + } + + hidden_states = norm_out->forward(ctx, hidden_states, temb); + hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, hidden_states->ne[1] - img_len, hidden_states->ne[1]); + hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, false); + hidden_states = ggml_ext_scale(ctx->ggml_ctx, hidden_states, -1.f); + return hidden_states; + } + }; + + __STATIC_INLINE__ int patched_token_count(int64_t size, int patch_size) { + int pad = (patch_size - (static_cast(size) % patch_size)) % patch_size; + return (static_cast(size) + pad) / patch_size; + } + + __STATIC_INLINE__ void append_spatial_ids(std::vector>& ids, + int bs, + int pe_shift, + int h_tokens, + int w_tokens) { + std::vector> image_ids(h_tokens * w_tokens, std::vector(3, 0.0f)); + for (int h = 0; h < h_tokens; h++) { + for (int w = 0; w < w_tokens; w++) { + image_ids[h * w_tokens + w][0] = static_cast(pe_shift); + image_ids[h * w_tokens + w][1] = static_cast(h); + image_ids[h * w_tokens + w][2] = static_cast(w); + } + } + for (int b = 0; b < bs; b++) { + ids.insert(ids.end(), image_ids.begin(), image_ids.end()); + } + } + + __STATIC_INLINE__ std::vector gen_boogu_pe(int h, + int w, + int patch_size, + int bs, + int context_len, + const std::vector& ref_latents, + int theta, + const std::vector& axes_dim) { + std::vector> ids; + ids.reserve(static_cast(bs) * context_len); + for (int b = 0; b < bs; b++) { + for (int i = 0; i < context_len; i++) { + float pos = static_cast(i); + ids.push_back({pos, pos, pos}); + } + } + + int pe_shift = context_len; + for (ggml_tensor* ref : ref_latents) { + int ref_h_tokens = patched_token_count(ref->ne[1], patch_size); + int ref_w_tokens = patched_token_count(ref->ne[0], patch_size); + append_spatial_ids(ids, bs, pe_shift, ref_h_tokens, ref_w_tokens); + pe_shift += std::max(ref_h_tokens, ref_w_tokens); + } + + int h_tokens = patched_token_count(h, patch_size); + int 
w_tokens = patched_token_count(w, patch_size); + append_spatial_ids(ids, bs, pe_shift, h_tokens, w_tokens); + + return Rope::embed_nd(ids, bs, static_cast(theta), axes_dim); + } + + struct BooguImageRunner : public DiffusionModelRunner { + BooguConfig config; + BooguImageModel boogu; + std::vector pe_vec; + + BooguImageRunner(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + SDVersion version = VERSION_BOOGU_IMAGE, + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), + config(BooguConfig::detect_from_weights(tensor_storage_map, prefix)) { + boogu = BooguImageModel(config); + boogu.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "boogu_image"; + } + + void get_param_tensors(std::map& tensors, const std::string& prefix) override { + boogu.get_param_tensors(tensors, prefix); + } + + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor, + const std::vector>& ref_latents_tensor = {}) { + ggml_cgraph* gf = new_graph_custom(BOOGU_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + GGML_ASSERT(x->ne[3] == 1); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + + std::vector ref_latents; + ref_latents.reserve(ref_latents_tensor.size()); + for (const auto& ref_latent_tensor : ref_latents_tensor) { + ref_latents.push_back(make_input(ref_latent_tensor)); + } + + pe_vec = gen_boogu_pe(static_cast(x->ne[1]), + static_cast(x->ne[0]), + config.patch_size, + static_cast(x->ne[3]), + static_cast(context->ne[1]), + ref_latents, + config.theta, + config.axes_dim); + int pos_len = static_cast(pe_vec.size() / config.axes_dim_sum / 2); + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len); + 
set_backend_tensor_data(pe, pe_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* out = boogu.forward(&runner_ctx, x, timesteps, context, pe, ref_latents); + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context, + const std::vector>& ref_latents = {}) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(x, timesteps, context, ref_latents); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); + } + + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_ref_latents; + return compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents); + } + }; +} // namespace Boogu + +#endif // __SD_MODEL_DIFFUSION_BOOGU_HPP__ diff --git a/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp b/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp index 12fcada597ee..0427b3b384cc 100644 --- a/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp +++ b/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp @@ -162,6 +162,8 @@ namespace ErnieImage { int64_t S = x->ne[1]; int64_t N = x->ne[2]; + float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 
1.0f / 32.0f : 1.0f; + auto q = to_q->forward(ctx, x); auto k = to_k->forward(ctx, x); auto v = to_v->forward(ctx, x); @@ -182,7 +184,7 @@ namespace ErnieImage { k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, heads, S, head_dim] k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled); // [N, S, hidden_size] + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled, scale); // [N, S, hidden_size] x = to_out_0->forward(ctx, x); return x; } diff --git a/otherarch/sdcpp/src/model/te/llm.hpp b/otherarch/sdcpp/src/model/te/llm.hpp index 74dc232e5706..12daf5637f3c 100644 --- a/otherarch/sdcpp/src/model/te/llm.hpp +++ b/otherarch/sdcpp/src/model/te/llm.hpp @@ -79,6 +79,7 @@ namespace LLM { int window_size = 112; int num_position_embeddings = 0; std::set fullatt_block_indexes = {7, 15, 23, 31}; + bool split_patch_embed = false; }; struct LLMConfig { @@ -179,7 +180,8 @@ namespace LLM { config.num_experts_per_tok = 4; } - config.num_layers = 0; + config.num_layers = 0; + int detected_vision_layers = 0; for (const auto& [name, tensor_storage] : tensor_storage_map) { if (!starts_with(name, prefix)) { continue; @@ -190,6 +192,38 @@ namespace LLM { if (contains(name, "attn.q_proj")) { config.llama_cpp_style = true; } + if (contains(name, "visual.patch_embed.proj.1.weight")) { + config.vision.split_patch_embed = true; + } + if (contains(name, "visual.patch_embed.proj.0.weight")) { + config.vision.patch_size = static_cast(tensor_storage.ne[0]); + config.vision.in_channels = tensor_storage.ne[2]; + config.vision.hidden_size = tensor_storage.ne[3]; + } + if (contains(name, "visual.patch_embed.bias")) { + config.vision.hidden_size = tensor_storage.ne[0]; + } + if (contains(name, "visual.pos_embed.weight")) { + config.vision.hidden_size = 
tensor_storage.ne[0]; + config.vision.num_position_embeddings = static_cast(tensor_storage.ne[1]); + } + if (contains(name, "visual.blocks.")) { + auto items = split_string(name.substr(pos), '.'); + if (items.size() > 2) { + int block_index = atoi(items[2].c_str()); + if (block_index + 1 > detected_vision_layers) { + detected_vision_layers = block_index + 1; + } + } + } + if (contains(name, "visual.blocks.0.mlp.linear_fc1.weight") || + contains(name, "visual.blocks.0.mlp.gate_proj.weight")) { + config.vision.intermediate_size = tensor_storage.ne[1]; + } + if (contains(name, "visual.merger.linear_fc2.weight") || + contains(name, "visual.merger.mlp.2.weight")) { + config.vision.out_hidden_size = tensor_storage.ne[1]; + } continue; } pos = name.find("layers."); @@ -219,6 +253,9 @@ namespace LLM { if (arch == LLMArch::QWEN3 && config.num_layers == 28) { config.num_heads = 16; } + if (detected_vision_layers > 0) { + config.vision.num_layers = detected_vision_layers; + } LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64, config.num_layers, config.vocab_size, @@ -539,40 +576,51 @@ namespace LLM { struct VisionPatchEmbed : public GGMLBlock { protected: - bool llama_cpp_style; + bool split_patch_embed; + bool bias; int patch_size; int temporal_patch_size; int64_t in_channels; int64_t embed_dim; + void init_params(ggml_context* ctx, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "") override { + GGML_UNUSED(tensor_storage_map); + GGML_UNUSED(prefix); + if (split_patch_embed && bias) { + params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim); + } + } + public: - VisionPatchEmbed(bool llama_cpp_style, + VisionPatchEmbed(bool split_patch_embed, LLMVisionArch arch, int patch_size = 14, int temporal_patch_size = 2, int64_t in_channels = 3, int64_t embed_dim = 1152) - : llama_cpp_style(llama_cpp_style), + : split_patch_embed(split_patch_embed), + bias(arch == 
LLMVisionArch::QWEN3_VL), patch_size(patch_size), temporal_patch_size(temporal_patch_size), in_channels(in_channels), embed_dim(embed_dim) { - bool bias = arch == LLMVisionArch::QWEN3_VL; - if (llama_cpp_style) { + if (split_patch_embed) { blocks["proj.0"] = std::shared_ptr(new Conv2d(in_channels, embed_dim, {patch_size, patch_size}, {patch_size, patch_size}, {0, 0}, {1, 1}, - bias)); + false)); blocks["proj.1"] = std::shared_ptr(new Conv2d(in_channels, embed_dim, {patch_size, patch_size}, {patch_size, patch_size}, {0, 0}, {1, 1}, - bias)); + false)); } else { std::tuple kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size}; blocks["proj"] = std::shared_ptr(new Conv3d(in_channels, @@ -593,7 +641,7 @@ namespace LLM { temporal_patch_size, ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size)); - if (llama_cpp_style) { + if (split_patch_embed) { auto proj_0 = std::dynamic_pointer_cast(blocks["proj.0"]); auto proj_1 = std::dynamic_pointer_cast(blocks["proj.1"]); @@ -606,6 +654,10 @@ namespace LLM { x1 = proj_1->forward(ctx, x1); x = ggml_add(ctx->ggml_ctx, x0, x1); + if (bias) { + auto b = ggml_reshape_4d(ctx->ggml_ctx, params["bias"], 1, 1, embed_dim, 1); + x = ggml_add_inplace(ctx->ggml_ctx, x, b); + } } else { auto proj = std::dynamic_pointer_cast(blocks["proj"]); @@ -798,7 +850,7 @@ namespace LLM { spatial_merge_size(vision_params.spatial_merge_size), num_grid_per_side(vision_params.num_position_embeddings > 0 ? 
static_cast(std::sqrt(vision_params.num_position_embeddings)) : 0), fullatt_block_indexes(vision_params.fullatt_block_indexes) { - blocks["patch_embed"] = std::shared_ptr(new VisionPatchEmbed(llama_cpp_style, + blocks["patch_embed"] = std::shared_ptr(new VisionPatchEmbed(vision_params.split_patch_embed, arch_, vision_params.patch_size, vision_params.temporal_patch_size, diff --git a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp index 478b18edb69f..51e1feda72bf 100644 --- a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp +++ b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp @@ -682,7 +682,7 @@ struct AutoEncoderKL : public VAE { } else if (sd_version_is_sd3(version)) { scale_factor = 1.5305f; shift_factor = 0.0609f; - } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) { scale_factor = 0.3611f; shift_factor = 0.1159f; } else if (sd_version_uses_flux2_vae(version)) { diff --git a/otherarch/sdcpp/src/model_loader.cpp b/otherarch/sdcpp/src/model_loader.cpp index a1788bfbdd25..5c2d57cdec5b 100644 --- a/otherarch/sdcpp/src/model_loader.cpp +++ b/otherarch/sdcpp/src/model_loader.cpp @@ -513,6 +513,9 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) { return VERSION_Z_IMAGE; } + if (tensor_storage.name.find("double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight") != std::string::npos) { + return VERSION_BOOGU_IMAGE; + } if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) { return VERSION_ERNIE_IMAGE; } diff --git a/otherarch/sdcpp/src/name_conversion.cpp b/otherarch/sdcpp/src/name_conversion.cpp index 4b7b4008df2b..da2a8d5eda0f 100644 --- 
a/otherarch/sdcpp/src/name_conversion.cpp +++ b/otherarch/sdcpp/src/name_conversion.cpp @@ -184,6 +184,27 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix) return name; } +std::string convert_qwen3_vl_vision_name(std::string name) { + static const std::vector> qwen3_vl_vision_name_map{ + {"mm.0.", "merger.linear_fc1."}, + {"mm.2.", "merger.linear_fc2."}, + {"v.post_ln.", "merger.norm."}, + {"v.position_embd.weight", "pos_embed.weight"}, + {"v.patch_embd.weight.1", "patch_embed.proj.1.weight"}, + {"v.patch_embd.weight", "patch_embed.proj.0.weight"}, + {"v.patch_embd.bias", "patch_embed.bias"}, + {"v.blk.", "blocks."}, + {"attn_qkv.", "attn.qkv."}, + {"attn_out.", "attn.proj."}, + {"ffn_up.", "mlp.linear_fc1."}, + {"ffn_down.", "mlp.linear_fc2."}, + {"ln1.", "norm1."}, + {"ln2.", "norm2."}, + }; + replace_with_name_map(name, qwen3_vl_vision_name_map); + return name; +} + // ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py std::string convert_diffusers_unet_to_original_sd1(std::string name) { // (stable-diffusion, HF Diffusers) @@ -1154,6 +1175,10 @@ std::string convert_tensor_name(std::string name, SDVersion version) { replace_with_prefix_map(name, prefix_map); + if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) { + name = convert_qwen3_vl_vision_name(std::move(name)); + } + // diffusion model { for (const auto& prefix : diffuison_model_prefix_vec) { diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp index 99e0709097d6..cd4705696cf1 100644 --- a/otherarch/sdcpp/src/stable-diffusion.cpp +++ b/otherarch/sdcpp/src/stable-diffusion.cpp @@ -22,6 +22,7 @@ #include "extensions/generation_extension.h" #include "model/adapter/lora.hpp" #include "model/diffusion/anima.hpp" +#include "model/diffusion/boogu.hpp" #include "model/diffusion/control.hpp" #include "model/diffusion/ernie_image.hpp" 
#include "model/diffusion/flux.hpp" @@ -89,6 +90,7 @@ const char* model_version_to_str[] = { "LTXAV", "HiDream O1", "Z-Image", + "Boogu Image", "Ovis Image", "Ernie Image", "Lens", @@ -126,7 +128,8 @@ static bool sd_version_supports_ref_latent_img_cfg(SDVersion version) { sd_version_is_flux2(version) || sd_version_is_qwen_image(version) || sd_version_is_longcat(version) || - sd_version_is_z_image(version); + sd_version_is_z_image(version) || + sd_version_is_boogu_image(version); } static bool sd_version_supports_img_cfg(SDVersion version, bool has_ref_images) { @@ -762,9 +765,7 @@ class StableDiffusionGGML { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); LOG_INFO("Version: %s ", model_version_to_str[version]); - ggml_type wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) - ? (ggml_type)sd_ctx_params->wtype - : GGML_TYPE_COUNT; + ggml_type wtype = sd_type_to_ggml_type(sd_ctx_params->wtype); std::string tensor_type_rules = SAFE_STR(sd_ctx_params->tensor_type_rules); //kcpp: patch hidream to fix broken images on vulkan https://github.com/leejet/stable-diffusion.cpp/issues/1496 if(version == VERSION_HIDREAM_O1 && tensor_type_rules.size()==0) @@ -1031,6 +1032,18 @@ class StableDiffusionGGML { "model.diffusion_model", version, model_manager); + } else if (sd_version_is_boogu_image(version)) { + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + tensor_storage_map, + version, + "", + true, + model_manager); + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + tensor_storage_map, + "model.diffusion_model", + version, + model_manager); } else if (sd_version_is_ernie_image(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, @@ -1467,6 +1480,7 @@ class StableDiffusionGGML { sd_version_is_anima(version) || sd_version_is_ernie_image(version) || sd_version_is_z_image(version) || + sd_version_is_boogu_image(version) || 
sd_version_is_pid(version) || sd_version_is_ideogram4(version)) { pred_type = FLOW_PRED; @@ -1478,6 +1492,8 @@ class StableDiffusionGGML { default_flow_shift = 1.5f; } else if (sd_version_is_ideogram4(version)) { default_flow_shift = 1.0f; + } else if (sd_version_is_boogu_image(version)) { + default_flow_shift = 3.16f; } else { default_flow_shift = 3.f; } @@ -1957,7 +1973,7 @@ class StableDiffusionGGML { if (sd_version_is_sd3(version)) { latent_rgb_proj = sd3_latent_rgb_proj; latent_rgb_bias = sd3_latent_rgb_bias; - } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) { latent_rgb_proj = flux_latent_rgb_proj; latent_rgb_bias = flux_latent_rgb_bias; } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { @@ -2052,6 +2068,9 @@ class StableDiffusionGGML { if (sd_version_is_anima(version)) { return std::vector{t / static_cast(TIMESTEPS)}; } + if (sd_version_is_boogu_image(version)) { + return std::vector{t / static_cast(TIMESTEPS)}; + } if (version == VERSION_HIDREAM_O1) { return std::vector{1.0f - (t / static_cast(TIMESTEPS))}; } diff --git a/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp b/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp index 896975a217cf..7733f00d36b7 100644 --- a/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp +++ b/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp @@ -134,7 +134,8 @@ std::vector BPETokenizer::encode(const std::string& text, on_new_token_cb_t std::vector bpe_tokens; std::vector token_strs; - auto splited_texts = split_with_special_tokens(text, special_tokens); + std::string normalized_text = normalize_before_split ? 
normalize(text) : text; + auto splited_texts = split_with_special_tokens(normalized_text, special_tokens); for (auto& splited_text : splited_texts) { if (is_special_token(splited_text)) { @@ -159,7 +160,7 @@ std::vector BPETokenizer::encode(const std::string& text, on_new_token_cb_t } } - std::string token_str = normalize(token); + std::string token_str = normalize_before_split ? token : normalize(token); std::u32string utf32_token; if (byte_level_bpe) { for (int i = 0; i < token_str.length(); i++) { diff --git a/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp b/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp index d11c18c31287..d51eadec4022 100644 --- a/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp +++ b/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp @@ -22,9 +22,10 @@ CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_st EOS_TOKEN_ID = 49407; PAD_TOKEN_ID = pad_token_id; - end_of_word_suffix = ""; - add_bos_token = true; - add_eos_token = true; + end_of_word_suffix = ""; + add_bos_token = true; + add_eos_token = true; + normalize_before_split = true; if (merges_utf8_str.size() > 0) { load_from_merges(merges_utf8_str); diff --git a/otherarch/sdcpp/src/tokenizers/tokenizer.h b/otherarch/sdcpp/src/tokenizers/tokenizer.h index e044285bbd76..893759e0fba5 100644 --- a/otherarch/sdcpp/src/tokenizers/tokenizer.h +++ b/otherarch/sdcpp/src/tokenizers/tokenizer.h @@ -12,9 +12,10 @@ using on_new_token_cb_t = std::function& class Tokenizer { protected: std::vector special_tokens; - bool add_bos_token = false; - bool add_eos_token = false; - bool pad_left = false; + bool add_bos_token = false; + bool add_eos_token = false; + bool pad_left = false; + bool normalize_before_split = false; std::string end_of_word_suffix; virtual std::string decode_token(int token_id) const = 0; From eb7340014a61098259a20b15ae610abb23db32fc Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 21 Jun 2026 18:39:16 -0300 Subject: [PATCH 2/6] sd: support for 
boogu and longcat edit --- otherarch/sdcpp/src/stable-diffusion.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp index cd4705696cf1..079f8fb63155 100644 --- a/otherarch/sdcpp/src/stable-diffusion.cpp +++ b/otherarch/sdcpp/src/stable-diffusion.cpp @@ -491,7 +491,9 @@ class StableDiffusionGGML { bool is_lens = sd_version_is_lens(tempver); bool is_ltx = sd_version_is_ltxav(tempver); bool is_ideogram = sd_version_is_ideogram4(tempver); - bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram); + bool is_boogu = sd_version_is_boogu_image(tempver); + bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram || is_boogu); + bool has_llm_vision = (is_qwenimg || is_longcat || is_boogu); //kcpp qol fallback: if a llm was loaded as t5 by mistake if(conditioner_is_llm && t5_path_fixed!="") @@ -542,7 +544,7 @@ class StableDiffusionGGML { clip_vision_fixed = clipg_path_fixed; clipg_path_fixed = ""; } - else if(is_qwenimg && llm_vision_path_fixed=="") + else if(has_llm_vision && llm_vision_path_fixed=="") { llm_vision_path_fixed = clipg_path_fixed; clipg_path_fixed = ""; @@ -584,7 +586,7 @@ class StableDiffusionGGML { { to_replace = "taesd_xl.embd"; } - else if(sd_version_is_flux(tempver)||sd_version_is_z_image(tempver)||tempver == VERSION_OVIS_IMAGE||is_longcat) + else if(sd_version_is_flux(tempver)||sd_version_is_z_image(tempver)||tempver == VERSION_OVIS_IMAGE||is_longcat||is_boogu) { to_replace = "taesd_f.embd"; } From bb1bed614de9f6e00fa48233afde6d6ea1a9a565 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 21 Jun 2026 18:41:18 -0300 Subject: [PATCH 3/6] sd: remove SD_TYPE_COUNT == GGML_TYPE_COUNT assertion The current code should be able to deal with an out-of-sync ggml. 
--- otherarch/sdcpp/src/stable-diffusion.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp index 079f8fb63155..9b788c48261e 100644 --- a/otherarch/sdcpp/src/stable-diffusion.cpp +++ b/otherarch/sdcpp/src/stable-diffusion.cpp @@ -6078,9 +6078,6 @@ SD_API void free_sd_images(sd_image_t* result_images, int num_images) { namespace kcpp_sd { - static_assert((int)SD_TYPE_COUNT == (int)GGML_TYPE_COUNT, - "inconsistency between SD_TYPE_COUNT and GGML_TYPE_COUNT"); - int get_loaded_sd_version(sd_ctx_t* ctx) { return ctx->sd->version; } From 80742da33f7da9be0914696d7e51aec8d61009e7 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 21 Jun 2026 19:58:34 -0300 Subject: [PATCH 4/6] sd: generalize edit mode support --- otherarch/sdcpp/sdtype_adapter.cpp | 4 ++-- otherarch/sdcpp/src/kcpp_sd_extensions.h | 2 ++ otherarch/sdcpp/src/stable-diffusion.cpp | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 4880c3ba8f08..2a5c61f40173 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -972,7 +972,7 @@ static sd_audio_t load_audio_from_b64(const std::string& b64audio) { bool supports_reference_images(kcpp_sd::model_info info) { - bool supported = (info.is_wan || info.is_ltx || info.is_qwenimg || info.is_flux2 || info.is_kontext || photomaker_enabled); + bool supported = (info.is_wan || info.is_ltx || info.supports_ref_image || info.is_kontext || photomaker_enabled); return supported; } @@ -1161,7 +1161,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) wan_imgs.push_back(extraimage_reference); } } - else if(info.is_qwenimg || info.is_flux2) + else if(info.supports_ref_image) { uint8_t * loaded = load_image_from_b64(extra_image_data[i],nx2,ny2); if(loaded) diff --git a/otherarch/sdcpp/src/kcpp_sd_extensions.h 
b/otherarch/sdcpp/src/kcpp_sd_extensions.h index c65c11c35b29..0e0c6795ad84 100644 --- a/otherarch/sdcpp/src/kcpp_sd_extensions.h +++ b/otherarch/sdcpp/src/kcpp_sd_extensions.h @@ -19,6 +19,8 @@ namespace kcpp_sd { bool is_wan; bool is_zimage; bool is_ltx; + bool is_boogu; + bool supports_ref_image; int vae_scale_factor; int spatial_multiple; }; diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp index 9b788c48261e..f50bdbb1e1c6 100644 --- a/otherarch/sdcpp/src/stable-diffusion.cpp +++ b/otherarch/sdcpp/src/stable-diffusion.cpp @@ -6112,6 +6112,8 @@ namespace kcpp_sd { res.is_sd2 = (loadedsdver == SDVersion::VERSION_SD2); res.is_sdxl = sd_version_is_sdxl((SDVersion)loadedsdver); res.is_ltx = sd_version_is_ltxav((SDVersion)loadedsdver); + res.is_boogu = sd_version_is_boogu_image((SDVersion)loadedsdver); + res.supports_ref_image = sd_version_supports_ref_latent_img_cfg((SDVersion)loadedsdver); res.vae_scale_factor = ctx->sd->get_vae_scale_factor(); res.spatial_multiple = get_spatial_multiple(ctx); return res; From 4786837748e40700eae53c3c2bc565ebdd4d0570 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Mon, 22 Jun 2026 18:52:18 -0300 Subject: [PATCH 5/6] sd: sync with master-719-f440ad9 --- otherarch/sdcpp/examples/common/common.cpp | 59 +++++++++- otherarch/sdcpp/examples/common/common.h | 1 + otherarch/sdcpp/include/stable-diffusion.h | 1 + otherarch/sdcpp/src/model.h | 7 ++ .../sdcpp/src/model/vae/auto_encoder_kl.hpp | 2 +- otherarch/sdcpp/src/model_manager.cpp | 13 ++- otherarch/sdcpp/src/model_manager.h | 3 + otherarch/sdcpp/src/runtime/guidance.cpp | 102 ++++++++++++++++-- otherarch/sdcpp/src/runtime/guidance.h | 16 ++- otherarch/sdcpp/src/stable-diffusion.cpp | 51 ++++++++- 10 files changed, 229 insertions(+), 26 deletions(-) diff --git a/otherarch/sdcpp/examples/common/common.cpp b/otherarch/sdcpp/examples/common/common.cpp index ad3f97a08080..3f357512e66c 100644 --- a/otherarch/sdcpp/examples/common/common.cpp 
+++ b/otherarch/sdcpp/examples/common/common.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -260,15 +261,15 @@ bool parse_options(int argc, const char** argv, const std::vector& o invalid_arg = true; return; } - if(option.concat && !option.target->empty()){ - if(option.concat > 0 && option.concat <= 0xff){ + if (option.concat && !option.target->empty()) { + if (option.concat > 0 && option.concat <= 0xff) { *option.target += static_cast(option.concat); } *option.target += argv_to_utf8(i, argv); } else { *option.target = argv_to_utf8(i, argv); } - found_arg = true; + found_arg = true; })) break; @@ -496,6 +497,10 @@ ArgOptions SDContextParams::get_options() { "--stream-layers", "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)", true, &stream_layers}, + {"", + "--eager-load", + "load all params into the params backend at model-load time instead of lazily on first use (defaults to false)", + true, &eager_load}, {"", "--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae", @@ -799,6 +804,7 @@ std::string SDContextParams::to_string() const { << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: \"" << max_vram << "\",\n" << " stream_layers: " << (stream_layers ? "true" : "false") << ",\n" + << " eager_load: " << (eager_load ? "true" : "false") << ",\n" << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? 
"true" : "false") << ",\n" @@ -878,6 +884,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) { sd_ctx_params.vae_format = str_to_vae_format(vae_format); sd_ctx_params.max_vram = max_vram.c_str(); sd_ctx_params.stream_layers = stream_layers; + sd_ctx_params.eager_load = eager_load; sd_ctx_params.backend = effective_backend.c_str(); sd_ctx_params.params_backend = effective_params_backend.c_str(); sd_ctx_params.rpc_servers = rpc_servers.c_str(); @@ -953,7 +960,7 @@ ArgOptions SDGenerationParams::get_options() { &hires_upscaler}, {"", "--extra-sample-args", - "extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma", + "extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma;", (int)',', &extra_sample_args}, {"", @@ -1415,6 +1422,42 @@ ArgOptions SDGenerationParams::get_options() { return 1; }; + auto on_prompt_file_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + std::ifstream f(arg, std::ios::binary); + try { + prompt = std::string(std::istreambuf_iterator{f}, {}); + } catch (const std::ios_base::failure&) { + f.setstate(std::ios_base::failbit); + } + if (f.fail()) { + LOG_ERROR("error: failed to read prompt file '%s'\n", arg); + return -1; + } + return 1; + }; + + auto on_negative_prompt_file_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + 
std::ifstream f(arg, std::ios::binary); + try { + negative_prompt = std::string(std::istreambuf_iterator{f}, {}); + } catch (const std::ios_base::failure&) { + f.setstate(std::ios_base::failbit); + } + if (f.fail()) { + LOG_ERROR("error: failed to read negative prompt file '%s'\n", arg); + return -1; + } + return 1; + }; + options.manual_options = { {"-s", "--seed", @@ -1478,6 +1521,14 @@ ArgOptions SDGenerationParams::get_options() { "--vae-relative-tile-size", "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", on_relative_tile_size_arg}, + {"", + "--prompt-file", + "path to the file containing the prompt to render", + on_prompt_file_arg}, + {"", + "--negative-prompt-file", + "path to the file containing the negative prompt", + on_negative_prompt_file_arg}, }; diff --git a/otherarch/sdcpp/examples/common/common.h b/otherarch/sdcpp/examples/common/common.h index 587cad29f699..e7c25015bfa1 100644 --- a/otherarch/sdcpp/examples/common/common.h +++ b/otherarch/sdcpp/examples/common/common.h @@ -148,6 +148,7 @@ struct SDContextParams { bool offload_params_to_cpu = false; std::string max_vram = "0"; bool stream_layers = false; + bool eager_load = false; std::string backend; std::string params_backend; std::string rpc_servers; diff --git a/otherarch/sdcpp/include/stable-diffusion.h b/otherarch/sdcpp/include/stable-diffusion.h index bfcd909ccaf2..8772865daadb 100644 --- a/otherarch/sdcpp/include/stable-diffusion.h +++ b/otherarch/sdcpp/include/stable-diffusion.h @@ -219,6 +219,7 @@ typedef struct { enum sd_vae_format_t vae_format; const char* max_vram; // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto) bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) + bool eager_load; // Load all params into the params backend at model-load time instead of lazily on first use 
const char* backend; const char* params_backend; const char* rpc_servers; diff --git a/otherarch/sdcpp/src/model.h b/otherarch/sdcpp/src/model.h index 17272f7d69d9..d02ed65b8f18 100644 --- a/otherarch/sdcpp/src/model.h +++ b/otherarch/sdcpp/src/model.h @@ -186,6 +186,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) { return false; } +static inline bool sd_version_uses_flux_vae(SDVersion version) { + if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) { + return true; + } + return false; +} + static inline bool sd_version_uses_flux2_vae(SDVersion version) { if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) { return true; diff --git a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp index 51e1feda72bf..e41f5fd46a44 100644 --- a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp +++ b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp @@ -682,7 +682,7 @@ struct AutoEncoderKL : public VAE { } else if (sd_version_is_sd3(version)) { scale_factor = 1.5305f; shift_factor = 0.0609f; - } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) { + } else if (sd_version_uses_flux_vae(version)) { scale_factor = 0.3611f; shift_factor = 0.1159f; } else if (sd_version_uses_flux2_vae(version)) { diff --git a/otherarch/sdcpp/src/model_manager.cpp b/otherarch/sdcpp/src/model_manager.cpp index 5287e1069af2..7095ec6a96f8 100644 --- a/otherarch/sdcpp/src/model_manager.cpp +++ b/otherarch/sdcpp/src/model_manager.cpp @@ -147,6 +147,17 @@ bool ModelManager::register_param_tensors(const std::string& desc, return true; } +bool ModelManager::load_all_params_eagerly() { + std::vector all_states; + all_states.reserve(tensor_states_.size()); + for (const auto& s : 
tensor_states_) { + if (s != nullptr) { + all_states.push_back(s.get()); + } + } + return load_tensors_to_params_backend(all_states); +} + bool ModelManager::validate_registered_tensors() { bool ok = true; for (const auto& state : tensor_states_) { @@ -469,7 +480,7 @@ bool ModelManager::mmap_params(const std::vector& states, return true; } - auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, true); + auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, writable_mmap_); if (mmap_store.empty()) { return true; } diff --git a/otherarch/sdcpp/src/model_manager.h b/otherarch/sdcpp/src/model_manager.h index 1a414c15cd37..9225e3ea6935 100644 --- a/otherarch/sdcpp/src/model_manager.h +++ b/otherarch/sdcpp/src/model_manager.h @@ -69,6 +69,7 @@ class ModelManager : public RunnerWeightManager { uint64_t current_lora_epoch_ = 0; int n_threads_ = 0; bool enable_mmap_ = false; + bool writable_mmap_ = false; void finish_compute_backend_usage(const std::vector& states); void release_all(); @@ -110,6 +111,7 @@ class ModelManager : public RunnerWeightManager { model_loader_.set_n_threads(n_threads); } void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; } + void set_writable_mmap(bool writable_mmap) { writable_mmap_ = writable_mmap; } void set_common_ignore_tensors(std::set ignore_tensors); void set_loras(std::vector loras, SDVersion version); @@ -158,6 +160,7 @@ class ModelManager : public RunnerWeightManager { } bool validate_registered_tensors(); + bool load_all_params_eagerly(); bool prepare_params(const std::vector& tensors) override; void release_compute_backend_params(const std::vector& tensors) override; diff --git a/otherarch/sdcpp/src/runtime/guidance.cpp b/otherarch/sdcpp/src/runtime/guidance.cpp index f925b4b8c9f8..bfb773b0a14c 100644 --- a/otherarch/sdcpp/src/runtime/guidance.cpp +++ b/otherarch/sdcpp/src/runtime/guidance.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,82 @@ namespace 
sd::guidance { return uncond; } + std::vector parse_guidance_schedule_from_spec(std::string spec) { + std::vector schedule; + + while (!spec.empty()) { + auto sep = spec.find('+'); + auto segment = spec.substr(0, sep); + + auto x = segment.find('x'); + if (x == std::string::npos) { + LOG_ERROR("Invalid guidance schedule segment: '%s' (expected x)", segment.c_str()); + return {}; + } + + float guidance; + int count; + + auto guidance_str = segment.substr(0, x); + auto count_str = segment.substr(x + 1); + + try { + size_t idx = 0; + guidance = std::stof(guidance_str, &idx); + if (idx != guidance_str.size()) { + LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str()); + return {}; + } + } catch (const std::exception&) { + LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str()); + return {}; + } + + try { + size_t idx = 0; + count = std::stoi(count_str, &idx); + if (idx != count_str.size()) { + LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str()); + return {}; + } + } catch (const std::exception&) { + LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str()); + return {}; + } + + if (count <= 0) { + LOG_ERROR("Guidance schedule count must be positive"); + return {}; + } + + schedule.insert(schedule.end(), count, guidance); + + if (sep == std::string::npos) { + break; + } + + spec = spec.substr(sep + 1); + } + + return schedule; + } + + std::vector parse_guidance_schedule(const char* extra_sample_args) { + std::vector guidance_schedule; + std::string guidance_schedule_str = ""; + for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "extra sample arg")) { + float parsed = 0.0f; + if (key == "guidance_schedule") { + guidance_schedule_str = value; + } + } + + if (!guidance_schedule_str.empty()) { + guidance_schedule = parse_guidance_schedule_from_spec(guidance_schedule_str); + } + return guidance_schedule; + } + 
ClassifierFreeGuidance::ClassifierFreeGuidance(float guidance_scale, float image_guidance_scale) : guidance_scale_(guidance_scale), @@ -70,8 +147,10 @@ namespace sd::guidance { } GuiderOutput ClassifierFreeGuidance::forward(const GuidanceInput& input, - GuiderOutput previous) const { + GuiderOutput previous, + std::optional scale_override) const { (void)previous; + float guidance_scale = scale_override.value_or(guidance_scale_); GuiderOutput output; if (!has_tensor(input.pred_cond)) { @@ -86,14 +165,14 @@ namespace sd::guidance { const sd::Tensor& pred_img_uncond = *input.pred_img_uncond; output.pred = pred_img_uncond + image_guidance_scale_ * (pred_uncond - pred_img_uncond) + - guidance_scale_ * (pred_cond - pred_uncond); + guidance_scale * (pred_cond - pred_uncond); } else { - output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond); + output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond); } } else if (has_tensor(input.pred_img_uncond)) { const sd::Tensor& pred_img_uncond = *input.pred_img_uncond; - output.pred = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond); + output.pred = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond); } return output; @@ -128,8 +207,10 @@ namespace sd::guidance { } GuiderOutput AdaptiveProjectedGuidance::forward(const GuidanceInput& input, - GuiderOutput previous) const { + GuiderOutput previous, + std::optional scale_override) const { (void)previous; + float guidance_scale = scale_override.value_or(guidance_scale_); GuiderOutput output; if (!has_tensor(input.pred_cond)) { @@ -144,13 +225,13 @@ namespace sd::guidance { const sd::Tensor& pred_img_uncond = *input.pred_img_uncond; output.pred = pred_img_uncond + image_guidance_scale_ * (pred_uncond - pred_img_uncond) + - guidance_scale_ * (pred_cond - pred_uncond); + guidance_scale * (pred_cond - pred_uncond); } else { - output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond); + output.pred = pred_uncond + 
guidance_scale * (pred_cond - pred_uncond); } } else if (has_tensor(input.pred_img_uncond)) { const sd::Tensor& pred_img_uncond = *input.pred_img_uncond; - output.pred = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond); + output.pred = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond); } if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_uncond)) { return output; @@ -162,7 +243,7 @@ namespace sd::guidance { sd::Tensor deltas = calculate_guidance_delta(pred_cond, pred_uncond, pred_img_uncond, - guidance_scale_, + guidance_scale, image_guidance_scale_); if (params_.momentum != 0.0f) { if (momentum_buffer_.shape() != deltas.shape()) { @@ -239,7 +320,8 @@ namespace sd::guidance { } GuiderOutput SkipLayerGuidance::forward(const GuidanceInput& input, - GuiderOutput output) const { + GuiderOutput output, + std::optional /*scale_override*/) const { if (scale_ == 0.0f || !is_enabled_for_step(input) || !input.predict_skip_layer) { return output; } diff --git a/otherarch/sdcpp/src/runtime/guidance.h b/otherarch/sdcpp/src/runtime/guidance.h index aeba06fd0739..3de337042d33 100644 --- a/otherarch/sdcpp/src/runtime/guidance.h +++ b/otherarch/sdcpp/src/runtime/guidance.h @@ -3,6 +3,7 @@ #include #include +#include #include #include "core/tensor.hpp" @@ -27,6 +28,7 @@ namespace sd::guidance { AdaptiveProjectedGuidanceParams parse_adaptive_projected_guidance_args(const char* extra_sample_args); bool is_adaptive_projected_guidance_enabled(const AdaptiveProjectedGuidanceParams& params); bool parse_skip_layer_guidance_uncond_arg(const char* extra_sample_args); + std::vector parse_guidance_schedule(const char* extra_sample_args); struct GuidanceInput { int step = 0; @@ -40,9 +42,10 @@ namespace sd::guidance { class BaseGuidance { public: - virtual ~BaseGuidance() = default; + virtual ~BaseGuidance() = default; virtual GuiderOutput forward(const GuidanceInput& input, - GuiderOutput previous) const = 0; + GuiderOutput previous, + 
std::optional scale_override = std::nullopt) const = 0; }; class ClassifierFreeGuidance : public BaseGuidance { @@ -54,7 +57,8 @@ namespace sd::guidance { float image_guidance_scale); GuiderOutput forward(const GuidanceInput& input, - GuiderOutput previous) const override; + GuiderOutput previous, + std::optional scale_override = std::nullopt) const override; }; class AdaptiveProjectedGuidance : public BaseGuidance { @@ -69,7 +73,8 @@ namespace sd::guidance { AdaptiveProjectedGuidanceParams params); GuiderOutput forward(const GuidanceInput& input, - GuiderOutput previous) const override; + GuiderOutput previous, + std::optional scale_override = std::nullopt) const override; }; class SkipLayerGuidance : public BaseGuidance { @@ -88,7 +93,8 @@ namespace sd::guidance { const std::vector& layers() const; GuiderOutput forward(const GuidanceInput& input, - GuiderOutput previous) const override; + GuiderOutput previous, + std::optional scale_override = std::nullopt) const override; }; } // namespace sd::guidance diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp index f50bdbb1e1c6..2c0f53a37596 100644 --- a/otherarch/sdcpp/src/stable-diffusion.cpp +++ b/otherarch/sdcpp/src/stable-diffusion.cpp @@ -203,6 +203,7 @@ class StableDiffusionGGML { bool enable_mmap = false; sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment; bool stream_layers = false; + bool eager_load = false; std::string backend_spec; std::string params_backend_spec; @@ -349,6 +350,7 @@ class StableDiffusionGGML { n_threads = sd_ctx_params->n_threads; enable_mmap = sd_ctx_params->enable_mmap; stream_layers = sd_ctx_params->stream_layers; + eager_load = sd_ctx_params->eager_load; backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); max_vram_assignment.reset(0.f); @@ -586,7 +588,7 @@ class StableDiffusionGGML { { to_replace = "taesd_xl.embd"; } - else 
if(sd_version_is_flux(tempver)||sd_version_is_z_image(tempver)||tempver == VERSION_OVIS_IMAGE||is_longcat||is_boogu) + else if(sd_version_uses_flux_vae(tempver)) { to_replace = "taesd_f.embd"; } @@ -777,7 +779,6 @@ class StableDiffusionGGML { if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) { model_loader.set_wtype_override(wtype, tensor_type_rules); } - model_loader.process_model_files(enable_mmap, true); std::map wtype_stat = model_loader.get_wtype_stat(); std::map conditioner_wtype_stat = model_loader.get_conditioner_wtype_stat(); @@ -831,9 +832,12 @@ class StableDiffusionGGML { apply_lora_immediately = false; } + bool needs_writable_mmap = enable_mmap && apply_lora_immediately; + model_manager->set_writable_mmap(needs_writable_mmap); if (enable_mmap && apply_lora_immediately) { LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap"); } + model_loader.process_model_files(enable_mmap, needs_writable_mmap); load_alphas_cumprod(model_loader); size_t text_encoder_params_mem_size = 0; @@ -1400,7 +1404,15 @@ class StableDiffusionGGML { return false; } - LOG_DEBUG("model metadata validated; weights will be prepared lazily"); + if (eager_load) { + if (!model_manager->load_all_params_eagerly()) { + LOG_ERROR("model params eager load failed"); + return false; + } + LOG_DEBUG("model metadata validated; weights pre-loaded to params backend"); + } else { + LOG_DEBUG("model metadata validated; weights will be prepared lazily"); + } { size_t total_params_ram_size = 0; @@ -1975,7 +1987,7 @@ class StableDiffusionGGML { if (sd_version_is_sd3(version)) { latent_rgb_proj = sd3_latent_rgb_proj; latent_rgb_bias = sd3_latent_rgb_bias; - } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) { + } else if (sd_version_uses_flux_vae(version)) { latent_rgb_proj = flux_latent_rgb_proj; latent_rgb_bias = flux_latent_rgb_bias; } else if (sd_version_is_wan(version) 
|| sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { @@ -2198,6 +2210,32 @@ class StableDiffusionGGML { float slg_scale = guidance.slg.scale; bool slg_uncond = sd::guidance::parse_skip_layer_guidance_uncond_arg(extra_sample_args); + std::vector guidance_schedule = sd::guidance::parse_guidance_schedule(extra_sample_args); + if (!guidance_schedule.empty() && guidance_schedule.size() != sigmas.size() - 1) { + if (guidance_schedule.size() > sigmas.size() - 1) { + LOG_WARN("guidance_schedule length (%zu) is greater than number of steps (%zu)", guidance_schedule.size(), sigmas.size() - 1); + LOG_WARN("truncating guidance_schedule to match step count"); + guidance_schedule.resize(sigmas.size() - 1); + } else { + LOG_INFO("padding guidance_schedule with cfg_scale"); + while (guidance_schedule.size() < sigmas.size() - 1) { + guidance_schedule.push_back(cfg_scale); + } + } + } + + if (!guidance_schedule.empty()) { + std::string schedule_str = "["; + for (size_t i = 0; i < guidance_schedule.size(); ++i) { + schedule_str += std::to_string(guidance_schedule[i]); + if (i < guidance_schedule.size() - 1) { + schedule_str += ", "; + } + } + schedule_str += "]"; + LOG_DEBUG("using guidance schedule: %s", schedule_str.c_str()); + } + sd_sample::SampleCacheRuntime cache_runtime = sd_sample::init_sample_cache_runtime(version, cache_params, denoiser.get(), @@ -2438,7 +2476,7 @@ class StableDiffusionGGML { guidance_input.pred_uncond = uncond_out.empty() ? nullptr : &uncond_out; guidance_input.pred_img_uncond = img_uncond_out.empty() ? nullptr : &img_uncond_out; - sd::guidance::GuiderOutput guided = primary_guidance.forward(guidance_input, {}); + sd::guidance::GuiderOutput guided = guidance_schedule.empty() ? 
primary_guidance.forward(guidance_input, {}) : primary_guidance.forward(guidance_input, {}, guidance_schedule[guidance_schedule.size() - 1 - step]); if (guided.pred.empty()) { return {}; } @@ -2979,6 +3017,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->max_vram = nullptr; sd_ctx_params->stream_layers = false; + sd_ctx_params->eager_load = false; sd_ctx_params->enable_mmap = false; sd_ctx_params->diffusion_flash_attn = false; sd_ctx_params->circular_x = false; @@ -3025,6 +3064,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "prediction: %s\n" "max_vram: %s\n" "stream_layers: %s\n" + "eager_load: %s\n" "backend: %s\n" "params_backend: %s\n" "flash_attn: %s\n" @@ -3060,6 +3100,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_prediction_name(sd_ctx_params->prediction), SAFE_STR(sd_ctx_params->max_vram), BOOL_STR(sd_ctx_params->stream_layers), + BOOL_STR(sd_ctx_params->eager_load), SAFE_STR(sd_ctx_params->backend), SAFE_STR(sd_ctx_params->params_backend), BOOL_STR(sd_ctx_params->flash_attn), From 5a45265f7e25c48ecffca1067fec7eef182d4a93 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Wed, 24 Jun 2026 20:06:34 -0300 Subject: [PATCH 6/6] sd: sync with master-721-8caa3f9 --- Makefile | 2 +- otherarch/sdcpp/examples/common/common.cpp | 4 +- otherarch/sdcpp/include/stable-diffusion.h | 1 + .../sdcpp/src/conditioning/conditioner.hpp | 13 +- otherarch/sdcpp/src/core/ggml_extend.hpp | 11 +- otherarch/sdcpp/src/model.h | 11 +- otherarch/sdcpp/src/model/diffusion/krea2.hpp | 683 ++++++++++++++++++ otherarch/sdcpp/src/model_loader.cpp | 4 + otherarch/sdcpp/src/name_conversion.cpp | 36 +- otherarch/sdcpp/src/runtime/denoiser.hpp | 202 ++++++ otherarch/sdcpp/src/stable-diffusion.cpp | 30 +- 11 files changed, 986 insertions(+), 11 deletions(-) create mode 100644 otherarch/sdcpp/src/model/diffusion/krea2.hpp diff --git a/Makefile b/Makefile index 
79a37a71889f..051ea092e414 100644 --- a/Makefile +++ b/Makefile @@ -699,7 +699,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h budget.o: common/reasoning-budget.cpp common/reasoning-budget.h $(CXX) $(CXXFLAGS) -c $< -o $@ -SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp src/model/diffusion/anima.hpp src/model/diffusion/boogu.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp 
src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp src/name_conversion.cpp src/name_conversion.h src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h +SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp 
src/model/diffusion/anima.hpp src/model/diffusion/boogu.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/krea2.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp src/name_conversion.cpp src/name_conversion.h src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp 
src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h SDCPP_MAIN_BASENAMES := examples/cli/image_metadata.cpp examples/cli/image_metadata.h examples/cli/main.cpp examples/cli/msf_gif.h examples/common/common.cpp examples/common/common.h examples/common/log.cpp examples/common/log.h examples/common/media_io.cpp examples/common/media_io.h examples/common/resource_owners.hpp src/tokenizers/vocab/clip_merges.hpp src/tokenizers/vocab/gemma2_merges.hpp src/tokenizers/vocab/gemma2_vocab.hpp src/tokenizers/vocab/gemma_merges.hpp src/tokenizers/vocab/gemma_vocab.hpp src/tokenizers/vocab/gpt_oss_merges.hpp src/tokenizers/vocab/gpt_oss_vocab.hpp src/tokenizers/vocab/mistral_merges.hpp src/tokenizers/vocab/mistral_vocab.hpp src/tokenizers/vocab/qwen_merges.hpp src/tokenizers/vocab/t5.hpp src/tokenizers/vocab/umt5.hpp src/tokenizers/vocab/vocab.cpp src/convert.cpp src/version.cpp diff --git a/otherarch/sdcpp/examples/common/common.cpp b/otherarch/sdcpp/examples/common/common.cpp index 3f357512e66c..744005af909b 100644 --- a/otherarch/sdcpp/examples/common/common.cpp +++ b/otherarch/sdcpp/examples/common/common.cpp @@ -960,7 +960,7 @@ ArgOptions SDGenerationParams::get_options() { &hires_upscaler}, {"", "--extra-sample-args", - "extra sampler/scheduler/guidance args, key=value list. 
CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma;", + "extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma; logit_normal supports mu, std, logsnr_min, logsnr_max, resolution_aware", (int)',', &extra_sample_args}, {"", @@ -1475,7 +1475,7 @@ ArgOptions SDGenerationParams::get_options() { on_high_noise_sample_method_arg}, {"", "--scheduler", - "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default: model-specific", + "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2, logit_normal], default: model-specific", on_scheduler_arg}, {"", "--sigmas", diff --git a/otherarch/sdcpp/include/stable-diffusion.h b/otherarch/sdcpp/include/stable-diffusion.h index 8772865daadb..7058852cc299 100644 --- a/otherarch/sdcpp/include/stable-diffusion.h +++ b/otherarch/sdcpp/include/stable-diffusion.h @@ -70,6 +70,7 @@ enum scheduler_t { LCM_SCHEDULER, BONG_TANGENT_SCHEDULER, LTX2_SCHEDULER, + LOGIT_NORMAL_SCHEDULER, SCHEDULER_COUNT }; diff --git a/otherarch/sdcpp/src/conditioning/conditioner.hpp b/otherarch/sdcpp/src/conditioning/conditioner.hpp index ae1a5b5b387e..e037fe76b081 100644 --- a/otherarch/sdcpp/src/conditioning/conditioner.hpp +++ b/otherarch/sdcpp/src/conditioning/conditioner.hpp @@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner { arch = 
LLM::LLMArch::GPT_OSS_20B; } else if (sd_version_is_pid(version)) { arch = LLM::LLMArch::GEMMA2_2B; - } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) { + } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) { arch = LLM::LLMArch::QWEN3_VL; } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) { arch = LLM::LLMArch::QWEN3; @@ -1837,6 +1837,17 @@ struct LLMEmbedder : public Conditioner { prompt_attn_range.second = static_cast(prompt.size()); prompt += "<|im_end|>\n"; } + } else if (sd_version_is_krea2(version)) { + prompt_template_encode_start_idx = 34; + out_layers = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35}; + + prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n"; + + prompt_attn_range.first = static_cast(prompt.size()); + prompt += conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); + + prompt += "<|im_end|>\n<|im_start|>assistant\n"; } else if (sd_version_is_longcat(version)) { spell_quotes = true; diff --git a/otherarch/sdcpp/src/core/ggml_extend.hpp b/otherarch/sdcpp/src/core/ggml_extend.hpp index f10a84ffd0db..9883103e2f71 100644 --- a/otherarch/sdcpp/src/core/ggml_extend.hpp +++ b/otherarch/sdcpp/src/core/ggml_extend.hpp @@ -1382,7 +1382,16 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_attention_ext(ggml_context* ctx, if (!ggml_backend_supports_op(backend, kqv)) { kqv = nullptr; } else { - kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_q, kqv->nb[1], kqv->nb[2], 0); + kqv = ggml_view_4d(ctx, + kqv, + d_head, + n_head, + L_q, + N, + kqv->nb[1], + kqv->nb[2], + kqv->nb[1] * n_head, + 0); } } } diff --git a/otherarch/sdcpp/src/model.h b/otherarch/sdcpp/src/model.h index d02ed65b8f18..cce309138baf 100644 --- a/otherarch/sdcpp/src/model.h +++ 
b/otherarch/sdcpp/src/model.h @@ -49,6 +49,7 @@ enum SDVersion { VERSION_LONGCAT, VERSION_PID, VERSION_IDEOGRAM4, + VERSION_KREA2, VERSION_ESRGAN, VERSION_COUNT, }; @@ -186,6 +187,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) { return false; } +static inline bool sd_version_is_krea2(SDVersion version) { + if (version == VERSION_KREA2) { + return true; + } + return false; +} + static inline bool sd_version_uses_flux_vae(SDVersion version) { if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) { return true; @@ -226,7 +234,8 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_lens(version) || sd_version_is_longcat(version) || sd_version_is_pid(version) || - sd_version_is_ideogram4(version)) { + sd_version_is_ideogram4(version) || + sd_version_is_krea2(version)) { return true; } return false; diff --git a/otherarch/sdcpp/src/model/diffusion/krea2.hpp b/otherarch/sdcpp/src/model/diffusion/krea2.hpp new file mode 100644 index 000000000000..02e655590769 --- /dev/null +++ b/otherarch/sdcpp/src/model/diffusion/krea2.hpp @@ -0,0 +1,683 @@ +#ifndef __SD_MODEL_DIFFUSION_KREA2_HPP__ +#define __SD_MODEL_DIFFUSION_KREA2_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core/ggml_extend.hpp" +#include "core/ggml_graph_cut.h" +#include "model/common/rope.hpp" +#include "model/diffusion/dit.hpp" +#include "model/diffusion/flux.hpp" +#include "model/diffusion/model.hpp" +#include "model_loader.h" + +namespace Krea2 { + constexpr int KREA2_GRAPH_SIZE = 65536; + + struct Krea2Config { + int patch_size = 2; + int64_t in_channels = 16; + int64_t out_channels = 16; + int64_t features = 6144; + int64_t timestep_dim = 256; + int64_t text_dim = 2560; + int64_t text_layers = 12; + int64_t layers = 28; + int64_t heads = 48; + int64_t kv_heads = 12; + int64_t text_heads = 20; + int64_t 
text_kv_heads = 20; + int64_t mlp_multiplier = 4; + float theta = 1000.f; + float norm_eps = 1e-5f; + std::vector axes_dim = {32, 48, 48}; + int axes_dim_sum = 128; + + int64_t head_dim() const { + return features / heads; + } + + static int64_t count_blocks(const String2TensorStorage& tensor_storage_map, + const std::string& prefix, + const std::string& block_prefix) { + int64_t count = 0; + std::string full_prefix = prefix.empty() ? block_prefix : prefix + "." + block_prefix; + for (const auto& [name, _] : tensor_storage_map) { + if (!starts_with(name, full_prefix)) { + continue; + } + std::string tail = name.substr(full_prefix.size()); + size_t dot = tail.find('.'); + if (dot == std::string::npos) { + continue; + } + int block_index = std::atoi(tail.substr(0, dot).c_str()); + count = std::max(count, block_index + 1); + } + return count; + } + + void update_axes_dim() { + int64_t dim_head = head_dim(); + int64_t unit = dim_head / 16; + axes_dim = { + static_cast(dim_head - 12 * unit), + static_cast(6 * unit), + static_cast(6 * unit), + }; + axes_dim_sum = axes_dim[0] + axes_dim[1] + axes_dim[2]; + } + + static Krea2Config detect_from_weights(const String2TensorStorage& tensor_storage_map, + const std::string& prefix) { + Krea2Config config; + int64_t detected_head_dim = 0; + int64_t detected_text_head_dim = 0; + + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "first.weight") && tensor_storage.n_dims == 2) { + config.in_channels = tensor_storage.ne[0] / (config.patch_size * config.patch_size); + config.out_channels = config.in_channels; + config.features = tensor_storage.ne[1]; + } else if (ends_with(name, "blocks.0.attn.qknorm.qnorm.scale") && tensor_storage.n_dims == 1) { + detected_head_dim = tensor_storage.ne[0]; + } else if (ends_with(name, "blocks.0.attn.wq.weight") && tensor_storage.n_dims == 2) { + if (detected_head_dim > 0) { + config.heads = tensor_storage.ne[1] / 
detected_head_dim; + } + } else if (ends_with(name, "blocks.0.attn.wk.weight") && tensor_storage.n_dims == 2) { + if (detected_head_dim > 0) { + config.kv_heads = tensor_storage.ne[1] / detected_head_dim; + } + } else if (ends_with(name, "txtfusion.projector.weight") && tensor_storage.n_dims == 2) { + config.text_layers = tensor_storage.ne[0]; + } else if (ends_with(name, "txtfusion.layerwise_blocks.0.prenorm.scale") && tensor_storage.n_dims == 1) { + config.text_dim = tensor_storage.ne[0]; + } else if (ends_with(name, "txtfusion.layerwise_blocks.0.attn.qknorm.qnorm.scale") && tensor_storage.n_dims == 1) { + detected_text_head_dim = tensor_storage.ne[0]; + } else if (ends_with(name, "txtfusion.layerwise_blocks.0.attn.wq.weight") && tensor_storage.n_dims == 2) { + if (detected_text_head_dim > 0) { + config.text_heads = tensor_storage.ne[1] / detected_text_head_dim; + } + } else if (ends_with(name, "txtfusion.layerwise_blocks.0.attn.wk.weight") && tensor_storage.n_dims == 2) { + if (detected_text_head_dim > 0) { + config.text_kv_heads = tensor_storage.ne[1] / detected_text_head_dim; + } + } else if (ends_with(name, "last.linear.weight") && tensor_storage.n_dims == 2) { + config.out_channels = tensor_storage.ne[1] / (config.patch_size * config.patch_size); + } + } + + config.layers = std::max(1, count_blocks(tensor_storage_map, prefix, "blocks.")); + if (detected_head_dim > 0 && config.features > 0) { + config.heads = config.features / detected_head_dim; + } + if (detected_head_dim > 0) { + std::string wk_name = prefix.empty() ? "blocks.0.attn.wk.weight" : prefix + ".blocks.0.attn.wk.weight"; + auto it = tensor_storage_map.find(wk_name); + if (it != tensor_storage_map.end() && it->second.n_dims == 2) { + config.kv_heads = it->second.ne[1] / detected_head_dim; + } + } + if (detected_text_head_dim > 0 && config.text_dim > 0) { + config.text_heads = config.text_dim / detected_text_head_dim; + } + if (detected_text_head_dim > 0) { + std::string wk_name = prefix.empty() ? 
"txtfusion.layerwise_blocks.0.attn.wk.weight" : prefix + ".txtfusion.layerwise_blocks.0.attn.wk.weight"; + auto it = tensor_storage_map.find(wk_name); + if (it != tensor_storage_map.end() && it->second.n_dims == 2) { + config.text_kv_heads = it->second.ne[1] / detected_text_head_dim; + } + } + config.update_axes_dim(); + + LOG_DEBUG("krea2: layers=%" PRId64 ", features=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", text_dim=%" PRId64 ", text_layers=%" PRId64 ", text_heads=%" PRId64 ", text_kv_heads=%" PRId64 ", channels=%" PRId64, + config.layers, + config.features, + config.heads, + config.kv_heads, + config.text_dim, + config.text_layers, + config.text_heads, + config.text_kv_heads, + config.in_channels); + return config; + } + }; + + __STATIC_INLINE__ int64_t ceil_to_multiple(int64_t value, int64_t multiple) { + return ((value + multiple - 1) / multiple) * multiple; + } + + class KreaRMSNorm : public UnaryBlock { + protected: + int64_t hidden_size; + float eps; + std::string prefix; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + GGML_UNUSED(tensor_storage_map); + this->prefix = prefix; + params["scale"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); + } + + public: + KreaRMSNorm(int64_t hidden_size, float eps = 1e-5f) + : hidden_size(hidden_size), + eps(eps) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* scale = params["scale"]; + scale = ggml_add(ctx->ggml_ctx, scale, ggml_ext_ones(ctx->ggml_ctx, scale->ne[0], 1, 1, 1)); + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul_inplace(ctx->ggml_ctx, x, scale); + return x; + } + }; + + class KreaSwiGLU : public UnaryBlock { + public: + KreaSwiGLU(int64_t features, int64_t multiplier) { + int64_t mlp_dim = ceil_to_multiple(((2 * features) / 3) * multiplier, 128); + blocks["gate"] = std::make_shared(features, mlp_dim, false); + blocks["up"] = 
std::make_shared(features, mlp_dim, false); + blocks["down"] = std::make_shared(mlp_dim, features, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + auto gate = std::dynamic_pointer_cast(blocks["gate"]); + auto up = std::dynamic_pointer_cast(blocks["up"]); + auto down = std::dynamic_pointer_cast(blocks["down"]); + + auto gated = ggml_silu(ctx->ggml_ctx, gate->forward(ctx, x)); + auto up_x = up->forward(ctx, x); + x = ggml_mul(ctx->ggml_ctx, gated, up_x); + return down->forward(ctx, x); + } + }; + + class KreaAttention : public GGMLBlock { + protected: + int64_t features; + int64_t heads; + int64_t kv_heads; + int64_t head_dim_; + + ggml_tensor* attention_no_rope(GGMLRunnerContext* ctx, + ggml_tensor* q, + ggml_tensor* k, + ggml_tensor* v, + ggml_tensor* mask) { + int64_t Lq = q->ne[2]; + int64_t Lk = k->ne[2]; + int64_t N = q->ne[3]; + q = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, q), head_dim_ * heads, Lq, N); + k = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, k), head_dim_ * kv_heads, Lk, N); + v = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, v), head_dim_ * kv_heads, Lk, N); + return ggml_ext_attention_ext(ctx->ggml_ctx, + ctx->backend, + q, + k, + v, + heads, + mask, + false, + ctx->flash_attn_enabled); + } + + public: + KreaAttention(int64_t features, + int64_t heads, + int64_t kv_heads, + float eps = 1e-5f) + : features(features), + heads(heads), + kv_heads(kv_heads), + head_dim_(features / heads) { + blocks["wq"] = std::make_shared(features, heads * head_dim_, false); + blocks["wk"] = std::make_shared(features, kv_heads * head_dim_, false); + blocks["wv"] = std::make_shared(features, kv_heads * head_dim_, false); + blocks["gate"] = std::make_shared(features, features, false); + blocks["qknorm.qnorm"] = std::make_shared(head_dim_, eps); + blocks["qknorm.knorm"] = std::make_shared(head_dim_, eps); + blocks["wo"] = std::make_shared(features, features, false); + } + + ggml_tensor* 
forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe = nullptr, + ggml_tensor* mask = nullptr) { + auto wq = std::dynamic_pointer_cast(blocks["wq"]); + auto wk = std::dynamic_pointer_cast(blocks["wk"]); + auto wv = std::dynamic_pointer_cast(blocks["wv"]); + auto gate = std::dynamic_pointer_cast(blocks["gate"]); + auto qnorm = std::dynamic_pointer_cast(blocks["qknorm.qnorm"]); + auto knorm = std::dynamic_pointer_cast(blocks["qknorm.knorm"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + if (sd_backend_is(ctx->backend, "Vulkan")) { + wo->set_force_prec_f32(true); + } + + int64_t L = x->ne[1]; + int64_t N = x->ne[2]; + + auto q = wq->forward(ctx, x); + q = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim_, heads, L, N); + auto k = wk->forward(ctx, x); + k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim_, kv_heads, L, N); + auto v = wv->forward(ctx, x); + v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim_, kv_heads, L, N); + + q = qnorm->forward(ctx, q); + k = knorm->forward(ctx, k); + + auto out = pe != nullptr ? 
Rope::attention(ctx, q, k, v, pe, mask) + : attention_no_rope(ctx, q, k, v, mask); + out = ggml_mul(ctx->ggml_ctx, out, ggml_sigmoid(ctx->ggml_ctx, gate->forward(ctx, x))); + out = wo->forward(ctx, out); + return out; + } + }; + + class KreaDoubleSharedModulation : public GGMLBlock { + protected: + int64_t dim; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + GGML_UNUSED(tensor_storage_map); + GGML_UNUSED(prefix); + params["lin"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim * 6); + } + + public: + KreaDoubleSharedModulation(int64_t dim) + : dim(dim) {} + + std::vector forward(GGMLRunnerContext* ctx, ggml_tensor* vec) { + auto lin = ggml_repeat(ctx->ggml_ctx, params["lin"], vec); + auto out = ggml_add(ctx->ggml_ctx, vec, lin); + return ggml_ext_chunk(ctx->ggml_ctx, out, 6, 0); + } + }; + + class KreaFinalModulation : public GGMLBlock { + protected: + int64_t dim; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + GGML_UNUSED(tensor_storage_map); + GGML_UNUSED(prefix); + params["lin"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, 2); + } + + public: + KreaFinalModulation(int64_t dim) + : dim(dim) {} + + std::vector forward(GGMLRunnerContext* ctx, ggml_tensor* vec) { + auto out = ggml_add(ctx->ggml_ctx, params["lin"], vec); + return ggml_ext_chunk(ctx->ggml_ctx, out, 2, 1); + } + }; + + class KreaTextFusionBlock : public UnaryBlock { + public: + KreaTextFusionBlock(int64_t dim, + int64_t heads, + int64_t kv_heads, + int64_t multiplier, + float eps) { + blocks["prenorm"] = std::make_shared(dim, eps); + blocks["postnorm"] = std::make_shared(dim, eps); + blocks["attn"] = std::make_shared(dim, heads, kv_heads, eps); + blocks["mlp"] = std::make_shared(dim, multiplier); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + auto prenorm = 
std::dynamic_pointer_cast(blocks["prenorm"]); + auto postnorm = std::dynamic_pointer_cast(blocks["postnorm"]); + auto attn = std::dynamic_pointer_cast(blocks["attn"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + x = ggml_add(ctx->ggml_ctx, x, attn->forward(ctx, prenorm->forward(ctx, x))); + x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, postnorm->forward(ctx, x))); + return x; + } + }; + + class KreaTextFusionTransformer : public UnaryBlock { + protected: + Krea2Config config; + + public: + explicit KreaTextFusionTransformer(Krea2Config config) + : config(std::move(config)) { + for (int i = 0; i < 2; ++i) { + blocks["layerwise_blocks." + std::to_string(i)] = std::make_shared(this->config.text_dim, + this->config.text_heads, + this->config.text_kv_heads, + this->config.mlp_multiplier, + this->config.norm_eps); + blocks["refiner_blocks." + std::to_string(i)] = std::make_shared(this->config.text_dim, + this->config.text_heads, + this->config.text_kv_heads, + this->config.mlp_multiplier, + this->config.norm_eps); + } + blocks["projector"] = std::make_shared(this->config.text_layers, 1, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* context) override { + int64_t text_tokens = context->ne[1]; + int64_t batch = context->ne[2]; + + context = ggml_reshape_3d(ctx->ggml_ctx, + context, + config.text_dim, + config.text_layers, + text_tokens * batch); + + for (int i = 0; i < 2; ++i) { + auto block = std::dynamic_pointer_cast(blocks["layerwise_blocks." + std::to_string(i)]); + context = block->forward(ctx, context); + } + + context = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3)); + auto projector = std::dynamic_pointer_cast(blocks["projector"]); + context = projector->forward(ctx, context); + context = ggml_reshape_3d(ctx->ggml_ctx, context, config.text_dim, text_tokens, batch); + + for (int i = 0; i < 2; ++i) { + auto block = std::dynamic_pointer_cast(blocks["refiner_blocks." 
+ std::to_string(i)]); + context = block->forward(ctx, context); + } + return context; + } + }; + + class KreaSingleStreamBlock : public UnaryBlock { + public: + explicit KreaSingleStreamBlock(Krea2Config config) { + blocks["mod"] = std::make_shared(config.features); + blocks["prenorm"] = std::make_shared(config.features, config.norm_eps); + blocks["postnorm"] = std::make_shared(config.features, config.norm_eps); + blocks["attn"] = std::make_shared(config.features, config.heads, config.kv_heads, config.norm_eps); + blocks["mlp"] = std::make_shared(config.features, config.mlp_multiplier); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* vec, + ggml_tensor* pe) { + auto mod = std::dynamic_pointer_cast(blocks["mod"]); + auto prenorm = std::dynamic_pointer_cast(blocks["prenorm"]); + auto postnorm = std::dynamic_pointer_cast(blocks["postnorm"]); + auto attn = std::dynamic_pointer_cast(blocks["attn"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto mods = mod->forward(ctx, vec); + auto attn_input = Flux::modulate(ctx->ggml_ctx, + prenorm->forward(ctx, x), + mods[1], + mods[0], + true); + auto attn_out = attn->forward(ctx, attn_input, pe); + x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, mods[2])); + + auto mlp_input = Flux::modulate(ctx->ggml_ctx, + postnorm->forward(ctx, x), + mods[4], + mods[3], + true); + auto mlp_out = mlp->forward(ctx, mlp_input); + x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, mods[5])); + return x; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + GGML_UNUSED(ctx); + GGML_UNUSED(x); + GGML_ABORT("KreaSingleStreamBlock requires conditioning"); + return nullptr; + } + }; + + class KreaTimeMLP : public UnaryBlock { + public: + explicit KreaTimeMLP(Krea2Config config) { + blocks["0"] = std::make_shared(config.timestep_dim, config.features, true); + blocks["2"] = std::make_shared(config.features, config.features, true); + } + + 
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + auto linear_0 = std::dynamic_pointer_cast(blocks["0"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["2"]); + x = linear_0->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x, false); + x = linear_2->forward(ctx, x); + return x; + } + }; + + class KreaTProj : public UnaryBlock { + public: + explicit KreaTProj(Krea2Config config) { + blocks["1"] = std::make_shared(config.features, config.features * 6, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); + x = ggml_ext_gelu(ctx->ggml_ctx, x, false); + x = linear_1->forward(ctx, x); + return x; + } + }; + + class KreaTextMLP : public UnaryBlock { + public: + explicit KreaTextMLP(Krea2Config config) { + blocks["0"] = std::make_shared(config.text_dim, config.norm_eps); + blocks["1"] = std::make_shared(config.text_dim, config.features, true); + blocks["3"] = std::make_shared(config.features, config.features, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + auto norm = std::dynamic_pointer_cast(blocks["0"]); + auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); + auto linear_3 = std::dynamic_pointer_cast(blocks["3"]); + x = norm->forward(ctx, x); + x = linear_1->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x, true); + x = linear_3->forward(ctx, x); + return x; + } + }; + + class KreaLastLayer : public GGMLBlock { + public: + explicit KreaLastLayer(Krea2Config config) { + blocks["norm"] = std::make_shared(config.features, config.norm_eps); + blocks["linear"] = std::make_shared(config.features, config.patch_size * config.patch_size * config.out_channels, true); + blocks["modulation"] = std::make_shared(config.features); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* vec) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear = 
std::dynamic_pointer_cast(blocks["linear"]); + auto modulation = std::dynamic_pointer_cast(blocks["modulation"]); + + auto mods = modulation->forward(ctx, vec); + x = Flux::modulate(ctx->ggml_ctx, + norm->forward(ctx, x), + mods[1], + mods[0], + true); + x = linear->forward(ctx, x); + return x; + } + }; + + class Krea2Model : public GGMLBlock { + protected: + Krea2Config config; + + public: + Krea2Model() = default; + explicit Krea2Model(Krea2Config config) + : config(std::move(config)) { + blocks["first"] = std::make_shared(this->config.patch_size * this->config.patch_size * this->config.in_channels, + this->config.features, + true); + blocks["tmlp"] = std::make_shared(this->config); + blocks["txtfusion"] = std::make_shared(this->config); + blocks["txtmlp"] = std::make_shared(this->config); + blocks["tproj"] = std::make_shared(this->config); + for (int i = 0; i < this->config.layers; ++i) { + blocks["blocks." + std::to_string(i)] = std::make_shared(this->config); + } + blocks["last"] = std::make_shared(this->config); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t N = x->ne[3]; + GGML_ASSERT(N == 1); + + auto first = std::dynamic_pointer_cast(blocks["first"]); + auto tmlp = std::dynamic_pointer_cast(blocks["tmlp"]); + auto txtfusion = std::dynamic_pointer_cast(blocks["txtfusion"]); + auto txtmlp = std::dynamic_pointer_cast(blocks["txtmlp"]); + auto tproj = std::dynamic_pointer_cast(blocks["tproj"]); + auto last = std::dynamic_pointer_cast(blocks["last"]); + + auto img = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, true); + int64_t img_len = img->ne[1]; + img = first->forward(ctx, img); + + auto t = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast(config.timestep_dim), 10000, 1000.f); + t = tmlp->forward(ctx, t); + t = ggml_reshape_3d(ctx->ggml_ctx, t, t->ne[0], 1, t->ne[1]); + 
auto tvec = tproj->forward(ctx, t); + + auto txt = txtfusion->forward(ctx, context); + txt = txtmlp->forward(ctx, txt); + int64_t txt_len = txt->ne[1]; + + auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, img, 1); + for (int i = 0; i < config.layers; ++i) { + auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); + hidden_states = block->forward(ctx, hidden_states, tvec, pe); + sd::ggml_graph_cut::mark_graph_cut(hidden_states, "krea2.blocks." + std::to_string(i), "hidden_states"); + } + + hidden_states = last->forward(ctx, hidden_states, t); + hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, txt_len, txt_len + img_len); + hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, true); + return hidden_states; + } + }; + + __STATIC_INLINE__ std::vector gen_krea2_pe(int h, + int w, + int patch_size, + int bs, + int context_len, + float theta, + const std::vector& axes_dim) { + auto txt_ids = Rope::gen_flux_txt_ids(bs, context_len, 3, {}); + auto img_ids = Rope::gen_flux_img_ids(h, w, patch_size, bs, 3, 0, 0, 0, false); + auto ids = Rope::concat_ids(txt_ids, img_ids, bs); + return Rope::embed_nd(ids, bs, theta, axes_dim); + } + + struct Krea2Runner : public DiffusionModelRunner { + Krea2Config config; + Krea2Model model; + std::vector pe_vec; + + Krea2Runner(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), + config(Krea2Config::detect_from_weights(tensor_storage_map, prefix)) { + model = Krea2Model(config); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "krea2"; + } + + void get_param_tensors(std::map& tensors, const std::string& prefix) override { + model.get_param_tensors(tensors, prefix); + } + + ggml_cgraph* build_graph(const sd::Tensor& 
x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor) { + ggml_cgraph* gf = new_graph_custom(KREA2_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + GGML_ASSERT(x->ne[3] == 1); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + + pe_vec = gen_krea2_pe(static_cast(x->ne[1]), + static_cast(x->ne[0]), + config.patch_size, + static_cast(x->ne[3]), + static_cast(context->ne[1]), + config.theta, + config.axes_dim); + int pos_len = static_cast(pe_vec.size() / config.axes_dim_sum / 2); + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len); + set_backend_tensor_data(pe, pe_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* out = model.forward(&runner_ctx, x, timesteps, context, pe); + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(x, timesteps, context); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); + } + + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + return compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context)); + } + }; +} // namespace Krea2 + +#endif // __SD_MODEL_DIFFUSION_KREA2_HPP__ diff --git a/otherarch/sdcpp/src/model_loader.cpp b/otherarch/sdcpp/src/model_loader.cpp index 5c2d57cdec5b..788663103e08 100644 --- a/otherarch/sdcpp/src/model_loader.cpp +++ b/otherarch/sdcpp/src/model_loader.cpp @@ -481,6 +481,10 @@ SDVersion ModelLoader::get_sd_version() { if 
(tensor_storage.name.find("embed_image_indicator.weight") != std::string::npos) { return VERSION_IDEOGRAM4; } + if (tensor_storage.name.find("model.diffusion_model.txtfusion.projector.weight") != std::string::npos || + tensor_storage.name.find("model.diffusion_model.text_fusion.projector.weight") != std::string::npos) { + return VERSION_KREA2; + } if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) { return VERSION_CHROMA_RADIANCE; } diff --git a/otherarch/sdcpp/src/name_conversion.cpp b/otherarch/sdcpp/src/name_conversion.cpp index da2a8d5eda0f..ccc8347b729b 100644 --- a/otherarch/sdcpp/src/name_conversion.cpp +++ b/otherarch/sdcpp/src/name_conversion.cpp @@ -704,6 +704,38 @@ std::string convert_other_dit_to_original_anima(std::string name) { return name; } +std::string convert_diffusers_dit_to_original_krea2(std::string name) { + static const std::vector> prefix_map = { + {"img_in.", "first."}, + {"time_embed.linear_1.", "tmlp.0."}, + {"time_embed.linear_2.", "tmlp.2."}, + {"time_mod_proj.", "tproj.1."}, + {"txt_in.linear_1.", "txtmlp.1."}, + {"txt_in.linear_2.", "txtmlp.3."}, + {"text_fusion.", "txtfusion."}, + {"transformer_blocks.", "blocks."}, + {"final_layer.", "last."}, + }; + static const std::vector> name_map = { + {"attn.to_out.0.", "attn.wo."}, + {"attn.to_out.", "attn.wo."}, + {"attn.to_gate.", "attn.gate."}, + {"attn.to_q.", "attn.wq."}, + {"attn.to_k.", "attn.wk."}, + {"attn.to_v.", "attn.wv."}, + {"ff.gate.", "mlp.gate."}, + {"ff.up.", "mlp.up."}, + {"ff.down.", "mlp.down."}, + {"txt_in.norm.", "txtmlp.0."}, + {"last.norm.weight", "last.norm.scale"}, + {"last.modulation.weight", "last.modulation.lin"}, + }; + + replace_with_prefix_map(name, prefix_map); + replace_with_name_map(name, name_map); + return name; +} + std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { name = 
convert_diffusers_unet_to_original_sd1(name); @@ -717,6 +749,8 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S name = convert_diffusers_dit_to_original_lumina2(name); } else if (sd_version_is_anima(version)) { name = convert_other_dit_to_original_anima(name); + } else if (sd_version_is_krea2(version)) { + name = convert_diffusers_dit_to_original_krea2(name); } return name; } @@ -1175,7 +1209,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) { replace_with_prefix_map(name, prefix_map); - if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) { + if ((sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) && starts_with(name, "text_encoders.llm.visual.")) { name = convert_qwen3_vl_vision_name(std::move(name)); } diff --git a/otherarch/sdcpp/src/runtime/denoiser.hpp b/otherarch/sdcpp/src/runtime/denoiser.hpp index fed5911bc71c..28b29ef2772a 100644 --- a/otherarch/sdcpp/src/runtime/denoiser.hpp +++ b/otherarch/sdcpp/src/runtime/denoiser.hpp @@ -559,6 +559,203 @@ struct LTX2Scheduler : SigmaScheduler { } }; +/* + * Logit-Normal Scheduler + * Based on: https://github.com/ideogram-oss/ideogram4/blob/main/src/ideogram4/scheduler.py + */ +struct LogitNormalScheduler : SigmaScheduler { + float mean = 0.0f; + float std = 1.75f; + float logsnr_min = -15.0f; + float logsnr_max = 18.0f; + + bool resolution_aware = true; + + float one_minus_t_min, one_minus_t_max; + + void parse_extra_sample_args(int image_seq_len = 0, const char* extra_sample_args = nullptr) { + const int known_seq_len = (512 * 512) / (16 * 16); + if (extra_sample_args) { + for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "logit-normal scheduler arg")) { + if (key == "mu") { + if (!parse_strict_float(value, mean)) { + LOG_WARN("ignoring invalid logit-normal scheduler arg '%s=%s'", key.c_str(), value.c_str()); + } + } else if (key == "std") { + if (!parse_strict_float(value, std)) { + 
LOG_WARN("ignoring invalid logit-normal scheduler arg '%s=%s'", key.c_str(), value.c_str()); + } + } + if (key == "logsnr_min") { + if (!parse_strict_float(value, logsnr_min)) { + LOG_WARN("ignoring invalid logit-normal scheduler arg '%s=%s'", key.c_str(), value.c_str()); + } + } else if (key == "logsnr_max") { + if (!parse_strict_float(value, logsnr_max)) { + LOG_WARN("ignoring invalid logit-normal scheduler arg '%s=%s'", key.c_str(), value.c_str()); + } + } else if (key == "resolution_aware") { + if (!parse_strict_bool(value, resolution_aware)) { + LOG_WARN("ignoring invalid logit-normal scheduler arg '%s=%s'", key.c_str(), value.c_str()); + } + } + } + } + if (image_seq_len > 0 && resolution_aware) { + mean += 0.5 * std::log(static_cast(image_seq_len) / static_cast(known_seq_len)); + } + } + + float sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); + } + + LogitNormalScheduler(float mean = 0.0f, float std = 1.75f, float logsnr_min = -18.0f, float logsnr_max = 15.0f) + : mean(mean), std(std), logsnr_min(logsnr_min), logsnr_max(logsnr_max) { + // t_min = 1.0f / (1.0f + std::exp(0.5f * logsnr_max)); + one_minus_t_min = sigmoid(0.5f * logsnr_max); + // t_max = 1.0f / (1.0f + std::exp(0.5f * logsnr_min)); + one_minus_t_max = sigmoid(0.5f * logsnr_min); + + } + + LogitNormalScheduler(int image_seq_len = 0, const char* extra_sample_args = nullptr) { + mean = 0.0f; + std = 1.75f; + logsnr_min = -15.0f; + logsnr_max = 18.0f; + + parse_extra_sample_args(image_seq_len, extra_sample_args); + // t_min = 1.0f / (1.0f + std::exp(0.5f * logsnr_max)); + one_minus_t_min = sigmoid(0.5f * logsnr_max); + // t_max = 1.0f / (1.0f + std::exp(0.5f * logsnr_min)); + one_minus_t_max = sigmoid(0.5f * logsnr_min); + } + + // https://stackedboxes.org/2017/05/01/acklams-normal-quantile-function/ + double ndtri(double p) { + if (p <= 0.0) { + return -std::numeric_limits::infinity(); + } else if (p >= 1.0) { + return std::numeric_limits::infinity(); + } + + static const double p_low = 
0.02425; + static const double p_high = 1.0 - p_low; + + static const double c[6] = {-7.784894002430293e-03, + -3.223964580411365e-01, + -2.400758277161838e+00, + -2.549732539343734e+00, + 4.374664141464968e+00, + 2.938163982698783e+00}; + + static const double d[5] = {7.784695709041462e-03, + 3.224671290700398e-01, + 2.445134137142996e+00, + 3.754408661907416e+00, + 1.0}; + + // Coefficients for the central region + static const double a[6] = {-3.969683028665376e+01, + 2.209460984245205e+02, + -2.759285104469687e+02, + 1.383577518672690e+02, + -3.066479806614716e+01, + 2.506628277459239e+00}; + + static const double b[6] = {-5.447609879822406e+01, + 1.615858368580409e+02, + -1.556989798598866e+02, + 6.680131188771972e+01, + -1.328068155288572e+01, + 1.0}; + + double x = 0.0; + + if (p < p_low) { + // Lower region + double q = std::sqrt(-2.0 * std::log(p)); + + // Numerator: c[0]*q^5 + c[1]*q^4 + ... + c[5] + double numerator = c[0]; + for (int i = 1; i < 6; ++i) { + numerator = numerator * q + c[i]; + } + + // Denominator: d[0]*q^4 + d[1]*q^3 + ... + d[3]*q + 1 + double denominator = d[0]; + for (int i = 1; i < 5; ++i) { + denominator = denominator * q + d[i]; + } + + x = numerator / denominator; + } else if (p > p_high) { + // Upper region + double q = std::sqrt(-2.0 * std::log(1.0 - p)); + + double numerator = c[0]; + for (int i = 1; i < 6; ++i) { + numerator = numerator * q + c[i]; + } + + double denominator = d[0]; + for (int i = 1; i < 5; ++i) { + denominator = denominator * q + d[i]; + } + + x = -(numerator / denominator); + } else { + // Central region + double q = p - 0.5; + double r = q * q; + + // Numerator: (a[0]*r^5 + a[1]*r^4 + ... + a[5])*q + double numerator = a[0]; + for (int i = 1; i < 6; ++i) { + numerator = numerator * r + a[i]; + } + numerator *= q; + + // Denominator: b[0]*r^4 + b[1]*r^3 + ... 
+ b[4]*r + 1 + double denominator = b[0]; + for (int i = 1; i < 6; ++i) { + denominator = denominator * r + b[i]; + } + + x = numerator / denominator; + } + return x; + } + + std::vector get_sigmas(uint32_t n, float /*sigma_min*/, float /*sigma_max*/, t_to_sigma_t /*t_to_sigma*/) override { + std::vector sigmas; + LOG_INFO("LOGIT_NORMAL_SCHEDULER using mean=%.4f, std=%.4f, logsnr_min=%.4f, logsnr_max=%.4f", mean, std, logsnr_min, logsnr_max); + sigmas.reserve(n + 1); + for (uint32_t i = 0; i <= n; ++i) { + float t = static_cast(i) / static_cast(n); + + // ndtri(1-t) == -ndtri(t) + float z = -ndtri(t); + + float y = mean + std * z; + + float timestep = sigmoid(y); + + if (timestep > one_minus_t_min) + timestep = one_minus_t_min; + if (timestep < one_minus_t_max) + timestep = one_minus_t_max; + + float sigma = timestep; + + sigmas.push_back(sigma); + } + sigmas[n] = 0.0f; + return sigmas; + } +}; + struct Denoiser { virtual float sigma_min() = 0; virtual float sigma_max() = 0; @@ -623,6 +820,11 @@ struct Denoiser { LOG_INFO("get_sigmas with LTX2 scheduler"); scheduler = std::make_shared(image_seq_len, extra_sample_args); break; + case LOGIT_NORMAL_SCHEDULER: { + LOG_INFO("get_sigmas with Logit-Normal scheduler"); + scheduler = std::make_shared(image_seq_len, extra_sample_args); + break; + } default: LOG_INFO("get_sigmas with discrete scheduler (default)"); scheduler = std::make_shared(); diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp index 2c0f53a37596..e951e414324a 100644 --- a/otherarch/sdcpp/src/stable-diffusion.cpp +++ b/otherarch/sdcpp/src/stable-diffusion.cpp @@ -28,6 +28,7 @@ #include "model/diffusion/flux.hpp" #include "model/diffusion/hidream_o1.hpp" #include "model/diffusion/ideogram4.hpp" +#include "model/diffusion/krea2.hpp" #include "model/diffusion/lens.hpp" #include "model/diffusion/ltxv.hpp" #include "model/diffusion/mmdit.hpp" @@ -97,6 +98,7 @@ const char* model_version_to_str[] = { "Longcat-Image", 
"PiD", "Ideogram 4", + "Krea2", "ESRGAN", }; @@ -494,7 +496,8 @@ class StableDiffusionGGML { bool is_ltx = sd_version_is_ltxav(tempver); bool is_ideogram = sd_version_is_ideogram4(tempver); bool is_boogu = sd_version_is_boogu_image(tempver); - bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram || is_boogu); + bool is_krea2 = sd_version_is_krea2(tempver); + bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram || is_boogu || is_krea2); bool has_llm_vision = (is_qwenimg || is_longcat || is_boogu); //kcpp qol fallback: if a llm was loaded as t5 by mistake @@ -600,7 +603,7 @@ class StableDiffusionGGML { { to_replace = "taesd_f2.embd"; } - else if(is_wan21||is_qwenimg||sd_version_is_anima(tempver)) + else if(is_wan21||is_qwenimg||is_anima||is_krea2) { to_replace = "taesd_w21.embd"; } @@ -892,6 +895,17 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", model_manager); + } else if (sd_version_is_krea2(version)) { + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + tensor_storage_map, + version, + "", + false, + model_manager); + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + tensor_storage_map, + "model.diffusion_model", + model_manager); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -1128,6 +1142,7 @@ class StableDiffusionGGML { auto create_tae = [&](bool decode_only) -> std::shared_ptr { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + sd_version_is_krea2(version) || sd_version_is_anima(version) || sd_version_is_ltxav(version)) { return std::make_shared(backend_for(SDBackendModule::VAE), @@ -1168,6 +1183,7 @@ class StableDiffusionGGML { model_manager); } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || 
+ sd_version_is_krea2(version) || sd_version_is_anima(version)) { return std::make_shared(backend_for(SDBackendModule::VAE), tensor_storage_map, @@ -1514,7 +1530,8 @@ class StableDiffusionGGML { } else if (sd_version_is_flux(version) || sd_version_is_longcat(version) || sd_version_is_lens(version) || - sd_version_is_ltxav(version)) { + sd_version_is_ltxav(version) || + sd_version_is_krea2(version)) { pred_type = FLUX_FLOW_PRED; default_flow_shift = 1.0f; // TODO: validate @@ -1530,6 +1547,8 @@ class StableDiffusionGGML { default_flow_shift = 1.83f; } else if (sd_version_is_ltxav(version)) { default_flow_shift = 2.37f; + } else if (sd_version_is_krea2(version)) { + default_flow_shift = 1.15f; } } else if (sd_version_is_flux2(version)) { pred_type = FLUX2_FLOW_PRED; @@ -1990,7 +2009,7 @@ class StableDiffusionGGML { } else if (sd_version_uses_flux_vae(version)) { latent_rgb_proj = flux_latent_rgb_proj; latent_rgb_bias = flux_latent_rgb_bias; - } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { + } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_krea2(version)) { latent_rgb_proj = wan_21_latent_rgb_proj; latent_rgb_bias = wan_21_latent_rgb_bias; } else { @@ -2818,6 +2837,7 @@ const char* scheduler_to_str[] = { "lcm", "bong_tangent", "ltx2", + "logit_normal", }; const char* sd_scheduler_name(enum scheduler_t scheduler) { @@ -3492,6 +3512,8 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me return SIMPLE_SCHEDULER; } else if (sd_ctx != nullptr && sd_ctx->sd != nullptr && sd_version_is_ltxav(sd_ctx->sd->version)) { return LTX2_SCHEDULER; + } else if(sd_ctx != nullptr && sd_ctx->sd != nullptr && sd_version_is_ideogram4(sd_ctx->sd->version)) { + return LOGIT_NORMAL_SCHEDULER; } return DISCRETE_SCHEDULER; }