From 9d9cda895100bd9d863f59290fb7806cd53b385d Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sun, 21 Jun 2026 18:38:35 -0300
Subject: [PATCH 1/6] sd: sync with master-714-b12098f

---
 Makefile                                      |   2 +-
 otherarch/sdcpp/examples/cli/main.cpp         |   4 +
 otherarch/sdcpp/examples/common/common.cpp    |  55 +-
 otherarch/sdcpp/examples/common/common.h      |   1 +
 .../sdcpp/src/conditioning/conditioner.hpp    |  98 +-
 otherarch/sdcpp/src/convert.cpp               |   2 +-
 otherarch/sdcpp/src/core/util.cpp             |   9 +
 otherarch/sdcpp/src/core/util.h               |   2 +
 otherarch/sdcpp/src/model.h                   |   9 +
 otherarch/sdcpp/src/model/common/rope.hpp     |   4 +-
 otherarch/sdcpp/src/model/diffusion/anima.hpp |   7 +-
 otherarch/sdcpp/src/model/diffusion/boogu.hpp | 835 ++++++++++++++++++
 .../sdcpp/src/model/diffusion/ernie_image.hpp |   4 +-
 otherarch/sdcpp/src/model/te/llm.hpp          |  72 +-
 .../sdcpp/src/model/vae/auto_encoder_kl.hpp   |   2 +-
 otherarch/sdcpp/src/model_loader.cpp          |   3 +
 otherarch/sdcpp/src/name_conversion.cpp       |  25 +
 otherarch/sdcpp/src/stable-diffusion.cpp      |  29 +-
 .../sdcpp/src/tokenizers/bpe_tokenizer.cpp    |   5 +-
 .../sdcpp/src/tokenizers/clip_tokenizer.cpp   |   7 +-
 otherarch/sdcpp/src/tokenizers/tokenizer.h    |   7 +-
 21 files changed, 1129 insertions(+), 53 deletions(-)
 create mode 100644 otherarch/sdcpp/src/model/diffusion/boogu.hpp

diff --git a/Makefile b/Makefile
index cd5678538bcc..79a37a71889f 100644
--- a/Makefile
+++ b/Makefile
@@ -699,7 +699,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h
 budget.o: common/reasoning-budget.cpp common/reasoning-budget.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp src/model/diffusion/anima.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp src/name_conversion.cpp src/name_conversion.h 
src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h
+SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp src/model/diffusion/anima.hpp src/model/diffusion/boogu.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp src/name_conversion.cpp 
src/name_conversion.h src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h
 
 SDCPP_MAIN_BASENAMES := examples/cli/image_metadata.cpp examples/cli/image_metadata.h examples/cli/main.cpp examples/cli/msf_gif.h examples/common/common.cpp examples/common/common.h examples/common/log.cpp examples/common/log.h examples/common/media_io.cpp examples/common/media_io.h examples/common/resource_owners.hpp src/tokenizers/vocab/clip_merges.hpp src/tokenizers/vocab/gemma2_merges.hpp src/tokenizers/vocab/gemma2_vocab.hpp src/tokenizers/vocab/gemma_merges.hpp src/tokenizers/vocab/gemma_vocab.hpp src/tokenizers/vocab/gpt_oss_merges.hpp src/tokenizers/vocab/gpt_oss_vocab.hpp src/tokenizers/vocab/mistral_merges.hpp src/tokenizers/vocab/mistral_vocab.hpp src/tokenizers/vocab/qwen_merges.hpp src/tokenizers/vocab/t5.hpp src/tokenizers/vocab/umt5.hpp src/tokenizers/vocab/vocab.cpp src/convert.cpp src/version.cpp
 
diff --git a/otherarch/sdcpp/examples/cli/main.cpp b/otherarch/sdcpp/examples/cli/main.cpp
index bb5d6862c2da..84e9e7853324 100644
--- a/otherarch/sdcpp/examples/cli/main.cpp
+++ b/otherarch/sdcpp/examples/cli/main.cpp
@@ -62,18 +62,22 @@ struct SDCliParams {
             {"-o",
              "--output",
              "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
+             0,
              &output_path},
             {"",
              "--image",
              "path to the image to inspect (for metadata mode)",
+             0,
              &image_path},
             {"",
              "--metadata-format",
              "metadata output format, one of [text, json] (default: text)",
+             0,
              &metadata_format},
             {"",
              "--preview-path",
              "path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
+             0,
              &preview_path},
         };
 
diff --git a/otherarch/sdcpp/examples/common/common.cpp b/otherarch/sdcpp/examples/common/common.cpp
index dd5d35055f29..ad3f97a08080 100644
--- a/otherarch/sdcpp/examples/common/common.cpp
+++ b/otherarch/sdcpp/examples/common/common.cpp
@@ -260,7 +260,14 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
                         invalid_arg = true;
                         return;
                     }
-                    *option.target = argv_to_utf8(i, argv);
+                    if(option.concat && !option.target->empty()){
+                        if(option.concat > 0 && option.concat <= 0xff){
+                            *option.target += static_cast<char>(option.concat);
+                        }
+                        *option.target += argv_to_utf8(i, argv);
+                    } else {
+                        *option.target = argv_to_utf8(i, argv);
+                    }
                     found_arg      = true;
                 }))
                 break;
@@ -324,120 +331,151 @@ ArgOptions SDContextParams::get_options() {
         {"-m",
          "--model",
          "path to full model",
+         0,
          &model_path},
         {"",
          "--clip_l",
-         "path to the clip-l text encoder", &clip_l_path},
+         "path to the clip-l text encoder",
+         0,
+         &clip_l_path},
         {"", "--clip_g",
          "path to the clip-g text encoder",
+         0,
          &clip_g_path},
         {"",
          "--clip_vision",
          "path to the clip-vision encoder",
+         0,
          &clip_vision_path},
         {"",
          "--t5xxl",
          "path to the t5xxl text encoder",
+         0,
          &t5xxl_path},
         {"",
          "--llm",
          "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
+         0,
          &llm_path},
         {"",
          "--llm_vision",
          "path to the llm vit",
+         0,
          &llm_vision_path},
         {"",
          "--qwen2vl",
          "alias of --llm. Deprecated.",
+         0,
          &llm_path},
         {"",
          "--qwen2vl_vision",
          "alias of --llm_vision. Deprecated.",
+         0,
          &llm_vision_path},
         {"",
          "--diffusion-model",
          "path to the standalone diffusion model",
+         0,
          &diffusion_model_path},
         {"",
          "--high-noise-diffusion-model",
          "path to the standalone high noise diffusion model",
+         0,
          &high_noise_diffusion_model_path},
         {"",
          "--uncond-diffusion-model",
          "path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
+         0,
          &uncond_diffusion_model_path},
         {"",
          "--embeddings-connectors",
          "path to LTXAV embeddings connectors",
+         0,
          &embeddings_connectors_path},
         {"",
          "--vae",
          "path to standalone vae model",
+         0,
          &vae_path},
         {"",
          "--vae-format",
          "VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
+         0,
          &vae_format},
         {"",
          "--audio-vae",
          "path to standalone LTX audio vae model",
+         0,
          &audio_vae_path},
         {"",
          "--taesd",
          "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
+         0,
          &taesd_path},
         {"",
          "--tae",
          "alias of --taesd",
+         0,
          &taesd_path},
         {"",
          "--control-net",
          "path to control net model",
+         0,
          &control_net_path},
         {"",
          "--embd-dir",
          "embeddings directory",
+         0,
          &embedding_dir},
         {"",
          "--lora-model-dir",
          "lora model directory",
+         0,
          &lora_model_dir},
         {"",
          "--hires-upscalers-dir",
          "highres fix upscaler model directory",
+         0,
          &hires_upscalers_dir},
         {"",
          "--tensor-type-rules",
          "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
+         (int)',',
          &tensor_type_rules},
         {"",
          "--photo-maker",
          "path to PHOTOMAKER model",
+         0,
          &photo_maker_path},
         {"",
          "--pulid-weights",
          "path to PuLID Flux weights",
+         0,
          &pulid_weights_path},
         {"",
          "--upscale-model",
          "path to esrgan model.",
+         0,
          &esrgan_path},
         {"",
          "--backend",
          "runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
+         (int)',',
          &backend},
         {"",
          "--params-backend",
          "parameter backend assignment, e.g. disk, cpu, or diffusion=disk,clip=cpu",
+         (int)',',
          &params_backend},
         {"",
          "--rpc-servers",
          "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
+         (int)',',
          &rpc_servers},
         {"",
          "--max-vram",
          "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
+         0,
          &max_vram},
     };
 
@@ -857,58 +895,71 @@ ArgOptions SDGenerationParams::get_options() {
         {"-p",
          "--prompt",
          "the prompt to render",
+         0,
          &prompt},
         {"-n",
          "--negative-prompt",
          "the negative prompt (default: \"\")",
+         0,
          &negative_prompt},
         {"-i",
          "--init-img",
          "path to the init image",
+         0,
          &init_image_path},
         {"",
          "--end-img",
          "path to the end image, required by flf2v",
+         0,
          &end_image_path},
         {"",
          "--mask",
          "path to the mask image",
+         0,
          &mask_image_path},
         {"",
          "--control-image",
          "path to control image, control net",
+         0,
          &control_image_path},
         {"",
          "--control-video",
          "path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
          "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
          "such as 00.png, 01.png, ... etc.",
+         0,
          &control_video_path},
         {"",
          "--pm-id-images-dir",
          "path to PHOTOMAKER input id images dir",
+         0,
          &pm_id_images_dir},
         {"",
          "--pm-id-embed-path",
          "path to PHOTOMAKER v2 id embed",
+         0,
          &pm_id_embed_path},
         {"",
          "--pulid-id-embedding",
          "path to PuLID id embedding",
+         0,
          &pulid_id_embedding_path},
         {"",
          "--hires-upscaler",
          "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
          "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
          "under --hires-upscalers-dir (default: Latent)",
+         0,
          &hires_upscaler},
         {"",
          "--extra-sample-args",
          "extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma",
+         (int)',',
          &extra_sample_args},
         {"",
          "--extra-tiling-args",
          "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
+         (int)',',
          &extra_tiling_args},
     };
 
diff --git a/otherarch/sdcpp/examples/common/common.h b/otherarch/sdcpp/examples/common/common.h
index fcf9840db692..587cad29f699 100644
--- a/otherarch/sdcpp/examples/common/common.h
+++ b/otherarch/sdcpp/examples/common/common.h
@@ -31,6 +31,7 @@ struct StringOption {
     std::string short_name;
     std::string long_name;
     std::string desc;
+    int concat;  // 0: each occurrence overwrites *target; non-zero: later occurrences are appended, with this value inserted as a separator char when it is in 1..0xff
     std::string* target;
 };
 
diff --git a/otherarch/sdcpp/src/conditioning/conditioner.hpp b/otherarch/sdcpp/src/conditioning/conditioner.hpp
index b5dda4c0e435..ae1a5b5b387e 100644
--- a/otherarch/sdcpp/src/conditioning/conditioner.hpp
+++ b/otherarch/sdcpp/src/conditioning/conditioner.hpp
@@ -142,8 +142,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
         : version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
         for (const auto& kv : orig_embedding_map) {
-            std::string name = kv.first;
-            std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
+            std::string name    = normalize_embedding_name(kv.first);
             embedding_map[name] = kv.second;
             tokenizer.add_special_token(name);
         }
@@ -278,17 +277,23 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         return true;
     }
 
+    static std::string normalize_embedding_name(std::string name) {  // returns `name` with every char passed through std::tolower
+        for (char& ch : name) { ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch))); }
+        return name;
+    }
+
+    bool append_embedding_tokens(std::string str, std::vector<int32_t>& bpe_tokens) {
+        // embedding_map is keyed by normalized names, so normalize before the lookup.
+        std::string key  = normalize_embedding_name(std::move(str));
+        const auto entry = embedding_map.find(key);
+        // Unknown names report false without calling load_embedding.
+        const bool known = entry != embedding_map.end();
+        return known && load_embedding(key, entry->second, bpe_tokens);
+    }
+
     std::vector<int> convert_token_to_id(std::string text) {
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            auto iter = embedding_map.find(str);
-            if (iter == embedding_map.end()) {
-                return false;
-            }
-            std::string embedding_path = iter->second;
-            if (load_embedding(str, embedding_path, bpe_tokens)) {
-                return true;
-            }
-            return false;
+            return append_embedding_tokens(str, bpe_tokens);
         };
         std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
         return curr_tokens;
@@ -315,15 +320,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         }
 
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            auto iter = embedding_map.find(str);
-            if (iter == embedding_map.end()) {
-                return false;
-            }
-            std::string embedding_path = iter->second;
-            if (load_embedding(str, embedding_path, bpe_tokens)) {
-                return true;
-            }
-            return false;
+            return append_embedding_tokens(str, bpe_tokens);
         };
 
         std::vector<int> tokens;
@@ -1521,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
             arch = LLM::LLMArch::GPT_OSS_20B;
         } else if (sd_version_is_pid(version)) {
             arch = LLM::LLMArch::GEMMA2_2B;
-        } else if (sd_version_is_ideogram4(version)) {
+        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
             arch = LLM::LLMArch::QWEN3_VL;
         } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
             arch = LLM::LLMArch::QWEN3;
@@ -1781,6 +1778,65 @@ struct LLMEmbedder : public Conditioner {
 
                 prompt += "<|im_end|>\n<|im_start|>assistant\n";
             }
+        } else if (sd_version_is_boogu_image(version)) {
+            prompt_template_encode_start_idx = 0;
+
+            const std::string t2i_system_prompt =
+                "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows.";
+            const std::string edit_system_prompt =
+                "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.";
+            const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty();
+            const bool text_empty     = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos;
+
+            if (has_ref_images) {
+                LOG_INFO("BooguImageEditPipeline");
+                const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n";
+                std::string img_prompt;
+                const std::string placeholder = "<|image_pad|>";
+
+                for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
+                    const auto& image = (*conditioner_params.ref_images)[i];
+                    double factor     = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
+                    int height        = static_cast<int>(image.shape()[1]);
+                    int width         = static_cast<int>(image.shape()[0]);
+                    double beta       = std::sqrt((384.0 * 384.0) / (static_cast<double>(height) * static_cast<double>(width)));
+                    int h_bar         = std::max(static_cast<int>(factor),
+                                                 static_cast<int>(std::round(height * beta / factor)) * static_cast<int>(factor));
+                    int w_bar         = std::max(static_cast<int>(factor),
+                                                 static_cast<int>(std::round(width * beta / factor)) * static_cast<int>(factor));
+
+                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
+
+                    auto resized_image = clip_preprocess(image, w_bar, h_bar);
+                    auto image_embed   = llm->encode_image(n_threads, resized_image, false, true, true);
+                    GGML_ASSERT(!image_embed.empty());
+
+                    std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>";
+                    int image_embed_idx      = static_cast<int>(tokenizer->encode(image_prefix, nullptr).size());
+                    image_embeds.emplace_back(image_embed_idx, image_embed);
+
+                    img_prompt += "<|vision_start|>";
+                    int64_t num_image_tokens = image_embed.shape()[1];
+                    img_prompt.reserve(img_prompt.size() + static_cast<size_t>(num_image_tokens) * placeholder.size() + 32);
+                    for (int j = 0; j < num_image_tokens; j++) {
+                        img_prompt += placeholder;
+                    }
+                    img_prompt += "<|vision_end|>";
+                }
+
+                prompt                  = prompt_prefix + img_prompt;
+                prompt_attn_range.first = static_cast<int>(prompt.size());
+                prompt += conditioner_params.text;
+                prompt_attn_range.second = static_cast<int>(prompt.size());
+                prompt += "<|im_end|>\n";
+            } else {
+                const std::string& system_prompt = text_empty ? edit_system_prompt : t2i_system_prompt;
+                prompt                           = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n";
+                prompt_attn_range.first          = static_cast<int>(prompt.size());
+                prompt += conditioner_params.text;
+                prompt_attn_range.second = static_cast<int>(prompt.size());
+                prompt += "<|im_end|>\n";
+            }
         } else if (sd_version_is_longcat(version)) {
             spell_quotes = true;
 
diff --git a/otherarch/sdcpp/src/convert.cpp b/otherarch/sdcpp/src/convert.cpp
index 5ad066c105c7..27d377ec09c4 100644
--- a/otherarch/sdcpp/src/convert.cpp
+++ b/otherarch/sdcpp/src/convert.cpp
@@ -99,7 +99,7 @@ bool convert(const char* input_path,
         model_loader.convert_tensors_name();
     }
 
-    ggml_type type             = (ggml_type)output_type;
+    ggml_type type             = sd_type_to_ggml_type(output_type);
     bool output_is_safetensors = ends_with(output_path, ".safetensors");
     TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);
 
diff --git a/otherarch/sdcpp/src/core/util.cpp b/otherarch/sdcpp/src/core/util.cpp
index a70722af201f..05c308d9b93a 100644
--- a/otherarch/sdcpp/src/core/util.cpp
+++ b/otherarch/sdcpp/src/core/util.cpp
@@ -420,6 +420,15 @@ std::vector<std::string> split_string(const std::string& str, char delimiter) {
     return result;
 }
 
+ggml_type sd_type_to_ggml_type(sd_type_t sdtype) {
+    // Converts by numeric value. Values outside the range shared by both enums,
+    // including negative ones, return the GGML_TYPE_COUNT sentinel.
+    const int type_value = static_cast<int>(sdtype);
+    const int type_limit = std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT);
+    const bool in_range  = type_value >= 0 && type_value < type_limit;
+    return in_range ? static_cast<ggml_type>(type_value) : GGML_TYPE_COUNT;
+}
+
 KeyValueArgs parse_key_value_args(const char* args, const char* context) {
     KeyValueArgs pairs;
 
diff --git a/otherarch/sdcpp/src/core/util.h b/otherarch/sdcpp/src/core/util.h
index a271c1f71869..ec4c4559524f 100644
--- a/otherarch/sdcpp/src/core/util.h
+++ b/otherarch/sdcpp/src/core/util.h
@@ -83,6 +83,8 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float
 
 void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
 
+ggml_type sd_type_to_ggml_type(sd_type_t sdtype);
+
 std::string trim(const std::string& s);
 
 std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
diff --git a/otherarch/sdcpp/src/model.h b/otherarch/sdcpp/src/model.h
index a62c4d1bf89e..17272f7d69d9 100644
--- a/otherarch/sdcpp/src/model.h
+++ b/otherarch/sdcpp/src/model.h
@@ -42,6 +42,7 @@ enum SDVersion {
     VERSION_LTXAV,
     VERSION_HIDREAM_O1,
     VERSION_Z_IMAGE,
+    VERSION_BOOGU_IMAGE,
     VERSION_OVIS_IMAGE,
     VERSION_ERNIE_IMAGE,
     VERSION_LENS,
@@ -143,6 +144,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
     return false;
 }
 
+// Reports whether `version` is the Boogu image model version.
+// Only VERSION_BOOGU_IMAGE matches.
+static inline bool sd_version_is_boogu_image(SDVersion version) {
+    const bool is_boogu = (version == VERSION_BOOGU_IMAGE);
+    return is_boogu;
+}
+
 static inline bool sd_version_is_longcat(SDVersion version) {
     if (version == VERSION_LONGCAT) {
         return true;
@@ -206,6 +214,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
         version == VERSION_HIDREAM_O1 ||
         sd_version_is_anima(version) ||
         sd_version_is_z_image(version) ||
+        sd_version_is_boogu_image(version) ||
         sd_version_is_ernie_image(version) ||
         sd_version_is_lens(version) ||
         sd_version_is_longcat(version) ||
diff --git a/otherarch/sdcpp/src/model/common/rope.hpp b/otherarch/sdcpp/src/model/common/rope.hpp
index c0077de33bc5..2e21ef7c2b76 100644
--- a/otherarch/sdcpp/src/model/common/rope.hpp
+++ b/otherarch/sdcpp/src/model/common/rope.hpp
@@ -899,10 +899,12 @@ namespace Rope {
         // q,k,v: [N, L, n_head, d_head]
         // pe: [L, d_head/2, 2, 2]
         // return: [N, L, n_head*d_head]
+        int64_t n_head = q->ne[1];
+
         q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
         k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]
 
-        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
+        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
         return x;
     }
 };  // namespace Rope
diff --git a/otherarch/sdcpp/src/model/diffusion/anima.hpp b/otherarch/sdcpp/src/model/diffusion/anima.hpp
index 6042516a90c6..504904d41f8c 100644
--- a/otherarch/sdcpp/src/model/diffusion/anima.hpp
+++ b/otherarch/sdcpp/src/model/diffusion/anima.hpp
@@ -227,6 +227,7 @@ namespace Anima {
             k4 = k_norm->forward(ctx, k4);
 
             ggml_tensor* attn_out = nullptr;
+            float scale           = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
             if (pe_q != nullptr || pe_k != nullptr) {
                 if (pe_q == nullptr) {
                     pe_q = pe_k;
@@ -244,7 +245,8 @@ namespace Anima {
                                                      num_heads,
                                                      nullptr,
                                                      true,
-                                                     ctx->flash_attn_enabled);
+                                                     ctx->flash_attn_enabled,
+                                                     scale);
             } else {
                 auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
                 auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
@@ -256,7 +258,8 @@ namespace Anima {
                                                      num_heads,
                                                      nullptr,
                                                      false,
-                                                     ctx->flash_attn_enabled);
+                                                     ctx->flash_attn_enabled,
+                                                     scale);
             }
 
             return out_proj->forward(ctx, attn_out);
diff --git a/otherarch/sdcpp/src/model/diffusion/boogu.hpp b/otherarch/sdcpp/src/model/diffusion/boogu.hpp
new file mode 100644
index 000000000000..27e13aebd40c
--- /dev/null
+++ b/otherarch/sdcpp/src/model/diffusion/boogu.hpp
@@ -0,0 +1,835 @@
+#ifndef __SD_MODEL_DIFFUSION_BOOGU_HPP__
+#define __SD_MODEL_DIFFUSION_BOOGU_HPP__
+
+#include <algorithm>
+#include <cmath>
+#include <tuple>
+#include <vector>
+
+#include "core/ggml_extend.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/diffusion/qwen_image.hpp"
+#include "model_loader.h"
+
+namespace Boogu {
+    constexpr int BOOGU_GRAPH_SIZE = 65536;  // NOTE(review): presumably the ggml graph size budget for this model; its use is outside this chunk - confirm
+
+    struct BooguConfig {  // Boogu hyper-parameters; the defaults below can be overridden by detect_from_weights()
+        int patch_size                   = 2;
+        int64_t in_channels              = 16;
+        int64_t out_channels             = 16;
+        int64_t hidden_size              = 3360;
+        int64_t num_layers               = 32;
+        int64_t num_double_stream_layers = 8;
+        int64_t num_refiner_layers       = 2;
+        int64_t num_attention_heads      = 28;
+        int64_t num_kv_heads             = 7;
+        int64_t head_dim                 = 120;
+        int64_t multiple_of              = 256;
+        int64_t instruction_feat_dim     = 4096;
+        int64_t timestep_embed_dim       = 1024;
+        int theta                        = 10000;
+        float timestep_scale             = 1000.0f;
+        float norm_eps                   = 1e-5f;
+        std::vector<int> axes_dim        = {40, 40, 40};
+        int64_t axes_dim_sum             = 120;  // equals the sum of the default axes_dim (40 + 40 + 40)
+
+        static int64_t count_blocks(const String2TensorStorage& tensor_storage_map,  // returns 1 + the largest atoi() of the field following block_prefix, or 0 when no tensor name matches
+                                    const std::string& prefix,
+                                    const std::string& block_prefix) {
+            int64_t count = 0;
+            for (const auto& [name, _] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                size_t pos = name.find(block_prefix);  // first occurrence anywhere in the name, not only right after prefix
+                if (pos == std::string::npos) {
+                    continue;
+                }
+                auto items = split_string(name.substr(pos), '.');  // presumably yields {"<block name>", "<index>", ...} - confirm split_string semantics
+                if (items.size() > 1) {
+                    count = std::max<int64_t>(count, atoi(items[1].c_str()) + 1);  // atoi() yields 0 for a non-numeric field, so such a name still counts as index 0
+                }
+            }
+            return count;
+        }
+
+        static BooguConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {  // derives the config from tensor shapes and block indices under prefix; shape-derived fields keep their defaults when the tensor is absent
+            BooguConfig config;
+            int64_t detected_head_dim = 0;  // 0 means "not found"
+            int64_t detected_kv_dim   = 0;  // 0 means "not found"
+
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                if (ends_with(name, "x_embedder.weight") && tensor_storage.n_dims == 2) {
+                    int64_t patch_area = config.patch_size * config.patch_size;  // uses the default patch_size; it is never re-detected in this function
+                    config.in_channels = tensor_storage.ne[0] / patch_area;
+                    config.hidden_size = tensor_storage.ne[1];
+                } else if (ends_with(name, "time_caption_embed.caption_embedder.1.weight") && tensor_storage.n_dims == 2) {
+                    config.instruction_feat_dim = tensor_storage.ne[0];
+                    config.hidden_size          = tensor_storage.ne[1];
+                } else if (ends_with(name, "single_stream_layers.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
+                    detected_head_dim = tensor_storage.ne[0];
+                } else if (ends_with(name, "double_stream_layers.0.img_self_attn.norm_q.weight") && tensor_storage.n_dims == 1) {
+                    detected_head_dim = tensor_storage.ne[0];
+                } else if (ends_with(name, "single_stream_layers.0.attn.to_k.weight") && tensor_storage.n_dims == 2) {
+                    detected_kv_dim = tensor_storage.ne[1];
+                } else if (ends_with(name, "double_stream_layers.0.img_instruct_attn.processor.img_to_k.weight") && tensor_storage.n_dims == 2) {
+                    detected_kv_dim = tensor_storage.ne[1];
+                } else if (ends_with(name, "norm_out.linear_2.weight") && tensor_storage.n_dims == 2) {
+                    int64_t patch_area  = config.patch_size * config.patch_size;
+                    config.out_channels = tensor_storage.ne[1] / patch_area;
+                }
+            }
+
+            config.num_layers               = std::max<int64_t>(1, count_blocks(tensor_storage_map, prefix, "single_stream_layers."));  // at least 1 even if none were found
+            config.num_double_stream_layers = std::max<int64_t>(0, count_blocks(tensor_storage_map, prefix, "double_stream_layers."));
+            int64_t noise_refiner_layers    = count_blocks(tensor_storage_map, prefix, "noise_refiner.");
+            int64_t ref_refiner_layers      = count_blocks(tensor_storage_map, prefix, "ref_image_refiner.");
+            int64_t context_refiner_layers  = count_blocks(tensor_storage_map, prefix, "context_refiner.");
+            config.num_refiner_layers       = std::max<int64_t>(1, std::max(noise_refiner_layers, std::max(ref_refiner_layers, context_refiner_layers)));  // largest of the three refiner counts, at least 1
+
+            if (detected_head_dim > 0) {
+                config.head_dim            = detected_head_dim;
+                config.num_attention_heads = config.hidden_size / config.head_dim;  // integer division; assumes hidden_size is a multiple of head_dim - TODO confirm
+                config.axes_dim_sum        = config.head_dim;
+                if (detected_kv_dim > 0) {
+                    config.num_kv_heads = detected_kv_dim / config.head_dim;
+                }
+                if (config.axes_dim_sum == 120) {
+                    config.axes_dim = {40, 40, 40};
+                } else if (config.axes_dim_sum % 3 == 0) {  // split evenly over three axes; if not divisible by 3, axes_dim keeps its default
+                    int axis        = static_cast<int>(config.axes_dim_sum / 3);
+                    config.axes_dim = {axis, axis, axis};
+                }
+            }
+            config.timestep_embed_dim = std::min<int64_t>(config.hidden_size, 1024);  // capped at 1024
+
+            LOG_DEBUG("boogu_image: layers=%" PRId64 ", double_stream_layers=%" PRId64 ", refiner_layers=%" PRId64 ", hidden=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", head_dim=%" PRId64 ", in_channels=%" PRId64 ", out_channels=%" PRId64,
+                      config.num_layers,
+                      config.num_double_stream_layers,
+                      config.num_refiner_layers,
+                      config.hidden_size,
+                      config.num_attention_heads,
+                      config.num_kv_heads,
+                      config.head_dim,
+                      config.in_channels,
+                      config.out_channels);
+            return config;
+        }
+    };
+
+    __STATIC_INLINE__ ggml_tensor* scale_modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {  // returns x + x * scale
+        scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);  // [ne0, ne1] -> [ne0, 1, ne1]; presumably so it broadcasts over dim 1 of x
+        return ggml_add(ctx, x, ggml_mul(ctx, x, scale));
+    }
+
+    __STATIC_INLINE__ ggml_tensor* gate_residual(ggml_context* ctx, ggml_tensor* residual, ggml_tensor* x, ggml_tensor* gate) {  // returns residual + x * tanh(gate)
+        gate = ggml_tanh(ctx, gate);
+        gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]);  // [ne0, ne1] -> [ne0, 1, ne1]; presumably so it broadcasts over dim 1 of x
+        x    = ggml_mul(ctx, x, gate);
+        return ggml_add(ctx, residual, x);
+    }
+
+    struct LuminaCombinedTimestepCaptionEmbedding : public GGMLBlock {  // produces the timestep embedding and the projected caption embedding
+        int64_t frequency_embedding_size;
+        float timestep_scale;
+
+        LuminaCombinedTimestepCaptionEmbedding(int64_t hidden_size,
+                                               int64_t instruction_feat_dim,
+                                               int64_t frequency_embedding_size,
+                                               float norm_eps,
+                                               float timestep_scale)
+            : frequency_embedding_size(frequency_embedding_size),
+              timestep_scale(timestep_scale) {
+            blocks["timestep_embedder"]  = std::make_shared<Qwen::TimestepEmbedding>(frequency_embedding_size, std::min<int64_t>(hidden_size, 1024));  // second argument is presumably the embedding width - confirm against Qwen::TimestepEmbedding
+            blocks["caption_embedder.0"] = std::make_shared<RMSNorm>(instruction_feat_dim, norm_eps);
+            blocks["caption_embedder.1"] = std::make_shared<Linear>(instruction_feat_dim, hidden_size, true);
+        }
+
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* timestep, ggml_tensor* text_hidden_states) {  // returns {time_embed, caption_embed}
+            auto timestep_embedder  = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
+            auto caption_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["caption_embedder.0"]);
+            auto caption_embedder_1 = std::dynamic_pointer_cast<Linear>(blocks["caption_embedder.1"]);
+
+            auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(frequency_embedding_size), 10000, timestep_scale);  // NOTE(review): 10000 is a literal here, presumably the max period - confirm against ggml_ext_timestep_embedding
+            auto time_embed    = timestep_embedder->forward(ctx, timestep_proj);
+            auto caption_embed = caption_embedder_1->forward(ctx, caption_embedder_0->forward(ctx, text_hidden_states));  // RMSNorm (caption_embedder.0) then Linear (caption_embedder.1)
+            return {time_embed, caption_embed};
+        }
+    };
+
+    struct LuminaRMSNormZero : public GGMLBlock {  // RMSNorm of x modulated by a scale derived from emb, plus three further modulation tensors
+        LuminaRMSNormZero(int64_t embedding_dim, int64_t conditioning_embedding_dim, float norm_eps) {
+            blocks["linear"] = std::make_shared<Linear>(conditioning_embedding_dim, 4 * embedding_dim, true);  // one projection produces all four modulation chunks
+            blocks["norm"]   = std::make_shared<RMSNorm>(embedding_dim, norm_eps);
+        }
+
+        std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb) {  // returns {modulated x, gate_msa, scale_mlp, gate_mlp}
+            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            auto norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
+
+            emb       = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, emb));  // SiLU is applied before the projection
+            auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 4, 0);  // presumably 4 equal chunks along dim 0 - confirm against ggml_ext_chunk
+
+            auto scale_msa = mods[0];
+            auto gate_msa  = mods[1];
+            auto scale_mlp = mods[2];
+            auto gate_mlp  = mods[3];
+
+            x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), scale_msa);  // only scale_msa is applied here; the other three are returned to the caller
+            return {x, gate_msa, scale_mlp, gate_mlp};
+        }
+    };
+
+    struct LuminaFeedForward : public GGMLBlock {  // gated feed-forward: linear_2(ggml_swiglu_split(linear_1(x), linear_3(x)))
+        LuminaFeedForward(int64_t dim, int64_t inner_dim, int64_t multiple_of) {
+            inner_dim          = multiple_of * ((inner_dim + multiple_of - 1) / multiple_of);  // round inner_dim up to a multiple of multiple_of
+            blocks["linear_1"] = std::make_shared<Linear>(dim, inner_dim, false);
+            blocks["linear_2"] = std::make_shared<Linear>(inner_dim, dim, false);
+            blocks["linear_3"] = std::make_shared<Linear>(dim, inner_dim, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
+            auto linear_3 = std::dynamic_pointer_cast<Linear>(blocks["linear_3"]);
+
+            if (sd_backend_is(ctx->backend, "Vulkan")) {  // Vulkan only: force f32 precision on the output projection
+                linear_2->set_force_prec_f32(true);
+            }
+
+            auto h1 = linear_1->forward(ctx, x);
+            auto h2 = linear_3->forward(ctx, x);
+            x       = ggml_swiglu_split(ctx->ggml_ctx, h1, h2);  // NOTE(review): presumably silu(h1) * h2 - confirm against ggml_swiglu_split
+            x       = linear_2->forward(ctx, x);
+            return x;
+        }
+    };
+
+    struct LuminaLayerNormContinuous : public GGMLBlock {  // LayerNorm(x) scale-modulated by a projection of the conditioning embedding, then projected to out_dim
+        LuminaLayerNormContinuous(int64_t embedding_dim,
+                                  int64_t conditioning_embedding_dim,
+                                  int64_t out_dim) {
+            blocks["linear_1"] = std::make_shared<Linear>(conditioning_embedding_dim, embedding_dim, true);
+            blocks["norm"]     = std::make_shared<LayerNorm>(embedding_dim, 1e-6f, false);  // NOTE(review): the trailing false presumably disables the affine parameters - confirm against LayerNorm
+            blocks["linear_2"] = std::make_shared<Linear>(embedding_dim, out_dim, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning_embedding) {
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
+            auto norm     = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
+
+            auto emb = linear_1->forward(ctx, ggml_silu(ctx->ggml_ctx, conditioning_embedding));  // SiLU is applied to the conditioning before linear_1
+            x        = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), emb);  // scale only; no shift term is added
+            x        = linear_2->forward(ctx, x);
+            return x;
+        }
+    };
+
+    struct Attention : public GGMLBlock {  // attention with separate query and key/value head counts, per-head RMSNorm on q and k, via Rope::attention
+        int64_t dim_head;
+        int64_t heads;
+        int64_t kv_heads;
+
+        Attention(int64_t query_dim, int64_t dim_head, int64_t heads, int64_t kv_heads, float eps = 1e-5f)
+            : dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
+            blocks["to_q"]     = std::make_shared<Linear>(query_dim, heads * dim_head, false);
+            blocks["to_k"]     = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);  // k and v are sized by kv_heads, which may differ from heads
+            blocks["to_v"]     = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
+            blocks["norm_q"]   = std::make_shared<RMSNorm>(dim_head, eps);
+            blocks["norm_k"]   = std::make_shared<RMSNorm>(dim_head, eps);
+            blocks["to_out.0"] = std::make_shared<Linear>(heads * dim_head, query_dim, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,  // q comes from hidden_states; k and v come from encoder_hidden_states
+                             ggml_tensor* hidden_states,
+                             ggml_tensor* encoder_hidden_states,
+                             ggml_tensor* rotary_emb,
+                             ggml_tensor* attention_mask = nullptr) {
+            auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+            auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
+            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
+            auto norm_q   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
+            auto norm_k   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
+            auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+
+            if (sd_backend_is(ctx->backend, "Vulkan")) {  // Vulkan only: force f32 precision on the output projection
+                to_out_0->set_force_prec_f32(true);
+            }
+
+            int64_t N  = hidden_states->ne[2];  // batch size is read from ne[2]
+            int64_t Lq = hidden_states->ne[1];  // query length is read from ne[1]
+            int64_t Lk = encoder_hidden_states->ne[1];  // key/value length is read from ne[1]
+
+            auto q = to_q->forward(ctx, hidden_states);
+            q      = ggml_reshape_4d(ctx->ggml_ctx, q, dim_head, heads, Lq, N);  // ne = [dim_head, heads, Lq, N]
+            auto k = to_k->forward(ctx, encoder_hidden_states);
+            k      = ggml_reshape_4d(ctx->ggml_ctx, k, dim_head, kv_heads, Lk, N);  // ne = [dim_head, kv_heads, Lk, N]
+            auto v = to_v->forward(ctx, encoder_hidden_states);
+            v      = ggml_reshape_4d(ctx->ggml_ctx, v, dim_head, kv_heads, Lk, N);  // ne = [dim_head, kv_heads, Lk, N]
+
+            q = norm_q->forward(ctx, q);
+            k = norm_k->forward(ctx, k);  // v is not normalized
+
+            auto out = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);  // remaining Rope::attention arguments take their defaults
+            out      = to_out_0->forward(ctx, out);
+            return out;
+        }
+    };
+
+    struct BooguImageTransformerBlock : public GGMLBlock {  // self-attention + feed-forward block; with modulation, temb drives the scales and tanh gates
+        bool modulation;  // true: norm1 is a LuminaRMSNormZero; false: norm1 is a plain RMSNorm
+
+        BooguImageTransformerBlock(int64_t dim,
+                                   int64_t num_attention_heads,
+                                   int64_t num_kv_heads,
+                                   int64_t multiple_of,
+                                   float norm_eps,
+                                   bool modulation)
+            : modulation(modulation) {
+            int64_t head_dim       = dim / num_attention_heads;  // integer division; assumes dim is a multiple of num_attention_heads - TODO confirm
+            blocks["attn"]         = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
+            blocks["feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
+            if (modulation) {
+                blocks["norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);  // conditioning width is min(dim, 1024)
+            } else {
+                blocks["norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
+            }
+            blocks["ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["norm2"]     = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* hidden_states,
+                             ggml_tensor* rotary_emb,
+                             ggml_tensor* temb           = nullptr,  // read only when modulation is true
+                             ggml_tensor* attention_mask = nullptr) {
+            auto attn         = std::dynamic_pointer_cast<Attention>(blocks["attn"]);
+            auto feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["feed_forward"]);
+            auto ffn_norm1    = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
+            auto norm2        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
+            auto ffn_norm2    = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
+
+            if (modulation) {
+                auto norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["norm1"]);
+                auto mods  = norm1->forward(ctx, hidden_states, temb);
+
+                auto norm_hidden_states = std::get<0>(mods);
+                auto gate_msa           = std::get<1>(mods);
+                auto scale_mlp          = std::get<2>(mods);
+                auto gate_mlp           = std::get<3>(mods);
+
+                auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);  // self-attention: same tensor for queries and keys/values
+                hidden_states    = gate_residual(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output), gate_msa);  // norm2 is applied to the attention output before the gated add
+
+                auto mlp_input  = scale_modulate(ctx->ggml_ctx, ffn_norm1->forward(ctx, hidden_states), scale_mlp);
+                auto mlp_output = feed_forward->forward(ctx, mlp_input);
+                hidden_states   = gate_residual(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output), gate_mlp);
+            } else {
+                auto norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
+
+                auto norm_hidden_states = norm1->forward(ctx, hidden_states);
+                auto attn_output        = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
+                hidden_states           = ggml_add(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output));  // unmodulated path: plain residual add, no gate
+
+                auto mlp_output = feed_forward->forward(ctx, ffn_norm1->forward(ctx, hidden_states));
+                hidden_states   = ggml_add(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output));
+            }
+            return hidden_states;
+        }
+    };
+
+    struct BooguImageJointAttention : public GGMLBlock {  // joint attention over concatenated instruction and image tokens, with per-stream q/k/v and output projections
+        int64_t dim_head;
+        int64_t heads;
+        int64_t kv_heads;
+
+        BooguImageJointAttention(int64_t dim, int64_t dim_head, int64_t heads, int64_t kv_heads)
+            : dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
+            blocks["norm_q"]                  = std::make_shared<RMSNorm>(dim_head, 1e-5f);
+            blocks["norm_k"]                  = std::make_shared<RMSNorm>(dim_head, 1e-5f);
+            blocks["to_out.0"]                = std::make_shared<Linear>(heads * dim_head, dim, false);
+            blocks["processor.img_to_q"]      = std::make_shared<Linear>(dim, heads * dim_head, false);
+            blocks["processor.img_to_k"]      = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
+            blocks["processor.img_to_v"]      = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
+            blocks["processor.instruct_to_q"] = std::make_shared<Linear>(dim, heads * dim_head, false);
+            blocks["processor.instruct_to_k"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
+            blocks["processor.instruct_to_v"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
+            blocks["processor.instruct_out"]  = std::make_shared<Linear>(heads * dim_head, dim, false);
+            blocks["processor.img_out"]       = std::make_shared<Linear>(heads * dim_head, dim, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,  // returns the joint sequence, instruction tokens first, then image tokens
+                             ggml_tensor* img_hidden_states,
+                             ggml_tensor* instruct_hidden_states,
+                             ggml_tensor* rotary_emb,
+                             ggml_tensor* attention_mask = nullptr) {
+            auto norm_q        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
+            auto norm_k        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
+            auto to_out_0      = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+            auto img_to_q      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_q"]);
+            auto img_to_k      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_k"]);
+            auto img_to_v      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_v"]);
+            auto instruct_to_q = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_q"]);
+            auto instruct_to_k = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_k"]);
+            auto instruct_to_v = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_v"]);
+            auto instruct_out  = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_out"]);
+            auto img_out       = std::dynamic_pointer_cast<Linear>(blocks["processor.img_out"]);
+
+            if (sd_backend_is(ctx->backend, "Vulkan")) {  // Vulkan only: force f32 precision on to_out.0 (the per-stream output projections are left as-is)
+                to_out_0->set_force_prec_f32(true);
+            }
+
+            int64_t N          = img_hidden_states->ne[2];  // batch size is taken from the image stream
+            int64_t L_img      = img_hidden_states->ne[1];
+            int64_t L_instruct = instruct_hidden_states->ne[1];
+
+            auto img_q = img_to_q->forward(ctx, img_hidden_states);
+            img_q      = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, heads, L_img, N);  // ne = [dim_head, heads, L_img, N]
+            auto img_k = img_to_k->forward(ctx, img_hidden_states);
+            img_k      = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, kv_heads, L_img, N);
+            auto img_v = img_to_v->forward(ctx, img_hidden_states);
+            img_v      = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, kv_heads, L_img, N);
+
+            auto instruct_q = instruct_to_q->forward(ctx, instruct_hidden_states);
+            instruct_q      = ggml_reshape_4d(ctx->ggml_ctx, instruct_q, dim_head, heads, L_instruct, N);  // ne = [dim_head, heads, L_instruct, N]
+            auto instruct_k = instruct_to_k->forward(ctx, instruct_hidden_states);
+            instruct_k      = ggml_reshape_4d(ctx->ggml_ctx, instruct_k, dim_head, kv_heads, L_instruct, N);
+            auto instruct_v = instruct_to_v->forward(ctx, instruct_hidden_states);
+            instruct_v      = ggml_reshape_4d(ctx->ggml_ctx, instruct_v, dim_head, kv_heads, L_instruct, N);
+
+            auto q = ggml_concat(ctx->ggml_ctx, instruct_q, img_q, 2);  // concatenate along dim 2 (sequence), instruction tokens first
+            auto k = ggml_concat(ctx->ggml_ctx, instruct_k, img_k, 2);
+            auto v = ggml_concat(ctx->ggml_ctx, instruct_v, img_v, 2);
+            q      = norm_q->forward(ctx, q);  // the same norm_q / norm_k are applied to both streams, after concatenation
+            k      = norm_k->forward(ctx, k);
+
+            auto hidden_states = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
+            auto instruct_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, L_instruct);  // dim 1, range [0, L_instruct) - presumably end-exclusive; confirm against ggml_ext_slice
+            auto img_attn      = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, L_instruct, L_instruct + L_img);
+
+            instruct_attn = instruct_out->forward(ctx, instruct_attn);
+            img_attn      = img_out->forward(ctx, img_attn);
+            hidden_states = ggml_concat(ctx->ggml_ctx, instruct_attn, img_attn, 1);  // re-join along dim 1, instruction tokens first
+            hidden_states = to_out_0->forward(ctx, hidden_states);  // shared projection applied on top of the per-stream ones
+            return hidden_states;
+        }
+    };
+
+    struct BooguImageDoubleStreamBlock : public GGMLBlock {  // double-stream block: joint image+instruction attention, image-only self-attention, and one feed-forward per stream
+        BooguImageDoubleStreamBlock(int64_t dim,
+                                    int64_t num_attention_heads,
+                                    int64_t num_kv_heads,
+                                    int64_t multiple_of,
+                                    float norm_eps) {
+            int64_t head_dim                = dim / num_attention_heads;  // integer division; assumes dim is a multiple of num_attention_heads - TODO confirm
+            blocks["img_instruct_attn"]     = std::make_shared<BooguImageJointAttention>(dim, head_dim, num_attention_heads, num_kv_heads);
+            blocks["img_self_attn"]         = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
+            blocks["img_feed_forward"]      = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
+            blocks["instruct_feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
+            blocks["img_norm1"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);  // conditioning width is min(dim, 1024) for all five modulated norms
+            blocks["img_norm2"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
+            blocks["img_norm3"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
+            blocks["instruct_norm1"]        = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
+            blocks["instruct_norm2"]        = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
+            blocks["img_attn_norm"]         = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["img_self_attn_norm"]    = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["img_ffn_norm1"]         = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["img_ffn_norm2"]         = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["instruct_attn_norm"]    = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["instruct_ffn_norm1"]    = std::make_shared<RMSNorm>(dim, norm_eps);
+            blocks["instruct_ffn_norm2"]    = std::make_shared<RMSNorm>(dim, norm_eps);
+        }
+
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,  // returns {img_hidden_states, instruct_hidden_states} after the block
+                                                      ggml_tensor* img_hidden_states,
+                                                      ggml_tensor* instruct_hidden_states,
+                                                      ggml_tensor* joint_rotary_emb,
+                                                      ggml_tensor* img_rotary_emb,
+                                                      ggml_tensor* temb) {
+            auto img_instruct_attn     = std::dynamic_pointer_cast<BooguImageJointAttention>(blocks["img_instruct_attn"]);
+            auto img_self_attn         = std::dynamic_pointer_cast<Attention>(blocks["img_self_attn"]);
+            auto img_feed_forward      = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["img_feed_forward"]);
+            auto instruct_feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["instruct_feed_forward"]);
+            auto img_norm1             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm1"]);
+            auto img_norm2             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm2"]);
+            auto img_norm3             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm3"]);
+            auto instruct_norm1        = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm1"]);
+            auto instruct_norm2        = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm2"]);
+            auto img_attn_norm         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_attn_norm"]);
+            auto img_self_attn_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["img_self_attn_norm"]);
+            auto img_ffn_norm1         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm1"]);
+            auto img_ffn_norm2         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm2"]);
+            auto instruct_attn_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_attn_norm"]);
+            auto instruct_ffn_norm1    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm1"]);
+            auto instruct_ffn_norm2    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm2"]);
+
+            int64_t L_instruct = instruct_hidden_states->ne[1];
+
+            auto img_norm1_out_vec      = img_norm1->forward(ctx, img_hidden_states, temb);  // all five norms read the block inputs, not intermediate results
+            auto img_norm2_out_vec      = img_norm2->forward(ctx, img_hidden_states, temb);  // only outputs 0 and 1 are used below
+            auto img_norm3_out_vec      = img_norm3->forward(ctx, img_hidden_states, temb);  // only outputs 0 and 1 are used below
+            auto instruct_norm1_out_vec = instruct_norm1->forward(ctx, instruct_hidden_states, temb);
+            auto instruct_norm2_out_vec = instruct_norm2->forward(ctx, instruct_hidden_states, temb);  // only outputs 0 and 1 are used below
+
+            auto img_norm1_out = std::get<0>(img_norm1_out_vec);
+            auto img_gate_msa  = std::get<1>(img_norm1_out_vec);
+            auto img_scale_mlp = std::get<2>(img_norm1_out_vec);
+            auto img_gate_mlp  = std::get<3>(img_norm1_out_vec);
+
+            auto img_norm2_out = std::get<0>(img_norm2_out_vec);
+            auto img_shift_mlp = std::get<1>(img_norm2_out_vec);  // NOTE(review): second output of LuminaRMSNormZero (named gate_msa there), used below as an additive shift
+
+            auto img_norm3_out = std::get<0>(img_norm3_out_vec);
+            auto img_gate_self = std::get<1>(img_norm3_out_vec);
+
+            auto instruct_norm1_out = std::get<0>(instruct_norm1_out_vec);
+            auto instruct_gate_msa  = std::get<1>(instruct_norm1_out_vec);
+            auto instruct_scale_mlp = std::get<2>(instruct_norm1_out_vec);
+            auto instruct_gate_mlp  = std::get<3>(instruct_norm1_out_vec);
+
+            auto instruct_norm2_out = std::get<0>(instruct_norm2_out_vec);
+            auto instruct_shift_mlp = std::get<1>(instruct_norm2_out_vec);  // NOTE(review): second output of LuminaRMSNormZero (named gate_msa there), used below as an additive shift
+
+            auto joint_attn_out    = img_instruct_attn->forward(ctx, img_norm1_out, instruct_norm1_out, joint_rotary_emb);  // no attention mask is passed (defaults to nullptr)
+            auto instruct_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, 0, L_instruct);  // instruction tokens come first in the joint output
+            auto img_attn_out      = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, L_instruct, joint_attn_out->ne[1]);
+
+            auto img_self_attn_out = img_self_attn->forward(ctx, img_norm3_out, img_norm3_out, img_rotary_emb);  // image-only self-attention on the img_norm3 output
+
+            img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_attn_norm->forward(ctx, img_attn_out), img_gate_msa);
+            img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_self_attn_norm->forward(ctx, img_self_attn_out), img_gate_self);
+
+            auto img_mlp_input = scale_modulate(ctx->ggml_ctx, img_norm2_out, img_scale_mlp);  // img_norm2's output, additionally scaled by img_norm1's scale_mlp
+            img_shift_mlp      = ggml_reshape_3d(ctx->ggml_ctx, img_shift_mlp, img_shift_mlp->ne[0], 1, img_shift_mlp->ne[1]);  // [ne0, ne1] -> [ne0, 1, ne1] before the add
+            img_mlp_input      = ggml_add(ctx->ggml_ctx, img_mlp_input, img_shift_mlp);
+            auto img_mlp_out   = img_feed_forward->forward(ctx, img_ffn_norm1->forward(ctx, img_mlp_input));
+            img_hidden_states  = gate_residual(ctx->ggml_ctx, img_hidden_states, img_ffn_norm2->forward(ctx, img_mlp_out), img_gate_mlp);
+
+            instruct_hidden_states  = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_attn_norm->forward(ctx, instruct_attn_out), instruct_gate_msa);
+            auto instruct_mlp_input = scale_modulate(ctx->ggml_ctx, instruct_norm2_out, instruct_scale_mlp);  // instruct_norm2's output, additionally scaled by instruct_norm1's scale_mlp
+            instruct_shift_mlp      = ggml_reshape_3d(ctx->ggml_ctx, instruct_shift_mlp, instruct_shift_mlp->ne[0], 1, instruct_shift_mlp->ne[1]);
+            instruct_mlp_input      = ggml_add(ctx->ggml_ctx, instruct_mlp_input, instruct_shift_mlp);
+            auto instruct_mlp_out   = instruct_feed_forward->forward(ctx, instruct_ffn_norm1->forward(ctx, instruct_mlp_input));
+            instruct_hidden_states  = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_ffn_norm2->forward(ctx, instruct_mlp_out), instruct_gate_mlp);
+
+            return {img_hidden_states, instruct_hidden_states};
+        }
+    };
+
+    struct BooguImageModel : public GGMLBlock {  // Boogu image transformer: patch embedders, refiner stacks, double-/single-stream layers and the output norm
+        BooguConfig config;
+
+        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {  // creates the only direct parameter of this block: the image-index embedding table
+            GGML_UNUSED(tensor_storage_map);
+            GGML_UNUSED(prefix);
+            params["image_index_embedding"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, config.hidden_size, 5);  // F32 with ne = [hidden_size, 5]; the 5 matches the index range asserted in image_index_embedding()
+        }
+
+        BooguImageModel() = default;
+        BooguImageModel(BooguConfig config)  // registers every sub-block under the key that forward() / embed_refs() later look up
+            : config(std::move(config)) {
+            blocks["x_embedder"]               = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
+            blocks["ref_image_patch_embedder"] = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
+            blocks["time_caption_embed"]       = std::make_shared<LuminaCombinedTimestepCaptionEmbedding>(this->config.hidden_size,
+                                                                                                    this->config.instruction_feat_dim,
+                                                                                                    256,
+                                                                                                    this->config.norm_eps,
+                                                                                                    this->config.timestep_scale);
+
+            for (int i = 0; i < this->config.num_refiner_layers; i++) {  // one noise / ref-image / context refiner per index; only the context refiner passes `false` last (NOTE(review): presumably the modulation switch, since forward() calls it without temb - confirm in BooguImageTransformerBlock)
+                blocks["noise_refiner." + std::to_string(i)]     = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
+                                                                                                            this->config.num_attention_heads,
+                                                                                                            this->config.num_kv_heads,
+                                                                                                            this->config.multiple_of,
+                                                                                                            this->config.norm_eps,
+                                                                                                            true);
+                blocks["ref_image_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
+                                                                                                                this->config.num_attention_heads,
+                                                                                                                this->config.num_kv_heads,
+                                                                                                                this->config.multiple_of,
+                                                                                                                this->config.norm_eps,
+                                                                                                                true);
+                blocks["context_refiner." + std::to_string(i)]   = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
+                                                                                                              this->config.num_attention_heads,
+                                                                                                              this->config.num_kv_heads,
+                                                                                                              this->config.multiple_of,
+                                                                                                              this->config.norm_eps,
+                                                                                                              false);
+            }
+
+            for (int i = 0; i < this->config.num_double_stream_layers; i++) {
+                blocks["double_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageDoubleStreamBlock>(this->config.hidden_size,
+                                                                                                                    this->config.num_attention_heads,
+                                                                                                                    this->config.num_kv_heads,
+                                                                                                                    this->config.multiple_of,
+                                                                                                                    this->config.norm_eps);
+            }
+
+            for (int i = 0; i < this->config.num_layers; i++) {  // num_layers counts the single-stream layers only
+                blocks["single_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
+                                                                                                                   this->config.num_attention_heads,
+                                                                                                                   this->config.num_kv_heads,
+                                                                                                                   this->config.multiple_of,
+                                                                                                                   this->config.norm_eps,
+                                                                                                                   true);
+            }
+
+            blocks["norm_out"] = std::make_shared<LuminaLayerNormContinuous>(this->config.hidden_size,
+                                                                             this->config.timestep_embed_dim,
+                                                                             this->config.patch_size * this->config.patch_size * this->config.out_channels);
+        }
+
+        ggml_tensor* image_index_embedding(GGMLRunnerContext* ctx, int index) {  // returns row `index` (0..4) of the embedding table as a view reshaped to [hidden_size, 1, 1]
+            GGML_ASSERT(index >= 0 && index < 5);
+            auto embedding = params["image_index_embedding"];
+            auto out       = ggml_view_1d(ctx->ggml_ctx,
+                                          embedding,
+                                          config.hidden_size,
+                                          index * config.hidden_size * ggml_element_size(embedding));  // byte offset of the selected row
+            out            = ggml_reshape_3d(ctx->ggml_ctx, out, config.hidden_size, 1, 1);
+            return out;
+        }
+
+        ggml_tensor* embed_refs(GGMLRunnerContext* ctx, const std::vector<ggml_tensor*>& ref_latents) {  // patchifies + embeds each reference latent, adds its image-index embedding, concatenates along dim 1; nullptr when there are no references
+            if (ref_latents.empty()) {
+                return nullptr;
+            }
+            auto ref_image_patch_embedder = std::dynamic_pointer_cast<Linear>(blocks["ref_image_patch_embedder"]);
+
+            ggml_tensor* ref_img = nullptr;
+            for (int i = 0; i < static_cast<int>(ref_latents.size()); i++) {
+                auto ref = DiT::pad_and_patchify(ctx, ref_latents[i], config.patch_size, config.patch_size, false);
+                ref      = ref_image_patch_embedder->forward(ctx, ref);
+                ref      = ggml_add(ctx->ggml_ctx, ref, image_index_embedding(ctx, std::min(i, 4)));  // references beyond the fifth all reuse index 4
+                ref_img  = ref_img == nullptr ? ref : ggml_concat(ctx->ggml_ctx, ref_img, ref, 1);
+            }
+            return ref_img;
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,  // builds the full graph: embed -> refiners -> double-stream -> single-stream -> norm_out -> unpatchify; returns the negated prediction for x
+                             ggml_tensor* x,
+                             ggml_tensor* timesteps,
+                             ggml_tensor* context,
+                             ggml_tensor* pe,
+                             std::vector<ggml_tensor*> ref_latents = {}) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+            int64_t N = x->ne[3];
+            GGML_ASSERT(N == 1);  // only x->ne[3] == 1 is supported
+
+            auto x_embedder         = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
+            auto time_caption_embed = std::dynamic_pointer_cast<LuminaCombinedTimestepCaptionEmbedding>(blocks["time_caption_embed"]);
+            auto norm_out           = std::dynamic_pointer_cast<LuminaLayerNormContinuous>(blocks["norm_out"]);
+
+            auto timestep = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, timesteps), timesteps);  // the model is fed 1 - timesteps
+            auto embeds   = time_caption_embed->forward(ctx, timestep, context);
+            auto temb     = embeds.first;
+            auto txt      = embeds.second;
+
+            auto img        = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, false);
+            int64_t img_len = img->ne[1];
+            img             = x_embedder->forward(ctx, img);
+            auto ref_img    = embed_refs(ctx, ref_latents);
+            int64_t ref_len = ref_img != nullptr ? ref_img->ne[1] : 0;
+            int64_t txt_len = txt->ne[1];
+
+            GGML_ASSERT(pe->ne[3] == txt_len + ref_len + img_len);  // pe must hold one entry per token, laid out along dim 3 as [txt | ref | img]
+            auto txt_pe   = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt_len);
+            auto noise_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len + ref_len, txt_len + ref_len + img_len);
+
+            for (int i = 0; i < config.num_refiner_layers; i++) {
+                auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
+                txt        = block->forward(ctx, txt, txt_pe);  // context refiners are called without temb
+                sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.context_refiner." + std::to_string(i), "txt");
+            }
+
+            for (int i = 0; i < config.num_refiner_layers; i++) {
+                auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
+                img        = block->forward(ctx, img, noise_pe, temb);
+                sd::ggml_graph_cut::mark_graph_cut(img, "boogu.noise_refiner." + std::to_string(i), "img");
+            }
+
+            ggml_tensor* combined_img = img;  // stays the plain noisy-image tokens when there are no references
+            if (ref_img != nullptr) {
+                auto ref_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + ref_len);
+                for (int i = 0; i < config.num_refiner_layers; i++) {
+                    auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["ref_image_refiner." + std::to_string(i)]);
+                    ref_img    = block->forward(ctx, ref_img, ref_pe, temb);
+                    sd::ggml_graph_cut::mark_graph_cut(ref_img, "boogu.ref_image_refiner." + std::to_string(i), "ref_img");
+                }
+                combined_img = ggml_concat(ctx->ggml_ctx, ref_img, img, 1);  // reference tokens precede the noisy-image tokens, matching the pe layout
+            }
+
+            auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + combined_img->ne[1]);  // positions of all image tokens (ref + noisy)
+            for (int i = 0; i < config.num_double_stream_layers; i++) {
+                auto block   = std::dynamic_pointer_cast<BooguImageDoubleStreamBlock>(blocks["double_stream_layers." + std::to_string(i)]);
+                auto result  = block->forward(ctx, combined_img, txt, pe, img_pe, temb);  // receives both the full pe and the image-only slice
+                combined_img = result.first;
+                txt          = result.second;
+                sd::ggml_graph_cut::mark_graph_cut(combined_img, "boogu.double_stream_layers." + std::to_string(i), "img");
+                sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.double_stream_layers." + std::to_string(i), "txt");
+            }
+
+            auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, combined_img, 1);  // single-stream sequence order: [txt | ref | img]
+            for (int i = 0; i < config.num_layers; i++) {
+                auto block    = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["single_stream_layers." + std::to_string(i)]);
+                hidden_states = block->forward(ctx, hidden_states, pe, temb);
+                sd::ggml_graph_cut::mark_graph_cut(hidden_states, "boogu.single_stream_layers." + std::to_string(i), "hidden_states");
+            }
+
+            hidden_states = norm_out->forward(ctx, hidden_states, temb);
+            hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, hidden_states->ne[1] - img_len, hidden_states->ne[1]);  // keep only the trailing img_len tokens, i.e. the noisy image
+            hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, false);
+            hidden_states = ggml_ext_scale(ctx->ggml_ctx, hidden_states, -1.f);  // the returned tensor is the negated network output
+            return hidden_states;
+        }
+    };
+
+    __STATIC_INLINE__ int patched_token_count(int64_t size, int patch_size) {  // tokens along one axis once `size` is padded up to a multiple of `patch_size`
+        const int extent = static_cast<int>(size);                             // narrow once; both uses below work on the int value
+        return (extent + (patch_size - extent % patch_size) % patch_size) / patch_size;
+    }
+
+    __STATIC_INLINE__ void append_spatial_ids(std::vector<std::vector<float>>& ids,  // appends `bs` copies of an h_tokens x w_tokens grid of (pe_shift, row, col) ids
+                                              int bs,
+                                              int pe_shift,
+                                              int h_tokens,
+                                              int w_tokens) {
+        std::vector<std::vector<float>> grid(h_tokens * w_tokens, std::vector<float>(3, 0.0f));
+        const float shift = static_cast<float>(pe_shift);
+        for (int row = 0; row < h_tokens; row++) {
+            for (int col = 0; col < w_tokens; col++) {
+                // Row-major cell; the first axis is the same shift for every cell of this image.
+                grid[row * w_tokens + col] = {shift, static_cast<float>(row), static_cast<float>(col)};
+            }
+        }
+        for (int copy = 0; copy < bs; copy++) {
+            ids.insert(ids.end(), grid.begin(), grid.end());
+        }
+    }
+
+    __STATIC_INLINE__ std::vector<float> gen_boogu_pe(int h,
+                                                      int w,
+                                                      int patch_size,
+                                                      int bs,
+                                                      int context_len,
+                                                      const std::vector<ggml_tensor*>& ref_latents,
+                                                      int theta,
+                                                      const std::vector<int>& axes_dim) {
+        // Position ids are emitted as [context | ref_0 .. ref_k | target image]; every id is a 3-vector.
+        std::vector<std::vector<float>> pos_ids;
+        pos_ids.reserve(static_cast<size_t>(bs) * context_len);
+        for (int batch = 0; batch < bs; batch++) {
+            for (int tok = 0; tok < context_len; tok++) {
+                // Context tokens carry their sequence index on all three axes.
+                pos_ids.emplace_back(3, static_cast<float>(tok));
+            }
+        }
+
+        // Each image gets a first-axis shift past the context and past all earlier references.
+        int shift = context_len;
+        for (size_t r = 0; r < ref_latents.size(); r++) {
+            const int rows = patched_token_count(ref_latents[r]->ne[1], patch_size);
+            const int cols = patched_token_count(ref_latents[r]->ne[0], patch_size);
+            append_spatial_ids(pos_ids, bs, shift, rows, cols);
+            shift += std::max(rows, cols);
+        }
+
+        append_spatial_ids(pos_ids, bs, shift, patched_token_count(h, patch_size), patched_token_count(w, patch_size));
+
+        return Rope::embed_nd(pos_ids, bs, static_cast<float>(theta), axes_dim);
+    }
+
+    struct BooguImageRunner : public DiffusionModelRunner {  // runner that owns a BooguImageModel, builds its compute graph and generates the position table fed to it
+        BooguConfig config;
+        BooguImageModel boogu;
+        std::vector<float> pe_vec;  // host copy of the position table handed to set_backend_tensor_data(); NOTE(review): presumably a member so the buffer outlives build_graph() - confirm
+
+        BooguImageRunner(ggml_backend_t backend,
+                         const String2TensorStorage& tensor_storage_map      = {},
+                         const std::string prefix                            = "",
+                         SDVersion version                                   = VERSION_BOOGU_IMAGE,
+                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(BooguConfig::detect_from_weights(tensor_storage_map, prefix)) {  // the configuration is derived from the tensor map; `version` is not read in this constructor
+            boogu = BooguImageModel(config);
+            boogu.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        std::string get_desc() override {
+            return "boogu_image";
+        }
+
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {  // forwards to the wrapped model
+            boogu.get_param_tensors(tensors, prefix);
+        }
+
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,  // creates graph inputs, generates the position table for this shape, and expands the model's forward graph
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor,
+                                 const std::vector<sd::Tensor<float>>& ref_latents_tensor = {}) {
+            ggml_cgraph* gf        = new_graph_custom(BOOGU_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
+            GGML_ASSERT(x->ne[3] == 1);
+            GGML_ASSERT(!context_tensor.empty());  // a context tensor is mandatory
+            ggml_tensor* context = make_input(context_tensor);
+
+            std::vector<ggml_tensor*> ref_latents;
+            ref_latents.reserve(ref_latents_tensor.size());
+            for (const auto& ref_latent_tensor : ref_latents_tensor) {
+                ref_latents.push_back(make_input(ref_latent_tensor));
+            }
+
+            pe_vec      = gen_boogu_pe(static_cast<int>(x->ne[1]),  // h comes from ne[1], w from ne[0]
+                                       static_cast<int>(x->ne[0]),
+                                       config.patch_size,
+                                       static_cast<int>(x->ne[3]),
+                                       static_cast<int>(context->ne[1]),
+                                       ref_latents,
+                                       config.theta,
+                                       config.axes_dim);
+            int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);  // the pe tensor below holds 2 * 2 * (axes_dim_sum / 2) floats per position
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            auto runner_ctx  = get_context();
+            ggml_tensor* out = boogu.forward(&runner_ctx, x, timesteps, context, pe, ref_latents);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        sd::Tensor<float> compute(int n_threads,  // builds the graph through a callback passed to GGMLRunner::compute and returns the result with x's dimensionality restored
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context,
+                                  const std::vector<sd::Tensor<float>>& ref_latents = {}) {
+            auto get_graph = [&]() -> ggml_cgraph* {
+                return build_graph(x, timesteps, context, ref_latents);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+
+        sd::Tensor<float> compute(int n_threads,  // DiffusionParams entry point: x and timesteps are required, ref_latents is optional
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            static const std::vector<sd::Tensor<float>> empty_ref_latents;  // fallback used when diffusion_params.ref_latents is null
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context),
+                           diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents);
+        }
+    };
+}  // namespace Boogu
+
+#endif  // __SD_MODEL_DIFFUSION_BOOGU_HPP__
diff --git a/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp b/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp
index 12fcada597ee..0427b3b384cc 100644
--- a/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp
+++ b/otherarch/sdcpp/src/model/diffusion/ernie_image.hpp
@@ -162,6 +162,8 @@ namespace ErnieImage {
             int64_t S = x->ne[1];
             int64_t N = x->ne[2];
 
+            float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
+
             auto q = to_q->forward(ctx, x);
             auto k = to_k->forward(ctx, x);
             auto v = to_v->forward(ctx, x);
@@ -182,7 +184,7 @@ namespace ErnieImage {
             k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, heads, S, head_dim]
             k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
 
-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled);  // [N, S, hidden_size]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled, scale);  // [N, S, hidden_size]
             x = to_out_0->forward(ctx, x);
             return x;
         }
diff --git a/otherarch/sdcpp/src/model/te/llm.hpp b/otherarch/sdcpp/src/model/te/llm.hpp
index 74dc232e5706..12daf5637f3c 100644
--- a/otherarch/sdcpp/src/model/te/llm.hpp
+++ b/otherarch/sdcpp/src/model/te/llm.hpp
@@ -79,6 +79,7 @@ namespace LLM {
         int window_size                     = 112;
         int num_position_embeddings         = 0;
         std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
+        bool split_patch_embed              = false;
     };
 
     struct LLMConfig {
@@ -179,7 +180,8 @@ namespace LLM {
                 config.num_experts_per_tok     = 4;
             }
 
-            config.num_layers = 0;
+            config.num_layers          = 0;
+            int detected_vision_layers = 0;
             for (const auto& [name, tensor_storage] : tensor_storage_map) {
                 if (!starts_with(name, prefix)) {
                     continue;
@@ -190,6 +192,38 @@ namespace LLM {
                     if (contains(name, "attn.q_proj")) {
                         config.llama_cpp_style = true;
                     }
+                    if (contains(name, "visual.patch_embed.proj.1.weight")) {
+                        config.vision.split_patch_embed = true;
+                    }
+                    if (contains(name, "visual.patch_embed.proj.0.weight")) {
+                        config.vision.patch_size  = static_cast<int>(tensor_storage.ne[0]);
+                        config.vision.in_channels = tensor_storage.ne[2];
+                        config.vision.hidden_size = tensor_storage.ne[3];
+                    }
+                    if (contains(name, "visual.patch_embed.bias")) {
+                        config.vision.hidden_size = tensor_storage.ne[0];
+                    }
+                    if (contains(name, "visual.pos_embed.weight")) {
+                        config.vision.hidden_size             = tensor_storage.ne[0];
+                        config.vision.num_position_embeddings = static_cast<int>(tensor_storage.ne[1]);
+                    }
+                    if (contains(name, "visual.blocks.")) {
+                        auto items = split_string(name.substr(pos), '.');
+                        if (items.size() > 2) {
+                            int block_index = atoi(items[2].c_str());
+                            if (block_index + 1 > detected_vision_layers) {
+                                detected_vision_layers = block_index + 1;
+                            }
+                        }
+                    }
+                    if (contains(name, "visual.blocks.0.mlp.linear_fc1.weight") ||
+                        contains(name, "visual.blocks.0.mlp.gate_proj.weight")) {
+                        config.vision.intermediate_size = tensor_storage.ne[1];
+                    }
+                    if (contains(name, "visual.merger.linear_fc2.weight") ||
+                        contains(name, "visual.merger.mlp.2.weight")) {
+                        config.vision.out_hidden_size = tensor_storage.ne[1];
+                    }
                     continue;
                 }
                 pos = name.find("layers.");
@@ -219,6 +253,9 @@ namespace LLM {
             if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
                 config.num_heads = 16;
             }
+            if (detected_vision_layers > 0) {
+                config.vision.num_layers = detected_vision_layers;
+            }
             LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
                       config.num_layers,
                       config.vocab_size,
@@ -539,40 +576,51 @@ namespace LLM {
 
     struct VisionPatchEmbed : public GGMLBlock {
     protected:
-        bool llama_cpp_style;
+        bool split_patch_embed;
+        bool bias;
         int patch_size;
         int temporal_patch_size;
         int64_t in_channels;
         int64_t embed_dim;
 
+        void init_params(ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         const std::string prefix                       = "") override {
+            GGML_UNUSED(tensor_storage_map);
+            GGML_UNUSED(prefix);
+            if (split_patch_embed && bias) {
+                params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
+            }
+        }
+
     public:
-        VisionPatchEmbed(bool llama_cpp_style,
+        VisionPatchEmbed(bool split_patch_embed,
                          LLMVisionArch arch,
                          int patch_size          = 14,
                          int temporal_patch_size = 2,
                          int64_t in_channels     = 3,
                          int64_t embed_dim       = 1152)
-            : llama_cpp_style(llama_cpp_style),
+            : split_patch_embed(split_patch_embed),
+              bias(arch == LLMVisionArch::QWEN3_VL),
               patch_size(patch_size),
               temporal_patch_size(temporal_patch_size),
               in_channels(in_channels),
               embed_dim(embed_dim) {
-            bool bias = arch == LLMVisionArch::QWEN3_VL;
-            if (llama_cpp_style) {
+            if (split_patch_embed) {
                 blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
                                                                          embed_dim,
                                                                          {patch_size, patch_size},
                                                                          {patch_size, patch_size},
                                                                          {0, 0},
                                                                          {1, 1},
-                                                                         bias));
+                                                                         false));
                 blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
                                                                          embed_dim,
                                                                          {patch_size, patch_size},
                                                                          {patch_size, patch_size},
                                                                          {0, 0},
                                                                          {1, 1},
-                                                                         bias));
+                                                                         false));
             } else {
                 std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
                 blocks["proj"]                        = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
@@ -593,7 +641,7 @@ namespace LLM {
                                 temporal_patch_size,
                                 ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size));
 
-            if (llama_cpp_style) {
+            if (split_patch_embed) {
                 auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
                 auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);
 
@@ -606,6 +654,10 @@ namespace LLM {
                 x1      = proj_1->forward(ctx, x1);
 
                 x = ggml_add(ctx->ggml_ctx, x0, x1);
+                if (bias) {
+                    auto b = ggml_reshape_4d(ctx->ggml_ctx, params["bias"], 1, 1, embed_dim, 1);
+                    x      = ggml_add_inplace(ctx->ggml_ctx, x, b);
+                }
             } else {
                 auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);
 
@@ -798,7 +850,7 @@ namespace LLM {
               spatial_merge_size(vision_params.spatial_merge_size),
               num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
               fullatt_block_indexes(vision_params.fullatt_block_indexes) {
-            blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(llama_cpp_style,
+            blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(vision_params.split_patch_embed,
                                                                                     arch_,
                                                                                     vision_params.patch_size,
                                                                                     vision_params.temporal_patch_size,
diff --git a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp
index 478b18edb69f..51e1feda72bf 100644
--- a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp
+++ b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp
@@ -682,7 +682,7 @@ struct AutoEncoderKL : public VAE {
         } else if (sd_version_is_sd3(version)) {
             scale_factor = 1.5305f;
             shift_factor = 0.0609f;
-        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
+        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
             scale_factor = 0.3611f;
             shift_factor = 0.1159f;
         } else if (sd_version_uses_flux2_vae(version)) {
diff --git a/otherarch/sdcpp/src/model_loader.cpp b/otherarch/sdcpp/src/model_loader.cpp
index a1788bfbdd25..5c2d57cdec5b 100644
--- a/otherarch/sdcpp/src/model_loader.cpp
+++ b/otherarch/sdcpp/src/model_loader.cpp
@@ -513,6 +513,9 @@ SDVersion ModelLoader::get_sd_version() {
         if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
             return VERSION_Z_IMAGE;
         }
+        if (tensor_storage.name.find("double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight") != std::string::npos) {
+            return VERSION_BOOGU_IMAGE;
+        }
         if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
             return VERSION_ERNIE_IMAGE;
         }
diff --git a/otherarch/sdcpp/src/name_conversion.cpp b/otherarch/sdcpp/src/name_conversion.cpp
index 4b7b4008df2b..da2a8d5eda0f 100644
--- a/otherarch/sdcpp/src/name_conversion.cpp
+++ b/otherarch/sdcpp/src/name_conversion.cpp
@@ -184,6 +184,27 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
     return name;
 }
 
+// Rewrites a Qwen3-VL vision tensor name through replace_with_name_map and the table below.
+std::string convert_qwen3_vl_vision_name(std::string name) {
+    // Listed order is kept: "v.patch_embd.weight.1" precedes its prefix "v.patch_embd.weight".
+    static const std::vector<std::pair<std::string, std::string>> vision_renames{
+        {"mm.0.", "merger.linear_fc1."}, {"mm.2.", "merger.linear_fc2."},
+        {"v.post_ln.", "merger.norm."},
+        {"v.position_embd.weight", "pos_embed.weight"},
+        {"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
+        {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
+        {"v.patch_embd.bias", "patch_embed.bias"},
+        {"v.blk.", "blocks."},
+        {"attn_qkv.", "attn.qkv."},
+        {"attn_out.", "attn.proj."},
+        {"ffn_up.", "mlp.linear_fc1."},
+        {"ffn_down.", "mlp.linear_fc2."},
+        {"ln1.", "norm1."}, {"ln2.", "norm2."},
+    };
+    replace_with_name_map(name, vision_renames);
+    return name;
+}
+
 // ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
 std::string convert_diffusers_unet_to_original_sd1(std::string name) {
     // (stable-diffusion, HF Diffusers)
@@ -1154,6 +1175,10 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
 
     replace_with_prefix_map(name, prefix_map);
 
+    if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) {
+        name = convert_qwen3_vl_vision_name(std::move(name));
+    }
+
     // diffusion model
     {
         for (const auto& prefix : diffuison_model_prefix_vec) {
diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp
index 99e0709097d6..cd4705696cf1 100644
--- a/otherarch/sdcpp/src/stable-diffusion.cpp
+++ b/otherarch/sdcpp/src/stable-diffusion.cpp
@@ -22,6 +22,7 @@
 #include "extensions/generation_extension.h"
 #include "model/adapter/lora.hpp"
 #include "model/diffusion/anima.hpp"
+#include "model/diffusion/boogu.hpp"
 #include "model/diffusion/control.hpp"
 #include "model/diffusion/ernie_image.hpp"
 #include "model/diffusion/flux.hpp"
@@ -89,6 +90,7 @@ const char* model_version_to_str[] = {
     "LTXAV",
     "HiDream O1",
     "Z-Image",
+    "Boogu Image",
     "Ovis Image",
     "Ernie Image",
     "Lens",
@@ -126,7 +128,8 @@ static bool sd_version_supports_ref_latent_img_cfg(SDVersion version) {
            sd_version_is_flux2(version) ||
            sd_version_is_qwen_image(version) ||
            sd_version_is_longcat(version) ||
-           sd_version_is_z_image(version);
+           sd_version_is_z_image(version) ||
+           sd_version_is_boogu_image(version);
 }
 
 static bool sd_version_supports_img_cfg(SDVersion version, bool has_ref_images) {
@@ -762,9 +765,7 @@ class StableDiffusionGGML {
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
 
         LOG_INFO("Version: %s ", model_version_to_str[version]);
-        ggml_type wtype               = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
-                                            ? (ggml_type)sd_ctx_params->wtype
-                                            : GGML_TYPE_COUNT;
+        ggml_type wtype               = sd_type_to_ggml_type(sd_ctx_params->wtype);
         std::string tensor_type_rules = SAFE_STR(sd_ctx_params->tensor_type_rules);
         //kcpp: patch hidream to fix broken images on vulkan https://github.com/leejet/stable-diffusion.cpp/issues/1496
         if(version == VERSION_HIDREAM_O1 && tensor_type_rules.size()==0)
@@ -1031,6 +1032,18 @@ class StableDiffusionGGML {
                                                                          "model.diffusion_model",
                                                                          version,
                                                                          model_manager);
+            } else if (sd_version_is_boogu_image(version)) {
+                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
+                                                                 tensor_storage_map,
+                                                                 version,
+                                                                 "",
+                                                                 true,
+                                                                 model_manager);
+                diffusion_model  = std::make_shared<Boogu::BooguImageRunner>(backend_for(SDBackendModule::DIFFUSION),
+                                                                            tensor_storage_map,
+                                                                            "model.diffusion_model",
+                                                                            version,
+                                                                            model_manager);
             } else if (sd_version_is_ernie_image(version)) {
                 cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
                                                                  tensor_storage_map,
@@ -1467,6 +1480,7 @@ class StableDiffusionGGML {
                            sd_version_is_anima(version) ||
                            sd_version_is_ernie_image(version) ||
                            sd_version_is_z_image(version) ||
+                           sd_version_is_boogu_image(version) ||
                            sd_version_is_pid(version) ||
                            sd_version_is_ideogram4(version)) {
                     pred_type = FLOW_PRED;
@@ -1478,6 +1492,8 @@ class StableDiffusionGGML {
                         default_flow_shift = 1.5f;
                     } else if (sd_version_is_ideogram4(version)) {
                         default_flow_shift = 1.0f;
+                    } else if (sd_version_is_boogu_image(version)) {
+                        default_flow_shift = 3.16f;
                     } else {
                         default_flow_shift = 3.f;
                     }
@@ -1957,7 +1973,7 @@ class StableDiffusionGGML {
                 if (sd_version_is_sd3(version)) {
                     latent_rgb_proj = sd3_latent_rgb_proj;
                     latent_rgb_bias = sd3_latent_rgb_bias;
-                } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
+                } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
                     latent_rgb_bias = flux_latent_rgb_bias;
                 } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
@@ -2052,6 +2068,9 @@ class StableDiffusionGGML {
         if (sd_version_is_anima(version)) {
             return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
         }
+        if (sd_version_is_boogu_image(version)) {
+            return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
+        }
         if (version == VERSION_HIDREAM_O1) {
             return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
         }
diff --git a/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp b/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp
index 896975a217cf..7733f00d36b7 100644
--- a/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp
+++ b/otherarch/sdcpp/src/tokenizers/bpe_tokenizer.cpp
@@ -134,7 +134,8 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
     std::vector<int32_t> bpe_tokens;
     std::vector<std::string> token_strs;
 
-    auto splited_texts = split_with_special_tokens(text, special_tokens);
+    std::string normalized_text = normalize_before_split ? normalize(text) : text;
+    auto splited_texts          = split_with_special_tokens(normalized_text, special_tokens);
 
     for (auto& splited_text : splited_texts) {
         if (is_special_token(splited_text)) {
@@ -159,7 +160,7 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
                 }
             }
 
-            std::string token_str = normalize(token);
+            std::string token_str = normalize_before_split ? token : normalize(token);
             std::u32string utf32_token;
             if (byte_level_bpe) {
                 for (int i = 0; i < token_str.length(); i++) {
diff --git a/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp b/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp
index d11c18c31287..d51eadec4022 100644
--- a/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp
+++ b/otherarch/sdcpp/src/tokenizers/clip_tokenizer.cpp
@@ -22,9 +22,10 @@ CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_st
     EOS_TOKEN_ID = 49407;
     PAD_TOKEN_ID = pad_token_id;
 
-    end_of_word_suffix = "</w>";
-    add_bos_token      = true;
-    add_eos_token      = true;
+    end_of_word_suffix     = "</w>";
+    add_bos_token          = true;
+    add_eos_token          = true;
+    normalize_before_split = true;
 
     if (merges_utf8_str.size() > 0) {
         load_from_merges(merges_utf8_str);
diff --git a/otherarch/sdcpp/src/tokenizers/tokenizer.h b/otherarch/sdcpp/src/tokenizers/tokenizer.h
index e044285bbd76..893759e0fba5 100644
--- a/otherarch/sdcpp/src/tokenizers/tokenizer.h
+++ b/otherarch/sdcpp/src/tokenizers/tokenizer.h
@@ -12,9 +12,10 @@ using on_new_token_cb_t = std::function<bool(std::string&, std::vector<int32_t>&
 class Tokenizer {
 protected:
     std::vector<std::string> special_tokens;
-    bool add_bos_token = false;
-    bool add_eos_token = false;
-    bool pad_left      = false;
+    bool add_bos_token          = false;
+    bool add_eos_token          = false;
+    bool pad_left               = false;
+    bool normalize_before_split = false;
     std::string end_of_word_suffix;
 
     virtual std::string decode_token(int token_id) const = 0;

From eb7340014a61098259a20b15ae610abb23db32fc Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sun, 21 Jun 2026 18:39:16 -0300
Subject: [PATCH 2/6] sd: support for boogu and longcat edit

---
 otherarch/sdcpp/src/stable-diffusion.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp
index cd4705696cf1..079f8fb63155 100644
--- a/otherarch/sdcpp/src/stable-diffusion.cpp
+++ b/otherarch/sdcpp/src/stable-diffusion.cpp
@@ -491,7 +491,9 @@ class StableDiffusionGGML {
         bool is_lens = sd_version_is_lens(tempver);
         bool is_ltx = sd_version_is_ltxav(tempver);
         bool is_ideogram = sd_version_is_ideogram4(tempver);
-        bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram);
+        bool is_boogu = sd_version_is_boogu_image(tempver);
+        bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram || is_boogu);
+        bool has_llm_vision = (is_qwenimg || is_longcat || is_boogu);
 
         //kcpp qol fallback: if a llm was loaded as t5 by mistake
         if(conditioner_is_llm && t5_path_fixed!="")
@@ -542,7 +544,7 @@ class StableDiffusionGGML {
                 clip_vision_fixed = clipg_path_fixed;
                 clipg_path_fixed = "";
             }
-            else if(is_qwenimg && llm_vision_path_fixed=="")
+            else if(has_llm_vision && llm_vision_path_fixed=="")
             {
                 llm_vision_path_fixed = clipg_path_fixed;
                 clipg_path_fixed = "";
@@ -584,7 +586,7 @@ class StableDiffusionGGML {
             {
                 to_replace = "taesd_xl.embd";
             }
-            else if(sd_version_is_flux(tempver)||sd_version_is_z_image(tempver)||tempver == VERSION_OVIS_IMAGE||is_longcat)
+            else if(sd_version_is_flux(tempver)||sd_version_is_z_image(tempver)||tempver == VERSION_OVIS_IMAGE||is_longcat||is_boogu)
             {
                 to_replace = "taesd_f.embd";
             }

From bb1bed614de9f6e00fa48233afde6d6ea1a9a565 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sun, 21 Jun 2026 18:41:18 -0300
Subject: [PATCH 3/6] sd: remove SD_TYPE_COUNT == GGML_TYPE_COUNT assertion

The current code should be able to deal with an out-of-sync ggml.
---
 otherarch/sdcpp/src/stable-diffusion.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp
index 079f8fb63155..9b788c48261e 100644
--- a/otherarch/sdcpp/src/stable-diffusion.cpp
+++ b/otherarch/sdcpp/src/stable-diffusion.cpp
@@ -6078,9 +6078,6 @@ SD_API void free_sd_images(sd_image_t* result_images, int num_images) {
 
 namespace kcpp_sd {
 
-    static_assert((int)SD_TYPE_COUNT == (int)GGML_TYPE_COUNT,
-            "inconsistency between SD_TYPE_COUNT and GGML_TYPE_COUNT");
-
     int get_loaded_sd_version(sd_ctx_t* ctx) {
         return ctx->sd->version;
     }

From 80742da33f7da9be0914696d7e51aec8d61009e7 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sun, 21 Jun 2026 19:58:34 -0300
Subject: [PATCH 4/6] sd: generalize edit mode support

---
 otherarch/sdcpp/sdtype_adapter.cpp       | 4 ++--
 otherarch/sdcpp/src/kcpp_sd_extensions.h | 2 ++
 otherarch/sdcpp/src/stable-diffusion.cpp | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index 4880c3ba8f08..2a5c61f40173 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -972,7 +972,7 @@ static sd_audio_t load_audio_from_b64(const std::string& b64audio) {
 
 bool supports_reference_images(kcpp_sd::model_info info)
 {
-    bool supported = (info.is_wan || info.is_ltx || info.is_qwenimg || info.is_flux2 || info.is_kontext || photomaker_enabled);
+    bool supported = (info.is_wan || info.is_ltx || info.supports_ref_image || info.is_kontext || photomaker_enabled);
     return supported;
 }
 
@@ -1161,7 +1161,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                         wan_imgs.push_back(extraimage_reference);
                     }
                 }
-                else if(info.is_qwenimg || info.is_flux2)
+                else if(info.supports_ref_image)
                 {
                     uint8_t * loaded = load_image_from_b64(extra_image_data[i],nx2,ny2);
                     if(loaded)
diff --git a/otherarch/sdcpp/src/kcpp_sd_extensions.h b/otherarch/sdcpp/src/kcpp_sd_extensions.h
index c65c11c35b29..0e0c6795ad84 100644
--- a/otherarch/sdcpp/src/kcpp_sd_extensions.h
+++ b/otherarch/sdcpp/src/kcpp_sd_extensions.h
@@ -19,6 +19,8 @@ namespace kcpp_sd {
         bool is_wan;
         bool is_zimage;
         bool is_ltx;
+        bool is_boogu;
+        bool supports_ref_image;
         int vae_scale_factor;
         int spatial_multiple;
     };
diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp
index 9b788c48261e..f50bdbb1e1c6 100644
--- a/otherarch/sdcpp/src/stable-diffusion.cpp
+++ b/otherarch/sdcpp/src/stable-diffusion.cpp
@@ -6112,6 +6112,8 @@ namespace kcpp_sd {
         res.is_sd2 = (loadedsdver == SDVersion::VERSION_SD2);
         res.is_sdxl = sd_version_is_sdxl((SDVersion)loadedsdver);
         res.is_ltx = sd_version_is_ltxav((SDVersion)loadedsdver);
+        res.is_boogu = sd_version_is_boogu_image((SDVersion)loadedsdver);
+        res.supports_ref_image = sd_version_supports_ref_latent_img_cfg((SDVersion)loadedsdver);
         res.vae_scale_factor = ctx->sd->get_vae_scale_factor();
         res.spatial_multiple = get_spatial_multiple(ctx);
         return res;

From 4786837748e40700eae53c3c2bc565ebdd4d0570 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Mon, 22 Jun 2026 18:52:18 -0300
Subject: [PATCH 5/6] sd: sync with master-719-f440ad9

---
 otherarch/sdcpp/examples/common/common.cpp    |  59 +++++++++-
 otherarch/sdcpp/examples/common/common.h      |   1 +
 otherarch/sdcpp/include/stable-diffusion.h    |   1 +
 otherarch/sdcpp/src/model.h                   |   7 ++
 .../sdcpp/src/model/vae/auto_encoder_kl.hpp   |   2 +-
 otherarch/sdcpp/src/model_manager.cpp         |  13 ++-
 otherarch/sdcpp/src/model_manager.h           |   3 +
 otherarch/sdcpp/src/runtime/guidance.cpp      | 102 ++++++++++++++++--
 otherarch/sdcpp/src/runtime/guidance.h        |  16 ++-
 otherarch/sdcpp/src/stable-diffusion.cpp      |  51 ++++++++-
 10 files changed, 229 insertions(+), 26 deletions(-)

diff --git a/otherarch/sdcpp/examples/common/common.cpp b/otherarch/sdcpp/examples/common/common.cpp
index ad3f97a08080..3f357512e66c 100644
--- a/otherarch/sdcpp/examples/common/common.cpp
+++ b/otherarch/sdcpp/examples/common/common.cpp
@@ -6,6 +6,7 @@
 #include <cstdlib>
 #include <ctime>
 #include <filesystem>
+#include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <regex>
@@ -260,15 +261,15 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
                         invalid_arg = true;
                         return;
                     }
-                    if(option.concat && !option.target->empty()){
-                        if(option.concat > 0 && option.concat <= 0xff){
+                    if (option.concat && !option.target->empty()) {
+                        if (option.concat > 0 && option.concat <= 0xff) {
                             *option.target += static_cast<char>(option.concat);
                         }
                         *option.target += argv_to_utf8(i, argv);
                     } else {
                         *option.target = argv_to_utf8(i, argv);
                     }
-                    found_arg      = true;
+                    found_arg = true;
                 }))
                 break;
 
@@ -496,6 +497,10 @@ ArgOptions SDContextParams::get_options() {
          "--stream-layers",
          "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
          true, &stream_layers},
+        {"",
+         "--eager-load",
+         "load all params into the params backend at model-load time instead of lazily on first use (defaults to false)",
+         true, &eager_load},
         {"",
          "--force-sdxl-vae-conv-scale",
          "force use of conv scale on sdxl vae",
@@ -799,6 +804,7 @@ std::string SDContextParams::to_string() const {
         << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
         << "  max_vram: \"" << max_vram << "\",\n"
         << "  stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
+        << "  eager_load: " << (eager_load ? "true" : "false") << ",\n"
         << "  backend: \"" << backend << "\",\n"
         << "  params_backend: \"" << params_backend << "\",\n"
         << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
@@ -878,6 +884,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
     sd_ctx_params.vae_format                      = str_to_vae_format(vae_format);
     sd_ctx_params.max_vram                        = max_vram.c_str();
     sd_ctx_params.stream_layers                   = stream_layers;
+    sd_ctx_params.eager_load                      = eager_load;
     sd_ctx_params.backend                         = effective_backend.c_str();
     sd_ctx_params.params_backend                  = effective_params_backend.c_str();
     sd_ctx_params.rpc_servers                     = rpc_servers.c_str();
@@ -953,7 +960,7 @@ ArgOptions SDGenerationParams::get_options() {
          &hires_upscaler},
         {"",
          "--extra-sample-args",
-         "extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma",
+         "extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma;",
          (int)',',
          &extra_sample_args},
         {"",
@@ -1415,6 +1422,42 @@ ArgOptions SDGenerationParams::get_options() {
         return 1;
     };
 
+    // --prompt-file handler: loads the file (binary mode) into `prompt`; returns 1, or -1 on error.
+    auto on_prompt_file_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        std::ifstream prompt_file(argv[index], std::ios::binary);
+        try {
+            prompt = std::string(std::istreambuf_iterator<char>{prompt_file}, {});
+        } catch (const std::ios_base::failure&) {
+            prompt_file.setstate(std::ios_base::failbit);
+        }
+        if (prompt_file.fail()) {
+            LOG_ERROR("error: failed to read prompt file '%s'\n", argv[index]);
+            return -1;
+        }
+        return 1;
+    };
+
+    // --negative-prompt-file handler: loads the file into `negative_prompt`; returns 1, or -1 on error.
+    auto on_negative_prompt_file_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        std::ifstream prompt_file(argv[index], std::ios::binary);
+        try {
+            negative_prompt = std::string(std::istreambuf_iterator<char>{prompt_file}, {});
+        } catch (const std::ios_base::failure&) {
+            prompt_file.setstate(std::ios_base::failbit);
+        }
+        if (prompt_file.fail()) {
+            LOG_ERROR("error: failed to read negative prompt file '%s'\n", argv[index]);
+            return -1;
+        }
+        return 1;
+    };
+
     options.manual_options = {
         {"-s",
          "--seed",
@@ -1478,6 +1521,14 @@ ArgOptions SDGenerationParams::get_options() {
          "--vae-relative-tile-size",
          "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
          on_relative_tile_size_arg},
+        {"",
+         "--prompt-file",
+         "path to the file containing the prompt to render",
+         on_prompt_file_arg},
+        {"",
+         "--negative-prompt-file",
+         "path to the file containing the negative prompt",
+         on_negative_prompt_file_arg},
 
     };
 
diff --git a/otherarch/sdcpp/examples/common/common.h b/otherarch/sdcpp/examples/common/common.h
index 587cad29f699..e7c25015bfa1 100644
--- a/otherarch/sdcpp/examples/common/common.h
+++ b/otherarch/sdcpp/examples/common/common.h
@@ -148,6 +148,7 @@ struct SDContextParams {
     bool offload_params_to_cpu  = false;
     std::string max_vram        = "0";
     bool stream_layers          = false;
+    bool eager_load             = false;
     std::string backend;
     std::string params_backend;
     std::string rpc_servers;
diff --git a/otherarch/sdcpp/include/stable-diffusion.h b/otherarch/sdcpp/include/stable-diffusion.h
index bfcd909ccaf2..8772865daadb 100644
--- a/otherarch/sdcpp/include/stable-diffusion.h
+++ b/otherarch/sdcpp/include/stable-diffusion.h
@@ -219,6 +219,7 @@ typedef struct {
     enum sd_vae_format_t vae_format;
     const char* max_vram;  // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
     bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
+    bool eager_load;  // Load all params into the params backend at model-load time instead of lazily on first use
     const char* backend;
     const char* params_backend;
     const char* rpc_servers;
diff --git a/otherarch/sdcpp/src/model.h b/otherarch/sdcpp/src/model.h
index 17272f7d69d9..d02ed65b8f18 100644
--- a/otherarch/sdcpp/src/model.h
+++ b/otherarch/sdcpp/src/model.h
@@ -186,6 +186,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
     return false;
 }
 
+// Reports whether `version` belongs to the families grouped as Flux VAE users here:
+// flux, z-image, boogu-image and longcat.
+static inline bool sd_version_uses_flux_vae(SDVersion version) {
+    return sd_version_is_flux(version) || sd_version_is_z_image(version) ||
+           sd_version_is_boogu_image(version) || sd_version_is_longcat(version);
+}
+
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
     if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
         return true;
diff --git a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp
index 51e1feda72bf..e41f5fd46a44 100644
--- a/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp
+++ b/otherarch/sdcpp/src/model/vae/auto_encoder_kl.hpp
@@ -682,7 +682,7 @@ struct AutoEncoderKL : public VAE {
         } else if (sd_version_is_sd3(version)) {
             scale_factor = 1.5305f;
             shift_factor = 0.0609f;
-        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
+        } else if (sd_version_uses_flux_vae(version)) {
             scale_factor = 0.3611f;
             shift_factor = 0.1159f;
         } else if (sd_version_uses_flux2_vae(version)) {
diff --git a/otherarch/sdcpp/src/model_manager.cpp b/otherarch/sdcpp/src/model_manager.cpp
index 5287e1069af2..7095ec6a96f8 100644
--- a/otherarch/sdcpp/src/model_manager.cpp
+++ b/otherarch/sdcpp/src/model_manager.cpp
@@ -147,6 +147,17 @@ bool ModelManager::register_param_tensors(const std::string& desc,
     return true;
 }
 
+bool ModelManager::load_all_params_eagerly() {
+    std::vector<TensorState*> loadable;  // raw pointers to every non-null registered state
+    loadable.reserve(tensor_states_.size());
+    for (const auto& owner : tensor_states_) {
+        if (auto* state = owner.get()) {
+            loadable.push_back(state);
+        }
+    }
+    return load_tensors_to_params_backend(loadable);  // one load call covering all of them
+}
+
 bool ModelManager::validate_registered_tensors() {
     bool ok = true;
     for (const auto& state : tensor_states_) {
@@ -469,7 +480,7 @@ bool ModelManager::mmap_params(const std::vector<TensorState*>& states,
         return true;
     }
 
-    auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, true);
+    auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, writable_mmap_);
     if (mmap_store.empty()) {
         return true;
     }
diff --git a/otherarch/sdcpp/src/model_manager.h b/otherarch/sdcpp/src/model_manager.h
index 1a414c15cd37..9225e3ea6935 100644
--- a/otherarch/sdcpp/src/model_manager.h
+++ b/otherarch/sdcpp/src/model_manager.h
@@ -69,6 +69,7 @@ class ModelManager : public RunnerWeightManager {
     uint64_t current_lora_epoch_ = 0;
     int n_threads_               = 0;
     bool enable_mmap_            = false;
+    bool writable_mmap_          = false;
 
     void finish_compute_backend_usage(const std::vector<TensorState*>& states);
     void release_all();
@@ -110,6 +111,7 @@ class ModelManager : public RunnerWeightManager {
         model_loader_.set_n_threads(n_threads);
     }
     void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; }
+    void set_writable_mmap(bool writable_mmap) { writable_mmap_ = writable_mmap; }
     void set_common_ignore_tensors(std::set<std::string> ignore_tensors);
     void set_loras(std::vector<LoraSpec> loras, SDVersion version);
 
@@ -158,6 +160,7 @@ class ModelManager : public RunnerWeightManager {
     }
 
     bool validate_registered_tensors();
+    bool load_all_params_eagerly();
 
     bool prepare_params(const std::vector<ggml_tensor*>& tensors) override;
     void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) override;
diff --git a/otherarch/sdcpp/src/runtime/guidance.cpp b/otherarch/sdcpp/src/runtime/guidance.cpp
index f925b4b8c9f8..bfb773b0a14c 100644
--- a/otherarch/sdcpp/src/runtime/guidance.cpp
+++ b/otherarch/sdcpp/src/runtime/guidance.cpp
@@ -3,6 +3,7 @@
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
+#include <optional>
 #include <string>
 #include <utility>
 
@@ -63,6 +64,82 @@ namespace sd::guidance {
         return uncond;
     }
 
+    // Expands "<guidance>x<count>[+<guidance>x<count>...]" into a flat list with each guidance
+    // repeated count times, e.g. "7.5x2+3x1" -> {7.5, 7.5, 3}; a bad segment is logged and yields {}.
+    std::vector<float> parse_guidance_schedule_from_spec(std::string spec) {
+        std::vector<float> schedule;
+
+        while (!spec.empty()) {
+            const auto plus_pos    = spec.find('+');
+            const std::string part = spec.substr(0, plus_pos);
+
+            const auto x_pos = part.find('x');
+            if (x_pos == std::string::npos) {
+                LOG_ERROR("Invalid guidance schedule segment: '%s' (expected <guidance>x<count>)", part.c_str());
+                return {};
+            }
+
+            const std::string guidance_str = part.substr(0, x_pos);
+            const std::string count_str    = part.substr(x_pos + 1);
+
+            // Each number must consume its whole substring; a partial parse is an error.
+            float guidance   = 0.0f;
+            bool guidance_ok = false;
+            try {
+                size_t used = 0;
+                guidance    = std::stof(guidance_str, &used);
+                guidance_ok = (used == guidance_str.size());
+            } catch (const std::exception&) {
+            }
+            if (!guidance_ok) {
+                LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str());
+                return {};
+            }
+
+            int count     = 0;
+            bool count_ok = false;
+            try {
+                size_t used = 0;
+                count       = std::stoi(count_str, &used);
+                count_ok    = (used == count_str.size());
+            } catch (const std::exception&) {
+            }
+            if (!count_ok) {
+                LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str());
+                return {};
+            }
+
+            if (count <= 0) {
+                LOG_ERROR("Guidance schedule count must be positive");
+                return {};
+            }
+
+            schedule.insert(schedule.end(), count, guidance);
+            if (plus_pos == std::string::npos) {
+                break;
+            }
+            spec.erase(0, plus_pos + 1);
+        }
+
+        return schedule;
+    }
+
+    // Looks up "guidance_schedule" in the extra sample args and expands its spec; the last
+    // matching entry iterated wins, and a missing or empty value gives an empty schedule.
+    std::vector<float> parse_guidance_schedule(const char* extra_sample_args) {
+        std::string schedule_spec;
+        for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "extra sample arg")) {
+            if (key == "guidance_schedule") {
+                schedule_spec = value;
+            }
+        }
+
+        if (schedule_spec.empty()) {
+            return {};
+        }
+        return parse_guidance_schedule_from_spec(schedule_spec);
+    }
+
     ClassifierFreeGuidance::ClassifierFreeGuidance(float guidance_scale,
                                                    float image_guidance_scale)
         : guidance_scale_(guidance_scale),
@@ -70,8 +147,10 @@ namespace sd::guidance {
     }
 
     GuiderOutput ClassifierFreeGuidance::forward(const GuidanceInput& input,
-                                                 GuiderOutput previous) const {
+                                                 GuiderOutput previous,
+                                                 std::optional<float> scale_override) const {
         (void)previous;
+        float guidance_scale = scale_override.value_or(guidance_scale_);
 
         GuiderOutput output;
         if (!has_tensor(input.pred_cond)) {
@@ -86,14 +165,14 @@ namespace sd::guidance {
                 const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
                 output.pred                              = pred_img_uncond +
                               image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
-                              guidance_scale_ * (pred_cond - pred_uncond);
+                              guidance_scale * (pred_cond - pred_uncond);
 
             } else {
-                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
+                output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
             }
         } else if (has_tensor(input.pred_img_uncond)) {
             const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
-            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
+            output.pred                              = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
         }
 
         return output;
@@ -128,8 +207,10 @@ namespace sd::guidance {
     }
 
     GuiderOutput AdaptiveProjectedGuidance::forward(const GuidanceInput& input,
-                                                    GuiderOutput previous) const {
+                                                    GuiderOutput previous,
+                                                    std::optional<float> scale_override) const {
         (void)previous;
+        float guidance_scale = scale_override.value_or(guidance_scale_);
 
         GuiderOutput output;
         if (!has_tensor(input.pred_cond)) {
@@ -144,13 +225,13 @@ namespace sd::guidance {
                 const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
                 output.pred                              = pred_img_uncond +
                               image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
-                              guidance_scale_ * (pred_cond - pred_uncond);
+                              guidance_scale * (pred_cond - pred_uncond);
             } else {
-                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
+                output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
             }
         } else if (has_tensor(input.pred_img_uncond)) {
             const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
-            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
+            output.pred                              = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
         }
         if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_uncond)) {
             return output;
@@ -162,7 +243,7 @@ namespace sd::guidance {
         sd::Tensor<float> deltas = calculate_guidance_delta(pred_cond,
                                                             pred_uncond,
                                                             pred_img_uncond,
-                                                            guidance_scale_,
+                                                            guidance_scale,
                                                             image_guidance_scale_);
         if (params_.momentum != 0.0f) {
             if (momentum_buffer_.shape() != deltas.shape()) {
@@ -239,7 +320,8 @@ namespace sd::guidance {
     }
 
     GuiderOutput SkipLayerGuidance::forward(const GuidanceInput& input,
-                                            GuiderOutput output) const {
+                                            GuiderOutput output,
+                                            std::optional<float> /*scale_override*/) const {
         if (scale_ == 0.0f || !is_enabled_for_step(input) || !input.predict_skip_layer) {
             return output;
         }
diff --git a/otherarch/sdcpp/src/runtime/guidance.h b/otherarch/sdcpp/src/runtime/guidance.h
index aeba06fd0739..3de337042d33 100644
--- a/otherarch/sdcpp/src/runtime/guidance.h
+++ b/otherarch/sdcpp/src/runtime/guidance.h
@@ -3,6 +3,7 @@
 
 #include <cstddef>
 #include <functional>
+#include <optional>
 #include <vector>
 
 #include "core/tensor.hpp"
@@ -27,6 +28,7 @@ namespace sd::guidance {
     AdaptiveProjectedGuidanceParams parse_adaptive_projected_guidance_args(const char* extra_sample_args);
     bool is_adaptive_projected_guidance_enabled(const AdaptiveProjectedGuidanceParams& params);
     bool parse_skip_layer_guidance_uncond_arg(const char* extra_sample_args);
+    std::vector<float> parse_guidance_schedule(const char* extra_sample_args);
 
     struct GuidanceInput {
         int step                                 = 0;
@@ -40,9 +42,10 @@ namespace sd::guidance {
 
     class BaseGuidance {
     public:
-        virtual ~BaseGuidance()                                   = default;
+        virtual ~BaseGuidance()                                                                = default;
         virtual GuiderOutput forward(const GuidanceInput& input,
-                                     GuiderOutput previous) const = 0;
+                                     GuiderOutput previous,
+                                     std::optional<float> scale_override = std::nullopt) const = 0;
     };
 
     class ClassifierFreeGuidance : public BaseGuidance {
@@ -54,7 +57,8 @@ namespace sd::guidance {
                                float image_guidance_scale);
 
         GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous) const override;
+                             GuiderOutput previous,
+                             std::optional<float> scale_override = std::nullopt) const override;
     };
 
     class AdaptiveProjectedGuidance : public BaseGuidance {
@@ -69,7 +73,8 @@ namespace sd::guidance {
                                   AdaptiveProjectedGuidanceParams params);
 
         GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous) const override;
+                             GuiderOutput previous,
+                             std::optional<float> scale_override = std::nullopt) const override;
     };
 
     class SkipLayerGuidance : public BaseGuidance {
@@ -88,7 +93,8 @@ namespace sd::guidance {
         const std::vector<int>& layers() const;
 
         GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous) const override;
+                             GuiderOutput previous,
+                             std::optional<float> scale_override = std::nullopt) const override;
     };
 
 }  // namespace sd::guidance
diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp
index f50bdbb1e1c6..2c0f53a37596 100644
--- a/otherarch/sdcpp/src/stable-diffusion.cpp
+++ b/otherarch/sdcpp/src/stable-diffusion.cpp
@@ -203,6 +203,7 @@ class StableDiffusionGGML {
     bool enable_mmap                     = false;
     sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
     bool stream_layers = false;
+    bool eager_load    = false;
     std::string backend_spec;
     std::string params_backend_spec;
 
@@ -349,6 +350,7 @@ class StableDiffusionGGML {
         n_threads           = sd_ctx_params->n_threads;
         enable_mmap         = sd_ctx_params->enable_mmap;
         stream_layers       = sd_ctx_params->stream_layers;
+        eager_load          = sd_ctx_params->eager_load;
         backend_spec        = SAFE_STR(sd_ctx_params->backend);
         params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
         max_vram_assignment.reset(0.f);
@@ -586,7 +588,7 @@ class StableDiffusionGGML {
             {
                 to_replace = "taesd_xl.embd";
             }
-            else if(sd_version_is_flux(tempver)||sd_version_is_z_image(tempver)||tempver == VERSION_OVIS_IMAGE||is_longcat||is_boogu)
+            else if(sd_version_uses_flux_vae(tempver))
             {
                 to_replace = "taesd_f.embd";
             }
@@ -777,7 +779,6 @@ class StableDiffusionGGML {
         if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) {
             model_loader.set_wtype_override(wtype, tensor_type_rules);
         }
-        model_loader.process_model_files(enable_mmap, true);
 
         std::map<ggml_type, uint32_t> wtype_stat                 = model_loader.get_wtype_stat();
         std::map<ggml_type, uint32_t> conditioner_wtype_stat     = model_loader.get_conditioner_wtype_stat();
@@ -831,9 +832,12 @@ class StableDiffusionGGML {
             apply_lora_immediately = false;
         }
 
+        bool needs_writable_mmap = enable_mmap && apply_lora_immediately;
+        model_manager->set_writable_mmap(needs_writable_mmap);
         if (enable_mmap && apply_lora_immediately) {
             LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap");
         }
+        model_loader.process_model_files(enable_mmap, needs_writable_mmap);
         load_alphas_cumprod(model_loader);
 
         size_t text_encoder_params_mem_size = 0;
@@ -1400,7 +1404,15 @@ class StableDiffusionGGML {
             return false;
         }
 
-        LOG_DEBUG("model metadata validated; weights will be prepared lazily");
+        if (eager_load) {
+            if (!model_manager->load_all_params_eagerly()) {
+                LOG_ERROR("model params eager load failed");
+                return false;
+            }
+            LOG_DEBUG("model metadata validated; weights pre-loaded to params backend");
+        } else {
+            LOG_DEBUG("model metadata validated; weights will be prepared lazily");
+        }
 
         {
             size_t total_params_ram_size  = 0;
@@ -1975,7 +1987,7 @@ class StableDiffusionGGML {
                 if (sd_version_is_sd3(version)) {
                     latent_rgb_proj = sd3_latent_rgb_proj;
                     latent_rgb_bias = sd3_latent_rgb_bias;
-                } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
+                } else if (sd_version_uses_flux_vae(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
                     latent_rgb_bias = flux_latent_rgb_bias;
                 } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
@@ -2198,6 +2210,32 @@ class StableDiffusionGGML {
         float slg_scale     = guidance.slg.scale;
         bool slg_uncond     = sd::guidance::parse_skip_layer_guidance_uncond_arg(extra_sample_args);
 
+        std::vector<float> guidance_schedule = sd::guidance::parse_guidance_schedule(extra_sample_args);
+        // The schedule needs one value per step; the step count used here is sigmas.size() - 1 (0 when sigmas is empty).
+        const size_t schedule_target_len = sigmas.empty() ? 0 : sigmas.size() - 1;
+        if (!guidance_schedule.empty() && guidance_schedule.size() != schedule_target_len) {
+            if (guidance_schedule.size() > schedule_target_len) {
+                // Compare against the step count, not sigmas.size(): a schedule exactly one entry too long must be cut as well.
+                LOG_WARN("guidance_schedule length (%zu) is greater than number of steps (%zu)", guidance_schedule.size(), schedule_target_len);
+                LOG_WARN("truncating guidance_schedule to match step count");
+            } else {
+                LOG_INFO("padding guidance_schedule with cfg_scale");
+            }
+            guidance_schedule.resize(schedule_target_len, cfg_scale);  // shrinks, or grows by appending cfg_scale
+        }
+
+        if (!guidance_schedule.empty()) {
+            std::string schedule_str = "[";
+            for (size_t i = 0; i < guidance_schedule.size(); ++i) {
+                schedule_str += std::to_string(guidance_schedule[i]);
+                if (i < guidance_schedule.size() - 1) {
+                    schedule_str += ", ";
+                }
+            }
+            schedule_str += "]";
+            LOG_DEBUG("using guidance schedule: %s", schedule_str.c_str());
+        }
+
         sd_sample::SampleCacheRuntime cache_runtime = sd_sample::init_sample_cache_runtime(version,
                                                                                            cache_params,
                                                                                            denoiser.get(),
@@ -2438,7 +2476,7 @@ class StableDiffusionGGML {
             guidance_input.pred_uncond     = uncond_out.empty() ? nullptr : &uncond_out;
             guidance_input.pred_img_uncond = img_uncond_out.empty() ? nullptr : &img_uncond_out;
 
-            sd::guidance::GuiderOutput guided = primary_guidance.forward(guidance_input, {});
+            sd::guidance::GuiderOutput guided = guidance_schedule.empty() ? primary_guidance.forward(guidance_input, {}) : primary_guidance.forward(guidance_input, {}, guidance_schedule[guidance_schedule.size() - 1 - step]);
             if (guided.pred.empty()) {
                 return {};
             }
@@ -2979,6 +3017,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->lora_apply_mode      = LORA_APPLY_AUTO;
     sd_ctx_params->max_vram             = nullptr;
     sd_ctx_params->stream_layers        = false;
+    sd_ctx_params->eager_load           = false;
     sd_ctx_params->enable_mmap          = false;
     sd_ctx_params->diffusion_flash_attn = false;
     sd_ctx_params->circular_x           = false;
@@ -3025,6 +3064,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "prediction: %s\n"
              "max_vram: %s\n"
              "stream_layers: %s\n"
+             "eager_load: %s\n"
              "backend: %s\n"
              "params_backend: %s\n"
              "flash_attn: %s\n"
@@ -3060,6 +3100,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              sd_prediction_name(sd_ctx_params->prediction),
              SAFE_STR(sd_ctx_params->max_vram),
              BOOL_STR(sd_ctx_params->stream_layers),
+             BOOL_STR(sd_ctx_params->eager_load),
              SAFE_STR(sd_ctx_params->backend),
              SAFE_STR(sd_ctx_params->params_backend),
              BOOL_STR(sd_ctx_params->flash_attn),

From 5a45265f7e25c48ecffca1067fec7eef182d4a93 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Wed, 24 Jun 2026 20:06:34 -0300
Subject: [PATCH 6/6] sd: sync with master-721-8caa3f9

---
 Makefile                                      |   2 +-
 otherarch/sdcpp/examples/common/common.cpp    |   4 +-
 otherarch/sdcpp/include/stable-diffusion.h    |   1 +
 .../sdcpp/src/conditioning/conditioner.hpp    |  13 +-
 otherarch/sdcpp/src/core/ggml_extend.hpp      |  11 +-
 otherarch/sdcpp/src/model.h                   |  11 +-
 otherarch/sdcpp/src/model/diffusion/krea2.hpp | 683 ++++++++++++++++++
 otherarch/sdcpp/src/model_loader.cpp          |   4 +
 otherarch/sdcpp/src/name_conversion.cpp       |  36 +-
 otherarch/sdcpp/src/runtime/denoiser.hpp      | 202 ++++++
 otherarch/sdcpp/src/stable-diffusion.cpp      |  30 +-
 11 files changed, 986 insertions(+), 11 deletions(-)
 create mode 100644 otherarch/sdcpp/src/model/diffusion/krea2.hpp

diff --git a/Makefile b/Makefile
index 79a37a71889f..051ea092e414 100644
--- a/Makefile
+++ b/Makefile
@@ -699,7 +699,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h
 budget.o: common/reasoning-budget.cpp common/reasoning-budget.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp src/model/diffusion/anima.hpp src/model/diffusion/boogu.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp src/name_conversion.cpp 
src/name_conversion.h src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h
+SDCPP_COMMON_BASENAMES := include/stable-diffusion.h src/conditioning/conditioner.hpp src/core/ggml_extend_backend.cpp src/core/ggml_extend_backend.h src/core/ggml_extend.hpp src/core/ggml_graph_cut.cpp src/core/ggml_graph_cut.h src/core/ordered_map.hpp src/core/rng.hpp src/core/rng_mt19937.hpp src/core/rng_philox.hpp src/core/tensor_ggml.hpp src/core/tensor.hpp src/core/util.cpp src/core/util.h src/extensions/generation_extension.h src/extensions/photomaker_extension.cpp src/extensions/pulid_extension.cpp src/kcpp_sd_extensions.h src/model/adapter/lora.hpp src/model/adapter/pmid.hpp src/model/adapter/pulid.hpp src/model/common/block.hpp src/model/common/rope.hpp src/model/diffusion/anima.hpp src/model/diffusion/boogu.hpp src/model/diffusion/control.hpp src/model/diffusion/dit.hpp src/model/diffusion/ernie_image.hpp src/model/diffusion/flux.hpp src/model/diffusion/hidream_o1.hpp src/model/diffusion/ideogram4.hpp src/model/diffusion/krea2.hpp src/model/diffusion/lens.hpp src/model/diffusion/ltxv.hpp src/model/diffusion/mmdit.hpp src/model/diffusion/model.hpp src/model/diffusion/pid.hpp src/model/diffusion/qwen_image.hpp src/model/diffusion/unet.hpp src/model/diffusion/wan.hpp src/model/diffusion/z_image.hpp src/model.h src/model_io/binary_io.h src/model_io/gguf_io.cpp src/model_io/gguf_io.h src/model_io/gguf_reader_ext.h src/model_io/pickle_io.cpp src/model_io/pickle_io.h src/model_io/safetensors_io.cpp src/model_io/safetensors_io.h src/model_io/tensor_storage.h src/model_io/torch_legacy_io.cpp src/model_io/torch_legacy_io.h src/model_io/torch_zip_io.cpp src/model_io/torch_zip_io.h src/model_loader.cpp src/model_loader.h src/model_manager.cpp src/model_manager.h src/model/te/clip.hpp src/model/te/llm.hpp src/model/te/t5.hpp src/model/upscaler/esrgan.hpp src/model/upscaler/ltx_latent_upscaler.hpp src/model/vae/auto_encoder_kl.hpp src/model/vae/ltx_audio_vae.hpp src/model/vae/ltx_vae.hpp src/model/vae/tae.hpp src/model/vae/vae.hpp src/model/vae/wan_vae.hpp 
src/name_conversion.cpp src/name_conversion.h src/runtime/cache_dit.hpp src/runtime/condition_cache_utils.hpp src/runtime/denoiser.hpp src/runtime/easycache.hpp src/runtime/gits_noise.h src/runtime/guidance.cpp src/runtime/guidance.h src/runtime/latent-preview.h src/runtime/preprocessing.hpp src/runtime/sample-cache.cpp src/runtime/sample-cache.h src/runtime/spectrum.hpp src/runtime/ucache.hpp src/stable-diffusion.cpp src/tokenizers/bpe_tokenizer.cpp src/tokenizers/bpe_tokenizer.h src/tokenizers/clip_tokenizer.cpp src/tokenizers/clip_tokenizer.h src/tokenizers/gemma_tokenizer.cpp src/tokenizers/gemma_tokenizer.h src/tokenizers/gpt_oss_tokenizer.cpp src/tokenizers/gpt_oss_tokenizer.h src/tokenizers/mistral_tokenizer.cpp src/tokenizers/mistral_tokenizer.h src/tokenizers/qwen2_tokenizer.cpp src/tokenizers/qwen2_tokenizer.h src/tokenizers/t5_unigram_tokenizer.cpp src/tokenizers/t5_unigram_tokenizer.h src/tokenizers/tokenizer.cpp src/tokenizers/tokenizer.h src/tokenizers/tokenize_util.cpp src/tokenizers/tokenize_util.h src/tokenizers/vocab/vocab.h src/upscaler.cpp src/upscaler.h src/weight_manager.h
 
 SDCPP_MAIN_BASENAMES := examples/cli/image_metadata.cpp examples/cli/image_metadata.h examples/cli/main.cpp examples/cli/msf_gif.h examples/common/common.cpp examples/common/common.h examples/common/log.cpp examples/common/log.h examples/common/media_io.cpp examples/common/media_io.h examples/common/resource_owners.hpp src/tokenizers/vocab/clip_merges.hpp src/tokenizers/vocab/gemma2_merges.hpp src/tokenizers/vocab/gemma2_vocab.hpp src/tokenizers/vocab/gemma_merges.hpp src/tokenizers/vocab/gemma_vocab.hpp src/tokenizers/vocab/gpt_oss_merges.hpp src/tokenizers/vocab/gpt_oss_vocab.hpp src/tokenizers/vocab/mistral_merges.hpp src/tokenizers/vocab/mistral_vocab.hpp src/tokenizers/vocab/qwen_merges.hpp src/tokenizers/vocab/t5.hpp src/tokenizers/vocab/umt5.hpp src/tokenizers/vocab/vocab.cpp src/convert.cpp src/version.cpp
 
diff --git a/otherarch/sdcpp/examples/common/common.cpp b/otherarch/sdcpp/examples/common/common.cpp
index 3f357512e66c..744005af909b 100644
--- a/otherarch/sdcpp/examples/common/common.cpp
+++ b/otherarch/sdcpp/examples/common/common.cpp
@@ -960,7 +960,7 @@ ArgOptions SDGenerationParams::get_options() {
          &hires_upscaler},
         {"",
          "--extra-sample-args",
-         "extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma;",
+         "extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma; logit_normal supports mu, std, logsnr_min, logsnr_max, resolution_aware",
          (int)',',
          &extra_sample_args},
         {"",
@@ -1475,7 +1475,7 @@ ArgOptions SDGenerationParams::get_options() {
          on_high_noise_sample_method_arg},
         {"",
          "--scheduler",
-         "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default: model-specific",
+         "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2, logit_normal], default: model-specific",
          on_scheduler_arg},
         {"",
          "--sigmas",
diff --git a/otherarch/sdcpp/include/stable-diffusion.h b/otherarch/sdcpp/include/stable-diffusion.h
index 8772865daadb..7058852cc299 100644
--- a/otherarch/sdcpp/include/stable-diffusion.h
+++ b/otherarch/sdcpp/include/stable-diffusion.h
@@ -70,6 +70,7 @@ enum scheduler_t {
     LCM_SCHEDULER,
     BONG_TANGENT_SCHEDULER,
     LTX2_SCHEDULER,
+    LOGIT_NORMAL_SCHEDULER,
     SCHEDULER_COUNT
 };
 
diff --git a/otherarch/sdcpp/src/conditioning/conditioner.hpp b/otherarch/sdcpp/src/conditioning/conditioner.hpp
index ae1a5b5b387e..e037fe76b081 100644
--- a/otherarch/sdcpp/src/conditioning/conditioner.hpp
+++ b/otherarch/sdcpp/src/conditioning/conditioner.hpp
@@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
             arch = LLM::LLMArch::GPT_OSS_20B;
         } else if (sd_version_is_pid(version)) {
             arch = LLM::LLMArch::GEMMA2_2B;
-        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
+        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) {
             arch = LLM::LLMArch::QWEN3_VL;
         } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
             arch = LLM::LLMArch::QWEN3;
@@ -1837,6 +1837,17 @@ struct LLMEmbedder : public Conditioner {
                 prompt_attn_range.second = static_cast<int>(prompt.size());
                 prompt += "<|im_end|>\n";
             }
+        } else if (sd_version_is_krea2(version)) {
+            prompt_template_encode_start_idx = 34;
+            out_layers                       = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35};
+
+            prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n";
         } else if (sd_version_is_longcat(version)) {
             spell_quotes = true;
 
diff --git a/otherarch/sdcpp/src/core/ggml_extend.hpp b/otherarch/sdcpp/src/core/ggml_extend.hpp
index f10a84ffd0db..9883103e2f71 100644
--- a/otherarch/sdcpp/src/core/ggml_extend.hpp
+++ b/otherarch/sdcpp/src/core/ggml_extend.hpp
@@ -1382,7 +1382,16 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_attention_ext(ggml_context* ctx,
             if (!ggml_backend_supports_op(backend, kqv)) {
                 kqv = nullptr;
             } else {
-                kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_q, kqv->nb[1], kqv->nb[2], 0);
+                kqv = ggml_view_4d(ctx,
+                                   kqv,
+                                   d_head,
+                                   n_head,
+                                   L_q,
+                                   N,
+                                   kqv->nb[1],
+                                   kqv->nb[2],
+                                   kqv->nb[1] * n_head,
+                                   0);
             }
         }
     }
diff --git a/otherarch/sdcpp/src/model.h b/otherarch/sdcpp/src/model.h
index d02ed65b8f18..cce309138baf 100644
--- a/otherarch/sdcpp/src/model.h
+++ b/otherarch/sdcpp/src/model.h
@@ -49,6 +49,7 @@ enum SDVersion {
     VERSION_LONGCAT,
     VERSION_PID,
     VERSION_IDEOGRAM4,
+    VERSION_KREA2,
     VERSION_ESRGAN,
     VERSION_COUNT,
 };
@@ -186,6 +187,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
     return false;
 }
 
+// Krea2 family predicate: true only when `version` is VERSION_KREA2.
+// Kept as a helper so call sites read like the other sd_version_is_* checks.
+static inline bool sd_version_is_krea2(SDVersion version) {
+    const bool is_krea2 = (version == VERSION_KREA2);
+    return is_krea2;
+}
+
 static inline bool sd_version_uses_flux_vae(SDVersion version) {
     if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
         return true;
@@ -226,7 +234,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_lens(version) ||
         sd_version_is_longcat(version) ||
         sd_version_is_pid(version) ||
-        sd_version_is_ideogram4(version)) {
+        sd_version_is_ideogram4(version) ||
+        sd_version_is_krea2(version)) {
         return true;
     }
     return false;
diff --git a/otherarch/sdcpp/src/model/diffusion/krea2.hpp b/otherarch/sdcpp/src/model/diffusion/krea2.hpp
new file mode 100644
index 000000000000..02e655590769
--- /dev/null
+++ b/otherarch/sdcpp/src/model/diffusion/krea2.hpp
@@ -0,0 +1,683 @@
+#ifndef __SD_MODEL_DIFFUSION_KREA2_HPP__
+#define __SD_MODEL_DIFFUSION_KREA2_HPP__
+
+#include <inttypes.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "core/ggml_extend.hpp"
+#include "core/ggml_graph_cut.h"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"
+#include "model_loader.h"
+
+namespace Krea2 {
+    constexpr int KREA2_GRAPH_SIZE = 65536;  // size passed to new_graph_custom() in Krea2Runner::build_graph
+
+    // Hyper-parameters of the Krea2 diffusion transformer. The initializers below are the
+    // fallbacks kept for every value that detect_from_weights() does not overwrite.
+    struct Krea2Config {
+        int patch_size            = 2;
+        int64_t in_channels       = 16;
+        int64_t out_channels      = 16;
+        int64_t features          = 6144;
+        int64_t timestep_dim      = 256;
+        int64_t text_dim          = 2560;
+        int64_t text_layers       = 12;
+        int64_t layers            = 28;
+        int64_t heads             = 48;
+        int64_t kv_heads          = 12;
+        int64_t text_heads        = 20;
+        int64_t text_kv_heads     = 20;
+        int64_t mlp_multiplier    = 4;
+        float theta               = 1000.f;
+        float norm_eps            = 1e-5f;
+        std::vector<int> axes_dim = {32, 48, 48};
+        int axes_dim_sum          = 128;
+
+        int64_t head_dim() const {
+            return features / heads;
+        }
+
+        // Returns 1 + the largest index N (parsed with std::atoi) found in tensor names of the
+        // form "<prefix>.<block_prefix>N.<rest>", or 0 when nothing matches. An empty `prefix` is joined without the dot.
+        static int64_t count_blocks(const String2TensorStorage& tensor_storage_map,
+                                    const std::string& prefix,
+                                    const std::string& block_prefix) {
+            int64_t count           = 0;
+            std::string full_prefix = prefix.empty() ? block_prefix : prefix + "." + block_prefix;
+            for (const auto& [name, _] : tensor_storage_map) {
+                if (!starts_with(name, full_prefix)) {
+                    continue;
+                }
+                std::string tail = name.substr(full_prefix.size());
+                size_t dot       = tail.find('.');
+                if (dot == std::string::npos) {
+                    continue;
+                }
+                int block_index = std::atoi(tail.substr(0, dot).c_str());
+                count           = std::max<int64_t>(count, block_index + 1);
+            }
+            return count;
+        }
+
+        // Recomputes the three RoPE axis widths from head_dim(): with unit = head_dim() / 16,
+        // the last two axes get 6 * unit each and the first axis takes the remainder.
+        void update_axes_dim() {
+            int64_t dim_head = head_dim();
+            int64_t unit     = dim_head / 16;
+            axes_dim         = {
+                        static_cast<int>(dim_head - 12 * unit),
+                        static_cast<int>(6 * unit),
+                        static_cast<int>(6 * unit),
+            };
+            axes_dim_sum = axes_dim[0] + axes_dim[1] + axes_dim[2];
+        }
+
+        // Derives a config from the checkpoint tensors stored under `prefix`; values that
+        // cannot be inferred keep their defaults. The main-stream qnorm scale is matched by
+        // exact name because the bare suffix "blocks.0.attn..." is also a suffix of the
+        // "txtfusion.layerwise_blocks.0..." and "txtfusion.refiner_blocks.0..." tensors; a
+        // suffix test would shadow the text-fusion branch and mix up the two head sizes.
+        static Krea2Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                               const std::string& prefix) {
+            Krea2Config config;
+            int64_t detected_head_dim      = 0;
+            int64_t detected_text_head_dim = 0;
+            const std::string root         = prefix.empty() ? "" : prefix + ".";
+            const std::string main_qnorm   = root + "blocks.0.attn.qknorm.qnorm.scale";
+
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                if (ends_with(name, "first.weight") && tensor_storage.n_dims == 2) {
+                    config.in_channels  = tensor_storage.ne[0] / (config.patch_size * config.patch_size);
+                    config.out_channels = config.in_channels;
+                    config.features     = tensor_storage.ne[1];
+                } else if (name == main_qnorm && tensor_storage.n_dims == 1) {
+                    detected_head_dim = tensor_storage.ne[0];
+                } else if (ends_with(name, "txtfusion.projector.weight") && tensor_storage.n_dims == 2) {
+                    config.text_layers = tensor_storage.ne[0];
+                } else if (ends_with(name, "txtfusion.layerwise_blocks.0.prenorm.scale") && tensor_storage.n_dims == 1) {
+                    config.text_dim = tensor_storage.ne[0];
+                } else if (ends_with(name, "txtfusion.layerwise_blocks.0.attn.qknorm.qnorm.scale") && tensor_storage.n_dims == 1) {
+                    detected_text_head_dim = tensor_storage.ne[0];
+                } else if (ends_with(name, "last.linear.weight") && tensor_storage.n_dims == 2) {
+                    config.out_channels = tensor_storage.ne[1] / (config.patch_size * config.patch_size);
+                }
+            }
+
+            // Head counts are resolved after the scan so they do not depend on the order in
+            // which the map yields the norm scales and the projection weights.
+            config.layers = std::max<int64_t>(1, count_blocks(tensor_storage_map, prefix, "blocks."));
+            if (detected_head_dim > 0 && config.features > 0) {
+                config.heads = config.features / detected_head_dim;
+            }
+            if (detected_head_dim > 0) {
+                std::string wk_name = prefix.empty() ? "blocks.0.attn.wk.weight" : prefix + ".blocks.0.attn.wk.weight";
+                auto it             = tensor_storage_map.find(wk_name);
+                if (it != tensor_storage_map.end() && it->second.n_dims == 2) {
+                    config.kv_heads = it->second.ne[1] / detected_head_dim;
+                }
+            }
+            if (detected_text_head_dim > 0 && config.text_dim > 0) {
+                config.text_heads = config.text_dim / detected_text_head_dim;
+            }
+            if (detected_text_head_dim > 0) {
+                std::string wk_name = prefix.empty() ? "txtfusion.layerwise_blocks.0.attn.wk.weight" : prefix + ".txtfusion.layerwise_blocks.0.attn.wk.weight";
+                auto it             = tensor_storage_map.find(wk_name);
+                if (it != tensor_storage_map.end() && it->second.n_dims == 2) {
+                    config.text_kv_heads = it->second.ne[1] / detected_text_head_dim;
+                }
+            }
+            // Keep the RoPE axis split consistent with the head size resolved above.
+            config.update_axes_dim();
+
+            LOG_DEBUG("krea2: layers=%" PRId64 ", features=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", text_dim=%" PRId64 ", text_layers=%" PRId64 ", text_heads=%" PRId64 ", text_kv_heads=%" PRId64 ", channels=%" PRId64,
+                      config.layers,
+                      config.features,
+                      config.heads,
+                      config.kv_heads,
+                      config.text_dim,
+                      config.text_layers,
+                      config.text_heads,
+                      config.text_kv_heads,
+                      config.in_channels);
+            return config;
+        }
+    };
+
+    __STATIC_INLINE__ int64_t ceil_to_multiple(int64_t value, int64_t multiple) {
+        return ((value + multiple - 1) / multiple) * multiple;  // rounds up for value >= 0 and multiple > 0
+    }
+
+    class KreaRMSNorm : public UnaryBlock {  // RMS norm whose stored "scale" gets a ones tensor added before use
+    protected:
+        int64_t hidden_size;
+        float eps;
+        std::string prefix;  // recorded in init_params(); not read anywhere else in this class
+
+        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+            GGML_UNUSED(tensor_storage_map);
+            this->prefix    = prefix;
+            params["scale"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);  // always F32, hidden_size elements
+        }
+
+    public:
+        KreaRMSNorm(int64_t hidden_size, float eps = 1e-5f)
+            : hidden_size(hidden_size),
+              eps(eps) {}
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+            ggml_tensor* scale = params["scale"];
+            scale              = ggml_add(ctx->ggml_ctx, scale, ggml_ext_ones(ctx->ggml_ctx, scale->ne[0], 1, 1, 1));  // presumably scale + 1
+            x                  = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+            x                  = ggml_mul_inplace(ctx->ggml_ctx, x, scale);  // in-place on the rms_norm result
+            return x;
+        }
+    };
+
+    class KreaSwiGLU : public UnaryBlock {  // gated MLP: down(silu(gate(x)) * up(x))
+    public:
+        KreaSwiGLU(int64_t features, int64_t multiplier) {
+            int64_t mlp_dim = ceil_to_multiple(((2 * features) / 3) * multiplier, 128);  // hidden width, rounded up to a multiple of 128
+            blocks["gate"]  = std::make_shared<Linear>(features, mlp_dim, false);
+            blocks["up"]    = std::make_shared<Linear>(features, mlp_dim, false);
+            blocks["down"]  = std::make_shared<Linear>(mlp_dim, features, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+            auto gate = std::dynamic_pointer_cast<Linear>(blocks["gate"]);
+            auto up   = std::dynamic_pointer_cast<Linear>(blocks["up"]);
+            auto down = std::dynamic_pointer_cast<Linear>(blocks["down"]);
+
+            auto gated = ggml_silu(ctx->ggml_ctx, gate->forward(ctx, x));
+            auto up_x  = up->forward(ctx, x);
+            x          = ggml_mul(ctx->ggml_ctx, gated, up_x);  // element-wise gate applied to the up projection
+            return down->forward(ctx, x);
+        }
+    };
+
+    class KreaAttention : public GGMLBlock {  // attention with separate q / kv head counts, q/k RMS norm and a sigmoid output gate
+    protected:
+        int64_t features;
+        int64_t heads;
+        int64_t kv_heads;
+        int64_t head_dim_;  // features / heads, fixed in the constructor
+
+        ggml_tensor* attention_no_rope(GGMLRunnerContext* ctx,  // path taken by forward() when pe == nullptr
+                                       ggml_tensor* q,
+                                       ggml_tensor* k,
+                                       ggml_tensor* v,
+                                       ggml_tensor* mask) {
+            int64_t Lq = q->ne[2];  // q/k/v arrive as (head_dim_, n_heads, L, N) from forward()
+            int64_t Lk = k->ne[2];
+            int64_t N  = q->ne[3];
+            q          = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, q), head_dim_ * heads, Lq, N);  // merge the head axes back
+            k          = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, k), head_dim_ * kv_heads, Lk, N);
+            v          = ggml_reshape_3d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, v), head_dim_ * kv_heads, Lk, N);
+            return ggml_ext_attention_ext(ctx->ggml_ctx,
+                                          ctx->backend,
+                                          q,
+                                          k,
+                                          v,
+                                          heads,
+                                          mask,
+                                          false,
+                                          ctx->flash_attn_enabled);
+        }
+
+    public:
+        KreaAttention(int64_t features,
+                      int64_t heads,
+                      int64_t kv_heads,
+                      float eps = 1e-5f)
+            : features(features),
+              heads(heads),
+              kv_heads(kv_heads),
+              head_dim_(features / heads) {
+            blocks["wq"]           = std::make_shared<Linear>(features, heads * head_dim_, false);
+            blocks["wk"]           = std::make_shared<Linear>(features, kv_heads * head_dim_, false);
+            blocks["wv"]           = std::make_shared<Linear>(features, kv_heads * head_dim_, false);
+            blocks["gate"]         = std::make_shared<Linear>(features, features, false);
+            blocks["qknorm.qnorm"] = std::make_shared<KreaRMSNorm>(head_dim_, eps);  // norms act on a single head's channels
+            blocks["qknorm.knorm"] = std::make_shared<KreaRMSNorm>(head_dim_, eps);
+            blocks["wo"]           = std::make_shared<Linear>(features, features, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* pe   = nullptr,
+                             ggml_tensor* mask = nullptr) {
+            auto wq    = std::dynamic_pointer_cast<Linear>(blocks["wq"]);
+            auto wk    = std::dynamic_pointer_cast<Linear>(blocks["wk"]);
+            auto wv    = std::dynamic_pointer_cast<Linear>(blocks["wv"]);
+            auto gate  = std::dynamic_pointer_cast<Linear>(blocks["gate"]);
+            auto qnorm = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["qknorm.qnorm"]);
+            auto knorm = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["qknorm.knorm"]);
+            auto wo    = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
+
+            if (sd_backend_is(ctx->backend, "Vulkan")) {
+                wo->set_force_prec_f32(true);  // only the output projection is forced to f32, and only on Vulkan
+            }
+
+            int64_t L = x->ne[1];  // sequence length
+            int64_t N = x->ne[2];  // batch
+
+            auto q = wq->forward(ctx, x);
+            q      = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim_, heads, L, N);
+            auto k = wk->forward(ctx, x);
+            k      = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim_, kv_heads, L, N);
+            auto v = wv->forward(ctx, x);
+            v      = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim_, kv_heads, L, N);
+
+            q = qnorm->forward(ctx, q);  // v is left un-normalized
+            k = knorm->forward(ctx, k);
+
+            auto out = pe != nullptr ? Rope::attention(ctx, q, k, v, pe, mask)
+                                     : attention_no_rope(ctx, q, k, v, mask);
+            out      = ggml_mul(ctx->ggml_ctx, out, ggml_sigmoid(ctx->ggml_ctx, gate->forward(ctx, x)));  // gate is computed from the block input x
+            out      = wo->forward(ctx, out);
+            return out;
+        }
+    };
+
+    class KreaDoubleSharedModulation : public GGMLBlock {  // adds a learned offset to the conditioning vector, then splits it in 6
+    protected:
+        int64_t dim;
+
+        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+            GGML_UNUSED(tensor_storage_map);
+            GGML_UNUSED(prefix);
+            params["lin"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim * 6);  // one F32 offset per modulation channel
+        }
+
+    public:
+        KreaDoubleSharedModulation(int64_t dim)
+            : dim(dim) {}
+
+        std::vector<ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* vec) {
+            auto lin = ggml_repeat(ctx->ggml_ctx, params["lin"], vec);  // broadcast the offset to vec's shape
+            auto out = ggml_add(ctx->ggml_ctx, vec, lin);
+            return ggml_ext_chunk(ctx->ggml_ctx, out, 6, 0);  // presumably 6 equal chunks along dim 0; callers index [0..5]
+        }
+    };
+
+    class KreaFinalModulation : public GGMLBlock {  // final-layer modulation: learned (dim, 2) table plus the conditioning vector
+    protected:
+        int64_t dim;
+
+        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+            GGML_UNUSED(tensor_storage_map);
+            GGML_UNUSED(prefix);
+            params["lin"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, 2);  // always F32
+        }
+
+    public:
+        KreaFinalModulation(int64_t dim)
+            : dim(dim) {}
+
+        std::vector<ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* vec) {
+            auto out = ggml_add(ctx->ggml_ctx, params["lin"], vec);  // NOTE(review): relies on vec broadcasting over lin — confirm
+            return ggml_ext_chunk(ctx->ggml_ctx, out, 2, 1);  // presumably 2 chunks along dim 1; caller indexes [0] and [1]
+        }
+    };
+
+    class KreaTextFusionBlock : public UnaryBlock {  // pre-norm residual block: attention, then SwiGLU MLP
+    public:
+        KreaTextFusionBlock(int64_t dim,
+                            int64_t heads,
+                            int64_t kv_heads,
+                            int64_t multiplier,
+                            float eps) {
+            blocks["prenorm"]  = std::make_shared<KreaRMSNorm>(dim, eps);
+            blocks["postnorm"] = std::make_shared<KreaRMSNorm>(dim, eps);
+            blocks["attn"]     = std::make_shared<KreaAttention>(dim, heads, kv_heads, eps);
+            blocks["mlp"]      = std::make_shared<KreaSwiGLU>(dim, multiplier);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+            auto prenorm  = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["prenorm"]);
+            auto postnorm = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["postnorm"]);
+            auto attn     = std::dynamic_pointer_cast<KreaAttention>(blocks["attn"]);
+            auto mlp      = std::dynamic_pointer_cast<KreaSwiGLU>(blocks["mlp"]);
+
+            x = ggml_add(ctx->ggml_ctx, x, attn->forward(ctx, prenorm->forward(ctx, x)));  // no pe / mask passed: attention runs without RoPE
+            x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, postnorm->forward(ctx, x)));
+            return x;
+        }
+    };
+
+    class KreaTextFusionTransformer : public UnaryBlock {  // fuses text_layers stacked text features into one feature per token
+    protected:
+        Krea2Config config;
+
+    public:
+        explicit KreaTextFusionTransformer(Krea2Config config)
+            : config(std::move(config)) {
+            for (int i = 0; i < 2; ++i) {  // block counts are fixed at 2 + 2, not read from the config
+                blocks["layerwise_blocks." + std::to_string(i)] = std::make_shared<KreaTextFusionBlock>(this->config.text_dim,
+                                                                                                        this->config.text_heads,
+                                                                                                        this->config.text_kv_heads,
+                                                                                                        this->config.mlp_multiplier,
+                                                                                                        this->config.norm_eps);
+                blocks["refiner_blocks." + std::to_string(i)]   = std::make_shared<KreaTextFusionBlock>(this->config.text_dim,
+                                                                                                      this->config.text_heads,
+                                                                                                      this->config.text_kv_heads,
+                                                                                                      this->config.mlp_multiplier,
+                                                                                                      this->config.norm_eps);
+            }
+            blocks["projector"] = std::make_shared<Linear>(this->config.text_layers, 1, false);  // collapses the layer axis to width 1
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* context) override {
+            int64_t text_tokens = context->ne[1];
+            int64_t batch       = context->ne[2];
+
+            context = ggml_reshape_3d(ctx->ggml_ctx,  // assumes context->ne[0] == text_dim * text_layers — TODO confirm
+                                      context,
+                                      config.text_dim,
+                                      config.text_layers,
+                                      text_tokens * batch);
+
+            for (int i = 0; i < 2; ++i) {  // here ne[1] is the layer axis, so attention mixes layers within each token
+                auto block = std::dynamic_pointer_cast<KreaTextFusionBlock>(blocks["layerwise_blocks." + std::to_string(i)]);
+                context    = block->forward(ctx, context);
+            }
+
+            context        = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context, 1, 0, 2, 3));  // swap the first two axes so the projector sees the layer axis
+            auto projector = std::dynamic_pointer_cast<Linear>(blocks["projector"]);
+            context        = projector->forward(ctx, context);
+            context        = ggml_reshape_3d(ctx->ggml_ctx, context, config.text_dim, text_tokens, batch);  // back to one feature vector per token
+
+            for (int i = 0; i < 2; ++i) {  // refiner blocks attend across tokens
+                auto block = std::dynamic_pointer_cast<KreaTextFusionBlock>(blocks["refiner_blocks." + std::to_string(i)]);
+                context    = block->forward(ctx, context);
+            }
+            return context;
+        }
+    };
+
+    class KreaSingleStreamBlock : public UnaryBlock {  // modulated attention + MLP block run on the joint text/image sequence
+    public:
+        explicit KreaSingleStreamBlock(Krea2Config config) {
+            blocks["mod"]      = std::make_shared<KreaDoubleSharedModulation>(config.features);
+            blocks["prenorm"]  = std::make_shared<KreaRMSNorm>(config.features, config.norm_eps);
+            blocks["postnorm"] = std::make_shared<KreaRMSNorm>(config.features, config.norm_eps);
+            blocks["attn"]     = std::make_shared<KreaAttention>(config.features, config.heads, config.kv_heads, config.norm_eps);
+            blocks["mlp"]      = std::make_shared<KreaSwiGLU>(config.features, config.mlp_multiplier);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* vec,
+                             ggml_tensor* pe) {
+            auto mod      = std::dynamic_pointer_cast<KreaDoubleSharedModulation>(blocks["mod"]);
+            auto prenorm  = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["prenorm"]);
+            auto postnorm = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["postnorm"]);
+            auto attn     = std::dynamic_pointer_cast<KreaAttention>(blocks["attn"]);
+            auto mlp      = std::dynamic_pointer_cast<KreaSwiGLU>(blocks["mlp"]);
+
+            auto mods       = mod->forward(ctx, vec);  // 6 chunks: [0..2] drive the attention branch, [3..5] the MLP branch
+            auto attn_input = Flux::modulate(ctx->ggml_ctx,
+                                             prenorm->forward(ctx, x),
+                                             mods[1],
+                                             mods[0],
+                                             true);
+            auto attn_out   = attn->forward(ctx, attn_input, pe);  // no mask is passed
+            x               = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, mods[2]));  // residual scaled by mods[2]
+
+            auto mlp_input = Flux::modulate(ctx->ggml_ctx,
+                                            postnorm->forward(ctx, x),
+                                            mods[4],
+                                            mods[3],
+                                            true);
+            auto mlp_out   = mlp->forward(ctx, mlp_input);
+            x              = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, mods[5]));  // residual scaled by mods[5]
+            return x;
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {  // UnaryBlock overload: always aborts
+            GGML_UNUSED(ctx);
+            GGML_UNUSED(x);
+            GGML_ABORT("KreaSingleStreamBlock requires conditioning");
+            return nullptr;
+        }
+    };
+
+    class KreaTimeMLP : public UnaryBlock {  // timestep embedding MLP: Linear "0" -> gelu -> Linear "2"
+    public:
+        explicit KreaTimeMLP(Krea2Config config) {
+            blocks["0"] = std::make_shared<Linear>(config.timestep_dim, config.features, true);
+            blocks["2"] = std::make_shared<Linear>(config.features, config.features, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+            auto linear_0 = std::dynamic_pointer_cast<Linear>(blocks["0"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
+            x             = linear_0->forward(ctx, x);
+            x             = ggml_ext_gelu(ctx->ggml_ctx, x, false);  // activation sits between the two stored layers
+            x             = linear_2->forward(ctx, x);
+            return x;
+        }
+    };
+
+    class KreaTProj : public UnaryBlock {  // gelu followed by a projection to features * 6 modulation channels
+    public:
+        explicit KreaTProj(Krea2Config config) {
+            blocks["1"] = std::make_shared<Linear>(config.features, config.features * 6, true);  // width matches KreaDoubleSharedModulation's dim * 6
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
+            x             = ggml_ext_gelu(ctx->ggml_ctx, x, false);  // activation is applied before the only stored layer
+            x             = linear_1->forward(ctx, x);
+            return x;
+        }
+    };
+
+    class KreaTextMLP : public UnaryBlock {  // text input MLP: RMS norm "0" -> Linear "1" -> gelu -> Linear "3"
+    public:
+        explicit KreaTextMLP(Krea2Config config) {
+            blocks["0"] = std::make_shared<KreaRMSNorm>(config.text_dim, config.norm_eps);
+            blocks["1"] = std::make_shared<Linear>(config.text_dim, config.features, true);  // lifts text_dim to the model width
+            blocks["3"] = std::make_shared<Linear>(config.features, config.features, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+            auto norm     = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["0"]);
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
+            auto linear_3 = std::dynamic_pointer_cast<Linear>(blocks["3"]);
+            x             = norm->forward(ctx, x);
+            x             = linear_1->forward(ctx, x);
+            x             = ggml_ext_gelu(ctx->ggml_ctx, x, true);  // NOTE(review): flag is true here but false in KreaTimeMLP/KreaTProj — confirm intended
+            x             = linear_3->forward(ctx, x);
+            return x;
+        }
+    };
+
+    class KreaLastLayer : public GGMLBlock {  // output head: RMS norm, modulation, then projection to patch pixels
+    public:
+        explicit KreaLastLayer(Krea2Config config) {
+            blocks["norm"]       = std::make_shared<KreaRMSNorm>(config.features, config.norm_eps);
+            blocks["linear"]     = std::make_shared<Linear>(config.features, config.patch_size * config.patch_size * config.out_channels, true);  // one output per patch element
+            blocks["modulation"] = std::make_shared<KreaFinalModulation>(config.features);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* vec) {
+            auto norm       = std::dynamic_pointer_cast<KreaRMSNorm>(blocks["norm"]);
+            auto linear     = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            auto modulation = std::dynamic_pointer_cast<KreaFinalModulation>(blocks["modulation"]);
+
+            auto mods = modulation->forward(ctx, vec);  // two chunks, passed to Flux::modulate as mods[1], mods[0]
+            x         = Flux::modulate(ctx->ggml_ctx,
+                                       norm->forward(ctx, x),
+                                       mods[1],
+                                       mods[0],
+                                       true);
+            x         = linear->forward(ctx, x);
+            return x;
+        }
+    };
+
+    class Krea2Model : public GGMLBlock {  // full Krea2 network: embeds image, timestep and text, runs the block stack, unpatchifies
+    protected:
+        Krea2Config config;
+
+    public:
+        Krea2Model() = default;  // leaves `blocks` empty; Krea2Runner assigns a configured instance afterwards
+        explicit Krea2Model(Krea2Config config)
+            : config(std::move(config)) {
+            blocks["first"]     = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels,
+                                                       this->config.features,
+                                                       true);
+            blocks["tmlp"]      = std::make_shared<KreaTimeMLP>(this->config);
+            blocks["txtfusion"] = std::make_shared<KreaTextFusionTransformer>(this->config);
+            blocks["txtmlp"]    = std::make_shared<KreaTextMLP>(this->config);
+            blocks["tproj"]     = std::make_shared<KreaTProj>(this->config);
+            for (int i = 0; i < this->config.layers; ++i) {
+                blocks["blocks." + std::to_string(i)] = std::make_shared<KreaSingleStreamBlock>(this->config);
+            }
+            blocks["last"] = std::make_shared<KreaLastLayer>(this->config);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* timestep,
+                             ggml_tensor* context,
+                             ggml_tensor* pe) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+            int64_t N = x->ne[3];
+            GGML_ASSERT(N == 1);  // only batch size 1 is supported
+
+            auto first     = std::dynamic_pointer_cast<Linear>(blocks["first"]);
+            auto tmlp      = std::dynamic_pointer_cast<KreaTimeMLP>(blocks["tmlp"]);
+            auto txtfusion = std::dynamic_pointer_cast<KreaTextFusionTransformer>(blocks["txtfusion"]);
+            auto txtmlp    = std::dynamic_pointer_cast<KreaTextMLP>(blocks["txtmlp"]);
+            auto tproj     = std::dynamic_pointer_cast<KreaTProj>(blocks["tproj"]);
+            auto last      = std::dynamic_pointer_cast<KreaLastLayer>(blocks["last"]);
+
+            auto img        = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, true);
+            int64_t img_len = img->ne[1];  // number of image tokens, needed to slice them back out below
+            img             = first->forward(ctx, img);
+
+            auto t    = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(config.timestep_dim), 10000, 1000.f);
+            t         = tmlp->forward(ctx, t);
+            t         = ggml_reshape_3d(ctx->ggml_ctx, t, t->ne[0], 1, t->ne[1]);  // insert a length-1 axis at dim 1
+            auto tvec = tproj->forward(ctx, t);  // features * 6 modulation input shared by every block
+
+            auto txt        = txtfusion->forward(ctx, context);
+            txt             = txtmlp->forward(ctx, txt);
+            int64_t txt_len = txt->ne[1];
+
+            auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // text tokens first, image tokens after
+            for (int i = 0; i < config.layers; ++i) {
+                auto block    = std::dynamic_pointer_cast<KreaSingleStreamBlock>(blocks["blocks." + std::to_string(i)]);
+                hidden_states = block->forward(ctx, hidden_states, tvec, pe);
+                sd::ggml_graph_cut::mark_graph_cut(hidden_states, "krea2.blocks." + std::to_string(i), "hidden_states");
+            }
+
+            hidden_states = last->forward(ctx, hidden_states, t);  // the final layer receives t, not tvec
+            hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, txt_len, txt_len + img_len);  // keep only the image tokens
+            hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, true);
+            return hidden_states;
+        }
+    };
+
+    __STATIC_INLINE__ std::vector<float> gen_krea2_pe(int h,  // builds the positional-embedding data for text + image tokens
+                                                      int w,
+                                                      int patch_size,
+                                                      int bs,
+                                                      int context_len,
+                                                      float theta,
+                                                      const std::vector<int>& axes_dim) {
+        auto txt_ids = Rope::gen_flux_txt_ids(bs, context_len, 3, {});
+        auto img_ids = Rope::gen_flux_img_ids(h, w, patch_size, bs, 3, 0, 0, 0, false);
+        auto ids     = Rope::concat_ids(txt_ids, img_ids, bs);  // text ids first, matching the concat order in Krea2Model::forward
+        return Rope::embed_nd(ids, bs, theta, axes_dim);
+    }
+
+    struct Krea2Runner : public DiffusionModelRunner {  // owns the Krea2 model and builds / runs its compute graph
+        Krea2Config config;
+        Krea2Model model;
+        std::vector<float> pe_vec;  // member, so the data handed to set_backend_tensor_data() outlives build_graph()
+
+        Krea2Runner(ggml_backend_t backend,
+                    const String2TensorStorage& tensor_storage_map      = {},
+                    const std::string prefix                            = "",
+                    std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(Krea2Config::detect_from_weights(tensor_storage_map, prefix)) {
+            model = Krea2Model(config);  // replaces the default-constructed (empty) model
+            model.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        std::string get_desc() override {
+            return "krea2";
+        }
+
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
+            model.get_param_tensors(tensors, prefix);
+        }
+
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor) {
+            ggml_cgraph* gf        = new_graph_custom(KREA2_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
+            GGML_ASSERT(x->ne[3] == 1);  // batch size 1 only, as in Krea2Model::forward
+            GGML_ASSERT(!context_tensor.empty());  // text conditioning is mandatory
+            ggml_tensor* context = make_input(context_tensor);
+
+            pe_vec      = gen_krea2_pe(static_cast<int>(x->ne[1]),
+                                       static_cast<int>(x->ne[0]),
+                                       config.patch_size,
+                                       static_cast<int>(x->ne[3]),
+                                       static_cast<int>(context->ne[1]),
+                                       config.theta,
+                                       config.axes_dim);
+            int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);  // number of positions stored in pe_vec
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            auto runner_ctx  = get_context();
+            ggml_tensor* out = model.forward(&runner_ctx, x, timesteps, context, pe);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context) {
+            auto get_graph = [&]() -> ggml_cgraph* {  // graph construction is deferred to GGMLRunner::compute
+                return build_graph(x, timesteps, context);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            return compute(n_threads,  // forwards to the tensor overload; a missing context fails its assert in build_graph
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context));
+        }
+    };
+}  // namespace Krea2
+
+#endif  // __SD_MODEL_DIFFUSION_KREA2_HPP__
diff --git a/otherarch/sdcpp/src/model_loader.cpp b/otherarch/sdcpp/src/model_loader.cpp
index 5c2d57cdec5b..788663103e08 100644
--- a/otherarch/sdcpp/src/model_loader.cpp
+++ b/otherarch/sdcpp/src/model_loader.cpp
@@ -481,6 +481,10 @@ SDVersion ModelLoader::get_sd_version() {
         if (tensor_storage.name.find("embed_image_indicator.weight") != std::string::npos) {
             return VERSION_IDEOGRAM4;
         }
+        if (tensor_storage.name.find("model.diffusion_model.txtfusion.projector.weight") != std::string::npos ||
+            tensor_storage.name.find("model.diffusion_model.text_fusion.projector.weight") != std::string::npos) {
+            return VERSION_KREA2;
+        }
         if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
             return VERSION_CHROMA_RADIANCE;
         }
diff --git a/otherarch/sdcpp/src/name_conversion.cpp b/otherarch/sdcpp/src/name_conversion.cpp
index da2a8d5eda0f..ccc8347b729b 100644
--- a/otherarch/sdcpp/src/name_conversion.cpp
+++ b/otherarch/sdcpp/src/name_conversion.cpp
@@ -704,6 +704,38 @@ std::string convert_other_dit_to_original_anima(std::string name) {
     return name;
 }
 
+std::string convert_diffusers_dit_to_original_krea2(std::string name) {
+    using RenameTable = std::vector<std::pair<std::string, std::string>>;  // ordered {from, to} pairs
+    static const RenameTable leading_renames = {
+        {"img_in.", "first."},
+        {"time_embed.linear_1.", "tmlp.0."},
+        {"time_embed.linear_2.", "tmlp.2."},
+        {"time_mod_proj.", "tproj.1."},
+        {"txt_in.linear_1.", "txtmlp.1."},
+        {"txt_in.linear_2.", "txtmlp.3."},
+        {"text_fusion.", "txtfusion."},
+        {"transformer_blocks.", "blocks."},
+        {"final_layer.", "last."},
+    };
+    static const RenameTable fragment_renames = {
+        {"attn.to_out.0.", "attn.wo."},
+        {"attn.to_out.", "attn.wo."},
+        {"attn.to_gate.", "attn.gate."},
+        {"attn.to_q.", "attn.wq."},
+        {"attn.to_k.", "attn.wk."},
+        {"attn.to_v.", "attn.wv."},
+        {"ff.gate.", "mlp.gate."},
+        {"ff.up.", "mlp.up."},
+        {"ff.down.", "mlp.down."},
+        {"txt_in.norm.", "txtmlp.0."},
+        {"last.norm.weight", "last.norm.scale"},
+        {"last.modulation.weight", "last.modulation.lin"},
+    };
+    replace_with_prefix_map(name, leading_renames);  // first: presumably lets final_layer.* reach the "last." rules
+    replace_with_name_map(name, fragment_renames);
+    return name;
+}
+
 std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) {
     if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
         name = convert_diffusers_unet_to_original_sd1(name);
@@ -717,6 +749,8 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
         name = convert_diffusers_dit_to_original_lumina2(name);
     } else if (sd_version_is_anima(version)) {
         name = convert_other_dit_to_original_anima(name);
+    } else if (sd_version_is_krea2(version)) {
+        name = convert_diffusers_dit_to_original_krea2(name);
     }
     return name;
 }
@@ -1175,7 +1209,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
 
     replace_with_prefix_map(name, prefix_map);
 
-    if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) {
+    if ((sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) && starts_with(name, "text_encoders.llm.visual.")) {
         name = convert_qwen3_vl_vision_name(std::move(name));
     }
 
diff --git a/otherarch/sdcpp/src/runtime/denoiser.hpp b/otherarch/sdcpp/src/runtime/denoiser.hpp
index fed5911bc71c..28b29ef2772a 100644
--- a/otherarch/sdcpp/src/runtime/denoiser.hpp
+++ b/otherarch/sdcpp/src/runtime/denoiser.hpp
@@ -559,6 +559,203 @@ struct LTX2Scheduler : SigmaScheduler {
     }
 };
 
+/*
+ * Logit-Normal Scheduler
+ * Based on: https://github.com/ideogram-oss/ideogram4/blob/main/src/ideogram4/scheduler.py
+ */
+struct LogitNormalScheduler : SigmaScheduler {
+    float mean       = 0.0f;
+    float std        = 1.75f;
+    float logsnr_min = -15.0f;
+    float logsnr_max = 18.0f;
+
+    bool resolution_aware = true;
+
+    float one_minus_t_min, one_minus_t_max;
+
+    // Applies optional "key=value" overrides for mu, std, logsnr_min, logsnr_max and
+    // resolution_aware, then (when resolution_aware is set and image_seq_len > 0) shifts
+    // the mean by 0.5 * log(image_seq_len / known_seq_len).
+    // The shift is added to the current mean, so repeated calls accumulate.
+    void parse_extra_sample_args(int image_seq_len = 0, const char* extra_sample_args = nullptr) {
+        const int known_seq_len = (512 * 512) / (16 * 16);  // sequence length that yields a zero shift
+        if (extra_sample_args) {
+            for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "logit-normal scheduler arg")) {
+                // Each key matches at most one branch; keys not listed here are skipped silently.
+                bool parsed = true;
+                if (key == "mu") {
+                    parsed = parse_strict_float(value, mean);
+                } else if (key == "std") {
+                    parsed = parse_strict_float(value, std);
+                } else if (key == "logsnr_min") {
+                    parsed = parse_strict_float(value, logsnr_min);
+                } else if (key == "logsnr_max") {
+                    parsed = parse_strict_float(value, logsnr_max);
+                } else if (key == "resolution_aware") {
+                    parsed = parse_strict_bool(value, resolution_aware);
+                }
+                if (!parsed) {
+                    LOG_WARN("ignoring invalid logit-normal scheduler arg '%s=%s'", key.c_str(), value.c_str());
+                }
+            }
+        }
+        // `std::log` still names the namespace here: lookup before `::` ignores
+        // the data member that is also called `std`.
+        if (image_seq_len > 0 && resolution_aware) {
+            mean += 0.5 * std::log(static_cast<float>(image_seq_len) / static_cast<float>(known_seq_len));
+        }
+    }
+
+    float sigmoid(float x) {
+        return 1.0f / (std::exp(-x) + 1.0f);  // logistic function 1 / (1 + e^(-x))
+    }
+
+    // Explicit-parameter constructor; the logsnr defaults match the in-class initializers (-15 / 18).
+    // NOTE: the (int, const char*) overload is also callable with no arguments, so a bare
+    // `LogitNormalScheduler()` is ambiguous - pass at least one argument.
+    LogitNormalScheduler(float mean = 0.0f, float std = 1.75f, float logsnr_min = -15.0f, float logsnr_max = 18.0f)
+        : mean(mean), std(std), logsnr_min(logsnr_min), logsnr_max(logsnr_max) {
+        one_minus_t_min = sigmoid(0.5f * logsnr_max);  // 1 - t_min, t_min = 1 / (1 + exp(0.5 * logsnr_max))
+        one_minus_t_max = sigmoid(0.5f * logsnr_min);  // 1 - t_max, t_max = 1 / (1 + exp(0.5 * logsnr_min))
+    }
+
+    LogitNormalScheduler(int image_seq_len = 0, const char* extra_sample_args = nullptr) {
+        // mean, std, logsnr_min and logsnr_max start from their in-class
+        // initializers (0, 1.75, -15, 18); the parsed args may override them.
+        parse_extra_sample_args(image_seq_len, extra_sample_args);
+
+        // The clamp bounds are computed after parsing so that overrides of
+        // logsnr_min / logsnr_max are reflected in them.
+        // 1 - t_min, with t_min = 1.0f / (1.0f + std::exp(0.5f * logsnr_max))
+        one_minus_t_min = sigmoid(0.5f * logsnr_max);
+        // 1 - t_max, with t_max = 1.0f / (1.0f + std::exp(0.5f * logsnr_min))
+        one_minus_t_max = sigmoid(0.5f * logsnr_min);
+    }
+
+    // Normal quantile function (Acklam's approximation): https://stackedboxes.org/2017/05/01/acklams-normal-quantile-function/
+    double ndtri(double p) {
+        if (p <= 0.0) {  // p at or outside the ends of (0, 1) maps to -/+ infinity
+            return -std::numeric_limits<double>::infinity();
+        } else if (p >= 1.0) {
+            return std::numeric_limits<double>::infinity();
+        }
+
+        static const double p_low  = 0.02425;  // below p_low / above p_high the tail formula is used
+        static const double p_high = 1.0 - p_low;
+
+        static const double c[6] = {-7.784894002430293e-03,  // tail regions: numerator coefficients
+                                    -3.223964580411365e-01,
+                                    -2.400758277161838e+00,
+                                    -2.549732539343734e+00,
+                                    4.374664141464968e+00,
+                                    2.938163982698783e+00};
+
+        static const double d[5] = {7.784695709041462e-03,  // tail regions: denominator coefficients
+                                    3.224671290700398e-01,
+                                    2.445134137142996e+00,
+                                    3.754408661907416e+00,
+                                    1.0};
+
+        // Coefficients for the central region (a: numerator, b: denominator)
+        static const double a[6] = {-3.969683028665376e+01,
+                                    2.209460984245205e+02,
+                                    -2.759285104469687e+02,
+                                    1.383577518672690e+02,
+                                    -3.066479806614716e+01,
+                                    2.506628277459239e+00};
+
+        static const double b[6] = {-5.447609879822406e+01,
+                                    1.615858368580409e+02,
+                                    -1.556989798598866e+02,
+                                    6.680131188771972e+01,
+                                    -1.328068155288572e+01,
+                                    1.0};
+
+        double x = 0.0;
+
+        if (p < p_low) {
+            // Lower region
+            double q = std::sqrt(-2.0 * std::log(p));
+
+            // Numerator: c[0]*q^5 + c[1]*q^4 + ... + c[5]
+            double numerator = c[0];
+            for (int i = 1; i < 6; ++i) {
+                numerator = numerator * q + c[i];
+            }
+
+            // Denominator: d[0]*q^4 + d[1]*q^3 + ... + d[3]*q + 1
+            double denominator = d[0];
+            for (int i = 1; i < 5; ++i) {
+                denominator = denominator * q + d[i];
+            }
+
+            x = numerator / denominator;
+        } else if (p > p_high) {
+            // Upper region
+            double q = std::sqrt(-2.0 * std::log(1.0 - p));
+
+            double numerator = c[0];
+            for (int i = 1; i < 6; ++i) {
+                numerator = numerator * q + c[i];
+            }
+
+            double denominator = d[0];
+            for (int i = 1; i < 5; ++i) {
+                denominator = denominator * q + d[i];
+            }
+
+            x = -(numerator / denominator);  // same rational as the lower region, evaluated on 1 - p, sign flipped
+        } else {
+            // Central region
+            double q = p - 0.5;
+            double r = q * q;
+
+            // Numerator: (a[0]*r^5 + a[1]*r^4 + ... + a[5])*q
+            double numerator = a[0];
+            for (int i = 1; i < 6; ++i) {
+                numerator = numerator * r + a[i];
+            }
+            numerator *= q;
+
+            // Denominator: b[0]*r^5 + b[1]*r^4 + ... + b[4]*r + 1
+            double denominator = b[0];
+            for (int i = 1; i < 6; ++i) {
+                denominator = denominator * r + b[i];
+            }
+
+            x = numerator / denominator;
+        }
+        return x;
+    }
+
+    std::vector<float> get_sigmas(uint32_t n, float /*sigma_min*/, float /*sigma_max*/, t_to_sigma_t /*t_to_sigma*/) override {
+        std::vector<float> schedule;
+        LOG_INFO("LOGIT_NORMAL_SCHEDULER using mean=%.4f, std=%.4f, logsnr_min=%.4f, logsnr_max=%.4f", mean, std, logsnr_min, logsnr_max);
+        schedule.reserve(n + 1);
+        const float step_count = static_cast<float>(n);
+        for (uint32_t step = 0; step <= n; ++step) {
+            // Evenly spaced quantile: step / n.
+            const float quantile = static_cast<float>(step) / step_count;
+
+            // ndtri(1 - q) == -ndtri(q), so the negation evaluates the quantile function at 1 - q.
+            const float normal_sample = -ndtri(quantile);
+            // Logit-normal: pass the scaled and shifted normal value through the logistic function.
+            float sigma = sigmoid(mean + std * normal_sample);
+
+            // Clamp to [one_minus_t_max, one_minus_t_min]; the upper bound is applied first.
+            if (sigma > one_minus_t_min) {
+                sigma = one_minus_t_min;
+            }
+            if (sigma < one_minus_t_max) {
+                sigma = one_minus_t_max;
+            }
+            schedule.push_back(sigma);
+        }
+        schedule[n] = 0.0f;  // the last entry is always overwritten with exactly zero
+        return schedule;
+    }
+};
+
 struct Denoiser {
     virtual float sigma_min()                                                        = 0;
     virtual float sigma_max()                                                        = 0;
@@ -623,6 +820,11 @@ struct Denoiser {
                 LOG_INFO("get_sigmas with LTX2 scheduler");
                 scheduler = std::make_shared<LTX2Scheduler>(image_seq_len, extra_sample_args);
                 break;
+            case LOGIT_NORMAL_SCHEDULER: {
+                LOG_INFO("get_sigmas with Logit-Normal scheduler");
+                scheduler = std::make_shared<LogitNormalScheduler>(image_seq_len, extra_sample_args);
+                break;
+            }
             default:
                 LOG_INFO("get_sigmas with discrete scheduler (default)");
                 scheduler = std::make_shared<DiscreteScheduler>();
diff --git a/otherarch/sdcpp/src/stable-diffusion.cpp b/otherarch/sdcpp/src/stable-diffusion.cpp
index 2c0f53a37596..e951e414324a 100644
--- a/otherarch/sdcpp/src/stable-diffusion.cpp
+++ b/otherarch/sdcpp/src/stable-diffusion.cpp
@@ -28,6 +28,7 @@
 #include "model/diffusion/flux.hpp"
 #include "model/diffusion/hidream_o1.hpp"
 #include "model/diffusion/ideogram4.hpp"
+#include "model/diffusion/krea2.hpp"
 #include "model/diffusion/lens.hpp"
 #include "model/diffusion/ltxv.hpp"
 #include "model/diffusion/mmdit.hpp"
@@ -97,6 +98,7 @@ const char* model_version_to_str[] = {
     "Longcat-Image",
     "PiD",
     "Ideogram 4",
+    "Krea2",
     "ESRGAN",
 };
 
@@ -494,7 +496,8 @@ class StableDiffusionGGML {
         bool is_ltx = sd_version_is_ltxav(tempver);
         bool is_ideogram = sd_version_is_ideogram4(tempver);
         bool is_boogu = sd_version_is_boogu_image(tempver);
-        bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram || is_boogu);
+        bool is_krea2 = sd_version_is_krea2(tempver);
+        bool conditioner_is_llm = (is_qwenimg || iszimg || isflux2 || is_ovis || is_anima || is_ernie || is_longcat || is_lens || is_ltx || is_ideogram || is_boogu || is_krea2);
         bool has_llm_vision = (is_qwenimg || is_longcat || is_boogu);
 
         //kcpp qol fallback: if a llm was loaded as t5 by mistake
@@ -600,7 +603,7 @@ class StableDiffusionGGML {
             {
                 to_replace = "taesd_f2.embd";
             }
-            else if(is_wan21||is_qwenimg||sd_version_is_anima(tempver))
+            else if(is_wan21||is_qwenimg||is_anima||is_krea2)
             {
                 to_replace = "taesd_w21.embd";
             }
@@ -892,6 +895,17 @@ class StableDiffusionGGML {
                                                                                tensor_storage_map,
                                                                                "model.diffusion_model",
                                                                                model_manager);
+            } else if (sd_version_is_krea2(version)) {
+                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
+                                                                 tensor_storage_map,
+                                                                 version,
+                                                                 "",
+                                                                 false,
+                                                                 model_manager);
+                diffusion_model  = std::make_shared<Krea2::Krea2Runner>(backend_for(SDBackendModule::DIFFUSION),
+                                                                       tensor_storage_map,
+                                                                       "model.diffusion_model",
+                                                                       model_manager);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
                 for (auto pair : tensor_storage_map) {
@@ -1128,6 +1142,7 @@ class StableDiffusionGGML {
             auto create_tae = [&](bool decode_only) -> std::shared_ptr<VAE> {
                 if (sd_version_is_wan(version) ||
                     sd_version_is_qwen_image(version) ||
+                    sd_version_is_krea2(version) ||
                     sd_version_is_anima(version) ||
                     sd_version_is_ltxav(version)) {
                     return std::make_shared<TinyVideoAutoEncoder>(backend_for(SDBackendModule::VAE),
@@ -1168,6 +1183,7 @@ class StableDiffusionGGML {
                                                          model_manager);
                 } else if (sd_version_is_wan(version) ||
                            sd_version_is_qwen_image(version) ||
+                           sd_version_is_krea2(version) ||
                            sd_version_is_anima(version)) {
                     return std::make_shared<WAN::WanVAERunner>(backend_for(SDBackendModule::VAE),
                                                                tensor_storage_map,
@@ -1514,7 +1530,8 @@ class StableDiffusionGGML {
                 } else if (sd_version_is_flux(version) ||
                            sd_version_is_longcat(version) ||
                            sd_version_is_lens(version) ||
-                           sd_version_is_ltxav(version)) {
+                           sd_version_is_ltxav(version) ||
+                           sd_version_is_krea2(version)) {
                     pred_type = FLUX_FLOW_PRED;
 
                     default_flow_shift = 1.0f;  // TODO: validate
@@ -1530,6 +1547,8 @@ class StableDiffusionGGML {
                         default_flow_shift = 1.83f;
                     } else if (sd_version_is_ltxav(version)) {
                         default_flow_shift = 2.37f;
+                    } else if (sd_version_is_krea2(version)) {
+                        default_flow_shift = 1.15f;
                     }
                 } else if (sd_version_is_flux2(version)) {
                     pred_type = FLUX2_FLOW_PRED;
@@ -1990,7 +2009,7 @@ class StableDiffusionGGML {
                 } else if (sd_version_uses_flux_vae(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
                     latent_rgb_bias = flux_latent_rgb_bias;
-                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
+                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_krea2(version)) {
                     latent_rgb_proj = wan_21_latent_rgb_proj;
                     latent_rgb_bias = wan_21_latent_rgb_bias;
                 } else {
@@ -2818,6 +2837,7 @@ const char* scheduler_to_str[] = {
     "lcm",
     "bong_tangent",
     "ltx2",
+    "logit_normal",
 };
 
 const char* sd_scheduler_name(enum scheduler_t scheduler) {
@@ -3492,6 +3512,8 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me
         return SIMPLE_SCHEDULER;
     } else if (sd_ctx != nullptr && sd_ctx->sd != nullptr && sd_version_is_ltxav(sd_ctx->sd->version)) {
         return LTX2_SCHEDULER;
+    } else if(sd_ctx != nullptr && sd_ctx->sd != nullptr && sd_version_is_ideogram4(sd_ctx->sd->version)) {
+        return LOGIT_NORMAL_SCHEDULER;
     }
     return DISCRETE_SCHEDULER;
 }