From f8deb06523b9a44931e8cd684519888724365a45 Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Wed, 29 Apr 2026 00:35:52 +0000
Subject: [PATCH 1/2] Add Laguna expert defusion support

---
 defuser/model_registry.py         |  3 ++
 pyproject.toml                    |  2 +-
 tests/test_convert_model.py       | 79 +++++++++++++++++++++++++++++++
 tests/test_meta_model_defusion.py | 14 ++++++
 4 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/defuser/model_registry.py b/defuser/model_registry.py
index 697e7bf..37d919f 100644
--- a/defuser/model_registry.py
+++ b/defuser/model_registry.py
@@ -195,6 +195,9 @@ class PATCH(str, Enum):
     "jetmoe": {
         "min_transformers_version": MIN_SUPPORTED_TRANSFORMERS_VERSION,
     },
+    "laguna": {
+        "min_transformers_version": MIN_SUPPORTED_TRANSFORMERS_VERSION,
+    },
     "llama4": {
         "min_transformers_version": MIN_SUPPORTED_TRANSFORMERS_VERSION,
         PATCH.EXPERTS_DEFUSE: [
diff --git a/pyproject.toml b/pyproject.toml
index 7b87412..50ae60b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "Defuser"
-version = "0.0.20"
+version = "0.0.21"
 description = "Model defuser helper for HF Transformers."
 readme = "README.md"
 requires-python = ">=3.9"
diff --git a/tests/test_convert_model.py b/tests/test_convert_model.py
index 3cfbdbf..19b40da 100644
--- a/tests/test_convert_model.py
+++ b/tests/test_convert_model.py
@@ -17,6 +17,7 @@
 )
 from transformers.models.glm4v.configuration_glm4v import Glm4vConfig
 from transformers.models.glm4v.modeling_glm4v import Glm4vForConditionalGeneration
+from transformers.models.laguna.modeling_laguna import LagunaConfig, LagunaForCausalLM
 from transformers.models.mixtral.configuration_mixtral import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM, MixtralSparseMoeBlock
 from transformers.models.qwen2_moe.modeling_qwen2_moe import (
@@ -255,6 +256,29 @@
 )
 
 
+def _tiny_laguna_config():
+    return LagunaConfig(
+        vocab_size=128,
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_attention_heads_per_layer=[4, 4],
+        num_key_value_heads=1,
+        head_dim=16,
+        num_experts=4,
+        num_experts_per_tok=2,
+        moe_intermediate_size=32,
+        shared_expert_intermediate_size=32,
+        mlp_layer_types=["dense", "sparse"],
+        layer_types=["full_attention", "full_attention"],
+        hidden_act="silu",
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+    )
+
+
 def _tiny_llama4_config():
     return Llama4Config(
         text_config={
@@ -1173,6 +1197,61 @@ def test_gpt_oss_split_forward_matches_fused_math():
     # The split experts path should exactly reproduce the original fused experts math.
     torch.testing.assert_close(actual, expected)
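+
+
+# NOTE: the Laguna tests below assume the fused expert layout
+#   gate_up_proj: [num_experts, 2 * moe_intermediate_size, hidden_size]
+#   down_proj:    [num_experts, hidden_size, moe_intermediate_size]
+# (inferred from the slicing in test_laguna rather than a documented
+# transformers contract); expert 0 is sliced out of the fused tensors and
+# compared against the defused per-expert nn.Linear weights.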
+def test_laguna():
+    from transformers.models.laguna.modeling_laguna import LagunaExperts
+
+    model = LagunaForCausalLM(_tiny_laguna_config())
+    assert model.config.model_type == "laguna"
+
+    original_moe_block = model.model.layers[1].mlp
+    experts = original_moe_block.experts
+    assert isinstance(experts, LagunaExperts)
+
+    hidden_dim = experts.gate_up_proj.shape[-1]
+    intermediate_dim = experts.gate_up_proj.shape[1] // 2
+
+    expected_gate = experts.gate_up_proj[0, :intermediate_dim, :hidden_dim].contiguous().clone()
+    expected_up = experts.gate_up_proj[0, intermediate_dim:, :hidden_dim].contiguous().clone()
+    expected_down = experts.down_proj[0, :hidden_dim, :intermediate_dim].contiguous().clone()
+
+    converted = convert_model(model, cleanup_original=False, max_layers=2)
+    assert converted
+
+    experts = model.model.layers[1].mlp.experts
+    _assert_unfused_expert_module(experts)
+    expert0 = getattr(experts, "0")
+
+    materialize_model(model.model.layers[1])
+
+    torch.testing.assert_close(expert0.gate_proj.weight, expected_gate)
+    torch.testing.assert_close(expert0.up_proj.weight, expected_up)
+    torch.testing.assert_close(expert0.down_proj.weight, expected_down)
+
+
+def test_laguna_split_forward_matches_fused_math():
+    model = LagunaForCausalLM(_tiny_laguna_config())
+    fused_experts = model.model.layers[1].mlp.experts
+
+    hidden_states = torch.randn(5, model.config.hidden_size, dtype=torch.float32)
+    top_k_index = torch.zeros((hidden_states.size(0), 1), dtype=torch.long)
+    top_k_weights = torch.ones((hidden_states.size(0), 1), dtype=hidden_states.dtype)
+
+    with torch.no_grad():
+        expected = fused_experts(hidden_states, top_k_index, top_k_weights)
+
+    converted = convert_model(model, cleanup_original=False, max_layers=2)
+    assert converted
+
+    split_experts = model.model.layers[1].mlp.experts
+    _assert_unfused_expert_module(split_experts)
+    materialize_model(model.model.layers[1])
+    with torch.no_grad():
+        actual = split_experts(hidden_states, top_k_index, top_k_weights)
+
+    torch.testing.assert_close(actual, expected)
+
+
 def test_llama4():
     from transformers.models.llama4.modeling_llama4 import Llama4TextMoe
diff --git a/tests/test_meta_model_defusion.py b/tests/test_meta_model_defusion.py
index b8a29ef..d65e747 100644
--- a/tests/test_meta_model_defusion.py
+++ b/tests/test_meta_model_defusion.py
@@ -173,6 +173,10 @@ def _build_model_config(case: dict):
     elif model_type == "lfm2_moe":
         config.layer_types = ["full_attention", "short_conv"]
         config.num_dense_layers = 0
+    elif model_type == "laguna":
+        config.layer_types = ["full_attention"] * config.num_hidden_layers
+        config.mlp_layer_types = ["dense"] + ["sparse"] * (config.num_hidden_layers - 1)
+        config.num_attention_heads_per_layer = [config.num_attention_heads] * config.num_hidden_layers
     elif model_type == "qwen3_omni_moe":
         config.enable_audio_output = True
         config.talker_config.spatial_merge_size = 2
@@ -537,6 +541,16 @@ def _validate_defused_module(case: dict, module) -> None:
         "validator": "parallel",
         "min_targets": 4,
     },
+    {
+        "model_type": "laguna",
+        "mode": "convert",
+        "model_module": "transformers.models.laguna.modeling_laguna",
+        "model_class": "LagunaForCausalLM",
+        "config_module": "transformers.models.laguna.configuration_laguna",
+        "config_class": "LagunaConfig",
+        "target_class_paths": ("transformers.models.laguna.modeling_laguna.LagunaExperts",),
+        "validator": "experts",
+    },
     {
         "model_type": "lfm2_moe",
         "mode": "convert",
From 35be5a1bb4a98cd684519888724365a47 Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Wed, 29 Apr 2026 00:53:20 +0000
Subject: [PATCH 2/2] Document Laguna support and stabilize meta tests

---
 README.md                         | 60 +++++++++++++++----------------
 tests/test_meta_model_defusion.py | 24 ++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index a37f60b..e72113a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 image
 
-Defuser
 
+Defuser 🔧
 
@@ -11,47 +11,47 @@

-Defuser converts select Hugging Face Transformers `5.3.0+` fused or stacked MoE and MLP blocks back into plain, per-expert `nn.Linear` modules. It keeps the forward math intact while exposing individual projections again so quantizers, activation capture, debugging hooks, and checkpoint tooling can work against a simple module layout instead of fused expert tensors.
+🧩 Defuser converts select Hugging Face Transformers `5.3.0+` fused or stacked MoE and MLP blocks back into plain, per-expert `nn.Linear` modules. It keeps the forward math intact while exposing individual projections again so quantizers, activation capture, debugging hooks, and checkpoint tooling can work against a simple module layout instead of fused expert tensors.
 
-Defuser is designed and CI-tested for `transformers>=5.3.0`, and support is only offered for that version range.
+✅ Defuser is designed and CI-tested for `transformers>=5.3.0`, and support is only offered for that version range.
 
-## Purpose
+## 🎯 Purpose
 
 Defuser exists for cases where newer Transformers modeling code optimizes model structure in ways that are good for runtime, but harder for tooling that needs direct access to individual projections.
 
 Depending on the model family, Defuser can:
 
-- patch a supported model class before load so HF instantiates a defused block directly
-- split fused tensors such as `gate_up_proj` into `gate_proj` + `up_proj`
-- convert 3D expert tensors, including registered expert buffers, into numbered expert `nn.Linear` modules
-- preserve the original fused math while presenting a naive module structure again
+- 🧵 patch a supported model class before load so HF instantiates a defused block directly
+- ✂️ split fused tensors such as `gate_up_proj` into `gate_proj` + `up_proj`
+- 🧱 convert 3D expert tensors, including registered expert buffers, into numbered expert `nn.Linear` modules
+- 🧮 preserve the original fused math while presenting a naive module structure again
 
-Public API:
+🛠️ Public API:
 
 ```python
 from defuser import convert_model, replace_fused_blocks
 ```
 
-- `replace_fused_blocks(model_type)` patches supported HF model classes before `from_pretrained()` or direct model construction.
-- `convert_model(model, cleanup_original=False, max_layers=None, filter=None)` converts an already loaded model in place. This is the runtime defusion path for supported post-load expert and MLP conversions, including `qwen3_5_moe` style checkpoints.
-- Defuser is designed and CI-tested for `transformers>=5.3.0`, and support is only offered for that version range. Older versions log a warning on these public APIs and are skipped as unsupported.
-- Some model families appear in both support tables. Full models can be prepatched with `replace_fused_blocks(...)`, while standalone fused expert modules from those same families can still be runtime-defused with `convert_model(...)`.
+- 🧰 `replace_fused_blocks(model_type)` patches supported HF model classes before `from_pretrained()` or direct model construction.
+- 🔄 `convert_model(model, cleanup_original=False, max_layers=None, filter=None)` converts an already loaded model in place. This is the runtime defusion path for supported post-load expert and MLP conversions, including `qwen3_5_moe` style checkpoints.
+- 🧪 Defuser is designed and CI-tested for `transformers>=5.3.0`, and support is only offered for that version range. Older versions log a warning on these public APIs and are skipped as unsupported.
+- 🧭 Some model families appear in both support tables. Full models can be prepatched with `replace_fused_blocks(...)`, while standalone fused expert modules from those same families can still be runtime-defused with `convert_model(...)`.
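+
+As a minimal sketch of how the two entry points compose (the checkpoint id below is a placeholder, not a tested model):
+
+```python
+from transformers import AutoModelForCausalLM
+
+from defuser import convert_model, replace_fused_blocks
+
+# Path 1 - before load: patch the supported model classes so from_pretrained()
+# constructs defused per-expert nn.Linear blocks directly.
+replace_fused_blocks("glm4_moe")
+model = AutoModelForCausalLM.from_pretrained("org/any-glm4-moe-checkpoint")
+
+# Path 2 - after load: runtime-defuse an already loaded model in place;
+# convert_model returns True when at least one block was defused.
+converted = convert_model(model, cleanup_original=False)
+```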
 
 `filter` is an optional list of PCRE regex rules evaluated against full module paths such as `model.layers.0.mlp.experts`:
 
-- `+:regex` explicitly includes matching candidate module paths
-- `-:regex` explicitly excludes matching candidate module paths
-- `regex` is shorthand for `+:regex`
-- negative rules take priority over positive rules
-- when `filter` is provided, a candidate module is defused only if it matches at least one positive rule and no negative rules
+- ✅ `+:regex` explicitly includes matching candidate module paths
+- 🚫 `-:regex` explicitly excludes matching candidate module paths
+- ➕ `regex` is shorthand for `+:regex`
+- 🛡️ negative rules take priority over positive rules
+- 🎯 when `filter` is provided, a candidate module is defused only if it matches at least one positive rule and no negative rules
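+
+For instance, in this minimal sketch (assuming `model` is an already loaded supported MoE model), every experts block except layer 0 is defused, because the negative rule outranks the positive one wherever both match:
+
+```python
+from defuser import convert_model
+
+convert_model(
+    model,
+    filter=[
+        r"+:model\.layers\.\d+\.mlp\.experts",
+        r"-:model\.layers\.0\.mlp\.experts",
+    ],
+)
+```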
 
-## Supported Models
+## ✅ Supported Models
 
 Defuser currently supports the following `transformers>=5.3.0` `model_type` values.
 
-### `replace_fused_blocks(model_type)` before load
+### 🧰 `replace_fused_blocks(model_type)` before load
 
-| Model type | Defused op performed |
+| Model type | Defused op performed ⚙️ |
 | --- | --- |
 | `glm4_moe` | Replaces `Glm4MoeMoE` with a defused per-expert linear MoE block. |
 | `glm4_moe_lite` | Replaces `Glm4MoeLiteMoE` with a defused per-expert linear MoE block. |
@@ -62,11 +62,11 @@ Defuser currently supports the following `transformers>=5.3.0` `model_type` valu
 | `qwen3_next` | Replaces `Qwen3NextSparseMoeBlock` with a defused per-expert linear MoE block. |
 | `qwen3_omni_moe` | Replaces both thinker and talker text sparse MoE blocks with defused per-expert linear blocks and applies small runtime compatibility patches for text `forward()` and `generate()`. |
 
-### `convert_model(model)` after load
+### 🔄 `convert_model(model)` after load
 
-| Pattern | Supported model types | Defused op performed |
+| Pattern | Supported model types | Defused op performed ⚙️ |
 | --- | --- | --- |
-| Standard routed expert tensors | `deepseek_v2`, `dots1`, `ernie4_5_moe`, `ernie4_5_vl_moe`, `exaone_moe`, `flex_olmo`, `glm4_moe_lite`, `glm4v_moe`, `hunyuan_v1_moe`, `jamba`, `lfm2_moe`, `minimax`, `minimax_m2`, `olmoe`, `qwen3_vl_moe`, `solar_open` | Splits fused expert tensors or registered expert buffers into numbered expert `nn.Linear` modules with per-expert `gate_proj`, `up_proj`, and `down_proj`. |
+| Standard routed expert tensors 🧱 | `deepseek_v2`, `dots1`, `ernie4_5_moe`, `ernie4_5_vl_moe`, `exaone_moe`, `flex_olmo`, `glm4_moe_lite`, `glm4v_moe`, `hunyuan_v1_moe`, `jamba`, `laguna`, `lfm2_moe`, `minimax`, `minimax_m2`, `olmoe`, `qwen3_vl_moe`, `solar_open` | Splits fused expert tensors or registered expert buffers into numbered expert `nn.Linear` modules with per-expert `gate_proj`, `up_proj`, and `down_proj`. |
 | Mixed sparse and shared experts | `deepseek_v3`, `glm_moe_dsa`, `qwen3_5_moe`, `qwen3_5_moe_text` | Runtime expert tensor defusion for routed experts while preserving the model's shared-expert path. |
 | Transposed or packed expert tensors | `gpt_oss`, `phimoe` | Splits transposed fused expert `gate_up_proj` tensors into per-expert `gate_proj` + `up_proj`, preserves expert bias when present, and converts expert tensors into numbered expert `nn.Linear` modules. |
 | Flattened expert layout | `dbrx` | Rebuilds the flattened DBRX expert FFN weights into numbered expert `gate_proj`, `up_proj`, and `down_proj` `nn.Linear` modules. |
@@ -76,7 +76,7 @@ Defuser currently supports the following `transformers>=5.3.0` `model_type` valu
 | Routed experts with identity experts | `longcat_flash` | Defuses routed experts into numbered `gate_proj`, `up_proj`, and `down_proj` modules and preserves zero or identity experts. |
 | Fused dense `gate_up_proj` MLPs | `dia`, `glm`, `glm4`, `glm_image`, `glm_ocr`, `phi3`, `phi4_multimodal`, `zamba2` | Splits fused dense `gate_up_proj` layers into `gate_proj` + `up_proj` and updates the block `forward()` to preserve the original MLP math. |
 
-## Workflow Summary
+## 🔁 Workflow Summary
 
 Use `replace_fused_blocks()` for model families that Defuser can patch before load:
 
@@ -103,7 +103,7 @@ print(converted)  # True when runtime defusion happened
 
 `convert_model(model)` also preserves meta-device construction for supported meta-initialized models, so structural validation can run without materializing weights.
 
-Use `filter` when only specific blocks should be defused:
+Use `filter` when only specific blocks should be defused 🎯:
 
 ```python
 from defuser import convert_model
 
 convert_model(
 )
 ```
@@ -117,11 +117,11 @@
-## Real Qwen3.5 MoE Example
+## 🧪 Real Qwen3.5 MoE Example
 
 The example below is written for the `transformers==5.3.0` public API surface and uses the real Hugging Face model `Qwen/Qwen3.5-35B-A3B-Instruct`. Defuser supports `transformers>=5.3.0`.
 
-### Fused Weights Before And After
+### 🔬 Fused Weights Before And After
 
 Before `convert_model(model)`:
 
@@ -149,7 +149,7 @@ After `convert_model(model)`:
 +-----------------------------------------------------------------+--------------------------------------+
 ```
 
-### Sample 1: Inspect The Conversion In Place
+### 🧭 Sample 1: Inspect The Conversion In Place
 
 ```python
 from defuser import convert_model
@@ -187,7 +187,7 @@ print(after[:6])
 # ]
 
-### Sample 2: Convert And Keep Using The Model Normally
+### 🚀 Sample 2: Convert And Keep Using The Model Normally
 
 ```python
 import torch
diff --git a/tests/test_meta_model_defusion.py b/tests/test_meta_model_defusion.py
index d65e747..aa88b87 100644
--- a/tests/test_meta_model_defusion.py
+++ b/tests/test_meta_model_defusion.py
@@ -167,9 +167,20 @@ def _build_model_config(case: dict):
     elif model_type == "glm_moe_dsa":
         config.mlp_layer_types = ["sparse"] * config.num_hidden_layers
     elif model_type == "granitemoehybrid":
-        config.layer_types = ["attention", "mamba"]
+        # Keep this meta-structure test on the attention path. The mamba path
+        # lazy-loads optional hub kernels during construction, which is outside
+        # the Defuser behavior being validated here.
+        config.layer_types = ["attention", "attention"]
         config.shared_intermediate_size = 64
         config.mamba_n_heads = 8
+    elif model_type == "jamba":
+        # Keep this meta-structure test on the attention path. The mamba path
+        # lazy-loads optional hub kernels during construction, which is outside
+        # the Defuser behavior being validated here.
+        config.attn_layer_period = 1
+        config.attn_layer_offset = 0
+        config.expert_layer_period = 1
+        config.expert_layer_offset = 0
     elif model_type == "lfm2_moe":
         config.layer_types = ["full_attention", "short_conv"]
         config.num_dense_layers = 0
@@ -177,6 +188,11 @@ def _build_model_config(case: dict):
     elif model_type == "laguna":
         config.layer_types = ["full_attention"] * config.num_hidden_layers
         config.mlp_layer_types = ["dense"] + ["sparse"] * (config.num_hidden_layers - 1)
         config.num_attention_heads_per_layer = [config.num_attention_heads] * config.num_hidden_layers
+    elif model_type == "nemotron_h":
+        # Keep this meta-structure test on MoE blocks. Mamba blocks lazy-load
+        # optional hub kernels during construction, which is outside the
+        # Defuser behavior being validated here.
+        config.layers_block_type = ["moe"] * config.num_hidden_layers
     elif model_type == "qwen3_omni_moe":
         config.enable_audio_output = True
         config.talker_config.spatial_merge_size = 2
@@ -192,6 +208,12 @@ def _build_model_config(case: dict):
             num_key_value_heads=1,
             head_dim=16,
         )
+    elif model_type == "zamba2":
+        # Zamba2 always constructs Mamba layers, even for hybrid blocks. Keep
+        # the optional hub kernels disabled while preserving hybrid MLP targets.
+        config.use_mamba_kernels = False
+        config.layers_block_type = ["hybrid"] * config.num_hidden_layers
+        config.hybrid_layer_ids = list(range(config.num_hidden_layers))
     return config