Merged
74 commits
e17bb0c
feat(kv-cache): add dtype protocol utilities for KV cache
Dec 15, 2025
4f31a78
feat(kv-cache): add multi-dtype support (BF16/FP16/FP32/FP8) to KV ca…
Dec 15, 2025
4b2f61c
fix(test): fix import path in test_dllm_kv_cache_store
Dec 22, 2025
286bcf7
test(kv-cache): add FP8 roundtrip tests for unified and distinct layouts
Dec 22, 2025
7589e92
fix(attention): optimize shared memory usage in prefix_prefill kernel
Dec 22, 2025
29e0bd2
merge: merge origin/main into feat/kv-cache-fp8-support, preserve FP8…
Dec 24, 2025
fdd4bd5
feat(kv-cache): add FP8 KV cache support in model_runner
Dec 24, 2025
7496582
feat(attention): integrate FP8 KV cache support in attention layers
Dec 24, 2025
7080972
fix(attention): fix parameter passing in store_kvcache calls
Dec 24, 2025
c26b135
test: add memory usage and speed comparison tests for FP8 KV cache
Dec 24, 2025
835c47f
Merge pull request #13 from luozixin2/feat/kv-cache-fp8-support
luozixin2 Dec 24, 2025
c7ee24b
feat(mode): sdar inference supported, decoding kv cache slot mapping …
drewjin Dec 25, 2025
1741805
feat: add test suite and utility functions for flash attention kernel…
drewjin Dec 25, 2025
0d75af5
feat(kernel): update the page_table fetch logics of decoding_kernel f…
drewjin Dec 25, 2025
191e706
fix: dllm_flash_attn_decode_kernel recompilation problem fixed
drewjin Dec 27, 2025
d2507ac
fix: all attn kernels available for inference, checking functions ava…
drewjin Dec 28, 2025
c06b7ef
fix: fix kernel compilation error on Hopper devices via disabling TMA…
drewjin Dec 28, 2025
8434932
test: add test cases for multiround decoding
drewjin Dec 28, 2025
535e296
feat(strategy): create fast-dllm-v2 strategy
drewjin Dec 29, 2025
2828858
feat(fp8-kv): implement FP8 KV cache with Python dequantization
Dec 29, 2025
90a518b
update .gitignore
drewjin Dec 29, 2025
b97af94
feat(fp8-kv): implement FP8 distinct layout store and load
Dec 29, 2025
714f915
feat(sequence): add new sub-block statuses and attributes to FDV2SubB…
drewjin Dec 29, 2025
746be44
feat(attention): add wrapper functions for prefill/decode with quanti…
Dec 29, 2025
39c0d7e
chore: update GitHub workflows to grant write permissions for issues …
drewjin Dec 29, 2025
6b1f450
perf: optimize FP8 KV decode kernel using T.copy and scale fusion
Dec 29, 2025
6a951ea
Refactor quantization module architecture and rename FP8 kernels
Dec 30, 2025
820eac4
merge: merge origin/main into feat/kv-cache-fp8-support
Dec 30, 2025
39cb15e
Merge pull request #21 from luozixin2/feat/kv-cache-fp8-support
luozixin2 Dec 30, 2025
65edadd
feat: add Linear layer quantization strategy framework
Dec 31, 2025
fc32954
feat: implement W8A16 Linear quantization strategy (int8 weight + bf1…
Dec 31, 2025
266ea93
perf: implement lazy cache for W8A16 Linear quantization strategy
Dec 31, 2025
64e4347
feat: implement W8A16 TileLang kernel for Linear quantization
Dec 31, 2025
039693c
Merge branch 'main' into feat/fast-dllm-v2
drewjin Dec 31, 2025
ea47276
feat: add warmup mechanism and performance comparison to test_text_generation.py
Dec 31, 2025
9ba300d
feat: implement load-time quantization and memory-saving for W8A16 Li…
Dec 31, 2025
ca3007c
Optimize W8A16 and W4A16 kernels: move per-channel scale from weight …
Dec 31, 2025
833b32c
Improve W8A8/W4A8 quality by using FP16 scales instead of BF16
Jan 1, 2026
f9a9e1a
chore: update pyproject.toml to add pandas and tilelang dependencies,…
drewjin Jan 5, 2026
6055b39
Merge branch 'feat/fast-dllm-v2' into feat/enhance-strategy
drewjin Jan 5, 2026
ba2801a
feat: implement Diffulex benchmark framework with support for multipl…
drewjin Jan 5, 2026
47b5e9d
feat: add logging capabilities and configuration management to Difful…
drewjin Jan 5, 2026
5aa3bf4
chore: add make.bat into the build scripts of docs
drewjin Jan 5, 2026
50f803d
chore: add offline evaluation script and update tilelang dependency
drewjin Jan 5, 2026
2e03ca7
bugfix: fix config dataclass mutable default and field propagation in…
drewjin Jan 5, 2026
4c5d860
bugfix: _dp_child_entry missing decoding_strategy
drewjin Jan 5, 2026
15704df
feat: introduce Diffulex Profiler for performance analysis with modul…
drewjin Jan 5, 2026
7e65c0b
bugfix: try to fix profiler bug, upload and sync first
drewjin Jan 6, 2026
5b8352f
Merge pull request #18 from drewjin/feat/enhance-strategy
drewjin Jan 6, 2026
c74b14b
Remove AttnQ quantization strategy support
Jan 12, 2026
f8aa715
Merge remote-tracking branch 'fork/main' into feat/kv-cache-fp8-support
Jan 12, 2026
67686e0
Merge branch 'zhijie-group:feat/kv-cache-fp8-support' into feat/kv-ca…
luozixin2 Jan 12, 2026
0d9dd96
Merge branch 'zhijie-group:v0.0.1' into v0.0.1
luozixin2 Jan 12, 2026
44fca07
Merge remote-tracking branch 'fork/v0.0.1' into feat/kv-cache-fp8-sup…
Jan 12, 2026
b4a4ed1
fix: fix scale update logic in the FP8 KV cache RunningMax strategy
Jan 13, 2026
7b15d65
chore: remove the .cursor directory and add it to .gitignore
Jan 13, 2026
9015510
Merge commit '67686e0' into feat/kv-cache-fp8-support
Jan 13, 2026
426b314
feat: optimize W8A16 decode and FP8 KV varlen path
Jan 14, 2026
dde9962
feat: integrate Marlin/AllSpark INT8 W8A16 quantization strategy
Jan 16, 2026
4a6e365
Merge pull request #23 from luozixin2/feat/kv-cache-fp8-support
luozixin2 Jan 16, 2026
3ec5e80
feat: integrate Marlin/AllSpark INT8 W8A16 quantization strategy
Jan 16, 2026
55b8b4d
Merge pull request #25 from luozixin2/feat/kv-cache-fp8-support
luozixin2 Jan 16, 2026
a925717
feat: support GPTQ Marlin and AWQ Marlin quantization formats
Jan 18, 2026
16d7892
chore: remove benchmark_results from the repository
Jan 18, 2026
a594135
Upgrade quantize_model.py to a real GPTQ/AWQ quantization path
Jan 18, 2026
8824ccd
refactor: streamline code structure and eliminate duplicated logic
Jan 18, 2026
23d377a
fix: correct the prefill/decode throughput averaging in bench
Jan 24, 2026
896b8df
perf: optimize the quantized linear fast path and remove profiler annotations
Jan 25, 2026
f6d0fa2
refactor: remove CUDA Graph blockers and simplify linear quantization…
Jan 25, 2026
7fba595
perf: cache linear forward dispatch for CUDA Graph
Jan 26, 2026
0d51145
Fix static+CUDA Graph mode and add benchmark configs
Jan 27, 2026
8ea8717
chore: clean up experimental configs and environment-variable dependencies
Jan 28, 2026
600eb4c
Merge pull request #26 from luozixin2/feat/kv-cache-fp8-support
luozixin2 Jan 28, 2026
b2f66f4
Merge branch 'main' into feat/kv-cache-fp8-support
drewjin Feb 14, 2026
8 changes: 7 additions & 1 deletion .gitignore
@@ -37,6 +37,7 @@ autotuner.log
Fast-dLLM
Discrete-Diffusion-Forcing
position_explanation.md
temp/
cuda_cache/

# IDE
@@ -50,4 +51,9 @@ kernel_diff_analysis.md
tilelang_optimization_analysis.md
boundary_check_comparison.md
GITHUB_ISSUE.md
Tilelang-failed_test_cases/
Tilelang-failed_test_cases/
# Benchmark results
benchmark_results/
benchmark_results_tmp/
# Cursor IDE files
.cursor/
46 changes: 42 additions & 4 deletions diffulex/__init__.py
@@ -1,4 +1,42 @@
from diffulex.diffulex import Diffulex
from diffulex.sampling_params import SamplingParams
# Import strategies to trigger registration
from diffulex import strategy # noqa: F401
"""Diffulex package root.

Keep this module lightweight so that importing submodules like
`diffulex.utils.quantization` does not eagerly import the full engine/kernel.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
# These are available for type checkers; runtime import is lazy via __getattr__.
from diffulex.diffulex import Diffulex as Diffulex # noqa: F401
from diffulex.sampling_params import SamplingParams as SamplingParams # noqa: F401
from diffulex.logger import get_logger as get_logger, setup_logger as setup_logger, LoggerMixin as LoggerMixin # noqa: F401


def __getattr__(name: str):
if name == "Diffulex":
# Only trigger heavy side-effect imports when users actually construct the engine.
# This keeps `import diffulex.utils.quantization` lightweight.
from diffulex import strategy as _strategy # noqa: F401
from diffulex.diffulex import Diffulex

return Diffulex
if name == "SamplingParams":
from diffulex.sampling_params import SamplingParams

return SamplingParams
if name == "get_logger":
from diffulex.logger import get_logger
return get_logger
if name == "setup_logger":
from diffulex.logger import setup_logger
return setup_logger
if name == "LoggerMixin":
from diffulex.logger import LoggerMixin
return LoggerMixin
raise AttributeError(name)


__all__ = ["Diffulex", "SamplingParams", "get_logger", "setup_logger", "LoggerMixin"]
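
The new package root defers heavy imports behind a module-level __getattr__. A minimal usage sketch of the intended behavior, based only on the code above (not a verified run):

# Importing a submodule stays lightweight: the engine and kernels are not loaded yet.
import diffulex.utils.quantization as quant

# Accessing diffulex.Diffulex triggers __getattr__, which first imports diffulex.strategy
# (registering strategies as a side effect) and only then the engine class itself.
import diffulex
engine_cls = diffulex.Diffulex  # heavy imports happen here, on first attribute access
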
7 changes: 5 additions & 2 deletions diffulex/attention/__init__.py
@@ -17,8 +17,11 @@ def __repr__(self):
def __getattr__(name):
"""Lazy import to avoid circular deps during module init."""
if name == "Attention":
from .attn_impl import Attention
return Attention
try:
from .attn_impl import Attention
return Attention
except Exception as e:
raise ImportError(f"Failed to import diffulex.attention.attn_impl.Attention: {e}")
if name == "fetch_attn_metadata":
return metadata.fetch_attn_metadata
raise AttributeError(f"module {__name__} has no attribute {name}")
46 changes: 45 additions & 1 deletion diffulex/attention/attn_impl.py
@@ -25,6 +25,9 @@ def __init__(
self.scale = scale
self.num_kv_heads = num_kv_heads
self.k_cache = self.v_cache = torch.tensor([])
# Quantization scales (will be bound by ModelRunner if strategy requires them)
self.k_scale = None
self.v_scale = None

self.q_shape = {
'nh': self.num_heads,
@@ -53,6 +56,21 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
# Fast Store KV cache
if k_cache.numel() and v_cache.numel():
if attn_metadata.need_kv_cache_store:
# Update scales if quantization strategy requires them
if self.k_scale is not None and self.v_scale is not None:
from diffulex.utils.quantization.context import get_kv_cache_strategy
strategy = get_kv_cache_strategy()
if strategy is not None:
self.k_scale, self.v_scale = strategy.update_scales(
k, v, self.k_scale, self.v_scale,
self.num_kv_heads, k.device
)
# Pass scale to metadata if required by strategy
if strategy is not None:
strategy.maybe_set_attn_metadata_scales(
attn_metadata, k_scale=self.k_scale, v_scale=self.v_scale
)

store_kvcache = store_kvcache_unified_layout if is_unified_layout else store_kvcache_distinct_layout
store_kvcache(k, v, k_cache, v_cache, attn_metadata.slot_mapping, attn_metadata)

@@ -64,9 +82,35 @@
o = dllm_flash_attn_prefill(q, k, v, self.scale, attn_metadata)
else:
if is_unified_layout:
from diffulex.utils.quantization.context import get_kv_cache_strategy
strategy = get_kv_cache_strategy()
if strategy is not None:
# e.g. FP8: pass scales to metadata for kernel / load_kvcache to handle
strategy.maybe_set_attn_metadata_scales(
attn_metadata, k_scale=self.k_scale, v_scale=self.v_scale
)

o = dllm_flash_attn_decode(q, k, v, k_cache, v_cache, self.scale, attn_metadata)
else:
raise NotImplementedError("Distinct layout is not supported yet...")
# Distinct layout: use varlen mode with load_kvcache
from diffulex_kernel import load_kvcache
from diffulex.utils.quantization.context import get_kv_cache_strategy
strategy = get_kv_cache_strategy()
if strategy is not None:
# e.g. FP8: pass scales to metadata for load_kvcache to handle
strategy.maybe_set_attn_metadata_scales(
attn_metadata, k_scale=self.k_scale, v_scale=self.v_scale
)

# Distinct layout uses varlen mode
k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v)
from flash_attn import flash_attn_varlen_func
o = flash_attn_varlen_func(
q, k_comb, v_comb,
attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k,
attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k,
softmax_scale=self.scale, block_table=None
)

# Final reshape
return rearrange(o, 's nh hd -> s (nh hd)').contiguous()
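
The attention layer only touches the quantization strategy through the two calls visible above: update_scales(...) and maybe_set_attn_metadata_scales(...). Below is a minimal sketch of what a running-max FP8 strategy returned by get_kv_cache_strategy() could look like; the method names and call signatures are taken from the call sites in this diff, while the body (per-head absmax scaled to the FP8 E4M3 range) is an assumption, not the repository's actual implementation:

import torch

class RunningMaxFP8KVCacheStrategy:
    FP8_E4M3_MAX = 448.0  # assumed representable range for torch.float8_e4m3fn

    def update_scales(self, k, v, k_scale, v_scale, num_kv_heads, device):
        # Assumed layout: k, v are [num_tokens, num_kv_heads, head_dim];
        # scales are per-head tensors of shape [num_kv_heads].
        k_max = k.abs().amax(dim=(0, 2)).to(torch.float32)
        v_max = v.abs().amax(dim=(0, 2)).to(torch.float32)
        if k_scale is None or v_scale is None:
            k_scale = torch.ones(num_kv_heads, dtype=torch.float32, device=device)
            v_scale = torch.ones(num_kv_heads, dtype=torch.float32, device=device)
        # Running max: scales only grow, so previously stored FP8 blocks remain decodable.
        k_scale = torch.maximum(k_scale, k_max / self.FP8_E4M3_MAX)
        v_scale = torch.maximum(v_scale, v_max / self.FP8_E4M3_MAX)
        return k_scale, v_scale

    def maybe_set_attn_metadata_scales(self, attn_metadata, k_scale=None, v_scale=None):
        # Expose the scales to store/load kernels via the AttnMetaDataBase fields added in this PR.
        attn_metadata.k_scale = k_scale
        attn_metadata.v_scale = v_scale
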
3 changes: 3 additions & 0 deletions diffulex/attention/metadata.py
@@ -18,6 +18,9 @@ class AttnMetaDataBase:
attn_type: str = "block_attention"
diffusion_block_size: int = 32
decode_mode: str = "static"
k_scale: torch.Tensor | None = None # Quantization scale for K cache, shape [num_kv_heads]
v_scale: torch.Tensor | None = None # Quantization scale for V cache, shape [num_kv_heads]
q_scale: torch.Tensor | None = None # Quantization scale for Q, strategy-defined shape (e.g. [num_heads] or [1])

@property
def num_seqs(self) -> int:
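
These three fields carry per-head scales from the attention layer to the cache store/load path. A short sketch of how a Python-side dequantization helper (in the spirit of the "FP8 KV cache with Python dequantization" commit) could consume them; this is a hypothetical helper, not the repository's kernel:

import torch

def dequantize_kv(k_cache_fp8, v_cache_fp8, attn_metadata, out_dtype=torch.bfloat16):
    # k_cache_fp8 / v_cache_fp8 assumed shaped [..., num_kv_heads, head_dim];
    # k_scale / v_scale have shape [num_kv_heads] and broadcast over head_dim.
    k = k_cache_fp8.to(out_dtype) * attn_metadata.k_scale.to(out_dtype)[:, None]
    v = v_cache_fp8.to(out_dtype) * attn_metadata.v_scale.to(out_dtype)[:, None]
    return k, v
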
36 changes: 32 additions & 4 deletions diffulex/config.py
@@ -1,7 +1,10 @@
import os

from dataclasses import dataclass
from dataclasses import dataclass, field
from transformers import AutoConfig
from diffulex.logger import get_logger

logger = get_logger(__name__)


@dataclass
@@ -31,9 +34,10 @@ class Config:
master_addr: str = "localhost"
master_port: int = 2333
# Shared memory segment name for intra-TP RPC; must be unique per DP group.
shm_name: str = "diffuserve_shm"
shm_name: str = "diffulex_shm"
# Start device index for this TP group (set by DP launcher).
device_start: int = 0
device_ids: list[int] = field(default_factory=lambda: [])

enforce_eager: bool = False
hf_config: AutoConfig | None = None
@@ -42,6 +46,23 @@ class Config:
num_kvcache_blocks: int = -1
k_cache_hdim_split_factor_x: int = 8
kv_cache_layout: str = "unified" # "unified" or "distinct"
kv_cache_dtype: str = "bf16" # "bf16", "fp16", "fp32", "fp8_e4m3", "fp8_e5m2"
decode_mode: str | None = None # "static" or "varlen", None means auto-select based on kv_cache_dtype
# Attention-Q dtype (activation quantization). "bf16" default; "fp8" is a placeholder
# for future kernels (enabling it will currently raise NotImplementedError at runtime).
attn_q_dtype: str = "bf16"
# Linear quantization (weights + activations). All are placeholders for future kernels.
# Use "bf16" to disable quantization.
# Supported aliases (normalized in registry): bf16/int8/int4/fp8/fp8_e4m3/fp8_e5m2/gptq/awq.
linear_attn_weight_dtype: str = "bf16"
linear_mlp_weight_dtype: str = "bf16"
linear_attn_act_dtype: str = "bf16"
linear_mlp_act_dtype: str = "bf16"

# Kernel tuning knobs (avoid environment-variable based tuning in library code).
# Currently used by some W8A16 linear strategies.
linear_w8a16_quant_block_n: int = 256
linear_w8a16_allspark_cublas_m_threshold: int = 256

def __post_init__(self):
assert os.path.isdir(self.model)
@@ -56,9 +77,16 @@ def __post_init__(self):
if not self.lora_path:
raise ValueError("lora_path must be provided when use_lora is True")
if not os.path.exists(self.lora_path):
print(f"Warning: LoRA path {self.lora_path} does not exist")
logger.warning(f"LoRA path {self.lora_path} does not exist")

self.hf_config = AutoConfig.from_pretrained(self.model, trust_remote_code=True)
cfg_max_model_len = self.hf_config.max_position_embeddings if hasattr(self.hf_config, "max_position_embeddings") else self.hf_config.max_sequence_length
self.max_model_len = min(self.max_model_len, cfg_max_model_len)
assert self.max_num_batched_tokens >= self.max_model_len
assert self.max_num_batched_tokens >= self.max_model_len

if not self.device_ids:
import torch
# When CUDA_VISIBLE_DEVICES is set, PyTorch maps physical devices to logical device 0, 1, ...
# So we should use logical device indices (0, 1, ...) instead of physical device IDs
self.device_ids = list(range(torch.cuda.device_count()))
logger.info(f"Using CUDA devices: {self.device_ids}")
4 changes: 2 additions & 2 deletions diffulex/diffulex.py
@@ -4,7 +4,7 @@

class Diffulex:
def __new__(cls, model, **kwargs):
cfg = Config(model, **{k: v for k, v in kwargs.items() if k in Config.__dataclass_fields__.keys()})
if cfg.data_parallel_size > 1:
data_parallel_size = kwargs.get('data_parallel_size', 1)
if data_parallel_size > 1:
return DiffulexDPWorker(model, **kwargs)
return DiffulexTPWorker(model, **kwargs)
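
With this change, Diffulex.__new__ dispatches on the raw data_parallel_size kwarg instead of constructing a throwaway Config first. A small usage sketch of the dispatch, with placeholder path and device IDs:

from diffulex import Diffulex

# data_parallel_size > 1 returns a DiffulexDPWorker, which spawns one TP worker per DP rank;
# device_ids is the optional override consumed by DiffulexDPWorker (see dp_worker.py below).
engine = Diffulex(
    "/path/to/model",           # placeholder checkpoint directory
    data_parallel_size=2,
    tensor_parallel_size=2,
    device_ids=[0, 1, 2, 3],    # 2 DP groups x 2 TP ranks
)
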
23 changes: 16 additions & 7 deletions diffulex/engine/dp_worker.py
@@ -15,6 +15,9 @@
from diffulex.config import Config
from diffulex.engine.tp_worker import DiffulexTPWorker
from diffulex.sampling_params import SamplingParams
from diffulex.logger import get_logger

logger = get_logger(__name__)


def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn):
@@ -25,11 +28,12 @@ def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn)
faulthandler.enable(all_threads=True)
except Exception:
pass
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(x) for x in local_devices)
# os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(x) for x in local_devices)
cfg = Config(
model=config.model,
lora_path=config.lora_path,
model_name=config.model_name,
decoding_strategy=config.decoding_strategy,
mask_token_id=config.mask_token_id,
diffusion_block_size=config.diffusion_block_size,
accept_threshold=config.accept_threshold,
@@ -52,6 +56,7 @@ def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn)
kv_cache_layout=config.kv_cache_layout,
)
setattr(cfg, "device_start", 0)
setattr(cfg, "device_ids", local_devices)

engine = DiffulexTPWorker(cfg.model, **{k: getattr(cfg, k) for k in cfg.__dataclass_fields__.keys() if k != "model"})

@@ -81,17 +86,23 @@ def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn)
else:
conn.send(("err", f"unknown_cmd:{cmd}"))
except Exception as e:
# Include full traceback for easier debugging and also print to stderr as a fallback.
# Include full traceback for easier debugging and also log as a fallback.
tb = traceback.format_exc()
msg = f"{type(e).__name__}: {e}\n{tb}"
try:
conn.send(("err", msg))
except Exception:
pass
try:
print(f"[DP Child {dp_idx}] Unhandled exception:\n{msg}", file=sys.stderr, flush=True)
# Use logger for error reporting
child_logger = get_logger(f"diffulex.engine.dp_worker.child_{dp_idx}")
child_logger.error(f"[DP Child {dp_idx}] Unhandled exception:\n{msg}")
except Exception:
pass
# Final fallback to stderr
try:
print(f"[DP Child {dp_idx}] Unhandled exception:\n{msg}", file=sys.stderr, flush=True)
except Exception:
pass


class DiffulexDPWorker:
Expand All @@ -116,12 +127,10 @@ def __init__(self, model, **kwargs):
need_gpus = self.dp_size * cfg.tensor_parallel_size
assert len(vis) >= need_gpus, f"Require {need_gpus} GPUs (dp={self.dp_size}, tp={cfg.tensor_parallel_size}), visible {len(vis)}"

# Optional overrides: kwargs['device_ids'] or env D2F_DEVICE_MAP
# Optional overrides: kwargs['device_ids']
override = None
if 'device_ids' in kwargs and kwargs['device_ids']:
override = list(kwargs['device_ids'])
elif os.environ.get('D2F_DEVICE_MAP'):
override = [int(x) for x in os.environ['D2F_DEVICE_MAP'].split(',') if x.strip() != '']
if override is not None:
assert len(override) >= need_gpus, f"device_ids length {len(override)} < required {need_gpus}"
# All override devices must be in visible list