Merged
Changes from all commits (78 commits)
dee1db7
Optimize CPU RAM peak memory during quantization
lvliang-intel Feb 3, 2026
459ee8a
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 3, 2026
2a78a18
rm duplicate args of the quantization extra config (#1334)
WeiweiZhang1 Feb 3, 2026
e00c176
fix --device_map cuda and xpu issue (#1383)
wenhuach21 Feb 3, 2026
d6d9f77
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2026
ca55ae8
refine test case
lvliang-intel Feb 3, 2026
7a3dcac
Disable replace `FP8Expert` (#1379)
yiliu30 Feb 3, 2026
082bf4c
Support general MOE replacement for MOE models (Transformers 5.0 comp…
lvliang-intel Feb 3, 2026
dd45c31
fix cuda ut fail (#1370)
n1ck-guo Feb 4, 2026
10028e8
[Regression] Detach scale tensor to prevent holding computation graph…
xin3he Feb 4, 2026
b2dff81
fix layer config (#1373)
wenhuach21 Feb 4, 2026
5614894
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 4, 2026
a041da8
update code for comments
lvliang-intel Feb 4, 2026
e62a708
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2026
5dcd064
support AutoScheme cpu ram optimization
lvliang-intel Feb 9, 2026
10f0a4a
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 9, 2026
13c140b
Refactor evaluation in tests to use evaluate_accuracy function (#1402)
xin3he Feb 9, 2026
4daa325
fix bug in PR-1244 (#1422)
xin3he Feb 10, 2026
b32e9a1
remove require_intel_extension_for_pytorch and fix TypeError: unhasha…
xin3he Feb 10, 2026
1e41b94
Fix cuda model ut [glm4][Molmo] (#1428)
Kaihui-intel Feb 10, 2026
accef77
Update BackendInfos for AutoGPTQ based on transformer version and sav…
xin3he Feb 10, 2026
b7a7a30
fix CUDA CI (#1431)
xin3he Feb 11, 2026
b9e925f
refine global scale calculation to blockwise (#1421)
WeiweiZhang1 Feb 11, 2026
3464711
support GPTQ_FORMAT for "gptqmodel:exllamav2" backend (#1434)
xin3he Feb 11, 2026
564f9a1
Update diffusion README (#1439)
mengniwang95 Feb 12, 2026
8a45c86
Enable new cpu test pool (#1435)
chensuyue Feb 12, 2026
5b98860
Update auto-round-lib README.md (#1437)
chensuyue Feb 12, 2026
b2a7b7f
update logic for compatibility with gptqmodel:exllamav2 backend (#1438)
xin3he Feb 12, 2026
60fe3d4
update readme for Intel xpu usage (#1441)
chensuyue Feb 12, 2026
48294a2
fix bug of evaluate_accuracy (#1430)
xin3he Feb 12, 2026
4f8f355
bump version (#1445)
chensuyue Feb 12, 2026
32d3a30
Attach `act_max_hook` for FP8 model (#1447)
yiliu30 Feb 12, 2026
359a4a7
[Regression] fix FP8_STATIC loading (#1452)
xin3he Feb 13, 2026
98351f9
Fix requirements packaging for source distribution (#1455)
timkpaine Feb 14, 2026
b619386
fix: preserve classmethod descriptor in from_pretrained monkey patch …
yiliu30 Feb 25, 2026
a04a665
Support load FP8 model on HPU (#1449)
yiliu30 Feb 25, 2026
5baa6e7
Update compatibility test and version to 0.10.2 (#1463)
XuehaoSun Feb 25, 2026
3694470
support multiple device evaluation for activation quantized model (#1…
wenhuach21 Feb 25, 2026
2203d36
Fix KeyError when GPU is missing from accelerate max_memory (#1457)
lvliang-intel Feb 25, 2026
d6ef4ce
support glm5 (#1466)
wenhuach21 Feb 26, 2026
27a6422
Disable replace `FP8Expert` (#1379)
yiliu30 Feb 3, 2026
4a133f2
fix layer config (#1373)
lvliang-intel Feb 27, 2026
e497f4d
update code for comments
lvliang-intel Feb 4, 2026
6066d83
fix issue 1368
lvliang-intel Feb 27, 2026
581b4ae
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 27, 2026
88c60ec
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 27, 2026
471931f
refactor code according to comments
lvliang-intel Mar 2, 2026
05d06e1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 2, 2026
341240c
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 2, 2026
5c71a83
refactor code according to comments
lvliang-intel Mar 2, 2026
137b759
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 2, 2026
3b81809
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 2, 2026
1a4a9c6
Grep core dumped issue in UT test (#1484)
chensuyue Mar 3, 2026
2de46ac
gguf better support for transformers5.0 and fix bug of Qwen3Next (#1474)
n1ck-guo Mar 3, 2026
57664fb
Optimize CPU RAM peak memory during quantization
lvliang-intel Feb 3, 2026
454933c
Support general MOE replacement for MOE models (Transformers 5.0 comp…
lvliang-intel Mar 3, 2026
30593f1
fix cuda ut fail (#1370)
n1ck-guo Feb 4, 2026
484d7a9
update code for comments
lvliang-intel Feb 4, 2026
1d4d6ec
Update BackendInfos for AutoGPTQ based on transformer version and sav…
xin3he Feb 10, 2026
cfdce02
refactor code according to comments
lvliang-intel Mar 2, 2026
337965f
refine code
lvliang-intel Mar 3, 2026
1fd7a1e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
7e202d7
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 3, 2026
0928e8e
fix merge issue
lvliang-intel Mar 3, 2026
06f5c85
fix merge issue
lvliang-intel Mar 3, 2026
a199843
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
89a44e8
Merge branch 'lvl/ram_usage_optimization' of https://github.com/intel…
lvliang-intel Mar 3, 2026
dd2e35a
fix ci issues
lvliang-intel Mar 4, 2026
a088346
refactor offload manager
lvliang-intel Mar 4, 2026
6c92c30
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2026
fa186ff
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 4, 2026
31c4176
Update base.py
lvliang-intel Mar 4, 2026
e7c606b
update code for comments
lvliang-intel Mar 4, 2026
1190935
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 4, 2026
1c0911e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2026
cce2dcd
fix ci issues
lvliang-intel Mar 4, 2026
72f8433
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 4, 2026
876fa66
Merge branch 'main' of https://github.com/intel/auto-round into lvl/r…
lvliang-intel Mar 10, 2026
48 changes: 46 additions & 2 deletions auto_round/auto_scheme/delta_loss.py
@@ -15,7 +15,7 @@
import copy
from dataclasses import asdict
from functools import wraps
from typing import Iterable, Union
from typing import Iterable, Optional, Union

import torch
from accelerate import dispatch_model
@@ -55,6 +55,8 @@
set_non_auto_device_map,
to_device,
)
from auto_round.utils.device import MemoryMonitor
from auto_round.utils.offload import OffloadManager
from auto_round.wrapper import WrapperLinear

__all__ = ["gen_layer_config"]
@@ -488,6 +490,7 @@ def get_score_for_scheme(
major_device="cpu",
batch_size=1,
disable_opt_rtn=True,
offload_context: Optional[OffloadManager] = None,
):
scores_dict = {} # Key=name,Val=[quant_total_bits, loss]
for n, m in model.named_modules():
@@ -506,6 +509,8 @@
break

for name in quant_layer_names:
if offload_context is not None:
offload_context.ensure_loaded(model, name)
if name in fixed_layer_scheme.keys():
continue
m = get_module(model, name)
@@ -547,6 +552,8 @@
disable_opt_rtn=disable_opt_rtn,
)
set_module(model, name, new_m)
if offload_context is not None:
offload_context.flush_loaded(model)
if low_gpu_mem_usage:
dataloader = get_dataloader(tokenizer, seqlen, dataset_name=dataset, seed=42, bs=batch_size, nsamples=nsamples)

@@ -658,6 +665,21 @@ def _gen_layer_config(
major_device="cpu",
device_list=None,
):
# Initialize memory tracking for AutoScheme
memory_monitor = MemoryMonitor()
memory_monitor.reset()
memory_monitor.update_cpu()

# Create offload context for CPU RAM optimization
# Note: low_cpu_mem_usage only works when low_gpu_mem_usage is also enabled,
# because it requires layer-by-layer processing
offload_context = None
if auto_scheme.low_cpu_mem_usage and auto_scheme.low_gpu_mem_usage:
_model_dir = model_name
if _model_dir is None and hasattr(model, "config"):
_model_dir = getattr(model.config, "_name_or_path", None)
offload_context = OffloadManager(enabled=True, mode="clean", model_dir=_model_dir, cache_numel=True)

target_bits = auto_scheme.avg_bits
model.eval()
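The memory-tracking calls above (`reset()`, `update_cpu()`, `log_summary()`) only sample and report peak CPU usage; they do not change quantization behavior. A minimal stand-in sketch of that interface, assuming an RSS-based monitor (the real `auto_round.utils.device.MemoryMonitor` may measure differently):

```python
import psutil  # assumption: the real MemoryMonitor may not use psutil


class SimpleMemoryMonitor:
    """Illustrative stand-in: sample the process RSS at checkpoints and keep the peak."""

    def __init__(self):
        self.reset()

    def reset(self):
        self._proc = psutil.Process()
        self.peak_cpu_gb = 0.0

    def update_cpu(self):
        rss_gb = self._proc.memory_info().rss / 1024**3
        self.peak_cpu_gb = max(self.peak_cpu_gb, rss_gb)

    def log_summary(self, tag="memory summary"):
        print(f"{tag}: peak CPU RSS ~= {self.peak_cpu_gb:.2f} GB")
```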

@@ -762,6 +784,11 @@ def check_bf16_scheme(scheme):
cal_imatrix(model, dataloader)
logger.info("finish calculating imatrix")

# Register hooks and clear all block weights before the scheme loop.
# Hooks will transparently reload weights on demand during forward passes.
if offload_context is not None:
offload_context.add_offload_hooks(model, block_name)

pbar = tqdm(total=pbar_cnt, desc="Generating AutoScheme")
for index, scheme in enumerate(schemes):
apply_quant_scheme(
@@ -792,7 +819,11 @@
major_device=major_device,
batch_size=batch_size,
disable_opt_rtn=auto_scheme.disable_opt_rtn,
offload_context=offload_context,
)
# Track peak RAM after each scheme scoring
memory_monitor.update_cpu()

new_scores = {}
for share_layer in shared_layers:
param_bits = 0
@@ -817,10 +848,17 @@ def check_bf16_scheme(scheme):
options_scores.append(options_total_loss)
clear_memory(device_list=device_list)

# Remove hooks and restore original weights from disk for final bit-budget computations
if offload_context is not None:
offload_context.remove_offload_hooks(model, block_name)

total_params = 0
for n, m in model.named_modules():
if n in quant_layer_names + embedding_layers_names:
total_params += m.weight.numel()
n_param = m.weight.numel()
if n_param == 0 and hasattr(m, "_cached_weight_numel"):
n_param = m._cached_weight_numel
total_params += n_param

target_params_cnt = int(total_params * target_bits)
sorted_indices = sorted(range(len(options_scores)), key=lambda i: options_scores[i])
@@ -902,6 +940,12 @@ def check_bf16_scheme(scheme):
global last_grad_input
last_grad_input = None
clear_memory(device_list=device_list)

# Log AutoScheme memory usage
memory_monitor.update_cpu()
low_cpu_str = "enabled" if auto_scheme.low_cpu_mem_usage else "disabled"
memory_monitor.log_summary(f"AutoScheme complete (low_cpu_mem_usage={low_cpu_str})")

pbar.close()
return layer_config

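The offload context used in the hunks above (`ensure_loaded`, `flush_loaded`, `add_offload_hooks`, `remove_offload_hooks`) is what keeps only the layer currently being scored resident in RAM while the scheme loop runs. A simplified sketch of the idea, which stashes cleared weights in an in-process dict instead of reloading them from the checkpoint on disk as the real `OffloadManager` does:

```python
import torch
from torch import nn


class SimpleOffloadContext:
    """Illustrative stand-in for OffloadManager in 'clean' mode: block weights are
    replaced by empty tensors, and forward pre-hooks reload them on demand."""

    def __init__(self):
        self._stash = {}    # name -> weight; the real impl reloads from the model dir on disk
        self._handles = []

    def add_offload_hooks(self, model: nn.Module, block_names: list[str]) -> None:
        for block_name in block_names:
            for sub_name, module in model.get_submodule(block_name).named_modules():
                if getattr(module, "weight", None) is None:
                    continue
                full_name = f"{block_name}.{sub_name}" if sub_name else block_name
                # Cache numel so bit-budget math still works once the weight is cleared.
                module._cached_weight_numel = module.weight.numel()
                self._stash[full_name] = module.weight.data
                module.weight.data = module.weight.data.new_empty(0)
                self._handles.append(module.register_forward_pre_hook(self._reload_hook(full_name)))

    def _reload_hook(self, full_name: str):
        def hook(module, args):
            if module.weight.numel() == 0:  # cleared -> bring the weight back for this forward
                module.weight.data = self._stash[full_name]
        return hook

    def ensure_loaded(self, model: nn.Module, name: str) -> None:
        module = model.get_submodule(name)
        if name in self._stash and module.weight.numel() == 0:
            module.weight.data = self._stash[name]

    def flush_loaded(self, model: nn.Module) -> None:
        # Drop reloaded weights again so peak RAM stays bounded by one layer at a time.
        for name in self._stash:
            module = model.get_submodule(name)
            if module.weight.numel() != 0:
                module.weight.data = module.weight.data.new_empty(0)

    def remove_offload_hooks(self, model: nn.Module, block_names: list[str]) -> None:
        for handle in self._handles:
            handle.remove()
        self._handles.clear()
        for name, weight in self._stash.items():
            model.get_submodule(name).weight.data = weight
        self._stash.clear()
```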
1 change: 1 addition & 0 deletions auto_round/auto_scheme/gen_auto_scheme.py
@@ -40,6 +40,7 @@ class AutoScheme:
enable_torch_compile: Optional[bool] = None
disable_opt_rtn: bool = True
low_gpu_mem_usage: bool = True
low_cpu_mem_usage: bool = True

def __post_init__(self):
if isinstance(self.options, str):
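With the new field, callers can opt in or out of the CPU-RAM optimization during scheme search. A hypothetical construction (only the fields visible in this diff are assumed to exist; the option string and import path are illustrative), noting that per the comment in `_gen_layer_config`, `low_cpu_mem_usage` only takes effect when `low_gpu_mem_usage` is also enabled:

```python
from auto_round.auto_scheme.gen_auto_scheme import AutoScheme  # assumed import path

scheme = AutoScheme(
    options="W4A16",          # illustrative option string; real values depend on the library
    avg_bits=4.0,             # target average weight bits
    low_gpu_mem_usage=True,   # required for the CPU-RAM optimization to apply
    low_cpu_mem_usage=True,   # new flag from this PR: offload block weights during the search
)
```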
13 changes: 11 additions & 2 deletions auto_round/auto_scheme/utils.py
@@ -105,7 +105,10 @@ def compute_avg_bits_for_scheme(
# continue
if not hasattr(module, "weight"):
continue
total_params += module.weight.numel()
n_param = module.weight.numel()
if n_param == 0 and hasattr(module, "_cached_weight_numel"):
n_param = module._cached_weight_numel
total_params += n_param
layer_bits, _ = compute_layer_bits(module, ignore_scale_zp_bits)
total_quantized_bits += layer_bits
avg_bits = float(total_quantized_bits) / total_params
@@ -133,7 +136,10 @@ def compute_avg_bits_for_model(model: torch.nn.Module, ignore_scale_zp_bits: boo
continue
if not hasattr(module, "weight"):
continue
total_params += module.weight.numel()
n_param = module.weight.numel()
if n_param == 0 and hasattr(module, "_cached_weight_numel"):
n_param = module._cached_weight_numel
total_params += n_param
layer_bits, _ = compute_layer_bits(module, ignore_scale_zp_bits)
total_quantized_bits += layer_bits

@@ -157,6 +163,9 @@ def compute_layer_bits(
"""
weight = layer.weight
n_param = weight.numel()
# Use cached numel when weight has been cleared to an empty tensor (low_cpu_mem_usage offload)
if n_param == 0 and hasattr(layer, "_cached_weight_numel"):
n_param = layer._cached_weight_numel
weight_bits = getattr(layer, "bits", 16)
group_size = getattr(layer, "group_size", 128)
data_type = getattr(layer, "data_type", "int")
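The repeated `_cached_weight_numel` fallback exists because, with `low_cpu_mem_usage` offloading, a layer's weight may already have been replaced by an empty tensor by the time average bits are computed. A small self-contained sketch of the pattern (the helper names here are illustrative, not part of the library):

```python
import torch
from torch import nn


def clear_weight_keep_numel(module: nn.Module) -> None:
    # Mirror of what the offload path does: remember the element count, then free the weight.
    module._cached_weight_numel = module.weight.numel()
    module.weight.data = module.weight.data.new_empty(0)


def weight_numel(module: nn.Module) -> int:
    # Fallback used by the avg-bits helpers: prefer the live tensor, else the cached count.
    n = module.weight.numel()
    if n == 0 and hasattr(module, "_cached_weight_numel"):
        n = module._cached_weight_numel
    return n


layer = nn.Linear(4096, 4096, bias=False)
clear_weight_keep_numel(layer)
assert weight_numel(layer) == 4096 * 4096  # bit-budget math is unaffected by the cleared weight
```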
88 changes: 70 additions & 18 deletions auto_round/compressors/base.py
@@ -14,6 +14,7 @@

import copy
import os
import re
import sys
import time
import traceback
@@ -113,6 +114,7 @@
set_non_auto_device_map,
)
from auto_round.utils.distributed import setup_ddp_if_needed_
from auto_round.utils.offload import OffloadManager
from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block

SERIALIZATION_KEYS = (
@@ -371,6 +373,7 @@ def __init__(
self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
self.scale_dtype = convert_dtype_str2torch(scale_dtype)
self.low_cpu_mem_usage = low_cpu_mem_usage
self._offloader = OffloadManager(enabled=low_cpu_mem_usage, mode="offload", offload_dir_prefix="compressor")

if kwargs:
logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")
@@ -1259,6 +1262,10 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
m = get_module(self.model, name)
m.to("cpu")
shard_writer(self, m, name, False)
# Free RAM immediately: the data is now in the shard-writer buffer
# (and will be flushed to disk). Keeping it also in the model tree
# causes linear RAM growth for large models.
m.to("meta")

def _immediate_pack(self, name: str):
if not self.is_immediate_packing:
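The new `m.to("meta")` calls free CPU RAM as soon as a layer's tensors have been handed to the shard writer: parameters on the meta device keep their shape and dtype (so later bookkeeping still works) but hold no storage. A standalone illustration of that effect, independent of the shard writer:

```python
import torch
from torch import nn

layer = nn.Linear(8192, 8192, bias=False)              # ~256 MB of fp32 weight data
buffered = {"weight": layer.weight.detach().clone()}   # stand-in for the shard-writer buffer
layer.to("meta")                                        # releases the storage held by the model tree
print(layer.weight.is_meta, layer.weight.numel())       # True 67108864 -> shape metadata survives
```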
@@ -1331,8 +1338,16 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
for handle in hook_handles:
handle.remove()
else:
# By default, we go with layer-wise way if no replacement happened
# By default, we go with layer-wise way if no replacement happened.
# In RTN mode (iters == 0), force blockwise quantization to avoid
# full-model materialization and linear CPU RAM growth.
use_blockwise_quantization = global_state.replaced_module_count > 0
if self.iters == 0 and not use_blockwise_quantization:
logger.info(
"RTN mode detected (iters=0): force blockwise quantization to avoid "
"layer-wise full-model materialization."
)
use_blockwise_quantization = True
tied_weights_keys = getattr(self.model, "_tied_weights_keys", [])
if tied_weights_keys is None:
tied_weights_keys = []
@@ -1370,8 +1385,22 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
and self.is_immediate_saving
):
set_module(self.model, m.global_name, copy.deepcopy(m))
shard_writer(self, name=m.global_name)
if self.is_immediate_saving:
shard_writer(self, name=m.global_name)
copied_m = get_module(self.model, m.global_name)
copied_m.to("meta")
m.to("meta")
# Move remaining GPU tensors to CPU; offload to disk if low_cpu_mem_usage.
# This mirrors _quantize_via_rtn_blockwise's post-block cleanup.
if not self.is_immediate_saving:
mv_module_from_gpu(block)
else:
# Save once at block scope to capture tensors that are not saved
# in per-layer branch (e.g., custom module-level params/buffers).
shard_writer(self, name=block_name)
block.to("meta")
if self.low_cpu_mem_usage and not self.is_immediate_saving:
self._offloader.offload(self.model, block_name)
clear_memory(device_list=self.device_list)
memory_monitor.log_summary()
pbar.update(1)
@@ -1413,6 +1442,8 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:

# Convert remaining fp8
convert_module_to_hp_if_necessary(self.model, self.amp_dtype, self.device)
if self.low_cpu_mem_usage:
self._offloader.reload(self.model)
if self.is_immediate_saving:
shard_writer(self, is_finalize=True)

@@ -1531,9 +1562,9 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage)
all_to_quantized_module_names.remove(m.global_name)

if not self.is_immediate_saving:
# some modules may have been flushed and set to meta, so we could not move to gpu
mv_module_from_gpu(block)
mv_module_from_gpu(block)
if self.low_cpu_mem_usage and not self.is_immediate_saving:
self._offloader.offload(self.model, block_name)
if block_name == block_names[-1]:
clear_memory(input_ids, device_list=self.device_list)
else:
@@ -1644,14 +1675,18 @@ def _adjust_immediate_packing_and_saving(self):
self.is_immediate_saving = False

if self.low_cpu_mem_usage and self.is_immediate_packing:
if self.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0:
if formats[0].is_gguf():
logger.warning(
"`low_cpu_mem_usage` is not fully supported "
"when there are quantized layers outside blocks and optimized RTN is disabled. "
"`low_cpu_mem_usage` is not fully supported for gguf format. "
"Setting `low_cpu_mem_usage` to False."
)
self.low_cpu_mem_usage = False
self.is_immediate_saving = False
elif self.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0:
logger.info(
"Keeping `low_cpu_mem_usage` enabled in RTN mode (iters=0): "
"RTN path uses blockwise quantization and supports per-block offloading."
)
elif self.has_qlayer_outside_block and self.iters > 0:
logger.warning(
"`low_cpu_mem_usage` is not fully supported "
@@ -1660,12 +1695,6 @@ def _adjust_immediate_packing_and_saving(self):
)
self.low_cpu_mem_usage = False
self.is_immediate_saving = False
elif formats[0].is_gguf():
logger.warning(
"`low_cpu_mem_usage` is not fully supported for gguf format" "Setting `low_cpu_mem_usage `to False."
)
self.low_cpu_mem_usage = False
self.is_immediate_saving = False

if self.is_immediate_saving and "int" not in self.data_type:
logger.warning("immediate_saving is only supported for int quantization, set to False")
@@ -1700,6 +1729,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:

self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed)

if self.low_cpu_mem_usage:
self._offloader.reset()

def _should_disable_inplace_due_to_layers_outside_block() -> bool:
return self.has_qlayer_outside_block and (self.iters != 0 or (self.iters == 0 and not self.disable_opt_rtn))

@@ -1747,11 +1779,15 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool:
all_q_inputs = self.try_cache_inter_data_gpucpu(
all_first_block_names, self.nsamples, layer_names=layer_names
)
self.model = safe_device_move_with_meta_handling(self.model, "cpu")
clear_memory(device_list=self.device_list)
# Remove accelerate dispatch hooks before moving parameters.
# hf_device_map is kept for reference but hooks are no longer needed.
if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1:
accelerate.hooks.remove_hook_from_submodules(self.model) # self.model.hf_device_map has not been changed
accelerate.hooks.remove_hook_from_submodules(self.model)
self.model = mv_module_from_gpu(self.model)
clear_memory(device_list=self.device_list)
logger.info("caching done")
if self.low_cpu_mem_usage:
self._offloader.offload(self.model, all_blocks, clear_memory=True, device_list=self.device_list)
if len(all_blocks) > 1:
pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks))
else:
@@ -1797,6 +1833,9 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool:
if self.is_immediate_saving:
shard_writer(self, is_finalize=True)

if self.low_cpu_mem_usage:
self._offloader.reload(self.model)

end_time = time.time()
cost_time = end_time - start_time
logger.info(f"quantization tuning time {cost_time}")
@@ -2923,7 +2962,7 @@ def _quantize_block(
if hook_handles:
self._get_block_outputs(
block,
q_input,
q_input if q_input is not None else input_ids,
input_others,
self.batch_size * self.infer_bs_coeff,
device,
@@ -3201,6 +3240,12 @@ def _quantize_blocks(
modules = [get_module(model, n) for n in names]
m = WrapperMultiblock(modules)

if self.low_cpu_mem_usage:
if nblocks == 1:
self._offloader.reload(model, n)
else:
self._offloader.reload(model, names)

m.config = model.config if hasattr(model, "config") else None
q_input, input_ids = self._quantize_block(
m,
@@ -3219,6 +3264,13 @@

if self.is_immediate_saving:
shard_writer(self, m, is_finalize=False)

if self.low_cpu_mem_usage and not self.is_immediate_saving:
if nblocks == 1:
self._offloader.offload(model, n, overwrite=True)
else:
for name in names:
self._offloader.offload(model, name, overwrite=True)
if pbar is not None:
pbar.update(1)

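Putting the `_quantize_blocks` changes together: when `low_cpu_mem_usage` is enabled, each block is reloaded from disk just before it is tuned and offloaded again right after, so only one block's weights (plus cached activations) are resident in RAM at any time. A compressed sketch of that loop shape, using the `reload`/`offload` signatures visible in this diff (the wrapper function itself is illustrative, not part of the library):

```python
def quantize_blocks_low_cpu_mem(offloader, model, block_names, quantize_block,
                                low_cpu_mem_usage=True, immediate_saving=False):
    """Illustrative loop shape: reload -> quantize -> offload, one block at a time."""
    for block_name in block_names:
        if low_cpu_mem_usage:
            offloader.reload(model, block_name)        # bring this block's weights back from disk
        block = model.get_submodule(block_name)
        quantize_block(block)                           # stand-in for the per-block tuning step
        if low_cpu_mem_usage and not immediate_saving:
            # persist the now-quantized weights and drop them from RAM again
            offloader.offload(model, block_name, overwrite=True)
```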