Merged
Changes from all commits (78 commits)
dee1db7
Optimize CPU RAM peak memory during quantization
lvliang-intel Feb 3, 2026
459ee8a
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 3, 2026
2a78a18
rm duplicate args of the quantization extra config (#1334)
WeiweiZhang1 Feb 3, 2026
e00c176
fix --device_map cuda and xpu issue (#1383)
wenhuach21 Feb 3, 2026
d6d9f77
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2026
ca55ae8
refine test case
lvliang-intel Feb 3, 2026
7a3dcac
Disable replace `FP8Expert` (#1379)
yiliu30 Feb 3, 2026
082bf4c
Support general MOE replacement for MOE models (Transformers 5.0 comp…
lvliang-intel Feb 3, 2026
dd45c31
fix cuda ut fail (#1370)
n1ck-guo Feb 4, 2026
10028e8
[Regression] Detach scale tensor to prevent holding computation graph…
xin3he Feb 4, 2026
b2dff81
fix layer config (#1373)
wenhuach21 Feb 4, 2026
5614894
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 4, 2026
a041da8
update code for comments
lvliang-intel Feb 4, 2026
e62a708
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2026
5dcd064
support AutoScheme cpu ram optimization
lvliang-intel Feb 9, 2026
10f0a4a
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 9, 2026
13c140b
Refactor evaluation in tests to use evaluate_accuracy function (#1402)
xin3he Feb 9, 2026
4daa325
fix bug in PR-1244 (#1422)
xin3he Feb 10, 2026
b32e9a1
remove require_intel_extension_for_pytorch and fix TypeError: unhasha…
xin3he Feb 10, 2026
1e41b94
Fix cuda model ut [glm4][Molmo] (#1428)
Kaihui-intel Feb 10, 2026
accef77
Update BackendInfos for AutoGPTQ based on transformer version and sav…
xin3he Feb 10, 2026
b7a7a30
fix CUDA CI (#1431)
xin3he Feb 11, 2026
b9e925f
refine global scale calculation to blockwise (#1421)
WeiweiZhang1 Feb 11, 2026
3464711
support GPTQ_FORMAT for "gptqmodel:exllamav2" backend (#1434)
xin3he Feb 11, 2026
564f9a1
Update diffusion README (#1439)
mengniwang95 Feb 12, 2026
8a45c86
Enable new cpu test pool (#1435)
chensuyue Feb 12, 2026
5b98860
Update auto-round-lib README.md (#1437)
chensuyue Feb 12, 2026
b2a7b7f
update logic for compatibility with gptqmodel:exllamav2 backend (#1438)
xin3he Feb 12, 2026
60fe3d4
update readme for Intel xpu usage (#1441)
chensuyue Feb 12, 2026
48294a2
fix bug of evaluate_accuracy (#1430)
xin3he Feb 12, 2026
4f8f355
bump version (#1445)
chensuyue Feb 12, 2026
32d3a30
Attach `act_max_hook` for FP8 model (#1447)
yiliu30 Feb 12, 2026
359a4a7
[Regression] fix FP8_STATIC loading (#1452)
xin3he Feb 13, 2026
98351f9
Fix requirements packaging for source distribution (#1455)
timkpaine Feb 14, 2026
b619386
fix: preserve classmethod descriptor in from_pretrained monkey patch …
yiliu30 Feb 25, 2026
a04a665
Support load FP8 model on HPU (#1449)
yiliu30 Feb 25, 2026
5baa6e7
Update compatibility test and version to 0.10.2 (#1463)
XuehaoSun Feb 25, 2026
3694470
support multiple device evaluation for activation quantized model (#1…
wenhuach21 Feb 25, 2026
2203d36
Fix KeyError when GPU is missing from accelerate max_memory (#1457)
lvliang-intel Feb 25, 2026
d6ef4ce
support glm5 (#1466)
wenhuach21 Feb 26, 2026
27a6422
Disable replace `FP8Expert` (#1379)
yiliu30 Feb 3, 2026
4a133f2
fix layer config (#1373)
lvliang-intel Feb 27, 2026
e497f4d
update code for comments
lvliang-intel Feb 4, 2026
6066d83
fix issue 1368
lvliang-intel Feb 27, 2026
581b4ae
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Feb 27, 2026
88c60ec
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 27, 2026
471931f
refactor code according to comments
lvliang-intel Mar 2, 2026
05d06e1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 2, 2026
341240c
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 2, 2026
5c71a83
refactor code according to comments
lvliang-intel Mar 2, 2026
137b759
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 2, 2026
3b81809
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 2, 2026
1a4a9c6
Grep core dumped issue in UT test (#1484)
chensuyue Mar 3, 2026
2de46ac
gguf better support for transformers5.0 and fix bug of Qwen3Next (#1474)
n1ck-guo Mar 3, 2026
57664fb
Optimize CPU RAM peak memory during quantization
lvliang-intel Feb 3, 2026
454933c
Support general MOE replacement for MOE models (Transformers 5.0 comp…
lvliang-intel Mar 3, 2026
30593f1
fix cuda ut fail (#1370)
n1ck-guo Feb 4, 2026
484d7a9
update code for comments
lvliang-intel Feb 4, 2026
1d4d6ec
Update BackendInfos for AutoGPTQ based on transformer version and sav…
xin3he Feb 10, 2026
cfdce02
refactor code according to comments
lvliang-intel Mar 2, 2026
337965f
refine code
lvliang-intel Mar 3, 2026
1fd7a1e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
7e202d7
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 3, 2026
0928e8e
fix merge issue
lvliang-intel Mar 3, 2026
06f5c85
fix merge issue
lvliang-intel Mar 3, 2026
a199843
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
89a44e8
Merge branch 'lvl/ram_usage_optimization' of https://github.com/intel…
lvliang-intel Mar 3, 2026
dd2e35a
fix ci issues
lvliang-intel Mar 4, 2026
a088346
refactor offload manager
lvliang-intel Mar 4, 2026
6c92c30
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2026
fa186ff
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 4, 2026
31c4176
Update base.py
lvliang-intel Mar 4, 2026
e7c606b
update code for comments
lvliang-intel Mar 4, 2026
1190935
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 4, 2026
1c0911e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2026
cce2dcd
fix ci issues
lvliang-intel Mar 4, 2026
72f8433
Merge branch 'main' into lvl/ram_usage_optimization
lvliang-intel Mar 4, 2026
876fa66
Merge branch 'main' of https://github.com/intel/auto-round into lvl/r…
lvliang-intel Mar 10, 2026
48 changes: 46 additions & 2 deletions auto_round/auto_scheme/delta_loss.py
@@ -15,7 +15,7 @@
import copy
from dataclasses import asdict
from functools import wraps
from typing import Iterable, Union
from typing import Iterable, Optional, Union

import torch
from accelerate import dispatch_model
@@ -55,6 +55,8 @@
set_non_auto_device_map,
to_device,
)
from auto_round.utils.device import MemoryMonitor
from auto_round.utils.offload import OffloadManager
from auto_round.wrapper import WrapperLinear

__all__ = ["gen_layer_config"]
@@ -488,6 +490,7 @@ def get_score_for_scheme(
major_device="cpu",
batch_size=1,
disable_opt_rtn=True,
offload_context: Optional[OffloadManager] = None,
):
scores_dict = {} # Key=name,Val=[quant_total_bits, loss]
for n, m in model.named_modules():
@@ -506,6 +509,8 @@
break

for name in quant_layer_names:
if offload_context is not None:
offload_context.ensure_loaded(model, name)
if name in fixed_layer_scheme.keys():
continue
m = get_module(model, name)
@@ -547,6 +552,8 @@
disable_opt_rtn=disable_opt_rtn,
)
set_module(model, name, new_m)
if offload_context is not None:
offload_context.flush_loaded(model)
if low_gpu_mem_usage:
dataloader = get_dataloader(tokenizer, seqlen, dataset_name=dataset, seed=42, bs=batch_size, nsamples=nsamples)

@@ -658,6 +665,21 @@ def _gen_layer_config(
major_device="cpu",
device_list=None,
):
# Initialize memory tracking for AutoScheme
memory_monitor = MemoryMonitor()
memory_monitor.reset()
memory_monitor.update_cpu()

# Create offload context for CPU RAM optimization
# Note: low_cpu_mem_usage only works when low_gpu_mem_usage is also enabled,
# because it requires layer-by-layer processing
offload_context = None
if auto_scheme.low_cpu_mem_usage and auto_scheme.low_gpu_mem_usage:
_model_dir = model_name
if _model_dir is None and hasattr(model, "config"):
_model_dir = getattr(model.config, "_name_or_path", None)
offload_context = OffloadManager(enabled=True, mode="clean", model_dir=_model_dir, cache_numel=True)

target_bits = auto_scheme.avg_bits
model.eval()
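The memory-tracking calls above (`reset()`, `update_cpu()`, `log_summary()`) only sample and report peak CPU usage; they do not change quantization behavior. A minimal stand-in sketch of that interface, assuming an RSS-based monitor (the real `auto_round.utils.device.MemoryMonitor` may measure differently):

```python
import psutil  # assumption: the real MemoryMonitor may not use psutil


class SimpleMemoryMonitor:
    """Illustrative stand-in: sample the process RSS at checkpoints and keep the peak."""

    def __init__(self):
        self.reset()

    def reset(self):
        self._proc = psutil.Process()
        self.peak_cpu_gb = 0.0

    def update_cpu(self):
        rss_gb = self._proc.memory_info().rss / 1024**3
        self.peak_cpu_gb = max(self.peak_cpu_gb, rss_gb)

    def log_summary(self, tag="memory summary"):
        print(f"{tag}: peak CPU RSS ~= {self.peak_cpu_gb:.2f} GB")
```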

@@ -762,6 +784,11 @@ def check_bf16_scheme(scheme):
cal_imatrix(model, dataloader)
logger.info("finish calculating imatrix")

# Register hooks and clear all block weights before the scheme loop.
# Hooks will transparently reload weights on demand during forward passes.
if offload_context is not None:
offload_context.add_offload_hooks(model, block_name)

pbar = tqdm(total=pbar_cnt, desc="Generating AutoScheme")
for index, scheme in enumerate(schemes):
apply_quant_scheme(
@@ -792,7 +819,11 @@
major_device=major_device,
batch_size=batch_size,
disable_opt_rtn=auto_scheme.disable_opt_rtn,
offload_context=offload_context,
)
# Track peak RAM after each scheme scoring
memory_monitor.update_cpu()

new_scores = {}
for share_layer in shared_layers:
param_bits = 0
@@ -817,10 +848,17 @@ def check_bf16_scheme(scheme):
options_scores.append(options_total_loss)
clear_memory(device_list=device_list)

# Remove hooks and restore original weights from disk for final bit-budget computations
if offload_context is not None:
offload_context.remove_offload_hooks(model, block_name)

total_params = 0
for n, m in model.named_modules():
if n in quant_layer_names + embedding_layers_names:
total_params += m.weight.numel()
n_param = m.weight.numel()
if n_param == 0 and hasattr(m, "_cached_weight_numel"):
n_param = m._cached_weight_numel
total_params += n_param

target_params_cnt = int(total_params * target_bits)
sorted_indices = sorted(range(len(options_scores)), key=lambda i: options_scores[i])
@@ -902,6 +940,12 @@ def check_bf16_scheme(scheme):
global last_grad_input
last_grad_input = None
clear_memory(device_list=device_list)

# Log AutoScheme memory usage
memory_monitor.update_cpu()
low_cpu_str = "enabled" if auto_scheme.low_cpu_mem_usage else "disabled"
memory_monitor.log_summary(f"AutoScheme complete (low_cpu_mem_usage={low_cpu_str})")

pbar.close()
return layer_config

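The offload context used in the hunks above (`ensure_loaded`, `flush_loaded`, `add_offload_hooks`, `remove_offload_hooks`) is what keeps only the layer currently being scored resident in RAM while the scheme loop runs. A simplified sketch of the idea, which stashes cleared weights in an in-process dict instead of reloading them from the checkpoint on disk as the real `OffloadManager` does:

```python
import torch
from torch import nn


class SimpleOffloadContext:
    """Illustrative stand-in for OffloadManager in 'clean' mode: block weights are
    replaced by empty tensors, and forward pre-hooks reload them on demand."""

    def __init__(self):
        self._stash = {}    # name -> weight; the real impl reloads from the model dir on disk
        self._handles = []

    def add_offload_hooks(self, model: nn.Module, block_names: list[str]) -> None:
        for block_name in block_names:
            for sub_name, module in model.get_submodule(block_name).named_modules():
                if getattr(module, "weight", None) is None:
                    continue
                full_name = f"{block_name}.{sub_name}" if sub_name else block_name
                # Cache numel so bit-budget math still works once the weight is cleared.
                module._cached_weight_numel = module.weight.numel()
                self._stash[full_name] = module.weight.data
                module.weight.data = module.weight.data.new_empty(0)
                self._handles.append(module.register_forward_pre_hook(self._reload_hook(full_name)))

    def _reload_hook(self, full_name: str):
        def hook(module, args):
            if module.weight.numel() == 0:  # cleared -> bring the weight back for this forward
                module.weight.data = self._stash[full_name]
        return hook

    def ensure_loaded(self, model: nn.Module, name: str) -> None:
        module = model.get_submodule(name)
        if name in self._stash and module.weight.numel() == 0:
            module.weight.data = self._stash[name]

    def flush_loaded(self, model: nn.Module) -> None:
        # Drop reloaded weights again so peak RAM stays bounded by one layer at a time.
        for name in self._stash:
            module = model.get_submodule(name)
            if module.weight.numel() != 0:
                module.weight.data = module.weight.data.new_empty(0)

    def remove_offload_hooks(self, model: nn.Module, block_names: list[str]) -> None:
        for handle in self._handles:
            handle.remove()
        self._handles.clear()
        for name, weight in self._stash.items():
            model.get_submodule(name).weight.data = weight
        self._stash.clear()
```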
1 change: 1 addition & 0 deletions auto_round/auto_scheme/gen_auto_scheme.py
@@ -40,6 +40,7 @@ class AutoScheme:
enable_torch_compile: Optional[bool] = None
disable_opt_rtn: bool = True
low_gpu_mem_usage: bool = True
low_cpu_mem_usage: bool = True

def __post_init__(self):
if isinstance(self.options, str):
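With the new field, callers can opt in or out of the CPU-RAM optimization during scheme search. A hypothetical construction (only the fields visible in this diff are assumed to exist; the option string and import path are illustrative), noting that per the comment in `_gen_layer_config`, `low_cpu_mem_usage` only takes effect when `low_gpu_mem_usage` is also enabled:

```python
from auto_round.auto_scheme.gen_auto_scheme import AutoScheme  # assumed import path

scheme = AutoScheme(
    options="W4A16",          # illustrative option string; real values depend on the library
    avg_bits=4.0,             # target average weight bits
    low_gpu_mem_usage=True,   # required for the CPU-RAM optimization to apply
    low_cpu_mem_usage=True,   # new flag from this PR: offload block weights during the search
)
```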
13 changes: 11 additions & 2 deletions auto_round/auto_scheme/utils.py
@@ -105,7 +105,10 @@ def compute_avg_bits_for_scheme(
# continue
if not hasattr(module, "weight"):
continue
total_params += module.weight.numel()
n_param = module.weight.numel()
if n_param == 0 and hasattr(module, "_cached_weight_numel"):
n_param = module._cached_weight_numel
total_params += n_param
layer_bits, _ = compute_layer_bits(module, ignore_scale_zp_bits)
total_quantized_bits += layer_bits
avg_bits = float(total_quantized_bits) / total_params
@@ -133,7 +136,10 @@ def compute_avg_bits_for_model(model: torch.nn.Module, ignore_scale_zp_bits: boo
continue
if not hasattr(module, "weight"):
continue
total_params += module.weight.numel()
n_param = module.weight.numel()
if n_param == 0 and hasattr(module, "_cached_weight_numel"):
n_param = module._cached_weight_numel
total_params += n_param
layer_bits, _ = compute_layer_bits(module, ignore_scale_zp_bits)
total_quantized_bits += layer_bits

@@ -157,6 +163,9 @@ def compute_layer_bits(
"""
weight = layer.weight
n_param = weight.numel()
# Use cached numel when weight has been cleared to an empty tensor (low_cpu_mem_usage offload)
if n_param == 0 and hasattr(layer, "_cached_weight_numel"):
n_param = layer._cached_weight_numel
weight_bits = getattr(layer, "bits", 16)
group_size = getattr(layer, "group_size", 128)
data_type = getattr(layer, "data_type", "int")
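The repeated `_cached_weight_numel` fallback exists because, with `low_cpu_mem_usage` offloading, a layer's weight may already have been replaced by an empty tensor by the time average bits are computed. A small self-contained sketch of the pattern (the helper names here are illustrative, not part of the library):

```python
import torch
from torch import nn


def clear_weight_keep_numel(module: nn.Module) -> None:
    # Mirror of what the offload path does: remember the element count, then free the weight.
    module._cached_weight_numel = module.weight.numel()
    module.weight.data = module.weight.data.new_empty(0)


def weight_numel(module: nn.Module) -> int:
    # Fallback used by the avg-bits helpers: prefer the live tensor, else the cached count.
    n = module.weight.numel()
    if n == 0 and hasattr(module, "_cached_weight_numel"):
        n = module._cached_weight_numel
    return n


layer = nn.Linear(4096, 4096, bias=False)
clear_weight_keep_numel(layer)
assert weight_numel(layer) == 4096 * 4096  # bit-budget math is unaffected by the cleared weight
```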
88 changes: 70 additions & 18 deletions auto_round/compressors/base.py
@@ -14,6 +14,7 @@

import copy
import os
import re
import sys
import time
import traceback
@@ -113,6 +114,7 @@
set_non_auto_device_map,
)
from auto_round.utils.distributed import setup_ddp_if_needed_
from auto_round.utils.offload import OffloadManager
from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block

SERIALIZATION_KEYS = (
@@ -371,6 +373,7 @@ def __init__(
self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
self.scale_dtype = convert_dtype_str2torch(scale_dtype)
self.low_cpu_mem_usage = low_cpu_mem_usage
self._offloader = OffloadManager(enabled=low_cpu_mem_usage, mode="offload", offload_dir_prefix="compressor")

if kwargs:
logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")
@@ -1259,6 +1262,10 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
m = get_module(self.model, name)
m.to("cpu")
shard_writer(self, m, name, False)
# Free RAM immediately: the data is now in the shard-writer buffer
# (and will be flushed to disk). Keeping it also in the model tree
# causes linear RAM growth for large models.
m.to("meta")

def _immediate_pack(self, name: str):
if not self.is_immediate_packing:
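The new `m.to("meta")` calls free CPU RAM as soon as a layer's tensors have been handed to the shard writer: parameters on the meta device keep their shape and dtype (so later bookkeeping still works) but hold no storage. A standalone illustration of that effect, independent of the shard writer:

```python
import torch
from torch import nn

layer = nn.Linear(8192, 8192, bias=False)              # ~256 MB of fp32 weight data
buffered = {"weight": layer.weight.detach().clone()}   # stand-in for the shard-writer buffer
layer.to("meta")                                        # releases the storage held by the model tree
print(layer.weight.is_meta, layer.weight.numel())       # True 67108864 -> shape metadata survives
```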
@@ -1331,8 +1338,16 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
for handle in hook_handles:
handle.remove()
else:
# By default, we go with layer-wise way if no replacement happened
# By default, we go with layer-wise way if no replacement happened.
# In RTN mode (iters == 0), force blockwise quantization to avoid
# full-model materialization and linear CPU RAM growth.
use_blockwise_quantization = global_state.replaced_module_count > 0
if self.iters == 0 and not use_blockwise_quantization:
logger.info(
"RTN mode detected (iters=0): force blockwise quantization to avoid "
"layer-wise full-model materialization."
)
use_blockwise_quantization = True
tied_weights_keys = getattr(self.model, "_tied_weights_keys", [])
if tied_weights_keys is None:
tied_weights_keys = []
@@ -1370,8 +1385,22 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
and self.is_immediate_saving
):
set_module(self.model, m.global_name, copy.deepcopy(m))
shard_writer(self, name=m.global_name)
if self.is_immediate_saving:
shard_writer(self, name=m.global_name)
copied_m = get_module(self.model, m.global_name)
copied_m.to("meta")
m.to("meta")
# Move remaining GPU tensors to CPU; offload to disk if low_cpu_mem_usage.
# This mirrors _quantize_via_rtn_blockwise's post-block cleanup.
if not self.is_immediate_saving:
mv_module_from_gpu(block)
else:
# Save once at block scope to capture tensors that are not saved
# in per-layer branch (e.g., custom module-level params/buffers).
shard_writer(self, name=block_name)
block.to("meta")
if self.low_cpu_mem_usage and not self.is_immediate_saving:
self._offloader.offload(self.model, block_name)
clear_memory(device_list=self.device_list)
memory_monitor.log_summary()
pbar.update(1)
@@ -1413,6 +1442,8 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:

# Convert remaining fp8
convert_module_to_hp_if_necessary(self.model, self.amp_dtype, self.device)
if self.low_cpu_mem_usage:
self._offloader.reload(self.model)
if self.is_immediate_saving:
shard_writer(self, is_finalize=True)

@@ -1531,9 +1562,9 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage)
all_to_quantized_module_names.remove(m.global_name)

if not self.is_immediate_saving:
# some modules may have been flushed and set to meta, so we could not move to gpu
mv_module_from_gpu(block)
mv_module_from_gpu(block)
if self.low_cpu_mem_usage and not self.is_immediate_saving:
self._offloader.offload(self.model, block_name)
if block_name == block_names[-1]:
clear_memory(input_ids, device_list=self.device_list)
else:
@@ -1644,14 +1675,18 @@ def _adjust_immediate_packing_and_saving(self):
self.is_immediate_saving = False

if self.low_cpu_mem_usage and self.is_immediate_packing:
if self.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0:
if formats[0].is_gguf():
logger.warning(
"`low_cpu_mem_usage` is not fully supported "
"when there are quantized layers outside blocks and optimized RTN is disabled. "
"`low_cpu_mem_usage` is not fully supported for gguf format. "
"Setting `low_cpu_mem_usage` to False."
)
self.low_cpu_mem_usage = False
self.is_immediate_saving = False
elif self.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0:
logger.info(
"Keeping `low_cpu_mem_usage` enabled in RTN mode (iters=0): "
"RTN path uses blockwise quantization and supports per-block offloading."
)
elif self.has_qlayer_outside_block and self.iters > 0:
logger.warning(
"`low_cpu_mem_usage` is not fully supported "
@@ -1660,12 +1695,6 @@ def _adjust_immediate_packing_and_saving(self):
)
self.low_cpu_mem_usage = False
self.is_immediate_saving = False
elif formats[0].is_gguf():
logger.warning(
"`low_cpu_mem_usage` is not fully supported for gguf format" "Setting `low_cpu_mem_usage `to False."
)
self.low_cpu_mem_usage = False
self.is_immediate_saving = False

if self.is_immediate_saving and "int" not in self.data_type:
logger.warning("immediate_saving is only supported for int quantization, set to False")
@@ -1700,6 +1729,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:

self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed)

if self.low_cpu_mem_usage:
self._offloader.reset()

def _should_disable_inplace_due_to_layers_outside_block() -> bool:
return self.has_qlayer_outside_block and (self.iters != 0 or (self.iters == 0 and not self.disable_opt_rtn))

@@ -1747,11 +1779,15 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool:
all_q_inputs = self.try_cache_inter_data_gpucpu(
all_first_block_names, self.nsamples, layer_names=layer_names
)
self.model = safe_device_move_with_meta_handling(self.model, "cpu")
clear_memory(device_list=self.device_list)
# Remove accelerate dispatch hooks before moving parameters.
# hf_device_map is kept for reference but hooks are no longer needed.
if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1:
accelerate.hooks.remove_hook_from_submodules(self.model) # self.model.hf_device_map has not been changed
accelerate.hooks.remove_hook_from_submodules(self.model)
self.model = mv_module_from_gpu(self.model)
clear_memory(device_list=self.device_list)
logger.info("caching done")
if self.low_cpu_mem_usage:
self._offloader.offload(self.model, all_blocks, clear_memory=True, device_list=self.device_list)
if len(all_blocks) > 1:
pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks))
else:
@@ -1797,6 +1833,9 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool:
if self.is_immediate_saving:
shard_writer(self, is_finalize=True)

if self.low_cpu_mem_usage:
self._offloader.reload(self.model)

end_time = time.time()
cost_time = end_time - start_time
logger.info(f"quantization tuning time {cost_time}")
@@ -2923,7 +2962,7 @@ def _quantize_block(
if hook_handles:
self._get_block_outputs(
block,
q_input,
q_input if q_input is not None else input_ids,
input_others,
self.batch_size * self.infer_bs_coeff,
device,
@@ -3201,6 +3240,12 @@ def _quantize_blocks(
modules = [get_module(model, n) for n in names]
m = WrapperMultiblock(modules)

if self.low_cpu_mem_usage:
if nblocks == 1:
self._offloader.reload(model, n)
else:
self._offloader.reload(model, names)

m.config = model.config if hasattr(model, "config") else None
q_input, input_ids = self._quantize_block(
m,
@@ -3219,6 +3264,13 @@

if self.is_immediate_saving:
shard_writer(self, m, is_finalize=False)

if self.low_cpu_mem_usage and not self.is_immediate_saving:
if nblocks == 1:
self._offloader.offload(model, n, overwrite=True)
else:
for name in names:
self._offloader.offload(model, name, overwrite=True)
if pbar is not None:
pbar.update(1)

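Putting the `_quantize_blocks` changes together: when `low_cpu_mem_usage` is enabled, each block is reloaded from disk just before it is tuned and offloaded again right after, so only one block's weights (plus cached activations) are resident in RAM at any time. A compressed sketch of that loop shape, using the `reload`/`offload` signatures visible in this diff (the wrapper function itself is illustrative, not part of the library):

```python
def quantize_blocks_low_cpu_mem(offloader, model, block_names, quantize_block,
                                low_cpu_mem_usage=True, immediate_saving=False):
    """Illustrative loop shape: reload -> quantize -> offload, one block at a time."""
    for block_name in block_names:
        if low_cpu_mem_usage:
            offloader.reload(model, block_name)        # bring this block's weights back from disk
        block = model.get_submodule(block_name)
        quantize_block(block)                           # stand-in for the per-block tuning step
        if low_cpu_mem_usage and not immediate_saving:
            # persist the now-quantized weights and drop them from RAM again
            offloader.offload(model, block_name, overwrite=True)
```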