Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
c7a952d
Add files via upload
accemlcc Dec 1, 2025
5e65ab1
perf: optimize abliteration matrix op (#46)
accemlcc Dec 2, 2025
3cacbae
Fix line endings to LF
accemlcc Dec 2, 2025
8f1fafd
Resolve merge conflict by keeping LoRA implementation
accemlcc Dec 2, 2025
cc06d27
Add hybrid approach for GPT-OSS compatibility
accemlcc Dec 2, 2025
c6bc76d
Merge upstream/master - keep hybrid approach
accemlcc Dec 2, 2025
5657491
Fix projector bug, update print statement, revert README
accemlcc Dec 3, 2025
86b8852
Revert README changes to match upstream
accemlcc Dec 3, 2025
529a91a
Fix import sorting for ruff
accemlcc Dec 3, 2025
bdd3f57
Fix reload_model for evaluate_model, add type hints and validation
accemlcc Dec 3, 2025
1e0b38b
Apply ruff formatting
accemlcc Dec 3, 2025
e8c9bd0
Replace load_in_4bit with quantization enum
accemlcc Dec 3, 2025
018d926
Fix precision loss: use FP32 refusal direction directly
accemlcc Dec 3, 2025
e4b8eb5
Move r assignment into non-LoRA path
accemlcc Dec 3, 2025
b87e28c
Fix linting: apply ruff formatting
accemlcc Dec 3, 2025
4e4f741
Add auto-merge for LoRA adapters on save/upload
accemlcc Dec 3, 2025
1e3e50e
Fix linting: apply ruff formatting
accemlcc Dec 3, 2025
907d22d
Implement CPU-based merge for 4-bit models with OOM fallback
accemlcc Dec 3, 2025
055ebb2
Remove use_lora flag (LoRA always on), add user prompt for 4-bit export
accemlcc Dec 3, 2025
a0638a9
Fix: PEFT target_modules expects module names without path prefix
accemlcc Dec 3, 2025
63b9071
Fix linting: apply ruff formatting
accemlcc Dec 3, 2025
1a37f3d
Add LoRA fallback and fix quantization_config handling
accemlcc Dec 3, 2025
9d4717e
Add 8-bit quantization support via bitsandbytes
accemlcc Dec 3, 2025
5f65132
Improve LoRA merge warning and fix linting
accemlcc Dec 4, 2025
7531367
Apply final ruff formatting
accemlcc Dec 4, 2025
2ca6943
Fix CI: apply ruff import sorting
accemlcc Dec 4, 2025
6ce0569
Use tiny model for CI efficiency
accemlcc Dec 4, 2025
8aa25a0
Fix import sorting in test_lora.py
accemlcc Dec 4, 2025
595239f
Fix formatting in test_lora.py
accemlcc Dec 4, 2025
e49111a
Merge upstream changes (analyzer + deps)
accemlcc Dec 4, 2025
7eed4ff
feat: Show merge warning for all models (requires high RAM)
accemlcc Dec 5, 2025
7906410
style: Apply ruff fixes
accemlcc Dec 5, 2025
9e3ddb4
Merge remote-tracking branch 'origin/master' into pr-60
Dec 9, 2025
92873a4
Fix undefined Style import in main.py
Dec 9, 2025
6282ee2
Fix(model): Support MoE/3D tensors and enforce dtype safety in ablite…
Dec 9, 2025
c6603df
Fix(ci): Format model.py with ruff
Dec 9, 2025
dac116f
Fix(main): Remove invalid style argument from prompt_select and unuse…
Dec 10, 2025
7b4ca56
Fix logic errors, memory leak, and redundant merges in main.py
accemlcc Dec 11, 2025
9e38d45
Fix linting and formatting issues (isort, ruff)
accemlcc Dec 11, 2025
b4adb23
chore: Simplify .gitattributes as requested
accemlcc Dec 11, 2025
deaf613
refactor: Remove defensive try-except around LoRA initialization
accemlcc Dec 11, 2025
cfead9e
chore: Update uv.lock with peft and bitsandbytes
accemlcc Dec 11, 2025
e376a97
chore: Regenerate uv.lock to include missing peft dependency
accemlcc Dec 11, 2025
1b3ea78
style: Fix import sorting (isort) for CI compliance
accemlcc Dec 11, 2025
a86240e
style: Simplify .gitattributes to single line as requested
accemlcc Dec 11, 2025
df6036b
Address PR #60 feedback: Remove caching, fix LoRA reload, global LoRA…
accemlcc Dec 11, 2025
b39c165
Merge master into pr-60 (resolve conflicts)
accemlcc Dec 11, 2025
5e399f4
Address PR review comments: clarify code, fix quantization, rename me…
accemlcc Dec 12, 2025
c689af7
Restore gc.collect() before empty_cache() for large models
accemlcc Dec 12, 2025
6a29f17
refactor: Remove LoRA fallback remnants, simplify code
accemlcc Dec 12, 2025
34b34fc
Address p-e-w review feedback: rename reset_model, remove loaded_mode…
accemlcc Dec 14, 2025
da6aa82
Restore skip logic for non-LoRA modules and fix 4-bit base_layer.weig…
accemlcc Dec 14, 2025
dc3178f
Remove defensive lora_A check per review - get_layer_modules already …
accemlcc Dec 14, 2025
2493c77
Fix try_add: nest component init inside Module check, add assert for …
accemlcc Dec 14, 2025
e0f8324
Add note about module.weight assumption for type checking
accemlcc Dec 14, 2025
6bd2446
Change 'Reloading model' to 'Resetting model' in logging
accemlcc Dec 14, 2025
f685d14
Merge upstream master, resolve conflict: keep parameter printing with…
accemlcc Dec 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* text eol=lf
4 changes: 4 additions & 0 deletions config.default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ dtypes = [
# Device map to pass to Accelerate when loading the model.
device_map = "auto"

# Quantization method to use when loading the model.
# Options: "none" (no quantization), "bnb_4bit" (4-bit quantization using bitsandbytes).
quantization = "none"

# Memory limits to impose. 0 is usually your first graphics card.
# max_memory = {0 = "16GB", "cpu" = "64GB"}

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ classifiers = [
]
dependencies = [
"accelerate>=1.10.0",
"bitsandbytes>=0.45.0",
"datasets>=4.0.0",
"hf-transfer>=0.1.9",
"huggingface-hub>=0.34.4",
"optuna>=4.5.0",
"peft>=0.14.0",
"pydantic-settings>=2.10.1",
"questionary>=2.1.1",
"rich>=14.1.0",
Expand Down
11 changes: 11 additions & 0 deletions src/heretic/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>

from enum import Enum
from typing import Dict

from pydantic import BaseModel, Field
Expand All @@ -12,6 +13,11 @@
)


class QuantizationMethod(str, Enum):
    """Quantization methods supported when loading the model.

    Inherits from ``str`` so values compare equal to their string form
    (e.g. the "none"/"bnb_4bit" literals used in config.default.toml).
    """

    # Load weights in full/default precision (no quantization).
    NONE = "none"
    # 4-bit quantization via the bitsandbytes library.
    BNB_4BIT = "bnb_4bit"


class DatasetSpecification(BaseModel):
dataset: str = Field(
description="Hugging Face dataset ID, or path to dataset on disk."
Expand Down Expand Up @@ -71,6 +77,11 @@ class Settings(BaseSettings):
description="Whether to trust remote code when loading the model.",
)

quantization: QuantizationMethod = Field(
default=QuantizationMethod.NONE,
description="Quantization method to use when loading the model. Options: 'none' (no quantization), 'bnb_4bit' (4-bit quantization using bitsandbytes).",
)

batch_size: int = Field(
default=0, # auto
description="Number of input sequences to process in parallel (0 = auto).",
Expand Down
122 changes: 109 additions & 13 deletions src/heretic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@
from pydantic import ValidationError
from questionary import Choice
from rich.traceback import install
from transformers import AutoModelForCausalLM

from .analyzer import Analyzer
from .config import Settings
from .config import QuantizationMethod, Settings
from .evaluator import Evaluator
from .model import AbliterationParameters, Model
from .utils import (
Expand All @@ -50,6 +51,73 @@
)


def obtain_merge_strategy(settings: Settings) -> str | None:
    """
    Ask the user how to save/upload the model when the base was loaded quantized.

    For 4-bit (bitsandbytes) loads, merging LoRA adapters requires reloading the
    base model in full precision on the CPU, which can exhaust system RAM. This
    function warns the user, tries to estimate the RAM needed, and prompts for a
    decision. Non-quantized models are merged without prompting.

    Returns:
        "merge"   -- merge adapters into the full model (always returned for
                     non-quantized models, without prompting).
        "adapter" -- save only the LoRA adapter.
        None      -- the user cancelled the prompt.
    """
    if settings.quantization == QuantizationMethod.BNB_4BIT:
        # Quantized models need special handling - we must reload the base model
        # in full precision to merge the LoRA adapters.
        print()
        print(
            "[yellow]Model was loaded with quantization. Merging requires reloading the base model.[/]"
        )
        print(
            "[red](!) WARNING: CPU Merging requires dequantizing the entire model to System RAM.[/]"
        )
        print("[red] This can lead to SYSTEM FREEZES if you run out of memory.[/]")
        print(
            "[yellow] Rule of thumb: You need approx. 3x the parameter count in GB.[/]"
        )

        try:
            # Estimate memory requirements by loading the model structure on the
            # "meta" device: this allocates no real weight memory but lets us
            # inspect the parameter count/dtype via get_memory_footprint().
            #
            # Suppress warnings during meta device loading (e.g. "Some weights
            # were not initialized"). These are expected and harmless since we
            # only inspect model structure and never run inference.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                meta_model = AutoModelForCausalLM.from_pretrained(
                    settings.model,
                    device_map="meta",
                    torch_dtype=torch.bfloat16,
                    # NOTE(review): hardcoded True here; the Settings model
                    # exposes a trust-remote-code option -- confirm whether this
                    # should honor it instead of always trusting remote code.
                    trust_remote_code=True,
                )
            footprint_bytes = meta_model.get_memory_footprint()
            footprint_gb = footprint_bytes / (1024**3)
            print(
                f"[yellow] Estimated net RAM required for model weights (excluding overhead): [bold]~{footprint_gb:.1f} GB[/][/]"
            )
        except Exception:
            # Fallback if meta loading fails (e.g. owing to custom model code
            # or `bitsandbytes` quantization config issues on the meta device).
            print(
                "[yellow] Example: A 27B model requires ~80GB RAM. A 70B model requires ~200GB RAM.[/]"
            )
        print()

        # prompt_select may return None if the user cancels (e.g. Ctrl+C).
        merge_choice = prompt_select(
            "How do you want to proceed?",
            choices=[
                Choice(
                    title="Merge full model (reload base model on CPU - requires high RAM)",
                    value="merge",
                ),
                Choice(
                    title="Save LoRA adapter only (can be merged later with llama.cpp or more RAM)",
                    value="adapter",
                ),
            ],
        )
        return merge_choice

    # Default for non-quantized models: merging is safe, no prompt needed.
    return "merge"


def run():
# Enable expandable segments to reduce memory fragmentation on multi-GPU setups.
if (
Expand Down Expand Up @@ -220,7 +288,7 @@ def run():
print()
print(f"Loading model [bold]{settings.evaluate_model}[/]...")
settings.model = settings.evaluate_model
model.reload_model()
model.reset_model()
print("* Evaluating...")
evaluator.get_score()
return
Expand Down Expand Up @@ -330,8 +398,8 @@ def objective(trial: Trial) -> tuple[float, float]:
print("* Parameters:")
for name, value in get_trial_parameters(trial).items():
print(f" * {name} = [bold]{value}[/]")
print("* Reloading model...")
model.reload_model()
print("* Resetting model...")
model.reset_model()
print("* Abliterating...")
model.abliterate(refusal_directions, direction_index, parameters)
print("* Evaluating...")
Expand Down Expand Up @@ -446,8 +514,8 @@ def objective_wrapper(trial: Trial) -> tuple[float, float]:
print("* Parameters:")
for name, value in get_trial_parameters(trial).items():
print(f" * {name} = [bold]{value}[/]")
print("* Reloading model...")
model.reload_model()
print("* Resetting model...")
model.reset_model()
print("* Abliterating...")
model.abliterate(
refusal_directions,
Expand Down Expand Up @@ -481,7 +549,19 @@ def objective_wrapper(trial: Trial) -> tuple[float, float]:
continue

print("Saving model...")
model.model.save_pretrained(save_directory)
strategy = obtain_merge_strategy(settings)
if strategy is None:
print("[yellow]Action cancelled.[/]")
continue

if strategy == "adapter":
model.model.save_pretrained(save_directory)
else:
merged_model = model.get_merged_model()
merged_model.save_pretrained(save_directory)
del merged_model
empty_cache()

model.tokenizer.save_pretrained(save_directory)
print(f"Model saved to [bold]{save_directory}[/].")

Expand Down Expand Up @@ -517,13 +597,29 @@ def objective_wrapper(trial: Trial) -> tuple[float, float]:
)
private = visibility == "Private"

print("Uploading model...")
strategy = obtain_merge_strategy(settings)
if strategy is None:
print("[yellow]Action cancelled.[/]")
Comment thread
accemlcc marked this conversation as resolved.
continue

if strategy == "adapter":
print("Uploading LoRA adapter...")
model.model.push_to_hub(
repo_id,
private=private,
token=token,
)
else:
print("Uploading merged model...")
merged_model = model.get_merged_model()
merged_model.push_to_hub(
repo_id,
private=private,
token=token,
)
del merged_model
empty_cache()
Comment thread
accemlcc marked this conversation as resolved.

model.model.push_to_hub(
repo_id,
private=private,
token=token,
)
model.tokenizer.push_to_hub(
repo_id,
private=private,
Expand Down
Loading