Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
c7a952d
Add files via upload
accemlcc Dec 1, 2025
5e65ab1
perf: optimize abliteration matrix op (#46)
accemlcc Dec 2, 2025
3cacbae
Fix line endings to LF
accemlcc Dec 2, 2025
8f1fafd
Resolve merge conflict by keeping LoRA implementation
accemlcc Dec 2, 2025
cc06d27
Add hybrid approach for GPT-OSS compatibility
accemlcc Dec 2, 2025
c6bc76d
Merge upstream/master - keep hybrid approach
accemlcc Dec 2, 2025
5657491
Fix projector bug, update print statement, revert README
accemlcc Dec 3, 2025
86b8852
Revert README changes to match upstream
accemlcc Dec 3, 2025
529a91a
Fix import sorting for ruff
accemlcc Dec 3, 2025
bdd3f57
Fix reload_model for evaluate_model, add type hints and validation
accemlcc Dec 3, 2025
1e0b38b
Apply ruff formatting
accemlcc Dec 3, 2025
e8c9bd0
Replace load_in_4bit with quantization enum
accemlcc Dec 3, 2025
018d926
Fix precision loss: use FP32 refusal direction directly
accemlcc Dec 3, 2025
e4b8eb5
Move r assignment into non-LoRA path
accemlcc Dec 3, 2025
b87e28c
Fix linting: apply ruff formatting
accemlcc Dec 3, 2025
4e4f741
Add auto-merge for LoRA adapters on save/upload
accemlcc Dec 3, 2025
1e3e50e
Fix linting: apply ruff formatting
accemlcc Dec 3, 2025
907d22d
Implement CPU-based merge for 4-bit models with OOM fallback
accemlcc Dec 3, 2025
055ebb2
Remove use_lora flag (LoRA always on), add user prompt for 4-bit export
accemlcc Dec 3, 2025
a0638a9
Fix: PEFT target_modules expects module names without path prefix
accemlcc Dec 3, 2025
63b9071
Fix linting: apply ruff formatting
accemlcc Dec 3, 2025
1a37f3d
Add LoRA fallback and fix quantization_config handling
accemlcc Dec 3, 2025
9d4717e
Add 8-bit quantization support via bitsandbytes
accemlcc Dec 3, 2025
5f65132
Improve LoRA merge warning and fix linting
accemlcc Dec 4, 2025
7531367
Apply final ruff formatting
accemlcc Dec 4, 2025
2ca6943
Fix CI: apply ruff import sorting
accemlcc Dec 4, 2025
6ce0569
Use tiny model for CI efficiency
accemlcc Dec 4, 2025
8aa25a0
Fix import sorting in test_lora.py
accemlcc Dec 4, 2025
595239f
Fix formatting in test_lora.py
accemlcc Dec 4, 2025
e49111a
Merge upstream changes (analyzer + deps)
accemlcc Dec 4, 2025
7eed4ff
feat: Show merge warning for all models (requires high RAM)
accemlcc Dec 5, 2025
7906410
style: Apply ruff fixes
accemlcc Dec 5, 2025
9e3ddb4
Merge remote-tracking branch 'origin/master' into pr-60
Dec 9, 2025
92873a4
Fix undefined Style import in main.py
Dec 9, 2025
6282ee2
Fix(model): Support MoE/3D tensors and enforce dtype safety in ablite…
Dec 9, 2025
c6603df
Fix(ci): Format model.py with ruff
Dec 9, 2025
dac116f
Fix(main): Remove invalid style argument from prompt_select and unuse…
Dec 10, 2025
7b4ca56
Fix logic errors, memory leak, and redundant merges in main.py
accemlcc Dec 11, 2025
9e38d45
Fix linting and formatting issues (isort, ruff)
accemlcc Dec 11, 2025
b4adb23
chore: Simplify .gitattributes as requested
accemlcc Dec 11, 2025
deaf613
refactor: Remove defensive try-except around LoRA initialization
accemlcc Dec 11, 2025
cfead9e
chore: Update uv.lock with peft and bitsandbytes
accemlcc Dec 11, 2025
e376a97
chore: Regenerate uv.lock to include missing peft dependency
accemlcc Dec 11, 2025
1b3ea78
style: Fix import sorting (isort) for CI compliance
accemlcc Dec 11, 2025
a86240e
style: Simplify .gitattributes to single line as requested
accemlcc Dec 11, 2025
df6036b
Address PR #60 feedback: Remove caching, fix LoRA reload, global LoRA…
accemlcc Dec 11, 2025
b39c165
Merge master into pr-60 (resolve conflicts)
accemlcc Dec 11, 2025
5e399f4
Address PR review comments: clarify code, fix quantization, rename me…
accemlcc Dec 12, 2025
c689af7
Restore gc.collect() before empty_cache() for large models
accemlcc Dec 12, 2025
6a29f17
refactor: Remove LoRA fallback remnants, simplify code
accemlcc Dec 12, 2025
34b34fc
Address p-e-w review feedback: rename reset_model, remove loaded_mode…
accemlcc Dec 14, 2025
da6aa82
Restore skip logic for non-LoRA modules and fix 4-bit base_layer.weig…
accemlcc Dec 14, 2025
dc3178f
Remove defensive lora_A check per review - get_layer_modules already …
accemlcc Dec 14, 2025
2493c77
Fix try_add: nest component init inside Module check, add assert for …
accemlcc Dec 14, 2025
e0f8324
Add note about module.weight assumption for type checking
accemlcc Dec 14, 2025
6bd2446
Change 'Reloading model' to 'Resetting model' in logging
accemlcc Dec 14, 2025
f685d14
Merge upstream master, resolve conflict: keep parameter printing with…
accemlcc Dec 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* text eol=lf
4 changes: 4 additions & 0 deletions config.default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ dtypes = [
# Device map to pass to Accelerate when loading the model.
device_map = "auto"

# Quantization method to use when loading the model.
# Options: "none" (no quantization), "bnb_4bit" (4-bit quantization using bitsandbytes).
quantization = "none"

# Memory limits to impose. 0 is usually your first graphics card.
# max_memory = {0 = "16GB", "cpu" = "64GB"}

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ classifiers = [
]
dependencies = [
"accelerate>=1.10.0",
"bitsandbytes>=0.45.0",
"datasets>=4.0.0",
"hf-transfer>=0.1.9",
"huggingface-hub>=0.34.4",
"optuna>=4.5.0",
"peft>=0.14.0",
"pydantic-settings>=2.10.1",
"questionary>=2.1.1",
"rich>=14.1.0",
Expand Down
11 changes: 11 additions & 0 deletions src/heretic/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>

from enum import Enum
from typing import Dict

from pydantic import BaseModel, Field
Expand All @@ -12,6 +13,11 @@
)


class QuantizationMethod(str, Enum):
    """Quantization methods supported when loading the model.

    Inherits from ``str`` so values compare equal to their string form
    (e.g. the "none"/"bnb_4bit" literals used in config.default.toml).
    """

    # Load weights in full/default precision (no quantization).
    NONE = "none"
    # 4-bit quantization via the bitsandbytes library.
    BNB_4BIT = "bnb_4bit"


class DatasetSpecification(BaseModel):
dataset: str = Field(
description="Hugging Face dataset ID, or path to dataset on disk."
Expand Down Expand Up @@ -71,6 +77,11 @@ class Settings(BaseSettings):
description="Whether to trust remote code when loading the model.",
)

quantization: QuantizationMethod = Field(
default=QuantizationMethod.NONE,
description="Quantization method to use when loading the model. Options: 'none' (no quantization), 'bnb_4bit' (4-bit quantization using bitsandbytes).",
)

batch_size: int = Field(
default=0, # auto
description="Number of input sequences to process in parallel (0 = auto).",
Expand Down
122 changes: 109 additions & 13 deletions src/heretic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@
from pydantic import ValidationError
from questionary import Choice
from rich.traceback import install
from transformers import AutoModelForCausalLM

from .analyzer import Analyzer
from .config import Settings
from .config import QuantizationMethod, Settings
from .evaluator import Evaluator
from .model import AbliterationParameters, Model
from .utils import (
Expand All @@ -50,6 +51,73 @@
)


def obtain_merge_strategy(settings: Settings) -> str | None:
    """
    Ask the user how to save/upload the model when the base was loaded quantized.

    For 4-bit (bitsandbytes) loads, merging LoRA adapters requires reloading the
    base model in full precision on the CPU, which can exhaust system RAM. This
    function warns the user, tries to estimate the RAM needed, and prompts for a
    decision. Non-quantized models are merged without prompting.

    Returns:
        "merge"   -- merge adapters into the full model (always returned for
                     non-quantized models, without prompting).
        "adapter" -- save only the LoRA adapter.
        None      -- the user cancelled the prompt.
    """
    if settings.quantization == QuantizationMethod.BNB_4BIT:
        # Quantized models need special handling - we must reload the base model
        # in full precision to merge the LoRA adapters.
        print()
        print(
            "[yellow]Model was loaded with quantization. Merging requires reloading the base model.[/]"
        )
        print(
            "[red](!) WARNING: CPU Merging requires dequantizing the entire model to System RAM.[/]"
        )
        print("[red] This can lead to SYSTEM FREEZES if you run out of memory.[/]")
        print(
            "[yellow] Rule of thumb: You need approx. 3x the parameter count in GB.[/]"
        )

        try:
            # Estimate memory requirements by loading the model structure on the
            # "meta" device: this allocates no real weight memory but lets us
            # inspect the parameter count/dtype via get_memory_footprint().
            #
            # Suppress warnings during meta device loading (e.g. "Some weights
            # were not initialized"). These are expected and harmless since we
            # only inspect model structure and never run inference.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                meta_model = AutoModelForCausalLM.from_pretrained(
                    settings.model,
                    device_map="meta",
                    torch_dtype=torch.bfloat16,
                    # NOTE(review): hardcoded True here; the Settings model
                    # exposes a trust-remote-code option -- confirm whether this
                    # should honor it instead of always trusting remote code.
                    trust_remote_code=True,
                )
            footprint_bytes = meta_model.get_memory_footprint()
            footprint_gb = footprint_bytes / (1024**3)
            print(
                f"[yellow] Estimated net RAM required for model weights (excluding overhead): [bold]~{footprint_gb:.1f} GB[/][/]"
            )
        except Exception:
            # Fallback if meta loading fails (e.g. owing to custom model code
            # or `bitsandbytes` quantization config issues on the meta device).
            print(
                "[yellow] Example: A 27B model requires ~80GB RAM. A 70B model requires ~200GB RAM.[/]"
            )
        print()

        # prompt_select may return None if the user cancels (e.g. Ctrl+C).
        merge_choice = prompt_select(
            "How do you want to proceed?",
            choices=[
                Choice(
                    title="Merge full model (reload base model on CPU - requires high RAM)",
                    value="merge",
                ),
                Choice(
                    title="Save LoRA adapter only (can be merged later with llama.cpp or more RAM)",
                    value="adapter",
                ),
            ],
        )
        return merge_choice

    # Default for non-quantized models: merging is safe, no prompt needed.
    return "merge"


def run():
# Enable expandable segments to reduce memory fragmentation on multi-GPU setups.
if (
Expand Down Expand Up @@ -220,7 +288,7 @@ def run():
print()
print(f"Loading model [bold]{settings.evaluate_model}[/]...")
settings.model = settings.evaluate_model
model.reload_model()
model.reset_model()
print("* Evaluating...")
evaluator.get_score()
return
Expand Down Expand Up @@ -330,8 +398,8 @@ def objective(trial: Trial) -> tuple[float, float]:
print("* Parameters:")
for name, value in get_trial_parameters(trial).items():
print(f" * {name} = [bold]{value}[/]")
print("* Reloading model...")
model.reload_model()
print("* Resetting model...")
model.reset_model()
print("* Abliterating...")
model.abliterate(refusal_directions, direction_index, parameters)
print("* Evaluating...")
Expand Down Expand Up @@ -446,8 +514,8 @@ def objective_wrapper(trial: Trial) -> tuple[float, float]:
print("* Parameters:")
for name, value in get_trial_parameters(trial).items():
print(f" * {name} = [bold]{value}[/]")
print("* Reloading model...")
model.reload_model()
print("* Resetting model...")
model.reset_model()
print("* Abliterating...")
model.abliterate(
refusal_directions,
Expand Down Expand Up @@ -481,7 +549,19 @@ def objective_wrapper(trial: Trial) -> tuple[float, float]:
continue

print("Saving model...")
model.model.save_pretrained(save_directory)
strategy = obtain_merge_strategy(settings)
if strategy is None:
print("[yellow]Action cancelled.[/]")
continue

if strategy == "adapter":
model.model.save_pretrained(save_directory)
else:
merged_model = model.get_merged_model()
merged_model.save_pretrained(save_directory)
del merged_model
empty_cache()

model.tokenizer.save_pretrained(save_directory)
print(f"Model saved to [bold]{save_directory}[/].")

Expand Down Expand Up @@ -517,13 +597,29 @@ def objective_wrapper(trial: Trial) -> tuple[float, float]:
)
private = visibility == "Private"

print("Uploading model...")
strategy = obtain_merge_strategy(settings)
if strategy is None:
print("[yellow]Action cancelled.[/]")
Comment thread
accemlcc marked this conversation as resolved.
continue

if strategy == "adapter":
print("Uploading LoRA adapter...")
model.model.push_to_hub(
repo_id,
private=private,
token=token,
)
else:
print("Uploading merged model...")
merged_model = model.get_merged_model()
merged_model.push_to_hub(
repo_id,
private=private,
token=token,
)
del merged_model
empty_cache()
Comment thread
accemlcc marked this conversation as resolved.

model.model.push_to_hub(
repo_id,
private=private,
token=token,
)
model.tokenizer.push_to_hub(
repo_id,
private=private,
Expand Down
Loading