12 changes: 12 additions & 0 deletions configs/_model/llama/1B.yaml
@@ -0,0 +1,12 @@
defaults:
  - base_model
  - _self_

common:
  dmodel: 2048
  dff: 8192
  dhead: 64
  sequence_length: 2048
  n_blocks: 16
  q_heads: 32
  kv_heads: 8
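Note: the width, depth, and head counts above match the published Llama-3.2-1B architecture (hidden size 2048, intermediate size 8192, 16 layers, 32 query heads, 8 KV heads); `sequence_length` is the training context length here, not the model's maximum. A quick sanity check of the grouped-query-attention arithmetic, as a sketch using the values from the YAML above:

```python
# GQA sanity check for the 1B config above.
dmodel, dhead, q_heads, kv_heads = 2048, 64, 32, 8

assert q_heads * dhead == dmodel   # query projection spans the full model dim
assert q_heads % kv_heads == 0     # each KV head is shared by a whole group of query heads
print(f"GQA group size: {q_heads // kv_heads}")  # -> 4 query heads per KV head
```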
27 changes: 27 additions & 0 deletions configs/pc_project/hf_to_nano_conversion/1B.yaml
@@ -0,0 +1,27 @@
defaults:
  - ../../_misc@_here_: default
  - ../../_cluster@_here_: helios
  - ../../_model/llama@_here_: 1B
  - _self_


trainer:
  checkpoint:
    load:
      type: huggingface
      path: "meta-llama/Llama-3.2-1B"

    save:
      path: ??? # CHANGE ME

infrastructure:
  metric_logger:
    name: PC_save_pretrained_hf_to_nano
    tags:
      - nano
      - pc
      - save_pretrained

init_model_opt_sched_fn:
  _target_: src.core.llama.save_pretrained_llama_as_nano
  _partial_: true
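Because `_partial_: true` is set, Hydra's `instantiate` returns a `functools.partial` wrapping `save_pretrained_llama_as_nano` instead of calling it, so the trainer can supply the composed config (and logger) at call time. A minimal sketch of how this entry point is presumably consumed; `metric_logger` below stands in for whatever logger the infrastructure provides:

```python
from hydra.utils import instantiate

# init_fn is functools.partial(save_pretrained_llama_as_nano).
init_fn = instantiate(cfg.init_model_opt_sched_fn)
model, optimizer, scheduler, *rest = init_fn(cfg, metric_logger=metric_logger)
# For this conversion job, every element of the returned tuple is None.
```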
23 changes: 23 additions & 0 deletions src/core/llama.py
@@ -1,8 +1,10 @@
from collections import OrderedDict
import re
import math
from omegaconf import OmegaConf
import torch.nn as nn
import torch
from hydra.utils import instantiate

from .model import AttentionMechanism, Linear
from transformers import AutoModelForCausalLM
@@ -237,3 +239,24 @@ def copy_llama_model_weights_from_HF(model: nn.Module, path: str):
    remapped_state_dict = remap_llamahf_state_dict_to_nano(llama_state_dict)

    model.load_state_dict(remapped_state_dict)


def save_pretrained_llama_as_nano(cfg: OmegaConf, metric_logger=None):
    # Build the nano model on the meta device so that no real memory is
    # allocated for weights that are about to be overwritten.
    with torch.device("meta"):
        model = instantiate(cfg.model)

    # Load the HF checkpoint and rename its keys to the nano layout.
    hf_model = AutoModelForCausalLM.from_pretrained(cfg.trainer.checkpoint.load.path)
    nano_sd = remap_llamahf_state_dict_to_nano(hf_model.state_dict())

    # assign=True swaps the meta tensors for the real HF tensors;
    # strict=False tolerates entries the remapping does not cover.
    model.load_state_dict(nano_sd, strict=False, assign=True)

    torch.save(model.state_dict(), cfg.trainer.checkpoint.save.path)

    # Return the (model, optimizer, scheduler, ...) tuple shape expected of an
    # init_model_opt_sched_fn; this job only converts and saves a checkpoint.
    return None, None, None, None, None
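For reference, a hedged sketch of the round trip: loading the saved nano checkpoint back into a freshly instantiated model. This mirrors the meta-device pattern used above, assuming `cfg` is the same composed config and the save path has replaced the `???` placeholder:

```python
import torch
from hydra.utils import instantiate

# Rebuild the nano architecture without allocating memory, then
# materialize it from the converted checkpoint.
with torch.device("meta"):
    model = instantiate(cfg.model)

state_dict = torch.load(cfg.trainer.checkpoint.save.path, map_location="cpu")
model.load_state_dict(state_dict, assign=True)  # assign=True replaces the meta tensors
```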