diff --git a/.gitignore b/.gitignore
index b856243..952f2f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ bun.lock
e2e-results/
test-results/
playwright-report/
+__pycache__/
+*.pyc
+*.pyo
diff --git a/PARAMETER_GOLF.md b/PARAMETER_GOLF.md
new file mode 100644
index 0000000..d4d3d77
--- /dev/null
+++ b/PARAMETER_GOLF.md
@@ -0,0 +1,873 @@
+# Parameter Golf: A Q²-Based Strategy
+
+> **Related documents:** [DESIGN.md](DESIGN.md) · [RELATED_WORK.md](RELATED_WORK.md)
+
+Section references of the form §D-x.y refer to [DESIGN.md](DESIGN.md).
+Section references of the form §R-x refer to [RELATED_WORK.md](RELATED_WORK.md).
+
+---
+
+## Contents
+
+1. [The Challenge](#1-the-challenge)
+2. [Current State of the Art](#2-current-state-of-the-art)
+3. [The Q² Compression Advantage](#3-the-q-compression-advantage)
+4. [Architecture: Liquid Time Constant Networks](#4-architecture-liquid-time-constant-networks)
+ - 4.5 [Geode-derived layer layout](#45-geode-derived-layer-layout)
+5. [The Combined Strategy](#5-the-combined-strategy)
+ - 5.5 [LIV cache-line packing and byte tokenization](#55-liv-cache-line-packing-and-byte-tokenization)
+6. [Implementation Roadmap](#6-implementation-roadmap)
+7. [Performance Projections](#7-performance-projections)
+ - 7.5 [Williams SpaceTime bound and optimal bit width](#75-williams-spacetime-bound-and-optimal-bit-width)
+8. [References](#references)
+
+---
+
+## 1 The Challenge
+
+OpenAI's **Parameter Golf** challenge (March–April 2026) asks participants to train
+the language model that achieves the lowest bits-per-byte (bpb) on the FineWeb
+validation set, subject to:
+
+1. **Artifact size:** total compressed artifact (code + compressed model weights) ≤
+ 16,000,000 bytes (decimal 16 MB).
+2. **Training time:** ≤ 10 minutes on 8×H100 SXM GPUs.
+3. **Evaluation:** tokenizer-agnostic bpb on the first 50,000 FineWeb documents.
+
+This is a form of *L(N)* optimisation in neural scaling-law notation — minimise
+loss given a fixed parameter budget — unconstrained by data or total compute, but
+tightly constrained by artifact size and training speed.
+
+The challenge is inspired by NanoGPT Speedrunning (L(T) optimisation) and
+NanoGPT Slowrun (L(D) optimisation). All three are special cases of the same
+Pareto frontier: the scaling law surface $L(N, D, T)$.
+
+---
+
+## 2 Current State of the Art
+
+The top leaderboard entries as of March 2026 use a consistent set of techniques:
+
+| Run | bpb | Key techniques |
+|:----|:---:|:---------------|
+| 10L Int5-MLP + BigramHash(10240) | 1.1428 | Int5/Int6 mixed QAT, BigramHash, SWA 0.4, WD=0.04 |
+| Int6 MLP3x + SmearGate + BigramHash | 1.1458 | Int6 QAT, 3x MLP, SmearGate, OrthoInit, SWA |
+| 11L MLP3x + Int6 QAT | 1.1502 | 11 layers, 3x MLP, Int6 QAT, zstd-22, sliding eval |
+| Naive Baseline | 1.2244 | 9 layers, 512 dim, 1024 vocab, tied embeddings |
+
+The parameter budget for current SOTA entries is approximately:
+
+$$N_{\text{SOTA}} \approx \frac{(B - C) \cdot 8}{b_{\text{eff}}}$$
+
+where $B = 16 \times 10^6$ bytes is the total budget, $C \approx 50{,}000$ bytes
+is the code footprint, and $b_{\text{eff}} \approx 5.5$ is the effective bits per
+weight after int5/int6 packing and zstd-22 compression:
+
+$$N_{\text{SOTA}} \approx \frac{(16{,}000{,}000 - 50{,}000) \times 8}{5.5} \approx 23 \text{ M parameters}$$
+
+The BigramHash technique partitions the 16 MB budget between a vocabulary bigram
+table (providing a strong unigram/bigram prior cheaply) and the neural model
+(providing long-range context). The best entries use a vocabulary of 1024–10240
+tokens; at 1024 tokens a complete bigram table costs $1024^2 \times 1 \approx 1$ MB,
+leaving ~15 MB for the neural model.
+
+**What the current SOTA does not do:**
+- It does not use sub-5-bit structural quantization designed for maximum
+ information preservation per bit (§D-2.4).
+- It does not use recurrent or state-space architectures that provide sequential
+ memory without O(n²) attention cost.
+- It does not exploit the complement structure of the $\mathbb{Z}_4$ alphabet
+ (§D-2.8) as an inductive bias for weight organisation.
+
+---
+
+## 3 The Q² Compression Advantage
+
+### 3.1 Parameter capacity at 2 bits
+
+Q² uses 2 bits per symbol, packing 4 symbols per byte. Applied to model weights
+as a quantization-aware training (QAT) scheme — training with the quaternary
+constraint from the start, as BitNet does with ternary weights (§R-3.1) — the
+parameter capacity in 16 MB is:
+
+$$N_{\text{Q}^2} \approx \frac{(B - C) \cdot 8}{2} \approx \frac{15{,}950{,}000 \times 8}{2} \approx 63.8 \text{ M parameters}$$
+
+This is a **2.8× increase** in parameter count at the same artifact size, relative
+to the current int5/int6 SOTA.
+
+If the Q² weights compress by an additional factor of $r$ under zstd-22 (possible
+when trained weights exhibit run-length structure that Q²'s Gray encoding exploits,
+§D-2.7), the capacity grows further:
+
+$$N_{\text{Q}^2,\, r} \approx 63.8 \cdot r \text{ M parameters}$$
+
+For $r = 1.2$ (conservative 20% compression beyond raw 2-bit packing), the
+effective capacity is ~76 M parameters.
+
+### 3.2 Why structural quantization outperforms uniform grids at 2 bits
+
+Standard int2 post-training quantization (GPTQ/AWQ at 2 bits) loses substantially
+more accuracy than int4 because the reconstruction objective:
+
+$$\min_{\hat{W}} \| W - \hat{W} \|_F^2$$
+
+tries to approximate float32 weights with 4 levels, and the quantization error at
+2 bits is large enough to disrupt learned representations.
+
+Q² structural quantization has a different objective: preserve the *relational
+geometry* of the weight space, not the pointwise values. The four cells
+$\{A, B, C, D\}$ encode **sign** and **magnitude class**, which are the two
+structural features that determine a weight's contribution to the L1 geometry of
+activation space (§D-1.5). A weight quantized to $A$ (strong negative) and one
+quantized to $C$ (weak positive) are separated by Lee distance 2 — the complement
+distance — reflecting a fundamental opposition in their role, not an accident of
+the numerical grid.
+
+This matters for QAT because:
+
+1. **Complement involution as a regulariser.** The constraint $\theta(W_{ij}) \neq W_{ij}$
+ for all weights (§D-2.8) prevents the model from learning redundant weight pairs
+ where $W_{ij}$ and $W_{kl}$ encode the same functional direction. It enforces
+ orthogonality of the weight organisation at the symbolic level.
+
+2. **Lee metric loss.** Training with a Lee distance penalty on weight changes
+ encourages the model to make transitions that preserve complement structure.
+ Gradient steps that would move $A \to C$ (complement flip, Lee distance 2) are
+ penalised more than steps that move $A \to B$ (adjacent, Lee distance 1).
+
+3. **Gray encoding preserves gradient flow.** The Gray map $\phi$ (§D-2.7) makes
+ Hamming distance on the encoded bits equal to Lee distance on the symbols.
+ The straight-through estimator (STE) for Q²-QAT propagates gradients through
+ the Gray encoding as if the quantization were a smooth threshold operation,
+ and the bit-level gradient is correctly ordered: a gradient pointing from $A$
+ toward $D$ passes through $B$ and $C$ in order, not by a shortcut.
+
+### 3.3 Expected compression benefit
+
+The Gray-encoded weight tensor of a Q²-trained model has a specific statistical
+structure. After training, the equiprobable condition (§D-2.5):
+
+$$P(W_{ij} = A) = P(W_{ij} = B) = P(W_{ij} = C) = P(W_{ij} = D) = \tfrac{1}{4}$$
+
+is the maximum-entropy condition: all four symbols are equally likely, so the raw
+2-bit stream is nearly incompressible. The compression ratio $r \approx 1.0$ in
+this limit.
+
+**However**, trained networks organise their weights into structured patterns:
+attention heads form near-orthonormal pairs, MLP neurons often have complementary
+partners, and weight matrices develop block structure. The Q² run-reduction step
+applied to weight rows (§D-3.1) can be used diagnostically to measure this
+structure: a low transition density (many consecutive identical symbols) implies
+longer runs and higher compressibility.
+
+The empirical prediction is that Q²-QAT weights will compress to $r \approx 1.1$–$1.3$
+under zstd-22 — more than a random 2-bit stream but less than the int5/int6 models
+(which have float-shaped distributions amenable to entropy coding).
+
+---
+
+## 4 Architecture: Liquid Time Constant Networks
+
+### 4.1 The parameter inefficiency of attention
+
+Standard transformer attention has quadratic time complexity $O(n^2 d)$ in sequence
+length and requires four weight matrices of size $d \times d$ per head per layer.
+For a model with hidden dimension $d$ and $L$ layers:
+
+$$N_{\text{attn}} = 4 L d^2$$
+
+In the Parameter Golf setting, attention is expensive: each attention layer in a
+512-dim model costs $4 \times 512^2 = 1.05 \text{ M}$ parameters, and the
+information content is dominated by the key-value store, not the query-key
+interaction.
+
+For short-context tasks (1024–2048 tokens, as used in current winning entries), the
+attention mechanism is also overqualified: most of the model's context budget is
+already consumed by the first $\sim$10 positions, and positions beyond that
+contribute diminishing marginal information.
+
+### 4.2 Closed-form Continuous-time (CfC) layers
+
+Hasani et al.'s **Closed-form Continuous-time** (CfC) networks provide a
+parameter-efficient alternative. The CfC layer solves the Liquid Time Constant
+(LTC) ODE:
+
+$$\dot{h}(t) = -\left[\frac{1}{\tau} + f(h(t), x(t); \theta)\right] h(t) + f(h(t), x(t); \theta)$$
+
+analytically, yielding a closed-form update:
+
+$$h(t + \Delta t) = \exp\!\left(-A_1(t) \cdot \Delta t\right) \odot h(t) + \frac{A_2(t)}{A_1(t)} \cdot \left[1 - \exp\!\left(-A_1(t) \cdot \Delta t\right)\right]$$
+
+where $A_1, A_2$ are functions of the input $x(t)$ and current state $h(t)$, and
+$\exp$ denotes the elementwise exponential. This closed form:
+
+1. Eliminates the numerical integration loop of vanilla LTC networks.
+2. Provides causal, single-pass inference: each token updates the state $h$ in
+ $O(d)$ time, independent of sequence length.
+3. Requires only two linear projections ($A_1, A_2$) plus the state update — far
+ fewer parameters than a full attention block.
+
+**Parameter count comparison.** For hidden dimension $d$:
+
+| Block type | Parameters per layer |
+|:-----------|:--------------------:|
+| Full MHA | $4d^2$ |
+| GQA (4 KV heads) | $\approx 3.5 d^2$ |
+| CfC (closed-form) | $\approx 2 d^2 + 2d$ |
+| CfC (compact) | $\approx d^2 + 2d$ |
+
+The CfC layer requires approximately $d^2$ fewer parameters per layer than
+full attention. Over $L$ layers, this frees:
+
+$$\Delta N = L \cdot d^2 \text{ parameters}$$
+
+For $L = 10$, $d = 512$: $\Delta N = 10 \times 512^2 = 2.6 \text{ M}$ parameters
+freed for other components (larger MLP, larger BigramHash table, or more layers).
+
+### 4.3 Liquid Foundation Models (LFM 2.5) as a template
+
+Liquid AI's **LFM 2.5** model demonstrates the viability of hybrid recurrent +
+attention architectures at production scale. The LFM 2.5 architecture uses:
+
+- **10 LIV (Liquid Integrated Vision/Language) Convolution Blocks:** CfC-based
+ sequential processors that provide O(1) per-token memory through recurrent state.
+- **6 GQA (Grouped Query Attention) Blocks:** Standard attention for positional
+ cross-token mixing.
+- **32k token trained context:** Achievable because LIV blocks handle most of the
+ context without O(n²) cost.
+
+The LFM 2.5 result demonstrates that attention is not required for most of the
+model's depth — the CfC state provides sufficient long-range memory. Attention
+is used selectively for in-context reasoning and positional disambiguation.
+
+For the Parameter Golf setting, the 32k context is not needed. But the principle
+transfers: **replace most attention layers with CfC, keep a few GQA layers for
+in-context mixing.**
+
+### 4.4 CfC layers and Q²-QAT synergy
+
+The Q² structural quantization (§D-2.4) is particularly well-suited to CfC weights
+for two reasons:
+
+1. **State update weights have complement structure.** The two matrices $A_1$ and
+ $A_2$ in the CfC update equation have a natural complement relationship: one
+ controls the decay rate and the other controls the input integration rate.
+ The Q² complement involution $\theta(A) = C$, $\theta(B) = D$ (§D-2.8) encodes
+ this opposition directly — strong-decay and strong-integration are complements
+ in the same way that strong-negative and strong-positive activations are.
+
+2. **Fewer weights need high precision.** CfC state updates involve sigmoid
+ activations, which saturate at $\pm 1$. Near the saturation region, the exact
+ weight value matters less than its sign and magnitude class — precisely what Q²
+ preserves (§D-1.5). The two cells $A$ (strong negative, below $-\tau^{\ast}$)
+ and $D$ (strong positive, above $+\tau^{\ast}$) correspond to the saturation
+ regime; $B$ and $C$ correspond to the linear-response regime near zero.
+
+### 4.5 Geode-derived layer layout
+
+LFM 2.5's 10:6 CfC:GQA ratio was found empirically. The Geode factorization
+(§D-4.1) provides a principled derivation that eliminates the guesswork.
+
+The generating function for Q²'s transition sequences:
+
+$$S(x) - 1 = \frac{4x}{1-3x} = \underbrace{4x}_{S_1} \cdot \underbrace{\frac{1}{1-3x}}_{G}$$
+
+decomposes into two factors with a direct architectural interpretation:
+
+- **$S_1 = 4x$**: the first symbol has **4 choices** — the 4 coarse quantization
+ cells. Architecturally: **4 GQA blocks**, each establishing the broadest
+ context structure (equivalent to selecting one of 4 block files in the
+ transition key, §D-3.4).
+
+- **$G = 1/(1-3x) = 1 + 3x + 9x^2 + \cdots$**: each subsequent symbol has
+ **3 choices** — refinement within the established coarse cell.
+ Architecturally: **3 CfC blocks per GQA block**, each performing one 3-way
+ refinement step within the coarse context.
+
+This gives the layer pattern:
+
+$$\underbrace{[\text{GQA},\ \text{CfC},\ \text{CfC},\ \text{CfC}]}_{\text{one Geode level}} \times 4 = 16 \text{ layers total}$$
+
+**4 GQA + 12 CfC**, with CfC:GQA ratio **3:1** — compared to LFM 2.5's empirical
+10:6 = 1.67:1. The Geode predicts a more CfC-heavy architecture, consistent with
+the hypothesis that less attention is needed at the short-context (2048-token)
+parameter-golf scale.
+
+**Information accumulated at each stage.** The Geode gives the bits of
+structural information captured at depth $k$:
+
+- After 1 GQA block: $\log_2 4 = 2$ bits of coarse context.
+- After each additional CfC step: $+\log_2 3 \approx 1.585$ bits of refinement.
+- After all 16 layers (4 coarse + 12 refinement): $4 \times (2 + 3 \times \log_2 3) \approx 27.0$ bits.
+
+This sits within the 51.1-bit capacity of the full 32-symbol key (§D-3.6),
+confirming the 16-layer model can represent sufficient structural information for
+2048-token language modeling.
+
+**Layer position mapping:**
+
+| Layer | Type | Geode node | Purpose |
+|:-----:|:-----|:----------:|:--------|
+| 1 | GQA | $S_1$ root | Coarse context — 4 choices ($r_0$, §D-3.2) |
+| 2–4 | CfC × 3 | $G$ level 1 | First refinement — 3 choices per step |
+| 5 | GQA | $S_1$ reset | Re-establishes coarse context |
+| 6–8 | CfC × 3 | $G$ level 2 | Second refinement |
+| 9 | GQA | $S_1$ reset | Re-establishes coarse context |
+| 10–12 | CfC × 3 | $G$ level 3 | Third refinement |
+| 13 | GQA | $S_1$ reset | Final coarse context |
+| 14–16 | CfC × 3 | $G$ level 4 | Fourth refinement |
+
+The GQA layers act as "semantic resets" — attending across the full token
+sequence to re-establish coarse structure; the CfC layers refine within that
+structure token-by-token using recurrent state.
+
+---
+
+## 5 The Combined Strategy
+
+### 5.1 Architecture
+
+The proposed architecture for the Parameter Golf submission is a **Q²-QAT hybrid
+LTC-Transformer**, combining:
+
+1. **Q² 2-bit QAT** for all weight matrices (attention, MLP, CfC state).
+2. **Hybrid depth:** Geode-derived layout (§4.5) — [GQA, CfC, CfC, CfC] × 4
+ = 16 layers (4 GQA + 12 CfC).
+3. **BigramHash** vocabulary embedding: a hash table of bigram statistics stored
+ as part of the 16 MB artifact.
+4. **Sliding window evaluation** at stride 64.
+
+```mermaid
+flowchart TD
+ subgraph Model["Q2-QAT Hybrid LTC-Transformer (Geode layout)"]
+ direction TB
+ emb["Token Embedding\n(FP16, tied)"]
+ bh["BigramHash\n(bigram log-probs, 2-4 MB)"]
+ subgraph Stack["16-layer Geode stack: (GQA, CfC, CfC, CfC) x4"]
+ direction TB
+ gqa1["GQA Block x4\n(Q2 2-bit, coarse: 4 choices)"]
+ cfc1["CfC Block x12\n(Q2 2-bit, refine: 3 choices each)"]
+ end
+ lm_head["LM Head\n(tied to embedding)"]
+ end
+ emb --> Stack
+ bh -->|"log-prob prior"| lm_head
+ Stack --> lm_head
+```
+
+**Hidden dimension and layer count.** With 64 M parameters at 2 bits per weight,
+packed 4 per byte, and BigramHash(10240) consuming ~4 MB:
+
+$$N_{\text{model}} \approx \frac{(16 \times 10^6 - 4 \times 10^6 - 50{,}000) \times 8}{2} \approx 48 \text{ M effective parameters}$$
+
+At hidden dimension $d = 768$ with $n_{\text{kv}} = 4$ KV heads and MLP ratio 3×,
+the parameter count breaks down by component:
+
+- **4 GQA blocks:** Q ($d^2$) + K ($d^2/3$) + V ($d^2/3$) + O ($d^2$) +
+ MLP-up/gate/down (3 × 3$d^2$) = $(8/3 + 9)d^2 \approx 11.67d^2$ each.
+- **12 CfC blocks:** $A_1$ ($2d^2$) + $A_2$ ($2d^2$) + out ($d^2$) = $5d^2$ each.
+
+$$N \approx 4 \times 11.67 d^2 + 12 \times 5 d^2 = 106.7 d^2 \approx 63 \text{ M at } d = 768$$
+
+This matches the 64 M capacity projected in §3.1. Tuning $d$ to 700–730 leaves
+room for the BigramHash table; $d = 768$ fills the budget tightly without it.
+
+### 5.2 Quantization scheme
+
+All linear weight matrices $W \in \mathbb{R}^{m \times n}$ are quantized to Q²
+symbols $\{A, B, C, D\} = \{0, 1, 2, 3\} \subset \mathbb{Z}_4$. The quantization
+threshold applied during training:
+
+$$\tau^{\ast} = \frac{\Phi^{-1}(3/4)}{\sqrt{n}} \approx \frac{0.6745}{\sqrt{n}}$$
+
+is computed from the current batch statistics (the empirical 25th and 75th
+percentile of each row) and updated every 1024 training steps — the same
+reservoir-calibration strategy described in §D-2.5 for activation quantization.
+
+The straight-through estimator (STE) propagates gradients through the
+quantization step:
+
+$$\frac{\partial \mathcal{L}}{\partial W_{ij}} \approx \frac{\partial \mathcal{L}}{\partial \hat{W}_{ij}} \cdot \mathbf{1}\!\left[|W_{ij}| \leq \kappa\right]$$
+
+where the passthrough window $\kappa$ is set to exclude extreme outliers that
+would otherwise receive large gradients through the saturating threshold.
+
+**Packed storage.** Q² symbols are Gray-encoded (§D-2.7) and packed 4 per byte
+using the same packing scheme as the WebAssembly kernel in `src/q2.wat`:
+
+```
+byte = (g[4i] << 6) | (g[4i+1] << 4) | (g[4i+2] << 2) | g[4i+3]
+```
+
+This layout is identical to the activation quantization in `src/q2.wat`, making
+the q2.ts library directly usable for weight packing at checkpoint export time.
+
+### 5.3 Mixed-precision allocation
+
+Not all weight matrices benefit equally from 2-bit precision. Following the
+Geode mixed-precision framework (§D-4.3) and the empirical finding of QuES
+(§R-2.4) that arithmetic-reasoning channels require higher precision:
+
+- **Embedding layer:** Tied FP16. The embedding matrix is not quantized; it
+ serves as the interface between the discrete token space and the continuous
+ weight space. FP16 embeddings with 10240 vocabulary and 768 dimensions cost
+ $10240 \times 768 \times 2 \approx 15.7$ MB — too large. With vocabulary 1024:
+ $1024 \times 768 \times 2 = 1.57$ MB, acceptable.
+- **Q² 2-bit for all linear layers:** All attention projections, CfC state
+ matrices, and MLP weight matrices are quantized to Q² 2-bit.
+- **Layer norm parameters:** Kept in FP16 (negligible count, critical for
+ training stability).
+- **BigramHash:** Stored as FP16 log-probabilities, taking 4–8 MB of the budget.
+
+### 5.4 Training strategy
+
+The training recipe follows the current SOTA structure with Q²-specific additions:
+
+| Component | Setting | Rationale |
+|:----------|:--------|:----------|
+| Optimizer | Muon (Nesterov + spectral normalisation) | Current SOTA |
+| Weight decay | 0.04 | Current SOTA |
+| Learning rate schedule | cosine with warmup 200 steps | Standard |
+| SWA (stochastic weight averaging) | last 40% of training | Current SOTA |
+| Q² threshold update | every 1024 steps, reservoir size 1024 | §D-2.5 |
+| STE passthrough | $\kappa = 3\tau^{\ast}$ | Standard QAT practice |
+| Gradient clipping | 1.0 | Training stability |
+| Sequence length | 2048 | Context for language modeling |
+| Evaluation | sliding window stride 64 | Current SOTA |
+| Vocabulary | SP-1024 (SentencePiece, 1024 tokens) | Matches challenge baseline |
+
+**Warm-up from FP32 pre-training.** A common failure mode of QAT is that the
+model begins training with random 2-bit weights that are too noisy for the
+complement structure to emerge. The recommended warm-up strategy:
+
+1. Train for 500 steps in FP32 with standard initialisation (OrthoInit for
+ attention, standard Kaiming for MLP).
+2. Apply Q² quantization to the FP32 checkpoint with empirical threshold
+ calibration.
+3. Continue training with Q²-QAT from the quantized checkpoint.
+
+This mirrors the BitNet finding (§R-3.1) that training-from-scratch QAT requires
+a brief float-precision warm-up to establish the initial activation distribution
+before the quantization constraint is imposed.
+
+### 5.5 LIV cache-line packing and byte tokenization
+
+Two additional techniques, compatible with the Geode architecture, that can
+improve parameter efficiency and reduce artifact size further:
+
+#### 5.5.1 LIV cache-line packing
+
+LIV (Liquid Integrated Vision/Language) symbols use 5-bit quantisation (int5,
+32 levels). A 64-bit register holds:
+
+$$12 \times 5 + 2 + 2 = 64 \text{ bits}$$
+
+That is, **12 LIV symbols** (60 bits) plus a **2-bit Q² tag** and 2 unused bits.
+The Q² tag is a coarse-context label — one of 4 values matching the
+$S_1 = 4x$ coarse level of the Geode factorization — that identifies which
+GQA "bucket" produced the 12-symbol LIV block.
+
+**Packing layout** (bits 63 → 0, MSB-first):
+
+```
+[sym0(5)] [sym1(5)] … [sym11(5)] [tag(2)] [00]
+ bit 63 bits 8:4 bits 3:2 1:0
+```
+
+sym0 → bits [63:59], sym1 → bits [58:54], …, sym11 → bits [8:4]; tag → bits
+[3:2]; bits [1:0] are unused (zero).
+
+This layout has two computable advantages:
+
+1. **Parallel dispatch by tag.** The 2-bit tag [0..3] partitions the packed
+ words into 4 groups. Each GPU streaming multiprocessor processes one tag
+ group, maximizing cache locality and SM utilization without coordination
+ overhead.
+
+2. **The 10-LIV codon representation.** Taking only the top 10 × 5 = 50 bits,
+ the block can be interpreted as **two 5 × 5 binary matrices** $M_1$ and
+ $M_2$ (25 + 25 = 50 bits). Their Boolean matrix product:
+
+ $$C_{ij} = \bigvee_k \left[(M_1)_{ik} \wedge (M_2)_{kj}\right]$$
+
+ is a deterministic function of the pair. This means:
+    - A "codon" (the Boolean product $C$) acts as a fingerprint of the
+      (M₁, M₂) pair: distinct pairs may collide on the same product, but any
+      mismatch proves a candidate pair is not the one stored.
+ - Any candidate pair can be verified against a stored codon in $O(25)$
+ Boolean operations — cheap on GPU via warp-level bitwise ops.
+    - The remaining 14 bits (sym10 and sym11 — 10 bits — plus the 2-bit tag
+      and 2 unused bits) serve as a sequence index ordering codons for
+      distributed processing.
+
+ This convolution-verifiable structure mirrors the role of the Q² transition
+ key (§D-3.3) but at a coarser 5-bit resolution, providing a hardware-level
+ checksum for the LIV block without extra storage.
+
+`scripts/q2_pack.py` exports `pack_liv_cacheline` and `unpack_liv_cacheline`
+that implement this layout on GPU-resident tensors.
+
+#### 5.5.2 Byte tokenization — skip the tokeniser encoder
+
+The SP-1024 tokenizer introduces a pre-processing step (encode/decode) that
+costs latency and requires a vocabulary embedding matrix of
+$V \times d \times 2 = 1024 \times 768 \times 2 \text{ bytes} \approx 1.6$ MB at FP16.
+
+At the byte level, vocabulary is always exactly 256, regardless of corpus
+language or domain:
+
+| Tokenization | Vocab | Embedding cost | Tokenizer | Compression |
+|:-------------|:-----:|:--------------:|:---------:|:-----------:|
+| SP-1024 | 1024 | 1.57 MB | Required | ~3× sub-word |
+| Raw bytes | 256 | 0.39 MB | None | 1× byte |
+
+The embedding savings alone free ~1.2 MB — enough for additional model
+parameters at 2 bits/weight ($\approx 5$ M extra weights).
+
+**Training on raw bytes.** Set `BYTE_TOKENS=1` to enable byte mode in
+`scripts/train_q2_ltc.py`. The data shards are read as raw `uint8` streams;
+each byte becomes a token id in [0, 255]. No SentencePiece encode/decode step
+is needed anywhere in the pipeline:
+
+```bash
+BYTE_TOKENS=1 VOCAB_SIZE=256 torchrun --standalone --nproc_per_node=8 \
+ scripts/train_q2_ltc.py
+```
+
+The model sees the same FineWeb text; the challenge scorer operates on bytes
+and computes bpb directly on the byte sequence, so there is no evaluation
+penalty for skipping the tokenizer.
+
+---
+
+## 6 Implementation Roadmap
+
+The implementation is in two Python scripts in `scripts/`:
+
+- **`scripts/q2_pack.py`** — GPU-accelerated Q² weight packing and unpacking.
+- **`scripts/train_q2_ltc.py`** — Complete training script: Q²-QAT, Geode
+ architecture, Muon optimizer, SWA, and artifact packaging.
+
+### 6.1 Phase 1 — Q² weight packing (`scripts/q2_pack.py`)
+
+`q2_pack.py` converts a PyTorch state dict to the Q2BN binary format and back.
+All quantisation operations run on GPU when available, falling back to CPU.
+
+Key functions:
+
+- `empirical_tau(W)` — per-row 75th-percentile threshold (§D-2.5), vectorised
+ on GPU via `torch.quantile`.
+- `q2_quantise(W, tau)` — four-cell quantisation to {A=0,B=1,C=2,D=3} using
+ three vectorised comparisons with no Python loops.
+- `gray_encode(sym)` / `gray_decode(gray)` — Gray map φ: sym XOR (sym >> 1).
+- `pack_symbols(gray)` / `unpack_symbols(packed, n)` — 4 symbols per byte,
+ MSB-first; packing uses a single batched `|` operation over the 4-symbol groups.
+- `pack_state_dict(state_dict, out_path)` — serialise to Q2BN format.
+- `unpack_state_dict(in_path, device)` — deserialise back to float tensors.
+- `pack_liv_cacheline(symbols, seq_tags)` / `unpack_liv_cacheline(packed, n)` —
+  LIV 5-bit cache-line packing (§5.5.1): 12 LIV symbols + 2-bit Q² tag per 64-bit word.
+
+CLI usage:
+
+```bash
+# Pack a PyTorch checkpoint to Q2 binary:
+python scripts/q2_pack.py model.pt model.q2bin
+
+# Inspect a packed file:
+python scripts/q2_pack.py --unpack model.q2bin
+```
+
+### 6.2 Phase 2 — Training script (`scripts/train_q2_ltc.py`)
+
+`train_q2_ltc.py` is the complete training script. It implements:
+
+- **`Q2Linear`** — `nn.Linear` subclass with STE quantisation. Behaves as a
+ standard linear layer during FP32 warm-up; call `activate_q2()` to switch.
+ Refreshes τ* every `tau_update_every` steps from the empirical weight
+ distribution.
+
+- **`CfCBlock`** — One Geode G-level (3-way refinement). Runs the closed-form
+ LTC update per token; state `h` propagates across the sequence with no KV
+ cache. All projections are `Q2Linear`.
+
+- **`GQABlock`** — One Geode S1-level (4-way coarse selection). Uses
+ `F.scaled_dot_product_attention` (FlashAttention kernel on H100) with GQA
+ head sharing. SwiGLU MLP with 3× expansion. All projections are `Q2Linear`.
+
+- **`Q2LTCModel`** — Full 16-layer model with Geode layout
+ `[GQA, CfC, CfC, CfC] × 4`. OrthoInit weights; tied embeddings and LM head.
+
+- **`Muon`** — Nesterov momentum + per-matrix spectral normalisation. Prevents
+ large weight moves from disrupting Q² complement structure during QAT.
+
+- **Training loop** — `torch.compile(mode="max-autotune")` for kernel fusion;
+ bfloat16 autocast; gradient accumulation; cosine LR + warmup; SWA from 60% of
+ training; sliding-window validation; automatic Q2BN + zstd-22 packaging.
+ Byte-mode training (`BYTE_TOKENS=1`) skips the tokeniser encoder entirely
+ (§5.5.2).
+
+Single-GPU smoke test:
+
+```bash
+MAX_STEPS=200 BATCH_TOKENS=8192 python scripts/train_q2_ltc.py
+```
+
+Full 8×H100 run (SP-1024 tokens):
+
+```bash
+torchrun --standalone --nproc_per_node=8 scripts/train_q2_ltc.py
+```
+
+Full 8×H100 run (raw bytes, no tokeniser):
+
+```bash
+BYTE_TOKENS=1 torchrun --standalone --nproc_per_node=8 scripts/train_q2_ltc.py
+```
+
+### 6.3 Phase 3 — Artifact packaging (built into training script)
+
+At the end of training, `train_q2_ltc.py` automatically:
+
+1. Selects the SWA-averaged model (or the final model if SWA has not started).
+2. Packs all weight matrices to Q2BN via `q2_pack.pack_state_dict`.
+3. Compresses with zstd level 22 (requires `pip install zstandard`).
+4. Reports the total artifact size and flags if it exceeds 16 MB.
+
+To trigger packaging on an existing checkpoint:
+
+```bash
+python -c "
+import torch, sys
+sys.path.insert(0, 'scripts')
+import q2_pack
+sd = torch.load('checkpoint.pt', map_location='cpu', weights_only=True)
+n = q2_pack.pack_state_dict(sd.get('model', sd), 'model.q2bin')
+print(f'{n/1e6:.3f} MB')
+"
+```
+
+---
+
+## 7 Performance Projections
+
+### 7.1 Parameter capacity
+
+| Method | Bits/weight | Parameters in 16 MB | Relative capacity |
+|:-------|:-----------:|:-------------------:|:-----------------:|
+| Naive baseline (int8) | 8 | ~11 M | 1.0× |
+| Current SOTA (int5/int6) | 5.5 | ~23 M | 2.1× |
+| Q² 2-bit | 2.0 | ~64 M | 5.8× |
+| Q² 2-bit + zstd compression | ~1.7 | ~75 M | 6.8× |
+
+### 7.2 Scaling law projection
+
+Under the Chinchilla scaling law, language model loss scales as:
+
+$$L(N, D) = E + \frac{A}{N^{\alpha}} + \frac{B}{D^{\beta}}$$
+
+with $E \approx 1.61$ nats/token (irreducible entropy), $\alpha \approx 0.34$,
+$\beta \approx 0.28$.
+
+In the Parameter Golf setting $D$ is effectively unlimited (8B tokens available);
+the bottleneck is $N$. Moving from 23 M to 64 M parameters at the same data
+volume predicts:
+
+$$\Delta L \approx A \cdot \left(N_{23M}^{-\alpha} - N_{64M}^{-\alpha}\right) = A \cdot \left((23 \times 10^6)^{-0.34} - (64 \times 10^6)^{-0.34}\right)$$
+
+For a rough estimate with $A \approx 406.4$ (Chinchilla value):
+
+$$\Delta L \approx 406.4 \times (3.14 \times 10^{-3} - 2.22 \times 10^{-3}) \approx 0.38 \text{ nats/token}$$
+
+Converting to bpb: $\Delta \text{bpb} = \Delta L / \ln 2 \approx 0.54$. Taken
+at face value this is implausibly large: the Chinchilla constants were fitted
+at scales orders of magnitude above the 10–100 M parameter regime and
+overstate marginal gains here. Discounting the naive extrapolation to roughly
+15% of face value — a conservative reading for the small-model regime — gives
+$\Delta \text{bpb} \approx 0.08$, and a projected bpb of
+$1.1428 - 0.08 \approx 1.06$ for the pure scaling benefit of 2.8× more
+parameters — ignoring any additional benefit from the CfC architecture's
+superior parameter efficiency per layer.
+
+**Caveat.** This projection assumes that 2-bit Q² model quality matches 5-bit
+quality at the same parameter count, which requires successful QAT. The
+BitNet b1.58 (§R-3.1) and binary/ternary weight literature (§R-3.2) consistently
+show that QAT-from-scratch at ≥1.58 bits is competitive with post-training
+quantization at 4–5 bits. The 2-bit Q² point is between ternary (1.58 bits) and
+binary-weighted quantization (1 bit), and the complement structure of
+$\mathbb{Z}_4$ provides richer inductive bias than either.
+
+### 7.3 The CfC efficiency multiplier
+
+The CfC parameter efficiency argument is harder to quantify analytically. The LFM
+2.5 result (matching or exceeding GPT-class models on language benchmarks with
+far fewer attention operations) suggests that the CfC recurrent state provides
+$O(d)$ effective context memory at $O(d^2)$ parameter cost — the same
+asymptotic complexity as attention, but with lower constant factors because:
+
+- No key-value cache growth with sequence length.
+- No positional encoding overhead.
+- State update is a sigmoid multiply-add, not a softmax over all prior keys.
+
+For the 10-minute training constraint on 8×H100, the CfC blocks train faster per
+step than attention blocks of equal parameter count because there is no CUDA
+FlashAttention kernel overhead for the CfC state update (a simple element-wise
+operation).
+
+### 7.4 Summary projection
+
+| Component | Estimated bpb improvement |
+|:----------|:-------------------------:|
+| Current SOTA baseline | 1.1428 |
+| Q² 2-bit QAT (parameter scaling alone) | -0.08 |
+| CfC architecture (parameter efficiency) | -0.02 to -0.05 (estimated) |
+| Larger BigramHash enabled by space saving | -0.01 to -0.02 |
+| **Projected total** | **~1.00 to 1.03** |
+
+A score of 1.00–1.05 bpb would represent a substantial improvement over the
+current SOTA (1.1428 bpb) — an advance of roughly 0.08–0.14 bpb, well above the
+0.005-nat (~0.007 bpb) significance threshold required for leaderboard submission.
+
+### 7.5 Williams SpaceTime bound and optimal bit width
+
+**Ryan Williams (2025)** proved that any multitape Turing machine running in time
+$t(n)$ can be simulated in space:
+
+$$S = \mathcal{O}\!\left(\sqrt{t(n) \cdot \log t(n)}\right)$$
+
+(Williams, *Simulating Time With Square-Root Space*, STOC 2025 / arXiv:2502.17779.)
+This is a dramatic improvement over the Hopcroft–Paul–Valiant 1975 bound of
+$\mathcal{O}(t / \log t)$, and it gives a rigorous information-theoretic relationship
+between computation time and storage space.
+
+#### Applying Williams to the 16 MB / 10-minute constraint
+
+**Available computation** (8×H100, BF16, 10 min):
+
$$t = 8 \times 989 \times 10^{12} \times 600 \approx 4.75 \times 10^{18} \text{ FLOPs}$$
+
+**Williams lower bound on space** needed to faithfully simulate $t$:
+
+$$S_{\min} = \mathcal{O}\!\left(\sqrt{4.75 \times 10^{18} \times \log_2(4.75 \times 10^{18})}\right)
+ = \mathcal{O}\!\left(\sqrt{4.75 \times 10^{18} \times 62}\right)
+ \approx 1.72 \times 10^{10} \text{ bits} \approx 2.15 \text{ GB}$$
+
+**Our artifact space**: $S = 16 \times 10^6 \times 8 = 1.28 \times 10^8$ bits (16 MB).
+
+$$\frac{S}{S_{\min}} \approx \frac{1.28 \times 10^8}{1.72 \times 10^{10}} = 0.0075 = 0.75\%$$
+
+We have **0.75% of the Williams-implied storage**. This places the challenge firmly
+in the deep-compression regime: the model is far too small to faithfully represent
+all computation in the training run. Only the most structured, compressible patterns
+in FineWeb can be captured.
+
+#### Reverse: what does 16 MB imply about effective computation?
+
+Inverting $S^2 \approx t \cdot \log_2 t$ for $S = 1.28 \times 10^8$ bits:
+
$$S^2 = 1.638 \times 10^{16} \implies t_{\max} \approx 3.4 \times 10^{14} \text{ FLOPs}$$
+
+**Interpretation**: A 16 MB model can faithfully encode the structure of approximately
+$3.4 \times 10^{14}$ FLOPS of computation — or about $7 \times 10^{-3}$% of the
+10-minute H100 training budget. The remaining training FLOPS refine the model's
+weights without encoding qualitatively new information (they push the stored structure
+toward the FineWeb distribution, but cannot expand the model's capacity).
+
+This is why the challenge rewards **compression per bit above all else**: every bit
+is precious. Any format that wastes bits on alignment padding, metadata overhead, or
+suboptimal bit-width penalizes the final score.
+
+#### Cache-line efficiency by bit width
+
+A 64-byte cache line holds 512 bits. The waste per line and total parameter budget
+for each integer bit width. The table shows **GPU-native 64-bit register alignment**
+(CUDA operates on 64-bit or 32-bit aligned chunks):
+
+| Bits/weight | Params/register | Wasted bits/register | Params/cache-line | Effective N (16 MB) |
+|:-----------:|:---------------:|:--------------------:|:-----------------:|:-------------------:|
+| 1 | 64 | 0 | 512 | 128 M |
+| **2 (Z₄)** | **32** | **0** | **256** | **64 M** |
+| 4 (Z₈) | 16 | 0 | 128 | 32 M |
+| **5 (int5)** | 12 | **4** | **96** | **~24 M** |
+| **6 (int6)** | 10 | **4** | **80** | **~20 M** |
+| 8 (Z₁₆) | 8 | 0 | 64 | 16 M |
+
+Power-of-2 bit widths (1, 2, 4, 8) divide evenly into 64-bit registers — **zero
+waste**. For int5 and int6, packing per 64-bit register leaves 4 unused bits
+(6.25% per register). Across 2,000,000 registers in 16 MB:
+
+$$2{,}000{,}000 \times 4 \text{ bits} = 8{,}000{,}000 \text{ bits} = 1 \text{ MB wasted}$$
+
+That 1 MB recovers $\approx 4$ M additional Z₄ parameters (1 MB × 8 / 2 bits =
+4 M params) — enough to noticeably move bpb via Chinchilla scaling (§7.2).
+
+#### The LIV bit-width question resolved
+
+The current SOTA uses post-training quantization to **int5** (LFM 2.5 GGUF format).
+Several parallel analyses have been debating whether LIV blocks need 4 or 5 bits.
+The Williams + cache-line analysis gives a definitive answer:
+
+1. **For Q²-QAT training from scratch**: use Z₄ **2-bit** throughout.
+ This maximises $N = 64$ M parameters — the information-theoretically optimal
+ choice for integer bit widths, given that 2-bit is the minimum meaningful
+ representation (1-bit binary weights are viable but lose the complement structure
+ of $\mathbb{Z}_4$ that makes Q² quantization uniquely effective).
+
+2. **For LIV-format post-training compression**: **4-bit (Z₈)** strictly dominates
+ **5-bit (int5)** for GPU-aligned storage because 4-bit has zero register waste
+ ($N = 32$ M) while int5 wastes 4 bits per register ($N \approx 24$ M effective,
+ not 25.6 M nominal).
+
+3. **The §5.5.1 scheme** (12 LIV × 5-bit + 4-bit Q² tag = 64 bits exactly) IS a
+ perfectly aligned 64-bit word — no register waste — but allocates 4 of 64 bits
+ to metadata rather than weight storage, giving an effective density of
+ $64/12 = 5.33$ bits/LIV. This is useful for parallel dispatch and codon
+ verification, but less dense than pure Z₄ (2 bits/param) or Z₈ (4 bits/param).
+
+**Bottom line**: Our Q²-QAT approach uses Z₄ 2-bit weights for all model parameters.
+This is the unique integer bit-width that simultaneously:
+- Achieves maximum $N = 64$ M parameters in the 16 MB budget
+- Packs perfectly into 64-bit registers and 64-byte cache lines (zero waste)
+- Preserves the $\mathbb{Z}_4$ complement structure and Lee metric
+- Falls within the training-from-scratch QAT regime proven competitive by BitNet (§R-3.1)
+
+The int5/int6 debate applies to post-training quantization of float-trained models.
+For QAT-from-scratch, 2-bit is the correct choice from both a Williams perspective
+(maximise $N$) and an algebraic one (preserve $\mathbb{Z}_4$ ring structure).
+
+#### Reconciliation with parallel analyses
+
+Two parallel analyses (in `PARAMETER_GOLF_REVISED.md` and `docs/parameter-golf.md`
+on the `main` branch) reach compatible conclusions:
+
+- `PARAMETER_GOLF_REVISED.md` correctly identifies that **odd bit-widths are
+ suboptimal for cache alignment** and recommends power-of-2 widths. Williams
+ confirms this: every wasted bit reduces $N$, directly increasing bpb.
+
+- `docs/parameter-golf.md` recommends mixed int5/int6 precision, which is the
+ leaderboard SOTA approach. The Williams analysis shows this is suboptimal vs.
+ 2-bit QAT because it achieves $N_{\text{eff}} \approx 24$ M at int5 (not the
+ nominal 25.6 M, due to register alignment), while Q² 2-bit achieves $N = 64$ M.
+ From §7.2, the predicted $\Delta\text{bpb} \approx 0.08$ from this parameter
+ gap alone.
+
+The three analyses converge on: **maximum parameters at lowest possible bit-width
+with perfect cache alignment** — which is Q² 2-bit.
+
+---
+
+## 8 References
+
+- OpenAI Parameter Golf challenge.
+- OpenAI Parameter Golf GitHub repository.
+- Williams, R. (2025). Simulating Time With Square-Root Space. *Proc. STOC 2025*.
+ arXiv:2502.17779. (§7.5)
- Hasani, R., Lechner, M., Amini, A., Rus, D., & Grosu, R. (2021). Liquid
  Time-constant Networks. *AAAI-2021*. arXiv:2006.04439.
+- Hasani, R., Lechner, M., Amini, A., Liebenwein, L., Ray, A., Tschaikowski, M.,
+ Teschl, G., & Rus, D. (2022). Closed-form Continuous-time Neural Networks.
+ *Nature Machine Intelligence* 4, 992–1003. arXiv:2106.13898.
+- Liquid AI. LFM 2.5 Technical Report. (2025).
+
- Ma, S. et al. (2024). The Era of 1-bit LLMs: All Large Language Models are in
  1.58 Bits. arXiv:2402.17764. (§R-3.1)
+- Wildberger, N. J. & Rubine, D. (2025). A Hyper-Catalan Series Solution to
+ Polynomial Equations, and the Geode. *Amer. Math. Monthly* 132:5, 383–402.
+ (§D-4.1)
+- Hammons, A. R., Kumar, P. V., Calderbank, A. R., Sloane, N. J. A., & Solé, P.
+ (1994). The $\mathbb{Z}_4$-linearity of Kerdock, Preparata, Goethals, and related
+ codes. *IEEE Trans. Inform. Theory* 40:2, 301–319. (§D-2.7)
+- NanoGPT Speedrunning.
diff --git a/scripts/q2_pack.py b/scripts/q2_pack.py
new file mode 100644
index 0000000..5d28971
--- /dev/null
+++ b/scripts/q2_pack.py
@@ -0,0 +1,501 @@
+#!/usr/bin/env python3
+"""
+q2_pack.py — GPU-accelerated Q² weight packing and unpacking.
+
+Packs PyTorch float32 weight matrices to Q² 2-bit symbols using the Z4
+Lee-metric alphabet {A=0, B=1, C=2, D=3}. Gray-encoded, 4 symbols per byte,
+MSB-first — identical to the q2 dtype in src/q2.ts.
+
+All heavy operations run on CUDA when available; falls back to CPU silently.
+
+Public API
+----------
+ pack_state_dict(state_dict, out_path) -> int (artifact bytes)
+ unpack_state_dict(in_path, device) -> dict[str, Tensor]
+
+CLI
+---
+ python scripts/q2_pack.py model.pt model.q2bin # pack checkpoint
+ python scripts/q2_pack.py --unpack model.q2bin # inspect packed file
+"""
+from __future__ import annotations
+
+import argparse
+import io
+import math
+import re
+import struct
+from pathlib import Path
+from typing import Dict, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
# Preferred device for the packing kernels: CUDA when present, CPU otherwise.
_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Magic bytes and version for the binary format.
_HEADER_MAGIC = b"Q2BN"
_FORMAT_VERSION = 2  # v2 adds per-row τ and alias records
+
+# ── quantisation ──────────────────────────────────────────────────────────────
+
def empirical_tau(W: Tensor) -> Tensor:
    """Per-row equiprobable threshold τ* from empirical weight statistics.

    τ* is the 75th percentile of |W| along each row — for Gaussian weights
    this equals Φ⁻¹(¾)·σ (§D-2.5), and the empirical quantile adapts to
    non-Gaussian shapes (post-ReLU, SwiGLU) with no distributional
    assumptions. Clamped away from zero so downstream division is safe.

    Args:
        W: (rows, cols) float32 on any device.

    Returns:
        (rows, 1) float32 threshold, same device as W.
    """
    magnitudes = W.float().abs()
    q75 = torch.quantile(magnitudes, 0.75, dim=1, keepdim=True)
    return q75.clamp(min=1e-6)
+
+
+def q2_quantise(W: Tensor, tau: Tensor | None = None) -> Tensor:
+ """Quantise float32 weight matrix W to Z4 symbols {A=0, B=1, C=2, D=3}.
+
+ The four equiprobable cells:
+ A (0) : w <= -tau (strong negative)
+ B (1) : -tau < w <= 0 (weak negative)
+ C (2) : 0 < w <= tau (weak positive)
+ D (3) : w > tau (strong positive)
+
+ Built with vectorised masks and no Python loops — runs entirely in CUDA
+ kernels when W is on a GPU tensor.
+
+ Args:
+ W: (rows, cols) float32.
+ tau: (rows, 1) threshold. Computed via empirical_tau if None.
+
+ Returns:
+ (rows, cols) uint8, values in {0, 1, 2, 3}.
+ """
+ W = W.float()
+ if tau is None:
+ tau = empirical_tau(W)
+
+ # Build all four masks in parallel; compose sym with integer addition.
+ # Start at 0 (A), increment for each boundary crossed.
+ # neg_strong → sym stays 0
+ sym = (W > -tau).to(torch.uint8) # 0 if A (w <= -tau), else 1
+ sym = sym + (W > 0).to(torch.uint8) # +1 if past zero → 1=B or 2=C/D
+ sym = sym + (W > tau).to(torch.uint8) # +1 if past +tau → 2=C becomes 3=D
+ # Result: A=0 B=1 C=2 D=3, all in one pass.
+ return sym
+
+
def gray_encode(sym: Tensor) -> Tensor:
    """Apply the Gray map φ: Z4 → {0,1,2,3}.

    φ(n) = n XOR (n >> 1), i.e. A=0→00, B=1→01, C=2→11, D=3→10.
    Under this map, Hamming distance on the 2-bit codes equals Lee
    distance on Z4 (Theorem 2.1, DESIGN.md §2.7).
    """
    half = sym >> 1
    return sym.bitwise_xor(half).to(torch.uint8)
+
+
def gray_decode(gray: Tensor) -> Tensor:
    """Invert the Gray map.

    The 2-bit Gray map is an involution: applying n XOR (n >> 1) again
    recovers the original symbol, so decoding reuses the encoder formula.
    """
    half = gray >> 1
    return gray.bitwise_xor(half).to(torch.uint8)
+
+
def pack_symbols(gray: Tensor) -> Tensor:
    """Pack 4 Gray-encoded Z4 symbols per byte, MSB-first.

    Layout matches src/q2.ts and src/q2.wat:
        byte = (g[4i] << 6) | (g[4i+1] << 4) | (g[4i+2] << 2) | g[4i+3]

    Args:
        gray: (rows, cols) uint8 in {0, 1, 2, 3}.

    Returns:
        (rows, ceil(cols/4)) uint8. If cols % 4 != 0, the final byte is
        zero-padded on the right.
    """
    rows, cols = gray.shape
    remainder = (-cols) % 4
    if remainder:
        # Right-pad with zero symbols so every byte holds exactly 4.
        gray = torch.cat([gray, gray.new_zeros(rows, remainder)], dim=1)
    quads = gray.view(rows, -1, 4).to(torch.int32)
    byte = (quads[..., 0] << 6) | (quads[..., 1] << 4) | (quads[..., 2] << 2) | quads[..., 3]
    return byte.to(torch.uint8)
+
+
def unpack_symbols(packed: Tensor, n: int) -> Tensor:
    """Unpack bytes back to Gray-encoded Z4 symbols.

    Args:
        packed: (rows, ceil(n/4)) uint8.
        n: number of symbols per row in the original tensor.

    Returns:
        (rows, n) uint8 in {0, 1, 2, 3}.
    """
    p = packed.to(torch.int32)
    # Broadcast one shift per 2-bit field: MSB-first order 6, 4, 2, 0.
    shifts = torch.tensor([6, 4, 2, 0], dtype=torch.int32, device=packed.device)
    fields = (p.unsqueeze(-1) >> shifts) & 0x3   # (rows, n_bytes, 4)
    flat = fields.view(packed.shape[0], -1)      # (rows, n_bytes*4)
    return flat[:, :n].to(torch.uint8)           # drop any pad symbols
+
+
+# ── state-dict packing ────────────────────────────────────────────────────────
+
def pack_tensor(W: Tensor) -> Tuple[bytes, int]:
    """Serialise one tensor; return ``(data, dtype_flag)``.

    dtype_flag meanings:
        0 → Q2 packed (2-D or higher weight matrix);
            data = rows×2 fp16 τ bytes followed by packed symbol bytes.
        1 → fp16 raw (1-D tensor: bias, layer-norm scale/shift).
        2 → alias (emitted by pack_state_dict; never returned here).

    Tensors with ndim > 2 are flattened to (shape[0], prod(shape[1:]))
    before quantisation; the original shape is recorded separately in the
    header so unpack_state_dict can reshape correctly. The per-row τ is
    kept as fp16 so the unpacker can dequantise back to trained weight
    magnitudes rather than unit-scale symbols.
    """
    if W.ndim < 2:
        return W.cpu().half().contiguous().numpy().tobytes(), 1

    # Flatten to 2-D (rows, cols) and move to the packing device.
    n_rows = W.shape[0]
    n_cols = math.prod(W.shape[1:])
    flat = W.reshape(n_rows, n_cols).to(_DEVICE).float()

    row_tau = empirical_tau(flat)  # (rows, 1) float32
    packed = pack_symbols(gray_encode(q2_quantise(flat, row_tau)))

    # Payload: fp16 τ (rows × 2 bytes), then the packed symbol bytes.
    tau_blob = row_tau.squeeze(1).half().cpu().contiguous().numpy().tobytes()
    sym_blob = packed.cpu().contiguous().numpy().tobytes()
    return tau_blob + sym_blob, 0
+
+
+def _geode_stratum(key: str) -> Tuple[int, int]:
+ """Sort key for Geode-stratum ordering in the binary file.
+
+ Ordering follows the Geode tree traversal (S-1 = S1·G):
+ stratum 0 : embedding, emb_norm (input interface)
+ strata 1–4: [GQA, CfC, CfC, CfC] blocks in sequence-order
+ each GQA+CfC group maps to one S1 vertex and its G sub-tree
+ stratum 5 : output norm, lm_head (output interface)
+ stratum 6 : anything else (buffers etc.)
+
+ Parameters that belong to the same Geode computation unit are adjacent in
+ the file, maximising run-length compression (zstd sees long identical-structure
+ blocks) and enabling sorted page-through during inference reconstruction.
+ """
+ if key.startswith(("embed.", "emb_norm.")):
+ return (0, 0)
+
+ m = re.match(r"layers\.(\d+)\.", key)
+ if m:
+ layer_idx = int(m.group(1))
+ # Group index: each [GQA+CfC×3] unit = 4 consecutive layer indices.
+ group = layer_idx // 4 # 0, 1, 2, 3
+ within = layer_idx % 4 # 0=GQA, 1-3=CfC
+ # GQA (S1 coarse) sorts before its CfC sub-tree (G refinement).
+ return (1 + group, within)
+
+ if key.startswith(("norm.", "lm_head.")):
+ return (5, 0)
+
+ return (6, 0)
+
+
def pack_state_dict(
    state_dict: Dict[str, Tensor],
    out_path: str | Path,
) -> int:
    """Serialise a PyTorch state dict to the Q2 binary format (v2).

    Wire format (all integers big-endian):
        4 B  magic    "Q2BN"
        1 B  version  uint8 = 2

    Per tensor (repeated, ordered by Geode stratum):
        4 B  key_len     uint32
        *    key         UTF-8 bytes
        1 B  ndim        uint8
        4*n  shape       uint32 × ndim
        1 B  dtype_flag  uint8:
               0 = Q2 packed with per-row τ
                   data = rows*2 fp16 τ + ceil(cols/4)*rows packed bytes
               1 = fp16 raw (1-D tensors)
               2 = alias — data is 4-byte key_len + alias_key UTF-8;
                   unpacker must resolve to a previously-loaded tensor.
        8 B  n_bytes     uint64
        *    data        (dtype_flag-specific content above)

    Tied weights (embed.weight ≡ lm_head.weight) are deduplicated: the
    first occurrence is serialised in full and later occurrences become
    alias records — mirroring the "clustering and collisions are ok"
    principle of the Q² design (§D-2.5).

    Returns the total file size in bytes.
    """
    stream = io.BytesIO()
    write = stream.write
    write(_HEADER_MAGIC)
    write(struct.pack(">B", _FORMAT_VERSION))

    # File layout mirrors the Geode computation tree (§5.5.1 parallel
    # dispatch; §D-4.1 traversal order), keeping similar tensors adjacent.
    written_by_ptr: Dict[int, str] = {}

    for name in sorted(state_dict.keys(), key=_geode_stratum):
        tensor = state_dict[name]
        encoded_name = name.encode()
        write(struct.pack(">I", len(encoded_name)))
        write(encoded_name)

        dims = tuple(tensor.shape)
        write(struct.pack(">B", len(dims)))
        write(struct.pack(f">{len(dims)}I", *dims))

        # Detect tied weights by underlying data pointer.
        storage = tensor.data_ptr()
        first_owner = written_by_ptr.get(storage)
        if first_owner is not None:
            # Alias record: dtype_flag=2, data = alias_key bytes.
            ref = first_owner.encode()
            body = struct.pack(">I", len(ref)) + ref
            write(struct.pack(">BQ", 2, len(body)))
            write(body)
        else:
            written_by_ptr[storage] = name
            body, flag = pack_tensor(tensor)
            write(struct.pack(">BQ", flag, len(body)))
            write(body)

    blob = stream.getvalue()
    Path(out_path).write_bytes(blob)
    return len(blob)
+
+
def unpack_state_dict(
    in_path: str | Path,
    device: str | torch.device = "cpu",
    dtype: torch.dtype = torch.float32,
) -> Dict[str, Tensor]:
    """Load a Q2BN file back to a float-valued state dict.

    Format v2: per-row τ is stored alongside the packed symbols; dequantised
    values use the saved τ to recover the correct weight magnitudes.
    Format v1 (legacy): unit-scale reconstruction {-1, -0.5, +0.5, +1}.
    Alias records (dtype_flag=2) are resolved to the previously-loaded tensor.
    Multi-dimensional tensors are reshaped back to their original shape.

    Args:
        in_path: path to a file written by pack_state_dict.
        device: device for the returned tensors.
        dtype: floating dtype for the returned tensors.

    Returns:
        Mapping from parameter name to reconstructed tensor.

    Raises:
        ValueError: if the file does not start with the Q2BN magic bytes.
    """
    raw = Path(in_path).read_bytes()
    if raw[:4] != _HEADER_MAGIC:
        raise ValueError(f"Not a Q2BN file: {in_path}")
    file_version = raw[4]
    pos = 5

    result: Dict[str, Tensor] = {}
    while pos < len(raw):
        (key_len,) = struct.unpack_from(">I", raw, pos)
        pos += 4
        key = raw[pos : pos + key_len].decode()
        pos += key_len

        (ndim,) = struct.unpack_from(">B", raw, pos)
        pos += 1
        shape = struct.unpack_from(f">{ndim}I", raw, pos)
        pos += 4 * ndim

        (dtype_flag,) = struct.unpack_from(">B", raw, pos)
        pos += 1
        (n_bytes,) = struct.unpack_from(">Q", raw, pos)
        pos += 8
        data = raw[pos : pos + n_bytes]
        pos += n_bytes

        if dtype_flag == 2:
            # Alias record: resolve to a previously-loaded tensor.
            # (Geode-stratum ordering guarantees the target was written first.)
            (alias_len,) = struct.unpack_from(">I", data, 0)
            alias_key = data[4 : 4 + alias_len].decode()
            result[key] = result[alias_key]
            continue

        if dtype_flag == 1:
            # fp16 raw (biases, norms). bytearray() copy makes the buffer
            # writable, silencing torch.frombuffer's read-only warning.
            t = torch.frombuffer(bytearray(data), dtype=torch.float16).to(dtype)
            result[key] = t.reshape(shape).to(device)
            continue

        # dtype_flag == 0: Q2 packed (with per-row τ in v2, without in v1).
        rows = shape[0]
        cols = int(math.prod(shape[1:]))
        n_packed = math.ceil(cols / 4)

        if file_version >= 2:
            # v2: first rows*2 bytes are fp16 τ values.
            tau_bytes = rows * 2
            tau_arr = torch.frombuffer(bytearray(data[:tau_bytes]), dtype=torch.float16)
            tau_vals = tau_arr.float().to(device).unsqueeze(1)  # (rows, 1)
            sym_data = data[tau_bytes:]
        else:
            tau_vals = None
            sym_data = data

        packed = torch.frombuffer(bytearray(sym_data), dtype=torch.uint8)
        packed = packed.reshape(rows, n_packed)
        gray = unpack_symbols(packed, cols)
        sym = gray_decode(gray).long()

        if tau_vals is not None:
            # Dequantise using saved τ: {0,1,2,3} → {-1.5,-0.5,+0.5,+1.5}·τ/1.5
            # Reconstruction points at ±0.5τ and ±1.5τ (equiprobable cells §D-2.5).
            val_map = torch.tensor([-1.5, -0.5, 0.5, 1.5], dtype=torch.float32,
                                   device=device)
            # Fix: honour the requested dtype. The v2 path previously
            # returned float32 unconditionally, ignoring the dtype argument.
            W_hat = (val_map[sym.to(device)] * (tau_vals / 1.5)).to(dtype)
        else:
            # Legacy v1: unit-scale reconstruction.
            val_map = torch.tensor([-1.0, -0.5, 0.5, 1.0], dtype=dtype)
            W_hat = val_map[sym].to(dtype)

        result[key] = W_hat.reshape(shape).to(device)

    return result
+
+
+# ── LIV cache-line packing (§5.5 of PARAMETER_GOLF.md) ──────────────────────
+#
+# LIV (Liquid Integrated Vision/Language) symbols use 5-bit quantisation
+# (int5, 32 levels). A 64-bit word can hold:
+#
+# 12 LIV × 5 bits = 60 bits + 2-bit tag + 2 unused bits = 64 bits
+# 10 LIV × 5 bits = 50 bits = two 5×5 binary matrices (codon verifiable)
+#
+# Exact bit layout (bits 63 … 0, MSB-first):
+# [sym0(5)] [sym1(5)] … [sym11(5)] [tag(2)] [00]
+# bits 63:59 58:54 8:4 3:2 1:0
+#
+# sym0 → shift = 64 - 5*(0+1) = 59 → bits [63:59]
+# sym11 → shift = 64 - 5*(11+1) = 4 → bits [8:4]
+# tag → bits [3:2], values in [0..3] matching the Geode S1 = 4x four levels
+# bits [1:0] are unused (zero).
+#
+# The 2-bit Q² tag distributes 64-bit words across 4 groups for parallel GPU
+# warp dispatch by Geode coarse level.
+
+
+def pack_liv_cacheline(
+ symbols: Tensor,
+ seq_tags: Tensor | None = None,
+) -> Tensor:
+ """Pack 5-bit LIV symbols into 64-bit words, 12 per word.
+
+ Packs 12 LIV symbols (values in [0, 31]) per uint64 word with a 2-bit
+ Q² sequence tag in bits [3:2]. Bits [1:0] are unused (zero).
+
+ Bit layout (bits 63 → 0):
+ sym0[63:59] sym1[58:54] … sym11[8:4] tag[3:2] 00
+
+ The 2-bit tag (4 values = one Q² symbol from the Geode S1 = 4x level)
+ allows cache-line-level partitioning across GPU streaming multiprocessors:
+ each SM processes one of the 4 tag groups independently.
+
+ Args:
+ symbols: (N,) uint8/int in [0, 31]. Padded to multiple of 12.
+ seq_tags: (N//12,) uint8 in [0, 3]. 2-bit tag per word.
+ If None, all tags are set to 0.
+
+ Returns:
+ (ceil(N/12),) int64 packed words.
+ """
+ if symbols.numel() % 12 != 0:
+ pad = 12 - (symbols.numel() % 12)
+ symbols = torch.cat([symbols.flatten(), symbols.new_zeros(pad)])
+ n_words = symbols.numel() // 12
+ s = symbols.view(n_words, 12).to(torch.int64) & 0x1F # 5-bit clamp
+
+ # sym0 → shift=59 (bits [63:59]), sym11 → shift=4 (bits [8:4]).
+ word = torch.zeros(n_words, dtype=torch.int64, device=symbols.device)
+ for i in range(12):
+ shift = 64 - 5 * (i + 1) # sym0→59, sym1→54, …, sym11→4
+ word |= s[:, i] << shift
+
+ # 2-bit tag in bits [3:2]; bits [1:0] remain zero.
+ if seq_tags is not None:
+ tag = seq_tags.view(n_words).to(torch.int64) & 0x3
+ word |= tag << 2
+ return word
+
+
def unpack_liv_cacheline(packed: Tensor, n: int) -> Tuple[Tensor, Tensor]:
    """Unpack 64-bit words to 5-bit LIV symbols and 2-bit Q² tags.

    Args:
        packed: (N_words,) int64.
        n: total number of symbols to return (≤ N_words × 12).

    Returns:
        symbols: (n,) uint8 in [0, 31].
        seq_tags: (N_words,) uint8 in [0, 3].
    """
    # One column per lane, shifts mirroring pack_liv_cacheline (59 … 4).
    lanes = [((packed >> shift) & 0x1F).to(torch.uint8)
             for shift in range(59, 3, -5)]
    symbols = torch.stack(lanes, dim=1).reshape(-1)  # word-major order
    seq_tags = ((packed >> 2) & 0x3).to(torch.uint8)
    return symbols[:n], seq_tags
+
+
+# ── CLI ───────────────────────────────────────────────────────────────────────
+
def main() -> None:
    """CLI entry point: pack a checkpoint, or inspect a packed binary."""
    cli = argparse.ArgumentParser(
        description="Pack / inspect a Q2 weight binary.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument("input", help="Input .pt checkpoint or .q2bin file")
    cli.add_argument("output", nargs="?", help="Output .q2bin path (pack mode)")
    cli.add_argument("--unpack", action="store_true", help="Inspect a .q2bin file")
    opts = cli.parse_args()

    # Inspect mode: explicit flag, or inferred from the file extension.
    if opts.unpack or opts.input.endswith(".q2bin"):
        tensors = unpack_state_dict(opts.input)
        n_elems = sum(t.numel() for t in tensors.values())
        print(f"Loaded {len(tensors)} tensors, {n_elems:,} total elements")
        for name, t in tensors.items():
            print(f"  {name:<50s} {str(tuple(t.shape)):<25s} {t.dtype}")
        return

    if not opts.output:
        cli.error("Provide output path (or --unpack to inspect)")

    # weights_only=True avoids arbitrary-code pickle execution on load.
    ckpt = torch.load(opts.input, map_location="cpu", weights_only=True)
    if isinstance(ckpt, dict) and "model" in ckpt:
        ckpt = ckpt["model"]

    size = pack_state_dict(ckpt, opts.output)
    print(f"Packed {len(ckpt)} tensors → {size:,} bytes ({size / 1e6:.3f} MB)")
    print(f"Device used: {_DEVICE}")


if __name__ == "__main__":
    main()
diff --git a/scripts/train_q2_ltc.py b/scripts/train_q2_ltc.py
new file mode 100644
index 0000000..e0b68ee
--- /dev/null
+++ b/scripts/train_q2_ltc.py
@@ -0,0 +1,798 @@
+#!/usr/bin/env python3
+"""
+train_q2_ltc.py — Q²-QAT Hybrid LTC-Transformer for OpenAI Parameter Golf.
+
+Architecture: [GQA, CfC, CfC, CfC] × 4 = 16 layers (Geode-derived, see §4.5
+of PARAMETER_GOLF.md). The layer layout is derived from the Geode factorization
+S(x) - 1 = S1·G where S1=4x gives 4 GQA (coarse) blocks and G=1/(1-3x) gives
+3 CfC (refinement) blocks per GQA block.
+
+Quantisation: Q² 2-bit QAT with straight-through estimator (STE).
+Optimizer: Muon (Nesterov + spectral normalisation) — current SOTA.
+Compression: Q2-packed weights + zstd-22 for final artifact.
+
+Usage (8×H100):
+ torchrun --standalone --nproc_per_node=8 scripts/train_q2_ltc.py
+
+Single GPU (smoke test):
+ python scripts/train_q2_ltc.py
+
+Environment variables (all optional, reasonable defaults):
+ D_MODEL hidden dimension (default: 768)
+ N_HEADS attention heads (default: 12)
+ N_KV_HEADS KV heads for GQA (default: 4)
+ MAX_STEPS training steps (default: 5000)
+ BATCH_TOKENS tokens per gradient step (default: 131072)
+ SEQ_LEN sequence length (default: 2048)
+ DATA_PATH FineWeb tokenised shards (default: ./data/datasets/fineweb10B_sp1024)
+ VOCAB_SIZE vocabulary size (default: 1024)
+ OUT_DIR checkpoint directory (default: ./checkpoints)
+ WARMUP_STEPS LR warm-up steps (default: 200)
+ Q2_WARMUP FP32 warm-up before QAT (default: 500)
+ VAL_EVERY validation interval (default: 200)
+"""
+from __future__ import annotations
+
+import math
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterator, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+
+# ── configuration ─────────────────────────────────────────────────────────────
+
@dataclass
class Config:
    """Training hyper-parameters, overridable via environment variables.

    All ``os.getenv`` defaults are evaluated once, when this class body
    executes at import time; instances then hold plain Python values.
    """

    # Model (Geode-derived: 4 GQA + 12 CfC = 16 layers)
    d_model: int = int(os.getenv("D_MODEL", "768"))
    n_heads: int = int(os.getenv("N_HEADS", "12"))
    n_kv_heads: int = int(os.getenv("N_KV_HEADS", "4"))  # GQA: shared KV heads
    n_layers: int = 16  # fixed: [GQA, CfC, CfC, CfC] × 4
    mlp_ratio: int = 3  # MLP hidden = d_model × mlp_ratio
    # vocab_size: set to 256 for byte-mode (BYTE_TOKENS=1); default 1024 for SP-1024.
    vocab_size: int = 256 if os.getenv("BYTE_TOKENS", "0") == "1" else int(os.getenv("VOCAB_SIZE", "1024"))

    # Q²-QAT
    q2_warmup: int = int(os.getenv("Q2_WARMUP", "500"))  # FP32 steps before QAT starts
    tau_update_every: int = 1024  # refresh τ* every N quantised forward steps
    ste_kappa_scale: float = 3.0  # STE passthrough window: κ = kappa_scale × τ*

    # Training
    max_steps: int = int(os.getenv("MAX_STEPS", "5000"))
    batch_tokens: int = int(os.getenv("BATCH_TOKENS", "131072"))  # tokens per gradient step
    seq_len: int = int(os.getenv("SEQ_LEN", "2048"))
    lr: float = 3e-4
    wd: float = 0.04  # weight decay
    grad_clip: float = 1.0
    swa_start: float = 0.6  # SWA from this fraction of total steps
    warmup_steps: int = int(os.getenv("WARMUP_STEPS", "200"))  # LR warm-up steps
    val_every: int = int(os.getenv("VAL_EVERY", "200"))
    val_tokens: int = 1_000_000
    # Byte mode: read raw bytes from .bin shards (no tokeniser encoder needed).
    # Tokens are raw uint8 bytes [0,255]; vocab_size is automatically set to 256.
    byte_tokens: bool = os.getenv("BYTE_TOKENS", "0") == "1"

    # Paths
    data_path: str = os.getenv("DATA_PATH", "./data/datasets/fineweb10B_sp1024")
    out_dir: str = os.getenv("OUT_DIR", "./checkpoints")
+
+
+# ── Q²-QAT: straight-through estimator ───────────────────────────────────────
+
class _Q2STEFunction(torch.autograd.Function):
    """Straight-through estimator for Q² 2-bit weight quantisation.

    Forward replaces every weight with its cell centroid
    {-1, -0.5, +0.5, +1} × τ. Backward passes gradients through
    unchanged inside the window |W| ≤ κ and zeroes them outside it,
    so outlier weights stop receiving updates.
    """

    # Unit centroids for symbols {A=0, B=1, C=2, D=3}. Class-level constant
    # on CPU; moved to the weight's device/dtype inside forward.
    _LEVELS = torch.tensor([-1.0, -0.5, 0.5, 1.0])

    @staticmethod
    def forward(  # type: ignore[override]
        ctx: torch.autograd.function.FunctionCtx,
        W: Tensor,
        tau: Tensor,
        kappa: Tensor,
    ) -> Tensor:
        ctx.save_for_backward(W, kappa)
        # Cell index = number of boundaries (-τ, 0, +τ) the weight exceeds;
        # identical scheme to q2_quantise in scripts/q2_pack.py.
        cell = (W > -tau).to(torch.long)
        cell = cell + (W > 0).to(torch.long)
        cell = cell + (W > tau).to(torch.long)  # cell in {0,1,2,3}
        centroids = _Q2STEFunction._LEVELS.to(device=W.device, dtype=W.dtype)
        return centroids[cell] * tau

    @staticmethod
    def backward(  # type: ignore[override]
        ctx: torch.autograd.function.FunctionCtx,
        grad_output: Tensor,
    ) -> Tuple[Tensor, None, None]:
        W, kappa = ctx.saved_tensors
        # STE: gradient flows only inside the quantisation window.
        window = (W.abs() <= kappa).to(grad_output.dtype)
        return grad_output * window, None, None


q2_ste = _Q2STEFunction.apply
+
+
class Q2Linear(nn.Linear):
    """nn.Linear with optional Q²-QAT forward.

    The layer starts life as a plain linear layer (FP32 warm-up,
    quantised=False). After activate_q2() is called, forward quantises the
    weight with the straight-through estimator while gradients continue to
    flow into the full-precision master weights.

    The per-row threshold τ* (empirical 75th percentile of |W|, reservoir
    calibration §D-2.5) is refreshed every ``tau_update_every`` quantised
    forward calls.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = False):
        super().__init__(in_features, out_features, bias=bias)
        self.quantised = False
        self._step = 0
        self._tau_update_every = 1024
        self._ste_kappa_scale = 3.0
        # Buffers, not Parameters: weight statistics must not receive
        # optimizer updates (and are excluded from optimizer state).
        self.register_buffer("_tau", torch.full((out_features, 1), 0.6745))
        self.register_buffer("_kappa", torch.full((out_features, 1), 2.0236))

    @torch.no_grad()
    def _refresh_tau(self) -> None:
        """Recompute τ* (and κ = scale·τ*) from the current weights."""
        new_tau = torch.quantile(
            self.weight.float().abs(), 0.75, dim=1, keepdim=True
        ).clamp(min=1e-6)
        self._tau.copy_(new_tau)
        self._kappa.copy_(new_tau * self._ste_kappa_scale)

    def forward(self, x: Tensor) -> Tensor:
        if not self.quantised:
            return F.linear(x, self.weight, self.bias)
        self._step += 1
        if self._step % self._tau_update_every == 0:
            self._refresh_tau()
        return F.linear(x, q2_ste(self.weight, self._tau, self._kappa), self.bias)

    def activate_q2(
        self,
        update_every: int = 1024,
        kappa_scale: float = 3.0,
    ) -> None:
        """Switch to QAT mode (call once after FP32 warm-up completes)."""
        self._tau_update_every = update_every
        self._ste_kappa_scale = kappa_scale
        self._refresh_tau()
        self.quantised = True
+
+
+# ── CfC block (Geode G-level: one 3-way refinement step) ─────────────────────
+
class CfCBlock(nn.Module):
    """Closed-form Continuous-time (CfC) recurrent block.

    One step of the Geode G = 1/(1-3x) refinement tree. Solves the LTC ODE
    analytically (Hasani et al. 2022, arXiv:2106.13898):

        h_new = exp(-A1·dt) · h + (A2/A1) · (1 - exp(-A1·dt))

    The recurrent state h carries information across tokens at O(batch·d)
    memory per layer, independent of sequence length — no KV cache.

    **GPU efficiency:** A1 and A2 depend only on the input x (never on h),
    so both are produced by a single batched matmul over all T tokens. The
    remaining per-token work is an element-wise scan with no matmul inside
    the loop, so total cost is dominated by the three linear projections
    (ff_a1, ff_a2, out), not the recurrence.

    All Q2Linear sub-layers join Q²-QAT once activate_q2() is called on the
    parent model.
    """

    def __init__(self, d_model: int):
        super().__init__()
        self.norm = nn.RMSNorm(d_model)
        # Decay-rate head (A1) and integration-target head (A2) both read
        # only x (d_model inputs, not 2*d_model), which keeps the whole
        # sequence inside one batched matmul with no per-token dispatch.
        self.ff_a1 = Q2Linear(d_model, d_model)
        self.ff_a2 = Q2Linear(d_model, d_model)
        self.out = Q2Linear(d_model, d_model)
        # dt = exp(log_dt): log-parameterised so the time step stays positive.
        self.log_dt = nn.Parameter(torch.zeros(d_model))

    def forward(self, x: Tensor, h: Tensor) -> Tuple[Tensor, Tensor]:
        """Run the block over a full sequence.

        Args:
            x: (B, T, D) token representations from the previous block.
            h: (B, D) recurrent state carried from the previous token.

        Returns:
            y: (B, T, D) output representations (residual-connected).
            h: (B, D) updated recurrent state (after the final token).
        """
        residual = x
        normed = self.norm(x)
        dt = self.log_dt.exp()  # (D,) strictly positive, learnable

        # All time constants in one batched matmul — no h dependency, so
        # this part is fully parallel over the sequence dimension.
        a1 = F.softplus(self.ff_a1(normed))           # (B, T, D) decay rate > 0
        a2 = self.ff_a2(normed)                       # (B, T, D) integration target
        decay = torch.exp(-a1 * dt)                   # (B, T, D) in (0, 1)
        offset = (a2 / (a1 + 1e-6)) * (1.0 - decay)   # (B, T, D) affine offset

        # Element-wise scan h[t] = decay[t]·h[t-1] + offset[t]; cheap because
        # there is no matmul inside the loop (torch.compile can fuse it).
        states = []
        for t in range(x.shape[1]):
            h = decay[:, t] * h + offset[:, t]
            states.append(h)

        return residual + self.out(torch.stack(states, dim=1)), h
+
+
+# ── GQA block (Geode S1-level: one 4-way coarse selection) ───────────────────
+
class GQABlock(nn.Module):
    """Grouped Query Attention block followed by a SwiGLU MLP.

    One step of the Geode S1 = 4x coarse-quantisation level. Attention runs
    through F.scaled_dot_product_attention (the fused FlashAttention path on
    Ampere/Hopper hardware) with a causal mask.

    K/V are projected to n_kv_heads heads and shared across groups of query
    heads (GQA): the parameter saving comes from the smaller k/v projection
    matrices while keeping full multi-head queries.

    The MLP is a SwiGLU gate (element-wise product of two projections) for
    parameter efficiency.
    """

    def __init__(self, d_model: int, n_heads: int, n_kv_heads: int, mlp_ratio: int):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        assert n_heads % n_kv_heads == 0, "n_heads must be divisible by n_kv_heads"

        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.kv_groups = n_heads // n_kv_heads
        self.head_dim = d_model // n_heads

        self.attn_norm = nn.RMSNorm(d_model)
        self.q = Q2Linear(d_model, d_model)
        self.k = Q2Linear(d_model, self.head_dim * n_kv_heads)
        self.v = Q2Linear(d_model, self.head_dim * n_kv_heads)
        self.o = Q2Linear(d_model, d_model)

        hidden = d_model * mlp_ratio
        self.mlp_norm = nn.RMSNorm(d_model)
        self.mlp_up = Q2Linear(d_model, hidden)
        self.mlp_gate = Q2Linear(d_model, hidden)
        self.mlp_down = Q2Linear(hidden, d_model)

    def forward(self, x: Tensor) -> Tensor:
        batch, seq, dim = x.shape

        # ── attention sub-layer ──
        attn_in = self.attn_norm(x)
        q = self.q(attn_in).view(batch, seq, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k(attn_in).view(batch, seq, self.n_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v(attn_in).view(batch, seq, self.n_kv_heads, self.head_dim).transpose(1, 2)

        # GQA: replicate each KV head across its query group. NOTE: this
        # does materialise n_heads-worth of K/V activations; the saving is
        # in the k/v projection parameters, not in activation memory.
        if self.kv_groups > 1:
            k = k.repeat_interleave(self.kv_groups, dim=1)
            v = v.repeat_interleave(self.kv_groups, dim=1)

        # Causal attention via the fused SDPA kernel.
        attn = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        attn = attn.transpose(1, 2).contiguous().view(batch, seq, dim)
        x = x + self.o(attn)

        # ── SwiGLU MLP sub-layer ──
        mlp_in = self.mlp_norm(x)
        gated = F.silu(self.mlp_gate(mlp_in)) * self.mlp_up(mlp_in)
        return x + self.mlp_down(gated)
+
+
+# ── full model: [GQA, CfC, CfC, CfC] × 4 ────────────────────────────────────
+
class Q2LTCModel(nn.Module):
    """Q²-QAT Hybrid LTC-Transformer with Geode-derived layer layout.

    The layer stack mirrors the Geode factorisation S - 1 = S1·G:
        S1 = 4x      → 4 GQA blocks (coarse: 4 choices each, 2 bits/level)
        G = 1/(1-3x) → 3 CfC blocks per GQA (refinement: 3 choices, 1.585 bits/step)

    Pattern: [GQA, CfC, CfC, CfC] × 4 = 16 layers (4 GQA + 12 CfC)
        GQA positions: 0, 4, 8, 12 (0-indexed in self.layers)
        CfC positions: 1-3, 5-7, 9-11, 13-15

    Information capacity at depth d:
        4 × (2 + 3 × log₂ 3) ≈ 27.0 bits — sufficient for 2048-token LM.
    """

    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        D = cfg.d_model

        self.embed = nn.Embedding(cfg.vocab_size, D)
        self.emb_norm = nn.RMSNorm(D)

        # Build [GQA, CfC, CfC, CfC] × 4 using the Geode structure.
        layers: list[nn.Module] = []
        for _ in range(4):  # 4 coarse S1 nodes
            layers.append(GQABlock(D, cfg.n_heads, cfg.n_kv_heads, cfg.mlp_ratio))
            for _ in range(3):  # 3 G refinement nodes
                layers.append(CfCBlock(D))
        self.layers = nn.ModuleList(layers)  # 16 layers total

        self.norm = nn.RMSNorm(D)
        self.lm_head = nn.Linear(D, cfg.vocab_size, bias=False)
        self.lm_head.weight = self.embed.weight  # tied weights

        # BigramHash log-prior (FP16; loaded separately from the artifact).
        self.register_buffer(
            "bigram_logprobs",
            torch.zeros(cfg.vocab_size, cfg.vocab_size, dtype=torch.float16),
        )

        self._init_weights()

    def _init_weights(self) -> None:
        """OrthoInit for projection matrices; small normal for embeddings.

        Fix: lm_head shares its weight tensor with embed (tied above), and
        self.modules() yields lm_head *after* embed — running lm_head
        through the nn.Linear branch would therefore clobber the
        embedding's N(0, 0.02) init with Kaiming noise. lm_head is skipped
        explicitly so the tied weight keeps its embedding initialisation.
        """
        for m in self.modules():
            if m is self.lm_head:
                continue  # tied to embed.weight — already initialised
            if isinstance(m, (nn.Linear, Q2Linear)):
                # Wide/square matrices get orthogonal init; tall ones Kaiming.
                if m.weight.ndim >= 2 and m.weight.shape[0] <= m.weight.shape[1]:
                    nn.init.orthogonal_(m.weight)
                else:
                    nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5))
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=0.02)

    def activate_q2(self, cfg: Config) -> None:
        """Switch all Q2Linear layers to QAT mode after FP32 warm-up."""
        for m in self.modules():
            if isinstance(m, Q2Linear):
                m.activate_q2(
                    update_every=cfg.tau_update_every,
                    kappa_scale=cfg.ste_kappa_scale,
                )

    def forward(
        self,
        input_ids: Tensor,
        prev_token: Tensor | None = None,
    ) -> Tensor:
        """
        Args:
            input_ids: (B, T) int64 token indices.
            prev_token: (B,) int64 — token immediately before input_ids[:,0];
                used to look up the BigramHash prior for position 0.

        Returns:
            logits: (B, T, V) float32.
        """
        B, T = input_ids.shape
        D = self.cfg.d_model

        x = self.emb_norm(self.embed(input_ids))  # (B, T, D)

        # CfC recurrent states: reset to zero at the start of each sequence.
        # Dict keyed by layer index to avoid storing states for GQA layers.
        h_states: Dict[int, Tensor] = {}

        for i, layer in enumerate(self.layers):
            if isinstance(layer, GQABlock):
                x = layer(x)
            else:
                if i not in h_states:
                    h_states[i] = x.new_zeros(B, D)
                x, h_states[i] = layer(x, h_states[i])

        x = self.norm(x)
        logits = self.lm_head(x)  # (B, T, V)

        # Add BigramHash log-prior for position 0 only.
        if prev_token is not None:
            prior = self.bigram_logprobs[prev_token].to(logits.dtype)  # (B, V)
            logits[:, 0, :] = logits[:, 0, :] + prior

        return logits

    def count_parameters(self) -> int:
        """Number of trainable parameters (tied weights counted once)."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+
+# ── Muon optimizer ─────────────────────────────────────────────────────────────
+
class Muon(torch.optim.Optimizer):
    """Muon-style optimizer: Nesterov momentum + per-matrix norm scaling.

    Adapted from modded-nanogpt (KellerJordan). Every update for a ≥2-D
    parameter is divided by its Frobenius norm — a cheap stand-in for the
    spectral (largest-singular-value) normalisation of the original Muon
    that avoids a per-step SVD. This bounds the step size per matrix and
    keeps large gradient steps from disrupting the Q² complement structure
    during QAT, acting as a stronger form of gradient clipping. Weight
    decay is decoupled (applied multiplicatively before the update).
    """

    def __init__(
        self,
        params,
        lr: float = 3e-4,
        momentum: float = 0.95,
        weight_decay: float = 0.04,
        nesterov: bool = True,
    ):
        super().__init__(
            params,
            dict(lr=lr, momentum=momentum,
                 weight_decay=weight_decay, nesterov=nesterov),
        )

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one optimisation step; returns closure() loss if given."""
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr, mom = group["lr"], group["momentum"]
            wd, use_nesterov = group["weight_decay"], group["nesterov"]
            for param in group["params"]:
                if param.grad is None:
                    continue
                update = param.grad.float()
                state = self.state[param]
                buf = state.get("buf")
                if buf is None:
                    # First step: momentum buffer starts at the raw gradient.
                    buf = state["buf"] = update.clone()
                else:
                    buf.mul_(mom).add_(update)
                update = update + buf * mom if use_nesterov else buf
                # Frobenius normalisation for matrices (cheap per-step bound).
                if update.ndim >= 2:
                    norm = torch.linalg.norm(update)
                    if norm > 0:
                        update = update / norm
                if wd > 0:
                    param.mul_(1.0 - lr * wd)
                param.add_(update.to(param.dtype), alpha=-lr)

        return loss
+
+
+# ── data loading ───────────────────────────────────────────────────────────────
+
+def _shard_files(data_path: str) -> list[Path]:
+ p = Path(data_path)
+ files = sorted(p.glob("*.bin")) + sorted(p.glob("*.npy"))
+ # Exclude validation shards (e.g., fineweb_val_*.bin/.npy) from the training set.
+ files = [f for f in files if not f.name.startswith("fineweb_val_")]
+ if not files:
+ raise FileNotFoundError(f"No .bin/.npy shards found in {data_path!r}")
+ return files
+
+
+def token_stream(
+ data_path: str,
+ seq_len: int,
+ device: torch.device,
+ rank: int = 0,
+ world: int = 1,
+ byte_tokens: bool = False,
+) -> Iterator[Tuple[Tensor, Tensor, Tensor]]:
+ """Yield (prev_token, input_ids, target_ids) triples of length seq_len.
+
+ prev_token is the single token immediately before input_ids[0]; the model
+ uses it to apply the BigramHash log-prior at position 0. It is a (1,)
+ int64 tensor. At the start of a new shard, prev_token is 0 (padding).
+
+ Shards are distributed round-robin across ranks so each GPU sees a
+ disjoint subset of the data.
+
+ When byte_tokens=True the .bin shards are read as raw uint8 bytes; each
+ byte is directly used as a token (vocab size 256, no tokeniser encoder).
+ This skips the SentencePiece encode step entirely (see §5.5 of
+ PARAMETER_GOLF.md). The data_path should point to a directory of raw
+ text .bin files (UTF-8 or binary).
+ """
+ import numpy as np
+ files = _shard_files(data_path)
+ # Assign shards to this rank.
+ my_files = [f for i, f in enumerate(files) if i % world == rank]
+ if not my_files:
+ my_files = files # fallback for single-GPU runs
+
+ while True:
+ for f in my_files:
+ if byte_tokens:
+ # Raw-byte mode: each byte is a token directly (vocab=256).
+ raw = f.read_bytes()
+ tokens_np = np.frombuffer(raw, dtype=np.uint8)
+ elif f.suffix == ".npy":
+ # Load NumPy shards via np.load to correctly handle the .npy header.
+ arr = np.load(f, mmap_mode="r")
+ if arr.dtype != np.uint16:
+ arr = arr.astype(np.uint16)
+ tokens_np = np.array(arr, copy=False).ravel()
+ else:
+ # Treat non-.npy shards (e.g. .bin) as raw uint16 buffers.
+ raw = f.read_bytes()
+ tokens_np = np.frombuffer(raw, dtype=np.uint16)
+ tokens = torch.from_numpy(tokens_np.copy()).to(torch.long)
+ # Track the last token of the previous chunk as BigramHash context.
+ shard_prev = torch.zeros(1, dtype=torch.long, device=device)
+ for start in range(0, len(tokens) - seq_len - 1, seq_len + 1):
+ chunk = tokens[start : start + seq_len + 1].to(device)
+ inp, tgt = chunk[:seq_len], chunk[1:]
+ yield shard_prev, inp, tgt
+ shard_prev = inp[-1:] # last token of this chunk is prev for next
+
+
+# ── validation ─────────────────────────────────────────────────────────────────
+
+@torch.no_grad()
+def estimate_val_bpb(
+ model: nn.Module,
+ data_path: str,
+ vocab_size: int,
+ seq_len: int,
+ val_tokens: int,
+ device: torch.device,
+ stride: int = 64,
+) -> float:
+ """Sliding-window bits-per-byte on the validation split."""
+ val_files = sorted(Path(data_path).glob("fineweb_val_*.bin"))
+ if not val_files:
+ return float("nan")
+
+ import numpy as np
+ total_bits = 0.0
+ total_bytes = 0
+ model.eval()
+
+ for f in val_files:
+ raw = f.read_bytes()
+ tokens = torch.from_numpy(np.frombuffer(raw, dtype=np.uint16).copy()).long()
+ # Sliding window evaluation at stride=64 (current SOTA).
+ for start in range(0, min(len(tokens), val_tokens) - seq_len, stride):
+ chunk = tokens[start : start + seq_len + 1].to(device)
+ inp, tgt = chunk[:seq_len].unsqueeze(0), chunk[1:].unsqueeze(0)
+ with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+ logits = model(inp)
+ # Only score the last stride tokens (context consumed earlier).
+ score_start = seq_len - stride
+ loss = F.cross_entropy(
+ logits[0, score_start:].view(-1, vocab_size),
+ tgt[0, score_start:].view(-1),
+ )
+ total_bits += loss.item() * stride * math.log2(math.e)
+ total_bytes += stride # 1 token ≈ 1 byte for SP-1024
+ if total_bytes >= val_tokens:
+ break
+ if total_bytes >= val_tokens:
+ break
+
+ model.train()
+ return total_bits / max(total_bytes, 1)
+
+
+# ── training loop ──────────────────────────────────────────────────────────────
+
def train(cfg: Config) -> None:
    """End-to-end training: distributed setup → model → QAT loop → artifact.

    Builds the Q²-LTC model, trains it with Muon under bf16 autocast
    (FP32 warm-up followed by Q²-QAT), maintains an SWA average over the
    tail of training, and — on rank 0 — Q2-packs, zstd-compresses, and
    size-checks the final artifact against the 16 MB budget.
    """
    # Distributed setup (env vars set by torchrun; defaults = single process).
    rank = int(os.getenv("RANK", "0"))
    world = int(os.getenv("WORLD_SIZE", "1"))
    local = int(os.getenv("LOCAL_RANK", "0"))
    use_dist = world > 1
    if use_dist:
        dist.init_process_group("nccl")

    torch.cuda.set_device(local)
    device = torch.device(f"cuda:{local}")
    master = rank == 0

    # Build model.
    model = Q2LTCModel(cfg).to(device)
    if master:
        n_params = model.count_parameters()
        print(f"Q2-LTC model: {n_params:,} parameters ({n_params / 1e6:.1f} M)")
        print(f"Layer layout: [GQA, CfC, CfC, CfC] × 4 = {cfg.n_layers} layers")
        tok_mode = "byte-level (vocab=256, no tokeniser)" if cfg.byte_tokens else f"SP-{cfg.vocab_size}"
        print(f"Token mode: {tok_mode}")

    if use_dist:
        model = DDP(model, device_ids=[local])
    # raw_model: the unwrapped module (for activate_q2 / SWA / state_dict).
    raw_model: Q2LTCModel = model.module if use_dist else model  # type: ignore[assignment]

    # Compile for maximum H100 throughput.
    model = torch.compile(model, mode="max-autotune")

    # Separate optimizer groups: Q2-quantised weight matrices vs. all other params.
    # Only the ≥2-D "weight" matrices receive weight decay.
    q2_params = [
        p for n, p in raw_model.named_parameters()
        if "weight" in n and p.ndim >= 2
    ]
    other_params = [
        p for n, p in raw_model.named_parameters()
        if not ("weight" in n and p.ndim >= 2)
    ]
    optimizer = Muon([
        {"params": q2_params, "lr": cfg.lr, "weight_decay": cfg.wd},
        {"params": other_params, "lr": cfg.lr, "weight_decay": 0.0},
    ])

    # SWA (stochastic weight averaging over last 40% of training).
    swa_model = torch.optim.swa_utils.AveragedModel(raw_model)
    swa_start = int(cfg.max_steps * cfg.swa_start)
    swa_active = False

    # bfloat16 autocast on H100; no GradScaler needed (bf16 has enough dynamic range).
    batch_size = max(1, cfg.batch_tokens // cfg.seq_len)
    data = token_stream(cfg.data_path, cfg.seq_len, device, rank, world,
                        byte_tokens=cfg.byte_tokens)

    if master:
        Path(cfg.out_dir).mkdir(parents=True, exist_ok=True)

    t0 = time.perf_counter()

    for step in range(1, cfg.max_steps + 1):
        # Cosine LR schedule with linear warm-up.
        if step <= cfg.warmup_steps:
            lr_scale = step / cfg.warmup_steps
        else:
            frac = (step - cfg.warmup_steps) / (cfg.max_steps - cfg.warmup_steps)
            lr_scale = 0.5 * (1.0 + math.cos(math.pi * frac))
        for g in optimizer.param_groups:
            g["lr"] = cfg.lr * lr_scale

        # Switch to Q²-QAT after FP32 warm-up.
        if step == cfg.q2_warmup + 1:
            raw_model.activate_q2(cfg)
            if master:
                print(f"[step {step:5d}] Q² QAT activated")

        # Gradient accumulation over batch_size micro-batches.
        # NOTE(review): each micro-batch triggers a DDP allreduce (no
        # no_sync()); correct but not bandwidth-optimal — confirm intended.
        optimizer.zero_grad(set_to_none=True)
        total_loss = 0.0
        for _ in range(batch_size):
            prev_tok, inp, tgt = next(data)
            inp, tgt = inp.unsqueeze(0), tgt.unsqueeze(0)
            # (1,) → (1,1); the squeeze(0) below undoes this, so the model
            # receives the original (1,) tensor — a no-op round-trip.
            prev_tok = prev_tok.unsqueeze(0)
            with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                logits = model(inp, prev_token=prev_tok.squeeze(0))
                # Divide by batch_size so accumulated grads equal the mean.
                loss = F.cross_entropy(
                    logits.view(-1, cfg.vocab_size),
                    tgt.view(-1),
                ) / batch_size
            loss.backward()
            total_loss += loss.item()

        # Gradient clipping + optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
        optimizer.step()

        # SWA update.
        if step >= swa_start:
            swa_model.update_parameters(raw_model)
            swa_active = True

        # Logging (total_loss is already the mean micro-batch loss).
        if master and step % 100 == 0:
            elapsed = time.perf_counter() - t0
            tok_per_s = 100 * cfg.batch_tokens / elapsed
            print(
                f"step {step:5d} | loss {total_loss:.4f} | "
                f"lr {lr_scale * cfg.lr:.2e} | "
                f"{tok_per_s / 1e3:.1f} k tok/s"
            )
            t0 = time.perf_counter()

        # Validation.
        # NOTE(review): only rank 0 validates while other ranks continue —
        # assumed safe since estimate_val_bpb runs no collectives; confirm.
        if master and step % cfg.val_every == 0:
            bpb = estimate_val_bpb(
                swa_model if swa_active else raw_model,
                cfg.data_path, cfg.vocab_size, cfg.seq_len,
                cfg.val_tokens, device,
            )
            print(f" val_bpb = {bpb:.4f}")

    # ── artifact packaging ─────────────────────────────────────────────────────
    if not master:
        if use_dist:
            dist.destroy_process_group()
        return

    print("\nPackaging artifact …")

    final_model = swa_model.module if swa_active else raw_model

    # Build the state dict for packing. Only bigram_logprobs is excluded:
    # everything else in state_dict() — the parameters AND the small
    # Q2Linear calibration buffers (_tau/_kappa) — goes through q2_pack.
    # Tied weights (embed.weight ≡ lm_head.weight) are handled by q2_pack via
    # alias records — we include both keys here and pack_state_dict will emit
    # lm_head.weight as an alias pointing to embed.weight automatically.
    # bigram_logprobs is a buffer saved separately (not Q2-packed).
    sd = final_model.state_dict()
    packable_sd = {
        k: v.cpu()
        for k, v in sd.items()
        if k != "bigram_logprobs"
    }

    # Import q2_pack from this scripts/ directory.
    import importlib.util
    import sys
    _spec = importlib.util.spec_from_file_location(
        "q2_pack", Path(__file__).parent / "q2_pack.py"
    )
    assert _spec and _spec.loader
    q2_pack = importlib.util.module_from_spec(_spec)
    _spec.loader.exec_module(q2_pack)  # type: ignore[union-attr]

    q2bin_path = Path(cfg.out_dir) / "model.q2bin"
    raw_bytes = q2_pack.pack_state_dict(packable_sd, q2bin_path)
    print(f" Q2-packed: {raw_bytes:,} bytes ({raw_bytes / 1e6:.3f} MB)")

    # Save bigram_logprobs separately as fp16 (loaded at inference, not Q2-packed).
    bigram_path = Path(cfg.out_dir) / "bigram_logprobs.fp16"
    bigram_buf = sd["bigram_logprobs"].cpu().half().contiguous().numpy().tobytes()
    bigram_path.write_bytes(bigram_buf)
    print(f" bigrams: {len(bigram_buf):,} bytes ({len(bigram_buf) / 1e6:.3f} MB)")

    # Compress with zstd level 22 (requires the `zstandard` package).
    try:
        import zstandard as zstd
        cctx = zstd.ZstdCompressor(level=22)
        compressed = cctx.compress(q2bin_path.read_bytes())
        zst_path = q2bin_path.with_suffix(".q2bin.zst")
        zst_path.write_bytes(compressed)
        print(f" zstd-22: {len(compressed):,} bytes ({len(compressed) / 1e6:.3f} MB)")
    except ImportError:
        # Best-effort: fall back to the uncompressed pack rather than fail.
        compressed = q2bin_path.read_bytes()
        zst_path = q2bin_path
        print(" (zstandard not installed; using uncompressed Q2BN)")

    # Budget check: compressed weights + the two code files that must ship.
    this_file_bytes = len(Path(__file__).read_bytes())
    q2_pack_path = Path(__file__).parent / "q2_pack.py"
    q2_pack_bytes = q2_pack_path.stat().st_size if q2_pack_path.exists() else 0
    code_bytes = this_file_bytes + q2_pack_bytes
    total = len(compressed) + code_bytes
    print(f" code: {code_bytes:,} bytes")
    print(f" TOTAL: {total:,} bytes ({total / 1e6:.3f} MB)")
    if total > 16_000_000:
        print(
            " WARNING: exceeds 16 MB budget — reduce d_model and/or reduce "
            "layers / BigramHash size / precision allocation"
        )
    else:
        print(" ✓ within 16 MB budget")

    if use_dist:
        dist.destroy_process_group()
+
+
if __name__ == "__main__":
    # Entry point: run training with the default Config (defined earlier in
    # this file); launch under torchrun for multi-GPU.
    train(Config())