diff --git a/scripts/q2_pack.py b/scripts/q2_pack.py
--- a/scripts/q2_pack.py
+++ b/scripts/q2_pack.py
 def pack_tensor(W: Tensor) -> Tuple[bytes, int]:
     """Serialise one tensor to (data, dtype_flag).
     dtype_flag meanings:
       0 → Q2 packed (2-D or higher weight matrix)
+           data = rows*2 fp16 τ bytes + packed symbol bytes
       1 → fp16 raw (1-D tensor: bias, layer-norm scale/shift)
+      2 → alias (handled by pack_state_dict; never returned here)
-    1-D tensors are stored as fp16 to preserve their exact values, since they
-    are too small to benefit from Q2 packing and are critical for training
-    stability (layer-norm parameters, biases).
+    Multi-dimensional tensors (ndim > 2) are flattened to (shape[0], prod(shape[1:]))
+    before quantisation. The original shape is stored separately in the header
+    so unpack_state_dict can reshape correctly.
+
+    Per-row τ is serialised as fp16 so that unpack_state_dict can dequantise
+    weights back to their trained magnitudes, not just unit-scale symbols.
     """
     if W.ndim < 2:
         return W.cpu().half().contiguous().numpy().tobytes(), 1
-    W_dev = W.to(_DEVICE).float()
-    tau = empirical_tau(W_dev)
+    # Flatten to 2-D: (rows, cols)
+    rows = W.shape[0]
+    cols = math.prod(W.shape[1:])
+    W_2d = W.reshape(rows, cols)
+
+    W_dev = W_2d.to(_DEVICE).float()
+    tau = empirical_tau(W_dev)             # (rows, 1) float32
     sym = q2_quantise(W_dev, tau)
     gray = gray_encode(sym)
-    pack = pack_symbols(gray)
-    return pack.cpu().contiguous().numpy().tobytes(), 0
+    pack = pack_symbols(gray)              # (rows, ceil(cols/4)) uint8
+
+    # Serialise: fp16 τ (rows × 2 bytes) followed by packed symbols.
+    tau_fp16 = tau.squeeze(1).half().cpu().contiguous().numpy().tobytes()
+    pack_b = pack.cpu().contiguous().numpy().tobytes()
+    return tau_fp16 + pack_b, 0
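A quick size check of this record layout (plain arithmetic only, independent of the project code): each row costs 2 bytes of fp16 τ plus one byte per four 2-bit symbols.

    # Example: a (4096, 4096) weight under the layout above.
    import math

    rows, cols = 4096, 4096
    q2_bytes = rows * 2 + rows * math.ceil(cols / 4)   # tau + packed symbols
    fp32_bytes = rows * cols * 4
    print(q2_bytes, round(fp32_bytes / q2_bytes, 2))   # 4202496  15.97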
+
+
+def _geode_stratum(key: str) -> Tuple[int, int]:
+    """Sort key for Geode-stratum ordering in the binary file.
+
+    Ordering follows the Geode tree traversal (S-1 = S1·G):
+      stratum 0 : embedding, emb_norm (input interface)
+      strata 1–4: [GQA, CfC, CfC, CfC] blocks in sequence-order
+                  each GQA+CfC group maps to one S1 vertex and its G sub-tree
+      stratum 5 : output norm, lm_head (output interface)
+      stratum 6 : anything else (buffers etc.)
+
+    Parameters that belong to the same Geode computation unit are adjacent in
+    the file, maximising run-length compression (zstd sees long identical-structure
+    blocks) and enabling sorted page-through during inference reconstruction.
+    """
+    if key.startswith(("embed.", "emb_norm.")):
+        return (0, 0)
+
+    m = re.match(r"layers\.(\d+)\.", key)
+    if m:
+        layer_idx = int(m.group(1))
+        # Group index: each [GQA+CfC×3] unit = 4 consecutive layer indices.
+        group = layer_idx // 4     # 0, 1, 2, 3
+        within = layer_idx % 4     # 0=GQA, 1-3=CfC
+        # GQA (S1 coarse) sorts before its CfC sub-tree (G refinement).
+        return (1 + group, within)
+
+    if key.startswith(("norm.", "lm_head.")):
+        return (5, 0)
+
+    return (6, 0)
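To see the ordering concretely, here is what the sort key does to a handful of hypothetical parameter names (a sketch; all key names other than embed/lm_head are made up for illustration):

    # Example: Geode-stratum ordering of some hypothetical keys.
    keys = ["lm_head.weight", "layers.4.q_proj.weight",
            "layers.1.ff_a1.weight", "layers.0.q_proj.weight", "embed.weight"]
    print(sorted(keys, key=_geode_stratum))
    # ['embed.weight',            stratum 0
    #  'layers.0.q_proj.weight',  (1, 0)  group 0, GQA
    #  'layers.1.ff_a1.weight',   (1, 1)  group 0, first CfC
    #  'layers.4.q_proj.weight',  (2, 0)  group 1, GQA
    #  'lm_head.weight']          stratum 5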
 def pack_state_dict(
     state_dict: Dict[str, Tensor],
     out_path: str | Path,
 ) -> int:
-    """Serialise a PyTorch state dict to the Q2 binary format.
+    """Serialise a PyTorch state dict to the Q2 binary format (v2).
     Wire format (all integers big-endian):
       4 B    magic       "Q2BN"
-      1 B    version     uint8
+      1 B    version     uint8 = 2
-    Per tensor (repeated):
+    Per tensor (repeated, ordered by Geode stratum):
       4 B    key_len     uint32
       *      key         UTF-8 bytes
       1 B    ndim        uint8
       4*n    shape       uint32 × ndim
-      1 B    dtype_flag  uint8  (0 = Q2 packed, 1 = fp16 raw)
+      1 B    dtype_flag  uint8:
+               0 = Q2 packed with per-row τ
+                   data = rows*2 fp16 τ + ceil(cols/4)*rows packed bytes
+               1 = fp16 raw (1-D tensors)
+               2 = alias — data is 4-byte key_len + alias_key UTF-8;
+                   unpacker must resolve to a previously-loaded tensor.
       8 B    n_bytes     uint64
-      *      data        packed bytes
+      *      data        (dtype_flag-specific content above)
     Returns the total file size in bytes.
+
+    Tied weights (embed.weight ≡ lm_head.weight) are deduplicated automatically:
+    the first occurrence is serialised in full; subsequent occurrences become
+    alias records. This mirrors the "clustering and collisions are ok" rule
+    from the Q² design (§D-2.5): we use the structure to avoid redundancy rather
+    than fighting it.
     """
     buf = io.BytesIO()
     buf.write(_HEADER_MAGIC)
     buf.write(struct.pack(">B", _FORMAT_VERSION))
-    for key, W in state_dict.items():
+    # Sort entries by Geode stratum so the file layout mirrors the computation
+    # tree (§5.5.1: parallel dispatch by tag; §D-4.1: Geode traversal order).
+    ordered_keys = sorted(state_dict.keys(), key=_geode_stratum)
+
+    # Track tensors we have already written, keyed by data pointer.
+    # Used to emit alias records for tied weights (e.g., embed.weight ≡ lm_head.weight).
+    seen_ptrs: Dict[int, str] = {}
+
+    for key in ordered_keys:
+        W = state_dict[key]
         key_b = key.encode()
         buf.write(struct.pack(">I", len(key_b)))
         buf.write(key_b)
@@ -208,9 +275,18 @@ def pack_state_dict(
buf.write(struct.pack(">B", len(shape)))
buf.write(struct.pack(f">{len(shape)}I", *shape))
- data, dtype_flag = pack_tensor(W)
- buf.write(struct.pack(">BQ", dtype_flag, len(data)))
- buf.write(data)
+ ptr = W.data_ptr()
+ if ptr in seen_ptrs:
+ # Emit alias record: dtype_flag=2, data = alias_key bytes.
+ alias_key_b = seen_ptrs[ptr].encode()
+ alias_data = struct.pack(">I", len(alias_key_b)) + alias_key_b
+ buf.write(struct.pack(">BQ", 2, len(alias_data)))
+ buf.write(alias_data)
+ else:
+ seen_ptrs[ptr] = key
+ data, dtype_flag = pack_tensor(W)
+ buf.write(struct.pack(">BQ", dtype_flag, len(data)))
+ buf.write(data)
payload = buf.getvalue()
Path(out_path).write_bytes(payload)
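For debugging the output, a minimal reader that walks record headers without dequantising anything can be built straight from the wire format documented above (a sketch, not part of the patch):

    # Sketch: list the records in a Q2BN file (key, shape, flag, payload size).
    import struct
    from pathlib import Path

    def q2bin_ls(path: str) -> None:
        raw = Path(path).read_bytes()
        assert raw[:4] == b"Q2BN", "bad magic"
        pos = 5                                   # skip magic + version byte
        while pos < len(raw):
            (klen,) = struct.unpack_from(">I", raw, pos); pos += 4
            key = raw[pos : pos + klen].decode(); pos += klen
            ndim = raw[pos]; pos += 1
            shape = struct.unpack_from(f">{ndim}I", raw, pos); pos += 4 * ndim
            flag, nbytes = struct.unpack_from(">BQ", raw, pos); pos += 9
            pos += nbytes                         # skip the payload itself
            print(f"{key:40s} {shape!s:16s} flag={flag} {nbytes:,} B")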
@@ -224,14 +300,16 @@ def unpack_state_dict(
 ) -> Dict[str, Tensor]:
     """Load a Q2BN file back to a float-valued state dict.
-    2-D+ tensors are dequantised to {-1.0, -0.5, +0.5, +1.0} unit
-    reconstruction points. This is a valid unit-scale representation;
-    callers that need the exact per-row scale must save τ separately.
+    Format v2: per-row τ is stored alongside the packed symbols; dequantised
+    values use the saved τ to recover the correct weight magnitudes.
+    Format v1 (legacy): unit-scale reconstruction {-1, -0.5, +0.5, +1}.
+    Alias records (dtype_flag=2) are resolved to the previously-loaded tensor.
+    Multi-dimensional tensors are reshaped back to their original shape.
     """
     raw = Path(in_path).read_bytes()
     if raw[:4] != _HEADER_MAGIC:
         raise ValueError(f"Not a Q2BN file: {in_path}")
-    # _ver = raw[4]  # reserved for future version checks
+    file_version = raw[4]
     pos = 5
     result: Dict[str, Tensor] = {}
@@ -253,23 +331,51 @@ def unpack_state_dict(
         data = raw[pos : pos + n_bytes]
         pos += n_bytes
+        if dtype_flag == 2:
+            # Alias record: resolve to a previously-loaded tensor.
+            (alias_len,) = struct.unpack_from(">I", data, 0)
+            alias_key = data[4 : 4 + alias_len].decode()
+            result[key] = result[alias_key]
+            continue
+
         if dtype_flag == 1:
-            # fp16 raw
+            # fp16 raw (biases, norms).
             t = torch.frombuffer(bytearray(data), dtype=torch.float16).to(dtype)
             result[key] = t.reshape(shape).to(device)
+            continue
+
+        # dtype_flag == 0: Q2 packed (with per-row τ in v2, without in v1).
+        rows = shape[0]
+        cols = int(math.prod(shape[1:]))
+        n_packed = math.ceil(cols / 4)
+
+        if file_version >= 2:
+            # v2: first rows*2 bytes are fp16 τ values.
+            tau_bytes = rows * 2
+            tau_arr = torch.frombuffer(bytearray(data[:tau_bytes]), dtype=torch.float16)
+            tau_vals = tau_arr.float().to(device).unsqueeze(1)  # (rows, 1)
+            sym_data = data[tau_bytes:]
         else:
-            # Q2 packed: unpack → invert Gray map → dequantise to unit levels
-            rows = shape[0]
-            cols = int(math.prod(shape[1:]))
-            n_packed = math.ceil(cols / 4)
-            packed = torch.frombuffer(bytearray(data), dtype=torch.uint8)
-            packed = packed.reshape(rows, n_packed)
-            gray = unpack_symbols(packed, cols)
-            sym = gray_decode(gray).long()
-            # Unit reconstruction: {0,1,2,3} → {-1.0, -0.5, +0.5, +1.0}
+            tau_vals = None
+            sym_data = data
+
+        packed = torch.frombuffer(bytearray(sym_data), dtype=torch.uint8)
+        packed = packed.reshape(rows, n_packed)
+        gray = unpack_symbols(packed, cols)
+        sym = gray_decode(gray).long()
+
+        if tau_vals is not None:
+            # Dequantise using saved τ: {0,1,2,3} → {-1.5,-0.5,+0.5,+1.5}·τ/1.5,
+            # i.e. reconstruction points at ±τ/3 and ±τ (equiprobable cells §D-2.5).
+            val_map = torch.tensor([-1.5, -0.5, 0.5, 1.5], dtype=torch.float32,
+                                   device=device)
+            W_hat = val_map[sym.to(device)] * (tau_vals / 1.5)
+        else:
+            # Legacy v1: unit-scale reconstruction.
             val_map = torch.tensor([-1.0, -0.5, 0.5, 1.0], dtype=dtype)
-            W_hat = val_map[sym].reshape(shape)
-            result[key] = W_hat.to(device)
+            W_hat = val_map[sym].to(dtype)
+
+        result[key] = W_hat.reshape(shape).to(device)
     return result
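A round-trip smoke test for the pair of functions (a sketch: assumes scripts/q2_pack.py is importable as q2_pack and that unpack defaults to CPU):

    # Sketch: pack a toy state dict, read it back, measure worst-case error.
    import torch

    sd = {"layers.0.out.weight": torch.randn(8, 16),   # 2-D -> Q2 packed + tau
          "norm.weight": torch.ones(16)}               # 1-D -> fp16 raw
    q2_pack.pack_state_dict(sd, "/tmp/toy.q2bin")
    back = q2_pack.unpack_state_dict("/tmp/toy.q2bin")
    err = (sd["layers.0.out.weight"] - back["layers.0.out.weight"]).abs().max()
    print(f"max |W - W_hat| = {err:.3f}")               # bounded by the tau grid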
diff --git a/scripts/train_q2_ltc.py b/scripts/train_q2_ltc.py
index 05632c1..46b416e 100644
--- a/scripts/train_q2_ltc.py
+++ b/scripts/train_q2_ltc.py
@@ -196,6 +196,12 @@ class CfCBlock(nn.Module):
     sequence without growing a KV cache. Memory cost per layer: O(batch·d)
     regardless of sequence length.
+    **GPU efficiency:** the time constants A1 and A2 are computed from the
+    input x only (not from h), enabling a single batched matmul over all T
+    tokens. The state update is then a sequential element-wise scan — cheap
+    because it has no matmul inside the loop — making the total cost dominated
+    by the three linear projections (ff_a1, ff_a2, out), not the recurrence.
+
     All Q2Linear layers participate in Q²-QAT when activate_q2() is called
     on the parent model.
     """
@@ -203,10 +209,12 @@ class CfCBlock(nn.Module):
     def __init__(self, d_model: int):
         super().__init__()
         self.norm = nn.RMSNorm(d_model)
-        # A1: decay-rate network (input=[x,h] → positive scalar per dim)
-        self.ff_a1 = Q2Linear(d_model * 2, d_model)
-        # A2: integration-target network (input=[x,h] → target state)
-        self.ff_a2 = Q2Linear(d_model * 2, d_model)
+        # A1: decay-rate network (input=x → positive scalar per dim).
+        # Takes d_model (not 2*d_model) so all T tokens are processed in one
+        # batched matmul, with no per-token Python dispatch.
+        self.ff_a1 = Q2Linear(d_model, d_model)
+        # A2: integration-target network (same reasoning).
+        self.ff_a2 = Q2Linear(d_model, d_model)
         self.out = Q2Linear(d_model, d_model)
         # Learnable log time-step (log-parameterised → strictly positive).
         self.log_dt = nn.Parameter(torch.zeros(d_model))
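The batching argument in the comment above can be seen in isolation (a sketch: nn.Linear stands in for Q2Linear, whose call signature is assumed to match):

    # Sketch: why a d_model-only projection batches over the sequence.
    import torch
    import torch.nn as nn

    B, T, D = 4, 128, 512
    lin = nn.Linear(D, D)              # stand-in for Q2Linear(d_model, d_model)
    x = torch.randn(B, T, D)           # all tokens at once
    a1 = lin(x)                        # one batched matmul -> (4, 128, 512)
    # The removed [x, h] variant needed h at step t, so it had to run T
    # separate (B, 2*D) matmuls inside the recurrence loop instead.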
@@ -223,21 +231,25 @@ def forward(self, x: Tensor, h: Tensor) -> Tuple[Tensor, Tensor]:
"""
B, T, D = x.shape
residual = x
- x = self.norm(x)
+ x_norm = self.norm(x)
dt = self.log_dt.exp() # (D,) — positive, learnable time step
- out_steps: list[Tensor] = []
+ # Compute all time constants in one batched matmul over (B·T, D).
+ # No h dependency here → fully parallel over the sequence dimension.
+ a1 = F.softplus(self.ff_a1(x_norm)) # (B, T, D) decay rate > 0
+ a2 = self.ff_a2(x_norm) # (B, T, D) integration target
+ decay = torch.exp(-a1 * dt) # (B, T, D) in (0, 1)
+ c = (a2 / (a1 + 1e-6)) * (1.0 - decay) # (B, T, D) affine offset
+
+ # Sequential scan: h[t] = decay[t]*h[t-1] + c[t].
+ # Each step is element-wise (no matmul); torch.compile traces this loop
+ # into a fused CUDA kernel automatically.
+ out_buf = torch.empty_like(decay)
for t in range(T):
- xt = x[:, t, :] # (B, D)
- xh = torch.cat([xt, h], dim=-1) # (B, 2D)
- a1 = F.softplus(self.ff_a1(xh)) # (B, D) decay rate > 0
- a2 = self.ff_a2(xh) # (B, D) integration target
- decay = torch.exp(-a1 * dt) # (B, D) in (0, 1)
- h = decay * h + (a2 / (a1 + 1e-6)) * (1.0 - decay)
- out_steps.append(h)
+ h = decay[:, t] * h + c[:, t]
+ out_buf[:, t] = h
- y = torch.stack(out_steps, dim=1) # (B, T, D)
- return residual + self.out(y), h
+ return residual + self.out(out_buf), h
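As a side note on the scan above: with h0 = 0 the recurrence has a closed form via cumulative products, which is handy for unit-testing the loop (a sketch; the patch itself keeps the loop, and the closed form needs care with underflow for long sequences):

    # Sketch: closed-form check of h[t] = decay[t]*h[t-1] + c[t] with h0 = 0.
    import torch

    B, T, D = 2, 6, 8
    decay = 0.5 + 0.45 * torch.rand(B, T, D)      # in (0, 1), away from zero
    c = torch.randn(B, T, D)

    h = torch.zeros(B, D)
    steps = []
    for t in range(T):                            # reference loop, as in forward()
        h = decay[:, t] * h + c[:, t]
        steps.append(h)
    loop_out = torch.stack(steps, dim=1)

    # h[t] = sum_{j<=t} c[j] * prod_{j<i<=t} decay[i]  ->  P[t] * cumsum(c/P)[t]
    P = torch.cumprod(decay, dim=1)
    scan_out = P * torch.cumsum(c / P, dim=1)     # beware underflow for long T
    assert torch.allclose(loop_out, scan_out, atol=1e-4)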
# ── GQA block (Geode S1-level: one 4-way coarse selection) ───────────────────
@@ -490,8 +502,12 @@ def token_stream(
     rank: int = 0,
     world: int = 1,
     byte_tokens: bool = False,
-) -> Iterator[Tuple[Tensor, Tensor]]:
-    """Yield (input_ids, target_ids) pairs of length seq_len.
+) -> Iterator[Tuple[Tensor, Tensor, Tensor]]:
+    """Yield (prev_token, input_ids, target_ids) triples (ids of length seq_len).
+
+    prev_token is the single token immediately before input_ids[0]; the model
+    uses it to apply the BigramHash log-prior at position 0. It is a (1,)
+    int64 tensor. At the start of a new shard, prev_token is 0 (padding).
     Shards are distributed round-robin across ranks so each GPU sees a
     disjoint subset of the data.
@@ -526,9 +542,13 @@ def token_stream(
         raw = f.read_bytes()
         tokens_np = np.frombuffer(raw, dtype=np.uint16)
         tokens = torch.from_numpy(tokens_np.copy()).to(torch.long)
+        # Track the last token of the previous chunk as BigramHash context.
+        shard_prev = torch.zeros(1, dtype=torch.long, device=device)
         for start in range(0, len(tokens) - seq_len - 1, seq_len + 1):
             chunk = tokens[start : start + seq_len + 1].to(device)
-            yield chunk[:seq_len], chunk[1:]
+            inp, tgt = chunk[:seq_len], chunk[1:]
+            yield shard_prev, inp, tgt
+            shard_prev = tgt[-1:]  # chunk's last token immediately precedes the next chunk
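A consumption sketch for the new yield protocol (the shapes below follow from the slicing above; the stream construction itself is unchanged apart from the triple):

    # Sketch: sanity-check one triple pulled from the stream.
    import torch

    def first_triple(stream):
        prev_tok, inp, tgt = next(stream)
        assert prev_tok.shape == (1,)           # single BigramHash context token
        assert inp.shape == tgt.shape           # both are seq_len long
        assert torch.equal(inp[1:], tgt[:-1])   # targets = inputs shifted by one
        return prev_tok, inp, tgt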
# ── validation ─────────────────────────────────────────────────────────────────
@@ -659,10 +679,11 @@ def train(cfg: Config) -> None:
         optimizer.zero_grad(set_to_none=True)
         total_loss = 0.0
         for _ in range(batch_size):
-            inp, tgt = next(data)
+            prev_tok, inp, tgt = next(data)
             inp, tgt = inp.unsqueeze(0), tgt.unsqueeze(0)
+            prev_tok = prev_tok.unsqueeze(0)  # (1, 1) → squeezed to (1,) by model
             with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-                logits = model(inp)
+                logits = model(inp, prev_token=prev_tok.squeeze(0))
                 loss = F.cross_entropy(
                     logits.view(-1, cfg.vocab_size),
                     tgt.view(-1),
@@ -707,9 +728,18 @@ def train(cfg: Config) -> None:
print("\nPackaging artifact …")
- final_sd = {
+ final_model = swa_model.module if swa_active else raw_model
+
+ # Build the state dict for packing: only trainable parameters, no buffers.
+ # Tied weights (embed.weight ≡ lm_head.weight) are handled by q2_pack via
+ # alias records — we include both keys here and pack_state_dict will emit
+ # lm_head.weight as an alias pointing to embed.weight automatically.
+ # bigram_logprobs is a buffer saved separately (not Q2-packed).
+ sd = final_model.state_dict()
+ packable_sd = {
k: v.cpu()
- for k, v in (swa_model.module if swa_active else raw_model).state_dict().items()
+ for k, v in sd.items()
+ if k != "bigram_logprobs"
}
# Import q2_pack from this scripts/ directory.
@@ -723,9 +753,15 @@ def train(cfg: Config) -> None:
     _spec.loader.exec_module(q2_pack)  # type: ignore[union-attr]
     q2bin_path = Path(cfg.out_dir) / "model.q2bin"
-    raw_bytes = q2_pack.pack_state_dict(final_sd, q2bin_path)
+    raw_bytes = q2_pack.pack_state_dict(packable_sd, q2bin_path)
     print(f"  Q2-packed: {raw_bytes:,} bytes ({raw_bytes / 1e6:.3f} MB)")
+    # Save bigram_logprobs separately as fp16 (loaded at inference, not Q2-packed).
+    bigram_path = Path(cfg.out_dir) / "bigram_logprobs.fp16"
+    bigram_buf = sd["bigram_logprobs"].cpu().half().contiguous().numpy().tobytes()
+    bigram_path.write_bytes(bigram_buf)
+    print(f"  bigrams:   {len(bigram_buf):,} bytes ({len(bigram_buf) / 1e6:.3f} MB)")
+
     # Compress with zstd level 22 (requires the `zstandard` package).
     try:
         import zstandard as zstd
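The hunk is truncated at this point; for reference, a one-shot level-22 compression with the zstandard package looks like this (a sketch, not the exact code that follows in the file):

    # Sketch: one-shot zstd level-22 compression of the packed artifact.
    import zstandard as zstd
    from pathlib import Path

    data = Path("model.q2bin").read_bytes()
    out = zstd.ZstdCompressor(level=22).compress(data)
    Path("model.q2bin.zst").write_bytes(out)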
From a2393e2ff2f2abc4e950924b3f241755cc11d7ec Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 21 Mar 2026 19:50:07 +0000
Subject: [PATCH 13/14] chore: add __pycache__/*.py[co] to .gitignore, remove
stale .pyc
Co-authored-by: devlux76 <86517969+devlux76@users.noreply.github.com>
Agent-Logs-Url: https://github.com/devlux76/q2/sessions/163ec0f1-c376-4bf9-a4ee-542b7ae7dc33
---
.gitignore | 3 +++
scripts/__pycache__/q2_pack.cpython-312.pyc | Bin 22692 -> 0 bytes
2 files changed, 3 insertions(+)
delete mode 100644 scripts/__pycache__/q2_pack.cpython-312.pyc
diff --git a/.gitignore b/.gitignore
index b856243..952f2f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ bun.lock
e2e-results/
test-results/
playwright-report/
+__pycache__/
+*.pyc
+*.pyo
diff --git a/scripts/__pycache__/q2_pack.cpython-312.pyc b/scripts/__pycache__/q2_pack.cpython-312.pyc
deleted file mode 100644
index 6f2c9d12152ff2b5b60f024c2754334ab6e456c9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 22692

From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 21 Mar 2026 19:52:32 +0000
Subject: [PATCH 14/14] fix: clarify prev_tok unsqueeze comment in training
loop
Co-authored-by: devlux76 <86517969+devlux76@users.noreply.github.com>
Agent-Logs-Url: https://github.com/devlux76/q2/sessions/163ec0f1-c376-4bf9-a4ee-542b7ae7dc33
---
scripts/train_q2_ltc.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/train_q2_ltc.py b/scripts/train_q2_ltc.py
index 46b416e..e0b68ee 100644
--- a/scripts/train_q2_ltc.py
+++ b/scripts/train_q2_ltc.py
@@ -681,7 +681,7 @@ def train(cfg: Config) -> None:
         for _ in range(batch_size):
             prev_tok, inp, tgt = next(data)
             inp, tgt = inp.unsqueeze(0), tgt.unsqueeze(0)
-            prev_tok = prev_tok.unsqueeze(0)  # (1, 1) → squeezed to (1,) by model
+            prev_tok = prev_tok.unsqueeze(0)  # (1,) → (1,1); squeeze(0) passes (1,) to model
             with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                 logits = model(inp, prev_token=prev_tok.squeeze(0))
                 loss = F.cross_entropy(